Implement basic clip logic

Adds a clip method to the (CPU) render context, plus a considerable amount of mechanism in coarse and fine rasterization to support clipping.

The coarse rasterization logic contains a similar set of optimizations as Vello. In particular, all-zero tiles have drawing suppressed, and all-one tiles pass drawing commands through with no additional work to clip.

Not extensively validated, but it does render a simple scene with clipping correctly.
diff --git a/vello_hybrid/examples/gpu.rs b/vello_hybrid/examples/gpu.rs
index eeb6a1b..433823f 100644
--- a/vello_hybrid/examples/gpu.rs
+++ b/vello_hybrid/examples/gpu.rs
@@ -61,7 +61,7 @@
     surface.configure(&device, &sc);
 
     let session = GpuSession::new(&device, format);
-    // TODO: actually render something
+
     let mut render_ctx = GpuRenderCtx::new(size.width as usize, size.height as usize);
     draw_simple_scene(&mut render_ctx);
     let bufs = render_ctx.harvest();
diff --git a/vello_hybrid/examples/simple.rs b/vello_hybrid/examples/simple.rs
index e91f017..bae2c10 100644
--- a/vello_hybrid/examples/simple.rs
+++ b/vello_hybrid/examples/simple.rs
@@ -7,24 +7,16 @@
 use std::io::BufWriter;
 
 use vello_api::peniko::color::palette;
-use vello_api::peniko::kurbo::{BezPath, Stroke};
+use vello_api::peniko::kurbo::{BezPath, Point, Stroke, Vec2};
 use vello_api::RenderCtx;
 use vello_hybrid::{CsRenderCtx, Pixmap};
 
 const WIDTH: usize = 1024;
-const HEIGHT: usize = 256;
+const HEIGHT: usize = 1024;
 
 pub fn main() {
     let mut ctx = CsRenderCtx::new(WIDTH, HEIGHT);
-    let mut path = BezPath::new();
-    path.move_to((10.0, 10.0));
-    path.line_to((180.0, 20.0));
-    path.line_to((30.0, 40.0));
-    path.close_path();
-    let piet_path = path.into();
-    ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into());
-    let stroke = Stroke::new(5.0);
-    ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into());
+    draw_simple_scene(&mut ctx);
     if let Some(filename) = std::env::args().nth(1) {
         let mut pixmap = Pixmap::new(WIDTH, HEIGHT);
         ctx.render_to_pixmap(&mut pixmap);
@@ -39,3 +31,30 @@
         ctx.debug_dump();
     }
 }
+
+fn star(center: Point, n: usize, inner: f64, outer: f64) -> BezPath {
+    let mut path = BezPath::new();
+    path.move_to(center + Vec2::new(outer, 0.));
+    for i in 1..n * 2 {
+        let th = i as f64 * std::f64::consts::PI / n as f64;
+        let r = if i % 2 == 0 { outer } else { inner };
+        path.line_to(center + r * Vec2::from_angle(th));
+    }
+    path.close_path();
+    path
+}
+
+fn draw_simple_scene(ctx: &mut CsRenderCtx) {
+    let mut path = BezPath::new();
+    path.move_to((10.0, 10.0));
+    path.line_to((180.0, 20.0));
+    path.line_to((30.0, 180.0));
+    path.close_path();
+    // Note: we plan to change the API to have `into`.
+    let piet_path = path.into();
+    let stroke = Stroke::new(5.0);
+    ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into());
+    let star_path = star(Point::new(100., 100.), 13, 50., 95.);
+    ctx.clip(&star_path.into());
+    ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into());
+}
diff --git a/vello_hybrid/src/fine.rs b/vello_hybrid/src/fine.rs
index 4b93552..52b94b0 100644
--- a/vello_hybrid/src/fine.rs
+++ b/vello_hybrid/src/fine.rs
@@ -15,7 +15,7 @@
     // f32 RGBA pixels
     // That said, if we use u8, then this is basically a block of
     // untyped memory.
-    pub(crate) scratch: [f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4],
+    pub(crate) scratch: Vec<[f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]>,
     #[allow(clippy::doc_markdown, reason = "false positive for x86_64")]
     /// Whether to use SIMD
     ///
@@ -31,7 +31,7 @@
 
 impl<'a> Fine<'a> {
     pub(crate) fn new(width: usize, height: usize, out_buf: &'a mut [u8]) -> Self {
-        let scratch = [0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4];
+        let scratch = vec![[0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]];
         Self {
             width,
             height,
@@ -42,7 +42,8 @@
     }
 
     pub(crate) fn clear_scalar(&mut self, color: [f32; 4]) {
-        for z in self.scratch.chunks_exact_mut(4) {
+        let scratch = self.scratch.last_mut().unwrap();
+        for z in scratch.chunks_exact_mut(4) {
             z.copy_from_slice(&color);
         }
     }
@@ -58,12 +59,13 @@
             (y + 1) * STRIP_HEIGHT <= self.height,
             "overflow of pixmap height"
         );
+        let scratch = self.scratch.last_mut().unwrap();
         let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4;
         for j in 0..STRIP_HEIGHT {
             let line_ix = base_ix + j * self.width * 4;
             for i in 0..WIDE_TILE_WIDTH {
                 let mut rgba_f32 = [0.0; 4];
-                rgba_f32.copy_from_slice(&self.scratch[(i * STRIP_HEIGHT + j) * 4..][..4]);
+                rgba_f32.copy_from_slice(&scratch[(i * STRIP_HEIGHT + j) * 4..][..4]);
                 let rgba_u8 = rgba_f32.map(|z| (z * 255.0).round() as u8);
                 self.out_buf[line_ix + i * 4..][..4].copy_from_slice(&rgba_u8);
             }
@@ -79,20 +81,28 @@
                 let aslice = &alphas[s.alpha_ix..];
                 self.strip(s.x as usize, s.width as usize, aslice, s.color.components);
             }
+            Cmd::PushClip => self.scratch.push([0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]),
+            Cmd::PopClip => _ = self.scratch.pop(),
+            Cmd::ClipFill(f) => {
+                self.clip_fill_scalar(f.x as usize, f.width as usize);
+            }
+            Cmd::ClipStrip(s) => {
+                let aslice = &alphas[s.alpha_ix..];
+                self.clip_strip_scalar(s.x as usize, s.width as usize, aslice);
+            }
         }
     }
 
     pub(crate) fn fill_scalar(&mut self, x: usize, width: usize, color: [f32; 4]) {
+        let scratch = self.scratch.last_mut().unwrap();
         if color[3] == 1.0 {
-            for z in
-                self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
+            for z in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
             {
                 z.copy_from_slice(&color);
             }
         } else {
             let one_minus_alpha = 1.0 - color[3];
-            for z in
-                self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
+            for z in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
             {
                 for i in 0..4 {
                     //z[i] = color[i] + one_minus_alpha * z[i];
@@ -106,9 +116,10 @@
     }
 
     pub(crate) fn strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) {
+        let scratch = self.scratch.last_mut().unwrap();
         debug_assert!(alphas.len() >= width, "overflow of alphas buffer");
         let cs = color.map(|z| z * (1.0 / 255.0));
-        for (z, a) in self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width]
+        for (z, a) in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width]
             .chunks_exact_mut(16)
             .zip(alphas)
         {
@@ -121,4 +132,33 @@
             }
         }
     }
+
+    fn clip_fill_scalar(&mut self, x: usize, width: usize) {
+        let (tos, rest) = self.scratch.split_last_mut().unwrap();
+        let nos = rest.last_mut().unwrap();
+        for i in 0..width {
+            for j in 0..4 {
+                let ix = (x + i) * STRIP_HEIGHT_F32 + j * 4;
+                let one_minus_alpha = 1.0 - tos[ix + 3];
+                for k in 0..4 {
+                    nos[ix + k] = nos[ix + k].mul_add(one_minus_alpha, tos[ix + k]);
+                }
+            }
+        }
+    }
+
+    fn clip_strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32]) {
+        let (tos, rest) = self.scratch.split_last_mut().unwrap();
+        let nos = rest.last_mut().unwrap();
+        for (i, a) in alphas.iter().take(width).enumerate() {
+            for j in 0..4 {
+                let ix = (x + i) * STRIP_HEIGHT_F32 + j * 4;
+                let mask_alpha = ((a >> (j * 8)) & 0xff) as f32 * (1. / 255.);
+                let one_minus_alpha = 1.0 - mask_alpha * tos[ix + 3];
+                for k in 0..4 {
+                    nos[ix + k] = nos[ix + k].mul_add(one_minus_alpha, mask_alpha * tos[ix + k]);
+                }
+            }
+        }
+    }
 }
diff --git a/vello_hybrid/src/gpu.rs b/vello_hybrid/src/gpu.rs
index b4ed688..c5a1b65 100644
--- a/vello_hybrid/src/gpu.rs
+++ b/vello_hybrid/src/gpu.rs
@@ -139,7 +139,8 @@
         }
     }
 
-    pub fn harvest(&self) -> GpuRenderBufs {
+    pub fn harvest(&mut self) -> GpuRenderBufs {
+        self.inner.finish();
         let mut strips = Vec::new();
         let width_tiles = (self.inner.width).div_ceil(WIDE_TILE_WIDTH);
         let height_tiles = (self.inner.height).div_ceil(STRIP_HEIGHT);
@@ -184,6 +185,7 @@
                             };
                             strips.push(strip);
                         }
+                        _ => todo!(),
                     }
                 }
             }
diff --git a/vello_hybrid/src/render.rs b/vello_hybrid/src/render.rs
index 1c7bfa6..7ecb8b2 100644
--- a/vello_hybrid/src/render.rs
+++ b/vello_hybrid/src/render.rs
@@ -19,7 +19,7 @@
     fine::Fine,
     strip::{self, Strip, Tile},
     tiling::{self, FlatLine},
-    wide_tile::{Cmd, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH},
+    wide_tile::{Cmd, CmdClipStrip, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH},
     Pixmap,
 };
 
@@ -34,6 +34,22 @@
     line_buf: Vec<FlatLine>,
     tile_buf: Vec<Tile>,
     strip_buf: Vec<Strip>,
+
+    state_stack: Vec<GfxState>,
+    clip_stack: Vec<Clip>,
+}
+
+struct GfxState {
+    // TODO: transform goes here (there's logic in piet-ts to copy)
+    n_clip: usize,
+}
+
+struct Clip {
+    // should probably be a bounding box type
+    /// The intersected bounding box after clip
+    clip_bbox: [usize; 4],
+    /// The rendered path in sparse strip representation
+    strips: Vec<Strip>,
 }
 
 pub struct CsResourceCtx;
@@ -45,18 +61,17 @@
         let tiles = (0..width_tiles * height_tiles)
             .map(|_| WideTile::default())
             .collect();
-        let alphas = vec![];
-        let line_buf = vec![];
-        let tile_buf = vec![];
-        let strip_buf = vec![];
+        let state = GfxState { n_clip: 0 };
         Self {
             width,
             height,
             tiles,
-            alphas,
-            line_buf,
-            tile_buf,
-            strip_buf,
+            alphas: vec![],
+            line_buf: vec![],
+            tile_buf: vec![],
+            strip_buf: vec![],
+            state_stack: vec![state],
+            clip_stack: vec![],
         }
     }
 
@@ -67,7 +82,16 @@
         }
     }
 
-    pub fn render_to_pixmap(&self, pixmap: &mut Pixmap) {
+    /// Finish the coarse rasterization prior to fine rendering.
+    ///
+    /// At the moment, this mostly involves resolving any open clips, but
+    /// might extend to other things.
+    pub(crate) fn finish(&mut self) {
+        self.pop_clips();
+    }
+
+    pub fn render_to_pixmap(&mut self, pixmap: &mut Pixmap) {
+        self.finish();
         let mut fine = Fine::new(pixmap.width, pixmap.height, &mut pixmap.buf);
         let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
         let height_tiles = (self.height).div_ceil(STRIP_HEIGHT);
@@ -94,27 +118,43 @@
         println!("total = {total}, {histo:?}");
     }
 
+    /// Render a path to the strip buffer.
+    fn render_path_common(&mut self) {
+        tiling::make_tiles(&self.line_buf, &mut self.tile_buf);
+        self.tile_buf.sort_unstable_by(Tile::cmp);
+        crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas);
+    }
+
     /// Render a path, which has already been flattened into `line_buf`.
     fn render_path(&mut self, brush: BrushRef<'_>) {
         // TODO: need to make sure tiles contained in viewport - we'll likely
         // panic otherwise.
-        tiling::make_tiles(&self.line_buf, &mut self.tile_buf);
-        self.tile_buf.sort_unstable_by(Tile::cmp);
-        crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas);
+        self.render_path_common();
         let color = brush_to_color(brush);
         let width_tiles = self.width.div_ceil(WIDE_TILE_WIDTH);
+        let bbox = self.get_bbox();
         for i in 0..self.strip_buf.len() - 1 {
             let strip = &self.strip_buf[i];
             let next_strip = &self.strip_buf[i + 1];
             let x0 = strip.x();
-            let y = strip.strip_y();
-            let row_start = y as usize * width_tiles;
+            let y = strip.strip_y() as usize;
+            if y < bbox[1] {
+                continue;
+            }
+            if y >= bbox[3] {
+                break;
+            }
+            let row_start = y * width_tiles;
             let strip_width = next_strip.col - strip.col;
             let x1 = x0 + strip_width;
-            let xtile0 = x0 as usize / WIDE_TILE_WIDTH;
-            let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH);
+            let xtile0 = (x0 as usize / WIDE_TILE_WIDTH).max(bbox[0]);
+            let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH).min(bbox[2]);
             let mut x = x0;
             let mut col = strip.col;
+            if (bbox[0] * WIDE_TILE_WIDTH) as u32 > x {
+                col += (bbox[0] * WIDE_TILE_WIDTH) as u32 - x;
+                x = (bbox[0] * WIDE_TILE_WIDTH) as u32;
+            }
             for xtile in xtile0..xtile1 {
                 let x_tile_rel = x % WIDE_TILE_WIDTH as u32;
                 let width = x1.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x;
@@ -126,13 +166,13 @@
                 };
                 x += width;
                 col += width;
-                self.tiles[row_start + xtile].push(Cmd::Strip(cmd));
+                self.tiles[row_start + xtile].strip(cmd);
             }
-            if next_strip.winding != 0 && y == next_strip.strip_y() {
+            if next_strip.winding != 0 && y == next_strip.strip_y() as usize {
                 x = x1;
                 let x2 = next_strip.x();
-                let fxt0 = x1 as usize / WIDE_TILE_WIDTH;
-                let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH);
+                let fxt0 = (x1 as usize / WIDE_TILE_WIDTH).max(bbox[0]);
+                let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH).min(bbox[2]);
                 for xtile in fxt0..fxt1 {
                     let x_tile_rel = x % WIDE_TILE_WIDTH as u32;
                     let width = x2.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x;
@@ -161,6 +201,133 @@
         // TODO: get from graphics state
         Affine::scale(5.0)
     }
+
+    fn get_bbox(&self) -> [usize; 4] {
+        if let Some(tos) = self.clip_stack.last() {
+            tos.clip_bbox
+        } else {
+            let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
+            let height_tiles = (self.height).div_ceil(STRIP_HEIGHT);
+            [0, 0, width_tiles, height_tiles]
+        }
+    }
+
+    fn pop_clip(&mut self) {
+        self.state_stack.last_mut().unwrap().n_clip -= 1;
+        let Clip { clip_bbox, strips } = self.clip_stack.pop().unwrap();
+        let n_strips = strips.len();
+        // The next bit of code accomplishes the following. For each tile in
+        // the intersected bounding box, it does one of three things depending
+        // on the contents of the clip path in that tile.
+        // If all-zero: pop a zero_clip.
+        // If all-one: do nothing.
+        // If contains one or more strips: render strips and fills, then pop a clip.
+        // This logic is the inverse of the push logic in `clip()`, and the stack
+        // should be balanced after running both.
+        let mut tile_x = clip_bbox[0];
+        let mut tile_y = clip_bbox[1];
+        let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
+        let mut pop_pending = false;
+        for i in 0..n_strips - 1 {
+            let strip = &strips[i];
+            let y = strip.strip_y() as usize;
+            if y < tile_y {
+                continue;
+            }
+            while tile_y < y.min(clip_bbox[3]) {
+                if core::mem::take(&mut pop_pending) {
+                    self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+                    tile_x += 1;
+                }
+                for x in tile_x..clip_bbox[2] {
+                    self.tiles[tile_y * width_tiles + x].pop_zero_clip();
+                }
+                tile_x = clip_bbox[0];
+                tile_y += 1;
+            }
+            if tile_y == clip_bbox[3] {
+                break;
+            }
+            let x0 = strip.x() as usize;
+            let x_clamped = (x0 / WIDE_TILE_WIDTH).min(clip_bbox[2]);
+            if tile_x < x_clamped {
+                if core::mem::take(&mut pop_pending) {
+                    self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+                    tile_x += 1;
+                }
+                // The winding check is probably not needed; if there was a fill,
+                // the logic below should have advanced tile_x.
+                if strip.winding == 0 {
+                    for x in tile_x..x_clamped {
+                        self.tiles[tile_y * width_tiles + x].pop_zero_clip();
+                    }
+                }
+                tile_x = x_clamped;
+            }
+            let next_strip = &strips[i + 1];
+            let strip_width = (next_strip.col - strip.col) as usize;
+            let x1 = x0 + strip_width;
+            let xtile0 = (x0 / WIDE_TILE_WIDTH).max(clip_bbox[0]);
+            let xtile1 = x1.div_ceil(WIDE_TILE_WIDTH).min(clip_bbox[2]);
+            let mut x = x0;
+            let mut alpha_ix = strip.col as usize;
+            if clip_bbox[0] * WIDE_TILE_WIDTH > x {
+                alpha_ix += clip_bbox[0] * WIDE_TILE_WIDTH - x;
+                x = clip_bbox[0] * WIDE_TILE_WIDTH;
+            }
+            for xtile in xtile0..xtile1 {
+                if xtile > tile_x && core::mem::take(&mut pop_pending) {
+                    self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+                }
+                let x_tile_rel = (x % WIDE_TILE_WIDTH) as u32;
+                let width = x1.min((xtile + 1) * WIDE_TILE_WIDTH) - x;
+                let cmd = CmdClipStrip {
+                    x: x_tile_rel,
+                    width: width as u32,
+                    alpha_ix,
+                };
+                x += width;
+                alpha_ix += width;
+                self.tiles[tile_y * width_tiles + xtile].clip_strip(cmd);
+                tile_x = xtile;
+                pop_pending = true;
+            }
+            if next_strip.winding != 0 && y == next_strip.strip_y() as usize {
+                let x2 = next_strip.x() as usize;
+                let tile_x2 = x2.min((tile_x + 1) * WIDE_TILE_WIDTH);
+                let width = tile_x2 - x1;
+                if width > 0 {
+                    let x_tile_rel = (x1 % WIDE_TILE_WIDTH) as u32;
+                    self.tiles[tile_y * width_tiles + tile_x].clip_fill(x_tile_rel, width as u32);
+                }
+                if x2 > (tile_x + 1) * WIDE_TILE_WIDTH {
+                    self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+                    let width2 = x2 % WIDE_TILE_WIDTH;
+                    tile_x = x2 / WIDE_TILE_WIDTH;
+                    if width2 > 0 {
+                        self.tiles[tile_y * width_tiles + tile_x].clip_fill(0, width2 as u32);
+                    }
+                }
+            }
+        }
+        if core::mem::take(&mut pop_pending) {
+            self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+            tile_x += 1;
+        }
+        while tile_y < clip_bbox[3] {
+            for x in tile_x..clip_bbox[2] {
+                self.tiles[tile_y * width_tiles + x].pop_zero_clip();
+            }
+            tile_x = clip_bbox[0];
+            tile_y += 1;
+        }
+    }
+
+    fn pop_clips(&mut self) {
+        while self.state_stack.last().unwrap().n_clip > 0 {
+            self.pop_clip();
+        }
+    }
 }
 
 impl RenderCtx for CsRenderCtx {
@@ -200,15 +367,104 @@
     }
 
     fn clip(&mut self, path: &vello_api::Path) {
-        todo!()
+        let affine = self.get_affine();
+        crate::flatten::fill(&path.path, affine, &mut self.line_buf);
+        self.render_path_common();
+        let strips = core::mem::take(&mut self.strip_buf);
+        let n_strips = strips.len();
+        let path_bbox = if n_strips <= 1 {
+            [0, 0, 0, 0]
+        } else {
+            let y0 = strips[0].strip_y() as usize;
+            let y1 = strips[n_strips - 1].strip_y() as usize + 1;
+            let mut x0 = strips[0].x() as usize / WIDE_TILE_WIDTH;
+            let mut x1 = x0;
+            for i in 0..n_strips - 1 {
+                let strip = &strips[i];
+                let next_strip = &strips[i + 1];
+                let width = next_strip.col - strip.col;
+                let x = strip.x() as usize;
+                x0 = x0.min(x / WIDE_TILE_WIDTH);
+                x1 = x1.max((x + width as usize).div_ceil(WIDE_TILE_WIDTH));
+            }
+            [x0, x1, y0, y1]
+        };
+        let parent_bbox = self.get_bbox();
+        // intersect clip bounding box
+        let clip_bbox = [
+            parent_bbox[0].max(path_bbox[0]),
+            parent_bbox[1].max(path_bbox[1]),
+            parent_bbox[2].min(path_bbox[2]),
+            parent_bbox[3].min(path_bbox[3]),
+        ];
+        // The next bit of code accomplishes the following. For each tile in
+        // the intersected bounding box, it does one of three things depending
+        // on the contents of the clip path in that tile.
+        // If all-zero: push a zero_clip
+        // If all-one: do nothing
+        // If contains one or more strips: push a clip
+        let mut tile_x = clip_bbox[0];
+        let mut tile_y = clip_bbox[1];
+        let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
+        for i in 0..n_strips - 1 {
+            let strip = &strips[i];
+            let y = strip.strip_y() as usize;
+            if y < tile_y {
+                continue;
+            }
+            while tile_y < y.min(clip_bbox[3]) {
+                for x in tile_x..clip_bbox[2] {
+                    self.tiles[tile_y * width_tiles + x].push_zero_clip();
+                }
+                tile_x = clip_bbox[0];
+                tile_y += 1;
+            }
+            if tile_y == clip_bbox[3] {
+                break;
+            }
+            let x_pixels = strip.x() as usize;
+            let x_clamped = (x_pixels / WIDE_TILE_WIDTH).min(clip_bbox[2]);
+            if tile_x < x_clamped {
+                if strip.winding == 0 {
+                    for x in tile_x..x_clamped {
+                        self.tiles[tile_y * width_tiles + x].push_zero_clip();
+                    }
+                }
+                // If winding is nonzero, then wide tiles covered entirely
+                // by sparse fill are no-op (no clipping is applied).
+                tile_x = x_clamped;
+            }
+            let next_strip = &strips[i + 1];
+            let width = (next_strip.col - strip.col) as usize;
+            let x1 = (x_pixels + width)
+                .div_ceil(WIDE_TILE_WIDTH)
+                .min(clip_bbox[2]);
+            if tile_x < x1 {
+                for x in tile_x..x1 {
+                    self.tiles[tile_y * width_tiles + x].push_clip();
+                }
+                tile_x = x1;
+            }
+        }
+        while tile_y < clip_bbox[3] {
+            for x in tile_x..clip_bbox[2] {
+                self.tiles[tile_y * width_tiles + x].push_zero_clip();
+            }
+            tile_x = clip_bbox[0];
+            tile_y += 1;
+        }
+        let clip = Clip { clip_bbox, strips };
+        self.clip_stack.push(clip);
+        self.state_stack.last_mut().unwrap().n_clip += 1;
     }
 
     fn save(&mut self) {
-        todo!()
+        self.state_stack.push(GfxState { n_clip: 0 });
     }
 
     fn restore(&mut self) {
-        todo!()
+        self.pop_clips();
+        self.state_stack.pop();
     }
 
     fn transform(&mut self, affine: vello_api::peniko::kurbo::Affine) {
diff --git a/vello_hybrid/src/simd/neon.rs b/vello_hybrid/src/simd/neon.rs
index 72bf920..777b809 100644
--- a/vello_hybrid/src/simd/neon.rs
+++ b/vello_hybrid/src/simd/neon.rs
@@ -14,16 +14,18 @@
 
 impl Fine<'_> {
     pub(crate) unsafe fn clear_simd(&mut self, color: [f32; 4]) {
+        let scratch = self.scratch.last_mut().unwrap();
         unsafe {
             let v_color = vld1q_f32(color.as_ptr());
             let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color);
             for i in 0..WIDE_TILE_WIDTH {
-                vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4);
+                vst1q_f32_x4(scratch.as_mut_ptr().add(i * 16), v_color_4);
             }
         }
     }
 
     pub(crate) fn pack_simd(&mut self, x: usize, y: usize) {
+        let scratch = self.scratch.last_mut().unwrap();
         unsafe fn cvt(v: float32x4_t) -> uint8x16_t {
             unsafe {
                 let clamped = vminq_f32(v, vdupq_n_f32(1.0));
@@ -40,14 +42,14 @@
             let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4;
             for i in (0..WIDE_TILE_WIDTH).step_by(4) {
                 let chunk_ix = base_ix + i * 4;
-                let v0 = vld1q_f32_x4(self.scratch.as_ptr().add(i * 16));
-                let v1 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 1) * 16));
+                let v0 = vld1q_f32_x4(scratch.as_ptr().add(i * 16));
+                let v1 = vld1q_f32_x4(scratch.as_ptr().add((i + 1) * 16));
                 let x0 = cvt2(v0.0, v1.0);
                 let x1 = cvt2(v0.1, v1.1);
                 let x2 = cvt2(v0.2, v1.2);
                 let x3 = cvt2(v0.3, v1.3);
-                let v2 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 2) * 16));
-                let v3 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 3) * 16));
+                let v2 = vld1q_f32_x4(scratch.as_ptr().add((i + 2) * 16));
+                let v3 = vld1q_f32_x4(scratch.as_ptr().add((i + 3) * 16));
                 let x4 = cvt2(v2.0, v3.0);
                 let y0 = vuzp1q_u8(x0, x4);
                 vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix), y0);
@@ -68,24 +70,25 @@
     }
 
     pub(crate) unsafe fn fill_simd(&mut self, x: usize, width: usize, color: [f32; 4]) {
+        let scratch = self.scratch.last_mut().unwrap();
         unsafe {
             let v_color = vld1q_f32(color.as_ptr());
             let alpha = color[3];
             if alpha == 1.0 {
                 let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color);
                 for i in x..x + width {
-                    vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4);
+                    vst1q_f32_x4(scratch.as_mut_ptr().add(i * 16), v_color_4);
                 }
             } else {
                 let one_minus_alpha = vdupq_n_f32(1.0 - alpha);
                 for i in x..x + width {
                     let ix = (x + i) * 16;
-                    let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix));
+                    let mut v = vld1q_f32_x4(scratch.as_ptr().add(ix));
                     v.0 = vfmaq_f32(v_color, v.0, one_minus_alpha);
                     v.1 = vfmaq_f32(v_color, v.1, one_minus_alpha);
                     v.2 = vfmaq_f32(v_color, v.2, one_minus_alpha);
                     v.3 = vfmaq_f32(v_color, v.3, one_minus_alpha);
-                    vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v);
+                    vst1q_f32_x4(scratch.as_mut_ptr().add(ix), v);
                 }
             }
         }
@@ -99,6 +102,7 @@
         alphas: &[u32],
         color: [f32; 4],
     ) {
+        let scratch = self.scratch.last_mut().unwrap();
         unsafe {
             debug_assert!(alphas.len() >= width, "overflow of alphas buffer");
             let v_color = vmulq_f32(vld1q_f32(color.as_ptr()), vdupq_n_f32(1.0 / 255.0));
@@ -111,13 +115,13 @@
                 let a4 = vreinterpretq_u32_u16(vzip1q_u16(a3, vdupq_n_u16(0)));
                 let alpha = vcvtq_f32_u32(a4);
                 let ix = (x + i) * 16;
-                let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix));
+                let mut v = vld1q_f32_x4(scratch.as_ptr().add(ix));
                 let one_minus_alpha = vfmsq_laneq_f32(vdupq_n_f32(1.0), alpha, v_color, 3);
                 v.0 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 0), v.0, one_minus_alpha, 0);
                 v.1 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 1), v.1, one_minus_alpha, 1);
                 v.2 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 2), v.2, one_minus_alpha, 2);
                 v.3 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 3), v.3, one_minus_alpha, 3);
-                vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v);
+                vst1q_f32_x4(scratch.as_mut_ptr().add(ix), v);
             }
         }
     }
diff --git a/vello_hybrid/src/wide_tile.rs b/vello_hybrid/src/wide_tile.rs
index 5e71153..9d8dbcd 100644
--- a/vello_hybrid/src/wide_tile.rs
+++ b/vello_hybrid/src/wide_tile.rs
@@ -6,15 +6,24 @@
 pub(crate) const WIDE_TILE_WIDTH: usize = 256;
 pub(crate) const STRIP_HEIGHT: usize = 4;
 
+#[derive(Debug)]
 pub(crate) struct WideTile {
     pub(crate) bg: AlphaColor<Srgb>,
     pub(crate) cmds: Vec<Cmd>,
+    n_zero_clip: usize,
+    n_clip: usize,
 }
 
 #[derive(Debug)]
 pub(crate) enum Cmd {
     Fill(CmdFill),
     Strip(CmdStrip),
+    /// Pushes a new transparent buffer to the clip stack.
+    PushClip,
+    /// Pops the clip stack.
+    PopClip,
+    ClipFill(CmdClipFill),
+    ClipStrip(CmdClipStrip),
 }
 
 #[derive(Debug)]
@@ -33,26 +42,111 @@
     pub(crate) color: AlphaColor<Srgb>,
 }
 
+/// Same as fill, but copies top of clip stack to next on stack.
+#[derive(Debug)]
+pub(crate) struct CmdClipFill {
+    pub(crate) x: u32,
+    pub(crate) width: u32,
+    // TODO: this should probably get at least an alpha for group opacity
+    // Also, this is where blend modes go.
+}
+
+/// Same as strip, but composites top of clip stack to next on stack.
+#[derive(Debug)]
+pub(crate) struct CmdClipStrip {
+    pub(crate) x: u32,
+    pub(crate) width: u32,
+    pub(crate) alpha_ix: usize,
+    // See `CmdClipFill` for blending extension points
+}
+
 impl Default for WideTile {
     fn default() -> Self {
         Self {
             bg: AlphaColor::TRANSPARENT,
             cmds: vec![],
+            n_zero_clip: 0,
+            n_clip: 0,
         }
     }
 }
 
 impl WideTile {
     pub(crate) fn fill(&mut self, x: u32, width: u32, color: AlphaColor<Srgb>) {
-        if x == 0 && width == WIDE_TILE_WIDTH as u32 && color.components[3] == 1.0 {
-            self.cmds.clear();
-            self.bg = color;
-        } else {
-            self.cmds.push(Cmd::Fill(CmdFill { x, width, color }));
+        if !self.is_zero_clip() {
+            // Note that we could be more aggressive in optimizing a whole-tile opaque fill
+            // even with a clip stack. It would be valid to elide all drawing commands from
+            // the enclosing clip push up to the fill. Further, we could extend the clip
+            // push command to include a background color, rather than always starting with
+            // a transparent buffer. Lastly, a sequence of push(bg); strip/fill; pop could
+            // be replaced with strip/fill with the color (the latter is true even with a
+            // non-opaque color).
+            //
+            // However, the extra cost of tracking such optimizations may outweigh the
+            // benefit, especially in hybrid mode with GPU painting.
+            if x == 0
+                && width == WIDE_TILE_WIDTH as u32
+                && color.components[3] == 1.0
+                && self.n_clip == 0
+            {
+                self.cmds.clear();
+                self.bg = color;
+            } else {
+                self.cmds.push(Cmd::Fill(CmdFill { x, width, color }));
+            }
+        }
+    }
+
+    pub(crate) fn strip(&mut self, cmd_strip: CmdStrip) {
+        if !self.is_zero_clip() {
+            self.cmds.push(Cmd::Strip(cmd_strip));
         }
     }
 
     pub(crate) fn push(&mut self, cmd: Cmd) {
         self.cmds.push(cmd);
     }
+
+    pub(crate) fn push_clip(&mut self) {
+        if !self.is_zero_clip() {
+            self.push(Cmd::PushClip);
+            self.n_clip += 1;
+        }
+    }
+
+    pub(crate) fn pop_clip(&mut self) {
+        if !self.is_zero_clip() {
+            if matches!(self.cmds.last(), Some(Cmd::PushClip)) {
+                // Nothing was drawn inside the clip, elide it.
+                self.cmds.pop();
+            } else {
+                self.push(Cmd::PopClip);
+            }
+            self.n_clip -= 1;
+        }
+    }
+
+    pub(crate) fn push_zero_clip(&mut self) {
+        self.n_zero_clip += 1;
+    }
+
+    pub(crate) fn pop_zero_clip(&mut self) {
+        self.n_zero_clip -= 1;
+    }
+
+    pub(crate) fn is_zero_clip(&mut self) -> bool {
+        self.n_zero_clip > 0
+    }
+
+    pub(crate) fn clip_strip(&mut self, cmd_clip_strip: CmdClipStrip) {
+        if !self.is_zero_clip() && !matches!(self.cmds.last(), Some(Cmd::PushClip)) {
+            self.cmds.push(Cmd::ClipStrip(cmd_clip_strip));
+        }
+    }
+
+    pub(crate) fn clip_fill(&mut self, x: u32, width: u32) {
+        if !self.is_zero_clip() && !matches!(self.cmds.last(), Some(Cmd::PushClip)) {
+            self.cmds.push(Cmd::ClipFill(CmdClipFill { x, width }));
+        }
+    }
 }