Merge pull request #370 from armansito/pr-longpathdash

[test_scenes] Port longpathdash test case from Skia
diff --git a/.vscode/settings.json b/.vscode/settings.json
index a75948f..57a124c 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -16,6 +16,6 @@
   },
   "wgsl-analyzer.diagnostics.nagaVersion": "main",
   "wgsl-analyzer.preprocessor.shaderDefs": [
-    "full"
+    "full", "msaa16", "msaa"
   ]
 }
diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs
index 414ce23..b0a52b0 100644
--- a/crates/encoding/src/path.rs
+++ b/crates/encoding/src/path.rs
@@ -227,8 +227,11 @@
 pub struct Tile {
     /// Accumulated backdrop at the left edge of the tile.
     pub backdrop: i32,
-    /// Index of first path segment.
-    pub segments: u32,
+    /// Either the number of path segments in this tile, or, with all
+    /// bits inverted, an index to the beginning of an allocated slice
+    /// of `PathSegment` objects. The count is used while tiles are being
+    /// built and is replaced by the inverted index once segments are allocated.
+    pub segment_count_or_ix: u32,
 }
 
 /// Encoder for path segments.
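To make the new convention concrete, here is a minimal Rust sketch (a hypothetical helper, not part of this patch). In the shaders the two cases are distinguished by pipeline phase, but since an inverted index always has its high bit set, a sketch can discriminate directly:

```rust
/// Hypothetical decoded view of `Tile::segment_count_or_ix`.
enum SegmentsRef {
    /// The tile still holds a segment count (before allocation).
    Count(u32),
    /// The tile holds an index into the allocated `PathSegment` slice.
    Index(u32),
}

fn decode_segments(segment_count_or_ix: u32) -> SegmentsRef {
    // Coarse rasterization stores `!seg_ix` once a slice is allocated,
    // so a value with the high bit set is an inverted index.
    if segment_count_or_ix & 0x8000_0000 != 0 {
        SegmentsRef::Index(!segment_count_or_ix)
    } else {
        SegmentsRef::Count(segment_count_or_ix)
    }
}
```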
diff --git a/shader/draw_leaf.wgsl b/shader/draw_leaf.wgsl
index 6154b92..8278259 100644
--- a/shader/draw_leaf.wgsl
+++ b/shader/draw_leaf.wgsl
@@ -108,7 +108,6 @@
         // let x1 = f32(bbox.x1);
         // let y1 = f32(bbox.y1);
         // let bbox_f = vec4(x0, y0, x1, y1);
-        let fill_mode = u32(bbox.linewidth >= 0.0);
         var transform = Transform();
         var linewidth = bbox.linewidth;
         if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT ||
diff --git a/shader/fine.wgsl b/shader/fine.wgsl
index 108c88c..5d500ad 100644
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@@ -2,8 +2,10 @@
 
 // Fine rasterizer. This can run in simple (just path rendering) and full
 // modes, controllable by #define.
+//
+// To enable multisampled rendering, turn on both the msaa ifdef and one of msaa8
+// or msaa16.
 
-// This is a cut'n'paste w/ backdrop.
 struct Tile {
     backdrop: i32,
     segments: u32,
@@ -18,8 +20,6 @@
 @group(0) @binding(1)
 var<storage> segments: array<Segment>;
 
-#ifdef full
-
 #import blend
 #import ptcl
 
@@ -40,6 +40,313 @@
 @group(0) @binding(6)
 var image_atlas: texture_2d<f32>;
 
+#ifdef msaa8
+let MASK_WIDTH = 32u;
+let MASK_HEIGHT = 32u;
+let SH_SAMPLES_SIZE = 256u;
+let SAMPLE_WORDS_PER_PIXEL = 1u;
+// This might be better in uniform, but that has 16 byte alignment
+@group(0) @binding(7)
+var<storage> mask_lut: array<u32, 256u>;
+#endif
+
+#ifdef msaa16
+let MASK_WIDTH = 64u;
+let MASK_HEIGHT = 64u;
+let SH_SAMPLES_SIZE = 512u;
+let SAMPLE_WORDS_PER_PIXEL = 2u;
+@group(0) @binding(7)
+var<storage> mask_lut: array<u32, 2048u>;
+#endif
+
+#ifdef msaa
+let WG_SIZE = 64u;
+var<workgroup> sh_count: array<u32, WG_SIZE>;
+
+// This is 8 winding numbers packed to a u32, 4 bits each
+var<workgroup> sh_winding: array<atomic<u32>, 32u>;
+// Same packing, one group of 8 per pixel
+var<workgroup> sh_samples: array<atomic<u32>, SH_SAMPLES_SIZE>;
+// Same packing, accumulating winding numbers for vertical edge crossings
+var<workgroup> sh_winding_y: array<atomic<u32>, 2u>;
+
+// number of integer cells spanned by interval defined by a, b
+fn span(a: f32, b: f32) -> u32 {
+    return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
+}
+
+let SEG_SIZE = 5u;
+
+// See cpu_shaders/util.rs for explanation of these.
+let ONE_MINUS_ULP: f32 = 0.99999994;
+let ROBUST_EPSILON: f32 = 2e-7;
+
+// New multisampled algorithm.
+fn fill_path_ms(fill: CmdFill, wg_id: vec2<u32>, local_id: vec2<u32>) -> array<f32, PIXELS_PER_THREAD> {
+    let n_segs = fill.size_and_rule >> 1u;
+    let even_odd = (fill.size_and_rule & 1u) != 0u;
+    let tile_origin = vec2(f32(wg_id.x) * f32(TILE_WIDTH), f32(wg_id.y) * f32(TILE_HEIGHT));
+    let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x;
+    if th_ix < 32u {
+        if th_ix < 2u {
+            atomicStore(&sh_winding_y[th_ix], 0x88888888u);
+        }
+        atomicStore(&sh_winding[th_ix], 0x88888888u);
+    }
+    let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL;
+    for (var i = 0u; i < sample_count; i++) {
+        atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u);
+    }
+    workgroupBarrier();
+    let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE;
+    for (var batch = 0u; batch < n_batch; batch++) {
+        let seg_ix = batch * WG_SIZE + th_ix;
+        let seg_off = fill.seg_data + seg_ix;
+        var count = 0u;
+        let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE);
+        // TODO: might save a register rewriting this in terms of limit
+        if th_ix < slice_size {
+            let segment = segments[seg_off];
+            // Note: coords relative to tile origin probably a good idea in coarse path,
+            // especially as f16 would work. But keeping existing scheme for compatibility.
+            let xy0 = segment.origin - tile_origin;
+            let xy1 = xy0 + segment.delta;
+            var y_edge_f = f32(TILE_HEIGHT);
+            var delta = select(-1, 1, xy1.x <= xy0.x);
+            if xy0.x == 0.0 && xy1.x == 0.0 {
+                if xy0.y == 0.0 {
+                    y_edge_f = 0.0;
+                } else if xy1.y == 0.0 {
+                    y_edge_f = 0.0;
+                    delta = -delta;
+                }
+            } else {
+                if xy0.x == 0.0 {
+                    if xy0.y != 0.0 {
+                        y_edge_f = xy0.y;
+                    }
+                } else if xy1.x == 0.0 && xy1.y != 0.0 {
+                    y_edge_f = xy1.y;
+                }
+                // discard horizontal lines aligned to pixel grid
+                if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
+                    count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
+                }
+            }
+            let y_edge = u32(ceil(y_edge_f));
+            if y_edge < TILE_HEIGHT {
+                atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
+            }
+        }
+        // workgroup prefix sum of counts
+        sh_count[th_ix] = count;
+        let lg_n = firstLeadingBit(slice_size * 2u - 1u);
+        for (var i = 0u; i < lg_n; i++) {
+            workgroupBarrier();
+            if th_ix >= 1u << i {
+                count += sh_count[th_ix - (1u << i)];
+            }
+            workgroupBarrier();
+            sh_count[th_ix] = count;
+        }
+        let total = workgroupUniformLoad(&sh_count[slice_size - 1u]);
+        for (var i = th_ix; i < total; i += WG_SIZE) {
+            // binary search to find pixel
+            var lo = 0u;
+            var hi = slice_size;
+            let goal = i;
+            while hi > lo + 1u {
+                let mid = (lo + hi) >> 1u;
+                if goal >= sh_count[mid - 1u] {
+                    lo = mid;
+                } else {
+                    hi = mid;
+                }
+            }
+            let el_ix = lo;
+            let last_pixel = i + 1u == sh_count[el_ix];
+            let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u);
+            let seg_off = fill.seg_data + batch * WG_SIZE + el_ix;
+            let segment = segments[seg_off];
+            let xy0_in = segment.origin - tile_origin;
+            let xy1_in = xy0_in + segment.delta;
+            let is_down = xy1_in.y >= xy0_in.y;
+            let xy0 = select(xy1_in, xy0_in, is_down);
+            let xy1 = select(xy0_in, xy1_in, is_down);
+
+            // Set up data for line rasterization
+            // Note: this is duplicated work if total count exceeds a workgroup.
+            // One alternative is to compute it in a separate dispatch.
+            let dx = abs(xy1.x - xy0.x);
+            let dy = xy1.y - xy0.y;
+            let idxdy = 1.0 / (dx + dy);
+            var a = dx * idxdy;
+            let is_positive_slope = xy1.x >= xy0.x;
+            let sign = select(-1.0, 1.0, is_positive_slope);
+            let xt0 = floor(xy0.x * sign);
+            let c = xy0.x * sign - xt0;
+            let y0i = floor(xy0.y);
+            let ytop = y0i + 1.0;
+            let b = min((dy * c + dx * (ytop - xy0.y)) * idxdy, ONE_MINUS_ULP);
+            let count_x = span(xy0.x, xy1.x) - 1u;
+            let count = count_x + span(xy0.y, xy1.y);
+            let robust_err = floor(a * (f32(count) - 1.0) + b) - f32(count_x);
+            if robust_err != 0.0 {
+                a -= ROBUST_EPSILON * sign(robust_err);
+            }
+            let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
+            // Use line equation to plot pixel coordinates
+
+            let zf = a * f32(sub_ix) + b;
+            let z = floor(zf);
+            let x = x0i + i32(sign * z);
+            let y = i32(y0i) + i32(sub_ix) - i32(z);
+            var is_delta: bool;
+            // We need to adjust winding number if slope is positive and there
+            // is a crossing at the left edge of the pixel.
+            var is_bump = false;
+            let zp = floor(a * f32(sub_ix - 1u) + b);
+            if sub_ix == 0u {
+                is_delta = y0i == xy0.y && y0i != xy1.y;
+                is_bump = xy0.x == 0.0;
+            } else {
+                is_delta = z == zp;
+                is_bump = is_positive_slope && !is_delta;
+            }
+            let pix_ix = u32(y) * TILE_WIDTH + u32(x);
+            if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT {
+                let delta_pix = pix_ix + 1u;
+                if is_delta {
+                    let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u);
+                    atomicAdd(&sh_winding[delta_pix >> 3u], delta);
+                }
+            }
+            // Apply sample mask
+            let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
+            let half_height = f32(MASK_HEIGHT / 2u);
+            let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
+            let mask_col = floor((zf - z) * f32(MASK_WIDTH));
+            let mask_ix = mask_block + u32(mask_row + mask_col);
+#ifdef msaa8
+            var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u);
+            mask &= 0xffu;
+            // Intersect with y half-plane masks
+            if sub_ix == 0u && !is_bump {
+                let mask_shift = u32(round(8.0 * (xy0.y - f32(y))));
+                mask &= 0xffu << mask_shift;
+            }
+            if last_pixel && xy1.x != 0.0 {
+                let mask_shift = u32(round(8.0 * (xy1.y - f32(y))));
+                mask &= ~(0xffu << mask_shift);
+            }
+            let mask_a = mask | (mask << 6u);
+            let mask_b = mask_a | (mask_a << 12u);
+            let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u);
+            var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down);
+            if is_bump {
+                mask_signed += select(u32(-0x11111111), 0x11111111u, is_down);
+            }
+            atomicAdd(&sh_samples[pix_ix], mask_signed);
+#endif
+#ifdef msaa16
+            var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u);
+            mask &= 0xffffu;
+            // Intersect with y half-plane masks
+            if sub_ix == 0u && !is_bump {
+                let mask_shift = u32(round(16.0 * (xy0.y - f32(y))));
+                mask &= 0xffffu << mask_shift;
+            }
+            if last_pixel && xy1.x != 0.0 {
+                let mask_shift = u32(round(16.0 * (xy1.y - f32(y))));
+                mask &= ~(0xffffu << mask_shift);
+            }
+            let mask0 = mask & 0xffu;
+            let mask0_a = mask0 | (mask0 << 6u);
+            let mask0_b = mask0_a | (mask0_a << 12u);
+            let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u);
+            var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down);
+            let mask1 = (mask >> 8u) & 0xffu;
+            let mask1_a = mask1 | (mask1 << 6u);
+            let mask1_b = mask1_a | (mask1_a << 12u);
+            let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u);
+            var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down);
+            if is_bump {
+                let bump_delta = select(u32(-0x11111111), 0x11111111u, is_down);
+                mask0_signed += bump_delta;
+                mask1_signed += bump_delta;
+            }
+            atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed);
+            atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed);
+#endif
+        }
+        workgroupBarrier();
+    }
+    var area: array<f32, PIXELS_PER_THREAD>;
+    let major = (th_ix * PIXELS_PER_THREAD) >> 3u;
+    var packed_w = atomicLoad(&sh_winding[major]);
+    // Prefix sum of packed 4 bit values within u32
+    packed_w += (packed_w - 0x8888888u) << 4u;
+    packed_w += (packed_w - 0x888888u) << 8u;
+    packed_w += (packed_w - 0x8888u) << 16u;
+    // Note: could probably do bias in one go, but it would be inscrutable
+    if (major & 1u) != 0u {
+        // We could use shmem to communicate the value from another thread;
+        // if we had subgroups that would almost certainly be the most
+        // efficient way. But we just calculate again for simplicity.
+        var last_packed = atomicLoad(&sh_winding[major - 1u]);
+        last_packed += (last_packed - 0x8888888u) << 4u;
+        last_packed += (last_packed - 0x888888u) << 8u;
+        last_packed += (last_packed - 0x8888u) << 16u;
+        let bump = ((last_packed >> 28u) - 8u) * 0x11111111u;
+        packed_w += bump;
+    }
+    var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]);
+    packed_y += (packed_y - 0x8888888u) << 4u;
+    packed_y += (packed_y - 0x888888u) << 8u;
+    packed_y += (packed_y - 0x8888u) << 16u;
+    if th_ix == 0u {
+        atomicStore(&sh_winding_y[0], packed_y);
+    }
+    workgroupBarrier();
+    var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u;
+    if local_id.y >= 8u {
+        wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u;
+    }
+
+    for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
+        let pix_ix = th_ix * PIXELS_PER_THREAD + i;
+        let minor = pix_ix & 7u;
+        //let nonzero = ((packed_w >> (minor << 2u)) & 0xfu) != u32(8 + backdrop);
+        // TODO: math might be off here
+        let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(fill.backdrop);
+        if expected_zero >= 16u {
+            area[i] = 1.0;
+        } else {
+#ifdef msaa8
+            let samples = atomicLoad(&sh_samples[pix_ix]);
+            let xored = (expected_zero * 0x11111111u) ^ samples;
+            // Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise
+            let xored2 = xored | (xored * 2u);
+            let xored4 = xored2 | (xored2 * 4u);
+            area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125;
+#endif
+#ifdef msaa16
+            let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]);
+            let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]);
+            let xored0 = (expected_zero * 0x11111111u) ^ samples0;
+            let xored0_2 = xored0 | (xored0 * 2u);
+            let xored1 = (expected_zero * 0x11111111u) ^ samples1;
+            let xored1_2 = xored1 | (xored1 >> 1u);
+            let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 & 0x55555555u);
+            let xored4 = xored2 | (xored2 * 4u);
+            area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625;
+#endif
+        }
+    }
+    return area;
+}
+#endif
+
 fn read_fill(cmd_ix: u32) -> CmdFill {
     let size_and_rule = ptcl[cmd_ix + 1u];
     let seg_data = ptcl[cmd_ix + 2u];
@@ -126,15 +433,12 @@
     }
 }
 
-#else
-
-@group(0) @binding(3)
-var output: texture_storage_2d<r8, write>;
-
-#endif
-
 let PIXELS_PER_THREAD = 4u;
 
+// Analytic area antialiasing.
+//
+// This is currently dead code if msaa is enabled, but it would be fairly straightforward
+// to wire this so it's a dynamic choice (even per-path).
 fn fill_path(fill: CmdFill, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
     let n_segs = fill.size_and_rule >> 1u;
     let even_odd = (fill.size_and_rule & 1u) != 0u;
@@ -220,7 +524,11 @@
             // CMD_FILL
             case 1u: {
                 let fill = read_fill(cmd_ix);
+#ifdef msaa
+                area = fill_path_ms(fill, wg_id.xy, local_id.xy);
+#else
                 area = fill_path(fill, xy);
+#endif
                 cmd_ix += 4u;
             }
             // CMD_STROKE
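The bias-and-shift prefix sum over packed 4-bit winding numbers (the `0x8888888u` sequence in `fill_path_ms` above) is compact but dense. A standalone Rust sketch of the same trick, with the per-nibble bias of 8 made explicit; this mirrors the `packed_w` computation and is valid while every running sum stays within the 4-bit biased range:

```rust
// Inclusive prefix sum of eight 4-bit values packed in a u32, each
// biased by 8 (so 0x8 encodes 0). Subtracting the bias from the low
// nibbles before each shift keeps exactly one bias of 8 per lane.
fn nibble_prefix_sum(mut w: u32) -> u32 {
    w = w.wrapping_add(w.wrapping_sub(0x0888_8888) << 4);
    w = w.wrapping_add(w.wrapping_sub(0x0088_8888) << 8);
    w = w.wrapping_add(w.wrapping_sub(0x0000_8888) << 16);
    w
}

fn main() {
    // Deltas of +1 in every nibble: 0x99999999 encodes [1; 8].
    let sums = nibble_prefix_sum(0x9999_9999);
    // Prefix sums 1..=8, still biased by 8: nibbles 9, A, ..., F
    // (the final 8 + 8 wraps to 0 in 4 bits).
    assert_eq!(sums, 0x0FED_CBA9);
}
```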
diff --git a/shader/path_count.wgsl b/shader/path_count.wgsl
index e34ff7a..dabb6d1 100644
--- a/shader/path_count.wgsl
+++ b/shader/path_count.wgsl
@@ -36,6 +36,10 @@
     return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
 }
 
+// See cpu_shaders/util.rs for explanation of these.
+let ONE_MINUS_ULP: f32 = 0.99999994;
+let ROBUST_EPSILON: f32 = 2e-7;
+
 // Note regarding clipping to bounding box:
 //
 // We have to do the backdrop bumps for all tiles to the left of the bbox.
@@ -57,7 +61,8 @@
         let xy1 = select(line.p0, line.p1, is_down);
         let s0 = xy0 * TILE_SCALE;
         let s1 = xy1 * TILE_SCALE;
-        count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
+        let count_x = span(s0.x, s1.x) - 1u;
+        count = count_x + span(s0.y, s1.y);
         let line_ix = global_id.x;
 
         let dx = abs(s1.x - s0.x);
@@ -72,14 +77,18 @@
             return;
         }
         let idxdy = 1.0 / (dx + dy);
-        let a = dx * idxdy;
+        var a = dx * idxdy;
         let is_positive_slope = s1.x >= s0.x;
         let sign = select(-1.0, 1.0, is_positive_slope);
         let xt0 = floor(s0.x * sign);
         let c = s0.x * sign - xt0;
         let y0 = floor(s0.y);
         let ytop = select(y0 + 1.0, ceil(s0.y), s0.y == s1.y);
-        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let b = min((dy * c + dx * (ytop - s0.y)) * idxdy, ONE_MINUS_ULP);
+        let robust_err = floor(a * (f32(count) - 1.0) + b) - f32(count_x);
+        if robust_err != 0.0 {
+            a -= ROBUST_EPSILON * sign(robust_err);
+        }
         let x0 = xt0 * sign + select(-1.0, 0.0, is_positive_slope);
 
         let path = paths[line.path_ix];
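The new `ONE_MINUS_ULP`/`ROBUST_EPSILON` logic (here and, identically, in path_tiling.wgsl and fine.wgsl) guards against the line-rasterization DDA rounding onto the wrong side of a pixel boundary on its last step. A scalar Rust sketch of the same check, with the constants taken from the diff and the rest mirroring the shader math:

```rust
const ONE_MINUS_ULP: f32 = 0.99999994;
const ROBUST_EPSILON: f32 = 2e-7;

/// `a` is the DDA slope, `b_raw` the unclamped intercept; `count` is the
/// total number of steps and `count_x` the expected number of horizontal
/// crossings. Returns the nudged slope and clamped intercept.
fn robustify(mut a: f32, b_raw: f32, count: u32, count_x: u32) -> (f32, f32) {
    // Clamp b strictly below 1.0 so the first step cannot round up.
    let b = b_raw.min(ONE_MINUS_ULP);
    // If the final step disagrees with the expected crossing count,
    // nudge the slope toward the correct side.
    let robust_err = (a * (count as f32 - 1.0) + b).floor() - count_x as f32;
    if robust_err != 0.0 {
        a -= ROBUST_EPSILON * robust_err.signum();
    }
    (a, b)
}
```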
diff --git a/shader/path_tiling.wgsl b/shader/path_tiling.wgsl
index ace01ab..fad5d72 100644
--- a/shader/path_tiling.wgsl
+++ b/shader/path_tiling.wgsl
@@ -29,6 +29,10 @@
     return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
 }
 
+// See cpu_shaders/util.rs for explanation of these.
+let ONE_MINUS_ULP: f32 = 0.99999994;
+let ROBUST_EPSILON: f32 = 2e-7;
+
 // One invocation for each tile that is to be written.
 // Total number of invocations = bump.seg_counts
 @compute @workgroup_size(256)
@@ -49,20 +53,25 @@
         var xy1 = select(line.p0, line.p1, is_down);
         let s0 = xy0 * TILE_SCALE;
         let s1 = xy1 * TILE_SCALE;
-        let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
+        let count_x = span(s0.x, s1.x) - 1u;
+        let count = count_x + span(s0.y, s1.y);
         let dx = abs(s1.x - s0.x);
         let dy = s1.y - s0.y;
         // Division by zero can't happen because zero-length lines
         // have already been discarded in the path_count stage.
         let idxdy = 1.0 / (dx + dy);
-        let a = dx * idxdy;
+        var a = dx * idxdy;
         let is_positive_slope = s1.x >= s0.x;
         let sign = select(-1.0, 1.0, is_positive_slope);
         let xt0 = floor(s0.x * sign);
         let c = s0.x * sign - xt0;
         let y0i = floor(s0.y);
         let ytop = select(y0i + 1.0, ceil(s0.y), s0.y == s1.y);
-        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let b = min((dy * c + dx * (ytop - s0.y)) * idxdy, ONE_MINUS_ULP);
+        let robust_err = floor(a * (f32(count) - 1.0) + b) - f32(count_x);
+        if robust_err != 0.0 {
+            a -= ROBUST_EPSILON * sign(robust_err);
+        }
         let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
         let z = floor(a * f32(seg_within_line) + b);
         let x = x0i + i32(sign * z);
diff --git a/src/cpu_dispatch.rs b/src/cpu_dispatch.rs
index 0b8bbc8..2c3409c 100644
--- a/src/cpu_dispatch.rs
+++ b/src/cpu_dispatch.rs
@@ -4,10 +4,12 @@
 //! Support for CPU implementations of compute shaders.
 
 use std::{
-    cell::{RefCell, RefMut},
-    ops::Deref,
+    cell::{Ref, RefCell, RefMut},
+    ops::{Deref, DerefMut},
 };
 
+use bytemuck::Pod;
+
 #[derive(Clone, Copy)]
 pub enum CpuBinding<'a> {
     Buffer(&'a [u8]),
@@ -16,39 +18,88 @@
     Texture(&'a CpuTexture),
 }
 
-pub enum CpuBufGuard<'a> {
-    Slice(&'a [u8]),
-    Interior(RefMut<'a, Vec<u8>>),
+pub enum TypedBufGuard<'a, T: ?Sized> {
+    Slice(&'a T),
+    Interior(Ref<'a, T>),
 }
 
-impl<'a> Deref for CpuBufGuard<'a> {
-    type Target = [u8];
+pub enum TypedBufGuardMut<'a, T: ?Sized> {
+    Slice(&'a mut T),
+    Interior(RefMut<'a, T>),
+}
+
+impl<'a, T: ?Sized> Deref for TypedBufGuard<'a, T> {
+    type Target = T;
 
     fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Slice(s) => s,
-            CpuBufGuard::Interior(r) => r,
+            TypedBufGuard::Slice(s) => s,
+            TypedBufGuard::Interior(r) => r,
         }
     }
 }
 
-impl<'a> CpuBufGuard<'a> {
-    /// Get a mutable reference to the buffer.
-    ///
-    /// Panics if the underlying resource is read-only.
-    pub fn as_mut(&mut self) -> &mut [u8] {
+impl<'a, T: ?Sized> Deref for TypedBufGuardMut<'a, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
         match self {
-            CpuBufGuard::Interior(r) => &mut *r,
-            _ => panic!("tried to borrow immutable buffer as mutable"),
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
+        }
+    }
+}
+
+impl<'a, T: ?Sized> DerefMut for TypedBufGuardMut<'a, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        match self {
+            TypedBufGuardMut::Slice(s) => s,
+            TypedBufGuardMut::Interior(r) => r,
         }
     }
 }
 
 impl<'a> CpuBinding<'a> {
-    pub fn as_buf(&self) -> CpuBufGuard {
+    pub fn as_typed<T: Pod>(&self) -> TypedBufGuard<T> {
         match self {
-            CpuBinding::Buffer(b) => CpuBufGuard::Slice(b),
-            CpuBinding::BufferRW(b) => CpuBufGuard::Interior(b.borrow_mut()),
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::from_bytes(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::from_bytes(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_typed_mut<T: Pod>(&self) -> TypedBufGuardMut<T> {
+        match self {
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::from_bytes_mut(buf)
+                }))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice<T: Pod>(&self) -> TypedBufGuard<[T]> {
+        match self {
+            CpuBinding::Buffer(b) => TypedBufGuard::Slice(bytemuck::cast_slice(b)),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuard::Interior(Ref::map(b.borrow(), |buf| bytemuck::cast_slice(buf)))
+            }
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_slice_mut<T: Pod>(&self) -> TypedBufGuardMut<[T]> {
+        match self {
+            CpuBinding::Buffer(_) => panic!("can't borrow external buffer mutably"),
+            CpuBinding::BufferRW(b) => {
+                TypedBufGuardMut::Interior(RefMut::map(b.borrow_mut(), |buf| {
+                    bytemuck::cast_slice_mut(buf)
+                }))
+            }
             _ => panic!("resource type mismatch"),
         }
     }
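A quick sketch of how these typed guards are used, mirroring the pattern in the new CPU shaders below. The buffer setup is hypothetical, and it assumes `BufferRW` wraps an `&RefCell<Vec<u8>>` (as the `borrow`/`borrow_mut` calls suggest):

```rust
use std::cell::RefCell;

use vello_encoding::Tile;

// Hypothetical example; `tile_bytes` aliases a zero-initialized,
// Tile-aligned byte buffer.
fn example(tile_bytes: &RefCell<Vec<u8>>) {
    let binding = CpuBinding::BufferRW(tile_bytes);
    {
        // Mutable typed view; holds the RefCell borrow until dropped.
        let mut tiles = binding.as_slice_mut::<Tile>();
        tiles[0].backdrop += 1;
    }
    // A shared typed view is fine once the mutable guard is gone.
    let tiles = binding.as_slice::<Tile>();
    assert_eq!(tiles[0].backdrop, 1);
}
```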
diff --git a/src/cpu_shader/backdrop.rs b/src/cpu_shader/backdrop.rs
new file mode 100644
index 0000000..746efdc
--- /dev/null
+++ b/src/cpu_shader/backdrop.rs
@@ -0,0 +1,30 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, Path, Tile};
+
+use crate::cpu_dispatch::CpuBinding;
+
+fn backdrop_main(config: &ConfigUniform, paths: &[Path], tiles: &mut [Tile]) {
+    for drawobj_ix in 0..config.layout.n_draw_objects {
+        let path = paths[drawobj_ix as usize];
+        let width = path.bbox[2] - path.bbox[0];
+        let height = path.bbox[3] - path.bbox[1];
+        let base = path.tiles;
+        for y in 0..height {
+            let mut sum = 0;
+            for x in 0..width {
+                let tile = &mut tiles[(base + y * width + x) as usize];
+                sum += tile.backdrop;
+                tile.backdrop = sum;
+            }
+        }
+    }
+}
+
+pub fn backdrop(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let paths = resources[1].as_slice();
+    let mut tiles = resources[2].as_slice_mut();
+    backdrop_main(&config, &paths, &mut tiles);
+}
diff --git a/src/cpu_shader/bbox_clear.rs b/src/cpu_shader/bbox_clear.rs
new file mode 100644
index 0000000..1e02127
--- /dev/null
+++ b/src/cpu_shader/bbox_clear.rs
@@ -0,0 +1,21 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+fn bbox_clear_main(config: &ConfigUniform, path_bboxes: &mut [PathBbox]) {
+    for i in 0..(config.layout.n_paths as usize) {
+        path_bboxes[i].x0 = 0x7fff_ffff;
+        path_bboxes[i].y0 = 0x7fff_ffff;
+        path_bboxes[i].x1 = -0x8000_0000;
+        path_bboxes[i].y1 = -0x8000_0000;
+    }
+}
+
+pub fn bbox_clear(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let mut path_bboxes = resources[1].as_slice_mut();
+    bbox_clear_main(&config, &mut path_bboxes);
+}
diff --git a/src/cpu_shader/binning.rs b/src/cpu_shader/binning.rs
new file mode 100644
index 0000000..5ace850
--- /dev/null
+++ b/src/cpu_shader/binning.rs
@@ -0,0 +1,128 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+const TILE_WIDTH: usize = 16;
+const TILE_HEIGHT: usize = 16;
+const N_TILE_X: usize = 16;
+const N_TILE_Y: usize = 16;
+const SX: f32 = 1.0 / ((N_TILE_X * TILE_WIDTH) as f32);
+const SY: f32 = 1.0 / ((N_TILE_Y * TILE_HEIGHT) as f32);
+
+fn bbox_intersect(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    [
+        a[0].max(b[0]),
+        a[1].max(b[1]),
+        a[2].min(b[2]),
+        a[3].min(b[3]),
+    ]
+}
+
+fn binning_main(
+    n_wg: u32,
+    config: &ConfigUniform,
+    draw_monoids: &[DrawMonoid],
+    path_bbox_buf: &[PathBbox],
+    clip_bbox_buf: &[[f32; 4]],
+    intersected_bbox: &mut [[f32; 4]],
+    bump: &mut BumpAllocators,
+    bin_data: &mut [u32],
+    bin_header: &mut [BinHeader],
+) {
+    for wg in 0..n_wg as usize {
+        let mut counts = [0; WG_SIZE];
+        let mut bboxes = [[0, 0, 0, 0]; WG_SIZE];
+        let width_in_bins =
+            ((config.width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32) as i32;
+        let height_in_bins =
+            ((config.height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32) as i32;
+        for local_ix in 0..WG_SIZE {
+            let element_ix = wg * WG_SIZE + local_ix;
+            let mut x0 = 0;
+            let mut y0 = 0;
+            let mut x1 = 0;
+            let mut y1 = 0;
+            if element_ix < config.layout.n_draw_objects as usize {
+                let draw_monoid = draw_monoids[element_ix];
+                let mut clip_bbox = [-1e9, -1e9, 1e9, 1e9];
+                if draw_monoid.clip_ix > 0 {
+                    assert!(draw_monoid.clip_ix - 1 < config.layout.n_clips);
+                    clip_bbox = clip_bbox_buf[draw_monoid.clip_ix as usize - 1];
+                }
+                let path_bbox = path_bbox_buf[draw_monoid.path_ix as usize];
+                let pb = [
+                    path_bbox.x0 as f32,
+                    path_bbox.y0 as f32,
+                    path_bbox.x1 as f32,
+                    path_bbox.y1 as f32,
+                ];
+                let bbox = bbox_intersect(clip_bbox, pb);
+                intersected_bbox[element_ix] = bbox;
+                if bbox[0] < bbox[2] && bbox[1] < bbox[3] {
+                    x0 = (bbox[0] * SX).floor() as i32;
+                    y0 = (bbox[1] * SY).floor() as i32;
+                    x1 = (bbox[2] * SX).ceil() as i32;
+                    y1 = (bbox[3] * SY).ceil() as i32;
+                }
+            }
+            x0 = x0.clamp(0, width_in_bins);
+            y0 = y0.clamp(0, height_in_bins);
+            x1 = x1.clamp(0, width_in_bins);
+            y1 = y1.clamp(0, height_in_bins);
+            for y in y0..y1 {
+                for x in x0..x1 {
+                    counts[(y * width_in_bins + x) as usize] += 1;
+                }
+            }
+            bboxes[local_ix] = [x0, y0, x1, y1];
+        }
+        let mut chunk_offset = [0; WG_SIZE];
+        for local_ix in 0..WG_SIZE {
+            let global_ix = wg * WG_SIZE + local_ix;
+            chunk_offset[local_ix] = bump.binning;
+            bump.binning += counts[local_ix];
+            bin_header[global_ix] = BinHeader {
+                element_count: counts[local_ix],
+                chunk_offset: chunk_offset[local_ix],
+            };
+        }
+        for local_ix in 0..WG_SIZE {
+            let element_ix = wg * WG_SIZE + local_ix;
+            let bbox = bboxes[local_ix];
+            for y in bbox[1]..bbox[3] {
+                for x in bbox[0]..bbox[2] {
+                    let bin_ix = (y * width_in_bins + x) as usize;
+                    let ix = config.layout.bin_data_start + chunk_offset[bin_ix];
+                    bin_data[ix as usize] = element_ix as u32;
+                    chunk_offset[bin_ix] += 1;
+                }
+            }
+        }
+    }
+}
+
+pub fn binning(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let draw_monoids = resources[1].as_slice();
+    let path_bbox_buf = resources[2].as_slice();
+    let clip_bbox_buf = resources[3].as_slice();
+    let mut intersected_bbox = resources[4].as_slice_mut();
+    let mut bump = resources[5].as_typed_mut();
+    let mut bin_data = resources[6].as_slice_mut();
+    let mut bin_header = resources[7].as_slice_mut();
+    binning_main(
+        n_wg,
+        &config,
+        &draw_monoids,
+        &path_bbox_buf,
+        &clip_bbox_buf,
+        &mut intersected_bbox,
+        &mut bump,
+        &mut bin_data,
+        &mut bin_header,
+    );
+}
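The structure of `binning_main` is a classic two-pass bump allocation: count how many draw objects land in each bin, reserve a contiguous chunk per bin, then scatter element indices through per-bin write cursors. A small sketch of the pattern in isolation (names are illustrative, not from the patch):

```rust
/// Scatters `(bin, value)` items into per-bin chunks of one flat buffer.
/// Returns the data buffer and `(chunk_offset, element_count)` headers.
fn scatter_to_bins(items: &[(usize, u32)], n_bins: usize) -> (Vec<u32>, Vec<(u32, u32)>) {
    // Pass 1: count items per bin.
    let mut counts = vec![0u32; n_bins];
    for &(bin, _) in items {
        counts[bin] += 1;
    }
    // Reserve chunks with a bump allocator.
    let mut alloc = 0u32;
    let headers: Vec<(u32, u32)> = counts
        .iter()
        .map(|&c| {
            let offset = alloc;
            alloc += c;
            (offset, c)
        })
        .collect();
    // Pass 2: scatter using per-bin cursors.
    let mut cursor: Vec<u32> = headers.iter().map(|h| h.0).collect();
    let mut data = vec![0u32; alloc as usize];
    for &(bin, value) in items {
        data[cursor[bin] as usize] = value;
        cursor[bin] += 1;
    }
    (data, headers)
}
```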
diff --git a/src/cpu_shader/clip_leaf.rs b/src/cpu_shader/clip_leaf.rs
new file mode 100644
index 0000000..0f5fc61
--- /dev/null
+++ b/src/cpu_shader/clip_leaf.rs
@@ -0,0 +1,86 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{Clip, ConfigUniform, DrawMonoid, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+struct ClipStackElement {
+    // index of draw object
+    parent_ix: u32,
+    path_ix: u32,
+    bbox: [f32; 4],
+}
+
+const BIG_BBOX: [f32; 4] = [-1e9, -1e9, 1e9, 1e9];
+
+// Note: this implementation doesn't rigorously follow the
+// WGSL original. In particular, it just computes the clips
+// sequentially rather than using the partition reductions.
+fn clip_leaf_main(
+    config: &ConfigUniform,
+    clip_inp: &[Clip],
+    path_bboxes: &[PathBbox],
+    draw_monoids: &mut [DrawMonoid],
+    clip_bboxes: &mut [[f32; 4]],
+) {
+    let mut stack: Vec<ClipStackElement> = Vec::new();
+    for global_ix in 0..config.layout.n_clips {
+        let clip_el = clip_inp[global_ix as usize];
+        if clip_el.path_ix >= 0 {
+            // begin clip
+            let path_ix = clip_el.path_ix as u32;
+            let path_bbox = path_bboxes[path_ix as usize];
+            let p_bbox = [
+                path_bbox.x0 as f32,
+                path_bbox.y0 as f32,
+                path_bbox.x1 as f32,
+                path_bbox.y1 as f32,
+            ];
+            let bbox = if let Some(last) = stack.last() {
+                [
+                    p_bbox[0].max(last.bbox[0]),
+                    p_bbox[1].max(last.bbox[1]),
+                    p_bbox[2].min(last.bbox[2]),
+                    p_bbox[3].min(last.bbox[3]),
+                ]
+            } else {
+                p_bbox
+            };
+            clip_bboxes[global_ix as usize] = bbox;
+            let parent_ix = clip_el.ix;
+            stack.push(ClipStackElement {
+                parent_ix,
+                path_ix,
+                bbox,
+            });
+        } else {
+            // end clip
+            let tos = stack.pop().unwrap();
+            let bbox = if let Some(nos) = stack.last() {
+                nos.bbox
+            } else {
+                BIG_BBOX
+            };
+            clip_bboxes[global_ix as usize] = bbox;
+            draw_monoids[clip_el.ix as usize].path_ix = tos.path_ix;
+            draw_monoids[clip_el.ix as usize].scene_offset =
+                draw_monoids[tos.parent_ix as usize].scene_offset;
+        }
+    }
+}
+
+pub fn clip_leaf(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let clip_inp = resources[1].as_slice();
+    let path_bboxes = resources[2].as_slice();
+    let mut draw_monoids = resources[5].as_slice_mut();
+    let mut clip_bboxes = resources[6].as_slice_mut();
+    clip_leaf_main(
+        &config,
+        &clip_inp,
+        &path_bboxes,
+        &mut draw_monoids,
+        &mut clip_bboxes,
+    );
+}
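A short illustration of the stack discipline implemented above: each begin-clip pushes the intersection of its path bbox with the current top, and each end-clip restores the next-on-stack bbox. A minimal sketch (standalone, not part of the patch):

```rust
fn intersect(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0].max(b[0]), a[1].max(b[1]), a[2].min(b[2]), a[3].min(b[3])]
}

fn main() {
    let mut stack: Vec<[f32; 4]> = Vec::new();
    // BEGIN_CLIP with path bbox A.
    let a = [0.0, 0.0, 100.0, 100.0];
    stack.push(a);
    // BEGIN_CLIP with path bbox B: the effective clip is A ∩ B.
    let b = [50.0, 50.0, 200.0, 200.0];
    let top = intersect(*stack.last().unwrap(), b);
    stack.push(top);
    assert_eq!(top, [50.0, 50.0, 100.0, 100.0]);
    // END_CLIP: the clip bbox written is the next-on-stack (A again).
    stack.pop();
    assert_eq!(*stack.last().unwrap(), a);
}
```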
diff --git a/src/cpu_shader/clip_reduce.rs b/src/cpu_shader/clip_reduce.rs
new file mode 100644
index 0000000..fc30661
--- /dev/null
+++ b/src/cpu_shader/clip_reduce.rs
@@ -0,0 +1,56 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{Clip, ClipBic, ClipElement, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn clip_reduce_main(
+    n_wg: u32,
+    clip_inp: &[Clip],
+    path_bboxes: &[PathBbox],
+    reduced: &mut [ClipBic],
+    clip_out: &mut [ClipElement],
+) {
+    let mut scratch = Vec::with_capacity(WG_SIZE);
+    for wg_ix in 0..n_wg {
+        scratch.clear();
+        let mut bic_reduced = ClipBic::default();
+        // reverse scan
+        for local_ix in (0..WG_SIZE).rev() {
+            let global_ix = wg_ix as usize * WG_SIZE + local_ix;
+            let inp = clip_inp[global_ix].path_ix;
+            let is_push = inp >= 0;
+            let bic = ClipBic::new(1 - is_push as u32, is_push as u32);
+            bic_reduced = bic.combine(bic_reduced);
+            if is_push && bic_reduced.a == 0 {
+                scratch.push(global_ix as u32);
+            }
+        }
+        reduced[wg_ix as usize] = bic_reduced;
+        for (i, parent_ix) in scratch.iter().rev().enumerate() {
+            let mut clip_el = ClipElement::default();
+            clip_el.parent_ix = *parent_ix;
+            let path_ix = clip_inp[*parent_ix as usize].path_ix;
+            let path_bbox = path_bboxes[path_ix as usize];
+            clip_el.bbox = [
+                path_bbox.x0 as f32,
+                path_bbox.y0 as f32,
+                path_bbox.x1 as f32,
+                path_bbox.y1 as f32,
+            ];
+            let global_ix = wg_ix as usize * WG_SIZE + i;
+            clip_out[global_ix] = clip_el;
+        }
+    }
+}
+
+pub fn clip_reduce(n_wg: u32, resources: &[CpuBinding]) {
+    let clip_inp = resources[0].as_slice();
+    let path_bboxes = resources[1].as_slice();
+    let mut reduced = resources[2].as_slice_mut();
+    let mut clip_out = resources[3].as_slice_mut();
+    clip_reduce_main(n_wg, &clip_inp, &path_bboxes, &mut reduced, &mut clip_out);
+}
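The reduction above treats begin/end clips as a parenthesis-matching problem: `ClipBic::new(a, b)` holds `a` unmatched closes and `b` unmatched opens, and `combine` cancels opens on the left against closes on the right, which is why `bic_reduced.a == 0` during the reverse scan identifies a push with no later matching pop. A sketch of the bicyclic combine as I understand the `vello_encoding` type (an assumption; the authoritative definition lives in that crate):

```rust
#[derive(Clone, Copy, Default)]
struct ClipBic {
    a: u32, // unmatched END_CLIPs
    b: u32, // unmatched BEGIN_CLIPs
}

impl ClipBic {
    fn new(a: u32, b: u32) -> Self {
        ClipBic { a, b }
    }

    // Bicyclic-semigroup combine: opens from `self` cancel closes
    // from `other`; whatever doesn't match carries through.
    fn combine(self, other: ClipBic) -> ClipBic {
        let m = self.b.min(other.a);
        ClipBic {
            a: self.a + (other.a - m),
            b: other.b + (self.b - m),
        }
    }
}

fn main() {
    // "((" followed by "))" reduces to the identity.
    let r = ClipBic::new(0, 2).combine(ClipBic::new(2, 0));
    assert_eq!((r.a, r.b), (0, 0));
}
```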
diff --git a/src/cpu_shader/coarse.rs b/src/cpu_shader/coarse.rs
new file mode 100644
index 0000000..390df7f
--- /dev/null
+++ b/src/cpu_shader/coarse.rs
@@ -0,0 +1,344 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, DrawTag, Path, Tile};
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::{
+    CMD_BEGIN_CLIP, CMD_COLOR, CMD_END, CMD_END_CLIP, CMD_FILL, CMD_IMAGE, CMD_JUMP, CMD_LIN_GRAD,
+    CMD_RAD_GRAD, CMD_SOLID, PTCL_INITIAL_ALLOC,
+};
+
+const N_TILE_X: usize = 16;
+const N_TILE_Y: usize = 16;
+const N_TILE: usize = N_TILE_X * N_TILE_Y;
+
+const PTCL_INCREMENT: u32 = 256;
+const PTCL_HEADROOM: u32 = 2;
+
+// Modeled in the WGSL as private-scoped variables
+struct TileState {
+    cmd_offset: u32,
+    cmd_limit: u32,
+}
+
+impl TileState {
+    fn new(tile_ix: u32) -> TileState {
+        let cmd_offset = tile_ix * PTCL_INITIAL_ALLOC;
+        let cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM);
+        TileState {
+            cmd_offset,
+            cmd_limit,
+        }
+    }
+
+    fn alloc_cmd(
+        &mut self,
+        size: u32,
+        config: &ConfigUniform,
+        bump: &mut BumpAllocators,
+        ptcl: &mut [u32],
+    ) {
+        if self.cmd_offset + size >= self.cmd_limit {
+            let ptcl_dyn_start =
+                config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC;
+            let chunk_size = PTCL_INCREMENT.max(size + PTCL_HEADROOM);
+            let new_cmd = ptcl_dyn_start + bump.ptcl;
+            bump.ptcl += chunk_size;
+            ptcl[self.cmd_offset as usize] = CMD_JUMP;
+            ptcl[self.cmd_offset as usize + 1] = new_cmd;
+            self.cmd_offset = new_cmd;
+            self.cmd_limit = new_cmd + (PTCL_INCREMENT - PTCL_HEADROOM);
+        }
+    }
+
+    fn write(&mut self, ptcl: &mut [u32], offset: u32, value: u32) {
+        ptcl[(self.cmd_offset + offset) as usize] = value;
+    }
+
+    fn write_path(
+        &mut self,
+        config: &ConfigUniform,
+        bump: &mut BumpAllocators,
+        ptcl: &mut [u32],
+        tile: &mut Tile,
+    ) {
+        let n_segs = tile.segment_count_or_ix;
+        if n_segs != 0 {
+            let seg_ix = bump.segments;
+            tile.segment_count_or_ix = !seg_ix;
+            bump.segments += n_segs;
+            self.alloc_cmd(4, config, bump, ptcl);
+            self.write(ptcl, 0, CMD_FILL);
+            let even_odd = false; // TODO
+            let size_and_rule = (n_segs << 1) | (even_odd as u32);
+            self.write(ptcl, 1, size_and_rule);
+            self.write(ptcl, 2, seg_ix);
+            self.write(ptcl, 3, tile.backdrop as u32);
+            self.cmd_offset += 4;
+        } else {
+            self.alloc_cmd(1, config, bump, ptcl);
+            self.write(ptcl, 0, CMD_SOLID);
+            self.cmd_offset += 1;
+        }
+    }
+
+    fn write_color(
+        &mut self,
+        config: &ConfigUniform,
+        bump: &mut BumpAllocators,
+        ptcl: &mut [u32],
+        rgba_color: u32,
+    ) {
+        self.alloc_cmd(2, config, bump, ptcl);
+        self.write(ptcl, 0, CMD_COLOR);
+        self.write(ptcl, 1, rgba_color);
+        self.cmd_offset += 2;
+    }
+
+    fn write_image(
+        &mut self,
+        config: &ConfigUniform,
+        bump: &mut BumpAllocators,
+        ptcl: &mut [u32],
+        info_offset: u32,
+    ) {
+        self.alloc_cmd(2, config, bump, ptcl);
+        self.write(ptcl, 0, CMD_IMAGE);
+        self.write(ptcl, 1, info_offset);
+        self.cmd_offset += 2;
+    }
+
+    fn write_grad(
+        &mut self,
+        config: &ConfigUniform,
+        bump: &mut BumpAllocators,
+        ptcl: &mut [u32],
+        ty: u32,
+        index: u32,
+        info_offset: u32,
+    ) {
+        self.alloc_cmd(3, config, bump, ptcl);
+        self.write(ptcl, 0, ty);
+        self.write(ptcl, 1, index);
+        self.write(ptcl, 2, info_offset);
+        self.cmd_offset += 3;
+    }
+
+    fn write_begin_clip(
+        &mut self,
+        config: &ConfigUniform,
+        bump: &mut BumpAllocators,
+        ptcl: &mut [u32],
+    ) {
+        self.alloc_cmd(1, config, bump, ptcl);
+        self.write(ptcl, 0, CMD_BEGIN_CLIP);
+        self.cmd_offset += 1;
+    }
+
+    fn write_end_clip(
+        &mut self,
+        config: &ConfigUniform,
+        bump: &mut BumpAllocators,
+        ptcl: &mut [u32],
+        blend: u32,
+        alpha: f32,
+    ) {
+        self.alloc_cmd(3, config, bump, ptcl);
+        self.write(ptcl, 0, CMD_END_CLIP);
+        self.write(ptcl, 1, blend);
+        self.write(ptcl, 2, f32::to_bits(alpha));
+        self.cmd_offset += 3;
+    }
+}
+
+fn coarse_main(
+    config: &ConfigUniform,
+    scene: &[u32],
+    draw_monoids: &[DrawMonoid],
+    bin_headers: &[BinHeader],
+    info_bin_data: &[u32],
+    paths: &[Path],
+    tiles: &mut [Tile],
+    bump: &mut BumpAllocators,
+    ptcl: &mut [u32],
+) {
+    let width_in_tiles = config.width_in_tiles;
+    let height_in_tiles = config.height_in_tiles;
+    let width_in_bins = (width_in_tiles + N_TILE_X as u32 - 1) / N_TILE_X as u32;
+    let height_in_bins = (height_in_tiles + N_TILE_Y as u32 - 1) / N_TILE_Y as u32;
+    let n_bins = width_in_bins * height_in_bins;
+    let bin_data_start = config.layout.bin_data_start;
+    let drawtag_base = config.layout.draw_tag_base;
+    let mut compacted = vec![vec![]; N_TILE];
+    let n_partitions = (config.layout.n_draw_objects + N_TILE as u32 - 1) / N_TILE as u32;
+    for bin in 0..n_bins {
+        for v in &mut compacted {
+            v.clear();
+        }
+        let bin_x = bin % width_in_bins;
+        let bin_y = bin / width_in_bins;
+        let bin_tile_x = N_TILE_X as u32 * bin_x;
+        let bin_tile_y = N_TILE_Y as u32 * bin_y;
+        for part in 0..n_partitions {
+            let in_ix = part * N_TILE as u32 + bin;
+            let bin_header = bin_headers[in_ix as usize];
+            let start = bin_data_start + bin_header.chunk_offset;
+            for i in 0..bin_header.element_count {
+                let drawobj_ix = info_bin_data[(start + i) as usize];
+                let tag = scene[(drawtag_base + drawobj_ix) as usize];
+                if DrawTag(tag) != DrawTag::NOP {
+                    let draw_monoid = draw_monoids[drawobj_ix as usize];
+                    let path_ix = draw_monoid.path_ix;
+                    let path = paths[path_ix as usize];
+                    let dx = path.bbox[0] as i32 - bin_tile_x as i32;
+                    let dy = path.bbox[1] as i32 - bin_tile_y as i32;
+                    let x0 = dx.clamp(0, N_TILE_X as i32);
+                    let y0 = dy.clamp(0, N_TILE_Y as i32);
+                    let x1 = (path.bbox[2] as i32 - bin_tile_x as i32).clamp(0, N_TILE_X as i32);
+                    let y1 = (path.bbox[3] as i32 - bin_tile_y as i32).clamp(0, N_TILE_Y as i32);
+                    for y in y0..y1 {
+                        for x in x0..x1 {
+                            compacted[(y * N_TILE_X as i32 + x) as usize].push(drawobj_ix);
+                        }
+                    }
+                }
+            }
+        }
+        // compacted now has the list of draw objects for each tile.
+        // While the WGSL source does at most 256 draw objects at a time,
+        // this version does all the draw objects in a tile.
+        for tile_ix in 0..N_TILE {
+            let tile_x = (tile_ix % N_TILE_X) as u32;
+            let tile_y = (tile_ix / N_TILE_X) as u32;
+            let this_tile_ix = (bin_tile_y + tile_y) * width_in_tiles + bin_tile_x + tile_x;
+            let mut tile_state = TileState::new(this_tile_ix);
+            let blend_offset = tile_state.cmd_offset;
+            tile_state.cmd_offset += 1;
+            let mut clip_depth = 0;
+            let mut clip_zero_depth = 0;
+            for drawobj_ix in &compacted[tile_ix] {
+                let drawtag = scene[(drawtag_base + drawobj_ix) as usize];
+                if clip_zero_depth == 0 {
+                    let draw_monoid = draw_monoids[*drawobj_ix as usize];
+                    let path_ix = draw_monoid.path_ix;
+                    let path = paths[path_ix as usize];
+                    let bbox = path.bbox;
+                    let stride = bbox[2] - bbox[0];
+                    let x = bin_tile_x + tile_x - bbox[0];
+                    let y = bin_tile_y + tile_y - bbox[1];
+                    let tile = &mut tiles[(path.tiles + y * stride + x) as usize];
+                    let is_clip = (drawtag & 1) != 0;
+                    let mut is_blend = false;
+                    let dd = config.layout.draw_data_base + draw_monoid.scene_offset;
+                    let di = draw_monoid.info_offset;
+                    if is_clip {
+                        const BLEND_CLIP: u32 = (128 << 8) | 3;
+                        let blend = scene[dd as usize];
+                        is_blend = blend != BLEND_CLIP;
+                    }
+                    let n_segs = tile.segment_count_or_ix;
+                    let include_tile = n_segs != 0 || (tile.backdrop == 0) == is_clip || is_blend;
+                    if include_tile {
+                        // TODO: get drawinfo (linewidth for fills)
+                        match DrawTag(drawtag) {
+                            DrawTag::COLOR => {
+                                tile_state.write_path(config, bump, ptcl, tile);
+                                let rgba_color = scene[dd as usize];
+                                tile_state.write_color(config, bump, ptcl, rgba_color);
+                            }
+                            DrawTag::IMAGE => {
+                                tile_state.write_path(config, bump, ptcl, tile);
+                                tile_state.write_image(config, bump, ptcl, di + 1);
+                            }
+                            DrawTag::LINEAR_GRADIENT => {
+                                tile_state.write_path(config, bump, ptcl, tile);
+                                let index = scene[dd as usize];
+                                tile_state.write_grad(
+                                    config,
+                                    bump,
+                                    ptcl,
+                                    CMD_LIN_GRAD,
+                                    index,
+                                    di + 1,
+                                );
+                            }
+                            DrawTag::RADIAL_GRADIENT => {
+                                tile_state.write_path(config, bump, ptcl, tile);
+                                let index = scene[dd as usize];
+                                tile_state.write_grad(
+                                    config,
+                                    bump,
+                                    ptcl,
+                                    CMD_RAD_GRAD,
+                                    index,
+                                    di + 1,
+                                );
+                            }
+                            DrawTag::BEGIN_CLIP => {
+                                if tile.segment_count_or_ix == 0 && tile.backdrop == 0 {
+                                    clip_zero_depth = clip_depth + 1;
+                                } else {
+                                    tile_state.write_begin_clip(config, bump, ptcl);
+                                    // TODO: update blend depth
+                                }
+                                clip_depth += 1;
+                            }
+                            DrawTag::END_CLIP => {
+                                clip_depth -= 1;
+                                tile_state.write_path(config, bump, ptcl, tile);
+                                let blend = scene[dd as usize];
+                                let alpha = f32::from_bits(scene[dd as usize + 1]);
+                                tile_state.write_end_clip(config, bump, ptcl, blend, alpha);
+                            }
+                            _ => todo!(),
+                        }
+                    }
+                } else {
+                    // In "clip zero" state, suppress all drawing
+                    match DrawTag(drawtag) {
+                        DrawTag::BEGIN_CLIP => clip_depth += 1,
+                        DrawTag::END_CLIP => {
+                            if clip_depth == clip_zero_depth {
+                                clip_zero_depth = 0;
+                            }
+                            clip_depth -= 1;
+                        }
+                        _ => (),
+                    }
+                }
+            }
+
+            if bin_tile_x + tile_x < width_in_tiles && bin_tile_y + tile_y < height_in_tiles {
+                ptcl[tile_state.cmd_offset as usize] = CMD_END;
+                let scratch_size = 0; // TODO: actually compute blend depth
+                ptcl[blend_offset as usize] = bump.blend;
+                bump.blend += scratch_size;
+            }
+        }
+    }
+}
+
+pub fn coarse(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let draw_monoids = resources[2].as_slice();
+    let bin_headers = resources[3].as_slice();
+    let info_bin_data = resources[4].as_slice();
+    let paths = resources[5].as_slice();
+    let mut tiles = resources[6].as_slice_mut();
+    let mut bump = resources[7].as_typed_mut();
+    let mut ptcl = resources[8].as_slice_mut();
+    coarse_main(
+        &config,
+        &scene,
+        &draw_monoids,
+        &bin_headers,
+        &info_bin_data,
+        &paths,
+        &mut tiles,
+        &mut bump,
+        &mut ptcl,
+    );
+}
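The writer side above pairs with a reader in fine rasterization: each tile's command list starts one word past `tile_ix * PTCL_INITIAL_ALLOC` (word 0 is the reserved blend slot written at the end of `coarse_main`), and `CMD_JUMP`'s operand is the offset of the next chunk. A reader sketch under those assumptions, with the dispatch bodies elided:

```rust
// Walks one tile's ptcl command list, following jumps, until CMD_END.
// Sketch only: real command decoding lives in fine rasterization.
fn walk_ptcl(ptcl: &[u32], tile_ix: u32) {
    let mut cmd_ix = tile_ix * PTCL_INITIAL_ALLOC + 1; // +1 skips the blend slot
    loop {
        match ptcl[cmd_ix as usize] {
            CMD_END => break,
            CMD_JUMP => cmd_ix = ptcl[cmd_ix as usize + 1],
            CMD_SOLID => cmd_ix += 1,
            CMD_COLOR => cmd_ix += 2, // command + rgba
            CMD_FILL => cmd_ix += 4,  // command + size_and_rule + seg_data + backdrop
            _ => unimplemented!("remaining commands advance by their own sizes"),
        }
    }
}
```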
diff --git a/src/cpu_shader/draw_leaf.rs b/src/cpu_shader/draw_leaf.rs
new file mode 100644
index 0000000..0aa779e
--- /dev/null
+++ b/src/cpu_shader/draw_leaf.rs
@@ -0,0 +1,168 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{Clip, ConfigUniform, DrawMonoid, DrawTag, Monoid, PathBbox};
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::{read_draw_tag_from_scene, Transform, Vec2};
+
+const WG_SIZE: usize = 256;
+
+fn draw_leaf_main(
+    n_wg: u32,
+    config: &ConfigUniform,
+    scene: &[u32],
+    reduced: &[DrawMonoid],
+    path_bbox: &[PathBbox],
+    draw_monoid: &mut [DrawMonoid],
+    info: &mut [u32],
+    clip_inp: &mut [Clip],
+) {
+    let mut prefix = DrawMonoid::default();
+    for i in 0..n_wg {
+        let mut m = prefix;
+        for j in 0..WG_SIZE {
+            let ix = i * WG_SIZE as u32 + j as u32;
+            let tag_raw = read_draw_tag_from_scene(config, scene, ix);
+            let tag_word = DrawTag(tag_raw);
+            // store exclusive prefix sum
+            if ix < config.layout.n_draw_objects {
+                draw_monoid[ix as usize] = m;
+            }
+            let m_next = m.combine(&DrawMonoid::new(tag_word));
+            let dd = config.layout.draw_data_base + m.scene_offset;
+            let di = m.info_offset as usize;
+            if tag_word == DrawTag::COLOR
+                || tag_word == DrawTag::LINEAR_GRADIENT
+                || tag_word == DrawTag::RADIAL_GRADIENT
+                || tag_word == DrawTag::IMAGE
+                || tag_word == DrawTag::BEGIN_CLIP
+            {
+                let bbox = path_bbox[m.path_ix as usize];
+                let transform = Transform::read(config.layout.transform_base, bbox.trans_ix, scene);
+                let linewidth = bbox.linewidth;
+                match tag_word {
+                    DrawTag::COLOR => {
+                        info[di] = f32::to_bits(linewidth);
+                    }
+                    DrawTag::LINEAR_GRADIENT => {
+                        info[di] = f32::to_bits(linewidth);
+                        let p0 = Vec2::new(
+                            f32::from_bits(scene[dd as usize + 1]),
+                            f32::from_bits(scene[dd as usize + 2]),
+                        );
+                        let p1 = Vec2::new(
+                            f32::from_bits(scene[dd as usize + 3]),
+                            f32::from_bits(scene[dd as usize + 4]),
+                        );
+                        let p0 = transform.apply(p0);
+                        let p1 = transform.apply(p1);
+                        let dxy = p1 - p0;
+                        let scale = 1.0 / dxy.dot(dxy);
+                        let line_xy = dxy * scale;
+                        let line_c = -p0.dot(line_xy);
+                        info[di + 1] = f32::to_bits(line_xy.x);
+                        info[di + 2] = f32::to_bits(line_xy.y);
+                        info[di + 3] = f32::to_bits(line_c);
+                    }
+                    DrawTag::RADIAL_GRADIENT => {
+                        info[di] = f32::to_bits(linewidth);
+                        let p0 = Vec2::new(
+                            f32::from_bits(scene[dd as usize + 1]),
+                            f32::from_bits(scene[dd as usize + 2]),
+                        );
+                        let p1 = Vec2::new(
+                            f32::from_bits(scene[dd as usize + 3]),
+                            f32::from_bits(scene[dd as usize + 4]),
+                        );
+                        let r0 = f32::from_bits(scene[dd as usize + 5]);
+                        let r1 = f32::from_bits(scene[dd as usize + 6]);
+                        let z = transform.0;
+                        let inv_det = (z[0] * z[3] - z[1] * z[2]).recip();
+                        let inv_mat = [
+                            z[3] * inv_det,
+                            -z[1] * inv_det,
+                            -z[2] * inv_det,
+                            z[0] * inv_det,
+                        ];
+                        let inv_tr = [
+                            -(inv_mat[0] * z[4] + inv_mat[2] * z[5]) - p0.x,
+                            -(inv_mat[1] * z[4] + inv_mat[3] * z[5]) - p0.y,
+                        ];
+                        let center1 = p1 - p0;
+                        let rr = r1 / (r1 - r0);
+                        let ra_inv = rr / (r1 * r1 - center1.dot(center1));
+                        let c1 = center1 * ra_inv;
+                        let ra = rr * ra_inv;
+                        let roff = rr - 1.0;
+                        info[di + 1] = f32::to_bits(inv_mat[0]);
+                        info[di + 2] = f32::to_bits(inv_mat[1]);
+                        info[di + 3] = f32::to_bits(inv_mat[2]);
+                        info[di + 4] = f32::to_bits(inv_mat[3]);
+                        info[di + 5] = f32::to_bits(inv_tr[0]);
+                        info[di + 6] = f32::to_bits(inv_tr[1]);
+                        info[di + 7] = f32::to_bits(c1.x);
+                        info[di + 8] = f32::to_bits(c1.y);
+                        info[di + 9] = f32::to_bits(ra);
+                        info[di + 10] = f32::to_bits(roff);
+                    }
+                    DrawTag::IMAGE => {
+                        info[di] = f32::to_bits(linewidth);
+                        let z = transform.0;
+                        let inv_det = (z[0] * z[3] - z[1] * z[2]).recip();
+                        let inv_mat = [
+                            z[3] * inv_det,
+                            -z[1] * inv_det,
+                            -z[2] * inv_det,
+                            z[0] * inv_det,
+                        ];
+                        let inv_tr = [
+                            -(inv_mat[0] * z[4] + inv_mat[2] * z[5]),
+                            -(inv_mat[1] * z[4] + inv_mat[3] * z[5]),
+                        ];
+                        info[di + 1] = f32::to_bits(inv_mat[0]);
+                        info[di + 2] = f32::to_bits(inv_mat[1]);
+                        info[di + 3] = f32::to_bits(inv_mat[2]);
+                        info[di + 4] = f32::to_bits(inv_mat[3]);
+                        info[di + 5] = f32::to_bits(inv_tr[0]);
+                        info[di + 6] = f32::to_bits(inv_tr[1]);
+                        info[di + 7] = scene[dd as usize];
+                        info[di + 8] = scene[dd as usize + 1];
+                    }
+                    DrawTag::BEGIN_CLIP => (),
+                    _ => todo!("unhandled draw tag {:x}", tag_word.0),
+                }
+            }
+            if tag_word == DrawTag::BEGIN_CLIP {
+                let path_ix = m.path_ix as i32;
+                clip_inp[m.clip_ix as usize] = Clip { ix, path_ix };
+            } else if tag_word == DrawTag::END_CLIP {
+                let path_ix = !ix as i32;
+                clip_inp[m.clip_ix as usize] = Clip { ix, path_ix };
+            }
+            m = m_next;
+        }
+        prefix = prefix.combine(&reduced[i as usize]);
+    }
+}
+
+pub fn draw_leaf(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let reduced = resources[2].as_slice();
+    let path_bbox = resources[3].as_slice();
+    let mut draw_monoid = resources[4].as_slice_mut();
+    let mut info = resources[5].as_slice_mut();
+    let mut clip_inp = resources[6].as_slice_mut();
+    draw_leaf_main(
+        n_wg,
+        &config,
+        &scene,
+        &reduced,
+        &path_bbox,
+        &mut draw_monoid,
+        &mut info,
+        &mut clip_inp,
+    );
+}
diff --git a/src/cpu_shader/draw_reduce.rs b/src/cpu_shader/draw_reduce.rs
new file mode 100644
index 0000000..61c338c
--- /dev/null
+++ b/src/cpu_shader/draw_reduce.rs
@@ -0,0 +1,29 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, DrawMonoid, DrawTag, Monoid};
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::read_draw_tag_from_scene;
+
+const WG_SIZE: usize = 256;
+
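+// Reduce draw tags into one DrawMonoid per workgroup-sized chunk; draw_leaf
+// uses these reductions as per-chunk prefixes for its scan.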
+fn draw_reduce_main(n_wg: u32, config: &ConfigUniform, scene: &[u32], reduced: &mut [DrawMonoid]) {
+    for i in 0..n_wg {
+        let mut m = DrawMonoid::default();
+        for j in 0..WG_SIZE {
+            let ix = i * WG_SIZE as u32 + j as u32;
+            let tag = read_draw_tag_from_scene(config, scene, ix);
+            m = m.combine(&DrawMonoid::new(DrawTag(tag)));
+        }
+        reduced[i as usize] = m;
+    }
+}
+
+pub fn draw_reduce(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let mut reduced = resources[2].as_slice_mut();
+    draw_reduce_main(n_wg, &config, &scene, &mut reduced);
+}
diff --git a/src/cpu_shader/fine.rs b/src/cpu_shader/fine.rs
new file mode 100644
index 0000000..c64c876
--- /dev/null
+++ b/src/cpu_shader/fine.rs
@@ -0,0 +1,188 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{ConfigUniform, PathSegment, Tile};
+
+use crate::cpu_dispatch::CpuTexture;
+
+use super::{CMD_COLOR, CMD_END, CMD_FILL, CMD_JUMP, CMD_SOLID, PTCL_INITIAL_ALLOC};
+
+// These tile constants should also move into a common area, alongside the
+// constants in mod.rs
+const TILE_WIDTH: usize = 16;
+const TILE_HEIGHT: usize = 16;
+const TILE_SIZE: usize = TILE_WIDTH * TILE_HEIGHT;
+
+fn read_color(ptcl: &[u32], offset: u32) -> u32 {
+    ptcl[(offset + 1) as usize]
+}
+
+struct CmdFill {
+    size_and_rule: u32,
+    seg_data: u32,
+    backdrop: i32,
+}
+
+fn read_fill(ptcl: &[u32], offset: u32) -> CmdFill {
+    let size_and_rule = ptcl[(offset + 1) as usize];
+    let seg_data = ptcl[(offset + 2) as usize];
+    let backdrop = ptcl[(offset + 3) as usize] as i32;
+    CmdFill {
+        size_and_rule,
+        seg_data,
+        backdrop,
+    }
+}
+
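+/// Unpack a u32 into four unorm f32 components, lowest byte first; the CPU
+/// equivalent of WGSL's `unpack4x8unorm`.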
+fn unpack4x8unorm(x: u32) -> [f32; 4] {
+    let mut result = [0.0; 4];
+    for i in 0..4 {
+        result[i] = ((x >> (i * 8)) & 0xff) as f32 * (1.0 / 255.0);
+    }
+    result
+}
+
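+/// Pack four f32 components in [0, 1] into a u32, lowest byte first; the CPU
+/// equivalent of WGSL's `pack4x8unorm`.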
+fn pack4x8unorm(x: [f32; 4]) -> u32 {
+    let mut result = 0;
+    for i in 0..4 {
+        let byte = (x[i].clamp(0.0, 1.0) * 255.0).round() as u32;
+        result |= byte << (i * 8);
+    }
+    result
+}
+
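+/// Compute per-pixel coverage for one tile: start from the accumulated
+/// backdrop, add the signed trapezoidal area contributed by each segment
+/// crossing, then resolve winding numbers with the fill rule.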
+fn fill_path(area: &mut [f32], segments: &[PathSegment], fill: &CmdFill, x_tile: f32, y_tile: f32) {
+    let n_segs = fill.size_and_rule >> 1;
+    let even_odd = (fill.size_and_rule & 1) != 0;
+    let backdrop_f = fill.backdrop as f32;
+    for a in area.iter_mut() {
+        *a = backdrop_f;
+    }
+    for segment in &segments[fill.seg_data as usize..][..n_segs as usize] {
+        for yi in 0..TILE_HEIGHT {
+            let y = segment.origin[1] - (y_tile + yi as f32);
+            let y0 = y.clamp(0.0, 1.0);
+            let y1 = (y + segment.delta[1]).clamp(0.0, 1.0);
+            let dy = y0 - y1;
+            let y_edge = segment.delta[0].signum()
+                * (y_tile + yi as f32 - segment.y_edge + 1.0).clamp(0.0, 1.0);
+            if dy != 0.0 {
+                let vec_y_recip = segment.delta[1].recip();
+                let t0 = (y0 - y) * vec_y_recip;
+                let t1 = (y1 - y) * vec_y_recip;
+                let startx = segment.origin[0] - x_tile;
+                let x0 = startx + t0 * segment.delta[0];
+                let x1 = startx + t1 * segment.delta[0];
+                let xmin0 = x0.min(x1);
+                let xmax0 = x0.max(x1);
+                for i in 0..TILE_WIDTH {
+                    let i_f = i as f32;
+                    let xmin = (xmin0 - i_f).min(1.0) - 1.0e-6;
+                    let xmax = xmax0 - i_f;
+                    let b = xmax.min(1.0);
+                    let c = b.max(0.0);
+                    let d = xmin.max(0.0);
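+                    // Clamp the crossing to the unit pixel and integrate,
+                    // giving this pixel's mean fractional coverage `a`.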
+                    let a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
+                    area[yi * TILE_WIDTH + i] += y_edge + a * dy;
+                }
+            } else if y_edge != 0.0 {
+                for i in 0..TILE_WIDTH {
+                    area[yi * TILE_WIDTH + i] += y_edge;
+                }
+            }
+        }
+    }
+    if even_odd {
+        for a in area.iter_mut() {
+            *a = (*a - 2.0 * (0.5 * *a).round()).abs();
+        }
+    } else {
+        for a in area.iter_mut() {
+            *a = a.abs().min(1.0);
+        }
+    }
+}
+
+// Note: this is a draft. Texture resources are not yet wired up, so it
+// has not yet been tested.
+#[allow(unused)]
+fn fine_main(
+    config: &ConfigUniform,
+    tiles: &[Tile],
+    segments: &[PathSegment],
+    output: &mut CpuTexture,
+    ptcl: &[u32],
+    info: &[u32],
+    // TODO: image texture resources
+    // TODO: masks?
+) {
+    let width_in_tiles = config.width_in_tiles;
+    let height_in_tiles = config.height_in_tiles;
+    let n_tiles = width_in_tiles * height_in_tiles;
+    let mut area = vec![0.0f32; TILE_SIZE];
+    let mut rgba = vec![[0.0f32; 4]; TILE_SIZE];
+    for tile_ix in 0..n_tiles {
+        for x in &mut rgba {
+            *x = [0.0; 4];
+        }
+        for a in &mut area {
+            *a = 0.0;
+        }
+        let tile_x = tile_ix % width_in_tiles;
+        let tile_y = tile_ix / width_in_tiles;
+        let mut cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
+        // skip over blend stack allocation
+        cmd_ix += 1;
+        loop {
+            let tag = ptcl[cmd_ix as usize];
+            if tag == CMD_END {
+                break;
+            }
+            match tag {
+                CMD_FILL => {
+                    let fill = read_fill(ptcl, cmd_ix);
+                    // x0 and y0 will go away when we do tile-relative coords
+                    let x0 = (tile_x as usize * TILE_WIDTH) as f32;
+                    let y0 = (tile_y as usize * TILE_HEIGHT) as f32;
+                    fill_path(&mut area, segments, &fill, x0, y0);
+                    cmd_ix += 4;
+                }
+                CMD_SOLID => {
+                    for a in &mut area {
+                        *a = 1.0;
+                    }
+                    cmd_ix += 2;
+                }
+                CMD_COLOR => {
+                    let color = read_color(ptcl, cmd_ix);
+                    let fg = unpack4x8unorm(color);
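+                    // unpack4x8unorm yields components lowest byte first;
+                    // reverse to RGBA order.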
+                    let fg = [fg[3], fg[2], fg[1], fg[0]];
+                    for i in 0..TILE_SIZE {
+                        let ai = area[i];
+                        let fg_i = [fg[0] * ai, fg[1] * ai, fg[2] * ai, fg[3] * ai];
+                        for j in 0..4 {
+                            rgba[i][j] = rgba[i][j] * (1.0 - fg_i[3]) + fg_i[j];
+                        }
+                    }
+                    cmd_ix += 2;
+                }
+                CMD_JUMP => {
+                    cmd_ix = ptcl[(cmd_ix + 1) as usize];
+                }
+                _ => todo!("unhandled ptcl command {tag}"),
+            }
+        }
+        // Write tile (in rgba)
+        for y in 0..TILE_HEIGHT {
+            let base =
+                output.width * (tile_y as usize * TILE_HEIGHT + y) + tile_x as usize * TILE_WIDTH;
+            for x in 0..TILE_WIDTH {
+                let rgba32 = pack4x8unorm(rgba[y * TILE_WIDTH + x]);
+                output.pixels[base + x] = rgba32;
+            }
+        }
+    }
+}
diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs
new file mode 100644
index 0000000..2cdf725
--- /dev/null
+++ b/src/cpu_shader/flatten.rs
@@ -0,0 +1,299 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::{Transform, Vec2};
+use vello_encoding::{BumpAllocators, ConfigUniform, LineSoup, Monoid, PathBbox, PathMonoid};
+
+fn to_minus_one_quarter(x: f32) -> f32 {
+    // could also be written x.powf(-0.25)
+    x.sqrt().sqrt().recip()
+}
+
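+// Approximation to the integral of (1 + 4x^2)^(-1/4) dx, used when flattening
+// parabolas so that subdivision points are spaced evenly by error.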
+const D: f32 = 0.67;
+fn approx_parabola_integral(x: f32) -> f32 {
+    x * to_minus_one_quarter(1.0 - D + (D * D * D * D + 0.25 * x * x))
+}
+
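+// Approximate inverse of `approx_parabola_integral`.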
+const B: f32 = 0.39;
+fn approx_parabola_inv_integral(x: f32) -> f32 {
+    x * (1.0 - B + (B * B + 0.5 * x * x)).sqrt()
+}
+
+#[derive(Clone, Copy, Default)]
+struct SubdivResult {
+    val: f32,
+    a0: f32,
+    a2: f32,
+}
+
+fn estimate_subdiv(p0: Vec2, p1: Vec2, p2: Vec2, sqrt_tol: f32) -> SubdivResult {
+    let d01 = p1 - p0;
+    let d12 = p2 - p1;
+    let dd = d01 - d12;
+    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
+    let cross_inv = if cross.abs() < 1.0e-9 {
+        1.0e9
+    } else {
+        cross.recip()
+    };
+    let x0 = d01.dot(dd) * cross_inv;
+    let x2 = d12.dot(dd) * cross_inv;
+    let scale = (cross / (dd.length() * (x2 - x0))).abs();
+    let a0 = approx_parabola_integral(x0);
+    let a2 = approx_parabola_integral(x2);
+    let mut val = 0.0;
+    if scale < 1e9 {
+        let da = (a2 - a0).abs();
+        let sqrt_scale = scale.sqrt();
+        if x0.signum() == x2.signum() {
+            val = sqrt_scale;
+        } else {
+            let xmin = sqrt_tol / sqrt_scale;
+            val = sqrt_tol / approx_parabola_integral(xmin);
+        }
+        val *= da;
+    }
+    SubdivResult { val, a0, a2 }
+}
+
+fn eval_quad(p0: Vec2, p1: Vec2, p2: Vec2, t: f32) -> Vec2 {
+    let mt = 1.0 - t;
+    p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t
+}
+
+fn eval_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> Vec2 {
+    let mt = 1.0 - t;
+    p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t
+}
+
+const MAX_QUADS: u32 = 16;
+
+struct Cubic {
+    p0: Vec2,
+    p1: Vec2,
+    p2: Vec2,
+    p3: Vec2,
+    path_ix: u32,
+}
+
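+/// Flatten a cubic Bézier into line segments, writing them to `lines`.
+///
+/// The cubic is first split into up to `MAX_QUADS` quadratics, then each
+/// quadratic is subdivided using the parabola approximation so that the
+/// emitted lines are evenly spaced in the fractional subdivision parameter.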
+fn flatten_cubic(cubic: Cubic, line_ix: &mut usize, lines: &mut [LineSoup]) {
+    let p0 = cubic.p0;
+    let p1 = cubic.p1;
+    let p2 = cubic.p2;
+    let p3 = cubic.p3;
+    let err_v = (p2 - p1) * 3.0 + p0 - p3;
+    let err = err_v.dot(err_v);
+    const ACCURACY: f32 = 0.25;
+    const Q_ACCURACY: f32 = ACCURACY * 0.1;
+    const REM_ACCURACY: f32 = ACCURACY - Q_ACCURACY;
+    const MAX_HYPOT2: f32 = 432.0 * Q_ACCURACY * Q_ACCURACY;
+    let mut n_quads = ((err * (1.0 / MAX_HYPOT2)).powf(1.0 / 6.0).ceil() as u32).max(1);
+    n_quads = n_quads.min(MAX_QUADS);
+    let mut keep_params = [SubdivResult::default(); MAX_QUADS as usize];
+    let mut val = 0.0;
+    let mut qp0 = p0;
+    let step = (n_quads as f32).recip();
+    for i in 0..n_quads {
+        let t = (i + 1) as f32 * step;
+        let qp2 = eval_cubic(p0, p1, p2, p3, t);
+        let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
+        qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5;
+        let params = estimate_subdiv(qp0, qp1, qp2, REM_ACCURACY.sqrt());
+        keep_params[i as usize] = params;
+        val += params.val;
+        qp0 = qp2;
+    }
+    let n = ((val * (0.5 / REM_ACCURACY.sqrt())).ceil() as u32).max(1);
+    let mut lp0 = p0;
+    qp0 = p0;
+    let v_step = val / (n as f32);
+    let mut n_out = 1;
+    let mut val_sum = 0.0;
+    for i in 0..n_quads {
+        let t = (i + 1) as f32 * step;
+        let qp2 = eval_cubic(p0, p1, p2, p3, t);
+        let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
+        qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5;
+        let params = keep_params[i as usize];
+        let u0 = approx_parabola_inv_integral(params.a0);
+        let u2 = approx_parabola_inv_integral(params.a2);
+        let uscale = (u2 - u0).recip();
+        let mut val_target = (n_out as f32) * v_step;
+        while n_out == n || val_target < val_sum + params.val {
+            let lp1 = if n_out == n {
+                p3
+            } else {
+                let u = (val_target - val_sum) / params.val;
+                let a = params.a0 + (params.a2 - params.a0) * u;
+                let au = approx_parabola_inv_integral(a);
+                let t = (au - u0) * uscale;
+                eval_quad(qp0, qp1, qp2, t)
+            };
+            let ls = LineSoup {
+                path_ix: cubic.path_ix,
+                _padding: Default::default(),
+                p0: lp0.to_array(),
+                p1: lp1.to_array(),
+            };
+            lines[*line_ix] = ls;
+            *line_ix += 1;
+            n_out += 1;
+            val_target += v_step;
+            lp0 = lp1;
+        }
+        val_sum += params.val;
+        qp0 = qp2;
+    }
+}
+
+fn read_f32_point(ix: u32, pathdata: &[u32]) -> Vec2 {
+    let x = f32::from_bits(pathdata[ix as usize]);
+    let y = f32::from_bits(pathdata[ix as usize + 1]);
+    Vec2 { x, y }
+}
+
+struct IntBbox {
+    x0: i32,
+    y0: i32,
+    x1: i32,
+    y1: i32,
+}
+
+impl Default for IntBbox {
+    fn default() -> Self {
+        IntBbox {
+            x0: 0x7fff_ffff,
+            y0: 0x7fff_ffff,
+            x1: -0x8000_0000,
+            y1: -0x8000_0000,
+        }
+    }
+}
+
+impl IntBbox {
+    fn add_pt(&mut self, pt: Vec2) {
+        self.x0 = self.x0.min(pt.x.floor() as i32);
+        self.y0 = self.y0.min(pt.y.floor() as i32);
+        self.x1 = self.x1.max(pt.x.ceil() as i32);
+        self.y1 = self.y1.max(pt.y.ceil() as i32);
+    }
+}
+
+// TODO: we're skipping i16 point reading as it's not present in our scenes
+
+const WG_SIZE: usize = 256;
+
+const PATH_TAG_SEG_TYPE: u8 = 3;
+const PATH_TAG_PATH: u8 = 0x10;
+const PATH_TAG_LINETO: u8 = 1;
+const PATH_TAG_QUADTO: u8 = 2;
+const PATH_TAG_CUBICTO: u8 = 3;
+const PATH_TAG_F32: u8 = 8;
+
+fn flatten_main(
+    n_wg: u32,
+    config: &ConfigUniform,
+    scene: &[u32],
+    tag_monoids: &[PathMonoid],
+    path_bboxes: &mut [PathBbox],
+    bump: &mut BumpAllocators,
+    lines: &mut [LineSoup],
+) {
+    let mut line_ix = 0;
+    let mut bbox = IntBbox::default();
+    for ix in 0..n_wg as usize * WG_SIZE {
+        let tag_word = scene[config.layout.path_tag_base as usize + (ix >> 2)];
+        let shift = (ix & 3) * 8;
+        let mut tm = PathMonoid::new(tag_word & ((1 << shift) - 1));
+        let tag_byte = (tag_word >> shift) as u8;
+        if tag_byte != 0 {
+            tm = tag_monoids[ix >> 2].combine(&tm);
+        }
+        let linewidth =
+            f32::from_bits(scene[(config.layout.linewidth_base + tm.linewidth_ix) as usize]);
+        if (tag_byte & PATH_TAG_PATH) != 0 {
+            let out = &mut path_bboxes[tm.path_ix as usize];
+            out.linewidth = linewidth;
+            out.trans_ix = tm.trans_ix;
+        }
+        let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
+        let pathdata = &scene[config.layout.path_data_base as usize..];
+        if seg_type != 0 {
+            let mut p0;
+            let mut p1;
+            let mut p2 = Vec2::default();
+            let mut p3 = Vec2::default();
+            if (tag_byte & PATH_TAG_F32) != 0 {
+                p0 = read_f32_point(tm.pathseg_offset, pathdata);
+                p1 = read_f32_point(tm.pathseg_offset + 2, pathdata);
+                if seg_type >= PATH_TAG_QUADTO {
+                    p2 = read_f32_point(tm.pathseg_offset + 4, pathdata);
+                    if seg_type == PATH_TAG_CUBICTO {
+                        p3 = read_f32_point(tm.pathseg_offset + 6, pathdata);
+                    }
+                }
+            } else {
+                todo!("i16 path data not supported yet");
+            }
+            let transform = Transform::read(config.layout.transform_base, tm.trans_ix, scene);
+            p0 = transform.apply(p0);
+            bbox.add_pt(p0);
+            p1 = transform.apply(p1);
+            bbox.add_pt(p1);
+            if seg_type == PATH_TAG_LINETO {
+                p3 = p1;
+                p2 = p3.mix(p0, 1.0 / 3.0);
+                p1 = p0.mix(p3, 1.0 / 3.0);
+            } else if seg_type >= PATH_TAG_QUADTO {
+                p2 = transform.apply(p2);
+                bbox.add_pt(p2);
+                if seg_type == PATH_TAG_CUBICTO {
+                    p3 = transform.apply(p3);
+                    bbox.add_pt(p3);
+                } else {
+                    p3 = p2;
+                    p2 = p1.mix(p2, 1.0 / 3.0);
+                    p1 = p1.mix(p0, 1.0 / 3.0);
+                }
+            }
+            let path_ix = tm.path_ix;
+            let cubic = Cubic {
+                p0,
+                p1,
+                p2,
+                p3,
+                path_ix,
+            };
+            flatten_cubic(cubic, &mut line_ix, lines);
+        }
+        if (tag_byte & PATH_TAG_PATH) != 0 {
+            let out = &mut path_bboxes[tm.path_ix as usize];
+            out.x0 = bbox.x0;
+            out.y0 = bbox.y0;
+            out.x1 = bbox.x1;
+            out.y1 = bbox.y1;
+            bbox = IntBbox::default();
+        }
+    }
+    bump.lines = line_ix as u32;
+}
+
+pub fn flatten(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let tag_monoids = resources[2].as_slice();
+    let mut path_bboxes = resources[3].as_slice_mut();
+    let mut bump = resources[4].as_typed_mut();
+    let mut lines = resources[5].as_slice_mut();
+    flatten_main(
+        n_wg,
+        &config,
+        &scene,
+        &tag_monoids,
+        &mut path_bboxes,
+        &mut bump,
+        &mut lines,
+    );
+}
diff --git a/src/cpu_shader/mod.rs b/src/cpu_shader/mod.rs
index fed341c..16d261f 100644
--- a/src/cpu_shader/mod.rs
+++ b/src/cpu_shader/mod.rs
@@ -1,8 +1,61 @@
 // Copyright 2023 The Vello authors
-// SPDX-License-Identifier: Apache-2.0 OR MIT
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
 //! CPU implementations of shader stages.
 
-mod pathtag_reduce;
+// Allow un-idiomatic Rust to more closely match shaders
+#![allow(clippy::needless_range_loop)]
+#![allow(clippy::too_many_arguments)]
 
+mod backdrop;
+mod bbox_clear;
+mod binning;
+mod clip_leaf;
+mod clip_reduce;
+mod coarse;
+mod draw_leaf;
+mod draw_reduce;
+mod fine;
+mod flatten;
+mod path_count;
+mod path_count_setup;
+mod path_tiling;
+mod path_tiling_setup;
+mod pathtag_reduce;
+mod pathtag_scan;
+mod tile_alloc;
+mod util;
+
+pub use backdrop::backdrop;
+pub use bbox_clear::bbox_clear;
+pub use binning::binning;
+pub use clip_leaf::clip_leaf;
+pub use clip_reduce::clip_reduce;
+pub use coarse::coarse;
+pub use draw_leaf::draw_leaf;
+pub use draw_reduce::draw_reduce;
+pub use flatten::flatten;
+pub use path_count::path_count;
+pub use path_count_setup::path_count_setup;
+pub use path_tiling::path_tiling;
+pub use path_tiling_setup::path_tiling_setup;
 pub use pathtag_reduce::pathtag_reduce;
+pub use pathtag_scan::pathtag_scan;
+pub use tile_alloc::tile_alloc;
+
+// Common definitions
+
+const PTCL_INITIAL_ALLOC: u32 = 64;
+
+// Tags for PTCL commands
+const CMD_END: u32 = 0;
+const CMD_FILL: u32 = 1;
+//const CMD_STROKE: u32 = 2;
+const CMD_SOLID: u32 = 3;
+const CMD_COLOR: u32 = 5;
+const CMD_LIN_GRAD: u32 = 6;
+const CMD_RAD_GRAD: u32 = 7;
+const CMD_IMAGE: u32 = 8;
+const CMD_BEGIN_CLIP: u32 = 9;
+const CMD_END_CLIP: u32 = 10;
+const CMD_JUMP: u32 = 11;
diff --git a/src/cpu_shader/path_count.rs b/src/cpu_shader/path_count.rs
new file mode 100644
index 0000000..3cb8855
--- /dev/null
+++ b/src/cpu_shader/path_count.rs
@@ -0,0 +1,162 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{BumpAllocators, LineSoup, Path, SegmentCount, Tile};
+
+use crate::cpu_dispatch::CpuBinding;
+
+use super::util::{span, Vec2, ONE_MINUS_ULP, ROBUST_EPSILON};
+
+const TILE_SCALE: f32 = 1.0 / 16.0;
+
+fn path_count_main(
+    bump: &mut BumpAllocators,
+    lines: &[LineSoup],
+    paths: &[Path],
+    tile: &mut [Tile],
+    seg_counts: &mut [SegmentCount],
+) {
+    for line_ix in 0..bump.lines {
+        let line = lines[line_ix as usize];
+        let p0 = Vec2::from_array(line.p0);
+        let p1 = Vec2::from_array(line.p1);
+        let is_down = p1.y >= p0.y;
+        let (xy0, xy1) = if is_down { (p0, p1) } else { (p1, p0) };
+        let s0 = xy0 * TILE_SCALE;
+        let s1 = xy1 * TILE_SCALE;
+        let count_x = span(s0.x, s1.x) - 1;
+        let count = count_x + span(s0.y, s1.y);
+
+        let dx = (s1.x - s0.x).abs();
+        let dy = s1.y - s0.y;
+        if dx + dy == 0.0 {
+            continue;
+        }
+        if dy == 0.0 && s0.y.floor() == s0.y {
+            continue;
+        }
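+        // Set up the parametrization of the line's walk over the tile grid:
+        // after i boundary crossings, floor(a * i + b) of them were horizontal,
+        // so the tile coordinates follow in closed form (path_tiling replays
+        // this same computation).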
+        let idxdy = 1.0 / (dx + dy);
+        let mut a = dx * idxdy;
+        let is_positive_slope = s1.x >= s0.x;
+        let sign = if is_positive_slope { 1.0 } else { -1.0 };
+        let xt0 = (s0.x * sign).floor();
+        let c = s0.x * sign - xt0;
+        let y0 = s0.y.floor();
+        let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 };
+        let b = ((dy * c + dx * (ytop - s0.y)) * idxdy).min(ONE_MINUS_ULP);
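+        // Nudge `a` so that the predicted number of horizontal crossings at
+        // the final step matches `count_x` exactly (see ROBUST_EPSILON in
+        // util.rs).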
+        let robust_err = (a * (count as f32 - 1.0) + b).floor() - count_x as f32;
+        if robust_err != 0.0 {
+            a -= ROBUST_EPSILON.copysign(robust_err);
+        }
+        let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 };
+
+        let path = paths[line.path_ix as usize];
+        let bbox = path.bbox;
+        let bbox = [
+            bbox[0] as i32,
+            bbox[1] as i32,
+            bbox[2] as i32,
+            bbox[3] as i32,
+        ];
+        let xmin = s0.x.min(s1.x);
+        let stride = bbox[2] - bbox[0];
+        if s0.y >= bbox[3] as f32 || s1.y < bbox[1] as f32 || xmin >= bbox[2] as f32 || stride == 0
+        {
+            continue;
+        }
+        // Clip line to bounding box. Clipping is done in "i" space.
+        let mut imin = 0;
+        if s0.y < bbox[1] as f32 {
+            let mut iminf = ((bbox[1] as f32 - y0 + b - a) / (1.0 - a)).round() - 1.0;
+            if y0 + iminf - (a * iminf + b).floor() < bbox[1] as f32 {
+                iminf += 1.0;
+            }
+            imin = iminf as u32;
+        }
+        let mut imax = count;
+        if s1.y > bbox[3] as f32 {
+            let mut imaxf = ((bbox[3] as f32 - y0 + b - a) / (1.0 - a)).round() - 1.0;
+            if y0 + imaxf - (a * imaxf + b).floor() < bbox[3] as f32 {
+                imaxf += 1.0;
+            }
+            imax = imaxf as u32;
+        }
+        let delta = if is_down { -1 } else { 1 };
+        let mut ymin = 0;
+        let mut ymax = 0;
+        if s0.x.max(s1.x) < bbox[0] as f32 {
+            ymin = s0.y.ceil() as i32;
+            ymax = s1.y.ceil() as i32;
+            imax = imin;
+        } else {
+            let fudge = if is_positive_slope { 0.0 } else { 1.0 };
+            if xmin < bbox[0] as f32 {
+                let mut f = ((sign * (bbox[0] as f32 - x0) - b + fudge) / a).round();
+                if (x0 + sign * (a * f + b).floor() < bbox[0] as f32) == is_positive_slope {
+                    f += 1.0;
+                }
+                let ynext = (y0 + f - (a * f + b).floor() + 1.0) as i32;
+                if is_positive_slope {
+                    if f as u32 > imin {
+                        ymin = (y0 + if y0 == s0.y { 0.0 } else { 1.0 }) as i32;
+                        ymax = ynext;
+                        imin = f as u32;
+                    }
+                } else if (f as u32) < imax {
+                    ymin = ynext;
+                    ymax = s1.y.ceil() as i32;
+                    imax = f as u32;
+                }
+            }
+            if s0.x.max(s1.x) > bbox[2] as f32 {
+                let mut f = ((sign * (bbox[2] as f32 - x0) - b + fudge) / a).round();
+                if (x0 + sign * (a * f + b).floor() < bbox[2] as f32) == is_positive_slope {
+                    f += 1.0;
+                }
+                if is_positive_slope {
+                    imax = imax.min(f as u32);
+                } else {
+                    imin = imin.max(f as u32);
+                }
+            }
+        }
+        imax = imin.max(imax);
+        ymin = ymin.max(bbox[1]);
+        ymax = ymax.min(bbox[3]);
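+        // For rows the line crosses while it is left of the bbox, bump the
+        // backdrop of the leftmost tile in each such row.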
+        for y in ymin..ymax {
+            let base = path.tiles as i32 + (y - bbox[1]) * stride;
+            tile[base as usize].backdrop += delta;
+        }
+        let mut last_z = (a * (imin as f32 - 1.0) + b).floor();
+        let seg_base = bump.seg_counts;
+        bump.seg_counts += imax - imin;
+        for i in imin..imax {
+            let zf = a * i as f32 + b;
+            let z = zf.floor();
+            let y = (y0 + i as f32 - z) as i32;
+            let x = (x0 + sign * z) as i32;
+            let base = path.tiles as i32 + (y - bbox[1]) * stride - bbox[0];
+            let top_edge = if i == 0 { y0 == s0.y } else { last_z == z };
+            if top_edge && x + 1 < bbox[2] {
+                let x_bump = (x + 1).max(bbox[0]);
+                tile[(base + x_bump) as usize].backdrop += delta;
+            }
+            // segment_count_or_ix is used as a count here; coarse rasterization
+            // later converts it to a (bit-inverted) index into the segments slice.
+            let seg_within_slice = tile[(base + x) as usize].segment_count_or_ix;
+            tile[(base + x) as usize].segment_count_or_ix += 1;
+            let counts = (seg_within_slice << 16) | i;
+            let seg_count = SegmentCount { line_ix, counts };
+            seg_counts[(seg_base + i - imin) as usize] = seg_count;
+            last_z = z;
+        }
+    }
+}
+
+pub fn path_count(_n_wg: u32, resources: &[CpuBinding]) {
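+    // resources[0] is the config buffer, which this implementation doesn't need.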
+    let mut bump = resources[1].as_typed_mut();
+    let lines = resources[2].as_slice();
+    let paths = resources[3].as_slice();
+    let mut tile = resources[4].as_slice_mut();
+    let mut seg_counts = resources[5].as_slice_mut();
+    path_count_main(&mut bump, &lines, &paths, &mut tile, &mut seg_counts);
+}
diff --git a/src/cpu_shader/path_count_setup.rs b/src/cpu_shader/path_count_setup.rs
new file mode 100644
index 0000000..6336cfd
--- /dev/null
+++ b/src/cpu_shader/path_count_setup.rs
@@ -0,0 +1,21 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{BumpAllocators, IndirectCount};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
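+// Compute the indirect dispatch size for path_count: one workgroup per
+// WG_SIZE lines, rounded up.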
+fn path_count_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) {
+    let lines = bump.lines;
+    indirect.count_x = (lines + (WG_SIZE as u32 - 1)) / WG_SIZE as u32;
+    indirect.count_y = 1;
+    indirect.count_z = 1;
+}
+
+pub fn path_count_setup(_n_wg: u32, resources: &[CpuBinding]) {
+    let bump = resources[0].as_typed();
+    let mut indirect = resources[1].as_typed_mut();
+    path_count_setup_main(&bump, &mut indirect);
+}
diff --git a/src/cpu_shader/path_tiling.rs b/src/cpu_shader/path_tiling.rs
new file mode 100644
index 0000000..56bc2b4
--- /dev/null
+++ b/src/cpu_shader/path_tiling.rs
@@ -0,0 +1,160 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{BumpAllocators, LineSoup, Path, PathSegment, SegmentCount, Tile};
+
+use crate::{
+    cpu_dispatch::CpuBinding,
+    cpu_shader::util::{ONE_MINUS_ULP, ROBUST_EPSILON},
+};
+
+use super::util::{span, Vec2};
+
+const TILE_WIDTH: u32 = 16;
+const TILE_HEIGHT: u32 = 16;
+const TILE_SCALE: f32 = 1.0 / 16.0;
+
+fn path_tiling_main(
+    bump: &mut BumpAllocators,
+    seg_counts: &[SegmentCount],
+    lines: &[LineSoup],
+    paths: &[Path],
+    tiles: &[Tile],
+    segments: &mut [PathSegment],
+) {
+    for seg_ix in 0..bump.seg_counts {
+        let seg_count = seg_counts[seg_ix as usize];
+        let line = lines[seg_count.line_ix as usize];
+        let counts = seg_count.counts;
+        let seg_within_slice = counts >> 16;
+        let seg_within_line = counts & 0xffff;
+
+        // Replay the coarse rasterization logic from path_count to find the
+        // tile this segment lands in.
+        let p0 = Vec2::from_array(line.p0);
+        let p1 = Vec2::from_array(line.p1);
+        let is_down = p1.y >= p0.y;
+        let (mut xy0, mut xy1) = if is_down { (p0, p1) } else { (p1, p0) };
+        let s0 = xy0 * TILE_SCALE;
+        let s1 = xy1 * TILE_SCALE;
+        let count_x = span(s0.x, s1.x) - 1;
+        let count = count_x + span(s0.y, s1.y);
+
+        let dx = (s1.x - s0.x).abs();
+        let dy = s1.y - s0.y;
+        let idxdy = 1.0 / (dx + dy);
+        let mut a = dx * idxdy;
+        let is_positive_slope = s1.x >= s0.x;
+        let sign = if is_positive_slope { 1.0 } else { -1.0 };
+        let xt0 = (s0.x * sign).floor();
+        let c = s0.x * sign - xt0;
+        let y0 = s0.y.floor();
+        let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 };
+        let b = ((dy * c + dx * (ytop - s0.y)) * idxdy).min(ONE_MINUS_ULP);
+        let robust_err = (a * (count as f32 - 1.0) + b).floor() - count_x as f32;
+        if robust_err != 0.0 {
+            a -= ROBUST_EPSILON.copysign(robust_err);
+        }
+        let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 };
+        let z = (a * seg_within_line as f32 + b).floor();
+        let x = x0 as i32 + (sign * z) as i32;
+        let y = (y0 + seg_within_line as f32 - z) as i32;
+
+        let path = paths[line.path_ix as usize];
+        let bbox = path.bbox;
+        let bbox = [
+            bbox[0] as i32,
+            bbox[1] as i32,
+            bbox[2] as i32,
+            bbox[3] as i32,
+        ];
+        let stride = bbox[2] - bbox[0];
+        let tile_ix = path.tiles as i32 + (y - bbox[1]) * stride + x - bbox[0];
+        let tile = tiles[tile_ix as usize];
+        let seg_start = !tile.segment_count_or_ix;
+        if (seg_start as i32) < 0 {
+            continue;
+        }
+        let tile_xy = Vec2::new(x as f32 * TILE_WIDTH as f32, y as f32 * TILE_HEIGHT as f32);
+        let tile_xy1 = tile_xy + Vec2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32);
+
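+        // Clip the segment to this tile. Comparing z with the previous and
+        // next steps determines whether the segment enters and exits through
+        // a horizontal or a vertical tile edge.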
+        if seg_within_line > 0 {
+            let z_prev = (a * (seg_within_line as f32 - 1.0) + b).floor();
+            if z == z_prev {
+                // Top edge is clipped
+                let mut xt = xy0.x + (xy1.x - xy0.x) * (tile_xy.y - xy0.y) / (xy1.y - xy0.y);
+                xt = xt.clamp(tile_xy.x + 1e-3, tile_xy1.x);
+                xy0 = Vec2::new(xt, tile_xy.y);
+            } else {
+                // If is_positive_slope, left edge is clipped, otherwise right
+                let x_clip = if is_positive_slope {
+                    tile_xy.x
+                } else {
+                    tile_xy1.x
+                };
+                let mut yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x);
+                yt = yt.clamp(tile_xy.y + 1e-3, tile_xy1.y);
+                xy0 = Vec2::new(x_clip, yt);
+            }
+        }
+        if seg_within_line < count - 1 {
+            let z_next = (a * (seg_within_line as f32 + 1.0) + b).floor();
+            if z == z_next {
+                // Bottom edge is clipped
+                let mut xt = xy0.x + (xy1.x - xy0.x) * (tile_xy1.y - xy0.y) / (xy1.y - xy0.y);
+                xt = xt.clamp(tile_xy.x + 1e-3, tile_xy1.x);
+                xy1 = Vec2::new(xt, tile_xy1.y);
+            } else {
+                // If is_positive_slope, right edge is clipped, otherwise left
+                let x_clip = if is_positive_slope {
+                    tile_xy1.x
+                } else {
+                    tile_xy.x
+                };
+                let mut yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x);
+                yt = yt.clamp(tile_xy.y + 1e-3, tile_xy1.y);
+                xy1 = Vec2::new(x_clip, yt);
+            }
+        }
+        if !is_down {
+            (xy0, xy1) = (xy1, xy0);
+        }
+        // TODO: figure out what to do if both xy0 and xy1 are at the left edge
+        // Also TODO (part of move to 8 byte encoding for segments): don't store y_edge at all,
+        // resolve this in fine.
+        let y_edge = if xy0.x == tile_xy.x {
+            xy0.y
+        } else if xy1.x == tile_xy.x {
+            xy1.y
+        } else {
+            1e9
+        };
+        let segment = PathSegment {
+            origin: xy0.to_array(),
+            delta: (xy1 - xy0).to_array(),
+            y_edge,
+            _padding: Default::default(),
+        };
+        assert!(xy0.x >= tile_xy.x && xy0.x <= tile_xy1.x);
+        assert!(xy0.y >= tile_xy.y && xy0.y <= tile_xy1.y);
+        assert!(xy1.x >= tile_xy.x && xy1.x <= tile_xy1.x);
+        assert!(xy1.y >= tile_xy.y && xy1.y <= tile_xy1.y);
+        segments[(seg_start + seg_within_slice) as usize] = segment;
+    }
+}
+
+pub fn path_tiling(_n_wg: u32, resources: &[CpuBinding]) {
+    let mut bump = resources[0].as_typed_mut();
+    let seg_counts = resources[1].as_slice();
+    let lines = resources[2].as_slice();
+    let paths = resources[3].as_slice();
+    let tiles = resources[4].as_slice();
+    let mut segments = resources[5].as_slice_mut();
+    path_tiling_main(
+        &mut bump,
+        &seg_counts,
+        &lines,
+        &paths,
+        &tiles,
+        &mut segments,
+    );
+}
diff --git a/src/cpu_shader/path_tiling_setup.rs b/src/cpu_shader/path_tiling_setup.rs
new file mode 100644
index 0000000..32e08f9
--- /dev/null
+++ b/src/cpu_shader/path_tiling_setup.rs
@@ -0,0 +1,21 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+use vello_encoding::{BumpAllocators, IndirectCount};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
+fn path_tiling_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) {
+    let segments = bump.seg_counts;
+    indirect.count_x = (segments + (WG_SIZE as u32 - 1)) / WG_SIZE as u32;
+    indirect.count_y = 1;
+    indirect.count_z = 1;
+}
+
+pub fn path_tiling_setup(_n_wg: u32, resources: &[CpuBinding]) {
+    let bump = resources[0].as_typed();
+    let mut indirect = resources[1].as_typed_mut();
+    path_tiling_setup_main(&bump, &mut indirect);
+}
diff --git a/src/cpu_shader/pathtag_reduce.rs b/src/cpu_shader/pathtag_reduce.rs
index 38ee55c..58eb36c 100644
--- a/src/cpu_shader/pathtag_reduce.rs
+++ b/src/cpu_shader/pathtag_reduce.rs
@@ -1,5 +1,5 @@
 // Copyright 2023 The Vello authors
-// SPDX-License-Identifier: Apache-2.0 OR MIT
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
 use vello_encoding::{ConfigUniform, Monoid, PathMonoid};
 
@@ -25,11 +25,8 @@
 }
 
 pub fn pathtag_reduce(n_wg: u32, resources: &[CpuBinding]) {
-    let r0 = resources[0].as_buf();
-    let r1 = resources[1].as_buf();
-    let mut r2 = resources[2].as_buf();
-    let config = bytemuck::from_bytes(&r0);
-    let scene = bytemuck::cast_slice(&r1);
-    let reduced = bytemuck::cast_slice_mut(r2.as_mut());
-    pathtag_reduce_main(n_wg, config, scene, reduced);
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let mut reduced = resources[2].as_slice_mut();
+    pathtag_reduce_main(n_wg, &config, &scene, &mut reduced);
 }
diff --git a/src/cpu_shader/pathtag_scan.rs b/src/cpu_shader/pathtag_scan.rs
new file mode 100644
index 0000000..8a8aa60
--- /dev/null
+++ b/src/cpu_shader/pathtag_scan.rs
@@ -0,0 +1,37 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_encoding::{ConfigUniform, Monoid, PathMonoid};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const WG_SIZE: usize = 256;
+
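+// Exclusive prefix sum of path tag monoids, using the per-workgroup
+// reductions as running prefixes.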
+fn pathtag_scan_main(
+    n_wg: u32,
+    config: &ConfigUniform,
+    scene: &[u32],
+    reduced: &[PathMonoid],
+    tag_monoids: &mut [PathMonoid],
+) {
+    let pathtag_base = config.layout.path_tag_base;
+    let mut prefix = PathMonoid::default();
+    for i in 0..n_wg {
+        let mut m = prefix;
+        for j in 0..WG_SIZE {
+            let ix = (i * WG_SIZE as u32) as usize + j;
+            tag_monoids[ix] = m;
+            let tag = scene[pathtag_base as usize + ix];
+            m = m.combine(&PathMonoid::new(tag));
+        }
+        prefix = prefix.combine(&reduced[i as usize]);
+    }
+}
+
+pub fn pathtag_scan(n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let reduced = resources[2].as_slice();
+    let mut tag_monoids = resources[3].as_slice_mut();
+    pathtag_scan_main(n_wg, &config, &scene, &reduced, &mut tag_monoids);
+}
diff --git a/src/cpu_shader/tile_alloc.rs b/src/cpu_shader/tile_alloc.rs
new file mode 100644
index 0000000..367f28d
--- /dev/null
+++ b/src/cpu_shader/tile_alloc.rs
@@ -0,0 +1,72 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_encoding::{BumpAllocators, ConfigUniform, DrawTag, Path, Tile};
+
+use crate::cpu_dispatch::CpuBinding;
+
+const TILE_WIDTH: usize = 16;
+const TILE_HEIGHT: usize = 16;
+const SX: f32 = 1.0 / (TILE_WIDTH as f32);
+const SY: f32 = 1.0 / (TILE_HEIGHT as f32);
+
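+// For each draw object, compute its bounding box in tiles, bump-allocate a
+// run of tiles for it, and initialize them to zero.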
+fn tile_alloc_main(
+    config: &ConfigUniform,
+    scene: &[u32],
+    draw_bboxes: &[[f32; 4]],
+    bump: &mut BumpAllocators,
+    paths: &mut [Path],
+    tiles: &mut [Tile],
+) {
+    let drawtag_base = config.layout.draw_tag_base;
+    let width_in_tiles = config.width_in_tiles as i32;
+    let height_in_tiles = config.height_in_tiles as i32;
+    for drawobj_ix in 0..config.layout.n_draw_objects {
+        let drawtag = DrawTag(scene[(drawtag_base + drawobj_ix) as usize]);
+        let mut x0 = 0;
+        let mut y0 = 0;
+        let mut x1 = 0;
+        let mut y1 = 0;
+        if drawtag != DrawTag::NOP && drawtag != DrawTag::END_CLIP {
+            let bbox = draw_bboxes[drawobj_ix as usize];
+            if bbox[0] < bbox[2] && bbox[1] < bbox[3] {
+                x0 = (bbox[0] * SX).floor() as i32;
+                y0 = (bbox[1] * SY).floor() as i32;
+                x1 = (bbox[2] * SX).ceil() as i32;
+                y1 = (bbox[3] * SY).ceil() as i32;
+            }
+        }
+        let ux0 = x0.clamp(0, width_in_tiles) as u32;
+        let uy0 = y0.clamp(0, height_in_tiles) as u32;
+        let ux1 = x1.clamp(0, width_in_tiles) as u32;
+        let uy1 = y1.clamp(0, height_in_tiles) as u32;
+        let tile_count = (ux1 - ux0) * (uy1 - uy0);
+        let offset = bump.tile;
+        bump.tile += tile_count;
+        // We construct it this way because padding is private.
+        let mut path = Path::default();
+        path.bbox = [ux0, uy0, ux1, uy1];
+        path.tiles = offset;
+        paths[drawobj_ix as usize] = path;
+        for i in 0..tile_count {
+            tiles[(offset + i) as usize] = Tile::default();
+        }
+    }
+}
+
+pub fn tile_alloc(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let scene = resources[1].as_slice();
+    let draw_bboxes = resources[2].as_slice();
+    let mut bump = resources[3].as_typed_mut();
+    let mut paths = resources[4].as_slice_mut();
+    let mut tiles = resources[5].as_slice_mut();
+    tile_alloc_main(
+        &config,
+        &scene,
+        &draw_bboxes,
+        &mut bump,
+        &mut paths,
+        &mut tiles,
+    );
+}
diff --git a/src/cpu_shader/util.rs b/src/cpu_shader/util.rs
new file mode 100644
index 0000000..eae6f8b
--- /dev/null
+++ b/src/cpu_shader/util.rs
@@ -0,0 +1,128 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Utility types
+
+use vello_encoding::ConfigUniform;
+
+#[derive(Clone, Copy, Default, Debug)]
+#[repr(C)]
+pub struct Vec2 {
+    pub x: f32,
+    pub y: f32,
+}
+
+impl std::ops::Add for Vec2 {
+    type Output = Vec2;
+
+    fn add(self, rhs: Self) -> Self {
+        Vec2 {
+            x: self.x + rhs.x,
+            y: self.y + rhs.y,
+        }
+    }
+}
+
+impl std::ops::Sub for Vec2 {
+    type Output = Vec2;
+
+    fn sub(self, rhs: Self) -> Self {
+        Vec2 {
+            x: self.x - rhs.x,
+            y: self.y - rhs.y,
+        }
+    }
+}
+
+impl std::ops::Mul<f32> for Vec2 {
+    type Output = Vec2;
+
+    fn mul(self, rhs: f32) -> Self {
+        Vec2 {
+            x: self.x * rhs,
+            y: self.y * rhs,
+        }
+    }
+}
+
+impl Vec2 {
+    pub fn new(x: f32, y: f32) -> Self {
+        Vec2 { x, y }
+    }
+
+    pub fn dot(self, other: Vec2) -> f32 {
+        self.x * other.x + self.y * other.y
+    }
+
+    pub fn length(self) -> f32 {
+        self.x.hypot(self.y)
+    }
+
+    pub fn to_array(self) -> [f32; 2] {
+        [self.x, self.y]
+    }
+
+    pub fn from_array(a: [f32; 2]) -> Self {
+        Vec2 { x: a[0], y: a[1] }
+    }
+
+    pub fn mix(self, other: Vec2, t: f32) -> Self {
+        let x = self.x + (other.x - self.x) * t;
+        let y = self.y + (other.y - self.y) * t;
+        Vec2 { x, y }
+    }
+}
+
+pub struct Transform(pub [f32; 6]);
+
+impl Transform {
+    pub fn apply(&self, p: Vec2) -> Vec2 {
+        let z = self.0;
+        let x = z[0] * p.x + z[2] * p.y + z[4];
+        let y = z[1] * p.x + z[3] * p.y + z[5];
+        Vec2 { x, y }
+    }
+
+    pub fn read(transform_base: u32, ix: u32, data: &[u32]) -> Transform {
+        let mut z = [0.0; 6];
+        let base = (transform_base + ix * 6) as usize;
+        for i in 0..6 {
+            z[i] = f32::from_bits(data[base + i]);
+        }
+        Transform(z)
+    }
+}
+
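+/// Number of whole grid cells covered by the interval from a to b, with a
+/// minimum of 1.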
+pub fn span(a: f32, b: f32) -> u32 {
+    (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32
+}
+
+const DRAWTAG_NOP: u32 = 0;
+
+/// Read draw tag, guarded by number of draw objects.
+///
+/// The `ix` argument is allowed to exceed the number of draw objects,
+/// in which case a NOP is returned.
+pub fn read_draw_tag_from_scene(config: &ConfigUniform, scene: &[u32], ix: u32) -> u32 {
+    if ix < config.layout.n_draw_objects {
+        let tag_ix = config.layout.draw_tag_base + ix;
+        scene[tag_ix as usize]
+    } else {
+        DRAWTAG_NOP
+    }
+}
+
+/// The largest floating point value strictly less than 1.
+///
+/// This value is used to limit the value of b so that its floor is strictly less
+/// than 1. That guarantees that floor(a * i + b) == 0 for i == 0, which lands on
+/// the correct first tile.
+pub const ONE_MINUS_ULP: f32 = 0.99999994;
+
+/// An epsilon to be applied in path numerical robustness.
+///
+/// When floor(a * (n - 1) + b) does not match the expected value (the width in
+/// grid cells minus one), this delta is applied to a to push it in the correct
+/// direction. The theory is that a is not off by more than a few ulp, and it's
+/// always in the range of 0..1.
+pub const ROBUST_EPSILON: f32 = 2e-7;
diff --git a/src/lib.rs b/src/lib.rs
index 55147ac..006accd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,6 +17,7 @@
 mod cpu_dispatch;
 mod cpu_shader;
 mod engine;
+mod mask;
 mod render;
 mod scene;
 mod shaders;
@@ -61,6 +62,19 @@
 /// Specialization of `Result` for our catch-all error type.
 pub type Result<T> = std::result::Result<T, Error>;
 
+/// Possible configurations for antialiasing.
+#[derive(PartialEq, Eq)]
+#[allow(unused)]
+enum AaConfig {
+    Area,
+    Msaa8,
+    Msaa16,
+}
+
+/// Configuration of antialiasing. Currently this is static, but it could be
+/// switched to a launch option or made even finer-grained.
+const ANTIALIASING: AaConfig = AaConfig::Area;
+
 /// Renders a scene into a texture or surface.
 #[cfg(feature = "wgpu")]
 pub struct Renderer {
@@ -72,6 +86,8 @@
     profiler: GpuProfiler,
     #[cfg(feature = "wgpu-profiler")]
     pub profile_result: Option<Vec<wgpu_profiler::GpuTimerScopeResult>>,
+    #[cfg(feature = "hot_reload")]
+    use_cpu: bool,
 }
 
 /// Parameters used in a single render that are configurable by the client.
@@ -101,7 +117,10 @@
     /// Creates a new renderer for the specified device.
     pub fn new(device: &Device, render_options: &RendererOptions) -> Result<Self> {
         let mut engine = WgpuEngine::new();
-        let shaders = shaders::full_shaders(device, &mut engine, render_options.use_cpu)?;
+        let mut shaders = shaders::full_shaders(device, &mut engine)?;
+        if render_options.use_cpu {
+            shaders.install_cpu_shaders(&mut engine);
+        }
         let blit = render_options
             .surface_format
             .map(|surface_format| BlitPipeline::new(device, surface_format));
@@ -115,6 +134,8 @@
             profiler: GpuProfiler::new(3, render_options.timestamp_period, device.features()),
             #[cfg(feature = "wgpu-profiler")]
             profile_result: None,
+            #[cfg(feature = "hot_reload")]
+            use_cpu: render_options.use_cpu,
         })
     }
 
@@ -220,7 +241,10 @@
     pub async fn reload_shaders(&mut self, device: &Device) -> Result<()> {
         device.push_error_scope(wgpu::ErrorFilter::Validation);
         let mut engine = WgpuEngine::new();
-        let shaders = shaders::full_shaders(device, &mut engine, false)?;
+        let mut shaders = shaders::full_shaders(device, &mut engine)?;
+        if self.use_cpu {
+            shaders.install_cpu_shaders(&mut engine);
+        }
         let error = device.pop_error_scope().await;
         if let Some(error) = error {
             return Err(error.into());
diff --git a/src/mask.rs b/src/mask.rs
new file mode 100644
index 0000000..61cacf0
--- /dev/null
+++ b/src/mask.rs
@@ -0,0 +1,98 @@
+// Copyright 2022 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Create a lookup table of half-plane sample masks.
+
+// Width is number of discrete translations
+const MASK_WIDTH: usize = 32;
+// Height is the number of discrete slopes
+const MASK_HEIGHT: usize = 32;
+
+const PATTERN: [u8; 8] = [0, 5, 3, 7, 1, 4, 6, 2];
+
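+// Compute the 8-sample mask for one half-plane, given by a slope and a
+// translation, by testing each sample position in the pattern against the
+// line equation.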
+fn one_mask(slope: f64, mut translation: f64, is_pos: bool) -> u8 {
+    if is_pos {
+        translation = 1. - translation;
+    }
+    let mut result = 0;
+    for (i, item) in PATTERN.iter().enumerate() {
+        let mut y = (i as f64 + 0.5) * 0.125;
+        let x = (*item as f64 + 0.5) * 0.125;
+        if !is_pos {
+            y = 1. - y;
+        }
+        if (x - (1.0 - translation)) * (1. - slope) - (y - translation) * slope >= 0. {
+            result |= 1 << i;
+        }
+    }
+    result
+}
+
+/// Make a lookup table of half-plane masks.
+///
+/// The table is organized into two blocks each with MASK_HEIGHT/2 slopes.
+/// The first block is negative slopes (x decreases as y increases),
+/// the second is positive slopes.
+pub fn make_mask_lut() -> Vec<u8> {
+    (0..MASK_WIDTH * MASK_HEIGHT)
+        .map(|i| {
+            const HALF_HEIGHT: usize = MASK_HEIGHT / 2;
+            let u = i % MASK_WIDTH;
+            let v = i / MASK_WIDTH;
+            let is_pos = v >= HALF_HEIGHT;
+            let y = ((v % HALF_HEIGHT) as f64 + 0.5) * (1.0 / HALF_HEIGHT as f64);
+            let x = (u as f64 + 0.5) * (1.0 / MASK_WIDTH as f64);
+            one_mask(y, x, is_pos)
+        })
+        .collect()
+}
+
+// Width is number of discrete translations
+const MASK16_WIDTH: usize = 64;
+// Height is the number of discrete slopes
+const MASK16_HEIGHT: usize = 64;
+
+// This is based on the [D3D11 standard sample pattern].
+//
+// [D3D11 standard sample pattern]: https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_standard_multisample_quality_levels
+const PATTERN_16: [u8; 16] = [1, 8, 4, 11, 15, 7, 3, 12, 0, 9, 5, 13, 2, 10, 6, 14];
+
+fn one_mask_16(slope: f64, mut translation: f64, is_pos: bool) -> u16 {
+    if is_pos {
+        translation = 1. - translation;
+    }
+    let mut result = 0;
+    for (i, item) in PATTERN_16.iter().enumerate() {
+        let mut y = (i as f64 + 0.5) * 0.0625;
+        let x = (*item as f64 + 0.5) * 0.0625;
+        if !is_pos {
+            y = 1. - y;
+        }
+        if (x - (1.0 - translation)) * (1. - slope) - (y - translation) * slope >= 0. {
+            result |= 1 << i;
+        }
+    }
+    result
+}
+
+/// Make a lookup table of half-plane masks.
+///
+/// The table is organized into two blocks each with MASK16_HEIGHT/2 slopes.
+/// The first block is negative slopes (x decreases as y increases),
+/// the second is positive slopes.
+pub fn make_mask_lut_16() -> Vec<u8> {
+    let v16 = (0..MASK16_WIDTH * MASK16_HEIGHT)
+        .map(|i| {
+            const HALF_HEIGHT: usize = MASK16_HEIGHT / 2;
+            let u = i % MASK16_WIDTH;
+            let v = i / MASK16_WIDTH;
+            let is_pos = v >= HALF_HEIGHT;
+            let y = ((v % HALF_HEIGHT) as f64 + 0.5) * (1.0 / HALF_HEIGHT as f64);
+            let x = (u as f64 + 0.5) * (1.0 / MASK16_WIDTH as f64);
+            one_mask_16(y, x, is_pos)
+        })
+        .collect::<Vec<_>>();
+    // This annoyingly makes another copy. We can avoid that by pushing two
+    // bytes per iteration of the above loop.
+    bytemuck::cast_slice(&v16).into()
+}
diff --git a/src/render.rs b/src/render.rs
index 4625636..0bb6579 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -3,7 +3,7 @@
 use crate::{
     engine::{BufProxy, ImageFormat, ImageProxy, Recording, ResourceProxy},
     shaders::FullShaders,
-    RenderParams, Scene,
+    AaConfig, RenderParams, Scene, ANTIALIASING,
 };
 use vello_encoding::{Encoding, WorkgroupSize};
 
@@ -11,6 +11,7 @@
 pub struct Render {
     fine_wg_count: Option<WorkgroupSize>,
     fine_resources: Option<FineResources>,
+    mask_buf: Option<ResourceProxy>,
 }
 
 /// Resources produced by pipeline, needed for fine rasterization.
@@ -62,6 +63,7 @@
         Render {
             fine_wg_count: None,
             fine_resources: None,
+            mask_buf: None,
         }
     }
 
@@ -139,7 +141,8 @@
         );
         let mut pathtag_parent = reduced_buf;
         let mut large_pathtag_bufs = None;
-        if wg_counts.use_large_path_scan {
+        let use_large_path_scan = wg_counts.use_large_path_scan && !shaders.pathtag_is_cpu;
+        if use_large_path_scan {
             let reduced2_buf = ResourceProxy::new_buf(
                 buffer_sizes.path_reduced2.size_in_bytes().into(),
                 "reduced2_buf",
@@ -166,7 +169,7 @@
             buffer_sizes.path_monoids.size_in_bytes().into(),
             "tagmonoid_buf",
         );
-        let pathtag_scan = if wg_counts.use_large_path_scan {
+        let pathtag_scan = if use_large_path_scan {
             shaders.pathtag_scan_large
         } else {
             shaders.pathtag_scan
@@ -411,19 +414,48 @@
     pub fn record_fine(&mut self, shaders: &FullShaders, recording: &mut Recording) {
         let fine_wg_count = self.fine_wg_count.take().unwrap();
         let fine = self.fine_resources.take().unwrap();
-        recording.dispatch(
-            shaders.fine,
-            fine_wg_count,
-            [
-                fine.config_buf,
-                fine.segments_buf,
-                fine.ptcl_buf,
-                fine.info_bin_data_buf,
-                ResourceProxy::Image(fine.out_image),
-                fine.gradient_image,
-                fine.image_atlas,
-            ],
-        );
+        match ANTIALIASING {
+            AaConfig::Area => {
+                recording.dispatch(
+                    shaders.fine,
+                    fine_wg_count,
+                    [
+                        fine.config_buf,
+                        fine.segments_buf,
+                        fine.ptcl_buf,
+                        fine.info_bin_data_buf,
+                        ResourceProxy::Image(fine.out_image),
+                        fine.gradient_image,
+                        fine.image_atlas,
+                    ],
+                );
+            }
+            _ => {
+                if self.mask_buf.is_none() {
+                    let mask_lut = match ANTIALIASING {
+                        AaConfig::Msaa16 => crate::mask::make_mask_lut_16(),
+                        AaConfig::Msaa8 => crate::mask::make_mask_lut(),
+                        _ => unreachable!(),
+                    };
+                    let buf = recording.upload("mask lut", mask_lut);
+                    self.mask_buf = Some(buf.into());
+                }
+                recording.dispatch(
+                    shaders.fine,
+                    fine_wg_count,
+                    [
+                        fine.config_buf,
+                        fine.segments_buf,
+                        fine.ptcl_buf,
+                        fine.info_bin_data_buf,
+                        ResourceProxy::Image(fine.out_image),
+                        fine.gradient_image,
+                        fine.image_atlas,
+                        self.mask_buf.unwrap(),
+                    ],
+                );
+            }
+        }
         recording.free_resource(fine.config_buf);
         recording.free_resource(fine.tile_buf);
         recording.free_resource(fine.segments_buf);
@@ -431,6 +463,10 @@
         recording.free_resource(fine.gradient_image);
         recording.free_resource(fine.image_atlas);
         recording.free_resource(fine.info_bin_data_buf);
+        // TODO: make mask buf persistent
+        if let Some(mask_buf) = self.mask_buf.take() {
+            recording.free_resource(mask_buf);
+        }
     }
 
     /// Get the output image.
diff --git a/src/shaders.rs b/src/shaders.rs
index 23a3950..668dafa 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -79,14 +79,15 @@
     pub path_tiling_setup: ShaderId,
     pub path_tiling: ShaderId,
     pub fine: ShaderId,
+    // 2-level dispatch works for the CPU pathtag scan even for large
+    // inputs; 3-level is not yet implemented.
+    pub pathtag_is_cpu: bool,
 }
 
 #[cfg(feature = "wgpu")]
-pub fn full_shaders(
-    device: &Device,
-    engine: &mut WgpuEngine,
-    use_cpu: bool,
-) -> Result<FullShaders, Error> {
+pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShaders, Error> {
+    use crate::{AaConfig, ANTIALIASING};
+
     let imports = SHARED_SHADERS
         .iter()
         .copied()
@@ -94,6 +95,17 @@
     let empty = HashSet::new();
     let mut full_config = HashSet::new();
     full_config.insert("full".into());
+    match ANTIALIASING {
+        AaConfig::Msaa16 => {
+            full_config.insert("msaa".into());
+            full_config.insert("msaa16".into());
+        }
+        AaConfig::Msaa8 => {
+            full_config.insert("msaa".into());
+            full_config.insert("msaa8".into());
+        }
+        AaConfig::Area => (),
+    }
     let mut small_config = HashSet::new();
     small_config.insert("full".into());
     small_config.insert("small".into());
@@ -103,9 +115,6 @@
         preprocess::preprocess(shader!("pathtag_reduce"), &full_config, &imports).into(),
         &[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer],
     )?;
-    if use_cpu {
-        engine.set_cpu_shader(pathtag_reduce, cpu_shader::pathtag_reduce);
-    }
     let pathtag_reduce2 = engine.add_shader(
         device,
         "pathtag_reduce2",
@@ -296,20 +305,39 @@
             BindType::Buffer,
         ],
     )?;
-    let fine = engine.add_shader(
-        device,
-        "fine",
-        preprocess::preprocess(shader!("fine"), &full_config, &imports).into(),
-        &[
-            BindType::Uniform,
-            BindType::BufReadOnly,
-            BindType::BufReadOnly,
-            BindType::BufReadOnly,
-            BindType::Image(ImageFormat::Rgba8),
-            BindType::ImageRead(ImageFormat::Rgba8),
-            BindType::ImageRead(ImageFormat::Rgba8),
-        ],
-    )?;
+    let fine = match ANTIALIASING {
+        AaConfig::Area => engine.add_shader(
+            device,
+            "fine",
+            preprocess::preprocess(shader!("fine"), &full_config, &imports).into(),
+            &[
+                BindType::Uniform,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+                BindType::Image(ImageFormat::Rgba8),
+                BindType::ImageRead(ImageFormat::Rgba8),
+                BindType::ImageRead(ImageFormat::Rgba8),
+            ],
+        )?,
+        _ => engine.add_shader(
+            device,
+            "fine",
+            preprocess::preprocess(shader!("fine"), &full_config, &imports).into(),
+            &[
+                BindType::Uniform,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+                BindType::Image(ImageFormat::Rgba8),
+                BindType::ImageRead(ImageFormat::Rgba8),
+                BindType::ImageRead(ImageFormat::Rgba8),
+                BindType::BufReadOnly, // mask buffer
+            ],
+        )?,
+    };
     Ok(FullShaders {
         pathtag_reduce,
         pathtag_reduce2,
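The two match arms differ only in the trailing mask-LUT binding, so the layout could equally be built once and extended conditionally. A self-contained sketch of that alternative, with stand-in types (the real `BindType`/`ImageFormat` live in the engine module):

```rust
// Stand-ins so the sketch compiles on its own.
enum ImageFormat {
    Rgba8,
}
enum BindType {
    Uniform,
    BufReadOnly,
    Image(ImageFormat),
    ImageRead(ImageFormat),
}

fn fine_bindings(msaa: bool) -> Vec<BindType> {
    let mut binds = vec![
        BindType::Uniform,
        BindType::BufReadOnly,
        BindType::BufReadOnly,
        BindType::BufReadOnly,
        BindType::Image(ImageFormat::Rgba8),
        BindType::ImageRead(ImageFormat::Rgba8),
        BindType::ImageRead(ImageFormat::Rgba8),
    ];
    if msaa {
        // The MSAA variants additionally read the mask LUT.
        binds.push(BindType::BufReadOnly);
    }
    binds
}

fn main() {
    assert_eq!(fine_bindings(false).len(), 7);
    assert_eq!(fine_bindings(true).len(), 8);
}
```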
@@ -331,9 +359,42 @@
         path_tiling_setup,
         path_tiling,
         fine,
+        pathtag_is_cpu: false,
     })
 }
 
+#[cfg(feature = "wgpu")]
+impl FullShaders {
+    /// Install the CPU shaders.
+    ///
+    /// There are a couple of things to note here. First, the granularity
+    /// provided by this method is coarse: it installs all the shaders.
+    /// There are many use cases (including debugging) where a mix is
+    /// desired, or where the choice between GPU and CPU dispatch is
+    /// dynamic.
+    ///
+    /// Second, the actual mapping to CPU shaders is not really specific
+    /// to the engine, and should be split out into a back-end-agnostic
+    /// struct.
+    pub fn install_cpu_shaders(&mut self, engine: &mut WgpuEngine) {
+        engine.set_cpu_shader(self.pathtag_reduce, cpu_shader::pathtag_reduce);
+        engine.set_cpu_shader(self.pathtag_scan, cpu_shader::pathtag_scan);
+        engine.set_cpu_shader(self.bbox_clear, cpu_shader::bbox_clear);
+        engine.set_cpu_shader(self.flatten, cpu_shader::flatten);
+        engine.set_cpu_shader(self.draw_reduce, cpu_shader::draw_reduce);
+        engine.set_cpu_shader(self.draw_leaf, cpu_shader::draw_leaf);
+        engine.set_cpu_shader(self.clip_reduce, cpu_shader::clip_reduce);
+        engine.set_cpu_shader(self.clip_leaf, cpu_shader::clip_leaf);
+        engine.set_cpu_shader(self.binning, cpu_shader::binning);
+        engine.set_cpu_shader(self.tile_alloc, cpu_shader::tile_alloc);
+        engine.set_cpu_shader(self.path_count_setup, cpu_shader::path_count_setup);
+        engine.set_cpu_shader(self.path_count, cpu_shader::path_count);
+        engine.set_cpu_shader(self.backdrop, cpu_shader::backdrop);
+        engine.set_cpu_shader(self.coarse, cpu_shader::coarse);
+        engine.set_cpu_shader(self.path_tiling_setup, cpu_shader::path_tiling_setup);
+        engine.set_cpu_shader(self.path_tiling, cpu_shader::path_tiling);
+        self.pathtag_is_cpu = true;
+    }
+}
+
 macro_rules! shared_shader {
     ($name:expr) => {
         (
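Callers now opt in to CPU execution after construction, since `full_shaders` no longer takes `use_cpu`. A sketch of how the two functions compose, using only the signatures visible in this diff (`build_shaders` itself is a hypothetical wrapper):

```rust
// Hypothetical call site; Device, WgpuEngine, FullShaders, Error,
// and full_shaders are the items from the diff above.
fn build_shaders(
    device: &Device,
    engine: &mut WgpuEngine,
    use_cpu: bool,
) -> Result<FullShaders, Error> {
    let mut shaders = full_shaders(device, engine)?;
    if use_cpu {
        // Coarse-grained opt-in: every stage wired up in
        // install_cpu_shaders moves to its CPU implementation.
        shaders.install_cpu_shaders(engine);
    }
    Ok(shaders)
}
```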
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index 12380e3..c5359c1 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -19,6 +19,7 @@
     BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId,
 };
 
+#[derive(Default)]
 pub struct WgpuEngine {
     shaders: Vec<Shader>,
     pool: ResourcePool,
@@ -90,12 +91,7 @@
 
 impl WgpuEngine {
     pub fn new() -> WgpuEngine {
-        WgpuEngine {
-            shaders: vec![],
-            pool: Default::default(),
-            bind_map: Default::default(),
-            downloads: Default::default(),
-        }
+        Default::default()
     }
 
     /// Add a shader.
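The derive works because every field of WgpuEngine (the `Vec` and the `Default`-initialized pool and maps) already implements `Default`, so `new()` can collapse to `Default::default()`. The same refactor in miniature, with stand-in field types:

```rust
// Miniature of the refactor above: when every field is Default,
// the manual field-by-field constructor is redundant.
#[derive(Default)]
struct Engine {
    shaders: Vec<u32>, // stand-in for Vec<Shader>
    pool: Vec<u8>,     // stand-in for ResourcePool
}

impl Engine {
    fn new() -> Engine {
        // Equivalent to listing each field as Default::default().
        Default::default()
    }
}

fn main() {
    let e = Engine::new();
    assert!(e.shaders.is_empty() && e.pool.is_empty());
}
```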