Contiguous storage for path segments

This is part of the larger multisampled path rendering work, under stroke rework (#303). It refactors the GPU pipeline so that the path segments available to fine rasterization are stored as a contiguous slice rather than a linked list as before.

Numerous parts of the pipeline are refactored. In the old pipeline, path segment decoding generated cubic line segments and also estimated a bounding box (somewhat imprecise), and the combination of flattening those cubics and tiling was in a separate stage (path_coarse) quite a bit later in the pipeline. In the new pipeline, path decoding is fused with flattening, generating a `LineSoup` structure (line segments associated with paths, otherwise unordered) (with bbox as a side effect), and tiling is spread over multiple stages, later in the pipeline.

The first tiling stage (path_count) counts the number of tiles that will be generated. Then coarse rasterization allocates contiguous slices based on those counts. The second stage does a scattered write of the resulting tiles. Both of these stages rely on indirect dispatch, as the number of lines and the number of segments (respectively) are not known at encode time.

These changes only make sense for filled paths, so they rely on stroke expansion being done earlier in the pipeline, currently on the CPU.
diff --git a/crates/encoding/src/clip.rs b/crates/encoding/src/clip.rs
index 5a4e8f0..025b86b 100644
--- a/crates/encoding/src/clip.rs
+++ b/crates/encoding/src/clip.rs
@@ -41,3 +41,14 @@
 pub struct ClipBbox {
     pub bbox: [f32; 4],
 }
+
+impl ClipBic {
+    pub fn new(a: u32, b: u32) -> Self {
+        ClipBic { a, b }
+    }
+
+    pub fn combine(self, other: ClipBic) -> Self {
+        let m = self.b.min(other.a);
+        ClipBic::new(self.a + other.a - m, self.b + other.b - m)
+    }
+}
diff --git a/crates/encoding/src/config.rs b/crates/encoding/src/config.rs
index 93ffe80..d853a08 100644
--- a/crates/encoding/src/config.rs
+++ b/crates/encoding/src/config.rs
@@ -1,9 +1,11 @@
 // Copyright 2023 The Vello authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
+use crate::SegmentCount;
+
 use super::{
-    BinHeader, Clip, ClipBbox, ClipBic, ClipElement, Cubic, DrawBbox, DrawMonoid, Layout, Path,
-    PathBbox, PathMonoid, PathSegment, Tile,
+    BinHeader, Clip, ClipBbox, ClipBic, ClipElement, Cubic, DrawBbox, DrawMonoid, Layout, LineSoup,
+    Path, PathBbox, PathMonoid, PathSegment, Tile,
 };
 use bytemuck::{Pod, Zeroable};
 use std::mem;
@@ -29,8 +31,24 @@
     pub binning: u32,
     pub ptcl: u32,
     pub tile: u32,
+    pub seg_counts: u32,
     pub segments: u32,
     pub blend: u32,
+    pub lines: u32,
+}
+
+/// Storage of indirect dispatch size values.
+///
+/// The original plan was to reuse BumpAllocators, but the WebGPU compatible
+/// usage list rules forbid that being used as indirect counts while also
+/// bound as writable.
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+#[repr(C)]
+pub struct IndirectCount {
+    pub count_x: u32,
+    pub count_y: u32,
+    pub count_z: u32,
+    pub pad0: u32,
 }
 
 /// Uniform render configuration data used by all GPU stages.
@@ -114,7 +132,7 @@
     pub path_scan1: WorkgroupSize,
     pub path_scan: WorkgroupSize,
     pub bbox_clear: WorkgroupSize,
-    pub path_seg: WorkgroupSize,
+    pub flatten: WorkgroupSize,
     pub draw_reduce: WorkgroupSize,
     pub draw_leaf: WorkgroupSize,
     pub clip_reduce: WorkgroupSize,
@@ -159,7 +177,7 @@
             path_scan1: (reduced_size / PATH_REDUCE_WG, 1, 1),
             path_scan: (path_tag_wgs, 1, 1),
             bbox_clear: (draw_object_wgs, 1, 1),
-            path_seg: (path_coarse_wgs, 1, 1),
+            flatten: (path_coarse_wgs, 1, 1),
             draw_reduce: (draw_object_wgs, 1, 1),
             draw_leaf: (draw_object_wgs, 1, 1),
             clip_reduce: (clip_reduce_wgs, 1, 1),
@@ -248,11 +266,14 @@
     pub clip_bboxes: BufferSize<ClipBbox>,
     pub draw_bboxes: BufferSize<DrawBbox>,
     pub bump_alloc: BufferSize<BumpAllocators>,
+    pub indirect_count: BufferSize<IndirectCount>,
     pub bin_headers: BufferSize<BinHeader>,
     pub paths: BufferSize<Path>,
     // Bump allocated buffers
+    pub lines: BufferSize<LineSoup>,
     pub bin_data: BufferSize<u32>,
     pub tiles: BufferSize<Tile>,
+    pub seg_counts: BufferSize<SegmentCount>,
     pub segments: BufferSize<PathSegment>,
     pub ptcl: BufferSize<u32>,
 }
@@ -284,6 +305,7 @@
         let clip_bboxes = BufferSize::new(n_clips);
         let draw_bboxes = BufferSize::new(n_paths);
         let bump_alloc = BufferSize::new(1);
+        let indirect_count = BufferSize::new(1);
         let bin_headers = BufferSize::new(draw_object_wgs * 256);
         let n_paths_aligned = align_up(n_paths, 256);
         let paths = BufferSize::new(n_paths_aligned);
@@ -293,6 +315,8 @@
         // reasonable heuristics.
         let bin_data = BufferSize::new(1 << 18);
         let tiles = BufferSize::new(1 << 21);
+        let lines = BufferSize::new(1 << 21);
+        let seg_counts = BufferSize::new(1 << 21);
         let segments = BufferSize::new(1 << 21);
         let ptcl = BufferSize::new(1 << 23);
         Self {
@@ -311,10 +335,13 @@
             clip_bboxes,
             draw_bboxes,
             bump_alloc,
+            indirect_count,
+            lines,
             bin_headers,
             paths,
             bin_data,
             tiles,
+            seg_counts,
             segments,
             ptcl,
         }
diff --git a/crates/encoding/src/lib.rs b/crates/encoding/src/lib.rs
index 62a7d6c..3d24c19 100644
--- a/crates/encoding/src/lib.rs
+++ b/crates/encoding/src/lib.rs
@@ -24,8 +24,8 @@
 pub use binning::BinHeader;
 pub use clip::{Clip, ClipBbox, ClipBic, ClipElement};
 pub use config::{
-    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, RenderConfig, WorkgroupCounts,
-    WorkgroupSize,
+    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, IndirectCount, RenderConfig,
+    WorkgroupCounts, WorkgroupSize,
 };
 pub use draw::{
     DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid,
@@ -35,7 +35,8 @@
 pub use math::Transform;
 pub use monoid::Monoid;
 pub use path::{
-    Cubic, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType, PathTag, Tile,
+    Cubic, LineSoup, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType,
+    PathTag, SegmentCount, Tile,
 };
 pub use resolve::{resolve_solid_paths_only, Layout};
 
diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs
index 10e7499..414ce23 100644
--- a/crates/encoding/src/path.rs
+++ b/crates/encoding/src/path.rs
@@ -6,6 +6,28 @@
 
 use super::Monoid;
 
+/// Line segment (after flattening, before tiling).
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
+#[repr(C)]
+pub struct LineSoup {
+    pub path_ix: u32,
+    pub _padding: u32,
+    pub p0: [f32; 2],
+    pub p1: [f32; 2],
+}
+
+/// Line segment (after flattening, before tiling).
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
+#[repr(C)]
+pub struct SegmentCount {
+    pub line_ix: u32,
+    // This could more accurately be modeled as:
+    //     segment_within_line: u16,
+    //     segment_within_slice: u16,
+    // However, here we mirror the way it's written in WGSL
+    pub counts: u32,
+}
+
 /// Path segment.
 #[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
 #[repr(C)]
@@ -13,7 +35,7 @@
     pub origin: [f32; 2],
     pub delta: [f32; 2],
     pub y_edge: f32,
-    pub next: u32,
+    pub _padding: u32,
 }
 
 /// Path segment type.
@@ -193,7 +215,7 @@
 #[repr(C)]
 pub struct Path {
     /// Bounding box in tiles.
-    pub bbox: [f32; 4],
+    pub bbox: [u32; 4],
     /// Offset (in u32s) to tile rectangle.
     pub tiles: u32,
     _padding: [u32; 3],
diff --git a/shader/coarse.wgsl b/shader/coarse.wgsl
index e497c85..2d324c7 100644
--- a/shader/coarse.wgsl
+++ b/shader/coarse.wgsl
@@ -33,7 +33,7 @@
 var<storage> paths: array<Path>;
 
 @group(0) @binding(6)
-var<storage> tiles: array<Tile>;
+var<storage, read_write> tiles: array<Tile>;
 
 @group(0) @binding(7)
 var<storage, read_write> bump: BumpAllocators;
@@ -82,31 +82,30 @@
     }
 }
 
-fn write_path(tile: Tile, linewidth: f32) -> bool {
-    // TODO: take flags
-    alloc_cmd(3u);
-    if linewidth < 0.0 {
-        let even_odd = linewidth < -1.0;
-        if tile.segments != 0u {
-            let fill = CmdFill(tile.segments, tile.backdrop);
-            ptcl[cmd_offset] = CMD_FILL;
-            let segments_and_rule = select(fill.tile << 1u, (fill.tile << 1u) | 1u, even_odd);
-            ptcl[cmd_offset + 1u] = segments_and_rule;
-            ptcl[cmd_offset + 2u] = u32(fill.backdrop);
-            cmd_offset += 3u;
-        } else {
-            if even_odd && (abs(tile.backdrop) & 1) == 0 {
-                return false;
-            }
-            ptcl[cmd_offset] = CMD_SOLID;
-            cmd_offset += 1u;
-        }
+fn write_path(tile: Tile, tile_ix: u32, linewidth: f32) -> bool {
+    let even_odd = linewidth < -1.0;
+    // We overload the "segments" field to store both count (written by
+    // path_count stage) and segment allocation (used by path_tiling and
+    // fine).
+    let n_segs = tile.segments;
+    if n_segs != 0u {
+        var seg_ix = atomicAdd(&bump.segments, n_segs);
+        tiles[tile_ix].segments = ~seg_ix;
+        alloc_cmd(4u);
+        ptcl[cmd_offset] = CMD_FILL;
+        let size_and_rule = (n_segs << 1u) | u32(even_odd);
+        let fill = CmdFill(size_and_rule, seg_ix, tile.backdrop);
+        ptcl[cmd_offset + 1u] = fill.size_and_rule;
+        ptcl[cmd_offset + 2u] = fill.seg_data;
+        ptcl[cmd_offset + 3u] = u32(fill.backdrop);
+        cmd_offset += 4u;
     } else {
-        let stroke = CmdStroke(tile.segments, 0.5 * linewidth);
-        ptcl[cmd_offset] = CMD_STROKE;
-        ptcl[cmd_offset + 1u] = stroke.tile;
-        ptcl[cmd_offset + 2u] = bitcast<u32>(stroke.half_width);
-        cmd_offset += 3u;
+        if even_odd && (abs(tile.backdrop) & 1) == 0 {
+            return false;
+        }
+        alloc_cmd(1u);
+        ptcl[cmd_offset] = CMD_SOLID;
+        cmd_offset += 1u;
     }
     return true;
 }
@@ -352,7 +351,7 @@
                     // DRAWTAG_FILL_COLOR
                     case 0x44u: {
                         let linewidth = bitcast<f32>(info_bin_data[di]);
-                        if write_path(tile, linewidth) {
+                        if write_path(tile, tile_ix, linewidth) {
                             let rgba_color = scene[dd];
                             write_color(CmdColor(rgba_color));
                         }
@@ -360,7 +359,7 @@
                     // DRAWTAG_FILL_LIN_GRADIENT
                     case 0x114u: {
                         let linewidth = bitcast<f32>(info_bin_data[di]);
-                        if write_path(tile, linewidth) {
+                        if write_path(tile, tile_ix, linewidth) {
                             let index = scene[dd];
                             let info_offset = di + 1u;
                             write_grad(CMD_LIN_GRAD, index, info_offset);
@@ -369,7 +368,7 @@
                     // DRAWTAG_FILL_RAD_GRADIENT
                     case 0x29cu: {
                         let linewidth = bitcast<f32>(info_bin_data[di]);
-                        if write_path(tile, linewidth) {
+                        if write_path(tile, tile_ix, linewidth) {
                             let index = scene[dd];
                             let info_offset = di + 1u;
                             write_grad(CMD_RAD_GRAD, index, info_offset);
@@ -378,7 +377,7 @@
                     // DRAWTAG_FILL_IMAGE
                     case 0x248u: {
                         let linewidth = bitcast<f32>(info_bin_data[di]);
-                        if write_path(tile, linewidth) {                            
+                        if write_path(tile, tile_ix, linewidth) {
                             write_image(di + 1u);
                         }
                     }
@@ -396,7 +395,7 @@
                     // DRAWTAG_END_CLIP
                     case 0x21u: {
                         clip_depth -= 1u;
-                        write_path(tile, -1.0);
+                        write_path(tile, tile_ix, -1.0);
                         let blend = scene[dd];
                         let alpha = bitcast<f32>(scene[dd + 1u]);
                         write_end_clip(CmdEndClip(blend, alpha));
diff --git a/shader/fine.wgsl b/shader/fine.wgsl
index f64f015..7998459 100644
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@@ -41,15 +41,10 @@
 var image_atlas: texture_2d<f32>;
 
 fn read_fill(cmd_ix: u32) -> CmdFill {
-    let tile = ptcl[cmd_ix + 1u];
-    let backdrop = i32(ptcl[cmd_ix + 2u]);
-    return CmdFill(tile, backdrop);
-}
-
-fn read_stroke(cmd_ix: u32) -> CmdStroke {
-    let tile = ptcl[cmd_ix + 1u];
-    let half_width = bitcast<f32>(ptcl[cmd_ix + 2u]);
-    return CmdStroke(tile, half_width);
+    let size_and_rule = ptcl[cmd_ix + 1u];
+    let seg_data = ptcl[cmd_ix + 2u];
+    let backdrop = i32(ptcl[cmd_ix + 3u]);
+    return CmdFill(size_and_rule, seg_data, backdrop);
 }
 
 fn read_color(cmd_ix: u32) -> CmdColor {
@@ -140,15 +135,15 @@
 
 let PIXELS_PER_THREAD = 4u;
 
-fn fill_path(tile: Tile, xy: vec2<f32>, even_odd: bool) -> array<f32, PIXELS_PER_THREAD> {
+fn fill_path(seg_data: u32, n_segs: u32, backdrop: i32, xy: vec2<f32>, even_odd: bool) -> array<f32, PIXELS_PER_THREAD> {
     var area: array<f32, PIXELS_PER_THREAD>;
-    let backdrop_f = f32(tile.backdrop);
+    let backdrop_f = f32(backdrop);
     for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
         area[i] = backdrop_f;
     }
-    var segment_ix = tile.segments;
-    while segment_ix != 0u {
-        let segment = segments[segment_ix];
+    for (var i = 0u; i < n_segs; i++) {
+        let seg_off = seg_data + i;
+        let segment = segments[seg_off];
         let y = segment.origin.y - xy.y;
         let y0 = clamp(y, 0.0, 1.0);
         let y1 = clamp(y + segment.delta.y, 0.0, 1.0);
@@ -177,7 +172,6 @@
         for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
             area[i] += y_edge;
         }
-        segment_ix = segment.next;
     }
     if even_odd {
         // even-odd winding rule
@@ -194,32 +188,6 @@
     return area;
 }
 
-fn stroke_path(seg: u32, half_width: f32, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
-    var df: array<f32, PIXELS_PER_THREAD>;
-    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
-        df[i] = 1e9;
-    }
-    var segment_ix = seg;
-    while segment_ix != 0u {
-        let segment = segments[segment_ix];
-        let delta = segment.delta;
-        let dpos0 = xy + vec2(0.5, 0.5) - segment.origin;
-        let scale = 1.0 / dot(delta, delta);
-        for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
-            let dpos = vec2(dpos0.x + f32(i), dpos0.y);
-            let t = clamp(dot(dpos, delta) * scale, 0.0, 1.0);
-            // performance idea: hoist sqrt out of loop
-            df[i] = min(df[i], length(delta * t - dpos));
-        }
-        segment_ix = segment.next;
-    }
-    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
-        // reuse array; return alpha rather than distance
-        df[i] = clamp(half_width + 0.5 - df[i], 0.0, 1.0);
-    }
-    return df;
-}
-
 // The X size should be 16 / PIXELS_PER_THREAD
 @compute @workgroup_size(4, 16)
 fn main(
@@ -250,16 +218,19 @@
             // CMD_FILL
             case 1u: {
                 let fill = read_fill(cmd_ix);
-                let segments = fill.tile >> 1u;
-                let even_odd = (fill.tile & 1u) != 0u;
-                let tile = Tile(fill.backdrop, segments);
-                area = fill_path(tile, xy, even_odd);
-                cmd_ix += 3u;
+                let n_segs = fill.size_and_rule >> 1u;
+                let even_odd = (fill.size_and_rule & 1u) != 0u;
+                area = fill_path(fill.seg_data, n_segs, fill.backdrop, xy, even_odd);
+                cmd_ix += 4u;
             }
             // CMD_STROKE
             case 2u: {
-                let stroke = read_stroke(cmd_ix);
-                area = stroke_path(stroke.tile, stroke.half_width, xy);
+                // Stroking in fine rasterization is disabled, as strokes will be expanded
+                // to fills earlier in the pipeline. This implementation is a stub, just to
+                // keep the shader from crashing.
+                for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
+                    area[i] = 0.0;
+                }
                 cmd_ix += 3u;
             }
             // CMD_SOLID
diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl
new file mode 100644
index 0000000..b4197bc
--- /dev/null
+++ b/shader/flatten.wgsl
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Flatten curves to lines
+
+#import config
+#import pathtag
+#import segment
+#import cubic
+#import bump
+
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+@group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+@group(0) @binding(2)
+var<storage> tag_monoids: array<TagMonoid>;
+
+struct AtomicPathBbox {
+    x0: atomic<i32>,
+    y0: atomic<i32>,
+    x1: atomic<i32>,
+    y1: atomic<i32>,
+    linewidth: f32,
+    trans_ix: u32,
+}
+
+@group(0) @binding(3)
+var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
+
+@group(0) @binding(4)
+var<storage, read_write> bump: BumpAllocators;
+
+@group(0) @binding(5)
+var<storage, read_write> lines: array<LineSoup>;
+
+struct SubdivResult {
+    val: f32,
+    a0: f32,
+    a2: f32,
+}
+
+let D = 0.67;
+fn approx_parabola_integral(x: f32) -> f32 {
+    return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
+}
+
+let B = 0.39;
+fn approx_parabola_inv_integral(x: f32) -> f32 {
+    return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
+}
+
+fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
+    let d01 = p1 - p0;
+    let d12 = p2 - p1;
+    let dd = d01 - d12;
+    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
+    let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9);
+    let x0 = dot(d01, dd) * cross_inv;
+    let x2 = dot(d12, dd) * cross_inv;
+    let scale = abs(cross / (length(dd) * (x2 - x0)));
+
+    let a0 = approx_parabola_integral(x0);
+    let a2 = approx_parabola_integral(x2);
+    var val = 0.0;
+    if scale < 1e9 {
+        let da = abs(a2 - a0);
+        let sqrt_scale = sqrt(scale);
+        if sign(x0) == sign(x2) {
+            val = sqrt_scale;
+        } else {
+            let xmin = sqrt_tol / sqrt_scale;
+            val = sqrt_tol / approx_parabola_integral(xmin);
+        }
+        val *= da;
+    }
+    return SubdivResult(val, a0, a2);
+}
+
+fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
+    let mt = 1.0 - t;
+    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
+}
+
+fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
+    let mt = 1.0 - t;
+    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
+}
+
+let MAX_QUADS = 16u;
+
+fn flatten_cubic(cubic: Cubic) {
+    let p0 = cubic.p0;
+    let p1 = cubic.p1;
+    let p2 = cubic.p2;
+    let p3 = cubic.p3;
+    let err_v = 3.0 * (p2 - p1) + p0 - p3;
+    let err = dot(err_v, err_v);
+    let ACCURACY = 0.25;
+    let Q_ACCURACY = ACCURACY * 0.1;
+    let REM_ACCURACY = ACCURACY - Q_ACCURACY;
+    let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
+    var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
+    n_quads = min(n_quads, MAX_QUADS);
+    var keep_params: array<SubdivResult, MAX_QUADS>;
+    var val = 0.0;
+    var qp0 = p0;
+    let step = 1.0 / f32(n_quads);
+    for (var i = 0u; i < n_quads; i += 1u) {
+        let t = f32(i + 1u) * step;
+        let qp2 = eval_cubic(p0, p1, p2, p3, t);
+        var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
+        qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+        let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
+        keep_params[i] = params;
+        val += params.val;
+        qp0 = qp2;
+    }
+    let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
+    var lp0 = p0;
+    qp0 = p0;
+    let v_step = val / f32(n);
+    var n_out = 1u;
+    var val_sum = 0.0;
+    for (var i = 0u; i < n_quads; i += 1u) {
+        let t = f32(i + 1u) * step;
+        let qp2 = eval_cubic(p0, p1, p2, p3, t);
+        var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
+        qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+        let params = keep_params[i];
+        let u0 = approx_parabola_inv_integral(params.a0);
+        let u2 = approx_parabola_inv_integral(params.a2);
+        let uscale = 1.0 / (u2 - u0);
+        var val_target = f32(n_out) * v_step;
+        while n_out == n || val_target < val_sum + params.val {
+            var lp1: vec2<f32>;
+            if n_out == n {
+                lp1 = p3;
+            } else {
+                let u = (val_target - val_sum) / params.val;
+                let a = mix(params.a0, params.a2, u);
+                let au = approx_parabola_inv_integral(a);
+                let t = (au - u0) * uscale;
+                lp1 = eval_quad(qp0, qp1, qp2, t);
+            }
+
+            // Output line segment lp0..lp1
+            let line_ix = atomicAdd(&bump.lines, 1u);
+            // TODO: check failure
+            lines[line_ix] = LineSoup(cubic.path_ix, lp0, lp1);
+            n_out += 1u;
+            val_target += v_step;
+            lp0 = lp1;
+        }
+        val_sum += params.val;
+        qp0 = qp2;
+    }
+}
+
+var<private> pathdata_base: u32;
+
+fn read_f32_point(ix: u32) -> vec2<f32> {
+    let x = bitcast<f32>(scene[pathdata_base + ix]);
+    let y = bitcast<f32>(scene[pathdata_base + ix + 1u]);
+    return vec2(x, y);
+}
+
+fn read_i16_point(ix: u32) -> vec2<f32> {
+    let raw = scene[pathdata_base + ix];
+    let x = f32(i32(raw << 16u) >> 16u);
+    let y = f32(i32(raw) >> 16u);
+    return vec2(x, y);
+}
+
+struct Transform {
+    matrx: vec4<f32>,
+    translate: vec2<f32>,
+}
+
+fn read_transform(transform_base: u32, ix: u32) -> Transform {
+    let base = transform_base + ix * 6u;
+    let c0 = bitcast<f32>(scene[base]);
+    let c1 = bitcast<f32>(scene[base + 1u]);
+    let c2 = bitcast<f32>(scene[base + 2u]);
+    let c3 = bitcast<f32>(scene[base + 3u]);
+    let c4 = bitcast<f32>(scene[base + 4u]);
+    let c5 = bitcast<f32>(scene[base + 5u]);
+    let matrx = vec4(c0, c1, c2, c3);
+    let translate = vec2(c4, c5);
+    return Transform(matrx, translate);
+}
+
+fn transform_apply(transform: Transform, p: vec2<f32>) -> vec2<f32> {
+    return transform.matrx.xy * p.x + transform.matrx.zw * p.y + transform.translate;
+}
+
+fn round_down(x: f32) -> i32 {
+    return i32(floor(x));
+}
+
+fn round_up(x: f32) -> i32 {
+    return i32(ceil(x));
+}
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    let ix = global_id.x;
+    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
+    pathdata_base = config.pathdata_base;
+    let shift = (ix & 3u) * 8u;
+    var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
+    // TODO: this can be a read buf overflow. Conditionalize by tag byte?
+    tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);
+    var tag_byte = (tag_word >> shift) & 0xffu;
+
+    let out = &path_bboxes[tm.path_ix];
+    let linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]);
+    if (tag_byte & PATH_TAG_PATH) != 0u {
+        (*out).linewidth = linewidth;
+        (*out).trans_ix = tm.trans_ix;
+    }
+    // Decode path data
+    let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
+    if seg_type != 0u {
+        var p0: vec2<f32>;
+        var p1: vec2<f32>;
+        var p2: vec2<f32>;
+        var p3: vec2<f32>;
+        if (tag_byte & PATH_TAG_F32) != 0u {
+            p0 = read_f32_point(tm.pathseg_offset);
+            p1 = read_f32_point(tm.pathseg_offset + 2u);
+            if seg_type >= PATH_TAG_QUADTO {
+                p2 = read_f32_point(tm.pathseg_offset + 4u);
+                if seg_type == PATH_TAG_CUBICTO {
+                    p3 = read_f32_point(tm.pathseg_offset + 6u);
+                }
+            }
+        } else {
+            p0 = read_i16_point(tm.pathseg_offset);
+            p1 = read_i16_point(tm.pathseg_offset + 1u);
+            if seg_type >= PATH_TAG_QUADTO {
+                p2 = read_i16_point(tm.pathseg_offset + 2u);
+                if seg_type == PATH_TAG_CUBICTO {
+                    p3 = read_i16_point(tm.pathseg_offset + 3u);
+                }
+            }
+        }
+        let transform = read_transform(config.transform_base, tm.trans_ix);
+        p0 = transform_apply(transform, p0);
+        p1 = transform_apply(transform, p1);
+        var bbox = vec4(min(p0, p1), max(p0, p1));
+        // Degree-raise
+        if seg_type == PATH_TAG_LINETO {
+            p3 = p1;
+            p2 = mix(p3, p0, 1.0 / 3.0);
+            p1 = mix(p0, p3, 1.0 / 3.0);
+        } else if seg_type >= PATH_TAG_QUADTO {
+            p2 = transform_apply(transform, p2);
+            bbox = vec4(min(bbox.xy, p2), max(bbox.zw, p2));
+            if seg_type == PATH_TAG_CUBICTO {
+                p3 = transform_apply(transform, p3);
+                bbox = vec4(min(bbox.xy, p3), max(bbox.zw, p3));
+            } else {
+                p3 = p2;
+                p2 = mix(p1, p2, 1.0 / 3.0);
+                p1 = mix(p1, p0, 1.0 / 3.0);
+            }
+        }
+        var stroke = vec2(0.0, 0.0);
+        if linewidth >= 0.0 {
+            // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
+            // This is the correct bounding box, but we're not handling rendering
+            // in the isotropic case, so it may mismatch.
+            stroke = 0.5 * linewidth * vec2(length(transform.matrx.xz), length(transform.matrx.yw));
+            bbox += vec4(-stroke, stroke);
+        }
+        let flags = u32(linewidth >= 0.0);
+        flatten_cubic(Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags));
+        // Update bounding box using atomics only. Computing a monoid is a
+        // potential future optimization.
+        if bbox.z > bbox.x || bbox.w > bbox.y {
+            atomicMin(&(*out).x0, round_down(bbox.x));
+            atomicMin(&(*out).y0, round_down(bbox.y));
+            atomicMax(&(*out).x1, round_up(bbox.z));
+            atomicMax(&(*out).y1, round_up(bbox.w));
+        }
+    }
+}
diff --git a/shader/path_coarse.wgsl b/shader/path_coarse.wgsl
deleted file mode 100644
index 5f168a6..0000000
--- a/shader/path_coarse.wgsl
+++ /dev/null
@@ -1,309 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-#import config
-#import pathtag
-
-@group(0) @binding(0)
-var<uniform> config: Config;
-
-@group(0) @binding(1)
-var<storage> scene: array<u32>;
-
-@group(0) @binding(2)
-var<storage> tag_monoids: array<TagMonoid>;
-
-#ifdef cubics_out
-@group(0) @binding(3)
-var<storage, read_write> output: array<vec2<f32>>;
-#else
-// We don't get this from import as it's the atomic version
-struct AtomicTile {
-    backdrop: atomic<i32>,
-    segments: atomic<u32>,
-}
-
-#import segment
-
-@group(0) @binding(3)
-var<storage, read_write> tiles: array<AtomicTile>;
-
-@group(0) @binding(4)
-var<storage, read_write> segments: array<Segment>;
-#endif
-
-var<private> pathdata_base: u32;
-
-fn read_f32_point(ix: u32) -> vec2<f32> {
-    let x = bitcast<f32>(scene[pathdata_base + ix]);
-    let y = bitcast<f32>(scene[pathdata_base + ix + 1u]);
-    return vec2(x, y);
-}
-
-fn read_i16_point(ix: u32) -> vec2<f32> {
-    let raw = scene[pathdata_base + ix];
-    let x = f32(i32(raw << 16u) >> 16u);
-    let y = f32(i32(raw) >> 16u);
-    return vec2(x, y);
-}
-
-#ifndef cubics_out
-struct SubdivResult {
-    val: f32,
-    a0: f32,
-    a2: f32,
-}
-
-let D = 0.67;
-fn approx_parabola_integral(x: f32) -> f32 {
-    return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
-}
-
-let B = 0.39;
-fn approx_parabola_inv_integral(x: f32) -> f32 {
-    return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
-}
-
-fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
-    let d01 = p1 - p0;
-    let d12 = p2 - p1;
-    let dd = d01 - d12;
-    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
-    let cross_inv = 1.0 / cross;
-    let x0 = dot(d01, dd) * cross_inv;
-    let x2 = dot(d12, dd) * cross_inv;
-    let scale = abs(cross / (length(dd) * (x2 - x0)));
-
-    let a0 = approx_parabola_integral(x0);
-    let a2 = approx_parabola_integral(x2);
-    var val = 0.0;
-    if scale < 1e9 {
-        let da = abs(a2 - a0);
-        let sqrt_scale = sqrt(scale);
-        if sign(x0) == sign(x2) {
-            val = sqrt_scale;
-        } else {
-            let xmin = sqrt_tol / sqrt_scale;
-            val = sqrt_tol / approx_parabola_integral(xmin);
-        }
-        val *= da;
-    }
-    return SubdivResult(val, a0, a2);
-}
-
-fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
-    let mt = 1.0 - t;
-    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
-}
-
-fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
-    let mt = 1.0 - t;
-    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
-}
-
-fn alloc_segment() -> u32 {
-    // Use 0-index segment (address is sentinel) as counter
-    // TODO: separate small buffer binding for this?
-    return atomicAdd(&tiles[4096].segments, 1u) + 1u;
-}
-#endif
-
-let MAX_QUADS = 16u;
-
-@compute @workgroup_size(256)
-fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
-    @builtin(local_invocation_id) local_id: vec3<u32>,
-) {
-    // Obtain exclusive prefix sum of tag monoid
-    let ix = global_id.x;
-    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
-    pathdata_base = config.pathdata_base;
-    let shift = (ix & 3u) * 8u;
-    var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
-    tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);
-    var tag_byte = (tag_word >> shift) & 0xffu;
-    // should be extractBits(tag_word, shift, 8)?
-
-    // Decode path data
-    let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
-    if seg_type != 0u {
-        var p0: vec2<f32>;
-        var p1: vec2<f32>;
-        var p2: vec2<f32>;
-        var p3: vec2<f32>;
-        if (tag_byte & PATH_TAG_F32) != 0u {
-            p0 = read_f32_point(tm.pathseg_offset);
-            p1 = read_f32_point(tm.pathseg_offset + 2u);
-            if seg_type >= PATH_TAG_QUADTO {
-                p2 = read_f32_point(tm.pathseg_offset + 4u);
-                if seg_type == PATH_TAG_CUBICTO {
-                    p3 = read_f32_point(tm.pathseg_offset + 6u);
-                }
-            }
-        } else {
-            p0 = read_i16_point(tm.pathseg_offset);
-            p1 = read_i16_point(tm.pathseg_offset + 1u);
-            if seg_type >= PATH_TAG_QUADTO {
-                p2 = read_i16_point(tm.pathseg_offset + 2u);
-                if seg_type == PATH_TAG_CUBICTO {
-                    p3 = read_i16_point(tm.pathseg_offset + 3u);
-                }
-            }
-        }
-        // TODO: transform goes here
-        // Degree-raise
-        if seg_type == PATH_TAG_LINETO {
-            p3 = p1;
-            p2 = mix(p3, p0, 1.0 / 3.0);
-            p1 = mix(p0, p3, 1.0 / 3.0);
-        } else if seg_type == PATH_TAG_QUADTO {
-            p3 = p2;
-            p2 = mix(p1, p2, 1.0 / 3.0);
-            p1 = mix(p1, p0, 1.0 / 3.0);
-        }
-#ifdef cubics_out
-        let out_ix = ix * 4u;
-        output[out_ix] = p0;
-        output[out_ix + 1u] = p1;
-        output[out_ix + 2u] = p2;
-        output[out_ix + 3u] = p3;
-#else
-        let err_v = 3.0 * (p2 - p1) + p0 - p3;
-        let err = dot(err_v, err_v);
-        let ACCURACY = 0.25;
-        let Q_ACCURACY = ACCURACY * 0.1;
-        let REM_ACCURACY = (ACCURACY - Q_ACCURACY);
-        let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
-        var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
-        n_quads = min(n_quads, MAX_QUADS);
-        var keep_params: array<SubdivResult, MAX_QUADS>;
-        var val = 0.0;
-        var qp0 = p0;
-        let step = 1.0 / f32(n_quads);
-        for (var i = 0u; i < n_quads; i += 1u) {
-            let t = f32(i + 1u) * step;
-            let qp2 = eval_cubic(p0, p1, p2, p3, t);
-            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
-            keep_params[i] = params;
-            val += params.val;
-            qp0 = qp2;
-        }
-        let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
-        var lp0 = p0;
-        qp0 = p0;
-        let v_step = val / f32(n);
-        var n_out = 1u;
-        var val_sum = 0.0;
-        for (var i = 0u; i < n_quads; i += 1u) {
-            let t = f32(i + 1u) * step;
-            let qp2 = eval_cubic(p0, p1, p2, p3, t);
-            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            let params = keep_params[i];
-            let u0 = approx_parabola_inv_integral(params.a0);
-            let u2 = approx_parabola_inv_integral(params.a2);
-            let uscale = 1.0 / (u2 - u0);
-            var val_target = f32(n_out) * v_step;
-            while n_out == n || val_target < val_sum + params.val {
-                var lp1: vec2<f32>;
-                if n_out == n {
-                    lp1 = p3;
-                } else {
-                    let u = (val_target - val_sum) / params.val;
-                    let a = mix(params.a0, params.a2, u);
-                    let au = approx_parabola_inv_integral(a);
-                    let t = (au - u0) * uscale;
-                    lp1 = eval_quad(qp0, qp1, qp2, t);
-                }
-
-                // Output line segment lp0..lp1
-                let xymin = min(lp0, lp1);
-                let xymax = max(lp0, lp1);
-                let dp = lp1 - lp0;
-                let recip_dx = 1.0 / dp.x;
-                let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9);
-                let c = 0.5 * abs(invslope);
-                let b = invslope;
-                let SX = 1.0 / f32(TILE_WIDTH);
-                let SY = 1.0 / f32(TILE_HEIGHT);
-                let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX;
-                var x0 = i32(floor(xymin.x * SX));
-                var x1 = i32(floor(xymax.x * SX) + 1.0);
-                var y0 = i32(floor(xymin.y * SY));
-                var y1 = i32(floor(xymax.y * SY) + 1.0);
-                x0 = clamp(x0, 0, i32(config.width_in_tiles));
-                x1 = clamp(x1, 0, i32(config.width_in_tiles));
-                y0 = clamp(y0, 0, i32(config.height_in_tiles));
-                y1 = clamp(y1, 0, i32(config.height_in_tiles));
-                var xc = a + b * f32(y0);
-                var xray = i32(floor(lp0.x * SX));
-                var last_xray = i32(floor(lp1.x * SX));
-                if dp.y < 0.0 {
-                    let tmp = xray;
-                    xray = last_xray;
-                    last_xray = tmp;
-                }
-                for (var y = y0; y < y1; y += 1) {
-                    let tile_y0 = f32(y) * f32(TILE_HEIGHT);
-                    let xbackdrop = max(xray + 1, 0);
-                    if xymin.y < tile_y0 && xbackdrop < i32(config.width_in_tiles) {
-                        let backdrop = select(-1, 1, dp.y < 0.0);
-                        let tile_ix = y * i32(config.width_in_tiles) + xbackdrop;
-                        atomicAdd(&tiles[tile_ix].backdrop, backdrop);
-                    }
-                    var next_xray = last_xray;
-                    if y + 1 < y1 {
-                        let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT);
-                        let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope;
-                        next_xray = i32(floor(x_edge * SX));
-                    }
-                    let min_xray = min(xray, next_xray);
-                    let max_xray = max(xray, next_xray);
-                    var xx0 = min(i32(floor(xc - c)), min_xray);
-                    var xx1 = max(i32(ceil(xc + c)), max_xray + 1);
-                    xx0 = clamp(xx0, x0, x1);
-                    xx1 = clamp(xx1, x0, x1);
-                    var tile_seg: Segment;
-                    for (var x = xx0; x < xx1; x += 1) {
-                        let tile_x0 = f32(x) * f32(TILE_WIDTH);
-                        let tile_ix = y * i32(config.width_in_tiles) + x;
-                        // allocate segment, insert linked list
-                        let seg_ix = alloc_segment();
-                        let old = atomicExchange(&tiles[tile_ix].segments, seg_ix);
-                        tile_seg.origin = lp0;
-                        tile_seg.delta = dp;
-                        var y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx);
-                        if xymin.x < tile_x0 {
-                            let p = vec2(tile_x0, y_edge);
-                            if dp.x < 0.0 {
-                                tile_seg.delta = p - lp0;
-                            } else {
-                                tile_seg.origin = p;
-                                tile_seg.delta = lp1 - p;
-                            }
-                            if tile_seg.delta.x == 0.0 {
-                                tile_seg.delta.x = sign(dp.x) * 1e-9;
-                            }
-                        }
-                        if x <= min_xray || max_xray < x {
-                            y_edge = 1e9;
-                        }
-                        tile_seg.y_edge = y_edge;
-                        tile_seg.next = old;
-                        segments[seg_ix] = tile_seg;
-                    }
-                    xc += b;
-                    xray = next_xray;
-                }
-                n_out += 1u;
-                val_target += v_step;
-                lp0 = lp1;
-            }
-            val_sum += params.val;
-            qp0 = qp2;
-        }
-#endif
-    }
-}
diff --git a/shader/path_coarse_full.wgsl b/shader/path_coarse_full.wgsl
deleted file mode 100644
index cf03301..0000000
--- a/shader/path_coarse_full.wgsl
+++ /dev/null
@@ -1,273 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Path coarse rasterization for the full implementation.
-
-#import config
-#import pathtag
-#import tile
-#import segment
-#import cubic
-#import bump
-
-@group(0) @binding(0)
-var<uniform> config: Config;
-
-@group(0) @binding(1)
-var<storage> scene: array<u32>;
-
-@group(0) @binding(2)
-var<storage> cubics: array<Cubic>;
-
-@group(0) @binding(3)
-var<storage> paths: array<Path>;
-
-// We don't get this from import as it's the atomic version
-struct AtomicTile {
-    backdrop: atomic<i32>,
-    segments: atomic<u32>,
-}
-
-@group(0) @binding(4)
-var<storage, read_write> bump: BumpAllocators;
-
-@group(0) @binding(5)
-var<storage, read_write> tiles: array<AtomicTile>;
-
-@group(0) @binding(6)
-var<storage, read_write> segments: array<Segment>;
-
-struct SubdivResult {
-    val: f32,
-    a0: f32,
-    a2: f32,
-}
-
-let D = 0.67;
-fn approx_parabola_integral(x: f32) -> f32 {
-    return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
-}
-
-let B = 0.39;
-fn approx_parabola_inv_integral(x: f32) -> f32 {
-    return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
-}
-
-fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
-    let d01 = p1 - p0;
-    let d12 = p2 - p1;
-    let dd = d01 - d12;
-    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
-    let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9);
-    let x0 = dot(d01, dd) * cross_inv;
-    let x2 = dot(d12, dd) * cross_inv;
-    let scale = abs(cross / (length(dd) * (x2 - x0)));
-
-    let a0 = approx_parabola_integral(x0);
-    let a2 = approx_parabola_integral(x2);
-    var val = 0.0;
-    if scale < 1e9 {
-        let da = abs(a2 - a0);
-        let sqrt_scale = sqrt(scale);
-        if sign(x0) == sign(x2) {
-            val = sqrt_scale;
-        } else {
-            let xmin = sqrt_tol / sqrt_scale;
-            val = sqrt_tol / approx_parabola_integral(xmin);
-        }
-        val *= da;
-    }
-    return SubdivResult(val, a0, a2);
-}
-
-fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
-    let mt = 1.0 - t;
-    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
-}
-
-fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
-    let mt = 1.0 - t;
-    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
-}
-
-fn alloc_segment() -> u32 {
-    var offset = atomicAdd(&bump.segments, 1u) + 1u;
-    if offset + 1u > config.segments_size {
-        offset = 0u;
-        atomicOr(&bump.failed, STAGE_PATH_COARSE);
-    }
-    return offset;
-}
-
-let MAX_QUADS = 16u;
-
-@compute @workgroup_size(256)
-fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
-) {
-    // Exit early if prior stages failed, as we can't run this stage.
-    // We need to check only prior stages, as if this stage has failed in another workgroup, 
-    // we still want to know this workgroup's memory requirement.   
-    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC)) != 0u {
-        return;
-    }
-    let ix = global_id.x;
-    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
-    let shift = (ix & 3u) * 8u;
-    var tag_byte = (tag_word >> shift) & 0xffu;
-
-    if (tag_byte & PATH_TAG_SEG_TYPE) != 0u {
-        // Discussion question: it might actually be cheaper to do the path segment
-        // decoding & transform again rather than store the result in a buffer;
-        // classic memory vs ALU tradeoff.
-        let cubic = cubics[global_id.x];
-        let path = paths[cubic.path_ix];
-        let is_stroke = (cubic.flags & CUBIC_IS_STROKE) != 0u;
-        let bbox = vec4<i32>(path.bbox);
-        let p0 = cubic.p0;
-        let p1 = cubic.p1;
-        let p2 = cubic.p2;
-        let p3 = cubic.p3;
-        let err_v = 3.0 * (p2 - p1) + p0 - p3;
-        let err = dot(err_v, err_v);
-        let ACCURACY = 0.25;
-        let Q_ACCURACY = ACCURACY * 0.1;
-        let REM_ACCURACY = (ACCURACY - Q_ACCURACY);
-        let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
-        var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
-        n_quads = min(n_quads, MAX_QUADS);
-        var keep_params: array<SubdivResult, MAX_QUADS>;
-        var val = 0.0;
-        var qp0 = p0;
-        let step = 1.0 / f32(n_quads);
-        for (var i = 0u; i < n_quads; i += 1u) {
-            let t = f32(i + 1u) * step;
-            let qp2 = eval_cubic(p0, p1, p2, p3, t);
-            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
-            keep_params[i] = params;
-            val += params.val;
-            qp0 = qp2;
-        }
-        let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
-        var lp0 = p0;
-        qp0 = p0;
-        let v_step = val / f32(n);
-        var n_out = 1u;
-        var val_sum = 0.0;
-        for (var i = 0u; i < n_quads; i += 1u) {
-            let t = f32(i + 1u) * step;
-            let qp2 = eval_cubic(p0, p1, p2, p3, t);
-            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            let params = keep_params[i];
-            let u0 = approx_parabola_inv_integral(params.a0);
-            let u2 = approx_parabola_inv_integral(params.a2);
-            let uscale = 1.0 / (u2 - u0);
-            var val_target = f32(n_out) * v_step;
-            while n_out == n || val_target < val_sum + params.val {
-                var lp1: vec2<f32>;
-                if n_out == n {
-                    lp1 = p3;
-                } else {
-                    let u = (val_target - val_sum) / params.val;
-                    let a = mix(params.a0, params.a2, u);
-                    let au = approx_parabola_inv_integral(a);
-                    let t = (au - u0) * uscale;
-                    lp1 = eval_quad(qp0, qp1, qp2, t);
-                }
-
-                // Output line segment lp0..lp1
-                let xymin = min(lp0, lp1) - cubic.stroke;
-                let xymax = max(lp0, lp1) + cubic.stroke;
-                let dp = lp1 - lp0;
-                let recip_dx = 1.0 / dp.x;
-                let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9);
-                let SX = 1.0 / f32(TILE_WIDTH);
-                let SY = 1.0 / f32(TILE_HEIGHT);
-                let c = (cubic.stroke.x + abs(invslope) * (0.5 * f32(TILE_HEIGHT) + cubic.stroke.y)) * SX;
-                let b = invslope;
-                let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX;
-                var x0 = i32(floor(xymin.x * SX));
-                var x1 = i32(floor(xymax.x * SX) + 1.0);
-                var y0 = i32(floor(xymin.y * SY));
-                var y1 = i32(floor(xymax.y * SY) + 1.0);
-                x0 = clamp(x0, bbox.x, bbox.z);
-                x1 = clamp(x1, bbox.x, bbox.z);
-                y0 = clamp(y0, bbox.y, bbox.w);
-                y1 = clamp(y1, bbox.y, bbox.w);
-                var xc = a + b * f32(y0);
-                let stride = bbox.z - bbox.x;
-                var base = i32(path.tiles) + (y0 - bbox.y) * stride - bbox.x;
-                var xray = i32(floor(lp0.x * SX));
-                var last_xray = i32(floor(lp1.x * SX));
-                if dp.y < 0.0 {
-                    let tmp = xray;
-                    xray = last_xray;
-                    last_xray = tmp;
-                }
-                for (var y = y0; y < y1; y += 1) {
-                    let tile_y0 = f32(y) * f32(TILE_HEIGHT);
-                    let xbackdrop = max(xray + 1, bbox.x);
-                    if !is_stroke && xymin.y < tile_y0 && xbackdrop < bbox.z {
-                        let backdrop = select(-1, 1, dp.y < 0.0);
-                        let tile_ix = base + xbackdrop;
-                        atomicAdd(&tiles[tile_ix].backdrop, backdrop);
-                    }
-                    var next_xray = last_xray;
-                    if y + 1 < y1 {
-                        let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT);
-                        let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope;
-                        next_xray = i32(floor(x_edge * SX));
-                    }
-                    let min_xray = min(xray, next_xray);
-                    let max_xray = max(xray, next_xray);
-                    var xx0 = min(i32(floor(xc - c)), min_xray);
-                    var xx1 = max(i32(ceil(xc + c)), max_xray + 1);
-                    xx0 = clamp(xx0, x0, x1);
-                    xx1 = clamp(xx1, x0, x1);
-                    var tile_seg: Segment;
-                    for (var x = xx0; x < xx1; x += 1) {
-                        let tile_x0 = f32(x) * f32(TILE_WIDTH);
-                        let tile_ix = base + x;
-                        // allocate segment, insert linked list
-                        let seg_ix = alloc_segment();
-                        let old = atomicExchange(&tiles[tile_ix].segments, seg_ix);
-                        tile_seg.origin = lp0;
-                        tile_seg.delta = dp;
-                        var y_edge = 0.0;
-                        if !is_stroke {
-                            y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx);
-                            if xymin.x < tile_x0 {
-                                let p = vec2(tile_x0, y_edge);
-                                if dp.x < 0.0 {
-                                    tile_seg.delta = p - lp0;
-                                } else {
-                                    tile_seg.origin = p;
-                                    tile_seg.delta = lp1 - p;
-                                }
-                                if tile_seg.delta.x == 0.0 {
-                                    tile_seg.delta.x = sign(dp.x) * 1e-9;
-                                }
-                            }
-                            if x <= min_xray || max_xray < x {
-                                y_edge = 1e9;
-                            }
-                        }
-                        tile_seg.y_edge = y_edge;
-                        tile_seg.next = old;
-                        segments[seg_ix] = tile_seg;
-                    }
-                    xc += b;
-                    base += stride;
-                    xray = next_xray;
-                }
-                n_out += 1u;
-                val_target += v_step;
-                lp0 = lp1;
-            }
-            val_sum += params.val;
-            qp0 = qp2;
-        }
-    }
-}
diff --git a/shader/path_count.wgsl b/shader/path_count.wgsl
new file mode 100644
index 0000000..3cf4e4c
--- /dev/null
+++ b/shader/path_count.wgsl
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Stage to compute counts of number of segments in each tile
+
+#import bump
+#import config
+#import segment
+#import tile
+
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+// TODO: this is cut'n'pasted from path_coarse.
+struct AtomicTile {
+    backdrop: atomic<i32>,
+    // This was the "segments" field of Tile; renamed to "count" for this stage
+    count: atomic<u32>,
+}
+
+@group(0) @binding(1)
+var<storage, read_write> bump: BumpAllocators;
+
+@group(0) @binding(2)
+var<storage> lines: array<LineSoup>;
+
+@group(0) @binding(3)
+var<storage> paths: array<Path>;
+
+@group(0) @binding(4)
+var<storage, read_write> tile: array<AtomicTile>;
+
+@group(0) @binding(5)
+var<storage, read_write> seg_counts: array<SegmentCount>;
+
+// number of integer cells spanned by interval defined by a, b
+fn span(a: f32, b: f32) -> u32 {
+    return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
+}
+
+// Note regarding clipping to bounding box:
+//
+// We have to do the backdrop bumps for all tiles to the left of the bbox.
+// This should probably be a separate loop. This also causes a numerical
+// robustness issue.
+
+// This shader is dispatched with one thread for each line.
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+) {
+    let n_lines = atomicLoad(&bump.lines);
+    var count = 0u;
+    if global_id.x < n_lines {
+        let line = lines[global_id.x];
+        // coarse rasterization logic to count number of tiles touched by line
+        let is_down = line.p1.y >= line.p0.y;
+        let xy0 = select(line.p1, line.p0, is_down);
+        let xy1 = select(line.p0, line.p1, is_down);
+        let s0 = xy0 * TILE_SCALE;
+        let s1 = xy1 * TILE_SCALE;
+        // TODO: clip line to bounding box
+        count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
+        let line_ix = global_id.x;
+
+        let dx = abs(s1.x - s0.x);
+        let dy = s1.y - s0.y;
+        if dx + dy == 0.0 {
+            // Zero-length segment, drop it. Note, this should be culled earlier.
+            return;
+        }
+        if dy == 0.0 && floor(s0.y) == s0.y {
+            return;
+        }
+        let idxdy = 1.0 / (dx + dy);
+        let a = dx * idxdy;
+        let is_positive_slope = s1.x >= s0.x;
+        let sign = select(-1.0, 1.0, is_positive_slope);
+        let xt0 = floor(s0.x * sign);
+        let c = s0.x * sign - xt0;
+        let y0 = floor(s0.y);
+        let ytop = select(y0 + 1.0, ceil(s0.y), s0.y == s1.y);
+        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let x0 = xt0 * sign + select(-1.0, 0.0, is_positive_slope);
+
+        let path = paths[line.path_ix];
+        let bbox = vec4<i32>(path.bbox);
+        let xmin = min(s0.x, s1.x);
+        // If line is to left of bbox, we may still need to do backdrop
+        let stride = bbox.z - bbox.x;
+        if s0.y >= f32(bbox.w) || s1.y <= f32(bbox.y) || xmin >= f32(bbox.z) || stride == 0 {
+            return;
+        }
+        // Clip line to bounding box. Clipping is done in "i" space.
+        var imin = 0u;
+        if s0.y < f32(bbox.y) {
+            var iminf = round((f32(bbox.y) - y0 + b - a) / (1.0 - a)) - 1.0;
+            // Numerical robustness: goal is to find the first i value for which
+            // the following predicate is false. Above formula is designed to
+            // undershoot by 0.5.
+            if y0 + iminf - floor(a * iminf + b) < f32(bbox.y) {
+                iminf += 1.0;
+            }
+            imin = u32(iminf);
+        }
+        var imax = count;
+        if s1.y > f32(bbox.w) {
+            var imaxf = round((f32(bbox.w) - y0 + b - a) / (1.0 - a)) - 1.0;
+            if y0 + imaxf - floor(a * imaxf + b) < f32(bbox.w) {
+                imaxf += 1.0;
+            }
+            imax = u32(imaxf);
+        }
+        let delta = select(1, -1, is_down);
+        var ymin = 0;
+        var ymax = 0;
+        if max(s0.x, s1.x) <= f32(bbox.x) {
+            ymin = i32(ceil(s0.y));
+            ymax = i32(ceil(s1.y));
+            imax = imin;
+        } else {
+            let fudge = select(1.0, 0.0, is_positive_slope);
+            if xmin < f32(bbox.x) {
+                var f = round((sign * (f32(bbox.x) - x0) - b + fudge) / a);
+                if (x0 + sign * floor(a * f + b) < f32(bbox.x)) == is_positive_slope {
+                    f += 1.0;
+                }
+                let ynext = i32(y0 + f - floor(a * f + b) + 1.0);
+                if is_positive_slope {
+                    if u32(f) > imin {
+                        ymin = i32(y0 + select(1.0, 0.0, y0 == s0.y));
+                        ymax = ynext;
+                        imin = u32(f);
+                    }
+                } else {
+                    if u32(f) < imax {
+                        ymin = ynext;
+                        ymax = i32(ceil(s1.y));
+                        imax = u32(f);
+                    }
+                }
+            }
+            if max(s0.x, s1.x) > f32(bbox.z) {
+                var f = round((sign * (f32(bbox.z) - x0) - b + fudge) / a);
+                if (x0 + sign * floor(a * f + b) < f32(bbox.z)) == is_positive_slope {
+                    f += 1.0;
+                }
+                if is_positive_slope {
+                    imax = min(imax, u32(f));
+                } else {
+                    imin = max(imin, u32(f));
+                }
+            }
+        }
+        imax = max(imin, imax);
+        // Apply backdrop for part of line left of bbox
+        ymin = max(ymin, bbox.y);
+        ymax = min(ymax, bbox.w);
+        for (var y = ymin; y < ymax; y++) {
+            let base = i32(path.tiles) + (y - bbox.y) * stride;
+            atomicAdd(&tile[base].backdrop, delta);
+        }
+        var last_z = floor(a * (f32(imin) - 1.0) + b);
+        let seg_base = atomicAdd(&bump.seg_counts, imax - imin);
+        for (var i = imin; i < imax; i++) {
+            let subix = i;
+            // coarse rasterization logic
+            // Note: we hope fast-math doesn't strength reduce this.
+            let zf = a * f32(subix) + b;
+            let z = floor(zf);
+            // x, y are tile coordinates relative to render target
+            let y = i32(y0 + f32(subix) - z);
+            let x = i32(x0 + sign * z);
+            let base = i32(path.tiles) + (y - bbox.y) * stride - bbox.x;
+            let top_edge = select(last_z == z, y0 == s0.y, subix == 0u);
+            if top_edge && x + 1 < bbox.z {
+                let x_bump = max(x + 1, bbox.x);
+                atomicAdd(&tile[base + x_bump].backdrop, delta);
+            }
+            let seg_within_slice = atomicAdd(&tile[base + x].count, 1u);
+            // Pack two count values into a single u32
+            let counts = (seg_within_slice << 16u) | subix;
+            let seg_count = SegmentCount(line_ix, counts);
+            seg_counts[seg_base + i - imin] = seg_count;
+            // Note: since we're iterating, we have a reliable value for
+            // last_z.
+            last_z = z;
+        }
+    }
+}
diff --git a/shader/path_count_setup.wgsl b/shader/path_count_setup.wgsl
new file mode 100644
index 0000000..95af29c
--- /dev/null
+++ b/shader/path_count_setup.wgsl
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Set up dispatch size for path count stage.
+
+#import bump
+
+@group(0) @binding(0)
+var<storage> bump: BumpAllocators;
+
+@group(0) @binding(1)
+var<storage, read_write> indirect: IndirectCount;
+
+// Partition size for path count stage
+let WG_SIZE = 256u;
+
+@compute @workgroup_size(1)
+fn main() {
+    let lines = atomicLoad(&bump.lines);
+    indirect.count_x = (lines + (WG_SIZE - 1u)) / WG_SIZE;
+    indirect.count_y = 1u;
+    indirect.count_z = 1u;
+}
diff --git a/shader/path_tiling.wgsl b/shader/path_tiling.wgsl
new file mode 100644
index 0000000..444974e
--- /dev/null
+++ b/shader/path_tiling.wgsl
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Write path segments
+
+#import bump
+#import config
+#import segment
+#import tile
+
+@group(0) @binding(0)
+var<storage> bump: BumpAllocators;
+
+@group(0) @binding(1)
+var<storage> seg_counts: array<SegmentCount>;
+
+@group(0) @binding(2)
+var<storage> lines: array<LineSoup>;
+
+@group(0) @binding(3)
+var<storage> paths: array<Path>;
+
+@group(0) @binding(4)
+var<storage> tiles: array<Tile>;
+
+@group(0) @binding(5)
+var<storage, read_write> segments: array<Segment>;
+
+fn span(a: f32, b: f32) -> u32 {
+    return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
+}
+
+// One invocation for each segment (line/tile intersection) to be written.
+// Total number of invocations = bump.seg_counts
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+) {
+    let n_segments = atomicLoad(&bump.seg_counts);
+    if global_id.x < n_segments {
+        let seg_count = seg_counts[global_id.x];
+        let line = lines[seg_count.line_ix];
+        let counts = seg_count.counts;
+        let seg_within_slice = counts >> 16u;
+        let seg_within_line = counts & 0xffffu;
+
+        // coarse rasterization logic
+        let is_down = line.p1.y >= line.p0.y;
+        var xy0 = select(line.p1, line.p0, is_down);
+        var xy1 = select(line.p0, line.p1, is_down);
+        let s0 = xy0 * TILE_SCALE;
+        let s1 = xy1 * TILE_SCALE;
+        let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
+        let dx = abs(s1.x - s0.x);
+        let dy = s1.y - s0.y;
+        let idxdy = 1.0 / (dx + dy);
+        let a = dx * idxdy;
+        let is_positive_slope = s1.x >= s0.x;
+        let sign = select(-1.0, 1.0, is_positive_slope);
+        let xt0 = floor(s0.x * sign);
+        let c = s0.x * sign - xt0;
+        let y0i = floor(s0.y);
+        let ytop = select(y0i + 1.0, ceil(s0.y), s0.y == s1.y);
+        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
+        let z = floor(a * f32(seg_within_line) + b);
+        let x = x0i + i32(sign * z);
+        let y = i32(y0i + f32(seg_within_line) - z);
+
+        let path = paths[line.path_ix];
+        let bbox = vec4<i32>(path.bbox);
+        let stride = bbox.z - bbox.x;
+        let tile_ix = i32(path.tiles) + (y - bbox.y) * stride + x - bbox.x;
+        let tile = tiles[tile_ix];
+        let seg_start = ~tile.segments;
+        if i32(seg_start) < 0 {
+            return;
+        }
+        let tile_xy = vec2(f32(x) * f32(TILE_WIDTH), f32(y) * f32(TILE_HEIGHT));
+        let tile_xy1 = tile_xy + vec2(f32(TILE_WIDTH), f32(TILE_HEIGHT));
+
+        if seg_within_line > 0u {
+            let z_prev = floor(a * (f32(seg_within_line) - 1.0) + b);
+            if z == z_prev {
+                // Top edge is clipped
+                var xt = xy0.x + (xy1.x - xy0.x) * (tile_xy.y - xy0.y) / (xy1.y - xy0.y);
+                // TODO: we want to switch to tile-relative coordinates
+                xt = clamp(xt, tile_xy.x + 1e-3, tile_xy1.x);
+                xy0 = vec2(xt, tile_xy.y);
+            } else {
+                // If is_positive_slope, left edge is clipped, otherwise right
+                let x_clip = select(tile_xy1.x, tile_xy.x, is_positive_slope);
+                var yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x);
+                yt = clamp(yt, tile_xy.y + 1e-3, tile_xy1.y);
+                xy0 = vec2(x_clip, yt);
+            }
+        }
+        if seg_within_line < count - 1u {
+            let z_next = floor(a * (f32(seg_within_line) + 1.0) + b);
+            if z == z_next {
+                // Bottom edge is clipped
+                var xt = xy0.x + (xy1.x - xy0.x) * (tile_xy1.y - xy0.y) / (xy1.y - xy0.y);
+                xt = clamp(xt, tile_xy.x + 1e-3, tile_xy1.x);
+                xy1 = vec2(xt, tile_xy1.y);
+            } else {
+                // If is_positive_slope, right edge is clipped, otherwise left
+                let x_clip = select(tile_xy.x, tile_xy1.x, is_positive_slope);
+                var yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x);
+                yt = clamp(yt, tile_xy.y + 1e-3, tile_xy1.y);
+                xy1 = vec2(x_clip, yt);
+            }
+        }
+        // See comments in CPU version of shader
+        var y_edge = 1e9;
+        if xy0.x == tile_xy.x {
+            y_edge = xy0.y;
+        } else if xy1.x == tile_xy.x {
+            y_edge = xy1.y;
+        }
+        if !is_down {
+            let tmp = xy0;
+            xy0 = xy1;
+            xy1 = tmp;
+        }
+        let segment = Segment(xy0, xy1 - xy0, y_edge);
+        segments[seg_start + seg_within_slice] = segment;
+    }
+}
diff --git a/shader/path_tiling_setup.wgsl b/shader/path_tiling_setup.wgsl
new file mode 100644
index 0000000..722be19
--- /dev/null
+++ b/shader/path_tiling_setup.wgsl
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Set up dispatch size for path tiling stage.
+
+#import bump
+
+@group(0) @binding(0)
+var<storage> bump: BumpAllocators;
+
+@group(0) @binding(1)
+var<storage, read_write> indirect: IndirectCount;
+
+// Partition size for path tiling stage
+let WG_SIZE = 256u;
+
+@compute @workgroup_size(1)
+fn main() {
+    let segments = atomicLoad(&bump.seg_counts);
+    indirect.count_x = (segments + (WG_SIZE - 1u)) / WG_SIZE;
+    indirect.count_y = 1u;
+    indirect.count_z = 1u;
+}
diff --git a/shader/pathseg.wgsl b/shader/pathseg.wgsl
deleted file mode 100644
index bf02419..0000000
--- a/shader/pathseg.wgsl
+++ /dev/null
@@ -1,194 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Path segment decoding for the full case.
-
-// In the simple case, path segments are decoded as part of the coarse
-// path rendering stage. In the full case, they are separated, as the
-// decoding process also generates bounding boxes, and those in turn are
-// used for tile allocation and clipping; actual coarse path rasterization
-// can't proceed until those are complete.
-
-// There's some duplication of the decoding code but we won't worry about
-// that just now. Perhaps it could be factored more nicely later.
-
-#import config
-#import pathtag
-#import cubic
-#import transform
-
-@group(0) @binding(0)
-var<uniform> config: Config;
-
-@group(0) @binding(1)
-var<storage> scene: array<u32>;
-
-@group(0) @binding(2)
-var<storage> tag_monoids: array<TagMonoid>;
-
-struct AtomicPathBbox {
-    x0: atomic<i32>,
-    y0: atomic<i32>,
-    x1: atomic<i32>,
-    y1: atomic<i32>,
-    linewidth: f32,
-    trans_ix: u32,
-}
-
-@group(0) @binding(3)
-var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
-
-@group(0) @binding(4)
-var<storage, read_write> cubics: array<Cubic>;
-
-// Monoid is yagni, for future optimization
-
-// struct BboxMonoid {
-//     bbox: vec4<f32>,
-//     flags: u32,
-// }
-
-// let FLAG_RESET_BBOX = 1u;
-// let FLAG_SET_BBOX = 2u;
-
-// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
-//     var c: BboxMonoid;
-//     c.bbox = b.bbox;
-//     // TODO: previous-me thought this should be gated on b & SET_BBOX == false also
-//     if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y {
-//         c.bbox = a.bbox;
-//     } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u ||
-//         (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)
-//     {
-//         c.bbox = vec4<f32>(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw));
-//     }
-//     c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
-//     c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u;
-//     return c;
-// }
-
-// fn bbox_monoid_identity() -> BboxMonoid {
-//     return BboxMonoid();
-// }
-
-var<private> pathdata_base: u32;
-
-fn read_f32_point(ix: u32) -> vec2<f32> {
-    let x = bitcast<f32>(scene[pathdata_base + ix]);
-    let y = bitcast<f32>(scene[pathdata_base + ix + 1u]);
-    return vec2(x, y);
-}
-
-fn read_i16_point(ix: u32) -> vec2<f32> {
-    let raw = scene[pathdata_base + ix];
-    let x = f32(i32(raw << 16u) >> 16u);
-    let y = f32(i32(raw) >> 16u);
-    return vec2(x, y);
-}
-
-fn read_transform(transform_base: u32, ix: u32) -> Transform {
-    let base = transform_base + ix * 6u;
-    let c0 = bitcast<f32>(scene[base]);
-    let c1 = bitcast<f32>(scene[base + 1u]);
-    let c2 = bitcast<f32>(scene[base + 2u]);
-    let c3 = bitcast<f32>(scene[base + 3u]);
-    let c4 = bitcast<f32>(scene[base + 4u]);
-    let c5 = bitcast<f32>(scene[base + 5u]);
-    let matrx = vec4(c0, c1, c2, c3);
-    let translate = vec2(c4, c5);
-    return Transform(matrx, translate);
-}
-
-fn round_down(x: f32) -> i32 {
-    return i32(floor(x));
-}
-
-fn round_up(x: f32) -> i32 {
-    return i32(ceil(x));
-}
-
-@compute @workgroup_size(256)
-fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
-    @builtin(local_invocation_id) local_id: vec3<u32>,
-) {
-    let ix = global_id.x;
-    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
-    pathdata_base = config.pathdata_base;
-    let shift = (ix & 3u) * 8u;
-    var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
-    tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);
-    var tag_byte = (tag_word >> shift) & 0xffu;
-
-    let out = &path_bboxes[tm.path_ix];
-    let linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]);
-    if (tag_byte & PATH_TAG_PATH) != 0u {
-        (*out).linewidth = linewidth;
-        (*out).trans_ix = tm.trans_ix;
-    }
-    // Decode path data
-    let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
-    if seg_type != 0u {
-        var p0: vec2<f32>;
-        var p1: vec2<f32>;
-        var p2: vec2<f32>;
-        var p3: vec2<f32>;
-        if (tag_byte & PATH_TAG_F32) != 0u {
-            p0 = read_f32_point(tm.pathseg_offset);
-            p1 = read_f32_point(tm.pathseg_offset + 2u);
-            if seg_type >= PATH_TAG_QUADTO {
-                p2 = read_f32_point(tm.pathseg_offset + 4u);
-                if seg_type == PATH_TAG_CUBICTO {
-                    p3 = read_f32_point(tm.pathseg_offset + 6u);
-                }
-            }
-        } else {
-            p0 = read_i16_point(tm.pathseg_offset);
-            p1 = read_i16_point(tm.pathseg_offset + 1u);
-            if seg_type >= PATH_TAG_QUADTO {
-                p2 = read_i16_point(tm.pathseg_offset + 2u);
-                if seg_type == PATH_TAG_CUBICTO {
-                    p3 = read_i16_point(tm.pathseg_offset + 3u);
-                }
-            }
-        }
-        let transform = read_transform(config.transform_base, tm.trans_ix);
-        p0 = transform_apply(transform, p0);
-        p1 = transform_apply(transform, p1);
-        var bbox = vec4(min(p0, p1), max(p0, p1));
-        // Degree-raise
-        if seg_type == PATH_TAG_LINETO {
-            p3 = p1;
-            p2 = mix(p3, p0, 1.0 / 3.0);
-            p1 = mix(p0, p3, 1.0 / 3.0);
-        } else if seg_type >= PATH_TAG_QUADTO {
-            p2 = transform_apply(transform, p2);
-            bbox = vec4(min(bbox.xy, p2), max(bbox.zw, p2));
-            if seg_type == PATH_TAG_CUBICTO {
-                p3 = transform_apply(transform, p3);
-                bbox = vec4(min(bbox.xy, p3), max(bbox.zw, p3));
-            } else {
-                p3 = p2;
-                p2 = mix(p1, p2, 1.0 / 3.0);
-                p1 = mix(p1, p0, 1.0 / 3.0);
-            }
-        }
-        var stroke = vec2(0.0, 0.0);
-        if linewidth >= 0.0 {
-            // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
-            // This is the correct bounding box, but we're not handling rendering
-            // in the isotropic case, so it may mismatch.
-            stroke = 0.5 * linewidth * vec2(length(transform.matrx.xz), length(transform.matrx.yw));
-            bbox += vec4(-stroke, stroke);
-        }
-        let flags = u32(linewidth >= 0.0);
-        cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags);
-        // Update bounding box using atomics only. Computing a monoid is a
-        // potential future optimization.
-        if bbox.z > bbox.x || bbox.w > bbox.y {
-            atomicMin(&(*out).x0, round_down(bbox.x));
-            atomicMin(&(*out).y0, round_down(bbox.y));
-            atomicMax(&(*out).x1, round_up(bbox.z));
-            atomicMax(&(*out).y1, round_up(bbox.w));
-        }
-    }
-}
diff --git a/shader/shared/bump.wgsl b/shader/shared/bump.wgsl
index 4864f31..6c5789f 100644
--- a/shader/shared/bump.wgsl
+++ b/shader/shared/bump.wgsl
@@ -6,13 +6,21 @@
 let STAGE_PATH_COARSE: u32 = 0x4u;
 let STAGE_COARSE: u32 = 0x8u;
 
-// This must be kept in sync with the struct in src/render.rs
+// This must be kept in sync with the struct in config.rs in the encoding crate.
 struct BumpAllocators {
     // Bitmask of stages that have failed allocation.
     failed: atomic<u32>,
     binning: atomic<u32>,
     ptcl: atomic<u32>,
     tile: atomic<u32>,
+    seg_counts: atomic<u32>,
     segments: atomic<u32>,
     blend: atomic<u32>,
+    lines: atomic<u32>,
+}
+
+struct IndirectCount {
+    count_x: u32,
+    count_y: u32,
+    count_z: u32,
 }
diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl
index 4ac718c..1e553f9 100644
--- a/shader/shared/config.wgsl
+++ b/shader/shared/config.wgsl
@@ -48,6 +48,9 @@
 //let N_TILE = N_TILE_X * N_TILE_Y;
 let N_TILE = 256u;
 
+// Not currently supporting non-square tiles
+let TILE_SCALE = 0.0625;
+
 let BLEND_STACK_SPLIT = 4u;
 
 // The following are computed in draw_leaf from the generic gradient parameters
diff --git a/shader/shared/ptcl.wgsl b/shader/shared/ptcl.wgsl
index d5fc5d4..cf759a5 100644
--- a/shader/shared/ptcl.wgsl
+++ b/shader/shared/ptcl.wgsl
@@ -25,7 +25,8 @@
 // hand in the relevant shaders
 
 struct CmdFill {
-    tile: u32,
+    size_and_rule: u32,
+    seg_data: u32,
     backdrop: i32,
 }
 
diff --git a/shader/shared/segment.wgsl b/shader/shared/segment.wgsl
index 7d03592..9fe1344 100644
--- a/shader/shared/segment.wgsl
+++ b/shader/shared/segment.wgsl
@@ -1,8 +1,33 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
+// Segments laid out for contiguous storage
 struct Segment {
     origin: vec2<f32>,
     delta: vec2<f32>,
     y_edge: f32,
-    next: u32,
+}
+
+// A line segment produced by flattening and ready for rasterization.
+//
+// The name is perhaps too playful, but reflects the fact that these
+// lines are completely unordered. They will flow through coarse path
+// rasterization, then the per-tile segments will be scatter-written into
+// segment storage so that each (tile, path) tuple gets a contiguous
+// slice of segments.
+struct LineSoup {
+    path_ix: u32,
+    // Note: a u32 followed by vec2<f32> creates a 4-byte alignment gap.
+    // Don't worry about this now, but maybe move to scalars.
+    p0: vec2<f32>,
+    p1: vec2<f32>,
+}
+
+// An intermediate data structure for sorting tile segments.
+struct SegmentCount {
+    // Reference to element of LineSoup array
+    line_ix: u32,
+    // Two count values packed into a single u32
+    // Lower 16 bits: index of segment within line
+    // Upper 16 bits: index of segment within segment slice
+    counts: u32,
 }
diff --git a/src/lib.rs b/src/lib.rs
index f45bdcf..55147ac 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -251,7 +251,9 @@
     ) -> Result<Option<BumpAllocators>> {
         let mut render = Render::new();
         let encoding = scene.data();
-        let recording = render.render_encoding_coarse(encoding, &self.shaders, params, true);
+        // TODO: enable robust dynamic memory; the bump-buffer download it requires currently conflicts with CPU dispatch
+        let robust = false;
+        let recording = render.render_encoding_coarse(encoding, &self.shaders, params, robust);
         let target = render.out_image();
         let bump_buf = render.bump_buf();
         self.engine.run_recording(
diff --git a/src/render.rs b/src/render.rs
index 03e6484..4625636 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -190,17 +190,21 @@
             wg_counts.bbox_clear,
             [config_buf, path_bbox_buf],
         );
-        let cubic_buf =
-            ResourceProxy::new_buf(buffer_sizes.cubics.size_in_bytes().into(), "cubic_buf");
+        let bump_buf = BufProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf");
+        recording.clear_all(bump_buf);
+        let bump_buf = ResourceProxy::Buf(bump_buf);
+        let lines_buf =
+            ResourceProxy::new_buf(buffer_sizes.lines.size_in_bytes().into(), "lines_buf");
         recording.dispatch(
-            shaders.pathseg,
-            wg_counts.path_seg,
+            shaders.flatten,
+            wg_counts.flatten,
             [
                 config_buf,
                 scene_buf,
                 tagmonoid_buf,
                 path_bbox_buf,
-                cubic_buf,
+                bump_buf,
+                lines_buf,
             ],
         );
         let draw_reduced_buf = ResourceProxy::new_buf(
@@ -273,13 +277,10 @@
             buffer_sizes.draw_bboxes.size_in_bytes().into(),
             "draw_bbox_buf",
         );
-        let bump_buf = BufProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf");
         let bin_header_buf = ResourceProxy::new_buf(
             buffer_sizes.bin_headers.size_in_bytes().into(),
             "bin_header_buf",
         );
-        recording.clear_all(bump_buf);
-        let bump_buf = ResourceProxy::Buf(bump_buf);
         recording.dispatch(
             shaders.binning,
             wg_counts.binning,
@@ -314,21 +315,33 @@
             ],
         );
         recording.free_resource(draw_bbox_buf);
+        recording.free_resource(tagmonoid_buf);
+        let indirect_count_buf = BufProxy::new(
+            buffer_sizes.indirect_count.size_in_bytes().into(),
+            "indirect_count",
+        );
         recording.dispatch(
-            shaders.path_coarse,
-            wg_counts.path_coarse,
+            shaders.path_count_setup,
+            (1, 1, 1),
+            [bump_buf, indirect_count_buf.into()],
+        );
+        let seg_counts_buf = ResourceProxy::new_buf(
+            buffer_sizes.seg_counts.size_in_bytes().into(),
+            "seg_counts_buf",
+        );
+        recording.dispatch_indirect(
+            shaders.path_count,
+            indirect_count_buf,
+            0,
             [
                 config_buf,
-                scene_buf,
-                cubic_buf,
-                path_buf,
                 bump_buf,
+                lines_buf,
+                path_buf,
                 tile_buf,
-                segments_buf,
+                seg_counts_buf,
             ],
         );
-        recording.free_resource(tagmonoid_buf);
-        recording.free_resource(cubic_buf);
         recording.dispatch(
             shaders.backdrop,
             wg_counts.backdrop,
@@ -349,6 +362,27 @@
                 ptcl_buf,
             ],
         );
+        recording.dispatch(
+            shaders.path_tiling_setup,
+            (1, 1, 1),
+            [bump_buf, indirect_count_buf.into()],
+        );
+        recording.dispatch_indirect(
+            shaders.path_tiling,
+            indirect_count_buf,
+            0,
+            [
+                bump_buf,
+                seg_counts_buf,
+                lines_buf,
+                path_buf,
+                tile_buf,
+                segments_buf,
+            ],
+        );
+        recording.free_buf(indirect_count_buf);
+        recording.free_resource(seg_counts_buf);
+        recording.free_resource(lines_buf);
         recording.free_resource(scene_buf);
         recording.free_resource(draw_monoid_buf);
         recording.free_resource(bin_header_buf);
diff --git a/src/shaders.rs b/src/shaders.rs
index b55f158..23a3950 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -65,16 +65,19 @@
     pub pathtag_scan: ShaderId,
     pub pathtag_scan_large: ShaderId,
     pub bbox_clear: ShaderId,
-    pub pathseg: ShaderId,
+    pub flatten: ShaderId,
     pub draw_reduce: ShaderId,
     pub draw_leaf: ShaderId,
     pub clip_reduce: ShaderId,
     pub clip_leaf: ShaderId,
     pub binning: ShaderId,
     pub tile_alloc: ShaderId,
-    pub path_coarse: ShaderId,
     pub backdrop: ShaderId,
+    pub path_count_setup: ShaderId,
+    pub path_count: ShaderId,
     pub coarse: ShaderId,
+    pub path_tiling_setup: ShaderId,
+    pub path_tiling: ShaderId,
     pub fine: ShaderId,
 }
 
@@ -147,16 +150,17 @@
         preprocess::preprocess(shader!("bbox_clear"), &empty, &imports).into(),
         &[BindType::Uniform, BindType::Buffer],
     )?;
-    let pathseg = engine.add_shader(
+    let flatten = engine.add_shader(
         device,
-        "pathseg",
-        preprocess::preprocess(shader!("pathseg"), &full_config, &imports).into(),
+        "flatten",
+        preprocess::preprocess(shader!("flatten"), &full_config, &imports).into(),
         &[
             BindType::Uniform,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
+            BindType::Buffer,
         ],
     )?;
     let draw_reduce = engine.add_shader(
@@ -232,17 +236,21 @@
             BindType::Buffer,
         ],
     )?;
-
-    let path_coarse = engine.add_shader(
+    let path_count_setup = engine.add_shader(
         device,
-        "path_coarse_full",
-        preprocess::preprocess(shader!("path_coarse_full"), &full_config, &imports).into(),
+        "path_count_setup",
+        preprocess::preprocess(shader!("path_count_setup"), &empty, &imports).into(),
+        &[BindType::BufReadOnly, BindType::Buffer],
+    )?;
+    let path_count = engine.add_shader(
+        device,
+        "path_count",
+        preprocess::preprocess(shader!("path_count"), &full_config, &imports).into(),
         &[
             BindType::Uniform,
-            BindType::BufReadOnly,
-            BindType::BufReadOnly,
-            BindType::BufReadOnly,
             BindType::Buffer,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
         ],
@@ -264,9 +272,28 @@
             BindType::BufReadOnly,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
-            BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let path_tiling_setup = engine.add_shader(
+        device,
+        "path_tiling_setup",
+        preprocess::preprocess(shader!("path_tiling_setup"), &empty, &imports).into(),
+        &[BindType::BufReadOnly, BindType::Buffer],
+    )?;
+    let path_tiling = engine.add_shader(
+        device,
+        "path_tiling",
+        preprocess::preprocess(shader!("path_tiling"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
         ],
     )?;
     let fine = engine.add_shader(
@@ -290,16 +317,19 @@
         pathtag_scan1,
         pathtag_scan_large,
         bbox_clear,
-        pathseg,
+        flatten,
         draw_reduce,
         draw_leaf,
         clip_reduce,
         clip_leaf,
         binning,
         tile_alloc,
-        path_coarse,
+        path_count_setup,
+        path_count,
         backdrop,
         coarse,
+        path_tiling_setup,
+        path_tiling,
         fine,
     })
 }