First cut at bbox improvement

This replaces the bbox_clear pass and the atomic min/max updates in pathseg with a workgroup-level monoid scan in pathseg plus a new bbox_fixup pass that folds the per-workgroup partial bboxes back into path_bboxes. It works, but is currently limited to 64k path tags: bbox_fixup is dispatched as a single 256-thread workgroup, with each thread covering one 256-tag pathseg workgroup.
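
For review, here is a plain-Rust mirror of the combine operation added in shader/shared/bbox_monoid.wgsl. It is an illustrative sketch, not part of the patch: the names and layout follow the WGSL, an empty bbox is encoded as x1 <= x0 && y1 <= y0 as in the shader, and the small main() is only a hypothetical spot check.

```rust
// CPU mirror of combine_bbox_monoid from shader/shared/bbox_monoid.wgsl,
// handy for host-side sanity checks of the scan. Sketch only.

const FLAG_RESET_BBOX: u32 = 1;
const FLAG_SET_BBOX: u32 = 2;

#[derive(Clone, Copy, Debug, Default, PartialEq)]
struct BboxMonoid {
    /// [x0, y0, x1, y1]; the bbox is "empty" when x1 <= x0 && y1 <= y0.
    bbox: [f32; 4],
    flags: u32,
}

fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
    let mut bbox = b.bbox;
    // Only carry a's bbox across the join when no path boundary separates
    // them: SET on b means b's range already saw a boundary, RESET on a
    // means a itself is a boundary element.
    if b.flags & FLAG_SET_BBOX == 0 && a.flags & FLAG_RESET_BBOX == 0 {
        if bbox[2] <= bbox[0] && bbox[3] <= bbox[1] {
            // b is empty; keep a's bbox.
            bbox = a.bbox;
        } else if a.bbox[2] > a.bbox[0] || a.bbox[3] > a.bbox[1] {
            // Both non-empty; take the union.
            bbox = [
                a.bbox[0].min(bbox[0]),
                a.bbox[1].min(bbox[1]),
                a.bbox[2].max(bbox[2]),
                a.bbox[3].max(bbox[3]),
            ];
        }
    }
    // Propagate boundary info: a RESET on the left is seen as SET from
    // further right; b's own flags are kept.
    let flags = ((a.flags | (a.flags << 1)) & FLAG_SET_BBOX) | b.flags;
    BboxMonoid { bbox, flags }
}

fn main() {
    // Two non-empty bboxes in the same path combine to their union.
    let a = BboxMonoid { bbox: [0.0, 0.0, 2.0, 2.0], flags: 0 };
    let b = BboxMonoid { bbox: [1.0, 1.0, 3.0, 4.0], flags: 0 };
    assert_eq!(combine_bbox_monoid(a, b).bbox, [0.0, 0.0, 3.0, 4.0]);
}
```

Folding per-tag monoids with this on the CPU and comparing against the GPU path_bboxes output is a cheap way to cross-check the two-level scan.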
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 883465a..6deaef7 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,7 @@
 {
   "wgsl-analyzer.customImports": {
     "bbox": "${workspaceFolder}/shader/shared/bbox.wgsl",
+    "bbox_monoid": "${workspaceFolder}/shader/shared/bbox_monoid.wgsl",
     "blend": "${workspaceFolder}/shader/shared/blend.wgsl",
     "bump": "${workspaceFolder}/shader/shared/bump.wgsl",
     "clip": "${workspaceFolder}/shader/shared/clip.wgsl",
diff --git a/shader/bbox_clear.wgsl b/shader/bbox_clear.wgsl
deleted file mode 100644
index fe8cceb..0000000
--- a/shader/bbox_clear.wgsl
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-#import config
-
-@group(0) @binding(0)
-var<uniform> config: Config;
-
-struct PathBbox {
-    x0: i32,
-    y0: i32,
-    x1: i32,
-    y1: i32,
-    linewidth: f32,
-    trans_ix: u32,
-}
-
-@group(0) @binding(1)
-var<storage, read_write> path_bboxes: array<PathBbox>;
-
-@compute @workgroup_size(256)
-fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
-) {
-    let ix = global_id.x;
-    if ix < config.n_path {
-        path_bboxes[ix].x0 = 0x7fffffff;
-        path_bboxes[ix].y0 = 0x7fffffff;
-        path_bboxes[ix].x1 = -0x80000000;
-        path_bboxes[ix].y1 = -0x80000000;
-    }
-}
diff --git a/shader/bbox_fixup.wgsl b/shader/bbox_fixup.wgsl
new file mode 100644
index 0000000..d336466
--- /dev/null
+++ b/shader/bbox_fixup.wgsl
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+#import config
+#import pathtag
+#import bbox
+#import bbox_monoid
+
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+@group(0) @binding(1)
+var<storage> tag_monoids: array<TagMonoid>;
+
+@group(0) @binding(2)
+var<storage> bbox_reduced: array<BboxMonoid>;
+
+@group(0) @binding(3)
+var<storage, read_write> path_bboxes: array<PathBbox>;
+
+let WG_SIZE = 256u;
+var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>;
+
+fn round_down(x: f32) -> i32 {
+    return i32(floor(x));
+}
+
+fn round_up(x: f32) -> i32 {
+    return i32(ceil(x));
+}
+
+// In the configuration with <= 64k pathtags, there's only one
+// workgroup here, so the distinction between global and local is
+// not meaningful. But we'll probably want to #ifdef a larger
+// configuration, in which we also bind a doubly reduced buffer.
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    var agg: BboxMonoid;
+    if global_id.x * WG_SIZE < config.n_pathtag {
+        agg = bbox_reduced[global_id.x];
+    }
+    sh_bbox[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
+        workgroupBarrier();
+        if local_id.x >= 1u << i {
+            let other = sh_bbox[local_id.x - (1u << i)];
+            agg = combine_bbox_monoid(other, agg);
+        }
+        workgroupBarrier();
+        sh_bbox[local_id.x] = agg;
+    }
+    // Explanation of this trick: the first workgroup's bboxes never need fixing
+    // up, so by offsetting the index by one we can reuse the inclusive scan.
+    let ix = global_id.x + 1u;
+    if ix * WG_SIZE < config.n_pathtag {
+        // Path index at the first tag handled by this workgroup.
+        let path_ix = tag_monoids[ix * (WG_SIZE / 4u)].path_ix;
+        if (agg.flags & FLAG_RESET_BBOX) == 0u && (agg.bbox.z > agg.bbox.x || agg.bbox.w > agg.bbox.y) {
+            let out = &path_bboxes[path_ix];
+            // TODO: casting goes away
+            var bbox = vec4(f32((*out).x0), f32((*out).y0), f32((*out).x1), f32((*out).y1));
+            if bbox.z > bbox.x || bbox.w > bbox.y {
+                bbox = vec4(min(agg.bbox.xy, bbox.xy), max(agg.bbox.zw, bbox.zw));
+            } else {
+                bbox = agg.bbox;
+            }
+            (*out).x0 = round_down(bbox.x);
+            (*out).y0 = round_down(bbox.y);
+            (*out).x1 = round_up(bbox.z);
+            (*out).y1 = round_up(bbox.w);
+        }
+    }
+}
diff --git a/shader/pathseg.wgsl b/shader/pathseg.wgsl
index ec059ab..a8f32f2 100644
--- a/shader/pathseg.wgsl
+++ b/shader/pathseg.wgsl
@@ -11,6 +11,8 @@
 // There's some duplication of the decoding code but we won't worry about
 // that just now. Perhaps it could be factored more nicely later.
 
+#import bbox
+#import bbox_monoid
 #import config
 #import pathtag
 #import cubic
@@ -24,51 +26,14 @@
 @group(0) @binding(2)
 var<storage> tag_monoids: array<TagMonoid>;
 
-struct AtomicPathBbox {
-    x0: atomic<i32>,
-    y0: atomic<i32>,
-    x1: atomic<i32>,
-    y1: atomic<i32>,
-    linewidth: f32,
-    trans_ix: u32,
-}
-
 @group(0) @binding(3)
-var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
-
+var<storage, read_write> path_bboxes: array<PathBbox>;
 
 @group(0) @binding(4)
 var<storage, read_write> cubics: array<Cubic>;
 
-// Monoid is yagni, for future optimization
-
-// struct BboxMonoid {
-//     bbox: vec4<f32>,
-//     flags: u32,
-// }
-
-// let FLAG_RESET_BBOX = 1u;
-// let FLAG_SET_BBOX = 2u;
-
-// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
-//     var c: BboxMonoid;
-//     c.bbox = b.bbox;
-//     // TODO: previous-me thought this should be gated on b & SET_BBOX == false also
-//     if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y {
-//         c.bbox = a.bbox;
-//     } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u ||
-//         (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)
-//     {
-//         c.bbox = vec4<f32>(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw));
-//     }
-//     c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
-//     c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u;
-//     return c;
-// }
-
-// fn bbox_monoid_identity() -> BboxMonoid {
-//     return BboxMonoid();
-// }
+@group(0) @binding(5)
+var<storage, read_write> bbox_reduced: array<BboxMonoid>;
 
 var<private> pathdata_base: u32;
 
@@ -115,10 +80,14 @@
     return i32(ceil(x));
 }
 
+let WG_SIZE = 256u;
+var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>;
+
 @compute @workgroup_size(256)
 fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>,
     @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
     let ix = global_id.x;
     let tag_word = scene[config.pathtag_base + (ix >> 2u)];
@@ -130,12 +99,10 @@
 
     let out = &path_bboxes[tm.path_ix];
     let linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]);
-    if (tag_byte & PATH_TAG_PATH) != 0u {
-        (*out).linewidth = linewidth;
-        (*out).trans_ix = tm.trans_ix;
-    }
+    let bbox_flags = u32((tag_byte & PATH_TAG_PATH) != 0u);
     // Decode path data
     let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
+    var bbox: vec4<f32>;
     if seg_type != 0u {
         var p0: vec2<f32>;
         var p1: vec2<f32>;
@@ -163,7 +130,7 @@
         let transform = read_transform(config.transform_base, tm.trans_ix);
         p0 = transform_apply(transform, p0);
         p1 = transform_apply(transform, p1);
-        var bbox = vec4(min(p0, p1), max(p0, p1));
+        bbox = vec4(min(p0, p1), max(p0, p1));
         // Degree-raise
         if seg_type == PATH_TAG_LINETO {
             p3 = p1;
@@ -191,13 +158,29 @@
         }
         let flags = u32(linewidth >= 0.0);
         cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags);
-        // Update bounding box using atomics only. Computing a monoid is a
-        // potential future optimization.
-        if bbox.z > bbox.x || bbox.w > bbox.y {
-            atomicMin(&(*out).x0, round_down(bbox.x));
-            atomicMin(&(*out).y0, round_down(bbox.y));
-            atomicMax(&(*out).x1, round_up(bbox.z));
-            atomicMax(&(*out).y1, round_up(bbox.w));
+    }
+    var agg = BboxMonoid(bbox, bbox_flags);
+    sh_bbox[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
+        workgroupBarrier();
+        if local_id.x >= 1u << i {
+            let other = sh_bbox[local_id.x - (1u << i)];
+            agg = combine_bbox_monoid(other, agg);
         }
+        workgroupBarrier();
+        sh_bbox[local_id.x] = agg;
+    }
+    if local_id.x == WG_SIZE - 1u {
+        bbox_reduced[wg_id.x] = agg;
+    }
+    if bbox_flags != 0u {
+        let out = &path_bboxes[tm.path_ix];
+        // TODO: now that we're not atomic, don't need fixed-point
+        (*out).x0 = round_down(agg.bbox.x);
+        (*out).y0 = round_down(agg.bbox.y);
+        (*out).x1 = round_up(agg.bbox.z);
+        (*out).y1 = round_up(agg.bbox.w);
+        (*out).linewidth = linewidth;
+        (*out).trans_ix = tm.trans_ix;
     }
 }
diff --git a/shader/shared/bbox_monoid.wgsl b/shader/shared/bbox_monoid.wgsl
new file mode 100644
index 0000000..1a301d2
--- /dev/null
+++ b/shader/shared/bbox_monoid.wgsl
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+struct BboxMonoid {
+    bbox: vec4<f32>,
+    flags: u32,
+}
+
+let FLAG_RESET_BBOX = 1u;
+let FLAG_SET_BBOX = 2u;
+
+// Technically this is a semigroup with a left identity rather than a
+// true monoid, but that is good enough for our purposes.
+fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
+    var bbox = b.bbox;
+    if (b.flags & FLAG_SET_BBOX) == 0u && (a.flags & FLAG_RESET_BBOX) == 0u {
+        if bbox.z <= bbox.x && bbox.w <= bbox.y {
+            bbox = a.bbox;
+        } else if a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y {
+            bbox = vec4(min(a.bbox.xy, bbox.xy), max(a.bbox.zw, bbox.zw));
+        }
+    }
+    let flags = ((a.flags | (a.flags << 1u)) & FLAG_SET_BBOX) | b.flags;
+    return BboxMonoid(bbox, flags);
+}
+
diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl
index 0cb56d8..6ef9434 100644
--- a/shader/shared/config.wgsl
+++ b/shader/shared/config.wgsl
@@ -10,6 +10,7 @@
     n_drawobj: u32,
     n_path: u32,
     n_clip: u32,
+    n_pathtag: u32,
 
     // To reduce the number of bindings, info and bin data are combined
     // into one buffer.
diff --git a/src/encoding/packed.rs b/src/encoding/packed.rs
index 7004477..34f7aef 100644
--- a/src/encoding/packed.rs
+++ b/src/encoding/packed.rs
@@ -32,6 +32,8 @@
     pub n_paths: u32,
     /// Number of clips.
     pub n_clips: u32,
+    /// Number of path tags.
+    pub n_pathtag: u32,
     /// Start of binning data.
     pub bin_data_start: u32,
     /// Start of path tag stream.
@@ -139,13 +141,14 @@
         // Pack encoded data.
         let layout = &mut self.layout;
         *layout = Layout::default();
+        let n_path_tags = encoding.path_tags.len();
         layout.n_paths = encoding.n_paths;
         layout.n_draw_objects = encoding.n_paths;
         layout.n_clips = encoding.n_clips;
+        layout.n_pathtag = n_path_tags as u32;
         let data = &mut self.data;
         data.clear();
         // Path tag stream
-        let n_path_tags = encoding.path_tags.len();
         let path_tag_padded = align_up(n_path_tags, 4 * shaders::PATHTAG_REDUCE_WG);
         let capacity = path_tag_padded
             + slice_size_in_bytes(&encoding.path_data)
diff --git a/src/render.rs b/src/render.rs
index 0ee195a..fb5c253 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -23,7 +23,10 @@
 const DRAW_BBOX_SIZE: u64 = 16;
 const BUMP_SIZE: u64 = 16;
 const BIN_HEADER_SIZE: u64 = 8;
+const BBOX_MONOID_SIZE: u64 = 32;
 
+// Note: this is defined here as it's still used by the reduced pipeline,
+// but for the full pipeline, use the version in `encoding`.
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 struct Config {
@@ -34,6 +37,7 @@
     n_drawobj: u32,
     n_path: u32,
     n_clip: u32,
+    n_pathtag: u32,
     bin_data_start: u32,
     pathtag_base: u32,
     pathdata_base: u32,
@@ -202,14 +206,10 @@
     );
     let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
     let path_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE);
-    recording.dispatch(
-        shaders.bbox_clear,
-        (drawobj_wgs, 1, 1),
-        [config_buf, path_bbox_buf],
-    );
     let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE);
     let path_coarse_wgs =
         (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
+    let bbox_reduced_buf = ResourceProxy::new_buf(path_coarse_wgs as u64 * BBOX_MONOID_SIZE);
     recording.dispatch(
         shaders.pathseg,
         (path_coarse_wgs, 1, 1),
@@ -219,8 +219,14 @@
             tagmonoid_buf,
             path_bbox_buf,
             cubic_buf,
+            bbox_reduced_buf,
         ],
     );
+    recording.dispatch(
+        shaders.bbox_fixup,
+        (1, 1, 1),
+        [config_buf, tagmonoid_buf, bbox_reduced_buf, path_bbox_buf],
+    );
     let draw_reduced_buf = ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE);
     recording.dispatch(
         shaders.draw_reduce,
diff --git a/src/shaders.rs b/src/shaders.rs
index 1df6b14..9371d1a 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -48,8 +48,8 @@
 pub struct FullShaders {
     pub pathtag_reduce: ShaderId,
     pub pathtag_scan: ShaderId,
-    pub bbox_clear: ShaderId,
     pub pathseg: ShaderId,
+    pub bbox_fixup: ShaderId,
     pub draw_reduce: ShaderId,
     pub draw_leaf: ShaderId,
     pub clip_reduce: ShaderId,
@@ -144,11 +144,6 @@
             BindType::Buffer,
         ],
     )?;
-    let bbox_clear = engine.add_shader(
-        device,
-        preprocess::preprocess(shader!("bbox_clear"), &empty, &imports).into(),
-        &[BindType::Uniform, BindType::Buffer],
-    )?;
     let pathseg = engine.add_shader(
         device,
         preprocess::preprocess(shader!("pathseg"), &full_config, &imports).into(),
@@ -158,6 +153,17 @@
             BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let bbox_fixup = engine.add_shader(
+        device,
+        preprocess::preprocess(shader!("bbox_fixup"), &full_config, &imports).into(),
+        &[
+            BindType::Uniform,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
         ],
     )?;
     let draw_reduce = engine.add_shader(
@@ -279,8 +285,8 @@
     Ok(FullShaders {
         pathtag_reduce,
         pathtag_scan,
-        bbox_clear,
         pathseg,
+        bbox_fixup,
         draw_reduce,
         draw_leaf,
         clip_reduce,
@@ -305,6 +311,7 @@
 
 const SHARED_SHADERS: &[(&str, &str)] = &[
     shared_shader!("bbox"),
+    shared_shader!("bbox_monoid"),
     shared_shader!("blend"),
     shared_shader!("bump"),
     shared_shader!("clip"),