Checkpoint work on sparse strip rendering

WIP, at this point more of a code sketch. Will keep committing.
diff --git a/shader/merge.wgsl b/shader/merge.wgsl
new file mode 100644
index 0000000..8ac27cf
--- /dev/null
+++ b/shader/merge.wgsl
@@ -0,0 +1,131 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Merge tiles
+
+// TODO: lots of rework here
+struct Minitile {
+    path_ix: u32,
+    x: u32,
+    y: u32,
+    delta: i32,
+    // TODO: slope etc
+}
+
+@group(0) @binding(0)
+var<storage> input: array<Minitile>;
+
+struct MergeMonoid {
+    winding: i32,
+    n_strips: u32,
+    // this probably doesn't belong in reduced
+    start: u32,
+    start_x: u32,
+}
+
+@group(1) @binding(1)
+var<storage> reduced_mm: array<MergeMonoid>;
+
+fn combine_merge_monoid(a: MergeMonoid, b: MergeMonoid) -> MergeMonoid {
+    var c: MergeMonoid;
+    c.winding = a.winding + b.winding;
+    c.n_strips = a.n_strips + b.n_strips;
+    c.start = max(a.start, b.start);
+    return c;
+}
+
+fn mm_histogram(t: Minitile) -> u32 {
+    // TODO: get these from tile
+    let xmin = 0u;
+    let xmax = 4u;
+    let rshift = (4u - (xmax - xmin)) * 8u;
+    let lshift = xmin * 8u;
+    return (0x01010101u >> rshift) << lshift;
+}
+
+fn reduce_histo(histo: u32) -> u32 {
+    let tmp = (histo & 0xff00ffu) + ((histo >> 8u) & 0xff00ffu);
+    return (tmp >> 16u) + (tmp & 0xffffu);
+}
+
+const WG_SIZE = 256u;
+
+var<workgroup> sh_mm: array<MergeMonoid, WG_SIZE>;
+var<workgroup> sh_histo: array<u32, WG_SIZE>;
+var<workgroup> sh_prefix_cols: array<u32, WG_SIZE>;
+var<workgroup> sh_seg_end: array<u32, WG_SIZE>;
+var<workgroup> sh_inclusive_cols: array<u32, WG_SIZE>;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    // scan merge monoid
+    var first = false;
+    var first_x = false;
+    // predicate? or pad?
+    let tile = input[global_id.x];
+    if global_id.x != 0u {
+        let prev = input[global_id.x - 1u];
+        first = tile.path_ix != prev.path_ix || tile.y != prev.y;
+        first_x = first || tile.x != prev.x;
+    }
+    let winding = tile.delta;
+    let n_strips = u32(first);
+    let start = select(0u, local_id.x, first);
+    let start_x = select(0u, local_id.x, first_x);
+    var agg = MergeMonoid(winding, n_strips, start, start_x);
+
+    let local_histo = mm_histogram(tile);
+    var histo = local_histo;
+    sh_mm[local_id.x] = agg;
+    sh_histo[local_id.x] = histo;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_mm[local_id.x + (1u << i)];
+            agg = combine_merge_monoid(agg, other);
+            histo += sh_histo[local_id.x + (1u << i)];
+        }
+        workgroupBarrier();
+        sh_mm[local_id.x] = agg;
+        sh_histo[local_id.x] = histo;
+    }
+    if wg_id.x > 0u {
+        let prefix = reduced_mm[wg_id.x - 1u];
+        agg = combine_merge_monoid(prefix, agg);
+    }
+    workgroupBarrier();
+    // subtract off start of scanline winding number
+
+    var prefix_cols = 0u;
+    if agg.start_x != 0u {
+        prefix_cols = reduce_histo(sh_histo[agg.start - 1u]);
+    }
+    sh_prefix_cols[local_id.x] = prefix_cols;
+    let last_x = local_id.x == WG_SIZE - 1u || sh_mm[local_id.x + 1u].start_x != agg.start_x;
+    if last_x {
+        sh_seg_end[agg.start_x] = local_id.x;
+    }
+
+    sh_inclusive_cols[local_id.x] = reduce_histo(local_histo) + reduce_histo(histo - local_histo);
+    let total_cols = workgroupUniformLoad(&sh_inclusive_cols[WG_SIZE - 1u]);
+    if agg.start_x != local_id.x {
+        sh_seg_end[local_id.x] = sh_seg_end[agg.start_x];
+    }
+    let n_blocks = (total_cols + WG_SIZE - 1u) / WG_SIZE;
+    for (var block_ix = 0u; block_ix < n_blocks; block_ix++) {
+        let ix = block_ix * WG_SIZE + local_id.x;
+        // binary search to find work item
+        var cols = 0u; // misnamed
+        for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
+            let probe = cols + ((WG_SIZE / 2u) >> i);
+            if ix > sh_inclusive_cols[probe - 1u] {
+                cols = probe;
+            }
+        }
+        let seg_start = sh_mm[cols]
+    }
+}