Checkpoint of work

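Getting further into actual rendering: minitiles now carry packed endpoints (p0, p1) in place of the precomputed delta, and the merge shader has the skeleton of the per-column area accumulation. Allocation of strips is next.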
diff --git a/shader/merge.wgsl b/shader/merge.wgsl
index 28b8350..d20c5c5 100644
--- a/shader/merge.wgsl
+++ b/shader/merge.wgsl
@@ -8,8 +8,14 @@
     path_ix: u32,
     x: u32,
     y: u32,
-    delta: i32,
-    // TODO: slope etc
+    p0: u32, // packed
+    p1: u32, // packed
+}
+
+fn unpack_point(p: u32) -> vec2f { // Unpacks two 16-bit fields of `p` into a vec2f; each field is scaled by 1/8192 (2^-13).
+    let x = f32(p & 0xffffu) * (1.0 / 8192.0); // low 16 bits -> x
+    let y = f32(p >> 16u) * (1.0 / 8192.0); // high 16 bits -> y
+    return vec2(x, y);
 }
 
 @group(0) @binding(0)
@@ -34,10 +40,15 @@
     return c;
 }
 
-fn mm_histogram(t: Minitile) -> u32 {
-    // TODO: get these from tile
-    let xmin = 0u;
-    let xmax = 4u;
+fn mt_delta(t: Minitile) -> i32 { // Returns +1 if only p1 has zero upper 16 bits, -1 if only p0 does, otherwise 0; `main` uses the result as the tile's winding.
+    return i32((t.p1 >> 16u) == 0u) - i32((t.p0 >> 16u) == 0u); // upper 16 bits are the field `unpack_point` reads as y; presumably y == 0 marks the tile's top edge -- confirm
+}
+
+fn mt_histogram(t: Minitile) -> u32 {
+    let x0 = f32(t.p0 & 0xffffu) * (1.0 / 8192.0);
+    let x1 = f32(t.p1 & 0xffffu) * (1.0 / 8192.0);
+    let xmin = u32(floor(min(x0, x1)));
+    let xmax = u32(ceil(max(x0, x1)));
     let rshift = (4u - (xmax - xmin)) * 8u;
     let lshift = xmin * 8u;
     return (0x01010101u >> rshift) << lshift;
@@ -54,10 +65,11 @@
 var<workgroup> sh_histo: array<u32, WG_SIZE>;
 var<workgroup> sh_seg_end: array<u32, WG_SIZE>;
 var<workgroup> sh_inclusive_cols: array<u32, WG_SIZE>;
+var<workgroup> sh_area: array<atomic<i32>, WG_SIZE>;
+var<workgroup> sh_carryover: array<i32, 4>;
 
 @compute @workgroup_size(256)
 fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
     @builtin(local_invocation_id) local_id: vec3<u32>,
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
@@ -65,19 +77,20 @@
     var first = false;
     var first_x = false;
     // predicate? or pad?
-    let tile = input[global_id.x];
-    if global_id.x != 0u {
-        let prev = input[global_id.x - 1u];
+    let global_ix = wg_id.x * WG_SIZE + local_id.x;
+    let tile = input[global_ix];
+    if global_ix != 0u {
+        let prev = input[global_ix - 1u];
         first = tile.path_ix != prev.path_ix || tile.y != prev.y;
         first_x = first || tile.x != prev.x;
     }
-    let winding = tile.delta;
+    let winding = mt_delta(tile);
     let n_strips = u32(first);
     let start = select(0u, local_id.x, first);
     let start_x = select(0u, local_id.x, first_x);
     var agg = MergeMonoid(winding, n_strips, start, start_x);
 
-    let local_histo = mm_histogram(tile);
+    let local_histo = mt_histogram(tile);
     var histo = local_histo;
     sh_mm[local_id.x] = agg;
     sh_histo[local_id.x] = histo;
@@ -159,5 +172,34 @@
             }
         }
         // at this point, lo should index our tile
+        // TODO: predicate on ix < total_cols?
+        let render_tile = input[wg_id.x * WG_SIZE + lo];
+        var alphas = 0u;
+        for (var y = 0u; y < 4u; y++) {
+            if tile_within_col == 0u {
+                atomicStore(&sh_area[local_id.x], 0);
+            }
+            workgroupBarrier();
+            var area_init = 0;
+            if local_id.x == 0u && block_ix != 0u {
+                area_init = sh_carryover[y];
+            }
+            let area = area_init; // TODO: compute from tile
+            atomicAdd(&sh_area[local_id.x - tile_within_col], area);
+            workgroupBarrier();
+            if tile_within_col == 0u {
+                let summed_area = atomicLoad(&sh_area[local_id.x]);
+                if seg_end == WG_SIZE - 1u {
+                    // TODO: only if last column
+                    sh_carryover[y] = summed_area;
+                }
+                let winding_area = sh_mm[seg_end].winding * 256;
+                let alpha_u8 = u32(min(abs(summed_area + winding_area), 255));
+                alphas = (alphas >> 8u) + (alpha_u8 << 24u);
+            }
+        }
+        if tile_within_col == 0u {
+            // TODO: store alphas
+        }
     }
 }