prototype gpu merge
diff --git a/Cargo.lock b/Cargo.lock
index 82f33ab..079fa00 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3922,6 +3922,8 @@
 name = "vello_hybrid_winit"
 version = "0.0.0"
 dependencies = [
+ "bytemuck",
+ "futures-intrusive",
  "pollster",
  "vello_common",
  "vello_example_scenes",
diff --git a/sparse_strips/vello_bench/src/data.rs b/sparse_strips/vello_bench/src/data.rs
index 989eeec..b860060 100644
--- a/sparse_strips/vello_bench/src/data.rs
+++ b/sparse_strips/vello_bench/src/data.rs
@@ -147,6 +147,7 @@
     /// Get the alpha buffer and rendered strips.
     pub fn strips(&self) -> (Vec<u8>, Vec<Strip>) {
         let mut strip_buf = vec![];
+        let mut pmt_buf = vec![];
         let mut alpha_buf = vec![];
         let lines = self.lines();
         let tiles = self.sorted_tiles();
@@ -155,6 +156,7 @@
             Level::fallback(),
             &tiles,
             &mut strip_buf,
+            &mut pmt_buf,
             &mut alpha_buf,
             Fill::NonZero,
             None,
diff --git a/sparse_strips/vello_bench/src/strip.rs b/sparse_strips/vello_bench/src/strip.rs
index 8690941..f395dec 100644
--- a/sparse_strips/vello_bench/src/strip.rs
+++ b/sparse_strips/vello_bench/src/strip.rs
@@ -17,6 +17,7 @@
 
             g.bench_function(format!("{}_{}", $item.name.clone(), $suffix), |b| {
                 let mut strip_buf = vec![];
+                let mut pmt_buf = vec![];
                 let mut alpha_buf = vec![];
 
                 b.iter(|| {
@@ -27,6 +28,7 @@
                         $level,
                         &tiles,
                         &mut strip_buf,
+                        &mut pmt_buf,
                         &mut alpha_buf,
                         Fill::NonZero,
                         None,
diff --git a/sparse_strips/vello_common/src/recording.rs b/sparse_strips/vello_common/src/recording.rs
index e840e84..481364f 100644
--- a/sparse_strips/vello_common/src/recording.rs
+++ b/sparse_strips/vello_common/src/recording.rs
@@ -11,7 +11,7 @@
 #[cfg(feature = "text")]
 use crate::peniko::FontData;
 use crate::peniko::{BlendMode, Fill};
-use crate::strip::Strip;
+use crate::strip::{PreMergeTile, Strip};
 use crate::strip_generator::StripStorage;
 use alloc::vec::Vec;
 
@@ -59,6 +59,10 @@
         &self.strip_storage.strips
     }
 
+    pub fn pre_merge_tiles(&self) -> &[PreMergeTile] {
+        &self.strip_storage.pre_merge_tiles
+    }
+
     /// Get alphas as slice
     pub fn alphas(&self) -> &[u8] {
         &self.strip_storage.alphas
@@ -177,8 +181,12 @@
     }
 
     /// Get cached strips.
-    pub fn get_cached_strips(&self) -> (&[Strip], &[u8]) {
-        (self.cached_strips.strips(), self.cached_strips.alphas())
+    pub fn get_cached_strips(&self) -> (&[Strip], &[PreMergeTile], &[u8]) {
+        (
+            self.cached_strips.strips(),
+            self.cached_strips.pre_merge_tiles(),
+            self.cached_strips.alphas(),
+        )
     }
 
     /// Takes cached strip buffers.
diff --git a/sparse_strips/vello_common/src/strip.rs b/sparse_strips/vello_common/src/strip.rs
index f2e5b7e..c8f99be 100644
--- a/sparse_strips/vello_common/src/strip.rs
+++ b/sparse_strips/vello_common/src/strip.rs
@@ -2,13 +2,17 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
 //! Rendering strips.
-
 use crate::flatten::Line;
 use crate::peniko::Fill;
 use crate::tile::{Tile, Tiles};
 use crate::util::f32_to_u8;
 use alloc::vec::Vec;
+use bytemuck::{Pod, Zeroable};
 use fearless_simd::*;
+use std::format;
+use std::println;
+use std::string::String;
+use std::vec;
 
 /// A strip.
 #[derive(Debug, Clone, Copy)]
@@ -85,17 +89,115 @@
     }
 }
 
+/// A per-line tile record consumed by the GPU merge shader.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Zeroable, Pod)]
+pub struct PreMergeTile {
+    /// The index into the alpha buffer that an "end tile" should write to.
+    pub alpha_index: u32,
+    /// Contains tile location information
+    pub packed_info: u32,
+    /// The exclusive prefix sum of the signed winding number, uploading it skips scan in shader.
+    pub scanned_winding: i32,
+    /// Necessary padding for alignment
+    pub padding: u32,
+    /// Line points, adjusted by tile position
+    pub p0: [f32; 2],
+    /// Line points, adjusted by tile position
+    pub p1: [f32; 2],
+}
+
+// MSB                                                                  LSB
+// 31------------------21|20------------------12|11-------------------3|2|1|0|
+// |      Unused (11)    |  Seg Start ID (9)    |  Tile Start ID (9)   |F|T|E|
+//
+// F = FILL_RULE_MASK
+// E = IS_END_TILE_MASK
+// T = IS_TILE_FIRST_COL_MASK
+const IS_END_TILE_MASK: u32 = 1 << 0;
+const IS_TILE_FIRST_COL_MASK: u32 = 1 << 1;
+const FILL_RULE_MASK: u32 = 1 << 2;
+const TILE_START_ID_SHIFT: u32 = 3;
+const SEG_START_ID_SHIFT: u32 = 12;
+const ID_MASK: u32 = 0x1ff;
+const INVALID_ID: u32 = 256;
+const BLOCK_DIM_LG: u32 = 8;
+
+impl PreMergeTile {
+    pub fn set_is_end_tile(&mut self, is_end: bool) {
+        if is_end {
+            self.packed_info |= IS_END_TILE_MASK;
+        } else {
+            self.packed_info &= !IS_END_TILE_MASK;
+        }
+    }
+
+    pub fn is_end_tile(&self) -> bool {
+        (self.packed_info & IS_END_TILE_MASK) != 0
+    }
+
+    pub fn set_is_tile_first_col(&mut self, is_first: bool) {
+        if is_first {
+            self.packed_info |= IS_TILE_FIRST_COL_MASK;
+        }
+    }
+
+    pub fn is_tile_first_col(&self) -> bool {
+        (self.packed_info & IS_TILE_FIRST_COL_MASK) != 0
+    }
+
+    pub fn set_fill_rule(&mut self, is_fill_rule_non_zero: bool) {
+        if is_fill_rule_non_zero {
+            self.packed_info |= FILL_RULE_MASK;
+        }
+    }
+
+    pub fn set_tile_start_id(&mut self, id: u32) {
+        self.packed_info |= (id & ID_MASK) << TILE_START_ID_SHIFT;
+    }
+
+    pub fn tile_start_id(&self) -> u32 {
+        (self.packed_info >> TILE_START_ID_SHIFT) & ID_MASK
+    }
+
+    pub fn set_seg_start_id(&mut self, id: u32) {
+        self.packed_info |= (id & ID_MASK) << SEG_START_ID_SHIFT;
+    }
+
+    pub fn seg_start_id(&self) -> u32 {
+        (self.packed_info >> SEG_START_ID_SHIFT) & ID_MASK
+    }
+}
+
 /// Render the tiles stored in `tiles` into the strip and alpha buffer.
+/// The `level` and `aliasing_threshold` parameters are currently unused.
 pub fn render(
-    level: Level,
+    _level: Level,
     tiles: &Tiles,
-    strip_buf: &mut Vec<Strip>,
-    alpha_buf: &mut Vec<u8>,
+    strip_buf: &mut Vec<Strip>,      // Output parameter for strips
+    pmt_buf: &mut Vec<PreMergeTile>, // Output parameter for pre-merge tiles
+    alpha_buf: &mut Vec<u8>,         // Output parameter for alpha bytes
     fill_rule: Fill,
     aliasing_threshold: Option<u8>,
     lines: &[Line],
 ) {
-    dispatch!(level, simd => render_impl(simd, tiles, strip_buf, alpha_buf, fill_rule, aliasing_threshold, lines));
+    // let mut winding_fine_ref: Vec<[[f32; 4]; 4]> = Vec::new();
+    // let mut winding_acc_ref: Vec<[f32; 4]> = Vec::new();
+    // let mut winding_coarse_ref: Vec<i32> = Vec::new();
+    // let mut winding_fine_comp: Vec<[[f32; 4]; 4]> = Vec::new();
+    // let mut winding_acc_comp: Vec<[f32; 4]> = Vec::new();
+    // let mut winding_coarse_comp: Vec<i32> = Vec::new();
+
+    prepare_gpu_inputs(tiles, strip_buf, pmt_buf, alpha_buf, fill_rule, lines);
+
+    // compare_windings(
+    //     &winding_fine_ref,
+    //     &winding_coarse_ref,
+    //     &winding_acc_ref,
+    //     &winding_fine_comp,
+    //     &winding_coarse_comp,
+    //     &winding_acc_comp,
+    // );
 }
 
 fn render_impl<S: Simd>(
@@ -106,6 +208,9 @@
     fill_rule: Fill,
     aliasing_threshold: Option<u8>,
     lines: &[Line],
+    winding_fine_ref: &mut Vec<[[f32; 4]; 4]>,
+    winding_acc_ref: &mut Vec<[f32; 4]>,
+    winding_coarse_ref: &mut Vec<i32>,
 ) {
     if tiles.is_empty() {
         return;
@@ -155,6 +260,10 @@
         // Push out the winding as an alpha mask when we move to the next location (i.e., a tile
         // without the same location).
         if !prev_tile.same_loc(&tile) {
+            //winding_fine_ref.push(location_winding.map(|v| v.val));
+            // winding_acc_ref.push(accumulated_winding.val);
+            // winding_coarse_ref.push(winding_delta);
+
             match fill_rule {
                 Fill::NonZero => {
                     let p1 = f32x4::splat(s, 0.5);
@@ -203,7 +312,7 @@
                 );
             }
 
-            alpha_buf.extend_from_slice(&u8_vals.val);
+            //alpha_buf.extend_from_slice(&u8_vals.val);
 
             #[expect(clippy::needless_range_loop, reason = "dimension clarity")]
             for x in 0..Tile::WIDTH as usize {
@@ -213,26 +322,26 @@
 
         // Push out the strip if we're moving to a next strip.
         if !prev_tile.same_loc(&tile) && !prev_tile.prev_loc(&tile) {
-            debug_assert_eq!(
-                (prev_tile.x + 1) * Tile::WIDTH - strip.x,
-                ((alpha_buf.len() - strip.alpha_idx() as usize) / usize::from(Tile::HEIGHT)) as u16,
-                "The number of columns written to the alpha buffer should equal the number of columns spanned by this strip."
-            );
-            strip_buf.push(strip);
+            // debug_assert_eq!(
+            //     (prev_tile.x + 1) * Tile::WIDTH - strip.x,
+            //     ((alpha_buf.len() - strip.alpha_idx() as usize) / usize::from(Tile::HEIGHT)) as u16,
+            //     "The number of columns written to the alpha buffer should equal the number of columns spanned by this strip."
+            // );
+            //strip_buf.push(strip);
 
             let is_sentinel = tile_idx == tiles.len() as usize;
             if !prev_tile.same_row(&tile) {
                 // Emit a final strip in the row if there is non-zero winding for the sparse fill,
                 // or unconditionally if we've reached the sentinel tile to end the path (the
                 // `alpha_idx` field is used for width calculations).
-                if winding_delta != 0 || is_sentinel {
-                    strip_buf.push(Strip::new(
-                        u16::MAX,
-                        prev_tile.y * Tile::HEIGHT,
-                        alpha_buf.len() as u32,
-                        should_fill(winding_delta),
-                    ));
-                }
+                // if winding_delta != 0 || is_sentinel {
+                //     strip_buf.push(Strip::new(
+                //         u16::MAX,
+                //         prev_tile.y * Tile::HEIGHT,
+                //         alpha_buf.len() as u32,
+                //         should_fill(winding_delta),
+                //     ));
+                // }
 
                 winding_delta = 0;
                 accumulated_winding = f32x4::splat(s, 0.0);
@@ -405,10 +514,739 @@
             // The trapezoidal area enclosed between the line and the right edge of the pixel
             // square.
             let area = 0.5 * h * (2. * px_right_x - line_px_right_yx - line_px_left_yx);
+            let local_contribution = acc.madd(sign, area);
+            if x_idx == 0 {
+                //println!("{}: {:<10.4} ", tile_idx, local_contribution.val[0]);
+            }
+            location_winding[x_idx as usize] = location_winding[x_idx as usize] + local_contribution;
+            acc = acc.madd(sign, h);
             location_winding[x_idx as usize] += area.madd(sign, acc);
             acc = h.madd(sign, acc);
         }
 
-        accumulated_winding += acc;
+        accumulated_winding = accumulated_winding + acc;
+    }
+}
+
+/// Prepares GPU inputs: emits strips and `PreMergeTile` records for the merge shader.
+fn prepare_gpu_inputs(
+    tiles: &Tiles,
+    strip_buf: &mut Vec<Strip>,
+    pre_merge_buf: &mut Vec<PreMergeTile>,
+    alpha_buf: &mut Vec<u8>,
+    fill_rule: Fill,
+    lines: &[Line],
+) {
+    if tiles.is_empty() {
+        return;
+    }
+
+    let should_fill = |winding: i32| match fill_rule {
+        Fill::NonZero => winding != 0,
+        Fill::EvenOdd => winding % 2 != 0,
+    };
+
+    let mut winding_delta: i32 = 0;
+    let mut prev_tile = *tiles.get(0);
+    let mut alpha_offset: u32 = 0;
+    let initial_alpha_len = alpha_buf.len() as u32;
+
+    const SENTINEL: Tile = Tile::new(u16::MAX, u16::MAX, 0, false);
+
+    let mut start_tile_idx = 0;
+    let mut start_seg_idx = 0;
+    let mut strip = Strip::new(
+        prev_tile.x * Tile::WIDTH,
+        prev_tile.y * Tile::HEIGHT,
+        initial_alpha_len,
+        false,
+    );
+    for (tile_idx, tile) in tiles.iter().copied().chain([SENTINEL]).enumerate() {
+        let is_start_tile = tile_idx == 0 || !prev_tile.same_loc(&tile);
+        let is_start_segment = tile_idx == 0 || (is_start_tile && !prev_tile.prev_loc(&tile));
+        if is_start_tile {
+            start_tile_idx = pre_merge_buf.len();
+            if tile_idx > 0 {
+                alpha_offset += (Tile::WIDTH * Tile::HEIGHT) as u32;
+            }
+        }
+
+        if is_start_segment {
+            start_seg_idx = pre_merge_buf.len();
+            strip_buf.push(strip);
+
+            let is_sentinel = tile_idx == tiles.len() as usize;
+            if !prev_tile.same_row(&tile) {
+                if winding_delta != 0 || is_sentinel {
+                    strip_buf.push(Strip::new(
+                        u16::MAX,
+                        prev_tile.y * Tile::HEIGHT,
+                        initial_alpha_len + alpha_offset,
+                        should_fill(winding_delta),
+                    ));
+                }
+                winding_delta = 0;
+            }
+
+            if is_sentinel {
+                break;
+            }
+
+            strip = Strip::new(
+                tile.x * Tile::WIDTH,
+                tile.y * Tile::HEIGHT,
+                initial_alpha_len + alpha_offset,
+                should_fill(winding_delta),
+            );
+        }
+
+        let line = lines[tile.line_idx() as usize];
+        let tile_left_x = f32::from(tile.x) * f32::from(Tile::WIDTH);
+        let tile_top_y = f32::from(tile.y) * f32::from(Tile::HEIGHT);
+        let p0_x = line.p0.x - tile_left_x;
+        let p0_y = line.p0.y - tile_top_y;
+        let p1_x = line.p1.x - tile_left_x;
+        let p1_y = line.p1.y - tile_top_y;
+
+        let mut pmt = PreMergeTile {
+            alpha_index: initial_alpha_len + alpha_offset,
+            packed_info: 0,
+            scanned_winding: winding_delta,
+            padding: 0,
+            p0: [p0_x, p0_y],
+            p1: [p1_x, p1_y],
+        };
+
+        let sign = (p0_y - p1_y).signum();
+        let signed_winding = sign as i32 * tile.winding() as i32;
+        winding_delta += signed_winding;
+
+        pmt.set_is_end_tile(
+            tile_idx == tiles.len() as usize - 1 ||
+            !tiles.get(tile_idx as u32 + 1).same_loc(&tile),
+        );
+        pmt.set_is_tile_first_col(tile.x == 0);
+
+        let block_idx = pre_merge_buf.len() >> BLOCK_DIM_LG << BLOCK_DIM_LG;
+        let start_tile_pack = if start_tile_idx < block_idx {
+            INVALID_ID
+        } else {
+            (start_tile_idx - block_idx) as u32
+        };
+        pmt.set_tile_start_id(start_tile_pack);
+
+        let start_seg_pack = if start_seg_idx < block_idx {
+            INVALID_ID
+        } else {
+            (start_seg_idx - block_idx) as u32
+        };
+        pmt.set_seg_start_id(start_seg_pack);
+        pmt.set_fill_rule(fill_rule == Fill::NonZero);
+
+        pre_merge_buf.push(pmt);
+        prev_tile = tile;
+    }
+
+    // TODO: replace the alpha buffer with a single accumulated u32.
+    alpha_buf.resize(alpha_buf.len() + alpha_offset as usize, 0);
+}
+
+
+#[derive(Debug, Clone, Copy, Default)]
+pub struct StitchIndicator {
+    /// True if the tile's location dependency must be stitched.
+    pub loc_stitch_required: bool,
+    /// True if the tile's accumulation dependency must be stitched.
+    pub acc_stitch_required: bool,
+    /// The ID of the partition (workgroup) that processed this tile.
+    pub partition_id: usize,
+}
+
+#[derive(Debug, Clone, Copy, Default)]
+pub struct PartitionIndicator {
+    /// Does this partition contain a `seg_start`?
+    pub has_seg_start: bool,
+    /// Does this partition contain a `tile_start`?
+    pub has_tile_start: bool,
+}
+
+/// Simulate the merge_shader using the PreMergeTile intermediate representation.
+/// Note: THIS WILL NOT WORK with the current prepare_gpu_inputs because it is expecting local
+/// indexes relative to the call of strip::render, rather than the current global indexes provided
+/// by using the pre_merge_buffer length.
+fn cpu_merge(
+    pmt_buf: &mut Vec<PreMergeTile>,
+    alpha_buf: &mut Vec<u8>,
+    fill_rule: Fill,
+    winding_fine_comp: &mut Vec<[[f32; 4]; 4]>,
+    winding_acc_comp: &mut Vec<[f32; 4]>,
+    winding_coarse_comp: &mut Vec<i32>,
+) {
+    if pmt_buf.is_empty() {
+        return;
+    }
+
+    let pmt_count = pmt_buf.len();
+    let mut temp_acc = vec![[0f32; Tile::HEIGHT as usize]; pmt_count];
+    let mut temp_fine = vec![[[0f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize]; pmt_count];
+
+    for gid in 0..pmt_buf.len() {
+        let pmt = pmt_buf[gid];
+        let p0_x = pmt.p0[0];
+        let p0_y = pmt.p0[1];
+        let p1_x = pmt.p1[0];
+        let p1_y = pmt.p1[1];
+
+        if (p0_y - p1_y).abs() < 1e-6 {
+            continue;
+        }
+
+        let (line_top_y, line_top_x, line_bottom_y, line_bottom_x) = if p0_y < p1_y {
+            (p0_y, p0_x, p1_y, p1_x)
+        } else {
+            (p1_y, p1_x, p0_y, p0_x)
+        };
+        let (line_left_x, line_left_y, line_right_x) = if p0_x < p1_x {
+            (p0_x, p0_y, p1_x)
+        } else {
+            (p1_x, p1_y, p0_x)
+        };
+
+
+        let dx = line_bottom_x - line_top_x; // horizontal extent of the line within this tile
+        let is_vertical = dx.abs() < 1e-6;  // epsilon comparison to tolerate float noise
+        let dy = line_bottom_y - line_top_y; // near-horizontal lines were skipped by the early `continue` above
+        let y_slope = if dx == 0.0 { f32::MAX } else { dy / dx };
+        let x_slope = if dy == 0.0 { f32::MAX } else { dx / dy };
+        let sign = (p0_y - p1_y).signum();
+
+        if pmt.is_tile_first_col() && line_left_x < 0. {
+            let (ymin, ymax) = if is_vertical {
+                (line_top_y, line_bottom_y)
+            } else {
+                let line_viewport_left_y = (line_top_y - line_top_x * y_slope)
+                    .max(line_top_y)
+                    .min(line_bottom_y);
+                (
+                    f32::min(line_left_y, line_viewport_left_y),
+                    f32::max(line_left_y, line_viewport_left_y),
+                )
+            };
+
+            let px_top_y_arr = [0.0, 1.0, 2.0, 3.0];
+            let mut h = [0.0; Tile::HEIGHT as usize];
+            for y in 0..Tile::HEIGHT as usize {
+                let px_top_y: f32 = px_top_y_arr[y];
+                let px_bottom_y: f32 = 1.0 + px_top_y;
+                let ymin_clamped = px_top_y.max(ymin);
+                let ymax_clamped = px_bottom_y.min(ymax);
+                h[y] = (ymax_clamped - ymin_clamped).max(0.0);
+                temp_acc[gid][y] = sign * h[y];
+            }
+
+            if line_right_x < 0. {
+                for x_idx in 0..Tile::WIDTH as usize {
+                    for y in 0..Tile::HEIGHT as usize {
+                        temp_fine[gid][x_idx][y] = temp_acc[gid][y];
+                    }
+                }
+                continue;
+            }
+        }
+
+        if is_vertical {
+            let line_x = line_top_x;
+            let event_x_idx = line_x.floor() as usize;
+            for x_idx in 0..Tile::WIDTH as usize {
+                for y in 0..Tile::HEIGHT as usize {
+                    let mut h = 0.0;
+                    let mut area = 0.0;
+                    if x_idx == event_x_idx {
+                        let px_top_y = y as f32;
+                        let px_bottom_y = 1.0 + px_top_y;
+
+                        let ymin = line_top_y.max(px_top_y);
+                        let ymax = line_bottom_y.min(px_bottom_y);
+                        let coverage_right = (x_idx as f32 + 1.0) - line_x;
+                        h = (ymax - ymin).max(0.0);
+                        area = h * coverage_right;
+                    }
+                    temp_fine[gid][x_idx][y] = temp_acc[gid][y] + sign * area;
+                    temp_acc[gid][y] += sign * h;
+                }
+            }
+        } else {
+            for x_idx in 0..Tile::WIDTH {
+                let x_idx_f = x_idx as f32;
+                let px_left_x = x_idx_f;
+                let px_right_x = 1.0 + x_idx_f;
+
+                for y in 0..Tile::HEIGHT as usize {
+                    let px_top_y = y as f32;
+                    let px_bottom_y = 1.0 + px_top_y;
+                    let ymin = line_top_y.max(px_top_y);
+                    let ymax = line_bottom_y.min(px_bottom_y);
+
+                    let line_px_left_y = (line_top_y + (px_left_x - line_top_x) * y_slope)
+                        .max(ymin)
+                        .min(ymax);
+                    let line_px_right_y = (line_top_y + (px_right_x - line_top_x) * y_slope)
+                        .max(ymin)
+                        .min(ymax);
+
+                    let line_px_left_yx = line_top_x + (line_px_left_y - line_top_y) * x_slope;
+                    let line_px_right_yx = line_top_x + (line_px_right_y - line_top_y) * x_slope;
+                    let h = (line_px_right_y - line_px_left_y).abs();
+                    let area = 0.5 * h * (2. * px_right_x - line_px_right_yx - line_px_left_yx);
+
+                    temp_fine[gid][x_idx as usize][y] = temp_acc[gid][y] + sign * area;
+                    temp_acc[gid][y] += sign * h;
+                }
+            }
+        }
+    }
+
+    const BLOCK_DIM: usize = 256;
+    let GRID_DIM: usize = (pmt_buf.len() + BLOCK_DIM - 1) / BLOCK_DIM;
+    let mut stitch_indicator: Vec<StitchIndicator> = Vec::new();
+    let mut part_indicator: Vec<PartitionIndicator> = Vec::new();
+    let mut stitch_loc: Vec<[[f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize]> = Vec::new();
+    let mut part_loc: Vec<[[f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize]> = Vec::new();
+    let mut part_acc_seg: Vec<[f32; Tile::HEIGHT as usize]> = Vec::new();
+    let mut part_acc_mine: Vec<[f32; Tile::HEIGHT as usize]> = Vec::new();
+
+    for wgid in 0..GRID_DIM {
+        let mut wg_loc = [[[0f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize]; BLOCK_DIM];
+        let mut wg_acc = [[0f32; Tile::HEIGHT as usize]; BLOCK_DIM];
+
+        // dirty, blend the scan with the stitch preparation
+        for tid in 0..BLOCK_DIM {
+            let gid = tid + wgid * BLOCK_DIM;
+            if gid < pmt_count {
+                wg_acc[tid] = temp_acc[gid];
+                wg_loc[tid] = temp_fine[gid];
+            }
+
+            if tid > 0 {
+                for y in 0..Tile::HEIGHT as usize {
+                    wg_acc[tid][y] += wg_acc[tid - 1][y];
+                }
+                for x in 0..Tile::WIDTH as usize {
+                    for y in 0..Tile::HEIGHT as usize {
+                        wg_loc[tid][x][y] += wg_loc[tid - 1][x][y];
+                    }
+                }
+            }
+            // barrier
+
+            // now duplicate previous logic. . .
+            if gid < pmt_count {
+                let pmt = pmt_buf[gid];
+                if pmt.is_end_tile() || tid == BLOCK_DIM - 1 {
+                    let mut my_acc = wg_acc[tid];
+                    let mut my_loc = wg_loc[tid];
+
+                    let tile_start_id = pmt.tile_start_id() as usize;
+                    let seg_start_id = pmt.seg_start_id() as usize;
+                    let tile_start_valid = tile_start_id != INVALID_ID as usize;
+                    let seg_start_valid = seg_start_id != INVALID_ID as usize;
+
+                    if tile_start_valid {
+                        if tile_start_id != 0 {
+                            for x in 0..Tile::WIDTH as usize {
+                                for y in 0..Tile::HEIGHT as usize {
+                                    my_loc[x][y] -= wg_loc[tile_start_id - 1][x][y];
+                                }
+                            }
+
+                            my_acc = wg_acc[tile_start_id - 1];
+                        } else {
+                            my_acc = [0f32; 4];
+                        }
+                    }
+
+                    if seg_start_valid {
+                        let scanned_winding = pmt_buf[wgid * BLOCK_DIM + seg_start_id].scanned_winding as f32;
+                        if seg_start_id != 0 {
+                            for y in 0..Tile::HEIGHT as usize {
+                                my_acc[y] += scanned_winding - wg_acc[seg_start_id - 1][y];
+                            }
+                        } else {
+                            for y in 0..Tile::HEIGHT as usize {
+                                my_acc[y] += scanned_winding;
+                            }
+                        }
+                    }
+
+                    if tile_start_valid {
+                        for x in 0..Tile::WIDTH as usize {
+                            for y in 0..Tile::HEIGHT as usize {
+                                my_loc[x][y] += my_acc[y];
+                            }
+                        }
+                    }
+
+                    let mut stitch = StitchIndicator {
+                        loc_stitch_required: false,
+                        acc_stitch_required: false,
+                        partition_id: 0,
+                    };
+
+                    // only end tiles participate in the write out
+                    if pmt.is_end_tile() {
+                        if tile_start_valid {
+                            if seg_start_valid {
+                                temp_fine[gid] = my_loc;
+                            } else {
+                                stitch.acc_stitch_required = true;
+                                stitch.partition_id = wgid;
+                            }
+                        } else {
+                            stitch.loc_stitch_required = true;
+                            stitch.partition_id = wgid;
+                        }
+
+                    }
+                    stitch_loc.push(my_loc);
+                    stitch_indicator.push(stitch);
+
+                    if tid == BLOCK_DIM - 1 {
+                        // Full OR tile - seg_start
+                        part_acc_mine.push(my_acc);
+
+                        // Push Full
+                        let mut seg_acc = [0f32; Tile::HEIGHT as usize];
+                        if seg_start_valid {
+                            let scanned_winding = pmt_buf[wgid * BLOCK_DIM + seg_start_id].scanned_winding as f32;
+                            if seg_start_id != 0 {
+                                for y in 0..Tile::HEIGHT as usize {
+                                    seg_acc[y] = scanned_winding + wg_acc[tid][y] -
+                                        wg_acc[seg_start_id - 1][y];
+                                }
+                            } else {
+                                for y in 0..Tile::HEIGHT as usize {
+                                    seg_acc[y] = scanned_winding + wg_acc[tid][y];
+                                }
+                            }
+                            part_acc_seg.push(seg_acc);
+                        } else {
+                            part_acc_seg.push(wg_acc[tid]);
+                        }
+
+                        if tile_start_valid && tile_start_id != 0 {
+                            let mut p_loc = [[0f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize];
+                            for x in 0..Tile::WIDTH as usize {
+                                for y in 0..Tile::HEIGHT as usize {
+                                    p_loc[x][y] += wg_loc[tid][x][y] - wg_loc[pmt.tile_start_id() as usize - 1][x][y];
+                                }
+                            }
+                            part_loc.push(p_loc);
+                        } else {
+                            part_loc.push(wg_loc[tid]);
+                        }
+                        let p_ind = PartitionIndicator {
+                            has_seg_start: seg_start_valid,
+                            has_tile_start: tile_start_valid,
+                        };
+                        part_indicator.push(p_ind);
+                    }
+                } else {
+                    let stitch = StitchIndicator {
+                        loc_stitch_required: false,
+                        acc_stitch_required: false,
+                        partition_id: 0,
+                    };
+                    stitch_loc.push([[0f32; 4]; 4]);
+                    stitch_indicator.push(stitch);
+                }
+            }
+        }
+    }
+
+    // Stitching runs over the whole buffer rather than per grid block.
+    for gid in 0..pmt_count {
+        let s_ind = stitch_indicator[gid];
+        if s_ind.loc_stitch_required && !s_ind.acc_stitch_required {
+            let mut loc = stitch_loc[gid];
+            let mut lookback_id = s_ind.partition_id - 1;
+            let mut part_ind:PartitionIndicator;
+            loop {
+                // Sum the prev
+                for x in 0..Tile::WIDTH as usize {
+                    for y in 0..Tile::HEIGHT as usize {
+                        loc[x][y] += part_loc[lookback_id][x][y];
+                    }
+                }
+
+                // Did we hit a tile_start?
+                part_ind = part_indicator[lookback_id];
+                if part_ind.has_tile_start {
+                    break;
+                } else {
+                    lookback_id -= 1;
+                }
+            }
+
+            //Once we hit a tile start. . . Then we have to stitch the acc
+            //Since this partition is guaranteed to have the tile start
+            let mut acc = part_acc_mine[lookback_id];
+
+            // If this partition also had a seg start, we're done, otherwise we need to continue
+            // the traversal
+            if !part_ind.has_seg_start {
+                lookback_id -= 1;
+                loop {
+                    // Sum the prev
+                    for y in 0..Tile::HEIGHT as usize {
+                        acc[y] += part_acc_seg[lookback_id][y];
+                    }
+
+                    // Did we hit a seg_start?
+                    part_ind = part_indicator[lookback_id];
+                    if part_ind.has_seg_start {
+                        break;
+                    } else {
+                        lookback_id -= 1;
+                    }
+                }
+            }
+
+            //Push out
+            temp_acc[gid] = acc;
+            for x in 0..Tile::WIDTH as usize {
+                for y in 0..Tile::HEIGHT as usize {
+                    temp_fine[gid][x][y] = loc[x][y] + acc[y];
+                }
+            }
+        }
+
+        if s_ind.acc_stitch_required && !s_ind.loc_stitch_required {
+            let mut acc = [0f32; Tile::HEIGHT as usize];
+            let mut lookback_id = s_ind.partition_id - 1;
+            loop {
+                let s_acc = part_acc_seg[lookback_id];
+                for y in 0..Tile::HEIGHT as usize {
+                    acc[y] += s_acc[y];
+                }
+                if part_indicator[lookback_id].has_seg_start {
+                    break;
+                } else {
+                    lookback_id -= 1;
+                }
+            }
+
+            // Push out
+            let loc = stitch_loc[gid];
+            for x in 0..Tile::WIDTH as usize {
+                for y in 0..Tile::HEIGHT as usize {
+                    temp_fine[gid][x][y] = loc[x][y] + acc[y];
+                }
+            }
+        }
+    }
+
+    for gid in 0..(pmt_count - 1) {
+        let pmt = pmt_buf[gid];
+        if pmt.is_end_tile() {
+            winding_fine_comp.push(temp_fine[gid]);
+            winding_acc_comp.push(temp_acc[gid + 1]);
+            winding_coarse_comp.push(pmt_buf[gid + 1].scanned_winding);
+
+            match fill_rule {
+                Fill::NonZero => {
+                    for x in 0..Tile::WIDTH as usize {
+                        for y in 0..Tile::HEIGHT as usize {
+                            let area = temp_fine[gid][x][y];
+                            let coverage = area.abs();
+                            let mulled = 0.5 + coverage * 255.0;
+                            temp_fine[gid][x][y] = mulled.min(255.0);
+                        }
+                    }
+                }
+                Fill::EvenOdd => {
+                    for x in 0..Tile::WIDTH as usize {
+                        for y in 0..Tile::HEIGHT as usize {
+                            let area = temp_fine[gid][x][y];
+                            let im1 = (0.5 + area * 0.5).floor();
+                            let coverage = (im1 + area * -2.0).abs();
+                            let mulled = 0.5 + coverage * 255.0;
+                            temp_fine[gid][x][y] = mulled.min(255.0);
+                        }
+                    }
+                }
+            };
+
+            let mut u8_vals = [0u8; 16];
+            let mut i = 0;
+            for x in 0..Tile::WIDTH as usize {
+                for y in 0..Tile::HEIGHT as usize {
+                    u8_vals[i] = temp_fine[gid][x][y].round() as u8;
+                    i += 1;
+                }
+            }
+
+            // This buffer will be indexed into by later rendering stages.
+            alpha_buf.extend_from_slice(&u8_vals);
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+pub fn compare_windings(
+    winding_fine_ref: &[[[f32; 4]; 4]],
+    winding_coarse_ref: &[i32],
+    winding_acc_ref: &[[f32; 4]],
+    winding_fine_comp: &[[[f32; 4]; 4]],
+    winding_coarse_comp: &[i32],
+    winding_acc_comp: &[[f32; 4]],
+) -> bool {
+    let coarse_match = compare_coarse_windings(winding_coarse_ref, winding_coarse_comp);
+    if !coarse_match {
+        println!("\nComparison halted due to coarse winding mismatch.");
+        return false;
+    }
+    let fine_match = compare_fine_windings(winding_fine_ref, winding_fine_comp);
+    if !fine_match {
+        println!("\nComparison failed at fine winding stage.");
+        return false;
+    }
+    let acc_match = compare_acc_windings(winding_acc_ref, winding_acc_comp);
+
+    acc_match && coarse_match && fine_match
+}
+
+fn compare_coarse_windings(reference: &[i32], comp: &[i32]) -> bool {
+    if reference.len() != comp.len() {
+        println!(
+            "❌ FATAL ERROR: Coarse winding vectors have different lengths! Reference: {}, Comp: {}",
+            reference.len(),
+            comp.len()
+        );
+        return false;
+    }
+    let mut mismatches = 0;
+    for (i, (ref_val, comp_val)) in reference.iter().zip(comp.iter()).enumerate() {
+        if ref_val != comp_val {
+            if mismatches == 0 {
+                println!("\n--- Coarse Winding Mismatches Found ---");
+            }
+            println!(
+                "Mismatch at tile index {}: Reference = {}, Comp = {}",
+                i, ref_val, comp_val
+            );
+            mismatches += 1;
+        }
+    }
+    if mismatches == 0 {
+        true
+    } else {
+        println!(
+            "❌ Coarse winding comparison FAILED. Found {} mismatches out of {} tiles.",
+            mismatches,
+            reference.len()
+        );
+        false
+    }
+}
+
+fn compare_fine_windings(reference: &[[[f32; 4]; 4]], comp: &[[[f32; 4]; 4]]) -> bool {
+    if reference.len() != comp.len() {
+        println!(
+            "❌ FATAL ERROR: Fine winding vectors have different lengths! Reference: {}, Comp: {}",
+            reference.len(),
+            comp.len()
+        );
+        return false;
+    }
+
+    let mut total_mismatches = 0;
+    let mut first_mismatch_tile_index = None;
+
+    for i in 0..reference.len() {
+        let ref_tile = reference[i];
+        let comp_tile = comp[i];
+        let mut tile_has_mismatch = false;
+        for x in 0..4 {
+            for y in 0..4 {
+                let ref_val = ref_tile[x][y];
+                let comp_val = comp_tile[x][y];
+                if (ref_val - comp_val).abs() > 0.01 {
+                    total_mismatches += 1;
+                    tile_has_mismatch = true;
+                }
+            }
+        }
+        if tile_has_mismatch && first_mismatch_tile_index.is_none() {
+            first_mismatch_tile_index = Some(i);
+        }
+    }
+
+    if total_mismatches == 0 {
+        true
+    } else {
+        println!(
+            "❌ Fine winding comparison FAILED. Found {} total mismatches.",
+            total_mismatches
+        );
+        if let Some(i) = first_mismatch_tile_index {
+            println!(
+                "\n--- Detailed Mismatch Report for First Failing Tile (Index {}) ---",
+                i
+            );
+            println!("Column-major format: pixel (x, y)");
+            println!("{:<45} {:<45}", "Reference (cpu_merge_ref)", "Comp");
+            println!("{:-<45} {:-<45}", "-", "-");
+
+            let ref_tile = reference[i];
+            let comp_tile = comp[i];
+
+            for y in 0..4 {
+                let mut ref_row_str = String::new();
+                let mut comp_row_str = String::new();
+                for x in 0..4 {
+                    ref_row_str.push_str(&format!("({x},{y})={:<+8.4} ", ref_tile[x][y]));
+                    comp_row_str.push_str(&format!("({x},{y})={:<+8.4} ", comp_tile[x][y]));
+                }
+                println!("{}   {}", ref_row_str.trim_end(), comp_row_str.trim_end());
+            }
+        }
+        false
+    }
+}
+
+fn compare_acc_windings(reference: &[[f32; 4]], comp: &[[f32; 4]]) -> bool {
+    if reference.len() != comp.len() {
+        println!(
+            "❌ FATAL ERROR: Accumulator winding vectors have different lengths! Reference: {}, Comp: {}",
+            reference.len(),
+            comp.len()
+        );
+        return false;
+    }
+    let mut mismatches = 0;
+    for i in 0..reference.len() {
+        let ref_tile_acc = reference[i];
+        let comp_tile_acc = comp[i];
+        for y in 0..4 {
+            if (ref_tile_acc[y] - comp_tile_acc[y]).abs() > 0.01 {
+                if mismatches == 0 {
+                    println!("\n--- Accumulator Winding Mismatches Found ---");
+                }
+                println!(
+                    "Mismatch at tile index {} row {}: Reference = {:.4}, Comp = {:.4}",
+                    i, y, ref_tile_acc[y], comp_tile_acc[y]
+                );
+                mismatches += 1;
+            }
+        }
+    }
+    if mismatches == 0 {
+        true
+    } else {
+        println!(
+            "❌ Accumulator winding comparison FAILED. Found {} mismatches.",
+            mismatches,
+        );
+        false
     }
 }
diff --git a/sparse_strips/vello_common/src/strip_generator.rs b/sparse_strips/vello_common/src/strip_generator.rs
index e7bae82..2198c98 100644
--- a/sparse_strips/vello_common/src/strip_generator.rs
+++ b/sparse_strips/vello_common/src/strip_generator.rs
@@ -7,7 +7,7 @@
 use crate::flatten::{FlattenCtx, Line};
 use crate::kurbo::{Affine, PathEl, Stroke};
 use crate::peniko::Fill;
-use crate::strip::Strip;
+use crate::strip::{PreMergeTile, Strip};
 use crate::tile::Tiles;
 use crate::{flatten, strip};
 use alloc::vec::Vec;
@@ -20,6 +20,9 @@
     pub strips: Vec<Strip>,
     /// The alphas in the storage.
     pub alphas: Vec<u8>,
+    /// The pre-merge tiles in the storage.
+    pub pre_merge_tiles: Vec<PreMergeTile>,
+
     generation_mode: GenerationMode,
 }
 
@@ -141,6 +144,7 @@
             self.level,
             &self.tiles,
             &mut strip_storage.strips,
+            &mut strip_storage.pre_merge_tiles,
             &mut strip_storage.alphas,
             fill_rule,
             aliasing_threshold,
diff --git a/sparse_strips/vello_cpu/src/render.rs b/sparse_strips/vello_cpu/src/render.rs
index ebcb5ef..e5e497b 100644
--- a/sparse_strips/vello_cpu/src/render.rs
+++ b/sparse_strips/vello_cpu/src/render.rs
@@ -27,7 +27,7 @@
 use vello_common::peniko::{BlendMode, Compose, Fill, Mix};
 use vello_common::pixmap::Pixmap;
 use vello_common::recording::{PushLayerCommand, Recordable, Recorder, Recording, RenderCommand};
-use vello_common::strip::Strip;
+use vello_common::strip::{PreMergeTile, Strip};
 use vello_common::strip_generator::{GenerationMode, StripGenerator, StripStorage};
 #[cfg(feature = "text")]
 use vello_common::{
@@ -691,8 +691,8 @@
     }
 
     fn execute_recording(&mut self, recording: &Recording) {
-        let (cached_strips, cached_alphas) = recording.get_cached_strips();
-        let adjusted_strips = self.prepare_cached_strips(cached_strips, cached_alphas);
+        let (cached_strips, cached_pmt, cached_alphas) = recording.get_cached_strips();
+        let adjusted_strips = self.prepare_cached_strips(cached_strips, cached_pmt, cached_alphas);
 
         // Use pre-calculated strip start indices from when we generated the cache.
         let strip_start_indices = recording.get_strip_start_indices();
@@ -910,18 +910,23 @@
     fn prepare_cached_strips(
         &mut self,
         cached_strips: &[Strip],
+        cached_pmt: &[PreMergeTile],
         cached_alphas: &[u8],
     ) -> Vec<Strip> {
         // Calculate offset for alpha indices based on current dispatcher's alpha buffer size.
         let alpha_offset = {
             let storage = self.dispatcher.strip_storage_mut();
             let offset = storage.alphas.len() as u32;
-            // Extend the dispatcher's alpha buffer with cached alphas.
-            storage.alphas.extend(cached_alphas);
+            storage.alphas.extend_from_slice(cached_alphas);
+            storage.pre_merge_tiles.extend(cached_pmt.iter().map(|pmt| {
+                let mut adjusted_pmt = *pmt;
+                adjusted_pmt.alpha_index += offset;
+                adjusted_pmt
+            }));
 
             offset
         };
-        // Create adjusted strips with corrected alpha indices.
+
         cached_strips
             .iter()
             .map(move |strip| {
diff --git a/sparse_strips/vello_hybrid/examples/winit/Cargo.toml b/sparse_strips/vello_hybrid/examples/winit/Cargo.toml
index 23e8426..20a0dcd 100644
--- a/sparse_strips/vello_hybrid/examples/winit/Cargo.toml
+++ b/sparse_strips/vello_hybrid/examples/winit/Cargo.toml
@@ -16,3 +16,5 @@
 vello_hybrid = { workspace = true }
 vello_example_scenes = { workspace = true }
 pollster = { workspace = true }
+futures-intrusive = "0.5"
+bytemuck = { version = "1.15", features = ["derive"] }
\ No newline at end of file
diff --git a/sparse_strips/vello_hybrid/examples/winit/src/main.rs b/sparse_strips/vello_hybrid/examples/winit/src/main.rs
index 5d429ff..8843a8c 100644
--- a/sparse_strips/vello_hybrid/examples/winit/src/main.rs
+++ b/sparse_strips/vello_hybrid/examples/winit/src/main.rs
@@ -22,6 +22,8 @@
     window::{Window, WindowId},
 };
 
+use futures_intrusive::channel::shared::oneshot_channel;
+
 const ZOOM_STEP: f64 = 0.1;
 
 struct App<'s> {
@@ -306,9 +308,58 @@
                     )
                     .unwrap();
 
+                let renderer = self.renderers[surface.dev_id].as_mut().unwrap();
+                let source_buffer = renderer.get_stitch_indicator_buffer();
+                let buffer_size = source_buffer.size();
+
+                let staging_buffer = device_handle.device.create_buffer(&wgpu::BufferDescriptor {
+                    label: Some("Stitch Indicator Readback Buffer"),
+                    size: buffer_size,
+                    usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
+                    mapped_at_creation: false,
+                });
+
+                encoder.copy_buffer_to_buffer(
+                    source_buffer,
+                    0, // source offset
+                    &staging_buffer,
+                    0, // destination offset
+                    buffer_size,
+                );
+
                 device_handle.queue.submit([encoder.finish()]);
                 surface_texture.present();
 
+                let buffer_slice = staging_buffer.slice(..);
+                let (sender, receiver) = oneshot_channel();
+                buffer_slice.map_async(wgpu::MapMode::Read, move |result| {
+                    sender.send(result).unwrap();
+                });
+
+                let _ = device_handle.device.poll(wgpu::PollType::Wait);
+
+                // if let Some(Ok(())) = pollster::block_on(receiver.receive()) {
+                //     let data = buffer_slice.get_mapped_range();
+                //     let u32_data: &[u32] = bytemuck::cast_slice(&data);
+                //     let num_to_print = u32_data.len().min(256);
+                //     println!("--- Readback Buffer (First {} u32s) ---", num_to_print);
+                //     for (i, val) in u32_data.iter().take(num_to_print).enumerate() {
+                //         println!("{}: {:<10} ", i, val)
+                //     }
+                //     println!("\n--- End of Readback ---");
+                // }
+
+                // if let Some(Ok(())) = pollster::block_on(receiver.receive()) {
+                //     let data = buffer_slice.get_mapped_range();
+                //     let f32_data: &[f32] = bytemuck::cast_slice(&data);
+                //     let num_to_print = f32_data.len().min(128);
+                //     println!("--- Readback Buffer (First {} f32s) ---", num_to_print);
+                //     for (i, val) in f32_data.iter().take(num_to_print).enumerate() {
+                //         println!("{}: {:<10.4} ", i, val);
+                //     }
+                //     println!("\n--- End of Readback ---");
+                // }
+
                 device_handle.device.poll(wgpu::PollType::Poll).unwrap();
             }
             _ => {}
diff --git a/sparse_strips/vello_hybrid/src/lib.rs b/sparse_strips/vello_hybrid/src/lib.rs
index 1865cc4..87c1e27 100644
--- a/sparse_strips/vello_hybrid/src/lib.rs
+++ b/sparse_strips/vello_hybrid/src/lib.rs
@@ -29,7 +29,7 @@
 //!
 //! See the individual module documentation for more details on usage and implementation.
 
-#![no_std]
+//#![no_std]
 
 extern crate alloc;
 
diff --git a/sparse_strips/vello_hybrid/src/render/common.rs b/sparse_strips/vello_hybrid/src/render/common.rs
index 253445d..bbe9aa0 100644
--- a/sparse_strips/vello_hybrid/src/render/common.rs
+++ b/sparse_strips/vello_hybrid/src/render/common.rs
@@ -44,6 +44,20 @@
     pub alphas_tex_width_bits: u32,
 }
 
+/// Configuration for the Merge and Stitch compute shaders
+#[repr(C)]
+#[derive(Debug, Copy, Clone, Pod, Zeroable)]
+pub struct ComputeConfig {
+    /// PreMergeTileCount; 1:1 with tiles
+    pub pmt_count: u32,
+    /// The total number of writing tiles: alpha_buffer.len() / (tile::WIDTH * tile::HEIGHT).
+    pub end_tile_count: u32,
+    /// Placeholder
+    pub c: u32,
+    /// Placeholder
+    pub d: u32,
+}
+
 /// Represents a GPU strip for rendering.
 ///
 /// This struct corresponds to the `StripInstance` struct in the shader.
diff --git a/sparse_strips/vello_hybrid/src/render/wgpu.rs b/sparse_strips/vello_hybrid/src/render/wgpu.rs
index a862776..51756e4 100644
--- a/sparse_strips/vello_hybrid/src/render/wgpu.rs
+++ b/sparse_strips/vello_hybrid/src/render/wgpu.rs
@@ -22,6 +22,7 @@
 use alloc::{sync::Arc, vec};
 use core::{fmt::Debug, mem, num::NonZeroU64};
 use wgpu::Extent3d;
+use std::println;
 
 use crate::AtlasConfig;
 use crate::multi_atlas::AtlasId;
@@ -32,7 +33,7 @@
     render::{
         Config,
         common::{
-            GPU_ENCODED_IMAGE_SIZE_TEXELS, GPU_LINEAR_GRADIENT_SIZE_TEXELS,
+            ComputeConfig, GPU_ENCODED_IMAGE_SIZE_TEXELS, GPU_LINEAR_GRADIENT_SIZE_TEXELS,
             GPU_RADIAL_GRADIENT_SIZE_TEXELS, GPU_SWEEP_GRADIENT_SIZE_TEXELS, GpuEncodedImage,
             GpuEncodedPaint, GpuLinearGradient, GpuRadialGradient, GpuSweepGradient,
             pack_image_offset, pack_image_params, pack_image_size, pack_radial_kind_and_swapped,
@@ -50,6 +51,7 @@
     paint::ImageSource,
     peniko,
     pixmap::Pixmap,
+    strip::PreMergeTile,
     tile::Tile,
 };
 use wgpu::{
@@ -99,6 +101,11 @@
         Self::new_with(device, render_target_config, RenderSettings::default())
     }
 
+    /// Returns the stitch indicator buffer, e.g. for debug readback.
+    pub fn get_stitch_indicator_buffer(&self) -> &wgpu::Buffer {
+        &self.programs.resources.stitch_indicator_buffer
+    }
+
     /// Creates a new renderer with specific settings.
     pub fn new_with(
         device: &Device,
@@ -146,12 +153,41 @@
         self.programs.prepare(
             device,
             queue,
+            &scene.strip_storage.pre_merge_tiles,
             &mut self.gradient_cache,
             &self.encoded_paints,
             &scene.strip_storage.alphas,
             render_size,
             &self.paint_idxs,
         );
+
+        const BLOCK_DIM: u32 = 256;
+        {
+            let tile_count = scene.strip_storage.pre_merge_tiles.len() as u32;
+            println!("PMT Size {}\n", tile_count);
+            println!("Alpha Size {}\n", scene.strip_storage.alphas.len());
+            let workgroup_x = (tile_count + BLOCK_DIM - 1) / BLOCK_DIM;
+
+            let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label: Some("Merge Compute Pass"),
+                timestamp_writes: None,
+            });
+            compute_pass.set_pipeline(&self.programs.merge_pipeline);
+            compute_pass.set_bind_group(0, &self.programs.resources.merge_bind_group, &[]);
+            compute_pass.dispatch_workgroups(workgroup_x, 1, 1);
+        }
+        {
+            let end_tile_count = (scene.strip_storage.alphas.len() / 16) as u32;
+            let workgroup_x = (end_tile_count + BLOCK_DIM - 1) / BLOCK_DIM;
+            let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label: Some("Stitch Compute Pass"),
+                timestamp_writes: None,
+            });
+            compute_pass.set_pipeline(&self.programs.stitch_pipeline);
+            compute_pass.set_bind_group(0, &self.programs.resources.merge_bind_group, &[]);
+            compute_pass.dispatch_workgroups(workgroup_x, 1, 1);
+        }
+
         let mut junk = RendererContext {
             programs: &mut self.programs,
             device,
@@ -447,10 +483,19 @@
     gradient_bind_group_layout: BindGroupLayout,
     /// Bind group layout for atlas textures
     atlas_bind_group_layout: BindGroupLayout,
+    /// Bind group layout for merge and stitch shaders
+    merge_bind_group_layout: BindGroupLayout,
+
     /// Pipeline for clearing slots in slot textures.
     clear_pipeline: RenderPipeline,
     /// Pipeline for clearing atlas regions.
     atlas_clear_pipeline: RenderPipeline,
+
+    /// Merge compute pipeline
+    merge_pipeline: wgpu::ComputePipeline,
+    /// Stitch compute pipeline
+    stitch_pipeline: wgpu::ComputePipeline,
+
     /// GPU resources for rendering (created during prepare)
     resources: GpuResources,
     /// Dimensions of the rendering target
@@ -487,6 +532,16 @@
     view_config_buffer: Buffer,
     /// Config buffer for rendering wide tile commands into a slot texture.
     slot_config_buffer: Buffer,
+    /// Config buffer for Stitch and Merge compute shaders.
+    compute_config_buffer: Buffer,
+
+    // Merge and stitch
+    premerge_tile_buffer: Buffer,
+    stitch_indicator_buffer: Buffer,
+    stitch_loc_buffer: Buffer,
+    part_indicator_buffer: Buffer,
+    part_acc_buffer: Buffer,
+    part_loc_buffer: Buffer,
 
     /// Buffer for slot indices used in `clear_slots`
     clear_slot_indices_buffer: Buffer,
@@ -497,6 +552,8 @@
 
     /// Bind group for clear slots operation
     clear_bind_group: BindGroup,
+
+    merge_bind_group: BindGroup,
 }
 
 const SIZE_OF_CONFIG: NonZeroU64 = NonZeroU64::new(size_of::<Config>() as u64).unwrap();
@@ -633,6 +690,93 @@
                 }],
             });
 
+        let merge_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("Merge/Stitch Bind Group Layout"),
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0, // config
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 1, // pmt_in
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: true },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 2, // stitch_indicator
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 3, // part_indicator
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 4, // stitch_loc
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 5, // part_acc
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 6, // part_loc
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 7, // output (alphas texture)
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::StorageTexture {
+                            access: wgpu::StorageTextureAccess::WriteOnly,
+                            format: wgpu::TextureFormat::Rgba32Uint,
+                            view_dimension: wgpu::TextureViewDimension::D2,
+                        },
+                        count: None,
+                    },
+                ],
+            });
+
         let strip_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
             label: Some("Strip Shader"),
             source: wgpu::ShaderSource::Wgsl(vello_sparse_shaders::wgsl::RENDER_STRIPS.into()),
@@ -643,6 +787,9 @@
             source: wgpu::ShaderSource::Wgsl(vello_sparse_shaders::wgsl::CLEAR_SLOTS.into()),
         });
 
+        let merge_shader =
+            device.create_shader_module(wgpu::include_wgsl!("../shaders/merge.wgsl"));
+
         let strip_pipeline_layout =
             device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
                 label: Some("Strip Pipeline Layout"),
@@ -770,6 +917,31 @@
             cache: None,
         });
 
+        let merge_pipeline_layout =
+            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: Some("Merge/Stitch Pipeline Layout"),
+                bind_group_layouts: &[&merge_bind_group_layout],
+                push_constant_ranges: &[],
+            });
+
+        let merge_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: Some("Merge Pipeline"),
+            layout: Some(&merge_pipeline_layout),
+            module: &merge_shader,
+            entry_point: Some("merge"),
+            compilation_options: Default::default(),
+            cache: None,
+        });
+
+        let stitch_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: Some("Stitch Pipeline"),
+            layout: Some(&merge_pipeline_layout),
+            module: &merge_shader,
+            entry_point: Some("stitch"),
+            compilation_options: Default::default(),
+            cache: None,
+        });
+
         let slot_texture_views: [TextureView; 2] = core::array::from_fn(|_| {
             device
                 .create_texture(&wgpu::TextureDescriptor {
@@ -896,7 +1068,54 @@
             &slot_texture_views,
         );
 
-        let resources = GpuResources {
+        let compute_config_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: Some("Compute Config Buffer"),
+            contents: bytemuck::bytes_of(&ComputeConfig {
+                pmt_count: 0,
+                end_tile_count: 0,
+                c: 0,
+                d: 0,
+            }),
+            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+        });
+        let premerge_tile_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Pre-merge Tile Buffer"),
+            size: 128,
+            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+        let stitch_indicator_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Stitch Indicator Buffer"),
+            size: 128,
+            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+            mapped_at_creation: false,
+        });
+        let stitch_loc_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Stitch Location Buffer"),
+            size: 128,
+            usage: wgpu::BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
+        let part_indicator_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Partition Indicator Buffer"),
+            size: 128,
+            usage: wgpu::BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
+        let part_acc_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Partition Accumulator Buffer"),
+            size: 128,
+            usage: wgpu::BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
+        let part_loc_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Partition Location Buffer"),
+            size: 128,
+            usage: wgpu::BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
+
+        let mut resources = GpuResources {
             strips_buffer: Self::create_strips_buffer(device, 0),
             clear_slot_indices_buffer,
             slot_texture_views,
@@ -912,13 +1131,35 @@
             gradient_texture,
             gradient_bind_group,
             view_config_buffer,
+            compute_config_buffer,
+            premerge_tile_buffer,
+            stitch_indicator_buffer,
+            stitch_loc_buffer,
+            part_indicator_buffer,
+            part_acc_buffer,
+            part_loc_buffer,
+            merge_bind_group: Self::create_placeholder_bind_group(device, &merge_bind_group_layout),
         };
 
+        resources.merge_bind_group = Self::create_merge_bind_group(
+            device,
+            &merge_bind_group_layout,
+            &resources.compute_config_buffer,
+            &resources.premerge_tile_buffer,
+            &resources.stitch_indicator_buffer,
+            &resources.part_indicator_buffer,
+            &resources.stitch_loc_buffer,
+            &resources.part_acc_buffer,
+            &resources.part_loc_buffer,
+            &resources.alphas_texture,
+        );
+
         Self {
             strip_pipeline,
             strip_bind_group_layout,
             encoded_paints_bind_group_layout,
             gradient_bind_group_layout,
+            merge_bind_group_layout,
             atlas_bind_group_layout,
             resources,
             alpha_data,
@@ -929,9 +1170,110 @@
             },
             clear_pipeline,
             atlas_clear_pipeline,
+            merge_pipeline,
+            stitch_pipeline,
         }
     }
 
+    /// Build a bind group for the merge/stitch layout from freshly created
+    /// dummy resources.
+    ///
+    /// This exists so `GpuResources` can be constructed in one step before the
+    /// real buffers are wired up; the placeholder is replaced immediately by
+    /// `create_merge_bind_group` and is never dispatched against.
+    fn create_placeholder_bind_group(
+        device: &wgpu::Device,
+        layout: &wgpu::BindGroupLayout,
+    ) -> wgpu::BindGroup {
+        // Dummy resources matching the layout's binding types. The bind group
+        // holds references to its resources, so dropping the locals is fine.
+        let dummy_uniform_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Dummy Uniform"),
+            size: 16,
+            usage: wgpu::BufferUsages::UNIFORM,
+            mapped_at_creation: false,
+        });
+        let dummy_storage_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Dummy Storage"),
+            size: 16,
+            usage: wgpu::BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
+        let dummy_texture = device.create_texture(&wgpu::TextureDescriptor {
+            label: Some("Dummy Texture"),
+            size: wgpu::Extent3d {
+                width: 1,
+                height: 1,
+                depth_or_array_layers: 1,
+            },
+            mip_level_count: 1,
+            sample_count: 1,
+            dimension: wgpu::TextureDimension::D2,
+            // Must match the real alphas texture format/usages so the view
+            // satisfies the `rgba32uint` storage binding in the layout.
+            format: wgpu::TextureFormat::Rgba32Uint,
+            usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::STORAGE_BINDING,
+            view_formats: &[],
+        });
+        let dummy_texture_view = dummy_texture.create_view(&Default::default());
+
+        device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Placeholder Merge Bind Group"),
+            layout,
+            entries: &[
+                wgpu::BindGroupEntry { binding: 0, resource: dummy_uniform_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 1, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 2, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 3, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 4, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 5, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 6, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 7, resource: wgpu::BindingResource::TextureView(&dummy_texture_view) },
+            ],
+        })
+    }
+
+    /// Create the bind group for the merge/stitch compute passes.
+    ///
+    /// Binding order must match `merge.wgsl`:
+    /// 0 = config uniform, 1 = pre-merge tiles, 2 = stitch indicators,
+    /// 3 = partition indicators, 4 = stitch locations, 5 = partition
+    /// accumulators (double strided), 6 = partition locations,
+    /// 7 = alphas texture, written by the shader as `rgba32uint` storage.
+    fn create_merge_bind_group(
+        device: &wgpu::Device,
+        layout: &wgpu::BindGroupLayout,
+        config: &wgpu::Buffer,
+        pmt: &wgpu::Buffer,
+        stitch_indicator: &wgpu::Buffer,
+        part_indicator: &wgpu::Buffer,
+        stitch_loc: &wgpu::Buffer,
+        part_acc: &wgpu::Buffer,
+        part_loc: &wgpu::Buffer,
+        alphas: &wgpu::Texture,
+    ) -> wgpu::BindGroup {
+        device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Merge/Stitch Bind Group"),
+            layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: config.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 1,
+                    resource: pmt.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 2,
+                    resource: stitch_indicator.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 3,
+                    resource: part_indicator.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 4,
+                    resource: stitch_loc.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 5,
+                    resource: part_acc.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 6,
+                    resource: part_loc.as_entire_binding(),
+                },
+                // A fresh default view of the alphas texture; rebuilt whenever
+                // the texture is resized (caller recreates the bind group).
+                wgpu::BindGroupEntry {
+                    binding: 7,
+                    resource: wgpu::BindingResource::TextureView(
+                        &alphas.create_view(&Default::default()),
+                    ),
+                },
+            ],
+        })
+    }
+
     fn create_strips_buffer(device: &Device, required_strips_size: u64) -> Buffer {
         device.create_buffer(&wgpu::BufferDescriptor {
             label: Some("Strips Buffer"),
@@ -979,7 +1321,9 @@
             sample_count: 1,
             dimension: wgpu::TextureDimension::D2,
             format: wgpu::TextureFormat::Rgba32Uint,
-            usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
+            usage: wgpu::TextureUsages::TEXTURE_BINDING
+                | wgpu::TextureUsages::COPY_DST
+                | wgpu::TextureUsages::STORAGE_BINDING,
             view_formats: &[],
         })
     }
@@ -1172,18 +1516,99 @@
         &mut self,
         device: &Device,
         queue: &Queue,
+        pmt: &[PreMergeTile],
         gradient_cache: &mut GradientRampCache,
         encoded_paints: &[GpuEncodedPaint],
         alphas: &[u8],
         new_render_size: &RenderSize,
         paint_idxs: &[u32],
     ) {
+        let mut new_bg = false;
         let max_texture_dimension_2d = device.limits().max_texture_dimension_2d;
-        self.maybe_resize_alphas_tex(device, max_texture_dimension_2d, alphas);
+        new_bg |= self.maybe_resize_alphas_tex(device, max_texture_dimension_2d, alphas);
         self.maybe_resize_encoded_paints_tex(device, max_texture_dimension_2d, paint_idxs);
         self.maybe_update_config_buffer(queue, max_texture_dimension_2d, new_render_size);
 
-        self.upload_alpha_texture(queue, alphas);
+        new_bg |= Self::maybe_resize_buffer(
+            device,
+            &mut self.resources.premerge_tile_buffer,
+            pmt.len() as u64 * std::mem::size_of::<PreMergeTile>() as u64,
+            "PreMergeTile Buffer",
+        );
+
+        let end_tiles = (alphas.len() / 16) as u64;
+        new_bg |= Self::maybe_resize_buffer(
+            device,
+            &mut self.resources.stitch_indicator_buffer,
+            end_tiles * 4,
+            "StitchIndicator",
+        );
+
+        new_bg |= Self::maybe_resize_buffer(
+            device,
+            &mut self.resources.stitch_loc_buffer,
+            end_tiles * 64,
+            "StitchLoc",
+        );
+
+        const BLOCK_DIM: u64 = 256;
+        let num_partitions = (pmt.len() as u64 + BLOCK_DIM - 1) / BLOCK_DIM;
+        new_bg |= Self::maybe_resize_buffer(
+            device,
+            &mut self.resources.part_indicator_buffer,
+            num_partitions * 4,
+            "PartIndicator",
+        );
+        new_bg |= Self::maybe_resize_buffer(
+            device,
+            &mut self.resources.part_acc_buffer,
+            num_partitions * 32,
+            "PartAcc",
+        );
+        new_bg |= Self::maybe_resize_buffer(
+            device,
+            &mut self.resources.part_loc_buffer,
+            num_partitions * 64,
+            "PartLoc",
+        );
+
+        if new_bg {
+            self.resources.merge_bind_group = Self::create_merge_bind_group(
+                device,
+                &self.merge_bind_group_layout,
+                &self.resources.compute_config_buffer,
+                &self.resources.premerge_tile_buffer,
+                &self.resources.stitch_indicator_buffer,
+                &self.resources.part_indicator_buffer,
+                &self.resources.stitch_loc_buffer,
+                &self.resources.part_acc_buffer,
+                &self.resources.part_loc_buffer,
+                &self.resources.alphas_texture,
+            );
+        }
+
+        //TODO maybe_update_compute_config_buffer
+        queue.write_buffer(
+            &self.resources.compute_config_buffer,
+            0,
+            bytemuck::bytes_of(&ComputeConfig {
+                pmt_count: pmt.len() as u32,
+                end_tile_count: alphas.len() as u32 / 16,
+                c: 0,
+                d: 0,
+            }),
+        );
+
+        if !pmt.is_empty() {
+            queue.write_buffer(
+                &self.resources.premerge_tile_buffer,
+                0,
+                bytemuck::cast_slice(pmt),
+            );
+        }
+
+
+        //self.upload_alpha_texture(queue, alphas); //delete this when alpha_buff is removed
         self.upload_encoded_paints_texture(queue, encoded_paints);
 
         if gradient_cache.has_changed() {
@@ -1199,7 +1624,7 @@
         device: &Device,
         max_texture_dimension_2d: u32,
         alphas: &[u8],
-    ) {
+    ) -> bool {
         let required_alpha_height = u32::try_from(alphas.len())
             .unwrap()
             // There are 16 1-byte alpha values per texel.
@@ -1239,7 +1664,9 @@
                 &self.resources.view_config_buffer,
                 &self.resources.slot_texture_views,
             );
+            return true
         }
+        false
     }
 
     /// Update the encoded paints texture size if needed.
@@ -1345,6 +1772,26 @@
         }
     }
 
+    /// Recreate `buffer` with at least `required_size` bytes (keeping its
+    /// usage flags) when it is too small. Returns `true` if a new buffer was
+    /// allocated — the caller must then rebuild any bind group referencing it.
+    ///
+    /// NOTE(review): the old contents are not copied on resize; callers are
+    /// expected to re-upload after a grow — confirm all call sites do so.
+    fn maybe_resize_buffer(
+        device: &Device,
+        buffer: &mut Buffer,
+        required_size: u64,
+        label: &'static str,
+    ) -> bool {
+        if required_size > buffer.size() {
+            // Round up to a 16-byte multiple — presumably for binding/copy
+            // alignment; TODO confirm the actual required alignment.
+            let new_size = (required_size + 15) & !15;
+            *buffer = device.create_buffer(&wgpu::BufferDescriptor {
+                label: Some(label),
+                size: new_size,
+                usage: buffer.usage(),
+                mapped_at_creation: false,
+            });
+            true
+        } else {
+            false
+        }
+    }
+
     /// Resize the texture array to accommodate more atlases.
     fn maybe_resize_atlas_texture_array(
         device: &Device,
diff --git a/sparse_strips/vello_hybrid/src/scene.rs b/sparse_strips/vello_hybrid/src/scene.rs
index 40234d8..986aced 100644
--- a/sparse_strips/vello_hybrid/src/scene.rs
+++ b/sparse_strips/vello_hybrid/src/scene.rs
@@ -16,7 +16,7 @@
 use vello_common::peniko::color::palette::css::BLACK;
 use vello_common::peniko::{BlendMode, Compose, Fill, Mix};
 use vello_common::recording::{PushLayerCommand, Recordable, Recorder, Recording, RenderCommand};
-use vello_common::strip::Strip;
+use vello_common::strip::{PreMergeTile, Strip};
 use vello_common::strip_generator::{GenerationMode, StripGenerator, StripStorage};
 
 use crate::AtlasConfig;
@@ -439,8 +439,8 @@
     }
 
     fn execute_recording(&mut self, recording: &Recording) {
-        let (cached_strips, cached_alphas) = recording.get_cached_strips();
-        let adjusted_strips = self.prepare_cached_strips(cached_strips, cached_alphas);
+        let (cached_strips, cached_pmt, cached_alphas) = recording.get_cached_strips();
+        let adjusted_strips = self.prepare_cached_strips(cached_strips, cached_pmt, cached_alphas);
 
         // Use pre-calculated strip start indices from when we generated the cache
         let strip_start_indices = recording.get_strip_start_indices();
@@ -637,13 +637,23 @@
     fn prepare_cached_strips(
         &mut self,
         cached_strips: &[Strip],
+        cached_pmt: &[PreMergeTile],
         cached_alphas: &[u8],
     ) -> Vec<Strip> {
         // Calculate offset for alpha indices based on current buffer size.
         let alpha_offset = self.strip_storage.alphas.len() as u32;
-        // Extend current alpha buffer with cached alphas.
-        self.strip_storage.alphas.extend(cached_alphas);
-        // Create adjusted strips with corrected alpha indices
+
+        // Extend current alpha and pre-merge tile buffers with cached data.
+        self.strip_storage.alphas.extend_from_slice(cached_alphas);
+        self.strip_storage
+            .pre_merge_tiles
+            .extend(cached_pmt.iter().map(move |pmt| {
+                let mut adjusted_pmt = *pmt;
+                adjusted_pmt.alpha_index += alpha_offset;
+                adjusted_pmt
+            }));
+
+        // Create adjusted strips with corrected alpha indices.
         cached_strips
             .iter()
             .map(move |strip| {
diff --git a/sparse_strips/vello_hybrid/src/shaders/merge.wgsl b/sparse_strips/vello_hybrid/src/shaders/merge.wgsl
new file mode 100644
index 0000000..a027467
--- /dev/null
+++ b/sparse_strips/vello_hybrid/src/shaders/merge.wgsl
@@ -0,0 +1,536 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+// Uniform parameters for the merge/stitch passes.
+struct Config {
+    // Count of pre merge tiles
+    pmt_count: u32,
+    // Count of tiles which will write to the alpha texture
+    end_tile_count: u32,
+    // Unused; pad the uniform to 16 bytes (the host writes zeroes).
+    c: u32,
+    d: u32,
+};
+
+// One line segment clipped to a tile, plus the bookkeeping needed to merge
+// its coverage with the other lines touching the same tile.
+struct PreMergeTile {
+    // The index into the alpha buffer that an "end tile" should write to.
+    alpha_index: u32,
+    // Contains tile location information
+    packed_info: u32,
+    // The exclusive prefix sum of the signed winding number.
+    // Uploading it saves doing the scan here
+    scanned_winding: i32,
+    // Explicit padding; keeps `p0` at offset 16 so this matches the 32-byte
+    // host-side struct. TODO confirm against the Rust `PreMergeTile` layout.
+    padding: u32,
+    // Line points, adjusted by the tile position
+    p0: vec2f,
+    p1: vec2f,
+};
+
+const TILE_HEIGHT = 4u;
+const TILE_WIDTH = 4u;
+
+@group(0) @binding(0)
+var<uniform> config : Config;
+
+@group(0) @binding(1)
+var<storage, read> pmt_in: array<PreMergeTile>;
+
+@group(0) @binding(2)
+var<storage, read_write> stitch_indicator: array<u32>;
+
+@group(0) @binding(3)
+var<storage, read_write> part_indicator: array<u32>;
+
+@group(0) @binding(4)
+var<storage, read_write> stitch_loc: array<array<array<f32, TILE_HEIGHT>, TILE_WIDTH>>;
+
+// This is double strided. This is because we need either:
+// 1) The reduction from the last segment start to the end of the partition
+// 2) The reduction from the last segment start to the last tile start
+// These are not mutually exclusive! Stitching requires both!
+@group(0) @binding(5)
+var<storage, read_write> part_acc: array<array<f32, TILE_HEIGHT>>;
+
+@group(0) @binding(6)
+var<storage, read_write> part_loc: array<array<array<f32, TILE_HEIGHT>, TILE_WIDTH>>;
+
+@group(0) @binding(7)
+var output: texture_storage_2d<rgba32uint, write>;
+
+// @group(0) @binding(7)
+// var<storage, read_write> alpha_buff: array<vec4<u32>>;
+
+// MSB                                                                  LSB
+// 31------------------21|20------------------12|11-------------------3|2|1|0|
+// |      Unused (11)    |  Seg Start ID (9)    |  Tile Start ID (9)   |F|T|E|
+//
+// F = FILL_RULE_MASK
+// T = IS_TILE_FIRST_COL_MASK
+// E = IS_END_TILE_MASK
+const IS_END_TILE_MASK: u32 = 1u;
+const IS_TILE_FIRST_COL_MASK: u32 = 2u;
+const FILL_RULE_MASK: u32 = 4u;
+const TILE_START_ID_SHIFT: u32 = 3u;
+const SEG_START_ID_SHIFT: u32 = 12u;
+const ID_MASK: u32 = 0x1ffu;
+const INVALID_ID: u32 = 256u;
+
+// True if this pmt is the last chunk of its tile (the one that writes out).
+fn is_end_tile(packed_info: u32) -> bool {
+    return (packed_info & IS_END_TILE_MASK) != 0u;
+}
+
+// True if this pmt covers the first (leftmost) column chunk of its tile —
+// only then is winding from geometry left of the tile accumulated.
+fn is_tile_first_col(packed_info: u32) -> bool {
+    return (packed_info & IS_TILE_FIRST_COL_MASK) != 0u;
+}
+
+// Fill rule flag: set = NonZero, unset = EvenOdd.
+fn is_fill_rule_non_zero(packed_info: u32) -> bool {
+    return (packed_info & FILL_RULE_MASK) != 0u;
+}
+
+// Workgroup-local index of the first pmt of this tile; INVALID_ID when the
+// tile started in an earlier partition (resolved later by `stitch`).
+fn get_tile_start_id(packed_info: u32) -> u32 {
+    return (packed_info >> TILE_START_ID_SHIFT) & ID_MASK;
+}
+
+// Workgroup-local index of the first pmt of this segment; INVALID_ID when the
+// segment started in an earlier partition (resolved later by `stitch`).
+fn get_seg_start_id(packed_info: u32) -> u32 {
+    return (packed_info >> SEG_START_ID_SHIFT) & ID_MASK;
+}
+
+// MSB                                                                  LSB
+// 31|30|29--------------------------------------------------------------0|
+// |L|A|                     Partition ID (30 bits)                       |
+//
+// L = STITCH_LOC_MASK
+// A = STITCH_ACC_MASK
+const STITCH_LOC_MASK = 1u << 31u;
+const STITCH_ACC_MASK = 1u << 30u;
+const PART_ID_MASK = 0x3fffffffu;
+
+// Whether the end tile must pull per-pixel `loc` totals from earlier partitions.
+fn loc_stitch_required(in: u32) -> bool {
+    return (in & STITCH_LOC_MASK) != 0u;
+}
+
+// Whether the end tile must pull per-row `acc` totals from earlier partitions.
+fn acc_stitch_required(in: u32) -> bool {
+    return (in & STITCH_ACC_MASK) != 0u;
+}
+
+// The partition (workgroup) that produced this stitch indicator.
+fn get_part_id(in: u32) -> u32 {
+    return in & PART_ID_MASK;
+}
+
+// MSB                                                               LSB
+// 31-----------------------------------------------------------2|1|0|
+// |                      Unused (30 bits)                       |T|S|
+//
+// S = PART_SEG_START_MASK
+// T = PART_TILE_START_MASK
+const PART_SEG_START_MASK: u32 = 1u << 0u;
+const PART_TILE_START_MASK: u32 = 1u << 1u;
+// Set when the last pmt of the partition found its segment start within the
+// partition, i.e. a lookback for `acc` can stop here.
+fn part_has_seg_start(in: u32) -> bool {
+    return (in & PART_SEG_START_MASK) != 0u;
+}
+
+// Set when the last pmt of the partition found its tile start within the
+// partition, i.e. a lookback for `loc` can stop here.
+fn part_has_tile_start(in: u32) -> bool {
+    return (in & PART_TILE_START_MASK) != 0u;
+}
+
+const BLOCK_DIM = 256u;
+const LG_BLOCK_DIM = 8u;
+const SCAN_CONST = LG_BLOCK_DIM - 1u;
+const TILE_SIZE = TILE_HEIGHT * TILE_WIDTH;
+const EPSILON = 1e-6;
+
+// NOTE(review): at BLOCK_DIM = 256, wg_acc is 4 KiB and wg_loc is 16 KiB —
+// 20 KiB of workgroup storage total, which exceeds the WebGPU default
+// maxComputeWorkgroupStorageSize of 16 KiB. Confirm the device limit is
+// raised at adapter request time, or reduce BLOCK_DIM / tile footprint.
+var<workgroup> wg_acc: array<array<f32, TILE_HEIGHT>, BLOCK_DIM>;
+var<workgroup> wg_loc: array<array<array<f32, TILE_HEIGHT>, TILE_WIDTH>, BLOCK_DIM>;
+
+// 1 thread per pre_pmt
+//
+// Pass 1 of coverage merging. Each thread rasterizes one line-in-tile
+// (PreMergeTile) to per-pixel area (`loc`) and per-row winding (`acc`),
+// the workgroup scans those, and "end tile" threads either write the final
+// alphas to the output texture or record that cross-partition stitching is
+// needed (resolved by the `stitch` entry point in a second dispatch).
+@compute @workgroup_size(BLOCK_DIM, 1, 1)
+fn merge(@builtin(local_invocation_id) tid: vec3<u32>,
+         @builtin(global_invocation_id) gid: vec3<u32>,
+         @builtin(workgroup_id) wgid: vec3<u32>)  {
+    // Out-of-range threads load a dummy pmt (horizontal zero-length line,
+    // not an end tile) so they can still participate in the scan.
+    var pmt: PreMergeTile;
+    if (gid.x < config.pmt_count) {
+        pmt = pmt_in[gid.x];
+    } else {
+        pmt = PreMergeTile(
+            0xffffffffu,
+            0u,
+            0,
+            0u,
+            vec2f(0.0, 0.0),
+            vec2f(0.0, 0.0),
+        );
+    }
+
+    // acc[y]: this line's signed winding contribution per tile row.
+    // loc[x][y]: this line's signed coverage area per tile pixel.
+    var acc = array<f32, TILE_HEIGHT>();
+    var loc = array<array<f32, TILE_HEIGHT>, TILE_WIDTH>();
+    if (abs(pmt.p0.y - pmt.p1.y) >= EPSILON) { // If not horizontal. . .
+        // Orient the segment top-to-bottom (and remember left/right ends);
+        // coordinates are tile-local.
+        var line_top_y: f32;
+        var line_top_x: f32;
+        var line_bottom_y: f32;
+        var line_bottom_x: f32;
+        if (pmt.p0.y < pmt.p1.y) {
+            line_top_y = pmt.p0.y;
+            line_top_x = pmt.p0.x;
+            line_bottom_y = pmt.p1.y;
+            line_bottom_x = pmt.p1.x;
+        } else {
+            line_top_y = pmt.p1.y;
+            line_top_x = pmt.p1.x;
+            line_bottom_y = pmt.p0.y;
+            line_bottom_x = pmt.p0.x;
+        }
+
+        var line_left_x: f32;
+        var line_left_y: f32;
+        var line_right_x: f32;
+        if (pmt.p0.x < pmt.p1.x) {
+            line_left_x = pmt.p0.x;
+            line_left_y = pmt.p0.y;
+            line_right_x = pmt.p1.x;
+        } else {
+            line_left_x = pmt.p1.x;
+            line_left_y = pmt.p1.y;
+            line_right_x = pmt.p0.x;
+        }
+
+        let dx = line_bottom_x - line_top_x;
+        let dy = line_bottom_y - line_top_y;
+        let is_vertical = abs(dx) < EPSILON;
+        let y_slope = dy / dx;
+        let x_slope = dx / dy;
+
+        // Unnecessary, we remove vertical and horizontal cases, so a comparison will be sufficient;
+        // i.e. dont care about sign of zero
+        //let sign = select(1.0f, -1.0f, (bitcast<u32>(pmt.p0.y - pmt.p1.y) & 0x80000000u) != 0u);
+        // Winding direction: downward-oriented original segment counts -1.
+        let sign = select(1.0f, -1.0f, pmt.p0.y < pmt.p1.y);
+
+        // Part of the line lies left of this tile's first column: fold its
+        // per-row vertical extent into `acc` as full-row winding.
+        if (is_tile_first_col(pmt.packed_info) && line_left_x < 0.0f) {
+            var ymin: f32;
+            var ymax: f32;
+            if (is_vertical) {
+                ymin = line_top_y;
+                ymax = line_bottom_y;
+            } else {
+                let line_viewport_left_y =
+                    min(max((line_top_y - line_top_x * y_slope), line_top_y), line_bottom_y);
+                ymin = min(line_left_y, line_viewport_left_y);
+                ymax = max(line_left_y, line_viewport_left_y);
+            }
+
+            var h = array<f32, TILE_HEIGHT>();
+            for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                let px_top_y = f32(y);
+                let px_bottom_y = 1.0f + px_top_y;
+                let ymin_clamped = max(ymin, px_top_y);
+                let ymax_clamped = min(ymax, px_bottom_y);
+                h[y] = max(ymax_clamped - ymin_clamped, 0.0f);
+                acc[y] = sign * h[y];
+            }
+        } else {
+            // No need to clear in wgsl, but other shader languages required.
+            for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                acc[y] = 0.0f;
+            }
+        }
+
+        // Rasterize the in-tile portion pixel by pixel, accumulating area
+        // into `loc` and per-row winding into `acc` left-to-right.
+        if (line_right_x >= 0.0f) {
+            if (is_vertical) {
+                let x_int = u32(floor(line_top_x));
+                for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                    for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                        var h = 0.0f;
+                        var area = 0.0f;
+                        if x == x_int {
+                            let px_top_y = f32(y);
+                            let px_bottom_y = 1.0f + px_top_y;
+                            let ymin = max(px_top_y, line_top_y);
+                            let ymax = min(px_bottom_y, line_bottom_y);
+                            let coverage_right = f32(x) + 1.0f - line_top_x;
+                            h = max(ymax - ymin, 0.0f);
+                            area = h * coverage_right;
+                        }
+                        loc[x][y] = acc[y] + sign * area;
+                        acc[y] += sign * h;
+                    }
+                }
+            }
+
+            if (!is_vertical) {
+                for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                    let px_left_x = f32(x);
+                    let px_right_x = 1.0f + px_left_x;
+                    for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                        let px_top_y = f32(y);
+                        let px_bottom_y = 1.0f + px_top_y;
+                        let ymin = max(line_top_y, px_top_y);
+                        let ymax = min(line_bottom_y, px_bottom_y);
+
+                        // Clamp the line's intersection with the pixel's left
+                        // and right edges, then integrate the trapezoid area.
+                        let line_px_left_y =
+                            min(max((line_top_y + (px_left_x - line_top_x) * y_slope), ymin), ymax);
+                        let line_px_right_y =
+                            min(max((line_top_y + (px_right_x - line_top_x) * y_slope), ymin), ymax);
+                        let line_px_left_yx =
+                            line_top_x + (line_px_left_y - line_top_y) * x_slope;
+                        let line_px_right_yx =
+                            line_top_x + (line_px_right_y - line_top_y) * x_slope;
+                        let h = abs(line_px_right_y - line_px_left_y);
+                        let area =
+                            0.5f * h * (2.0f * px_right_x - line_px_right_yx - line_px_left_yx);
+                        loc[x][y] = acc[y] + sign * area;
+                        acc[y] += sign * h;
+                    }
+                }
+            }
+
+            wg_loc[tid.x] = loc;
+        } else {
+            // Line is entirely left of the tile: every pixel gets the full
+            // per-row winding.
+            for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                wg_loc[tid.x][x] = acc;
+            }
+        }
+        wg_acc[tid.x] = acc;
+    } else {
+        // TODO For non wgsl this needs to clear
+        wg_acc[tid.x] = acc;
+        wg_loc[tid.x] = loc;
+    }
+
+    // Hillis–Steele (Kogge–Stone) style inclusive scan of acc/loc across the
+    // workgroup; after the loop wg_acc/wg_loc[i] hold prefix sums over 0..=i.
+    // NOTE(review): the final i == BLOCK_DIM pass is a no-op (no thread has
+    // tid.x >= BLOCK_DIM); `i < BLOCK_DIM` would suffice.
+    for (var i = 1u; i <= BLOCK_DIM; i <<= 1u) {
+        workgroupBarrier();
+        let ii = tid.x - i;
+        if (tid.x >= i) {
+            for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                acc[y] += wg_acc[ii][y];
+            }
+
+            for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    loc[x][y] += wg_loc[ii][x][y];
+                }
+            }
+        }
+        workgroupBarrier();
+
+        if (tid.x >= i) {
+            wg_acc[tid.x] = acc;
+            wg_loc[tid.x] = loc;
+        }
+    }
+    workgroupBarrier();
+
+    // Resolve prefix sums into tile-relative coverage. Only end tiles and the
+    // partition's last thread (which publishes partition totals) do this.
+    var scanned_winding = 0.0f;
+    let end_tile = is_end_tile(pmt.packed_info);
+    let seg_start_id = get_seg_start_id(pmt.packed_info);
+    let tile_start_id = get_tile_start_id(pmt.packed_info);
+    let seg_is_valid = seg_start_id != INVALID_ID;
+    let tile_is_valid = tile_start_id != INVALID_ID;
+    if (end_tile || tid.x == BLOCK_DIM - 1u) {
+        // Subtract the prefix up to (but excluding) the tile start so loc/acc
+        // cover only this tile's pmt run.
+        if (tile_is_valid) {
+            if (tile_start_id != 0u) {
+                for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                    for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                        loc[x][y] -= wg_loc[tile_start_id - 1u][x][y];
+                    }
+                }
+                acc = wg_acc[tile_start_id - 1u];
+            } else {
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    acc[y] = 0.0f;
+                }
+            }
+        }
+
+        // Fold in the pre-uploaded winding at the segment start, replacing the
+        // prefix before the segment.
+        if (seg_is_valid) {
+            scanned_winding = f32(pmt_in[(wgid.x << LG_BLOCK_DIM) + seg_start_id].scanned_winding);
+            if (seg_start_id != 0u) {
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    acc[y] += scanned_winding - wg_acc[seg_start_id - 1u][y];
+                }
+            } else {
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    acc[y] += scanned_winding;
+                }
+            }
+        }
+
+        // At this point, if this is the last thread in the threadBlock, this contains the reduction
+        // from the tile start
+        if (tid.x == BLOCK_DIM - 1u) {
+            part_loc[wgid.x] = loc;
+        }
+
+        // This will add in the acc for the cases we want: tile_is_valid, tile_is_valid && seg_is_valid
+        if (tile_is_valid) {
+            for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    loc[x][y] += acc[y];
+                }
+            }
+        }
+
+        // Only end tiles participate in the write out
+        if (end_tile) {
+            var s_ind = 0u;
+            if (tile_is_valid && seg_is_valid) { // Safe to write out
+                var final_alphas: vec4<u32>;
+                if (is_fill_rule_non_zero(pmt.packed_info)) {
+                    for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                        final_alphas[x] = pack4x8unorm(abs(vec4<f32>(loc[x][0], loc[x][1],
+                                                                     loc[x][2], loc[x][3])));
+                    }
+                } else {
+                    // EvenOdd fill rule logic
+                    for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                        let area = vec4<f32>(loc[x][0], loc[x][1], loc[x][2], loc[x][3]);
+                        let im1 = floor(area * 0.5 + 0.5);
+                        let coverage = abs(area - 2.0 * im1);
+                        final_alphas[x] = pack4x8unorm(coverage);
+                    }
+                }
+                // alpha_index counts bytes; 16 alphas (one texel) per tile.
+                let tex_dims = textureDimensions(output);
+                let tex_width = tex_dims.x;
+                let output_coords = vec2<u32>(
+                    (pmt.alpha_index >> 4) % tex_width,
+                    (pmt.alpha_index >> 4) / tex_width,
+                );
+                if (output_coords.y < tex_dims.y) {
+                    textureStore(output, output_coords, final_alphas);
+                }
+            } else {
+                // Tile or segment starts in an earlier partition: stash the
+                // partial result and flag which kind of lookback is needed.
+                // TODO fill rule on stitch
+                s_ind = wgid.x |
+                    select(0u, STITCH_ACC_MASK, tile_is_valid && !seg_is_valid) |
+                    select(0u, STITCH_LOC_MASK, !tile_is_valid);
+                stitch_loc[pmt.alpha_index >> 4u] = loc;
+            }
+            stitch_indicator[pmt.alpha_index >> 4u] = s_ind;
+        }
+    }
+
+    // Last thread publishes the partition totals used by `stitch` lookback:
+    // even part_acc index = tile-relative reduction, odd = segment-relative.
+    if (tid.x == BLOCK_DIM - 1) {
+        part_acc[wgid.x << 1u] = acc;
+        var seg_acc = wg_acc[tid.x];
+        if (seg_is_valid) {
+            if (seg_start_id != 0u) {
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    seg_acc[y] += scanned_winding - wg_acc[seg_start_id - 1u][y];
+                }
+            } else {
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    seg_acc[y] += scanned_winding;
+                }
+            }
+        }
+        part_acc[(wgid.x << 1u) + 1u] = seg_acc;
+
+        part_indicator[wgid.x] = select(0u, PART_TILE_START_MASK, tile_is_valid) |
+                                 select(0u, PART_SEG_START_MASK, seg_is_valid);
+    }
+}
+
+// 1 thread per end_tile
+// TODO once subgroups, this should be 1 subgroup : tile
+//
+// Pass 2: resolve end tiles whose tile and/or segment start lay in an earlier
+// partition, by walking backwards through the per-partition totals published
+// by `merge`, then write the final alphas. Must run in a separate dispatch
+// after `merge` has completed.
+@compute @workgroup_size(BLOCK_DIM, 1, 1)
+fn stitch(@builtin(local_invocation_id) tid: vec3<u32>,
+          @builtin(global_invocation_id) gid: vec3<u32>,
+          @builtin(workgroup_id) wgid: vec3<u32>)  {
+    var s_indicator = 0u;
+    if (gid.x < config.end_tile_count) {
+        s_indicator = stitch_indicator[gid.x];
+    }
+
+    // No stitching needed or oob
+    if (s_indicator == 0u) {
+        return;
+    }
+    let part_id = get_part_id(s_indicator);
+    var loc = stitch_loc[gid.x];
+    var acc = array<f32, TILE_HEIGHT>();
+    // Tile started in an earlier partition: accumulate partition `loc` totals
+    // backwards until the partition containing the tile start is found.
+    // (A set stitch flag implies part_id >= 1, so the lookback has somewhere
+    // to go — presumably guaranteed by construction in `merge`; verify.)
+    if (loc_stitch_required(s_indicator)) {
+        var lookback_id = part_id - 1u;
+        var part_ind: u32;
+        while (true) {
+            let p_loc = part_loc[lookback_id];
+            for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    loc[x][y] += p_loc[x][y];
+                }
+            }
+
+            // Did we hit a tile start?
+            part_ind = part_indicator[lookback_id];
+            if (part_has_tile_start(part_ind)) {
+                break;
+            } else {
+                lookback_id -= 1u;
+            }
+        }
+
+        // part_acc is double strided: even index = tile-relative reduction,
+        // odd index = segment-relative reduction.
+        lookback_id <<= 1u;
+        acc = part_acc[lookback_id];
+
+        // If the tile start also included a seg start, we're done. Else we will have to traverse to
+        // the previous seg start
+        if (!part_has_seg_start(part_ind)) {
+            lookback_id -= 1u; // Down to the upper
+            while (true) {
+                let p_acc = part_acc[lookback_id];
+                for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                    acc[y] += p_acc[y];
+                }
+
+                if (part_has_seg_start(part_indicator[lookback_id >> 1u])) {
+                    break;
+                } else {
+                    lookback_id -= 2u;
+                }
+            }
+        }
+    }
+
+    // Tile start was local but the segment start was not: accumulate only the
+    // segment-relative (odd-index) partition reductions backwards.
+    if (acc_stitch_required(s_indicator)) {
+        var lookback_id = (part_id - 1u) << 1u;
+        while (true) {
+            var s_acc = part_acc[lookback_id + 1u];
+            for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+                acc[y] += s_acc[y];
+            }
+
+            if (part_has_seg_start(part_indicator[lookback_id >> 1u])) {
+                break;
+            } else {
+                lookback_id -= 2u;
+            }
+        }
+    }
+
+    // Combine acc and loc
+    for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+        for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+            loc[x][y] += acc[y];
+        }
+    }
+
+    // NOTE(review): the EvenOdd branch below is currently unreachable.
+    var final_alphas: vec4<u32>;
+    if (true) { // TODO put fill rule onto the stitch
+        for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+            final_alphas[x] = pack4x8unorm(abs(vec4<f32>(loc[x][0], loc[x][1],
+                                                         loc[x][2], loc[x][3])));
+        }
+    } else {
+        // EvenOdd fill rule logic
+        for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+            let area = vec4<f32>(loc[x][0], loc[x][1], loc[x][2], loc[x][3]);
+            let im1 = floor(area * 0.5 + 0.5);
+            let coverage = abs(area - 2.0 * im1);
+            final_alphas[x] = pack4x8unorm(coverage);
+        }
+    }
+    // One thread per end tile, so gid.x is already the texel index.
+    let tex_dims = textureDimensions(output);
+    let tex_width = tex_dims.x;
+    let output_coords = vec2<u32>(
+        gid.x % tex_width,
+        gid.x / tex_width,
+    );
+    if (output_coords.y < tex_dims.y) {
+        textureStore(output, output_coords, final_alphas);
+    }
+}
diff --git a/sparse_strips/vello_toy/src/debug.rs b/sparse_strips/vello_toy/src/debug.rs
index 01732a8..f2b390e 100644
--- a/sparse_strips/vello_toy/src/debug.rs
+++ b/sparse_strips/vello_toy/src/debug.rs
@@ -33,6 +33,7 @@
     let mut line_buf = vec![];
     let mut tiles = Tiles::new(Level::new());
     let mut strip_buf = vec![];
+    let mut pmt_buf = vec![];
     let mut alpha_buf = vec![];
     let mut wide = Wide::<MODE_CPU>::new(args.width, args.height);
 
@@ -79,6 +80,7 @@
             Level::new(),
             &tiles,
             &mut strip_buf,
+            &mut pmt_buf,
             &mut alpha_buf,
             args.fill_rule,
             None,