it's jank

diff --git a/sparse_strips/vello_common/src/coarse.rs b/sparse_strips/vello_common/src/coarse.rs
index cbbc58f..66b1bdb 100644
--- a/sparse_strips/vello_common/src/coarse.rs
+++ b/sparse_strips/vello_common/src/coarse.rs

@@ -899,25 +899,27 @@
                 // inline the blend mode instead.
                 let (_, tail) = self.cmds.split_at(self.cmds.len() - 3);
 
+                // For vello_hybrid, I am not sure how inlining the blend mode is possible.
+                // Temporarily commented out so I can focus on the layers golden path.
                 let updated = match tail {
-                    [Cmd::PushBuf, Cmd::AlphaFill(a), Cmd::Blend(b)] => {
-                        if !b.is_destructive() && a.blend_mode.is_none() {
-                            let mut blended = a.clone();
-                            blended.blend_mode = Some(*b);
-                            Some(Cmd::AlphaFill(blended))
-                        } else {
-                            None
-                        }
-                    }
-                    [Cmd::PushBuf, Cmd::Fill(a), Cmd::Blend(b)] => {
-                        if !b.is_destructive() && a.blend_mode.is_none() {
-                            let mut blended = a.clone();
-                            blended.blend_mode = Some(*b);
-                            Some(Cmd::Fill(blended))
-                        } else {
-                            None
-                        }
-                    }
+                    // [Cmd::PushBuf, Cmd::AlphaFill(a), Cmd::Blend(b)] => {
+                    //     if !b.is_destructive() && a.blend_mode.is_none() {
+                    //         let mut blended = a.clone();
+                    //         blended.blend_mode = Some(*b);
+                    //         Some(Cmd::AlphaFill(blended))
+                    //     } else {
+                    //         None
+                    //     }
+                    // }
+                    // [Cmd::PushBuf, Cmd::Fill(a), Cmd::Blend(b)] => {
+                    //     if !b.is_destructive() && a.blend_mode.is_none() {
+                    //         let mut blended = a.clone();
+                    //         blended.blend_mode = Some(*b);
+                    //         Some(Cmd::Fill(blended))
+                    //     } else {
+                    //         None
+                    //     }
+                    // }
                     _ => None,
                 };
 

diff --git a/sparse_strips/vello_hybrid/examples/scenes/src/lib.rs b/sparse_strips/vello_hybrid/examples/scenes/src/lib.rs
index f9f5394..38a8478 100644
--- a/sparse_strips/vello_hybrid/examples/scenes/src/lib.rs
+++ b/sparse_strips/vello_hybrid/examples/scenes/src/lib.rs

@@ -50,24 +50,9 @@
 /// Get all available example scenes
 /// Unlike the Wasm version, this function allows for passing custom SVGs.
 #[cfg(not(target_arch = "wasm32"))]
-pub fn get_example_scenes(svg_paths: Option<Vec<&str>>) -> Box<[AnyScene]> {
+pub fn get_example_scenes() -> Box<[AnyScene]> {
     let mut scenes = Vec::new();
-
-    // Create SVG scenes for each provided path
-    if let Some(paths) = svg_paths {
-        for path in paths {
-            scenes.push(AnyScene::new(
-                svg::SvgScene::with_svg_file(path.into()).unwrap(),
-            ));
-        }
-    } else {
-        scenes.push(AnyScene::new(svg::SvgScene::tiger()));
-    }
-
-    scenes.push(AnyScene::new(text::TextScene::new("Hello, Vello!")));
     scenes.push(AnyScene::new(simple::SimpleScene::new()));
-    scenes.push(AnyScene::new(clip::ClipScene::new()));
-    scenes.push(AnyScene::new(image::ImageScene::new()));
 
     scenes.into_boxed_slice()
 }

diff --git a/sparse_strips/vello_hybrid/examples/scenes/src/simple.rs b/sparse_strips/vello_hybrid/examples/scenes/src/simple.rs
index b8da87b..8f19b44 100644
--- a/sparse_strips/vello_hybrid/examples/scenes/src/simple.rs
+++ b/sparse_strips/vello_hybrid/examples/scenes/src/simple.rs

@@ -3,8 +3,12 @@
 
 //! Simple example scene with basic shapes.
 
+use parley::Rect;
+use vello_common::color::palette::css::{BLUE, WHITE, YELLOW};
 use vello_common::kurbo::{Affine, BezPath, Stroke};
 use vello_common::peniko::color::palette;
+use vello_common::kurbo::Shape;
+use vello_common::peniko::{BlendMode, Compose, Mix};
 use vello_hybrid::Scene;
 
 use crate::ExampleScene;
@@ -33,21 +37,32 @@
 }
 
 /// Draws a simple scene with shapes
-pub fn render(ctx: &mut Scene, root_transform: Affine) {
-    let mut path = BezPath::new();
-    path.move_to((10.0, 10.0));
-    path.line_to((180.0, 20.0));
-    path.line_to((30.0, 40.0));
-    path.close_path();
+pub fn render(ctx: &mut Scene, _root_transform: Affine) {
+    let path = Rect::new(0.0, 0.0, 100 as f64, 100 as f64).to_path(0.1);
 
-    // Use a combined transform that includes the root transform
-    let scene_transform = Affine::scale(5.0);
-    ctx.set_transform(root_transform * scene_transform);
-
-    ctx.set_paint(palette::css::REBECCA_PURPLE);
+    ctx.set_paint(WHITE);
     ctx.fill_path(&path);
-    let stroke = Stroke::new(1.0);
-    ctx.set_paint(palette::css::DARK_BLUE);
-    ctx.set_stroke(stroke);
-    ctx.stroke_path(&path);
+
+    ctx.push_layer(
+        None,
+        Some(BlendMode::new(Mix::Normal, Compose::SrcOver)),
+        None,
+        None,
+    );
+
+    // Draw the destination layer.
+    ctx.set_paint(YELLOW.with_alpha(1.0));
+    ctx.fill_rect(&Rect::new(10.0, 10.0, 70.0, 70.0));
+    // Draw the source layer.
+    ctx.push_layer(
+        None,
+        Some(BlendMode::new(Mix::Normal, Compose::Xor)),
+        None,
+        None,
+    );
+    ctx.set_paint(BLUE.with_alpha(1.0));
+    ctx.fill_rect(&Rect::new(30.0, 30.0, 90.0, 90.0));
+    // Compose.
+    ctx.pop_layer();
+    ctx.pop_layer();
 }

diff --git a/sparse_strips/vello_hybrid/examples/winit/src/main.rs b/sparse_strips/vello_hybrid/examples/winit/src/main.rs
index 2482073..fa5256f 100644
--- a/sparse_strips/vello_hybrid/examples/winit/src/main.rs
+++ b/sparse_strips/vello_hybrid/examples/winit/src/main.rs

@@ -65,11 +65,8 @@
                 }
             }
         }
-        let scenes = if svg_paths.is_empty() {
-            get_example_scenes(None)
-        } else {
-            get_example_scenes(Some(svg_paths))
-        };
+        
+        let scenes = get_example_scenes();
 
         start_scene_index = start_scene_index.min(scenes.len() - 1);
         (scenes, start_scene_index)
@@ -83,7 +80,7 @@
         scenes,
         current_scene: start_scene_index,
         render_state: RenderState::Suspended(None),
-        scene: Scene::new(1800, 1200),
+        scene: Scene::new(100, 100),
         transform: Affine::IDENTITY,
         mouse_down: false,
         last_cursor_position: None,
@@ -119,8 +116,7 @@
         let window = cached_window.take().unwrap_or_else(|| {
             create_winit_window(
                 event_loop,
-                self.scene.width().into(),
-                self.scene.height().into(),
+                800,600,
                 true,
             )
         });
@@ -166,10 +162,10 @@
             WindowEvent::Resized(size) => {
                 self.context
                     .resize_surface(surface, size.width, size.height);
-                self.scene = Scene::new(
-                    u16::try_from(size.width).unwrap(),
-                    u16::try_from(size.height).unwrap(),
-                );
+                // self.scene = Scene::new(
+                //     u16::try_from(size.width).unwrap(),
+                //     u16::try_from(size.height).unwrap(),
+                // );
             }
             WindowEvent::KeyboardInput {
                 event:

diff --git a/sparse_strips/vello_hybrid/src/lib.rs b/sparse_strips/vello_hybrid/src/lib.rs
index 306acc3..04a6d02 100644
--- a/sparse_strips/vello_hybrid/src/lib.rs
+++ b/sparse_strips/vello_hybrid/src/lib.rs

@@ -29,7 +29,8 @@
 //!
 //! See the individual module documentation for more details on usage and implementation.
 
-#![no_std]
+// Commented out to allow `dbg!` and `println!`.
+// #![no_std]
 
 extern crate alloc;
 

diff --git a/sparse_strips/vello_hybrid/src/render/common.rs b/sparse_strips/vello_hybrid/src/render/common.rs
index 1a811e7..f21c6d6 100644
--- a/sparse_strips/vello_hybrid/src/render/common.rs
+++ b/sparse_strips/vello_hybrid/src/render/common.rs

@@ -74,6 +74,73 @@
     pub _padding2: [u32; 2],
 }
 
+/// Represents a GPU blend command for wide tile blending operations.
+///
+/// This struct corresponds to the `BlendCommand` struct in the blend_wide_tile.wgsl shader.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Zeroable, Pod)]
+pub struct GpuBlendCommand {
+    /// Top left of the source wide tile: `x` in the low 16 bits, `y` in the high 16 bits.
+    pub xy_src: u32,
+    /// Top left of the destination wide tile: `x` in the low 16 bits, `y` in the high 16 bits.
+    pub xy_dst: u32,
+    /// Bits 0-7: opacity
+    /// Bits 8-11: compose
+    /// Bits 12-15: mix
+    /// Bit 16: source texture (0 = slots of ix=0, 1 = slots of ix=1)
+    /// Bits 17-18: dest texture (0 = slots of ix=0, 1 = slots of ix=1, 2 = final target)
+    /// Bits 19-26: blend slot index
+    pub payload: u32,
+}
+
+/// Represents a GPU copy command for copying slots between textures.
+///
+/// This struct corresponds to the `CopyCommand` struct in the copy_slot.wgsl shader.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Zeroable, Pod)]
+pub struct GpuCopyCommand {
+    /// `[x, y]` packed as two `u16`s - coordinates of the top left of the target wide tile
+    pub xy_target: u32,
+    /// Slot index to identify the pixel position to sample from
+    pub slot_ix: u32,
+}
+
+/// Configuration for the blend wide tile operations
+#[repr(C)]
+#[derive(Debug, Copy, Clone, Pod, Zeroable)]
+pub struct BlendConfig {
+    /// Width of a wide tile (matching `WideTile::WIDTH`).
+    pub wide_tile_width: u32,
+    /// Height of a wide tile (matching `Tile::HEIGHT`).
+    pub wide_tile_height: u32,
+    /// Height of the slot texture.
+    pub slot_texture_height: u32,
+    /// Height of the final target texture.
+    pub final_target_height: u32,
+    /// Height of the blend texture.
+    pub blend_texture_height: u32,
+    /// Padding that rounds the struct size up to a multiple of 16 bytes
+    pub _padding: [u32; 3],
+}
+
+/// Configuration for the copy slot operations
+#[repr(C)]
+#[derive(Debug, Copy, Clone, Pod, Zeroable)]
+pub struct CopyConfig {
+    /// Width of a wide tile (matching `WideTile::WIDTH`).
+    pub wide_tile_width: u32,
+    /// Height of a wide tile (matching `Tile::HEIGHT`).
+    pub wide_tile_height: u32,
+    /// Height of the slot texture (source).
+    pub slot_texture_height: u32,
+    /// Width of the target texture (destination).
+    pub target_texture_width: u32,
+    /// Height of the target texture (destination).
+    pub target_texture_height: u32,
+    /// Padding that rounds the struct size up to a multiple of 16 bytes
+    pub _padding: [u32; 3],
+}
+
 #[cfg(all(target_arch = "wasm32", feature = "webgl", feature = "wgpu"))]
 pub(crate) fn maybe_warn_about_webgl_feature_conflict() {
     use core::sync::atomic::{AtomicBool, Ordering};

diff --git a/sparse_strips/vello_hybrid/src/render/wgpu.rs b/sparse_strips/vello_hybrid/src/render/wgpu.rs
index 6968e22..ed21bb7 100644
--- a/sparse_strips/vello_hybrid/src/render/wgpu.rs
+++ b/sparse_strips/vello_hybrid/src/render/wgpu.rs

@@ -36,9 +36,12 @@
 use crate::{
     GpuStrip, RenderError, RenderSize,
     image_cache::{ImageCache, ImageResource},
-    render::{Config, common::GpuEncodedImage},
+    render::{
+        Config,
+        common::{BlendConfig, CopyConfig, GpuBlendCommand, GpuCopyCommand, GpuEncodedImage},
+    },
     scene::Scene,
-    schedule::{LoadOp, RendererBackend, Scheduler},
+    schedule::{BlendCommand, LoadOp, Location, RendererBackend, Scheduler},
 };
 
 /// Options for the renderer
@@ -76,6 +79,14 @@
         }
     }
 
+    pub fn get_slots_texture_views(&self) -> &[TextureView; 2] {
+        &self.programs.resources.slot_texture_views
+    }
+
+    pub fn get_blend_texture_view(&self) -> &TextureView {
+        &self.programs.resources.blend_texture_view
+    }
+
     /// Render `scene` into the provided command encoder.
     ///
     /// This method creates GPU resources as needed and schedules potentially multiple
@@ -257,6 +268,16 @@
     /// Pipeline for clearing slots in slot textures.
     clear_pipeline: RenderPipeline,
 
+    /// Pipeline for blending wide tiles.
+    blend_pipeline: RenderPipeline,
+    /// Bind group layout for blend operations
+    blend_bind_group_layout: BindGroupLayout,
+
+    /// Pipeline for copying slots between textures.
+    copy_pipeline: RenderPipeline,
+    /// Bind group layout for copy operations
+    copy_bind_group_layout: BindGroupLayout,
+
     /// GPU resources for rendering (created during prepare)
     resources: GpuResources,
     /// Dimensions of the rendering target
@@ -298,6 +319,26 @@
 
     /// Bind group for clear slots operation
     clear_bind_group: BindGroup,
+
+    /// Blend texture for wide tile blending operations
+    blend_texture: Texture,
+    /// Blend texture view
+    blend_texture_view: TextureView,
+    /// Config buffer for blend operations
+    blend_config_buffer: Buffer,
+
+    /// Config buffer for copy operations to slots
+    copy_slot_config_buffer: Buffer,
+    /// Config buffer for copy operations to final target
+    copy_target_config_buffer: Buffer,
+    /// Bind group for copy operations to slots
+    copy_slot_bind_group: BindGroup,
+    /// Bind group for copy operations to final target
+    copy_target_bind_group: BindGroup,
+
+    /// Buffers for blend and copy commands
+    blend_commands_buffer: Buffer,
+    copy_commands_buffer: Buffer,
 }
 
 const SIZE_OF_CONFIG: NonZeroU64 = NonZeroU64::new(size_of::<Config>() as u64).unwrap();
@@ -329,6 +370,27 @@
     }
 }
 
+impl GpuBlendCommand {
+    /// Vertex attributes for the blend command
+    pub fn vertex_attributes() -> [wgpu::VertexAttribute; 3] {
+        wgpu::vertex_attr_array![
+            0 => Uint32,  // xy_src
+            1 => Uint32,  // xy_dst
+            2 => Uint32,  // payload
+        ]
+    }
+}
+
+impl GpuCopyCommand {
+    /// Vertex attributes for the copy command
+    pub fn vertex_attributes() -> [wgpu::VertexAttribute; 2] {
+        wgpu::vertex_attr_array![
+            0 => Uint32,  // xy_target
+            1 => Uint32,  // slot_ix
+        ]
+    }
+}
+
 impl Programs {
     fn new(device: &Device, render_target_config: &RenderTargetConfig, slot_count: usize) -> Self {
         let strip_bind_group_layout =
@@ -414,6 +476,72 @@
                 }],
             });
 
+        // Create bind group layout for blend operations
+        let blend_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("Blend Bind Group Layout"),
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::VERTEX,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 1,
+                        visibility: wgpu::ShaderStages::FRAGMENT,
+                        ty: wgpu::BindingType::Texture {
+                            sample_type: wgpu::TextureSampleType::Float { filterable: false },
+                            view_dimension: wgpu::TextureViewDimension::D2,
+                            multisampled: false,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 2,
+                        visibility: wgpu::ShaderStages::FRAGMENT,
+                        ty: wgpu::BindingType::Texture {
+                            sample_type: wgpu::TextureSampleType::Float { filterable: false },
+                            view_dimension: wgpu::TextureViewDimension::D2,
+                            multisampled: false,
+                        },
+                        count: None,
+                    },
+                ],
+            });
+
+        // Create bind group layout for copy operations
+        let copy_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("Copy Bind Group Layout"),
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 1,
+                        visibility: wgpu::ShaderStages::FRAGMENT,
+                        ty: wgpu::BindingType::Texture {
+                            sample_type: wgpu::TextureSampleType::Float { filterable: false },
+                            view_dimension: wgpu::TextureViewDimension::D2,
+                            multisampled: false,
+                        },
+                        count: None,
+                    },
+                ],
+            });
+
         let strip_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
             label: Some("Strip Shader"),
             source: wgpu::ShaderSource::Wgsl(vello_sparse_shaders::wgsl::RENDER_STRIPS.into()),
@@ -424,6 +552,16 @@
             source: wgpu::ShaderSource::Wgsl(vello_sparse_shaders::wgsl::CLEAR_SLOTS.into()),
         });
 
+        let blend_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("Blend Wide Tile Shader"),
+            source: wgpu::ShaderSource::Wgsl(vello_sparse_shaders::wgsl::BLEND_WIDE_TILE.into()),
+        });
+
+        let copy_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("Copy Slot Shader"),
+            source: wgpu::ShaderSource::Wgsl(vello_sparse_shaders::wgsl::COPY_SLOT.into()),
+        });
+
         let strip_pipeline_layout =
             device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
                 label: Some("Strip Pipeline Layout"),
@@ -442,6 +580,19 @@
                 push_constant_ranges: &[],
             });
 
+        let blend_pipeline_layout =
+            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: Some("Blend Pipeline Layout"),
+                bind_group_layouts: &[&blend_bind_group_layout],
+                push_constant_ranges: &[],
+            });
+
+        let copy_pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: Some("Copy Pipeline Layout"),
+            bind_group_layouts: &[&copy_bind_group_layout],
+            push_constant_ranges: &[],
+        });
+
         let strip_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
             label: Some("Strip Pipeline"),
             layout: Some(&strip_pipeline_layout),
@@ -513,6 +664,72 @@
             cache: None,
         });
 
+        let blend_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+            label: Some("Blend Wide Tile Pipeline"),
+            layout: Some(&blend_pipeline_layout),
+            vertex: wgpu::VertexState {
+                module: &blend_shader,
+                entry_point: Some("vs_main"),
+                buffers: &[wgpu::VertexBufferLayout {
+                    array_stride: size_of::<GpuBlendCommand>() as u64,
+                    step_mode: wgpu::VertexStepMode::Instance,
+                    attributes: &GpuBlendCommand::vertex_attributes(),
+                }],
+                compilation_options: PipelineCompilationOptions::default(),
+            },
+            fragment: Some(wgpu::FragmentState {
+                module: &blend_shader,
+                entry_point: Some("fs_main"),
+                targets: &[Some(ColorTargetState {
+                    format: render_target_config.format,
+                    blend: Some(BlendState::REPLACE),
+                    write_mask: ColorWrites::ALL,
+                })],
+                compilation_options: PipelineCompilationOptions::default(),
+            }),
+            primitive: wgpu::PrimitiveState {
+                topology: wgpu::PrimitiveTopology::TriangleStrip,
+                ..Default::default()
+            },
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            multiview: None,
+            cache: None,
+        });
+
+        let copy_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+            label: Some("Copy Slot Pipeline"),
+            layout: Some(&copy_pipeline_layout),
+            vertex: wgpu::VertexState {
+                module: &copy_shader,
+                entry_point: Some("vs_main"),
+                buffers: &[wgpu::VertexBufferLayout {
+                    array_stride: size_of::<GpuCopyCommand>() as u64,
+                    step_mode: wgpu::VertexStepMode::Instance,
+                    attributes: &GpuCopyCommand::vertex_attributes(),
+                }],
+                compilation_options: PipelineCompilationOptions::default(),
+            },
+            fragment: Some(wgpu::FragmentState {
+                module: &copy_shader,
+                entry_point: Some("fs_main"),
+                targets: &[Some(ColorTargetState {
+                    format: render_target_config.format,
+                    blend: Some(BlendState::REPLACE),
+                    write_mask: ColorWrites::ALL,
+                })],
+                compilation_options: PipelineCompilationOptions::default(),
+            }),
+            primitive: wgpu::PrimitiveState {
+                topology: wgpu::PrimitiveTopology::TriangleStrip,
+                ..Default::default()
+            },
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            multiview: None,
+            cache: None,
+        });
+
         let slot_texture_views: [TextureView; 2] = core::array::from_fn(|_| {
             device
                 .create_texture(&wgpu::TextureDescriptor {
@@ -619,6 +836,103 @@
             &slot_texture_views,
         );
 
+        // Create blend texture
+        let blend_texture = device.create_texture(&wgpu::TextureDescriptor {
+            label: Some("Blend Texture"),
+            size: wgpu::Extent3d {
+                width: u32::from(WideTile::WIDTH),
+                height: u32::from(Tile::HEIGHT) * slot_count as u32,
+                depth_or_array_layers: 1,
+            },
+            mip_level_count: 1,
+            sample_count: 1,
+            dimension: wgpu::TextureDimension::D2,
+            format: render_target_config.format,
+            usage: wgpu::TextureUsages::TEXTURE_BINDING
+                | wgpu::TextureUsages::RENDER_ATTACHMENT
+                | wgpu::TextureUsages::COPY_SRC,
+            view_formats: &[],
+        });
+        let blend_texture_view = blend_texture.create_view(&wgpu::TextureViewDescriptor::default());
+
+        // Create blend config buffer
+        let blend_config_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: Some("Blend Config"),
+            contents: bytemuck::bytes_of(&BlendConfig {
+                wide_tile_width: u32::from(WideTile::WIDTH),
+                wide_tile_height: u32::from(Tile::HEIGHT),
+                slot_texture_height: u32::from(Tile::HEIGHT) * slot_count as u32,
+                final_target_height: render_target_config.height,
+                blend_texture_height: u32::from(Tile::HEIGHT) * slot_count as u32,
+                _padding: [0; 3],
+            }),
+            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+        });
+
+        // Create copy config buffers for slots and final target
+        let copy_slot_config_buffer =
+            device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+                label: Some("Copy Slot Config"),
+                contents: bytemuck::bytes_of(&CopyConfig {
+                    wide_tile_width: u32::from(WideTile::WIDTH),
+                    wide_tile_height: u32::from(Tile::HEIGHT),
+                    slot_texture_height: u32::from(Tile::HEIGHT) * slot_count as u32,
+                    target_texture_width: u32::from(WideTile::WIDTH),
+                    target_texture_height: u32::from(Tile::HEIGHT) * slot_count as u32,
+                    _padding: [0; 3],
+                }),
+                usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+            });
+
+        let copy_target_config_buffer =
+            device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+                label: Some("Copy Target Config"),
+                contents: bytemuck::bytes_of(&CopyConfig {
+                    wide_tile_width: u32::from(WideTile::WIDTH),
+                    wide_tile_height: u32::from(Tile::HEIGHT),
+                    slot_texture_height: u32::from(Tile::HEIGHT) * slot_count as u32,
+                    target_texture_width: render_target_config.width,
+                    target_texture_height: render_target_config.height,
+                    _padding: [0; 3],
+                }),
+                usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+            });
+
+        // Create copy bind groups
+        let copy_slot_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Copy Slot Bind Group"),
+            layout: &copy_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: copy_slot_config_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 1,
+                    resource: wgpu::BindingResource::TextureView(&blend_texture_view),
+                },
+            ],
+        });
+
+        let copy_target_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Copy Target Bind Group"),
+            layout: &copy_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: copy_target_config_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 1,
+                    resource: wgpu::BindingResource::TextureView(&blend_texture_view),
+                },
+            ],
+        });
+
+        // Create the (initially empty) vertex buffers for blend and copy commands
+        let blend_commands_buffer = Self::make_commands_buffer(device, 0);
+        let copy_commands_buffer = Self::make_commands_buffer(device, 0);
+
         let resources = GpuResources {
             strips_buffer: Self::make_strips_buffer(device, 0),
             clear_slot_indices_buffer,
@@ -632,12 +946,26 @@
             encoded_paints_texture,
             encoded_paints_bind_group,
             view_config_buffer,
+            blend_texture,
+            blend_texture_view,
+            blend_config_buffer,
+            copy_slot_config_buffer,
+            copy_target_config_buffer,
+            copy_slot_bind_group,
+            copy_target_bind_group,
+            blend_commands_buffer,
+            copy_commands_buffer,
         };
 
         Self {
             strip_pipeline,
             strip_bind_group_layout,
             encoded_paints_bind_group_layout,
+            clear_pipeline,
+            blend_pipeline,
+            blend_bind_group_layout,
+            copy_pipeline,
+            copy_bind_group_layout,
             resources,
             alpha_data,
             encoded_paints_data,
@@ -645,8 +973,6 @@
                 width: render_target_config.width,
                 height: render_target_config.height,
             },
-
-            clear_pipeline,
         }
     }
 
@@ -668,6 +994,15 @@
         })
     }
 
+    fn make_commands_buffer(device: &Device, required_size: u64) -> Buffer {
+        device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Commands Buffer"),
+            size: required_size,
+            usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        })
+    }
+
     fn make_config_buffer(
         device: &Device,
         render_size: &RenderSize,
@@ -1129,6 +1464,73 @@
             render_pass.draw(0..4, 0..u32::try_from(slot_indices.len()).unwrap());
         }
     }
+
+    fn upload_blend_commands(&mut self, commands: &[crate::render::common::GpuBlendCommand]) {
+        let required_size = mem::size_of_val(commands) as u64;
+        self.programs.resources.blend_commands_buffer =
+            Programs::make_commands_buffer(self.device, required_size);
+
+        let mut buffer = self
+            .queue
+            .write_buffer_with(
+                &self.programs.resources.blend_commands_buffer,
+                0,
+                required_size.try_into().unwrap(),
+            )
+            .expect("Capacity handled in creation");
+        buffer.copy_from_slice(bytemuck::cast_slice(commands));
+    }
+
+    fn upload_copy_commands(&mut self, commands: &[crate::render::common::GpuCopyCommand]) {
+        let required_size = mem::size_of_val(commands) as u64;
+        self.programs.resources.copy_commands_buffer =
+            Programs::make_commands_buffer(self.device, required_size);
+
+        let mut buffer = self
+            .queue
+            .write_buffer_with(
+                &self.programs.resources.copy_commands_buffer,
+                0,
+                required_size.try_into().unwrap(),
+            )
+            .expect("Capacity handled in creation");
+        buffer.copy_from_slice(bytemuck::cast_slice(commands));
+    }
+
+    fn do_copy_render_pass(
+        &mut self,
+        commands: &[crate::render::common::GpuCopyCommand],
+        target_index: usize,
+    ) {
+        let (bind_group, target_view) = if target_index == 2 {
+            (&self.programs.resources.copy_target_bind_group, self.view)
+        } else {
+            (
+                &self.programs.resources.copy_slot_bind_group,
+                &self.programs.resources.slot_texture_views[target_index],
+            )
+        };
+
+        let mut render_pass = self.encoder.begin_render_pass(&RenderPassDescriptor {
+            label: Some("Copy Pass"),
+            color_attachments: &[Some(RenderPassColorAttachment {
+                view: target_view,
+                resolve_target: None,
+                ops: wgpu::Operations {
+                    load: wgpu::LoadOp::Load,
+                    store: wgpu::StoreOp::Store,
+                },
+            })],
+            depth_stencil_attachment: None,
+            occlusion_query_set: None,
+            timestamp_writes: None,
+        });
+
+        render_pass.set_pipeline(&self.programs.copy_pipeline);
+        render_pass.set_bind_group(0, bind_group, &[]);
+        render_pass.set_vertex_buffer(0, self.programs.resources.copy_commands_buffer.slice(..));
+        render_pass.draw(0..4, 0..commands.len() as u32);
+    }
 }
 
 impl RendererBackend for RendererContext<'_> {
@@ -1151,6 +1553,169 @@
 
         self.do_strip_render_pass(strips, target_index, wgpu_load_op);
     }
+
+    /// Executes blend commands one at a time.
+    ///
+    /// Every command is handled with two render passes:
+    /// 1. a blend pass that draws the blended wide tile into the command's
+    ///    slot of the blend texture, and
+    /// 2. a copy pass that copies that slot to its destination: the final
+    ///    target for [`Location::XY`], or slot texture `dst_texture` for
+    ///    [`Location::Slot`].
+    fn blend_pass(&mut self, commands: &[BlendCommand]) {
+        use crate::render::common::{GpuBlendCommand, GpuCopyCommand};
+
+        // Copy-pass target index that selects the final render target.
+        const FINAL_TARGET: usize = 2;
+
+        if commands.is_empty() {
+            return;
+        }
+
+        // The bind group only references the config buffer and the two slot
+        // texture views, none of which depend on the command being processed,
+        // so it is created once instead of once per command.
+        let blend_bind_group = self.device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Dynamic Blend Bind Group"),
+            layout: &self.programs.blend_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: self
+                        .programs
+                        .resources
+                        .blend_config_buffer
+                        .as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 1,
+                    resource: wgpu::BindingResource::TextureView(
+                        &self.programs.resources.slot_texture_views[0],
+                    ),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 2,
+                    resource: wgpu::BindingResource::TextureView(
+                        &self.programs.resources.slot_texture_views[1],
+                    ),
+                },
+            ],
+        });
+
+        for cmd in commands {
+            // Coordinates are packed as `y << 16 | x`. Slots start at x = 0,
+            // so only the y half is populated for them.
+            let src_xy = (u32::from(cmd.src_slot) * Tile::HEIGHT as u32) << 16;
+
+            let (dst_xy, copy_target) = match &cmd.dst_location {
+                // Destination is the final target.
+                Location::XY(x, y) => ((u32::from(*y) << 16) | u32::from(*x), FINAL_TARGET),
+                // Destination is a slot in one of the slot textures.
+                Location::Slot(slot) => (
+                    (u32::from(*slot) * Tile::HEIGHT as u32) << 16,
+                    usize::from(cmd.dst_texture),
+                ),
+            };
+
+            // The shader reads the blend slot from an 8-bit field. Wrap the
+            // index once and use the wrapped value for BOTH the blend and the
+            // copy, so the two passes always agree on the slot. This is sound
+            // because each command's copy is recorded right after its blend.
+            let blend_slot_ix = (cmd.blend_slot & 0xFF) as u32;
+
+            // Payload layout: opacity (bits 0-7), compose (8-11), mix (12-15),
+            // source texture (16), destination texture (17-18), blend slot
+            // (19-26). Every field is masked to its width so an out-of-range
+            // value cannot corrupt its neighbours; in particular `Mix::Clip`
+            // encodes as 16 and would otherwise set the source-texture bit.
+            let opacity = u32::from(cmd.opacity);
+            let compose = (encode_compose_mode(cmd.mode.compose) & 0xF) << 8;
+            let mix = (encode_mix_mode(cmd.mode.mix) & 0xF) << 12;
+            let src_texture = (u32::from(cmd.src_texture) & 0x1) << 16;
+            let dst_texture = (u32::from(cmd.dst_texture) & 0x3) << 17;
+            let blend_slot = blend_slot_ix << 19;
+
+            let blend_command = GpuBlendCommand {
+                xy_src: src_xy,
+                xy_dst: dst_xy,
+                payload: opacity | compose | mix | src_texture | dst_texture | blend_slot,
+            };
+            self.upload_blend_commands(&[blend_command]);
+
+            // Blend pass: a single instance for this single command. The pass
+            // is scoped so it ends before the copy pass is recorded.
+            {
+                let mut render_pass = self.encoder.begin_render_pass(&RenderPassDescriptor {
+                    label: Some("Blend Pass"),
+                    color_attachments: &[Some(RenderPassColorAttachment {
+                        view: &self.programs.resources.blend_texture_view,
+                        resolve_target: None,
+                        ops: wgpu::Operations {
+                            load: wgpu::LoadOp::Load,
+                            store: wgpu::StoreOp::Store,
+                        },
+                    })],
+                    depth_stencil_attachment: None,
+                    occlusion_query_set: None,
+                    timestamp_writes: None,
+                });
+
+                render_pass.set_pipeline(&self.programs.blend_pipeline);
+                render_pass.set_bind_group(0, &blend_bind_group, &[]);
+                render_pass
+                    .set_vertex_buffer(0, self.programs.resources.blend_commands_buffer.slice(..));
+                render_pass.draw(0..4, 0..1);
+            }
+
+            // Copy pass: move the blended slot to its destination.
+            let copy_command = GpuCopyCommand {
+                xy_target: dst_xy,
+                slot_ix: blend_slot_ix,
+            };
+            self.upload_copy_commands(&[copy_command]);
+            self.do_copy_render_pass(&[copy_command], copy_target);
+        }
+    }
+}
+
+/// Maps a `Compose` operator to the integer code consumed by the blend shader.
+///
+/// The codes mirror the `COMPOSE_*` constants in `blend_wide_tile.wgsl`.
+fn encode_compose_mode(compose: vello_common::peniko::Compose) -> u32 {
+    use vello_common::peniko::Compose as C;
+    match compose {
+        C::Clear => 0,
+        C::Copy => 1,
+        C::Dest => 2,
+        C::SrcOver => 3,
+        C::DestOver => 4,
+        C::SrcIn => 5,
+        C::DestIn => 6,
+        C::SrcOut => 7,
+        C::DestOut => 8,
+        C::SrcAtop => 9,
+        C::DestAtop => 10,
+        C::Xor => 11,
+        C::Plus => 12,
+        C::PlusLighter => 13,
+    }
+}
+
+/// Maps a `Mix` mode to the integer code consumed by the blend shader.
+///
+/// The codes mirror the `MIX_*` constants in `blend_wide_tile.wgsl`. Note that
+/// `Clip` maps to 16, which needs five bits: a caller packing the code into a
+/// 4-bit field has to account for that.
+fn encode_mix_mode(mix: vello_common::peniko::Mix) -> u32 {
+    use vello_common::peniko::Mix as M;
+    match mix {
+        M::Normal => 0,
+        M::Multiply => 1,
+        M::Screen => 2,
+        M::Overlay => 3,
+        M::Darken => 4,
+        M::Lighten => 5,
+        M::ColorDodge => 6,
+        M::ColorBurn => 7,
+        M::HardLight => 8,
+        M::SoftLight => 9,
+        M::Difference => 10,
+        M::Exclusion => 11,
+        M::Hue => 12,
+        M::Saturation => 13,
+        M::Color => 14,
+        M::Luminosity => 15,
+        M::Clip => 16,
+    }
 }
 
 /// Trait for types that can write image data directly to the atlas texture.

diff --git a/sparse_strips/vello_hybrid/src/scene.rs b/sparse_strips/vello_hybrid/src/scene.rs
index 0be42a0..70849a9 100644
--- a/sparse_strips/vello_hybrid/src/scene.rs
+++ b/sparse_strips/vello_hybrid/src/scene.rs

@@ -201,17 +201,13 @@
             None
         };
 
-        // Blend mode, opacity, and mask are not supported yet.
-        if blend_mode.is_some() {
-            unimplemented!()
-        }
         if mask.is_some() {
             unimplemented!()
         }
 
         self.wide.push_layer(
             clip,
-            BlendMode::new(Mix::Normal, Compose::SrcOver),
+            blend_mode.unwrap_or(BlendMode::new(Mix::Normal, Compose::SrcOver)),
             None,
             opacity.unwrap_or(1.),
             0,

diff --git a/sparse_strips/vello_hybrid/src/schedule.rs b/sparse_strips/vello_hybrid/src/schedule.rs
index 3d5665b..61d361c 100644
--- a/sparse_strips/vello_hybrid/src/schedule.rs
+++ b/sparse_strips/vello_hybrid/src/schedule.rs

@@ -196,6 +196,9 @@
 
     /// Execute a render pass for strips.
     fn render_strips(&mut self, strips: &[GpuStrip], target_index: usize, load_op: LoadOp);
+
+    /// Execute a blend pass for the given blend commands.
+    ///
+    /// The scheduler calls this once per flushed round, after that round's
+    /// strips have been rendered, passing the round's blend commands.
+    fn blend_pass(&mut self, commands: &[BlendCommand]);
 }
 
 /// Backend agnostic enum that specifies the operation to perform to the output attachment at the
@@ -208,6 +211,30 @@
 }
 
 #[derive(Debug)]
+/// A backend-agnostic description of blending one wide tile from a slot onto
+/// a destination, via an intermediate slot in the blend texture.
+pub(crate) struct BlendCommand {
+    // Source location
+    /// Slot texture holding the source: 0 or 1.
+    pub src_texture: u8, // 0 or 1 for slot textures
+    /// Slot index of the source within `src_texture`.
+    pub src_slot: u16,   // Which slot in that texture
+
+    // Destination location (where result ultimately goes)
+    /// Texture receiving the result: 0 or 1 for a slot texture, 2 for the
+    /// final target.
+    pub dst_texture: u8,        // 0, 1 for slots, 2 for final target
+    /// Position of the destination: a slot index for slot textures, or pixel
+    /// coordinates for the final target.
+    pub dst_location: Location, // Either a slot index or (x,y) for final target
+
+    // Blend parameters
+    /// Mix and compose operators to blend with.
+    pub mode: BlendMode,
+    /// Opacity quantized to `0..=255`.
+    pub opacity: u8,
+
+    // Where to render in the blend texture
+    /// Slot of the blend texture that receives the intermediate result.
+    pub blend_slot: usize, // Index into blend texture
+}
+
+/// Where the result of a [`BlendCommand`] is written.
+#[derive(Debug)]
+pub enum Location {
+    /// A slot index within a slot texture.
+    Slot(u16),    // For slot textures
+    /// An `(x, y)` position on the final target.
+    XY(u16, u16), // For final target
+}
+
+#[derive(Debug)]
 pub(crate) struct Scheduler {
     /// Index of the current round
     round: usize,
@@ -221,6 +248,8 @@
     rounds_queue: VecDeque<Round>,
     /// State for a single wide tile.
     tile_state: TileState,
+    /// Blend slot on a third buffer texture.
+    next_blend_slot: usize,
 }
 
 /// A "round" is a coarse scheduling quantum.
@@ -233,6 +262,7 @@
     draws: [Draw; 3],
     /// Slots that will be freed after drawing into the two slot textures [0, 1].
     free: [Vec<usize>; 2],
+    blend_commands: Vec<BlendCommand>,
 }
 
 /// State for a single wide tile.
@@ -264,6 +294,7 @@
             clear,
             rounds_queue: Default::default(),
             tile_state: Default::default(),
+            next_blend_slot: 0,
         }
     }
 
@@ -318,7 +349,8 @@
     ///
     /// The rounds queue must not be empty.
     fn flush<R: RendererBackend>(&mut self, renderer: &mut R) {
-        let round = self.rounds_queue.pop_front().unwrap();
+        println!("FLUSH");
+        let round = dbg!(self.rounds_queue.pop_front().unwrap());
         for (i, draw) in round.draws.iter().enumerate() {
             if draw.0.is_empty() {
                 continue;
@@ -342,9 +374,13 @@
             };
             renderer.render_strips(&draw.0, i, load);
         }
+        renderer.blend_pass(&round.blend_commands);
         for i in 0..2 {
             self.free[i].extend(&round.free[i]);
         }
+        if self.rounds_queue.is_empty() {
+            self.next_blend_slot = 0;
+        }
         self.round += 1;
     }
 
@@ -397,7 +433,7 @@
                 });
             }
         }
-        for cmd in &tile.cmds {
+        for cmd in dbg!(&tile.cmds) {
             // Note: this starts at 1 (for the final target)
             let clip_depth = state.stack.len();
             match cmd {
@@ -539,44 +575,39 @@
                     state.stack.last_mut().unwrap().opacity = *opacity;
                 }
                 Cmd::Blend(mode) => {
-                    // This blend mode is implicitly supported. Currently no other blend mode is
-                    // supported in `vello_hybrid`.
-                    assert!(
-                        matches!(
-                            mode,
-                            BlendMode {
-                                mix: Mix::Normal,
-                                compose: Compose::SrcOver
-                            }
-                        ),
-                        "Changing blend mode is unsupported"
-                    );
-
                     let tos = state.stack.last().unwrap();
                     let nos = &state.stack[state.stack.len() - 2];
 
-                    let next_round = clip_depth % 2 == 0 && clip_depth > 2;
-                    let round = nos.round.max(tos.round + usize::from(next_round));
-                    let draw = self.draw_mut(round, clip_depth - 1);
-                    let (x, y) = if clip_depth <= 2 {
-                        (wide_tile_x, wide_tile_y)
+                    let opacity_u8 = (tos.opacity * 255.0) as u8;
+                    let blend_slot = self.next_blend_slot;
+                    self.next_blend_slot += 1;
+
+                    let blend_command = if clip_depth <= 2 {
+                        // Blending to final target
+                        BlendCommand {
+                            src_texture: (1 - clip_depth % 2) as u8,
+                            src_slot: tos.slot_ix as u16,
+                            dst_texture: 2,
+                            dst_location: Location::XY(wide_tile_x, wide_tile_y),
+                            mode: *mode,
+                            opacity: opacity_u8,
+                            blend_slot,
+                        }
                     } else {
-                        (0, nos.slot_ix as u16 * Tile::HEIGHT)
+                        // Blending between slots
+                        let dst_texture = (clip_depth % 2) as u8;
+                        BlendCommand {
+                            src_texture: (1 - clip_depth % 2) as u8,
+                            src_slot: tos.slot_ix as u16,
+                            dst_texture,
+                            dst_location: Location::Slot(nos.slot_ix as u16),
+                            mode: *mode,
+                            opacity: opacity_u8,
+                            blend_slot,
+                        }
                     };
 
-                    // Opacity packed into the first 8 bits.
-                    let opacity_u8 = (tos.opacity * 255.0) as u32;
-                    let paint = (COLOR_SOURCE_SLOT << 31) | opacity_u8;
-
-                    draw.0.push(GpuStrip {
-                        x,
-                        y,
-                        width: WideTile::WIDTH,
-                        dense_width: 0,
-                        col_idx: 0,
-                        payload: tos.slot_ix as u32,
-                        paint,
-                    });
+                    self.rounds_queue[0].blend_commands.push(blend_command);
                 }
                 _ => unimplemented!(),
             }

diff --git a/sparse_strips/vello_sparse_shaders/shaders/blend_wide_tile.wgsl b/sparse_strips/vello_sparse_shaders/shaders/blend_wide_tile.wgsl
new file mode 100644
index 0000000..b347f12
--- /dev/null
+++ b/sparse_strips/vello_sparse_shaders/shaders/blend_wide_tile.wgsl

@@ -0,0 +1,466 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// Uniform configuration for the blend pass.
+struct Config {
+    // Width of a wide tile (matching `WideTile::WIDTH`).
+    wide_tile_width: u32,
+    // Height of a wide tile (matching `WideTile::HEIGHT`).
+    wide_tile_height: u32,
+    // Height of the slot texture.
+    // NOTE(review): not read by either entry point of this shader at present.
+    slot_texture_height: u32,
+    // Height of the final target texture.
+    final_target_height: u32,
+    // Height of the blend texture.
+    blend_texture_height: u32,
+}
+
+// Vertex-buffer input describing the blend of one wide tile.
+// NOTE(review): presumably stepped per instance (the quad corners come from
+// `vertex_index`) — confirm against the pipeline's vertex buffer layout.
+struct BlendCommand {
+    // [x, y] packed as u16's
+    // x, y — coordinates of the top left of the source wide tile
+    // (x in the low 16 bits, y in the high 16 bits)
+    @location(0) xy_src: u32,
+    // [x, y] packed as u16's
+    // x, y — coordinates of the top left of the destination wide tile
+    // (x in the low 16 bits, y in the high 16 bits)
+    @location(1) xy_dst: u32,
+    // Bits 0-7: opacity
+    // Bits 8-11: compose
+    // Bits 12-15: mix
+    // Bits 16: source texture (TODO: Consider passing slot_ix alone)
+    //       0 = slots of ix=0
+    //       1 = slots of ix=1
+    // Bits 17-18: dest texture
+    //       0 = slots of ix=0
+    //       1 = slots of ix=1
+    //       2 = final target
+    // Bits 19-26: blend slot index
+    @location(2) payload: u32,
+}
+
+// Data passed from the vertex stage to the fragment stage.
+struct VertexOutput {
+    // Normalized device coordinates (NDC) for the current vertex
+    @builtin(position) position: vec4<f32>,
+    // Texel coordinates into the source texture for the current fragment
+    @location(0) src_tex_coord: vec2<f32>,
+    // Texel coordinates into the destination texture for the current fragment
+    @location(1) dst_tex_coord: vec2<f32>,
+    // See `BlendCommand` documentation.
+    // Integer inter-stage values cannot be interpolated, so WGSL requires them
+    // to be declared flat.
+    @location(2) @interpolate(flat) payload: u32,
+}
+
+// Uniform configuration, see `Config`.
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+// Slot texture 0; read as foreground and/or background depending on the payload.
+@group(0) @binding(1)
+var slot_texture_0: texture_2d<f32>;
+
+// Slot texture 1; read as foreground and/or background depending on the payload.
+@group(0) @binding(2)
+var slot_texture_1: texture_2d<f32>;
+
+// NOTE(review): declared but not referenced by `vs_main` or `fs_main`; the
+// final-target background is replaced by a marker color in `fs_main` instead.
+@group(0) @binding(3)
+var final_target: texture_2d<f32>;
+
+// Vertex stage: emits one corner of the quad covering the command's slot in
+// the blend texture, together with the matching texel coordinates in the
+// source and destination textures.
+@vertex
+fn vs_main(
+    @builtin(vertex_index) in_vertex_index: u32,
+    command: BlendCommand,
+) -> VertexOutput {
+    var out: VertexOutput;
+    out.payload = command.payload;
+
+    // Map vertex_index (0-3) to quad corners:
+    // 0 → (0,0), 1 → (1,0), 2 → (0,1), 3 → (1,1)
+    let x = f32(in_vertex_index & 1u);
+    let y = f32(in_vertex_index >> 1u);
+
+    // Offset of the current corner from the top left of a wide tile. The
+    // source, destination and blend-slot regions are all one wide tile in size.
+    let tile_dx = x * f32(config.wide_tile_width);
+    let tile_dy = y * f32(config.wide_tile_height);
+
+    // Calculate `position` for output.
+    {
+        // Extract bits 19-26 for blend slot index
+        let blend_slot_ix = (command.payload >> 19u) & 0xffu;
+
+        // Calculate the y-position based on the slot index
+        let slot_y_offset = f32(blend_slot_ix * config.wide_tile_height);
+
+        let pix_x = tile_dx;
+        let pix_y = slot_y_offset + tile_dy;
+
+        // Convert to NDC. The quad spans the full NDC width, i.e. the blend
+        // texture is treated as exactly one wide tile wide.
+        let ndc_x = pix_x * 2.0 / f32(config.wide_tile_width) - 1.0;
+        let ndc_y = 1.0 - pix_y * 2.0 / f32(config.blend_texture_height);
+
+        out.position = vec4(ndc_x, ndc_y, 0.0, 1.0);
+    }
+
+    // Calculate `src_tex_coord` for the source texture.
+    {
+        let src_x0 = f32(command.xy_src & 0xffffu);
+        let src_y0 = f32(command.xy_src >> 16u);
+
+        out.src_tex_coord = vec2f(src_x0 + tile_dx, src_y0 + tile_dy);
+    }
+
+    // Calculate `dst_tex_coord` for the destination texture.
+    // The destination region is one wide tile regardless of which texture it
+    // lives in, so it uses the same extent as the source; stretching it to the
+    // height of the final target would break the per-pixel correspondence.
+    {
+        let dst_x0 = f32(command.xy_dst & 0xffffu);
+        let dst_y0 = f32(command.xy_dst >> 16u);
+
+        out.dst_tex_coord = vec2f(dst_x0 + tile_dx, dst_y0 + tile_dy);
+    }
+
+    return out;
+}
+
+// Fragment stage: loads the foreground and background texels, applies the
+// layer opacity to the foreground and blends the two with the mix/compose
+// mode stored in the payload.
+@fragment
+fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
+    var bg_color: vec4<f32>;
+    var fg_color: vec4<f32>;
+
+    // Calculate `fg_color` of the foreground texture.
+    {
+        let src_texture_ix = (in.payload >> 16u) & 1u;
+        let src = vec2u(u32(floor(in.src_tex_coord.x)), u32(floor(in.src_tex_coord.y)));
+
+        if src_texture_ix == 0u {
+            fg_color = textureLoad(slot_texture_0, src, 0);
+        } else {
+            fg_color = textureLoad(slot_texture_1, src, 0);
+        }
+    }
+
+    // Calculate `bg_color` of the background texture.
+    {
+        let dst_texture_ix = (in.payload >> 17u) & 3u;
+        let dst = vec2u(u32(floor(in.dst_tex_coord.x)), u32(floor(in.dst_tex_coord.y)));
+
+        if dst_texture_ix == 0u {
+            bg_color = textureLoad(slot_texture_0, dst, 0);
+        } else if dst_texture_ix == 1u {
+            bg_color = textureLoad(slot_texture_1, dst, 0);
+        } else {
+            // Encode a special marker in the red channel to indicate "untouched" background
+            // Use a small non-zero value that won't affect visual output but can be detected.
+            // This is so janky it isn't funny. This "special value" is used in the copy shader.
+            bg_color = vec4(1.0 / 255.0, 0.0, 0.0, 0.0);
+        }
+    }
+
+    // The layer opacity belongs to the source only: scale the (premultiplied)
+    // foreground before blending. Scaling the blended result instead would
+    // also fade the backdrop that is already part of that result.
+    let opacity = f32(in.payload & 0xFFu) / 255.0;
+    return blend_mix_compose(bg_color, fg_color * opacity, in.payload >> 8u);
+}
+
+// Color mixing modes
+//
+// `blend_mix_compose` extracts the mix mode from a 4-bit field, so only the
+// values 0-15 can reach `blend_mix`; `MIX_CLIP` (16) does not fit that field.
+
+const MIX_NORMAL = 0u;
+const MIX_MULTIPLY = 1u;
+const MIX_SCREEN = 2u;
+const MIX_OVERLAY = 3u;
+const MIX_DARKEN = 4u;
+const MIX_LIGHTEN = 5u;
+const MIX_COLOR_DODGE = 6u;
+const MIX_COLOR_BURN = 7u;
+const MIX_HARD_LIGHT = 8u;
+const MIX_SOFT_LIGHT = 9u;
+const MIX_DIFFERENCE = 10u;
+const MIX_EXCLUSION = 11u;
+const MIX_HUE = 12u;
+const MIX_SATURATION = 13u;
+const MIX_COLOR = 14u;
+const MIX_LUMINOSITY = 15u;
+const MIX_CLIP = 16u;
+
+// Screen blend of backdrop `cb` and source `cs`, per channel.
+fn screen(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
+    let product = cb * cs;
+    return cb + cs - product;
+}
+
+// Color-dodge blend of a single channel; the two guards handle the cases
+// where the division below would be 0/x or x/0.
+fn color_dodge(cb: f32, cs: f32) -> f32 {
+    if cb == 0.0 {
+        return 0.0;
+    }
+    if cs == 1.0 {
+        return 1.0;
+    }
+    return min(1.0, cb / (1.0 - cs));
+}
+
+// Color-burn blend of a single channel; the two guards handle the cases
+// where the division below would be 0/x or x/0.
+fn color_burn(cb: f32, cs: f32) -> f32 {
+    if cb == 1.0 {
+        return 1.0;
+    }
+    if cs == 0.0 {
+        return 0.0;
+    }
+    return 1.0 - min(1.0, (1.0 - cb) / cs);
+}
+
+// Hard-light blend: multiply where the source channel is at most 0.5,
+// screen otherwise.
+fn hard_light(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
+    let multiplied = cb * 2.0 * cs;
+    let screened = screen(cb, 2.0 * cs - 1.0);
+    return select(screened, multiplied, cs <= vec3(0.5));
+}
+
+// Soft-light blend: darkens where the source channel is at most 0.5 and
+// lightens otherwise, using a backdrop-dependent helper term `d`.
+fn soft_light(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
+    let d_low = ((16.0 * cb - 12.0) * cb + 4.0) * cb;
+    let d_high = sqrt(cb);
+    let d = select(d_high, d_low, cb <= vec3(0.25));
+
+    let darkened = cb - (1.0 - 2.0 * cs) * cb * (1.0 - cb);
+    let lightened = cb + (2.0 * cs - 1.0) * (d - cb);
+    return select(lightened, darkened, cs <= vec3(0.5));
+}
+
+// Saturation of a color: the spread between its largest and smallest channel.
+fn sat(c: vec3<f32>) -> f32 {
+    let hi = max(c.x, max(c.y, c.z));
+    let lo = min(c.x, min(c.y, c.z));
+    return hi - lo;
+}
+
+// Luminosity of a color as a weighted sum of its channels.
+fn lum(c: vec3<f32>) -> f32 {
+    return dot(c, vec3(0.3, 0.59, 0.11));
+}
+
+// Pulls a color whose channels left [0, 1] back towards its own luminosity
+// `l`: channels below 0 are handled first, then channels above 1.
+fn clip_color(c_in: vec3<f32>) -> vec3<f32> {
+    var c = c_in;
+    let l = lum(c);
+    // Smallest and largest channel of the input color.
+    let n = min(c.x, min(c.y, c.z));
+    let x = max(c.x, max(c.y, c.z));
+    if n < 0.0 {
+        c = l + (((c - l) * l) / (l - n));
+    }
+    if x > 1.0 {
+        c = l + (((c - l) * (1.0 - l)) / (x - l));
+    }
+    return c;
+}
+
+// Shifts every channel of `c` so its luminosity becomes `l`, then clips the
+// result with `clip_color`.
+fn set_lum(c: vec3<f32>, l: f32) -> vec3<f32> {
+    let delta = l - lum(c);
+    return clip_color(c + delta);
+}
+
+// Rescales three channels, given in ascending order (`cmin` <= `cmid` <=
+// `cmax`), so that the smallest becomes 0 and the largest becomes `s`. When
+// all three are equal, all are set to 0.
+fn set_sat_inner(
+    cmin: ptr<function, f32>,
+    cmid: ptr<function, f32>,
+    cmax: ptr<function, f32>,
+    s: f32
+) {
+    if *cmax > *cmin {
+        // `cmid` must be updated before `cmax`, as it reads the old `cmax`.
+        *cmid = ((*cmid - *cmin) * s) / (*cmax - *cmin);
+        *cmax = s;
+    } else {
+        *cmid = 0.0;
+        *cmax = 0.0;
+    }
+    *cmin = 0.0;
+}
+
+// Returns `c` with its saturation set to `s`. The nested comparisons sort the
+// three channels so `set_sat_inner` receives them as (min, mid, max).
+fn set_sat(c: vec3<f32>, s: f32) -> vec3<f32> {
+    var r = c.r;
+    var g = c.g;
+    var b = c.b;
+    if r <= g {
+        if g <= b {
+            set_sat_inner(&r, &g, &b, s);
+        } else {
+            if r <= b {
+                set_sat_inner(&r, &b, &g, s);
+            } else {
+                set_sat_inner(&b, &r, &g, s);
+            }
+        }
+    } else {
+        if r <= b {
+            set_sat_inner(&g, &r, &b, s);
+        } else {
+            if g <= b {
+                set_sat_inner(&g, &b, &r, s);
+            } else {
+                set_sat_inner(&b, &g, &r, s);
+            }
+        }
+    }
+    return vec3(r, g, b);
+}
+
+// Blends two RGB colors together. The colors are assumed to be in sRGB
+// color space, and this function does not take alpha into account.
+//
+// `cb` is the backdrop color, `cs` the source color and `mode` one of the
+// `MIX_*` constants. Modes without an explicit case (including `MIX_NORMAL`)
+// return the source color unchanged.
+fn blend_mix(cb: vec3<f32>, cs: vec3<f32>, mode: u32) -> vec3<f32> {
+    var b = vec3(0.0);
+    switch mode {
+        case MIX_MULTIPLY: {
+            b = cb * cs;
+        }
+        case MIX_SCREEN: {
+            b = screen(cb, cs);
+        }
+        case MIX_OVERLAY: {
+            // Overlay is hard-light with backdrop and source swapped.
+            b = hard_light(cs, cb);
+        }
+        case MIX_DARKEN: {
+            b = min(cb, cs);
+        }
+        case MIX_LIGHTEN: {
+            b = max(cb, cs);
+        }
+        case MIX_COLOR_DODGE: {
+            b = vec3(color_dodge(cb.x, cs.x), color_dodge(cb.y, cs.y), color_dodge(cb.z, cs.z));
+        }
+        case MIX_COLOR_BURN: {
+            b = vec3(color_burn(cb.x, cs.x), color_burn(cb.y, cs.y), color_burn(cb.z, cs.z));
+        }
+        case MIX_HARD_LIGHT: {
+            b = hard_light(cb, cs);
+        }
+        case MIX_SOFT_LIGHT: {
+            b = soft_light(cb, cs);
+        }
+        case MIX_DIFFERENCE: {
+            b = abs(cb - cs);
+        }
+        case MIX_EXCLUSION: {
+            b = cb + cs - 2.0 * cb * cs;
+        }
+        case MIX_HUE: {
+            b = set_lum(set_sat(cs, sat(cb)), lum(cb));
+        }
+        case MIX_SATURATION: {
+            b = set_lum(set_sat(cb, sat(cs)), lum(cb));
+        }
+        case MIX_COLOR: {
+            b = set_lum(cs, lum(cb));
+        }
+        case MIX_LUMINOSITY: {
+            b = set_lum(cb, lum(cs));
+        }
+        default: {
+            b = cs;
+        }
+    }
+    return b;
+}
+
+// Composition modes
+//
+// These values match the codes produced by `encode_compose_mode` on the Rust
+// side and fit the 4-bit compose field of the command payload.
+
+const COMPOSE_CLEAR = 0u;
+const COMPOSE_COPY = 1u;
+const COMPOSE_DEST = 2u;
+const COMPOSE_SRC_OVER = 3u;
+const COMPOSE_DEST_OVER = 4u;
+const COMPOSE_SRC_IN = 5u;
+const COMPOSE_DEST_IN = 6u;
+const COMPOSE_SRC_OUT = 7u;
+const COMPOSE_DEST_OUT = 8u;
+const COMPOSE_SRC_ATOP = 9u;
+const COMPOSE_DEST_ATOP = 10u;
+const COMPOSE_XOR = 11u;
+const COMPOSE_PLUS = 12u;
+const COMPOSE_PLUS_LIGHTER = 13u;
+
+// Apply general compositing operation.
+// Inputs are separated colors and alpha, output is premultiplied.
+//
+// `cb`/`ab` are the backdrop color and alpha, `cs`/`as_` the source color and
+// alpha, and `mode` one of the `COMPOSE_*` constants. Each case selects the
+// weights `fa` (applied to the source) and `fb` (applied to the backdrop);
+// modes without a case, including `COMPOSE_CLEAR`, keep both at 0.
+fn blend_compose(
+    cb: vec3<f32>,
+    cs: vec3<f32>,
+    ab: f32,
+    as_: f32,
+    mode: u32
+) -> vec4<f32> {
+    var fa = 0.0;
+    var fb = 0.0;
+    switch mode {
+        case COMPOSE_COPY: {
+            fa = 1.0;
+            fb = 0.0;
+        }
+        case COMPOSE_DEST: {
+            fa = 0.0;
+            fb = 1.0;
+        }
+        case COMPOSE_SRC_OVER: {
+            fa = 1.0;
+            fb = 1.0 - as_;
+        }
+        case COMPOSE_DEST_OVER: {
+            fa = 1.0 - ab;
+            fb = 1.0;
+        }
+        case COMPOSE_SRC_IN: {
+            fa = ab;
+            fb = 0.0;
+        }
+        case COMPOSE_DEST_IN: {
+            fa = 0.0;
+            fb = as_;
+        }
+        case COMPOSE_SRC_OUT: {
+            fa = 1.0 - ab;
+            fb = 0.0;
+        }
+        case COMPOSE_DEST_OUT: {
+            fa = 0.0;
+            fb = 1.0 - as_;
+        }
+        case COMPOSE_SRC_ATOP: {
+            fa = ab;
+            fb = 1.0 - as_;
+        }
+        case COMPOSE_DEST_ATOP: {
+            fa = 1.0 - ab;
+            fb = as_;
+        }
+        case COMPOSE_XOR: {
+            fa = 1.0 - ab;
+            fb = 1.0 - as_;
+        }
+        case COMPOSE_PLUS: {
+            fa = 1.0;
+            fb = 1.0;
+        }
+        case COMPOSE_PLUS_LIGHTER: {
+            // Handled separately: the sum is clamped per channel.
+            return min(vec4(1.0), vec4(as_ * cs + ab * cb, as_ + ab));
+        }
+        default: {}
+    }
+    let as_fa = as_ * fa;
+    let ab_fb = ab * fb;
+    let co = as_fa * cs + ab_fb * cb;
+    // Modes like COMPOSE_PLUS can generate alpha > 1.0, so clamp.
+    return vec4(co, min(as_fa + ab_fb, 1.0));
+}
+
+// Apply color mixing and composition. Both input and output colors are
+// premultiplied RGB.
+//
+// `mode` carries the compose mode in its low 4 bits and the mix mode in the
+// next 4 bits; higher bits are ignored.
+fn blend_mix_compose(backdrop: vec4<f32>, src: vec4<f32>, mode: u32) -> vec4<f32> {
+    let BLEND_DEFAULT = ((MIX_NORMAL << 4u) | COMPOSE_SRC_OVER);
+    let EPSILON = 1e-15;
+    if (mode & 0xffu) == BLEND_DEFAULT {
+        // Both normal+src_over blend and clip case
+        return backdrop * (1.0 - src.a) + src;
+    }
+    // Un-premultiply colors for blending. Max with a small epsilon to avoid NaNs.
+    let inv_src_a = 1.0 / max(src.a, EPSILON);
+    var cs = src.rgb * inv_src_a;
+    let inv_backdrop_a = 1.0 / max(backdrop.a, EPSILON);
+    let cb = backdrop.rgb * inv_backdrop_a;
+    let mix_mode = (mode >> 4u) & 0xfu;
+    let mixed = blend_mix(cb, cs, mix_mode);
+    // The mixed color only applies in proportion to the backdrop's alpha.
+    cs = mix(cs, mixed, backdrop.a);
+    let compose_mode = mode & 0xfu;
+    if compose_mode == COMPOSE_SRC_OVER {
+        // Shortcut for the common source-over case.
+        let co = mix(backdrop.rgb, cs, src.a);
+        return vec4(co, src.a + backdrop.a * (1.0 - src.a));
+    } else {
+        return blend_compose(cb, cs, backdrop.a, src.a, compose_mode);
+    }
+}

diff --git a/sparse_strips/vello_sparse_shaders/shaders/copy_slot.wgsl b/sparse_strips/vello_sparse_shaders/shaders/copy_slot.wgsl
new file mode 100644
index 0000000..1d88e3a
--- /dev/null
+++ b/sparse_strips/vello_sparse_shaders/shaders/copy_slot.wgsl

@@ -0,0 +1,82 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// This shader copies a wide tile slot from a slot texture to a target location.
+
+// Uniform configuration for the copy pass.
+struct Config {
+    // Width of a wide tile (matching `WideTile::WIDTH`).
+    wide_tile_width: u32,
+    // Height of a wide tile (matching `WideTile::HEIGHT`).
+    wide_tile_height: u32,
+    // Height of the slot texture (source).
+    // NOTE(review): not read by either entry point of this shader at present.
+    slot_texture_height: u32,
+    // Width of the target texture (destination).
+    target_texture_width: u32,
+    // Height of the target texture (destination).
+    target_texture_height: u32,
+}
+
+// Vertex-buffer input describing the copy of one wide tile slot.
+struct CopyCommand {
+    // [x, y] packed as u16's
+    // x, y — coordinates of the top left of the target wide tile
+    // (x in the low 16 bits, y in the high 16 bits)
+    @location(0) xy_target: u32,
+    // Slot index to identify the pixel position to sample from
+    @location(1) slot_ix: u32,
+}
+
+// Data passed from the vertex stage to the fragment stage.
+struct VertexOutput {
+    // Normalized device coordinates (NDC) for the current vertex
+    @builtin(position) position: vec4<f32>,
+    // Slot index passed to the fragment shader (flat: identical for every
+    // fragment of the quad)
+    @location(0) @interpolate(flat) slot_ix: u32,
+}
+
+// Uniform configuration, see `Config`.
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+// Texture whose slots are copied from; addressed by `slot_ix` in `fs_main`.
+@group(0) @binding(1)
+var slot_texture: texture_2d<f32>;
+
+// Vertex stage: emits one corner of a wide-tile-sized quad placed at the
+// command's target position, and forwards the slot index.
+@vertex
+fn vs_main(
+    @builtin(vertex_index) vertex_index: u32,
+    command: CopyCommand,
+) -> VertexOutput {
+    // Quad corner selected by the vertex index:
+    // 0 → (0,0), 1 → (1,0), 2 → (0,1), 3 → (1,1)
+    let corner_x = f32(vertex_index & 1u);
+    let corner_y = f32(vertex_index >> 1u);
+
+    // Top left of the destination wide tile, unpacked from `xy_target`.
+    let origin_x = f32(command.xy_target & 0xffffu);
+    let origin_y = f32(command.xy_target >> 16u);
+
+    // Pixel position of this corner in the destination texture.
+    let pix_x = origin_x + corner_x * f32(config.wide_tile_width);
+    let pix_y = origin_y + corner_y * f32(config.wide_tile_height);
+
+    // Pixel position → NDC of the destination texture (y pointing up).
+    let ndc_x = pix_x * 2.0 / f32(config.target_texture_width) - 1.0;
+    let ndc_y = 1.0 - pix_y * 2.0 / f32(config.target_texture_height);
+
+    var out: VertexOutput;
+    out.slot_ix = command.slot_ix;
+    out.position = vec4<f32>(ndc_x, ndc_y, 0.0, 1.0);
+    return out;
+}
+
+// Fragment stage: copies the texel of the slot that corresponds to this
+// fragment's position inside its wide tile, skipping texels that carry the
+// "untouched background" marker written by the blend shader.
+@fragment
+fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
+    // Position of the fragment inside its wide tile. The tile size comes from
+    // `config` rather than hard-coded masks so the shader follows the
+    // configured dimensions. As before, this relies on the target tile being
+    // aligned to a multiple of the wide tile size.
+    let slot_x = u32(in.position.x) % config.wide_tile_width;
+    let slot_y = (u32(in.position.y) % config.wide_tile_height)
+        + in.slot_ix * config.wide_tile_height;
+
+    let color = textureLoad(slot_texture, vec2u(slot_x, slot_y), 0);
+
+    // Fully transparent with a tiny non-zero red channel: the marker for a
+    // background the blend pass never touched. Leave the target as it is.
+    if color.a == 0.0 && color.r > 0.0 && color.r <= (1.0 / 255.0) {
+        discard;
+    }
+    return color;
+}

diff --git a/sparse_strips/vello_sparse_tests/tests/renderer.rs b/sparse_strips/vello_sparse_tests/tests/renderer.rs
index c71cd49..fac25d7 100644
--- a/sparse_strips/vello_sparse_tests/tests/renderer.rs
+++ b/sparse_strips/vello_sparse_tests/tests/renderer.rs

@@ -275,8 +275,8 @@
         self.scene.push_clip_layer(path);
     }
 
-    fn push_blend_layer(&mut self, _: BlendMode) {
-        unimplemented!()
+    /// Pushes a layer that only sets a blend mode: no clip, opacity or mask
+    /// is passed to the scene.
+    fn push_blend_layer(&mut self, blend_mode: BlendMode) {
+        self.scene.push_layer(None, Some(blend_mode), None, None);
+    }
 
     fn push_opacity_layer(&mut self, opacity: f32) {