Start of spatio-temporal allocation for clipping

This commit has a sketch of spatio-temporal allocation for clipping, but it is not fully wired up yet. Scenes without clipping should work, but there is a fair amount of TODO remaining for clipping.

There's a fair amount of refactoring here. The biggest change is that draw calls and render passes can be issued from inside the scheduler, as opposed to separate "prepare" and "render" calls. The number of render passes needed will vary by the scene.
diff --git a/sparse_strips/vello_hybrid/examples/render_to_file.rs b/sparse_strips/vello_hybrid/examples/render_to_file.rs
index 041598d..9c510c3 100644
--- a/sparse_strips/vello_hybrid/examples/render_to_file.rs
+++ b/sparse_strips/vello_hybrid/examples/render_to_file.rs
@@ -91,7 +91,8 @@
         width: width.into(),
         height: height.into(),
     };
-    renderer.prepare(&device, &queue, &scene, &render_size);
+    let render_data = scene.prepare_render_data();
+    renderer.prepare(&device, &queue, &render_data, &render_size);
     // Copy texture to buffer
     let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
         label: Some("Vello Render To Buffer"),
@@ -111,7 +112,7 @@
             occlusion_query_set: None,
             timestamp_writes: None,
         });
-        renderer.render(&scene, &mut pass);
+        renderer.render(&render_data, &mut pass);
     }
 
     // Create a buffer to copy the texture data
diff --git a/sparse_strips/vello_hybrid/examples/winit/src/main.rs b/sparse_strips/vello_hybrid/examples/winit/src/main.rs
index 3687e8d..1db65a6 100644
--- a/sparse_strips/vello_hybrid/examples/winit/src/main.rs
+++ b/sparse_strips/vello_hybrid/examples/winit/src/main.rs
@@ -13,7 +13,6 @@
 use vello_common::kurbo::{Affine, Vec2};
 use vello_hybrid::{RenderSize, Renderer, Scene};
 use vello_hybrid_scenes::{AnyScene, get_example_scenes};
-use wgpu::RenderPassDescriptor;
 use winit::{
     application::ApplicationHandler,
     event::{ElementState, KeyEvent, MouseButton, MouseScrollDelta, WindowEvent},
@@ -271,12 +270,6 @@
                     width: surface.config.width,
                     height: surface.config.height,
                 };
-                self.renderers[surface.dev_id].as_mut().unwrap().prepare(
-                    &device_handle.device,
-                    &device_handle.queue,
-                    &self.scene,
-                    &render_size,
-                );
 
                 let surface_texture = surface
                     .surface
@@ -294,24 +287,14 @@
                             label: Some("Vello Render to Surface pass"),
                         });
                 {
-                    let mut pass = encoder.begin_render_pass(&RenderPassDescriptor {
-                        label: Some("Render to Texture Pass"),
-                        color_attachments: &[Some(wgpu::RenderPassColorAttachment {
-                            view: &texture_view,
-                            resolve_target: None,
-                            ops: wgpu::Operations {
-                                load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
-                                store: wgpu::StoreOp::Store,
-                            },
-                        })],
-                        depth_stencil_attachment: None,
-                        occlusion_query_set: None,
-                        timestamp_writes: None,
-                    });
-                    self.renderers[surface.dev_id]
-                        .as_mut()
-                        .unwrap()
-                        .render(&self.scene, &mut pass);
+                    self.renderers[surface.dev_id].as_mut().unwrap().render2(
+                        &self.scene,
+                        &device_handle.device,
+                        &device_handle.queue,
+                        &mut encoder,
+                        &render_size,
+                        &texture_view,
+                    );
                 }
 
                 device_handle.queue.submit([encoder.finish()]);
diff --git a/sparse_strips/vello_hybrid/shaders/sparse_strip_clip.wgsl b/sparse_strips/vello_hybrid/shaders/sparse_strip_clip.wgsl
new file mode 100644
index 0000000..56fe098
--- /dev/null
+++ b/sparse_strips/vello_hybrid/shaders/sparse_strip_clip.wgsl
@@ -0,0 +1,150 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// This shader is a copy of sparse_strip_renderer but organized for clipping.
+
+struct Config {
+    // Width of the rendering target
+    width: u32,
+    // Height of the rendering target
+    height: u32,
+    // Height of a strip in the rendering
+    strip_height: u32,
+    // Number of trailing zeros in alphas_tex_width (log2 of width).
+    // Pre-calculated on CPU since WebGL2 doesn't support `firstTrailingBit`.
+    alphas_tex_width_bits: u32,
+}
+
+struct StripInstance {
+    // [x, y] packed as u16's
+    @location(0) xy: u32,
+    // [width, dense_width] packed as u16's
+    @location(1) widths: u32,
+    // Alpha texture column index where this strip's alpha values begin
+    @location(2) col: u32,
+    // [r, g, b, a] packed as u8's
+    @location(3) rgba: u32,
+}
+
+struct VertexOutput {
+    // Texture coordinates for the current fragment
+    @location(0) tex_coord: vec2<f32>,
+    // Ending x-position of the dense (alpha) region
+    @location(1) @interpolate(flat) dense_end: u32,
+    // RGBA color value
+    @location(2) @interpolate(flat) color: u32,
+    // Normalized device coordinates (NDC) for the current vertex
+    @builtin(position) position: vec4<f32>,
+};
+
+// TODO: Measure performance of moving to a separate group
+@group(0) @binding(1)
+var<uniform> config: Config;
+
+@vertex
+fn vs_main(
+    @builtin(vertex_index) in_vertex_index: u32,
+    instance: StripInstance,
+) -> VertexOutput {
+    var out: VertexOutput;
+    // Map vertex_index (0-3) to quad corners:
+    // 0 → (0,0), 1 → (1,0), 2 → (0,1), 3 → (1,1)
+    let x = f32(in_vertex_index & 1u);
+    let y = f32(in_vertex_index >> 1u);
+    // Unpack the x and y coordinates from the packed u32 instance.xy
+    let x0 = instance.xy & 0xffffu;
+    let y0 = instance.xy >> 16u;
+    // Unpack the total width and dense (alpha) width from the packed u32 instance.widths
+    let width = instance.widths & 0xffffu;
+    let dense_width = instance.widths >> 16u;
+    // Calculate the ending x-position of the dense (alpha) region
+    // This boundary is used in the fragment shader to determine if alpha sampling is needed
+    out.dense_end = instance.col + dense_width;
+    // Calculate the pixel coordinates of the current vertex within the strip
+    let pix_x = f32(x0) + f32(width) * x;
+    let pix_y = f32(y0) + y * f32(config.strip_height);
+    // Convert pixel coordinates to normalized device coordinates (NDC)
+    // NDC ranges from -1 to 1, with (0,0) at the center of the viewport
+    let ndc_x = pix_x * 2.0 / f32(config.width) - 1.0;
+    let ndc_y = 1.0 - pix_y * 2.0 / f32(config.height);
+
+    out.position = vec4<f32>(ndc_x, ndc_y, 0.0, 1.0);
+    out.tex_coord = vec2<f32>(f32(instance.col) + x * f32(width), y * f32(config.strip_height));
+    out.color = instance.rgba;
+    return out;
+}
+
+@group(0) @binding(0)
+var alphas_texture: texture_2d<u32>;
+
+@group(0) @binding(2)
+var clip_input_texture: texture_2d<f32>;
+
+@fragment
+fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
+    let x = u32(floor(in.tex_coord.x));
+    var alpha = 1.0;
+    // Determine if the current fragment is within the dense (alpha) region
+    // If so, sample the alpha value from the texture; otherwise, alpha remains fully opaque (1.0)
+    if x < in.dense_end {
+        let y = u32(floor(in.tex_coord.y));
+        // Retrieve alpha value from the texture. We store 16 1-byte alpha
+        // values per texel, with each color channel packing 4 alpha values.
+        // The code here assumes the strip height is 4, i.e., each color
+        // channel encodes the alpha values for a single column within a strip.
+        // Divide x by 4 to get the texel position.
+        let alphas_index = x;
+        let tex_dimensions = textureDimensions(alphas_texture);
+        let alphas_tex_width = tex_dimensions.x;
+        // Which texel contains the alpha values for this column
+        let texel_index = alphas_index / 4u;
+        // Which channel (R,G,B,A) in the texel contains the alpha values for this column
+        let channel_index = alphas_index % 4u;
+        // Calculate texel coordinates
+        let tex_x = texel_index & (alphas_tex_width - 1u);
+        let tex_y = texel_index >> config.alphas_tex_width_bits;
+
+        // Load all 4 channels from the texture
+        let rgba_values = textureLoad(alphas_texture, vec2<u32>(tex_x, tex_y), 0);
+
+        // Get the column's alphas from the appropriate RGBA channel based on the index
+        let alphas_u32 = unpack_alphas_from_channel(rgba_values, channel_index);
+        // Extract the alpha value for the current y-position from the packed u32 data
+        alpha = f32((alphas_u32 >> (y * 8u)) & 0xffu) * (1.0 / 255.0);
+    }
+    // Apply the alpha value to the unpacked RGBA color
+    let alpha_byte = in.color >> 24u;
+    if alpha_byte != 0 {
+        return alpha * unpack4x8unorm(in.color);
+    } else {
+        // in.color encodes a slot in the source texture
+        let clip_x = u32(in.position.x) & 0xFFu;
+        let clip_y = (u32(in.position.y) & 3) + in.color * 4;
+        let clip_in_color = textureLoad(clip_input_texture, vec2(clip_x, clip_y), 0);
+        return alpha * clip_in_color;
+    }
+}
+
+fn unpack_alphas_from_channel(rgba: vec4<u32>, channel_index: u32) -> u32 {
+    switch channel_index {
+        case 0u: { return rgba.x; }
+        case 1u: { return rgba.y; }
+        case 2u: { return rgba.z; }
+        case 3u: { return rgba.w; }
+        // Fallback, should never happen
+        default: { return rgba.x; }
+    }
+}
+
+// Polyfills `unpack4x8unorm`.
+//
+// Downlevel targets do not support native WGSL `unpack4x8unorm`.
+fn unpack4x8unorm(rgba_packed: u32) -> vec4<f32> {
+    // Extract each byte and convert to float in range [0,1]
+    return vec4<f32>(
+        f32((rgba_packed >> 0u) & 0xFFu) / 255.0,  // r
+        f32((rgba_packed >> 8u) & 0xFFu) / 255.0,  // g
+        f32((rgba_packed >> 16u) & 0xFFu) / 255.0, // b
+        f32((rgba_packed >> 24u) & 0xFFu) / 255.0  // a
+    );
+}
diff --git a/sparse_strips/vello_hybrid/src/lib.rs b/sparse_strips/vello_hybrid/src/lib.rs
index 8d53450..e5192c7 100644
--- a/sparse_strips/vello_hybrid/src/lib.rs
+++ b/sparse_strips/vello_hybrid/src/lib.rs
@@ -31,6 +31,7 @@
 
 mod render;
 mod scene;
+mod schedule;
 pub mod util;
 
 pub use render::{Config, GpuStrip, RenderData, RenderSize, RenderTargetConfig, Renderer};
diff --git a/sparse_strips/vello_hybrid/src/render.rs b/sparse_strips/vello_hybrid/src/render.rs
index 0fa721f..fdae0d6 100644
--- a/sparse_strips/vello_hybrid/src/render.rs
+++ b/sparse_strips/vello_hybrid/src/render.rs
@@ -17,11 +17,12 @@
 use bytemuck::{Pod, Zeroable};
 use vello_common::tile::Tile;
 use wgpu::{
-    BindGroup, BindGroupLayout, BlendState, Buffer, ColorTargetState, ColorWrites, Device,
-    PipelineCompilationOptions, Queue, RenderPass, RenderPipeline, Texture, util::DeviceExt,
+    BindGroup, BindGroupLayout, BlendState, Buffer, ColorTargetState, ColorWrites, CommandEncoder,
+    Device, PipelineCompilationOptions, Queue, RenderPass, RenderPassColorAttachment,
+    RenderPassDescriptor, RenderPipeline, Texture, TextureView, util::DeviceExt,
 };
 
-use crate::scene::Scene;
+use crate::{scene::Scene, schedule::Schedule};
 
 /// Dimensions of the rendering target
 #[derive(Debug, PartialEq, Eq, Clone)]
@@ -44,6 +45,10 @@
 }
 
 /// Contains all GPU resources needed for rendering
+///
+/// This struct contains the GPU resources that may be reallocated depending
+/// on the scene. Resources that are created once at startup are simply in
+/// `Renderer`.
 #[derive(Debug)]
 struct GpuResources {
     /// Buffer for strip data
@@ -54,15 +59,26 @@
     pub render_bind_group: BindGroup,
     /// Buffer for config data
     pub config_buffer: Buffer,
+    // Bind groups for rendering with clip buffers
+    //pub clip_bind_groups: [BindGroup; 3],
 }
 
 /// GPU renderer for the hybrid rendering system
+///
+/// This struct contains GPU resources that are created once at startup and
+/// are never reallocated or rebuilt.
 #[derive(Debug)]
 pub struct Renderer {
     /// Bind group layout for rendering
     pub render_bind_group_layout: BindGroupLayout,
     /// Pipeline for rendering
     pub render_pipeline: RenderPipeline,
+    /// Bind group layout for clip draws
+    pub clip_bind_group_layout: BindGroupLayout,
+    /// Pipeline for rendering clip draws
+    pub clip_pipeline: RenderPipeline,
+    /// Clip temporary textures
+    pub clip_textures: [Texture; 2],
     /// GPU resources for rendering (created during prepare)
     resources: Option<GpuResources>,
 
@@ -117,6 +133,16 @@
     pub rgba: u32,
 }
 
+/// A struct containing references to the many objects needed to get work
+/// scheduled onto the GPU.
+pub(crate) struct RendererJunk<'a> {
+    renderer: &'a mut Renderer,
+    device: &'a Device,
+    queue: &'a Queue,
+    encoder: &'a mut CommandEncoder,
+    view: &'a TextureView,
+}
+
 impl GpuStrip {
     /// Vertex attributes for the strip
     pub fn vertex_attributes() -> [wgpu::VertexAttribute; 4] {
@@ -166,6 +192,42 @@
                     },
                 ],
             });
+        let clip_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: None,
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::FRAGMENT,
+                        ty: wgpu::BindingType::Texture {
+                            sample_type: wgpu::TextureSampleType::Uint,
+                            view_dimension: wgpu::TextureViewDimension::D2,
+                            multisampled: false,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 1,
+                        visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 2,
+                        visibility: wgpu::ShaderStages::FRAGMENT,
+                        ty: wgpu::BindingType::Texture {
+                            sample_type: wgpu::TextureSampleType::Float { filterable: false },
+                            view_dimension: wgpu::TextureViewDimension::D2,
+                            multisampled: false,
+                        },
+                        count: None,
+                    },
+                ],
+            });
         let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
             label: None,
             bind_group_layouts: &[&render_bind_group_layout],
@@ -203,194 +265,263 @@
             multiview: None,
             cache: None,
         });
+        let clip_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: None,
+            source: wgpu::ShaderSource::Wgsl(
+                include_str!("../shaders/sparse_strip_clip.wgsl").into(),
+            ),
+        });
+        let clip_pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: None,
+            bind_group_layouts: &[&clip_bind_group_layout],
+            push_constant_ranges: &[],
+        });
+        let clip_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+            label: None,
+            layout: Some(&clip_pipeline_layout),
+            vertex: wgpu::VertexState {
+                module: &clip_shader,
+                entry_point: Some("vs_main"),
+                buffers: &[wgpu::VertexBufferLayout {
+                    array_stride: size_of::<GpuStrip>() as u64,
+                    step_mode: wgpu::VertexStepMode::Instance,
+                    attributes: &GpuStrip::vertex_attributes(),
+                }],
+                compilation_options: PipelineCompilationOptions::default(),
+            },
+            fragment: Some(wgpu::FragmentState {
+                module: &clip_shader,
+                entry_point: Some("fs_main"),
+                targets: &[Some(ColorTargetState {
+                    format: render_target_config.format,
+                    blend: Some(BlendState::PREMULTIPLIED_ALPHA_BLENDING),
+                    write_mask: ColorWrites::ALL,
+                })],
+                compilation_options: PipelineCompilationOptions::default(),
+            }),
+            primitive: wgpu::PrimitiveState {
+                topology: wgpu::PrimitiveTopology::TriangleStrip,
+                ..Default::default()
+            },
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            multiview: None,
+            cache: None,
+        });
+        let clip_textures = std::array::from_fn(|_| {
+            device.create_texture(&wgpu::TextureDescriptor {
+                label: Some("clip temp texture"),
+                size: wgpu::Extent3d {
+                    width: 256, // TODO: make configurable
+                    height: 1024,
+                    depth_or_array_layers: 1,
+                },
+                mip_level_count: 1,
+                sample_count: 1,
+                dimension: wgpu::TextureDimension::D2,
+                format: wgpu::TextureFormat::Rgba8Unorm,
+                usage: wgpu::TextureUsages::TEXTURE_BINDING
+                    | wgpu::TextureUsages::RENDER_ATTACHMENT,
+                view_formats: &[],
+            })
+        });
 
         Self {
             render_bind_group_layout,
             render_pipeline,
+            clip_bind_group_layout,
+            clip_pipeline,
             resources: None,
             alpha_data: Vec::new(),
             render_size: RenderSize {
                 width: render_target_config.width,
                 height: render_target_config.height,
             },
+            clip_textures,
         }
     }
 
-    /// Prepare the GPU buffers for rendering
-    pub fn prepare(
+    fn make_strips_buffer(&self, device: &Device, required_strips_size: u64) -> Buffer {
+        device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Strips Buffer"),
+            size: required_strips_size,
+            usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        })
+    }
+
+    fn make_config_buffer(
+        &self,
+        device: &Device,
+        render_size: &RenderSize,
+        max_texture_dimension_2d: u32,
+    ) -> Buffer {
+        device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: Some("Config Buffer"),
+            contents: bytemuck::bytes_of(&Config {
+                width: render_size.width,
+                height: render_size.height,
+                strip_height: Tile::HEIGHT.into(),
+                alphas_tex_width_bits: max_texture_dimension_2d.trailing_zeros(),
+            }),
+            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+        })
+    }
+
+    fn make_alphas_texture(
+        &self,
+        device: &Device,
+        max_texture_dimension_2d: u32,
+        alpha_texture_height: u32,
+    ) -> Texture {
+        device.create_texture(&wgpu::TextureDescriptor {
+            label: Some("Alpha Texture"),
+            size: wgpu::Extent3d {
+                width: max_texture_dimension_2d,
+                height: alpha_texture_height,
+                depth_or_array_layers: 1,
+            },
+            mip_level_count: 1,
+            sample_count: 1,
+            dimension: wgpu::TextureDimension::D2,
+            format: wgpu::TextureFormat::Rgba32Uint,
+            usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
+            view_formats: &[],
+        })
+    }
+
+    fn make_render_bind_group(
+        &self,
+        device: &Device,
+        alphas_texture: &Texture,
+        config_buffer: &Buffer,
+    ) -> BindGroup {
+        let alphas_texture_view =
+            alphas_texture.create_view(&wgpu::TextureViewDescriptor::default());
+        device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Render Bind Group"),
+            layout: &self.render_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: wgpu::BindingResource::TextureView(&alphas_texture_view),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 1,
+                    resource: config_buffer.as_entire_binding(),
+                },
+            ],
+        })
+    }
+
+    /// Prepare the GPU buffers for rendering, given alphas
+    ///
+    /// Does not guarantee that the strip buffer is big enough
+    fn prepare_alphas(
         &mut self,
         device: &Device,
         queue: &Queue,
-        scene: &Scene,
+        alphas: &[u8],
         new_render_size: &RenderSize,
+        est_strip_count: usize,
     ) {
-        let render_data = scene.prepare_render_data();
-        let required_strips_size = size_of::<GpuStrip>() as u64 * render_data.strips.len() as u64;
+        let required_strips_size = size_of::<GpuStrip>() as u64 * est_strip_count as u64;
         let max_texture_dimension_2d = device.limits().max_texture_dimension_2d;
+        if self.resources.is_none() {
+            let strips_buffer = self.make_strips_buffer(device, required_strips_size);
+            let alpha_len = alphas.len();
+            // There are 16 1-byte alpha values per texel.
+            let alpha_texture_height =
+                (u32::try_from(alpha_len).unwrap()).div_ceil(max_texture_dimension_2d * 16);
 
-        let (needs_new_strips_buffer, needs_new_alpha_texture, needs_new_config) =
-            match &self.resources {
-                Some(resources) => {
-                    let strips_too_small = required_strips_size > resources.strips_buffer.size();
+            assert!(
+                alpha_texture_height <= max_texture_dimension_2d,
+                "Alpha texture height exceeds max texture dimensions"
+            );
 
-                    let alpha_len = render_data.alphas.len();
-                    // There are 16 1-byte alpha values per texel.
-                    let required_alpha_height =
-                        (u32::try_from(alpha_len).unwrap()).div_ceil(max_texture_dimension_2d * 16);
-                    let required_alpha_size = max_texture_dimension_2d * required_alpha_height * 16;
+            // Resize the alpha texture staging buffer.
+            self.alpha_data.resize(
+                (max_texture_dimension_2d * alpha_texture_height * 16) as usize,
+                0,
+            );
+            // The alpha texture encodes 16 1-byte alpha values per texel, with 4 alpha values packed in each channel
+            let alphas_texture =
+                self.make_alphas_texture(device, max_texture_dimension_2d, alpha_texture_height);
+            let config_buffer =
+                self.make_config_buffer(device, new_render_size, max_texture_dimension_2d);
 
-                    let current_alpha_size =
-                        resources.alphas_texture.width() * resources.alphas_texture.height() * 16;
-                    let alpha_too_small = required_alpha_size > current_alpha_size;
-
-                    let dimensions_changed = self.render_size != *new_render_size;
-
-                    (strips_too_small, alpha_too_small, dimensions_changed)
-                }
-                // self.resources is None if prepare has not been called yet
-                None => (true, true, true),
-            };
-
-        if needs_new_strips_buffer || needs_new_alpha_texture {
-            // Create strips buffer if it doesn't exist, or reuse existing strips buffer
-            let strips_buffer = if needs_new_strips_buffer {
-                device.create_buffer(&wgpu::BufferDescriptor {
-                    label: Some("Strips Buffer"),
-                    size: required_strips_size,
-                    usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST,
-                    mapped_at_creation: false,
-                })
-            } else {
-                self.resources
-                    .as_ref()
-                    .expect("Strips buffer not initialized")
-                    .strips_buffer
-                    .clone()
-            };
-
-            // Create config buffer if it doesn't exist, or reuse existing config buffer
-            let config_buffer = if self.resources.is_none() {
-                device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
-                    label: Some("Config Buffer"),
-                    contents: bytemuck::bytes_of(&Config {
-                        width: new_render_size.width,
-                        height: new_render_size.height,
-                        strip_height: Tile::HEIGHT.into(),
-                        alphas_tex_width_bits: max_texture_dimension_2d.trailing_zeros(),
-                    }),
-                    usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
-                })
-            } else {
-                self.resources
-                    .as_ref()
-                    .expect("Config buffer not initialized")
-                    .config_buffer
-                    .clone()
-            };
-
-            // Create alpha texture if it doesn't exist, or reuse existing alpha texture
-            let (alphas_texture, render_bind_group) = if needs_new_alpha_texture {
-                let alpha_len = render_data.alphas.len();
-                // There are 16 1-byte alpha values per texel.
-                let alpha_texture_height =
-                    (u32::try_from(alpha_len).unwrap()).div_ceil(max_texture_dimension_2d * 16);
-
-                assert!(
-                    alpha_texture_height <= max_texture_dimension_2d,
-                    "Alpha texture height exceeds max texture dimensions"
-                );
-
-                // Resize the alpha texture staging buffer.
-                self.alpha_data.resize(
-                    (max_texture_dimension_2d * alpha_texture_height * 16) as usize,
-                    0,
-                );
-                // The alpha texture encodes 16 1-byte alpha values per texel, with 4 alpha values packed in each channel
-                let alphas_texture = device.create_texture(&wgpu::TextureDescriptor {
-                    label: Some("Alpha Texture"),
-                    size: wgpu::Extent3d {
-                        width: max_texture_dimension_2d,
-                        height: alpha_texture_height,
-                        depth_or_array_layers: 1,
-                    },
-                    mip_level_count: 1,
-                    sample_count: 1,
-                    dimension: wgpu::TextureDimension::D2,
-                    format: wgpu::TextureFormat::Rgba32Uint,
-                    usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
-                    view_formats: &[],
-                });
-                let alphas_texture_view =
-                    alphas_texture.create_view(&wgpu::TextureViewDescriptor::default());
-
-                let render_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
-                    label: Some("Render Bind Group"),
-                    layout: &self.render_bind_group_layout,
-                    entries: &[
-                        wgpu::BindGroupEntry {
-                            binding: 0,
-                            resource: wgpu::BindingResource::TextureView(&alphas_texture_view),
-                        },
-                        wgpu::BindGroupEntry {
-                            binding: 1,
-                            resource: config_buffer.as_entire_binding(),
-                        },
-                    ],
-                });
-                (alphas_texture, render_bind_group)
-            } else {
-                let resources = self.resources.as_ref().expect("Resources not initialized");
-                (
-                    resources.alphas_texture.clone(),
-                    resources.render_bind_group.clone(),
-                )
-            };
-
+            let render_bind_group =
+                self.make_render_bind_group(device, &alphas_texture, &config_buffer);
             self.resources = Some(GpuResources {
                 strips_buffer,
                 alphas_texture,
                 render_bind_group,
                 config_buffer,
             });
-        };
+        } else {
+            // Update existing resources as needed
+            let alpha_len = alphas.len();
+            // There are 16 1-byte alpha values per texel.
+            let required_alpha_height =
+                (u32::try_from(alpha_len).unwrap()).div_ceil(max_texture_dimension_2d * 16);
+            let required_alpha_size = max_texture_dimension_2d * required_alpha_height * 16;
 
-        // Update config buffer if dimensions changed and config buffer exists.
+            let current_alpha_size = {
+                let alphas_texture = &self.resources.as_ref().unwrap().alphas_texture;
+                alphas_texture.width() * alphas_texture.height() * 16
+            };
+            if required_alpha_size > current_alpha_size {
+                assert!(
+                    required_alpha_height <= max_texture_dimension_2d,
+                    "Alpha texture height exceeds max texture dimensions"
+                );
+
+                // Resize the alpha texture staging buffer.
+                self.alpha_data.resize(
+                    (max_texture_dimension_2d * required_alpha_height * 16) as usize,
+                    0,
+                );
+                // The alpha texture encodes 16 1-byte alpha values per texel, with 4 alpha values packed in each channel
+                let alphas_texture = self.make_alphas_texture(
+                    device,
+                    max_texture_dimension_2d,
+                    required_alpha_height,
+                );
+                let config_buffer = &self.resources.as_ref().unwrap().config_buffer;
+                let render_bind_group =
+                    self.make_render_bind_group(device, &alphas_texture, config_buffer);
+                let resources = self.resources.as_mut().unwrap();
+                resources.alphas_texture = alphas_texture;
+                resources.render_bind_group = render_bind_group;
+            }
+        }
+
+        // Resources have been created by now.
+        let resources = self.resources.as_ref().unwrap();
+
+        // Update config buffer if dimensions changed.
         // We don't need to initialize a new config buffer because it's fixed size (uniform buffer).
-        if needs_new_config && self.resources.is_some() {
+        if self.render_size != *new_render_size {
             let config = Config {
                 width: new_render_size.width,
                 height: new_render_size.height,
                 strip_height: Tile::HEIGHT.into(),
                 alphas_tex_width_bits: max_texture_dimension_2d.trailing_zeros(),
             };
-            queue.write_buffer(
-                &self.resources.as_ref().unwrap().config_buffer,
-                0,
-                bytemuck::bytes_of(&config),
-            );
+            queue.write_buffer(&resources.config_buffer, 0, bytemuck::bytes_of(&config));
             self.render_size = new_render_size.clone();
         }
 
-        // Resources are created in above blocks.
-        let resources = self.resources.as_ref().unwrap();
-
-        // TODO: Explore using `write_buffer_with` to avoid copying the data twice
-        queue.write_buffer(
-            &resources.strips_buffer,
-            0,
-            bytemuck::cast_slice(&render_data.strips),
-        );
-
         // Prepare alpha data for the texture with 16 1-byte alpha values per texel (4 per channel)
         let texture_width = resources.alphas_texture.width();
         let texture_height = resources.alphas_texture.height();
         assert!(
-            render_data.alphas.len() <= (texture_width * texture_height * 16) as usize,
+            alphas.len() <= (texture_width * texture_height * 16) as usize,
             "Alpha texture dimensions are too small to fit the alpha data"
         );
         // After this copy to `self.alpha_data`, there may be stale trailing alpha values. These
         // are not sampled, so can be left as-is.
-        self.alpha_data[0..render_data.alphas.len()].copy_from_slice(&render_data.alphas);
+        self.alpha_data[0..alphas.len()].copy_from_slice(alphas);
 
         queue.write_texture(
             wgpu::TexelCopyTextureInfo {
@@ -414,23 +545,130 @@
         );
     }
 
+    /// Upload the strip data to the GPU vertex buffer.
+    ///
+    /// Grows `strips_buffer` when the current buffer is too small; the old
+    /// contents are never copied because the buffer is fully rewritten by the
+    /// `write_buffer` below.
+    ///
+    /// Panics if GPU resources have not been created yet.
+    fn upload_strips(&mut self, device: &Device, queue: &Queue, strips: &[GpuStrip]) {
+        let required_strips_size = size_of_val(strips) as u64;
+
+        let current_size = self
+            .resources
+            .as_ref()
+            .expect("GPU resources must be initialized before uploading strips")
+            .strips_buffer
+            .size();
+        if required_strips_size > current_size {
+            // Allocate first, then store, to avoid overlapping borrows of `self`.
+            let buffer = self.make_strips_buffer(device, required_strips_size);
+            self.resources.as_mut().unwrap().strips_buffer = buffer;
+        }
+
+        // TODO: Explore using `write_buffer_with` to avoid copying the data twice
+        queue.write_buffer(
+            &self.resources.as_ref().unwrap().strips_buffer,
+            0,
+            bytemuck::cast_slice(strips),
+        );
+    }
+
+    /// Prepare the GPU buffers for rendering
+    ///
+    /// Uploads the alpha texture data and the strip vertex data for
+    /// `render_data`, resizing GPU resources as needed. Must be called before
+    /// [`render`](Self::render) with the same `render_data`.
+    ///
+    /// * `render_data` - CPU-side geometry produced by `Scene::prepare_render_data`.
+    /// * `new_render_size` - target surface size; a change updates the config buffer.
+    pub fn prepare(
+        &mut self,
+        device: &Device,
+        queue: &Queue,
+        render_data: &RenderData,
+        new_render_size: &RenderSize,
+    ) {
+        self.prepare_alphas(
+            device,
+            queue,
+            &render_data.alphas,
+            new_render_size,
+            render_data.strips.len(),
+        );
+        self.upload_strips(device, queue, &render_data.strips);
+    }
+
-    /// Render `scene` into the provided render pass.
+    /// Render `render_data` into the provided render pass.
     ///
-    /// You must call [`prepare`](Self::prepare) with this scene before
-    /// calling `render`.
+    /// You must call [`prepare`](Self::prepare) with this `render_data` before
+    /// calling `render`; the draw below reads the strip buffer uploaded there.
     /// The provided pass can be rendering to a surface, or to a "off-screen" buffer.
-    pub fn render(&mut self, scene: &Scene, render_pass: &mut RenderPass<'_>) {
+    pub fn render(&mut self, render_data: &RenderData, render_pass: &mut RenderPass<'_>) {
         // TODO: Consider API that forces the user to call `prepare` before `render`.
         // For example, `prepare` could return some struct that is consumed by `render`.
-        let resources = &self
+        let resources = self
             .resources
             .as_ref()
             .expect("`prepare` should be called before `render`");
-        let render_data = scene.prepare_render_data();
         render_pass.set_pipeline(&self.render_pipeline);
         render_pass.set_bind_group(0, &resources.render_bind_group, &[]);
         render_pass.set_vertex_buffer(0, resources.strips_buffer.slice(..));
+        // Four vertices per instance, one instance per strip.
         let strips_to_draw = render_data.strips.len();
         render_pass.draw(0..4, 0..u32::try_from(strips_to_draw).unwrap());
     }
+
+    /// Render `scene` into the provided command encoder.
+    ///
+    /// This method creates GPU resources as needed, and schedules potentially multiple
+    /// render passes.
+    ///
+    /// Unlike [`prepare`](Self::prepare) + [`render`](Self::render), this
+    /// entry point drives the scheduler, which issues uploads and render
+    /// passes itself; the number of passes varies with the scene.
+    pub fn render2(
+        &mut self,
+        scene: &Scene,
+        device: &Device,
+        queue: &Queue,
+        encoder: &mut CommandEncoder,
+        render_size: &RenderSize,
+        view: &TextureView,
+    ) {
+        let render_data = scene.prepare_render_data();
+        // For the time being, we upload the entire alpha buffer as one big chunk. As a future
+        // refinement, we could have a bounded alpha buffer, and break draws when the alpha
+        // buffer fills.
+        // NOTE(review): `prepare` uploads `render_data.alphas`, while this
+        // passes `scene.alphas` — confirm these refer to the same data.
+        self.prepare_alphas(
+            device,
+            queue,
+            &scene.alphas,
+            render_size,
+            render_data.strips.len(),
+        );
+        // Bundle the GPU handles so the scheduler can issue passes itself.
+        let mut junk = RendererJunk {
+            renderer: self,
+            device,
+            queue,
+            encoder,
+            view,
+        };
+        // TODO: make this configurable, and make it match the allocation
+        let n_slots = 1024;
+        let mut schedule = Schedule::new(n_slots);
+        schedule.do_scene(&mut junk, scene);
+    }
+}
+
+impl RendererJunk<'_> {
+    /// Upload `strips` and record one render pass drawing them.
+    ///
+    /// `round` 0 clears the target; later rounds load existing contents.
+    /// `_ix` selects the render target (2 is the final target; 0 and 1 are
+    /// the clip buffers — not wired up yet, see the TODO below).
+    ///
+    /// NOTE(review): `Queue::write_buffer` writes are staged and complete
+    /// before any subsequently *submitted* command buffer executes. If this
+    /// is called multiple times while recording one encoder, each upload to
+    /// the shared strips buffer overwrites the previous one before the
+    /// encoder is submitted — confirm submission happens per round, or use a
+    /// separate buffer per pass.
+    pub(crate) fn do_render_pass(&mut self, strips: &[GpuStrip], round: usize, _ix: usize) {
+        self.renderer.upload_strips(self.device, self.queue, strips);
+        let load = if round == 0 {
+            wgpu::LoadOp::Clear(wgpu::Color::BLACK)
+        } else {
+            wgpu::LoadOp::Load
+        };
+        let mut render_pass = self.encoder.begin_render_pass(&RenderPassDescriptor {
+            label: Some("render to texture pass"),
+            color_attachments: &[Some(RenderPassColorAttachment {
+                // TODO: view is clip buffer[ix] for ix != 2
+                view: self.view,
+                resolve_target: None,
+                ops: wgpu::Operations {
+                    load,
+                    store: wgpu::StoreOp::Store,
+                },
+            })],
+            depth_stencil_attachment: None,
+            occlusion_query_set: None,
+            timestamp_writes: None,
+        });
+        let resources = self
+            .renderer
+            .resources
+            .as_ref()
+            .expect("`prepare` should be called before `render`");
+        render_pass.set_pipeline(&self.renderer.render_pipeline);
+        render_pass.set_bind_group(0, &resources.render_bind_group, &[]);
+        render_pass.set_vertex_buffer(0, resources.strips_buffer.slice(..));
+        // Four vertices per instance, one instance per strip.
+        let strips_to_draw = strips.len();
+        render_pass.draw(0..4, 0..u32::try_from(strips_to_draw).unwrap());
+    }
+}
diff --git a/sparse_strips/vello_hybrid/src/schedule.rs b/sparse_strips/vello_hybrid/src/schedule.rs
new file mode 100644
index 0000000..75e7442
--- /dev/null
+++ b/sparse_strips/vello_hybrid/src/schedule.rs
@@ -0,0 +1,230 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use std::collections::VecDeque;
+
+use vello_common::{
+    coarse::{Cmd, WideTile},
+    paint::Paint,
+    tile::Tile,
+};
+
+use crate::{GpuStrip, Scene, render::RendererJunk};
+
+pub(crate) struct Schedule {
+    /// Index of the current round
+    round: usize,
+    /// Free slot indices for the two clip buffers (one list per clip-depth
+    /// parity).
+    free: [Vec<usize>; 2],
+    /// Queue of pending rounds; the front is the current round.
+    rounds: VecDeque<Round>,
+}
+
+/// A "round" is a coarse scheduling quantum.
+///
+/// It represents draws in up to three render targets; two for intermediate
+/// clip/blend buffers, and the third for the actual render target. The two
+/// clip buffers are for even and odd clip depths.
+#[derive(Default)]
+struct Round {
+    /// Draw lists per render target; indices 0 and 1 are the two clip
+    /// buffers (by clip-depth parity), index 2 is the final render target.
+    draws: [Draw; 3],
+    /// Slots that will be freed after the draws
+    free: [Vec<usize>; 2],
+}
+
+/// State for a single tile.
+///
+/// Perhaps this should just be a field in the scheduler.
+#[derive(Default)]
+struct TileState {
+    /// Clip stack; the bottom element stands for the final render target.
+    stack: Vec<TileEl>,
+}
+
+/// An entry on a tile's clip stack.
+#[derive(Clone, Copy)]
+struct TileEl {
+    /// Slot index in the clip buffer; `!0` is a sentinel for the
+    /// bottom-of-stack element (the final render target).
+    slot_ix: usize,
+    /// Round in which draws targeting this element are scheduled.
+    round: usize,
+}
+
+/// A draw call: the strips drawn to one render target in a round.
+#[derive(Default)]
+struct Draw(Vec<GpuStrip>);
+
+impl Schedule {
+    /// Create a scheduler with `n_slots` slots available in each of the two
+    /// clip buffers; every slot starts out free, and a single empty round is
+    /// queued as the current round.
+    pub(crate) fn new(n_slots: usize) -> Self {
+        let free: [Vec<usize>; 2] = [(0..n_slots).collect(), (0..n_slots).collect()];
+        let rounds = VecDeque::from([Round::default()]);
+        Self {
+            round: 0,
+            free,
+            rounds,
+        }
+    }
+
+    /// Schedule and render an entire scene.
+    ///
+    /// Iterates the scene's wide tiles in row-major order, scheduling each
+    /// tile's commands, then flushes all remaining rounds so every queued
+    /// draw is issued.
+    pub(crate) fn do_scene(&mut self, junk: &mut RendererJunk<'_>, scene: &Scene) {
+        let mut state = TileState::default();
+        let wide_tiles_per_row = (scene.width).div_ceil(WideTile::WIDTH);
+        // Number of wide-tile rows (i.e. tiles per column).
+        let wide_tiles_per_col = (scene.height).div_ceil(Tile::HEIGHT);
+        for wide_tile_row in 0..wide_tiles_per_col {
+            for wide_tile_col in 0..wide_tiles_per_row {
+                let wide_tile_idx = usize::from(wide_tile_row) * usize::from(wide_tiles_per_row)
+                    + usize::from(wide_tile_col);
+                let wide_tile = &scene.wide.tiles[wide_tile_idx];
+                let wide_tile_x = wide_tile_col * WideTile::WIDTH;
+                let wide_tile_y = wide_tile_row * Tile::HEIGHT;
+                self.do_tile(junk, wide_tile_x, wide_tile_y, wide_tile, &mut state);
+            }
+        }
+        // Drain the queue; `flush` pops one round per call.
+        while !self.rounds.is_empty() {
+            self.flush(junk);
+        }
+    }
+
+    /// Flush one round.
+    ///
+    /// Issues a render pass for each non-empty draw list in the round, then
+    /// returns the round's retired slots to both free lists and advances the
+    /// round counter.
+    ///
+    /// The rounds queue must not be empty.
+    fn flush(&mut self, junk: &mut RendererJunk<'_>) {
+        let round = self.rounds.pop_front().unwrap();
+        for (i, draw) in round.draws.iter().enumerate() {
+            if !draw.0.is_empty() {
+                junk.do_render_pass(&draw.0, self.round, i);
+            }
+        }
+        // Replenish both parities' free lists. (Iterating `0..1` here would
+        // only cover index 0 and leak every slot retired into `free[1]`.)
+        for (free, retired) in self.free.iter_mut().zip(&round.free) {
+            free.extend(retired);
+        }
+        self.round += 1;
+    }
+
+    /// Schedule the commands of a single wide tile.
+    ///
+    /// Appends `GpuStrip`s for the tile's commands to the draw list of the
+    /// appropriate round and render target, allocating clip-buffer slots on
+    /// `PushClip` and releasing them on `PopClip`. May flush queued rounds
+    /// when slot allocation stalls.
+    #[allow(clippy::todo, reason = "still working on this")]
+    fn do_tile(
+        &mut self,
+        junk: &mut RendererJunk<'_>,
+        wide_tile_x: u16,
+        wide_tile_y: u16,
+        tile: &WideTile,
+        state: &mut TileState,
+    ) {
+        state.stack.clear();
+        // Bottom-of-stack sentinel for the final render target; its slot
+        // index is never used.
+        state.stack.push(TileEl {
+            slot_ix: !0,
+            round: self.round,
+        });
+        let bg = tile.bg.to_u32();
+        // Skip backgrounds whose packed alpha (high byte) is zero.
+        if bg >= 0x1_00_00_00 {
+            let draw = self.draw_mut(self.round, 1);
+            draw.0.push(GpuStrip {
+                x: wide_tile_x,
+                y: wide_tile_y,
+                width: WideTile::WIDTH,
+                dense_width: 0,
+                col: 0,
+                rgba: bg,
+            });
+        }
+        for cmd in &tile.cmds {
+            // Note: this starts at 1 (for the final target)
+            let clip_depth = state.stack.len();
+            match cmd {
+                Cmd::Fill(fill) => {
+                    let el = state.stack.last().unwrap();
+                    let draw = self.draw_mut(el.round, clip_depth);
+                    let color = match fill.paint {
+                        Paint::Solid(color) => color,
+                        Paint::Indexed(_) => unimplemented!(),
+                    };
+                    let rgba = color.to_u32();
+                    // color fields with 0 alpha are reserved for clipping
+                    if rgba >= 0x1_00_00_00 {
+                        // TODO: x and y base coordinates are from wide_tile if
+                        // clip depth is 1, otherwise point to slot ix
+                        draw.0.push(GpuStrip {
+                            x: wide_tile_x + fill.x,
+                            y: wide_tile_y,
+                            width: fill.width,
+                            dense_width: 0,
+                            col: 0,
+                            rgba,
+                        });
+                    }
+                }
+                Cmd::AlphaFill(alpha_fill) => {
+                    let el = state.stack.last().unwrap();
+                    let draw = self.draw_mut(el.round, clip_depth);
+                    let color = match alpha_fill.paint {
+                        Paint::Solid(color) => color,
+                        Paint::Indexed(_) => unimplemented!(),
+                    };
+                    let rgba = color.to_u32();
+                    // color fields with 0 alpha are reserved for clipping
+                    if rgba >= 0x1_00_00_00 {
+                        // msg is a variable here to work around rustfmt failure
+                        let msg = "GpuStrip fields use u32 and values are expected to fit within that range";
+                        draw.0.push(GpuStrip {
+                            x: wide_tile_x + alpha_fill.x,
+                            y: wide_tile_y,
+                            width: alpha_fill.width,
+                            dense_width: alpha_fill.width,
+                            col: (alpha_fill.alpha_idx / usize::from(Tile::HEIGHT))
+                                .try_into()
+                                .expect(msg),
+                            rgba,
+                        });
+                    }
+                }
+                Cmd::PushClip => {
+                    // Allocate a slot from the free list matching this
+                    // depth's parity, flushing queued rounds until one
+                    // becomes available.
+                    let ix = clip_depth % 2;
+                    while self.free[ix].is_empty() {
+                        if self.rounds.is_empty() {
+                            // Probably should return error here
+                            panic!("failed to allocate slot");
+                        }
+                        self.flush(junk);
+                    }
+                    let slot_ix = self.free[ix].pop().unwrap();
+                    // Note: the allocated slot will need to get cleared before
+                    // drawing, maybe add it to a clear list. Of course, if all slots
+                    // can be cleared, then do clear with `LoadOp::Clear` instead.
+                    state.stack.push(TileEl {
+                        slot_ix,
+                        round: self.round,
+                    });
+                }
+                Cmd::PopClip => {
+                    let tos = state.stack.pop().unwrap();
+                    let nos = state.stack.last_mut().unwrap();
+                    let next_round = clip_depth % 2 == 0 && clip_depth > 2;
+                    let round = nos.round.max(tos.round + next_round as usize);
+                    nos.round = round;
+                    // Free the slot after the round that consumes it.
+                    // `round` may be behind `self.round` when flushes have
+                    // happened since the elements were pushed, so clamp with
+                    // `saturating_sub`; extend the queue so the target round
+                    // exists (mirroring `draw_mut`).
+                    let rel_round = round.saturating_sub(self.round);
+                    while self.rounds.len() <= rel_round {
+                        self.rounds.push_back(Round::default());
+                    }
+                    self.rounds[rel_round].free[1 - clip_depth % 2].push(tos.slot_ix);
+                }
+                Cmd::ClipFill(_cmd_clip_fill) => {
+                    let next_round = clip_depth % 2 == 0 && clip_depth > 2;
+                    let tos = &state.stack[clip_depth - 1];
+                    let nos = &state.stack[clip_depth - 2];
+                    let round = nos.round.max(tos.round + next_round as usize);
+                    let _draw = self.draw_mut(round, clip_depth - 1);
+                    // TODO: push GpuStrip; use `tos.slot_x` for rgba field
+                }
+                Cmd::ClipStrip(_cmd_clip_alpha_fill) => todo!(),
+            }
+        }
+    }
+
+    /// Find the appropriate draw call for rendering.
+    ///
+    /// `el_round` is the absolute round the draw belongs to; `clip_depth`
+    /// selects the render target: depth 1 is the final target (index 2),
+    /// deeper clips alternate between the two clip buffers by parity.
+    fn draw_mut(&mut self, el_round: usize, clip_depth: usize) -> &mut Draw {
+        let rel_round = el_round.saturating_sub(self.round);
+        let ix = if clip_depth == 1 {
+            2
+        } else {
+            1 - clip_depth % 2
+        };
+        // Extend the queue until the requested round exists; `el_round` can
+        // be more than one round past the back of the queue, so a single
+        // conditional push is not enough.
+        while self.rounds.len() <= rel_round {
+            self.rounds.push_back(Round::default());
+        }
+        &mut self.rounds[rel_round].draws[ix]
+    }
+}