Merge branch 'master' into prefix
diff --git a/Cargo.lock b/Cargo.lock
index 1bec058..5f9b877 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7,6 +7,12 @@
 checksum = "5d2e7343e7fc9de883d1b0341e0b13970f764c14101234857d2ddafa1cb1cac2"
 
 [[package]]
+name = "arrayvec"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cff77d8686867eceff3105329d4698d96c2391c176d5d03adc90c7389162b5b8"
+
+[[package]]
 name = "ash"
 version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -70,6 +76,12 @@
 ]
 
 [[package]]
+name = "half"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f36b5f248235f45773d4944f555f83ea61fe07b18b561ccf99d7483d7381e54d"
+
+[[package]]
 name = "inflate"
 version = "0.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -79,6 +91,14 @@
 ]
 
 [[package]]
+name = "kurbo"
+version = "0.5.11"
+source = "git+https://github.com/linebender/kurbo?rev=7bd7e66bd137e757305d170a0f9f2b4f7beeb299#7bd7e66bd137e757305d170a0f9f2b4f7beeb299"
+dependencies = [
+ "arrayvec",
+]
+
+[[package]]
 name = "libc"
 version = "0.2.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -95,9 +115,26 @@
 ]
 
 [[package]]
+name = "once_cell"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1c601810575c99596d4afc46f78a678c80105117c379eb3650cf99b8a21ce5b"
+
+[[package]]
+name = "piet"
+version = "0.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29420eccb68d6b9ad2f8dd87caf9c3bcd3bbd056bfe67871c48b6efab9316b79"
+dependencies = [
+ "kurbo",
+]
+
+[[package]]
 name = "piet-gpu"
 version = "0.1.0"
 dependencies = [
+ "kurbo",
+ "piet",
  "piet-gpu-hal",
  "piet-gpu-types",
  "png",
@@ -118,12 +155,14 @@
 version = "0.1.0"
 dependencies = [
  "ash",
+ "once_cell",
 ]
 
 [[package]]
 name = "piet-gpu-types"
 version = "0.0.0"
 dependencies = [
+ "half",
  "piet-gpu-derive",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index f71f2de..efa5f88 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,3 +6,7 @@
     "piet-gpu-hal",
     "piet-gpu-types"
 ]
+
+# TODO: remove when the flatten feature is published
+[patch.crates-io]
+kurbo = { git = "https://github.com/linebender/kurbo", rev = "7bd7e66bd137e757305d170a0f9f2b4f7beeb299" }
diff --git a/piet-gpu-derive/src/derive.rs b/piet-gpu-derive/src/derive.rs
index bc84bfb..3b4c478 100644
--- a/piet-gpu-derive/src/derive.rs
+++ b/piet-gpu-derive/src/derive.rs
@@ -14,6 +14,16 @@
     }
     quote! {
         mod #module_name {
+            pub trait HalfToLeBytes {
+                fn to_le_bytes(&self) -> [u8; 2];
+            }
+
+            impl HalfToLeBytes for half::f16 {
+                fn to_le_bytes(&self) -> [u8; 2] {
+                    self.to_bits().to_le_bytes()
+                }
+            }
+
             #ts
         }
     }
@@ -121,6 +131,7 @@
 
 fn gen_derive_scalar_ty(ty: &GpuScalar) -> proc_macro2::TokenStream {
     match ty {
+        GpuScalar::F16 => quote!(half::f16),
         GpuScalar::F32 => quote!(f32),
         GpuScalar::I8 => quote!(i8),
         GpuScalar::I16 => quote!(i16),
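
A note on the HalfToLeBytes shim above: the generated Rust encoder calls .to_le_bytes() uniformly on every scalar field, and half 1.5 (the version pinned in this change) evidently does not expose that method on f16, hence the adapter trait. A minimal host-side sketch of the same idea; encode_pair and its arguments are hypothetical, not part of the generated code:

    use half::f16;

    pub trait HalfToLeBytes {
        fn to_le_bytes(&self) -> [u8; 2];
    }

    impl HalfToLeBytes for f16 {
        fn to_le_bytes(&self) -> [u8; 2] {
            // f16 exposes to_bits() -> u16; defer to u16's LE conversion.
            self.to_bits().to_le_bytes()
        }
    }

    // With the trait in scope, f16 and f32 fields encode the same way.
    fn encode_pair(buf: &mut Vec<u8>, a: f16, b: f32) {
        buf.extend_from_slice(&a.to_le_bytes());
        buf.extend_from_slice(&b.to_le_bytes());
    }
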
diff --git a/piet-gpu-derive/src/glsl.rs b/piet-gpu-derive/src/glsl.rs
index 617669a..ec87640 100644
--- a/piet-gpu-derive/src/glsl.rs
+++ b/piet-gpu-derive/src/glsl.rs
@@ -14,6 +14,7 @@
     for name in &module.def_names {
         gen_refdef(&mut r, &name);
     }
+
     for name in &module.def_names {
         match module.defs.get(name).unwrap() {
             (size, LayoutTypeDef::Struct(fields)) => {
@@ -26,6 +27,7 @@
             }
         }
     }
+
     for name in &module.def_names {
         let def = module.defs.get(name).unwrap();
         match def {
@@ -43,6 +45,7 @@
             }
         }
     }
+
     r
 }
 
@@ -98,9 +101,21 @@
         }
     }
     writeln!(r, "    {} s;", name).unwrap();
+
+    let mut preload: bool = false;
     for (name, offset, ty) in fields {
-        writeln!(r, "    s.{} = {};", name, gen_extract(*offset, &ty.ty)).unwrap();
+        let (setup, extract) = gen_extract(*offset, &ty.ty, preload);
+        writeln!(r, "{}    s.{} = {};", setup, name, extract).unwrap();
+
+        if let GpuType::Scalar(GpuScalar::F16) = &ty.ty {
+            if offset % 4 == 0 {
+                preload = true;
+                continue;
+            }
+        }
+        preload = false;
     }
+
     writeln!(r, "    return s;").unwrap();
     writeln!(r, "}}\n").unwrap();
 }
@@ -136,34 +151,67 @@
     }
 }
 
-fn gen_extract(offset: usize, ty: &GpuType) -> String {
+fn gen_extract(offset: usize, ty: &GpuType, preload: bool) -> (String, String) {
     match ty {
-        GpuType::Scalar(scalar) => gen_extract_scalar(offset, scalar),
+        GpuType::Scalar(scalar) => {
+            let setup = match scalar {
+                GpuScalar::F16 => {
+                    if preload {
+                        String::new()
+                    } else {
+                        let ix = offset / 4;
+                        format!("    vec2 halves{} = unpackHalf2x16(raw{});\n", ix, ix)
+                    }
+                }
+                _ => String::new(),
+            };
+
+            (setup, gen_extract_scalar(offset, scalar))
+        }
         GpuType::Vector(scalar, size) => {
-            let mut r = glsl_type(ty);
-            r.push_str("(");
+            let is_f16 = match scalar {
+                GpuScalar::F16 => true,
+                _ => false,
+            };
+
+            let mut setup = String::new();
+            let mut extract = glsl_type(ty);
+            extract.push_str("(");
             for i in 0..*size {
                 if i != 0 {
-                    r.push_str(", ");
+                    extract.push_str(", ");
                 }
+
+                if is_f16 && i % 2 == 0 {
+                    let ix = (offset + i * scalar.size()) / 4;
+                    let s = format!("    vec2 halves{} = unpackHalf2x16(raw{});\n", ix, ix);
+                    setup.push_str(&s);
+                };
+
                 let el_offset = offset + i * scalar.size();
-                r.push_str(&gen_extract_scalar(el_offset, scalar));
+                extract.push_str(&gen_extract_scalar(el_offset, scalar));
             }
-            r.push_str(")");
-            r
+            extract.push_str(")");
+            (setup, extract)
         }
-        GpuType::InlineStruct(name) => format!(
-            "{}_read({}Ref({}))",
-            name,
-            name,
-            simplified_add("ref.offset", offset)
+        GpuType::InlineStruct(name) => (
+            String::new(),
+            format!(
+                "{}_read({}Ref({}))",
+                name,
+                name,
+                simplified_add("ref.offset", offset)
+            ),
         ),
         GpuType::Ref(inner) => {
             if let GpuType::InlineStruct(name) = inner.deref() {
-                format!(
-                    "{}Ref({})",
-                    name,
-                    gen_extract_scalar(offset, &GpuScalar::U32)
+                (
+                    String::new(),
+                    format!(
+                        "{}Ref({})",
+                        name,
+                        gen_extract_scalar(offset, &GpuScalar::U32)
+                    ),
                 )
             } else {
                 panic!("only know how to deal with Ref of struct")
@@ -174,7 +222,7 @@
 
 fn gen_extract_scalar(offset: usize, ty: &GpuScalar) -> String {
     match ty {
-        GpuScalar::F32 => format!("uintBitsToFloat(raw{})", offset / 4),
+        GpuScalar::F16 | GpuScalar::F32 => extract_fbits(offset, ty.size()),
         GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 => extract_ubits(offset, ty.size()),
         GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => extract_ibits(offset, ty.size()),
     }
@@ -210,8 +258,41 @@
     }
 }
 
+fn extract_fbits(offset: usize, nbytes: usize) -> String {
+    match nbytes {
+        4 => format!("uintBitsToFloat(raw{})", offset / 4),
+        2 => match offset % 4 {
+            0 => format!("halves{}.x", offset / 4),
+            2 => format!("halves{}.y", offset / 4),
+            _ => panic!("unexpected packing of f16 at offset {}", offset),
+        },
+        _ => {
+            panic!("unexpected extraction of float with nbytes = {}", nbytes);
+        }
+    }
+}
+
 // Writing
 
+fn is_f16(ty: &GpuType) -> bool {
+    match ty {
+        GpuType::Scalar(GpuScalar::F16) => true,
+        GpuType::Vector(GpuScalar::F16, _) => true,
+        _ => false,
+    }
+}
+
+fn is_f16_pair(field_ixs: &[usize], fields: &[(String, usize, LayoutType)]) -> bool {
+    // Check the two fields that actually share this word, not every field
+    // in the struct; only a pair of f16s packs into a single u32.
+    if field_ixs.len() == 2 {
+        field_ixs.iter().all(|&ix| is_f16(&fields[ix].2.ty))
+    } else {
+        false
+    }
+}
+
 fn gen_struct_write(
     r: &mut String,
     bufname: &str,
@@ -221,39 +302,78 @@
     writeln!(r, "void {}_write({}Ref ref, {} s) {{", name, name, name).unwrap();
     writeln!(r, "    uint ix = ref.offset >> 2;").unwrap();
     let coverage = crate::layout::struct_coverage(fields, true);
+
     for (i, field_ixs) in coverage.iter().enumerate() {
         let mut pieces = Vec::new();
-        for field_ix in field_ixs {
-            let (name, offset, ty) = &fields[*field_ix];
-            match &ty.ty {
-                GpuType::Scalar(scalar) => {
-                    let inner = format!("s.{}", name);
-                    pieces.push(gen_pack_bits_scalar(scalar, *offset, &inner));
-                }
-                GpuType::Vector(scalar, len) => {
-                    let size = scalar.size();
-                    let ix_lo = (i * 4 - offset) / size;
-                    let ix_hi = ((4 + i * 4 - offset) / size).min(*len);
-                    for ix in ix_lo..ix_hi {
-                        let scalar_offset = offset + ix * size;
-                        let inner = format!("s.{}.{}", name, &"xyzw"[ix..ix + 1]);
-                        pieces.push(gen_pack_bits_scalar(scalar, scalar_offset, &inner));
+
+        if is_f16_pair(field_ixs, fields) {
+            let (ix0, ix1) = (field_ixs[0], field_ixs[1]);
+            let inner0 = format!("s.{}", fields[ix0].0);
+            let inner1 = format!("s.{}", fields[ix1].0);
+            pieces.push(format!("packHalf2x16(vec2({}, {}))", &inner0, &inner1));
+        } else {
+            for field_ix in field_ixs {
+                let (name, offset, ty) = &fields[*field_ix];
+                match &ty.ty {
+                    GpuType::Scalar(scalar) => {
+                        let inner = format!("s.{}", name);
+                        pieces.push(gen_pack_bits_scalar(scalar, *offset, &inner));
                     }
+                    GpuType::Vector(scalar, len) => {
+                        let size = scalar.size();
+                        let ix_lo = (i * 4 - offset) / size;
+                        let ix_hi = ((4 + i * 4 - offset) / size).min(*len);
+                        match scalar {
+                            GpuScalar::F16 => {
+                                if ix_hi - ix_lo == 2 {
+                                    let inner0 =
+                                        format!("s.{}.{}", name, &"xyzw"[ix_lo..ix_lo + 1]);
+                                    let inner1 =
+                                        format!("s.{}.{}", name, &"xyzw"[ix_lo + 1..ix_hi]);
+                                    pieces.push(format!(
+                                        "packHalf2x16(vec2({}, {}))",
+                                        &inner0, &inner1
+                                    ));
+                                } else {
+                                    let ix = ix_lo;
+                                    let scalar_offset = offset + ix * size;
+                                    let inner = format!("s.{}.{}", name, &"xyzw"[ix..ix + 1]);
+                                    pieces.push(gen_pack_bits_scalar(
+                                        scalar,
+                                        scalar_offset,
+                                        &inner,
+                                    ));
+                                }
+                            }
+                            _ => {
+                                for ix in ix_lo..ix_hi {
+                                    let scalar_offset = offset + ix * size;
+                                    let inner = format!("s.{}.{}", name, &"xyzw"[ix..ix + 1]);
+                                    pieces.push(gen_pack_bits_scalar(
+                                        scalar,
+                                        scalar_offset,
+                                        &inner,
+                                    ));
+                                }
+                            }
+                        }
+                    }
+                    GpuType::InlineStruct(structname) => {
+                        writeln!(
+                            r,
+                            "    {}_write({}Ref({}), s.{});",
+                            structname,
+                            structname,
+                            simplified_add("ref.offset", *offset),
+                            name
+                        )
+                        .unwrap();
+                    }
+                    GpuType::Ref(_) => pieces.push(format!("s.{}.offset", name)),
                 }
-                GpuType::InlineStruct(structname) => {
-                    writeln!(
-                        r,
-                        "    {}_write({}Ref({}), s.{});",
-                        structname,
-                        structname,
-                        simplified_add("ref.offset", *offset),
-                        name
-                    )
-                    .unwrap();
-                }
-                GpuType::Ref(_) => pieces.push(format!("s.{}.offset", name)),
             }
         }
+
         if !pieces.is_empty() {
             write!(r, "    {}[ix + {}] = ", bufname, i).unwrap();
             for (j, piece) in pieces.iter().enumerate() {
@@ -271,6 +391,7 @@
 fn gen_pack_bits_scalar(ty: &GpuScalar, offset: usize, inner: &str) -> String {
     let shift = (offset % 4) * 8;
     let bits = match ty {
+        GpuScalar::F16 => format!("packHalf2x16(vec2({}, 0.0)) & 0xffff", inner),
         GpuScalar::F32 => format!("floatBitsToUint({})", inner),
         // Note: this doesn't mask small unsigned int types; the caller is
         // responsible for making sure they don't overflow.
@@ -367,7 +488,7 @@
 // GLSL type that can contain the scalar value.
 fn glsl_scalar(s: &GpuScalar) -> &'static str {
     match s {
-        GpuScalar::F32 => "float",
+        GpuScalar::F16 | GpuScalar::F32 => "float",
         GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => "int",
         GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 => "uint",
     }
@@ -375,7 +496,7 @@
 
 fn glsl_vecname(s: &GpuScalar) -> &'static str {
     match s {
-        GpuScalar::F32 => "vec",
+        GpuScalar::F16 | GpuScalar::F32 => "vec",
         GpuScalar::I8 | GpuScalar::I16 | GpuScalar::I32 => "ivec",
         GpuScalar::U8 | GpuScalar::U16 | GpuScalar::U32 => "uvec",
     }
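
For reference, the packing contract the generated GLSL relies on, emulated host-side with the half crate (this helper is illustrative, not part of the codegen): packHalf2x16 puts the first component in the low 16 bits and the second in the high 16 bits, which is why a pair of adjacent f16 fields costs one raw word and one unpack.

    use half::f16;

    // Low 16 bits hold the first component, high 16 bits the second.
    fn pack_half_2x16(x: f32, y: f32) -> u32 {
        let lo = f16::from_f32(x).to_bits() as u32;
        let hi = f16::from_f32(y).to_bits() as u32;
        lo | (hi << 16)
    }

    fn unpack_half_2x16(w: u32) -> (f32, f32) {
        let x = f16::from_bits((w & 0xffff) as u16).to_f32();
        let y = f16::from_bits((w >> 16) as u16).to_f32();
        (x, y)
    }

    fn main() {
        // Both values are exactly representable in f16, so the round trip
        // is lossless.
        let w = pack_half_2x16(1.5, -0.25);
        assert_eq!(unpack_half_2x16(w), (1.5, -0.25));
    }
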
diff --git a/piet-gpu-derive/src/parse.rs b/piet-gpu-derive/src/parse.rs
index 8e51bab..9461338 100644
--- a/piet-gpu-derive/src/parse.rs
+++ b/piet-gpu-derive/src/parse.rs
@@ -12,14 +12,14 @@
 /// A scalar that can be represented in a packed data structure.
 #[derive(Clone, Copy, PartialEq)]
 pub enum GpuScalar {
+    F16,
+    F32,
     I8,
     I16,
     I32,
-    F32,
     U8,
     U16,
     U32,
-    // TODO: Add F16
 }
 
 /// An algebraic datatype.
@@ -52,6 +52,7 @@
     fn from_syn(ty: &syn::Type) -> Option<Self> {
         ty_as_single_ident(ty).and_then(|ident| match ident.as_str() {
             "f32" => Some(GpuScalar::F32),
+            "f16" => Some(GpuScalar::F16),
             "i8" => Some(GpuScalar::I8),
             "i16" => Some(GpuScalar::I16),
             "i32" => Some(GpuScalar::I32),
@@ -70,7 +71,7 @@
         match self {
             GpuScalar::F32 | GpuScalar::I32 | GpuScalar::U32 => 4,
             GpuScalar::I8 | GpuScalar::U8 => 1,
-            GpuScalar::I16 | GpuScalar::U16 => 2,
+            GpuScalar::F16 | GpuScalar::I16 | GpuScalar::U16 => 2,
         }
     }
 }
diff --git a/piet-gpu-hal/Cargo.toml b/piet-gpu-hal/Cargo.toml
index a6a373a..7019002 100644
--- a/piet-gpu-hal/Cargo.toml
+++ b/piet-gpu-hal/Cargo.toml
@@ -8,3 +8,4 @@
 
 [dependencies]
 ash = "0.30"
+once_cell = "1.3.1"
diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs
index 7195891..a4777b4 100644
--- a/piet-gpu-hal/examples/collatz.rs
+++ b/piet-gpu-hal/examples/collatz.rs
@@ -17,6 +17,7 @@
         let query_pool = device.create_query_pool(2).unwrap();
         let mut cmd_buf = device.create_cmd_buf().unwrap();
         cmd_buf.begin();
+        cmd_buf.reset_query_pool(&query_pool);
         cmd_buf.write_timestamp(&query_pool, 0);
         cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1));
         cmd_buf.write_timestamp(&query_pool, 1);
diff --git a/piet-gpu-hal/examples/prefix.rs b/piet-gpu-hal/examples/prefix.rs
index 6a38e53..2f80a20 100644
--- a/piet-gpu-hal/examples/prefix.rs
+++ b/piet-gpu-hal/examples/prefix.rs
@@ -35,6 +35,7 @@
         cmd_buf.clear_buffer(&work_buffer);
         cmd_buf.copy_buffer(&buffer, &buffer_dev);
         cmd_buf.memory_barrier();
+        cmd_buf.reset_query_pool(&query_pool);
         cmd_buf.write_timestamp(&query_pool, 0);
         cmd_buf.dispatch(&pipeline, &descriptor_set, (n_tiles as u32, 1, 1));
         cmd_buf.write_timestamp(&query_pool, 1);
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index c62678f..d215490 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -71,10 +71,22 @@
 
     unsafe fn memory_barrier(&mut self);
 
+    /// Clear the buffer.
+    ///
+    /// This is readily supported in Vulkan, but for portability it is remarkably
+    /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
+    /// kernel, or organize the code not to need it.
     unsafe fn clear_buffer(&self, buffer: &D::Buffer);
 
     unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
 
+    /// Reset the query pool.
+    ///
+    /// The query pool must be reset before each use, to avoid validation errors.
+    /// This is annoying, and we could tweak the API to make it implicit, doing
+    /// the reset before the first timestamp write.
+    unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool);
+
     unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32);
 }
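
A sketch of the call order this API change implies, mirroring the collatz/prefix examples above. This is a fragment, assuming an unsafe context and a device, pipeline, and descriptor_set set up as in those examples:

    let query_pool = device.create_query_pool(2).unwrap();
    let mut cmd_buf = device.create_cmd_buf().unwrap();
    cmd_buf.begin();
    // The reset must precede the first timestamp write each time the pool
    // is reused, to avoid validation errors.
    cmd_buf.reset_query_pool(&query_pool);
    cmd_buf.write_timestamp(&query_pool, 0);
    cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1));
    cmd_buf.write_timestamp(&query_pool, 1);
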
 
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 8ad7a13..35cf68f 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -1,10 +1,13 @@
 //! Vulkan implementation of HAL trait.
 
-use std::ffi::CString;
+use std::borrow::Cow;
+use std::ffi::{CStr, CString};
 use std::sync::Arc;
 
+use ash::extensions::ext::DebugUtils;
 use ash::version::{DeviceV1_0, EntryV1_0, InstanceV1_0};
 use ash::{vk, Device, Entry, Instance};
+use once_cell::sync::Lazy;
 
 use crate::Error;
 
@@ -12,8 +15,9 @@
     /// Retain the dynamic lib.
     #[allow(unused)]
     entry: Entry,
-
     instance: Instance,
+    _dbg_loader: Option<DebugUtils>,
+    _dbg_callbk: Option<vk::DebugUtilsMessengerEXT>,
 }
 
 pub struct VkDevice {
@@ -61,6 +65,55 @@
 #[derive(Clone, Copy)]
 pub struct MemFlags(vk::MemoryPropertyFlags);
 
+unsafe extern "system" fn vulkan_debug_callback(
+    message_severity: vk::DebugUtilsMessageSeverityFlagsEXT,
+    message_type: vk::DebugUtilsMessageTypeFlagsEXT,
+    p_callback_data: *const vk::DebugUtilsMessengerCallbackDataEXT,
+    _user_data: *mut std::os::raw::c_void,
+) -> vk::Bool32 {
+    let callback_data = &*p_callback_data;
+    let message_id_number: i32 = callback_data.message_id_number as i32;
+
+    let message_id_name = if callback_data.p_message_id_name.is_null() {
+        Cow::from("")
+    } else {
+        CStr::from_ptr(callback_data.p_message_id_name).to_string_lossy()
+    };
+
+    let message = if callback_data.p_message.is_null() {
+        Cow::from("")
+    } else {
+        CStr::from_ptr(callback_data.p_message).to_string_lossy()
+    };
+
+    println!(
+        "{:?}:\n{:?} [{} ({})] : {}\n",
+        message_severity,
+        message_type,
+        message_id_name,
+        message_id_number,
+        message,
+    );
+
+    vk::FALSE
+}
+
+static LAYERS: Lazy<Vec<&'static CStr>> = Lazy::new(|| {
+    let mut layers: Vec<&'static CStr> = vec![];
+    if cfg!(debug_assertions) {
+        layers.push(CStr::from_bytes_with_nul(b"VK_LAYER_KHRONOS_validation\0").unwrap());
+    }
+    layers
+});
+
+static EXTS: Lazy<Vec<&'static CStr>> = Lazy::new(|| {
+    let mut exts: Vec<&'static CStr> = vec![];
+    if cfg!(debug_assertions) {
+        exts.push(DebugUtils::name());
+    }
+    exts
+});
+
 impl VkInstance {
     /// Create a new instance.
     ///
@@ -70,18 +123,74 @@
         unsafe {
             let app_name = CString::new("VkToy").unwrap();
             let entry = Entry::new()?;
+
+            let exist_layers = entry.enumerate_instance_layer_properties()?;
+            let layers = LAYERS
+                .iter()
+                .filter_map(|&lyr| {
+                    exist_layers
+                        .iter()
+                        .find(|x| CStr::from_ptr(x.layer_name.as_ptr()) == lyr)
+                        .map(|_| lyr.as_ptr())
+                        .or_else(|| {
+                            println!(
+                                "Unable to find layer: {}, have you installed the Vulkan SDK?",
+                                lyr.to_string_lossy()
+                            );
+                            None
+                        })
+                })
+                .collect::<Vec<_>>();
+
+            let exist_exts = entry.enumerate_instance_extension_properties()?;
+            let exts = EXTS
+                .iter()
+                .filter_map(|&ext| {
+                    exist_exts
+                        .iter()
+                        .find(|x| CStr::from_ptr(x.extension_name.as_ptr()) == ext)
+                        .map(|_| ext.as_ptr())
+                        .or_else(|| {
+                            println!(
+                                "Unable to find extension: {}, have you installed the Vulkan SDK?",
+                                ext.to_string_lossy()
+                            );
+                            None
+                        })
+                })
+                .collect::<Vec<_>>();
+
             let instance = entry.create_instance(
-                &vk::InstanceCreateInfo::builder().application_info(
-                    &vk::ApplicationInfo::builder()
-                        .application_name(&app_name)
-                        .application_version(0)
-                        .engine_name(&app_name)
-                        .api_version(vk::make_version(1, 0, 0)),
-                ),
+                &vk::InstanceCreateInfo::builder()
+                    .application_info(
+                        &vk::ApplicationInfo::builder()
+                            .application_name(&app_name)
+                            .application_version(0)
+                            .engine_name(&app_name)
+                            .api_version(vk::make_version(1, 0, 0)),
+                    )
+                    .enabled_layer_names(&layers)
+                    .enabled_extension_names(&exts),
                 None,
             )?;
 
-            Ok(VkInstance { entry, instance })
+            let (_dbg_loader, _dbg_callbk) = if cfg!(debug_assertions) {
+                let dbg_info = vk::DebugUtilsMessengerCreateInfoEXT::builder()
+                    .message_severity(
+                        vk::DebugUtilsMessageSeverityFlagsEXT::ERROR
+                            | vk::DebugUtilsMessageSeverityFlagsEXT::WARNING,
+                    )
+                    .message_type(vk::DebugUtilsMessageTypeFlagsEXT::all())
+                    .pfn_user_callback(Some(vulkan_debug_callback));
+                let dbg_loader = DebugUtils::new(&entry, &instance);
+                let dbg_callbk = dbg_loader
+                    .create_debug_utils_messenger(&dbg_info, None)
+                    .unwrap();
+                (Some(dbg_loader), Some(dbg_callbk))
+            } else {
+                (None, None)
+            };
+
+            Ok(VkInstance {
+                entry,
+                instance,
+                _dbg_loader,
+                _dbg_callbk,
+            })
         }
     }
 
@@ -467,6 +576,16 @@
         );
     }
 
+    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
+        let device = &self.device.device;
+        device.cmd_reset_query_pool(
+            self.cmd_buf,
+            pool.pool,
+            0,
+            pool.n_queries,
+        );
+    }
+
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
         let device = &self.device.device;
         device.cmd_write_timestamp(
diff --git a/piet-gpu-types/Cargo.toml b/piet-gpu-types/Cargo.toml
index 6de92a5..629cd62 100644
--- a/piet-gpu-types/Cargo.toml
+++ b/piet-gpu-types/Cargo.toml
@@ -9,3 +9,4 @@
 
 [dependencies]
 piet-gpu-derive = { path = "../piet-gpu-derive" }
+half = "1.5.0"
diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs
index 44d4843..db9516f 100644
--- a/piet-gpu-types/src/lib.rs
+++ b/piet-gpu-types/src/lib.rs
@@ -1,4 +1,6 @@
 pub mod encoder;
 pub mod ptcl;
 pub mod scene;
+pub mod segment;
+pub mod test;
 pub mod tilegroup;
diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs
index 7ed941f..834f1b6 100644
--- a/piet-gpu-types/src/main.rs
+++ b/piet-gpu-types/src/main.rs
@@ -6,7 +6,9 @@
     match mod_name.as_str() {
         "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
         "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
+        "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
         "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
+        "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
         _ => println!("Oops, unknown module name"),
     }
 }
diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
index f5e42af..3faffb9 100644
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@@ -4,16 +4,19 @@
     #[gpu_write]
     mod ptcl {
         struct CmdCircle {
-            // In existing code, this is packed; we might need an annotation for this.
-            bbox: [u16; 4],
+            center: [f32; 2],
+            radius: f32,
+            rgba_color: u32,
         }
         struct CmdLine {
             start: [f32; 2],
             end: [f32; 2],
         }
         struct CmdStroke {
-            // In existing code, this is f16. Should we have support?
-            halfWidth: f32,
+            n_segs: u32,
+            // Should be Ref<Segment> if we had cross-module references.
+            seg_ref: u32,
+            half_width: f32,
             rgba_color: u32,
         }
         struct CmdFill {
@@ -32,6 +35,9 @@
         struct CmdSolid {
             rgba_color: u32,
         }
+        struct CmdJump {
+            new_ref: u32,
+        }
         enum Cmd {
             End,
             Circle(CmdCircle),
@@ -41,6 +47,7 @@
             FillEdge(CmdFillEdge),
             DrawFill(CmdDrawFill),
             Solid(CmdSolid),
+            Jump(CmdJump),
             Bail,
         }
     }
diff --git a/piet-gpu-types/src/segment.rs b/piet-gpu-types/src/segment.rs
new file mode 100644
index 0000000..ba5f3e2
--- /dev/null
+++ b/piet-gpu-types/src/segment.rs
@@ -0,0 +1,27 @@
+use piet_gpu_derive::piet_gpu;
+
+// Structures representing segments for stroke/fill items.
+
+piet_gpu! {
+    #[gpu_write]
+    mod segment {
+        struct TileHeader {
+            n: u32,
+            items: Ref<ItemHeader>,
+        }
+
+        // Note: this is only suitable for strokes, fills require backdrop.
+        struct ItemHeader {
+            n: u32,
+            segments: Ref<Segment>,
+        }
+
+        // TODO: strongly consider using f16. If so, these would be
+        // relative to the tile. We're doing f32 for now to minimize
+        // divergence from piet-metal originals.
+        struct Segment {
+            start: [f32; 2],
+            end: [f32; 2],
+        }
+    }
+}
diff --git a/piet-gpu-types/src/test.rs b/piet-gpu-types/src/test.rs
new file mode 100644
index 0000000..e92aaca
--- /dev/null
+++ b/piet-gpu-types/src/test.rs
@@ -0,0 +1,33 @@
+use piet_gpu_derive::piet_gpu;
+
+piet_gpu! {
+    #[rust_encode]
+    #[gpu_write]
+    mod test {
+        struct StructA {
+            a: f16,
+            b: f16,
+        }
+
+        struct StructB {
+            a: f16,
+            b: u16,
+            c: f16,
+        }
+
+        struct StructC {
+            a: f16,
+            b: u16,
+            c: u16,
+            d: f16,
+        }
+
+        struct StructD {
+            a: [f16; 2],
+        }
+
+        struct StructE {
+            a: [f16; 3],
+        }
+    }
+}
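
Worked packing example for StructE, the trickiest case above: a [f16; 3] occupies offsets 0, 2, and 4 (element stride is the 2-byte f16 size from parse.rs), so elements 0 and 1 share the first 32-bit word via the packHalf2x16 pair path, while element 2 lands in the low half of the second word via the masked single-f16 path in gen_pack_bits_scalar. A host-side sketch under those assumptions, not the generated code itself:

    use half::f16;

    // Pack [f16; 3] the way the generated writer would: elements 0 and 1
    // share word 0; element 2 goes in the low half of word 1 with the high
    // bits left zero.
    fn pack_struct_e(a: [f32; 3]) -> [u32; 2] {
        let h: Vec<u16> = a.iter().map(|&v| f16::from_f32(v).to_bits()).collect();
        let word0 = (h[0] as u32) | ((h[1] as u32) << 16);
        let word1 = (h[2] as u32) & 0xffff;
        [word0, word1]
    }
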
diff --git a/piet-gpu-types/src/tilegroup.rs b/piet-gpu-types/src/tilegroup.rs
index 4824178..ea295d9 100644
--- a/piet-gpu-types/src/tilegroup.rs
+++ b/piet-gpu-types/src/tilegroup.rs
@@ -1,5 +1,18 @@
 use piet_gpu_derive::piet_gpu;
 
+// Structures representing tilegroup instances (output of kernel 1).
+// There are three outputs: the main instances, the stroke instances,
+// and the fill instances. All three are conceptually a list of
+// instances, but the encoding is slightly different. The first is
+// encoded with Instance, Jump, and End. The other two are encoded
+// as a linked list of Chunk.
+
+// The motivation for the difference is that the first requires fewer
+// registers to track state, but the second contains information that
+// is useful up front for doing dynamic allocation in kernel 2, as
+// well as increasing read parallelism; the "jump" approach really is
+// geared to sequential reading.
+
 piet_gpu! {
     #[gpu_write]
     mod tilegroup {
@@ -10,8 +23,16 @@
             // A better type would be Point.
             offset: [f32; 2],
         }
+        struct Jump {
+            new_ref: Ref<TileGroup>,
+        }
+        struct Chunk {
+            chunk_n: u32,
+            next: Ref<Chunk>,
+        }
         enum TileGroup {
             Instance(Instance),
+            Jump(Jump),
             End,
         }
     }
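
To make the Chunk encoding above concrete: each chunk records how many instances follow its header plus a reference to the next chunk, and kernel1 terminates the list by writing ChunkRef(0) into the final chunk. A simplified host-side traversal sketch; the types here are stand-ins, not the generated ones:

    // Host-side stand-ins for the generated types.
    struct Chunk {
        chunk_n: u32, // number of instances in this chunk
        next: u32,    // byte offset of the next chunk; 0 terminates the list
    }

    fn count_instances(read_chunk: impl Fn(u32) -> Chunk, first: u32) -> u32 {
        let mut total = 0;
        let mut offset = first;
        loop {
            let chunk = read_chunk(offset);
            total += chunk.chunk_n;
            if chunk.next == 0 {
                break;
            }
            offset = chunk.next;
        }
        total
    }
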
diff --git a/piet-gpu/Cargo.toml b/piet-gpu/Cargo.toml
index 4b7a7e9..b082868 100644
--- a/piet-gpu/Cargo.toml
+++ b/piet-gpu/Cargo.toml
@@ -13,5 +13,7 @@
 path = "../piet-gpu-types"
 
 [dependencies]
+kurbo = "0.5.11"
+piet = "0.0.12"
 png = "0.16.2"
 rand = "0.7.3"
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 5befa7f..3da40c9 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -9,4 +9,10 @@
 
 build image.spv: glsl image.comp | scene.h
 
-build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h
+build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
+
+build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
+
+build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h setup.h
+
+build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
diff --git a/piet-gpu/shader/image.comp b/piet-gpu/shader/image.comp
index 60739d5..6d84eb5 100644
--- a/piet-gpu/shader/image.comp
+++ b/piet-gpu/shader/image.comp
@@ -40,7 +40,7 @@
         if (tag == PietItem_Circle) {
             PietCircle circle = PietItem_Circle_read(item_ref);
             float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
-            float alpha = clamp(circle.radius - r, 0.0, 1.0);
+            float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
             vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color);
             // TODO: sRGB
             rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
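
The 0.5 bias added above centers the one-pixel antialiasing ramp on the circle boundary: a pixel whose center lies exactly on the edge (r == radius) now receives coverage 0.5 instead of 0. The same ramp, isolated as a sketch:

    // Coverage ramp for a solid circle: 0 outside, 1 inside, with a linear
    // one-pixel transition centered on r == radius.
    fn circle_alpha(radius: f32, r: f32) -> f32 {
        (0.5 + radius - r).clamp(0.0, 1.0)
    }
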
diff --git a/piet-gpu/shader/image.spv b/piet-gpu/shader/image.spv
index 527c9ae..097add1 100644
--- a/piet-gpu/shader/image.spv
+++ b/piet-gpu/shader/image.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel1.comp b/piet-gpu/shader/kernel1.comp
index 436b8bd..ce99005 100644
--- a/piet-gpu/shader/kernel1.comp
+++ b/piet-gpu/shader/kernel1.comp
@@ -1,3 +1,15 @@
+// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
+// and outputs "instances" (references to item + translation) for each item
+// that intersects the tilegroup.
+//
+// This implementation is simplistic and leaves a lot of performance on the
+// table. A fancier implementation would use threadgroup shared memory or
+// subgroups (or possibly both) to parallelize the reading of the input and
+// the computation of tilegroup intersection.
+//
+// In addition, there are some features currently missing, such as support
+// for clipping.
+
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
@@ -12,16 +24,14 @@
     uint[] tilegroup;
 };
 
+layout(set = 0, binding = 2) buffer AllocBuf {
+    uint alloc;
+};
+
 #include "scene.h"
 #include "tilegroup.h"
 
-// TODO: compute this
-#define WIDTH_IN_TILEGROUPS 4
-
-#define TILEGROUP_WIDTH 512
-#define TILEGROUP_HEIGHT 16
-
-#define INITIAL_ALLOC 1024
+#include "setup.h"
 
 #define MAX_STACK 8
 
@@ -35,8 +45,18 @@
     StackElement stack[MAX_STACK];
     uint stack_ix = 0;
     uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
-    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * INITIAL_ALLOC);
-    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH, TILEGROUP_HEIGHT);
+    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
+    uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
+
+    // State for stroke references.
+    TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
+    ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
+    InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
+    uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_ALLOC - Instance_size;
+    uint stroke_chunk_n = 0;
+    uint stroke_n = 0;
+
+    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
     PietItemRef root = PietItemRef(0);
     SimpleGroup group = PietItem_Group_read(root);
     StackElement tos = StackElement(root, 0, group.offset.xy);
@@ -45,19 +65,42 @@
         if (tos.index < group.n_items) {
             Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
             vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
-            bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH))
-                && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT));
+            bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
+                && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
             bool is_group = false;
+            uint tag;
             if (hit) {
                 PietItemRef item_ref = PietItem_index(group.items, tos.index);
-                is_group = PietItem_tag(item_ref) == PietItem_Group;
+                tag = PietItem_tag(item_ref);
+                is_group = tag == PietItem_Group;
             }
             if (hit && !is_group) {
                 PietItemRef item_ref = PietItem_index(group.items, tos.index);
                 Instance ins = Instance(item_ref.offset, tos.offset);
+                if (tg_ref.offset > tg_limit) {
+                    // Allocation exceeded; do atomic bump alloc.
+                    uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
+                    Jump jump = Jump(TileGroupRef(new_tg));
+                    TileGroup_Jump_write(tg_ref, jump);
+                    tg_ref = TileGroupRef(new_tg);
+                    tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
+                }
                 TileGroup_Instance_write(tg_ref, ins);
                 tg_ref.offset += TileGroup_size;
-                // TODO: bump allocate if allocation exceeded
+                if (tag == PietItem_Poly) {
+                    if (stroke_ref.offset > stroke_limit) {
+                        uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
+                        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
+                        stroke_chunk_start = ChunkRef(new_stroke);
+                        stroke_ref = InstanceRef(new_stroke + Chunk_size);
+                        stroke_n += stroke_chunk_n;
+                        stroke_chunk_n = 0;
+                        stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
+                    }
+                    Instance_write(stroke_ref, ins);
+                    stroke_chunk_n++;
+                    stroke_ref.offset += Instance_size;
+                }
             }
             if (is_group) {
                 PietItemRef item_ref = PietItem_index(group.items, tos.index);
@@ -80,4 +123,10 @@
         }
     }
     TileGroup_End_write(tg_ref);
+
+    stroke_n += stroke_chunk_n;
+    if (stroke_n > 0) {
+        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
+    }
+    tilegroup[stroke_start.offset >> 2] = stroke_n;
 }
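
The allocation scheme above, summarized: each tilegroup begins with a fixed slice of the buffer, and when a write would overrun it, the kernel reserves a fresh block with atomicAdd on a global bump pointer, linking to it via Jump (main list) or Chunk (stroke list). A minimal host-side emulation of the bump pointer:

    use std::sync::atomic::{AtomicU32, Ordering};

    struct BumpAlloc {
        head: AtomicU32, // next free byte offset in the shared buffer
    }

    impl BumpAlloc {
        // Equivalent of the shader's atomicAdd(alloc, size): returns the old
        // value, which is the start of the newly reserved block.
        fn alloc(&self, size: u32) -> u32 {
            self.head.fetch_add(size, Ordering::Relaxed)
        }
    }
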
diff --git a/piet-gpu/shader/kernel1.spv b/piet-gpu/shader/kernel1.spv
index 0e9a497..8430d74 100644
--- a/piet-gpu/shader/kernel1.spv
+++ b/piet-gpu/shader/kernel1.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel2s.comp b/piet-gpu/shader/kernel2s.comp
new file mode 100644
index 0000000..3eb2d00
--- /dev/null
+++ b/piet-gpu/shader/kernel2s.comp
@@ -0,0 +1,127 @@
+// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
+// (polyline) items in the scene and generates a list of segments for each, for
+// each tile.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+layout(local_size_x = 32) in;
+
+layout(set = 0, binding = 0) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+layout(set = 0, binding = 1) buffer TilegroupBuf {
+    uint[] tilegroup;
+};
+
+layout(set = 0, binding = 2) buffer SegmentBuf {
+    uint[] segment;
+};
+
+layout(set = 0, binding = 3) buffer AllocBuf {
+    uint alloc;
+};
+
+#include "scene.h"
+#include "tilegroup.h"
+#include "segment.h"
+
+#include "setup.h"
+
+void main() {
+    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
+    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
+    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
+    TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
+    uint stroke_n = tilegroup[stroke_start.offset >> 2];
+
+    TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
+    if (stroke_n > 0) {
+        ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
+        Chunk chunk = Chunk_read(chunk_ref);
+        InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
+        ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
+        TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
+        SegmentRef seg_ref = SegmentRef(0);
+        uint seg_limit = 0;
+        // Iterate through items; stroke_n holds count remaining.
+        while (true) {
+            if (chunk.chunk_n == 0) {
+                chunk_ref = chunk.next;
+                chunk = Chunk_read(chunk_ref);
+                stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
+            }
+            Instance ins = Instance_read(stroke_ref);
+            PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));
+
+            // Process the stroke polyline item.
+            uint max_n_segs = poly.n_points - 1;
+            uint reserve = max_n_segs * Segment_size;
+            if (seg_ref.offset + reserve > seg_limit) {
+                // This is a heuristic to balance atomic bandwidth and utilization.
+                // The output always gets a contiguous allocation. We might use
+                // all, some, or none of the capacity.
+                uint capacity_bytes = stroke_n > 1 ? reserve * 2 + 128 : reserve;
+                seg_ref.offset = atomicAdd(alloc, capacity_bytes);
+                seg_limit = seg_ref.offset + capacity_bytes;
+            }
+            uint n_segs = 0;
+            vec2 start = Point_read(poly.points).xy;
+            for (uint j = 0; j < max_n_segs; j++) {
+                poly.points.offset += Point_size;
+                vec2 end = Point_read(poly.points).xy;
+
+                // Process one segment.
+
+                // This logic just tests for collision. What we probably want to do
+                // is a clipping algorithm like Liang-Barsky, and then store coords
+                // relative to the tile in f16. See also:
+                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
+
+                // Also note that when we go to the fancy version, we want to compute
+                // the (horizontal projection of) the bounding box of the intersection
+                // once per tilegroup, so we can assign work to individual tiles.
+
+                float a = end.y - start.y;
+                float b = start.x - end.x;
+                float c = -(a * start.x + b * start.y);
+                float half_width = 0.5 * poly.width;
+                // Tile boundaries padded by half-width.
+                float xmin = xy0.x - half_width;
+                float ymin = xy0.y - half_width;
+                float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
+                float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;
+                float s00 = sign(b * ymin + a * xmin + c);
+                float s01 = sign(b * ymin + a * xmax + c);
+                float s10 = sign(b * ymax + a * xmin + c);
+                float s11 = sign(b * ymax + a * xmax + c);
+                // If bounding boxes intersect and not all four corners are on the same side, hit.
+                // Also note: this is designed to be false on NaN input.
+                if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
+                    && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
+                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
+                {
+                    Segment seg = Segment(start, end);
+                    Segment_write(Segment_index(seg_ref, n_segs), seg);
+                    n_segs++;
+                }
+
+                start = end;
+            }
+            ItemHeader_write(item_header, ItemHeader(n_segs, seg_ref));
+            if (--stroke_n == 0) {
+                break;
+            }
+            seg_ref.offset += n_segs * Segment_size;
+
+            stroke_ref.offset += Instance_size;
+            chunk.chunk_n--;
+            item_header.offset += ItemHeader_size;
+        }
+    } else {
+        // As an optimization, we could just write 0 for the size.
+        TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
+    }
+}
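
The hit test above, transcribed to Rust for clarity: the tile is padded by the stroke half-width, and a segment hits when the bounding boxes overlap and the four padded-tile corners do not all lie strictly on one side of the infinite line through the segment. The sign-product sum equals 3.0 exactly when all corners share the sign of s00, and NaN makes the final comparison false. Parameter names are mine:

    // GLSL-style sign(): 0.0 for 0.0, and NaN propagates (both comparisons
    // are false, so x is returned unchanged).
    fn sign(x: f32) -> f32 {
        if x > 0.0 { 1.0 } else if x < 0.0 { -1.0 } else { x }
    }

    fn segment_hits_tile(
        start: (f32, f32),
        end: (f32, f32),
        xy0: (f32, f32),  // tile origin in pixels
        tile: (f32, f32), // tile width and height in pixels
        half_width: f32,
    ) -> bool {
        // Implicit line equation a*x + b*y + c = 0 through the segment.
        let a = end.1 - start.1;
        let b = start.0 - end.0;
        let c = -(a * start.0 + b * start.1);
        // Tile bounds padded by the stroke half-width.
        let (xmin, ymin) = (xy0.0 - half_width, xy0.1 - half_width);
        let (xmax, ymax) = (xy0.0 + tile.0 + half_width, xy0.1 + tile.1 + half_width);
        let s00 = sign(b * ymin + a * xmin + c);
        let s01 = sign(b * ymin + a * xmax + c);
        let s10 = sign(b * ymax + a * xmin + c);
        let s11 = sign(b * ymax + a * xmax + c);
        start.0.min(end.0).max(xmin) < start.0.max(end.0).min(xmax)
            && start.1.min(end.1).max(ymin) < start.1.max(end.1).min(ymax)
            && s00 * s01 + s00 * s10 + s00 * s11 < 3.0
    }
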
diff --git a/piet-gpu/shader/kernel2s.spv b/piet-gpu/shader/kernel2s.spv
new file mode 100644
index 0000000..7c7f48f
--- /dev/null
+++ b/piet-gpu/shader/kernel2s.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel3.comp b/piet-gpu/shader/kernel3.comp
new file mode 100644
index 0000000..fc4f9ea
--- /dev/null
+++ b/piet-gpu/shader/kernel3.comp
@@ -0,0 +1,107 @@
+// This is "kernel 3" in a 4-kernel pipeline. It walks the active items
+// for the tilegroup and produces a per-tile command list for each tile.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+layout(local_size_x = 32, local_size_y = 1) in;
+
+layout(set = 0, binding = 0) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+// TODO: this should have a `readonly` qualifier, but then inclusion
+// of ptcl.h would fail because of the writers.
+layout(set = 0, binding = 1) buffer TilegroupBuf {
+    uint[] tilegroup;
+};
+
+// Used readonly
+layout(set = 0, binding = 2) buffer SegmentBuf {
+    uint[] segment;
+};
+
+layout(set = 0, binding = 3) buffer PtclBuf {
+    uint[] ptcl;
+};
+
+layout(set = 0, binding = 4) buffer AllocBuf {
+    uint alloc;
+};
+
+#include "scene.h"
+#include "tilegroup.h"
+#include "segment.h"
+#include "ptcl.h"
+
+#include "setup.h"
+
+void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
+    if (cmd_ref.offset > cmd_limit) {
+        uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
+        CmdJump jump = CmdJump(new_cmd);
+        Cmd_Jump_write(cmd_ref, jump);
+        cmd_ref = CmdRef(new_cmd);
+        cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+    }
+}
+
+void main() {
+    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
+    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
+    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
+    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
+    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
+    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+
+    TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
+
+    while (true) {
+        uint tg_tag = TileGroup_tag(tg_ref);
+        if (tg_tag == TileGroup_End) {
+            break;
+        }
+        if (tg_tag == TileGroup_Jump) {
+            tg_ref = TileGroup_Jump_read(tg_ref).new_ref;
+            continue;
+        }
+        // Assume tg_tag is `Instance`, though there will be more cases.
+        Instance ins = TileGroup_Instance_read(tg_ref);
+        PietItemRef item_ref = PietItemRef(ins.item_ref);
+        uint item_tag = PietItem_tag(item_ref);
+        switch (item_tag) {
+        case PietItem_Circle:
+            PietCircle circle = PietItem_Circle_read(item_ref);
+            vec2 center = ins.offset + circle.center.xy;
+            float r = circle.radius;
+            if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX))
+                && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
+            {
+                CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
+                alloc_cmd(cmd_ref, cmd_limit);
+                Cmd_Circle_write(cmd_ref, cmd);
+                cmd_ref.offset += Cmd_size;
+            }
+            break;
+        case PietItem_Poly:
+            ItemHeader stroke_item = ItemHeader_read(stroke_th.items);
+            stroke_th.items.offset += ItemHeader_size;
+            if (stroke_item.n > 0) {
+                PietStrokePolyLine poly = PietItem_Poly_read(item_ref);
+                CmdStroke cmd = CmdStroke(
+                    stroke_item.n,
+                    stroke_item.segments.offset,
+                    0.5 * poly.width,
+                    poly.rgba_color
+                );
+                alloc_cmd(cmd_ref, cmd_limit);
+                Cmd_Stroke_write(cmd_ref, cmd);
+                cmd_ref.offset += Cmd_size;
+            }
+            break;
+        }
+        tg_ref.offset += TileGroup_size;
+    }
+    Cmd_End_write(cmd_ref);
+}
diff --git a/piet-gpu/shader/kernel3.spv b/piet-gpu/shader/kernel3.spv
new file mode 100644
index 0000000..f5b83bc
--- /dev/null
+++ b/piet-gpu/shader/kernel3.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
new file mode 100644
index 0000000..931f28b
--- /dev/null
+++ b/piet-gpu/shader/kernel4.comp
@@ -0,0 +1,79 @@
+// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
+// in the per-tile command list to an image.
+
+// Right now, this kernel stores the image in a buffer, but a better
+// plan is to use a texture; the buffer is a workaround for currently
+// limited image support.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+layout(local_size_x = 16, local_size_y = 16) in;
+
+// Same concern that this should be readonly as in kernel 3.
+layout(set = 0, binding = 0) buffer PtclBuf {
+    uint[] ptcl;
+};
+
+// Used readonly
+layout(set = 0, binding = 1) buffer SegmentBuf {
+    uint[] segment;
+};
+
+layout(set = 0, binding = 2) buffer ImageBuf {
+    uint[] image;
+};
+
+#include "ptcl.h"
+#include "segment.h"
+
+#include "setup.h"
+
+void main() {
+    uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x;
+    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
+
+    uvec2 xy_uint = gl_GlobalInvocationID.xy;
+    vec2 xy = vec2(xy_uint);
+    vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
+    vec3 rgb = uv.xyy;
+
+    while (true) {
+        uint tag = Cmd_tag(cmd_ref);
+        if (tag == Cmd_End) {
+            break;
+        }
+        switch (tag) {
+        case Cmd_Circle:
+            CmdCircle circle = Cmd_Circle_read(cmd_ref);
+            float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
+            float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
+            vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
+            // TODO: sRGB
+            rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
+            break;
+        case Cmd_Stroke:
+            CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
+            float df = 1e9;
+            for (int i = 0; i < stroke.n_segs; i++) {
+                Segment seg = Segment_read(Segment_index(SegmentRef(stroke.seg_ref), i));
+                vec2 line_vec = seg.end - seg.start;
+                vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
+                float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
+                df = min(df, length(line_vec * t - dpos));
+            }
+            fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
+            alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0);
+            rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
+            break;
+        case Cmd_Jump:
+            cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
+            continue;
+        }
+        cmd_ref.offset += Cmd_size;
+    }
+
+    // TODO: sRGB
+    uvec4 s = uvec4(round(vec4(rgb, 1.0) * 255.0));
+    uint rgba_packed = s.r | (s.g << 8) | (s.b << 16) | (s.a << 24);
+    image[xy_uint.y * IMAGE_WIDTH + xy_uint.x] = rgba_packed;
+}
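
The distance computation in the Cmd_Stroke case, in isolation: project the pixel center onto the segment, clamp the parameter to [0, 1], and measure to the nearest point; alpha then ramps over one pixel around half_width, matching the circle case. A sketch:

    // Distance from point p to the segment [start, end]; a degenerate
    // zero-length segment yields NaN, as in the shader.
    fn dist_to_segment(p: (f32, f32), start: (f32, f32), end: (f32, f32)) -> f32 {
        let line = (end.0 - start.0, end.1 - start.1);
        let dpos = (p.0 - start.0, p.1 - start.1);
        let len2 = line.0 * line.0 + line.1 * line.1;
        let t = ((line.0 * dpos.0 + line.1 * dpos.1) / len2).clamp(0.0, 1.0);
        let (dx, dy) = (line.0 * t - dpos.0, line.1 * t - dpos.1);
        (dx * dx + dy * dy).sqrt()
    }
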
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
new file mode 100644
index 0000000..b931f23
--- /dev/null
+++ b/piet-gpu/shader/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
new file mode 100644
index 0000000..8b62538
--- /dev/null
+++ b/piet-gpu/shader/ptcl.h
@@ -0,0 +1,368 @@
+// Code auto-generated by piet-gpu-derive
+
+struct CmdCircleRef {
+    uint offset;
+};
+
+struct CmdLineRef {
+    uint offset;
+};
+
+struct CmdStrokeRef {
+    uint offset;
+};
+
+struct CmdFillRef {
+    uint offset;
+};
+
+struct CmdFillEdgeRef {
+    uint offset;
+};
+
+struct CmdDrawFillRef {
+    uint offset;
+};
+
+struct CmdSolidRef {
+    uint offset;
+};
+
+struct CmdJumpRef {
+    uint offset;
+};
+
+struct CmdRef {
+    uint offset;
+};
+
+struct CmdCircle {
+    vec2 center;
+    float radius;
+    uint rgba_color;
+};
+
+#define CmdCircle_size 16
+
+CmdCircleRef CmdCircle_index(CmdCircleRef ref, uint index) {
+    return CmdCircleRef(ref.offset + index * CmdCircle_size);
+}
+
+struct CmdLine {
+    vec2 start;
+    vec2 end;
+};
+
+#define CmdLine_size 16
+
+CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
+    return CmdLineRef(ref.offset + index * CmdLine_size);
+}
+
+struct CmdStroke {
+    uint n_segs;
+    uint seg_ref;
+    float half_width;
+    uint rgba_color;
+};
+
+#define CmdStroke_size 16
+
+CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
+    return CmdStrokeRef(ref.offset + index * CmdStroke_size);
+}
+
+struct CmdFill {
+    vec2 start;
+    vec2 end;
+};
+
+#define CmdFill_size 16
+
+CmdFillRef CmdFill_index(CmdFillRef ref, uint index) {
+    return CmdFillRef(ref.offset + index * CmdFill_size);
+}
+
+struct CmdFillEdge {
+    int sign;
+    float y;
+};
+
+#define CmdFillEdge_size 8
+
+CmdFillEdgeRef CmdFillEdge_index(CmdFillEdgeRef ref, uint index) {
+    return CmdFillEdgeRef(ref.offset + index * CmdFillEdge_size);
+}
+
+struct CmdDrawFill {
+    int backdrop;
+    uint rgba_color;
+};
+
+#define CmdDrawFill_size 8
+
+CmdDrawFillRef CmdDrawFill_index(CmdDrawFillRef ref, uint index) {
+    return CmdDrawFillRef(ref.offset + index * CmdDrawFill_size);
+}
+
+struct CmdSolid {
+    uint rgba_color;
+};
+
+#define CmdSolid_size 4
+
+CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) {
+    return CmdSolidRef(ref.offset + index * CmdSolid_size);
+}
+
+struct CmdJump {
+    uint new_ref;
+};
+
+#define CmdJump_size 4
+
+CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
+    return CmdJumpRef(ref.offset + index * CmdJump_size);
+}
+
+#define Cmd_End 0
+#define Cmd_Circle 1
+#define Cmd_Line 2
+#define Cmd_Fill 3
+#define Cmd_Stroke 4
+#define Cmd_FillEdge 5
+#define Cmd_DrawFill 6
+#define Cmd_Solid 7
+#define Cmd_Jump 8
+#define Cmd_Bail 9
+#define Cmd_size 20
+
+CmdRef Cmd_index(CmdRef ref, uint index) {
+    return CmdRef(ref.offset + index * Cmd_size);
+}
+
+CmdCircle CmdCircle_read(CmdCircleRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    uint raw2 = ptcl[ix + 2];
+    uint raw3 = ptcl[ix + 3];
+    CmdCircle s;
+    s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.radius = uintBitsToFloat(raw2);
+    s.rgba_color = raw3;
+    return s;
+}
+
+void CmdCircle_write(CmdCircleRef ref, CmdCircle s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.center.x);
+    ptcl[ix + 1] = floatBitsToUint(s.center.y);
+    ptcl[ix + 2] = floatBitsToUint(s.radius);
+    ptcl[ix + 3] = s.rgba_color;
+}
+
+CmdLine CmdLine_read(CmdLineRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    uint raw2 = ptcl[ix + 2];
+    uint raw3 = ptcl[ix + 3];
+    CmdLine s;
+    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    return s;
+}
+
+void CmdLine_write(CmdLineRef ref, CmdLine s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.start.x);
+    ptcl[ix + 1] = floatBitsToUint(s.start.y);
+    ptcl[ix + 2] = floatBitsToUint(s.end.x);
+    ptcl[ix + 3] = floatBitsToUint(s.end.y);
+}
+
+CmdStroke CmdStroke_read(CmdStrokeRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    uint raw2 = ptcl[ix + 2];
+    uint raw3 = ptcl[ix + 3];
+    CmdStroke s;
+    s.n_segs = raw0;
+    s.seg_ref = raw1;
+    s.half_width = uintBitsToFloat(raw2);
+    s.rgba_color = raw3;
+    return s;
+}
+
+void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = s.n_segs;
+    ptcl[ix + 1] = s.seg_ref;
+    ptcl[ix + 2] = floatBitsToUint(s.half_width);
+    ptcl[ix + 3] = s.rgba_color;
+}
+
+CmdFill CmdFill_read(CmdFillRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    uint raw2 = ptcl[ix + 2];
+    uint raw3 = ptcl[ix + 3];
+    CmdFill s;
+    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    return s;
+}
+
+void CmdFill_write(CmdFillRef ref, CmdFill s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.start.x);
+    ptcl[ix + 1] = floatBitsToUint(s.start.y);
+    ptcl[ix + 2] = floatBitsToUint(s.end.x);
+    ptcl[ix + 3] = floatBitsToUint(s.end.y);
+}
+
+CmdFillEdge CmdFillEdge_read(CmdFillEdgeRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    CmdFillEdge s;
+    s.sign = int(raw0);
+    s.y = uintBitsToFloat(raw1);
+    return s;
+}
+
+void CmdFillEdge_write(CmdFillEdgeRef ref, CmdFillEdge s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = uint(s.sign);
+    ptcl[ix + 1] = floatBitsToUint(s.y);
+}
+
+CmdDrawFill CmdDrawFill_read(CmdDrawFillRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    CmdDrawFill s;
+    s.backdrop = int(raw0);
+    s.rgba_color = raw1;
+    return s;
+}
+
+void CmdDrawFill_write(CmdDrawFillRef ref, CmdDrawFill s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = uint(s.backdrop);
+    ptcl[ix + 1] = s.rgba_color;
+}
+
+CmdSolid CmdSolid_read(CmdSolidRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    CmdSolid s;
+    s.rgba_color = raw0;
+    return s;
+}
+
+void CmdSolid_write(CmdSolidRef ref, CmdSolid s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = s.rgba_color;
+}
+
+CmdJump CmdJump_read(CmdJumpRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    CmdJump s;
+    s.new_ref = raw0;
+    return s;
+}
+
+void CmdJump_write(CmdJumpRef ref, CmdJump s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = s.new_ref;
+}
+
+uint Cmd_tag(CmdRef ref) {
+    return ptcl[ref.offset >> 2];
+}
+
+CmdCircle Cmd_Circle_read(CmdRef ref) {
+    return CmdCircle_read(CmdCircleRef(ref.offset + 4));
+}
+
+CmdLine Cmd_Line_read(CmdRef ref) {
+    return CmdLine_read(CmdLineRef(ref.offset + 4));
+}
+
+CmdFill Cmd_Fill_read(CmdRef ref) {
+    return CmdFill_read(CmdFillRef(ref.offset + 4));
+}
+
+CmdStroke Cmd_Stroke_read(CmdRef ref) {
+    return CmdStroke_read(CmdStrokeRef(ref.offset + 4));
+}
+
+CmdFillEdge Cmd_FillEdge_read(CmdRef ref) {
+    return CmdFillEdge_read(CmdFillEdgeRef(ref.offset + 4));
+}
+
+CmdDrawFill Cmd_DrawFill_read(CmdRef ref) {
+    return CmdDrawFill_read(CmdDrawFillRef(ref.offset + 4));
+}
+
+CmdSolid Cmd_Solid_read(CmdRef ref) {
+    return CmdSolid_read(CmdSolidRef(ref.offset + 4));
+}
+
+CmdJump Cmd_Jump_read(CmdRef ref) {
+    return CmdJump_read(CmdJumpRef(ref.offset + 4));
+}
+
+void Cmd_End_write(CmdRef ref) {
+    ptcl[ref.offset >> 2] = Cmd_End;
+}
+
+void Cmd_Circle_write(CmdRef ref, CmdCircle s) {
+    ptcl[ref.offset >> 2] = Cmd_Circle;
+    CmdCircle_write(CmdCircleRef(ref.offset + 4), s);
+}
+
+void Cmd_Line_write(CmdRef ref, CmdLine s) {
+    ptcl[ref.offset >> 2] = Cmd_Line;
+    CmdLine_write(CmdLineRef(ref.offset + 4), s);
+}
+
+void Cmd_Fill_write(CmdRef ref, CmdFill s) {
+    ptcl[ref.offset >> 2] = Cmd_Fill;
+    CmdFill_write(CmdFillRef(ref.offset + 4), s);
+}
+
+void Cmd_Stroke_write(CmdRef ref, CmdStroke s) {
+    ptcl[ref.offset >> 2] = Cmd_Stroke;
+    CmdStroke_write(CmdStrokeRef(ref.offset + 4), s);
+}
+
+void Cmd_FillEdge_write(CmdRef ref, CmdFillEdge s) {
+    ptcl[ref.offset >> 2] = Cmd_FillEdge;
+    CmdFillEdge_write(CmdFillEdgeRef(ref.offset + 4), s);
+}
+
+void Cmd_DrawFill_write(CmdRef ref, CmdDrawFill s) {
+    ptcl[ref.offset >> 2] = Cmd_DrawFill;
+    CmdDrawFill_write(CmdDrawFillRef(ref.offset + 4), s);
+}
+
+void Cmd_Solid_write(CmdRef ref, CmdSolid s) {
+    ptcl[ref.offset >> 2] = Cmd_Solid;
+    CmdSolid_write(CmdSolidRef(ref.offset + 4), s);
+}
+
+void Cmd_Jump_write(CmdRef ref, CmdJump s) {
+    ptcl[ref.offset >> 2] = Cmd_Jump;
+    CmdJump_write(CmdJumpRef(ref.offset + 4), s);
+}
+
+void Cmd_Bail_write(CmdRef ref) {
+    ptcl[ref.offset >> 2] = Cmd_Bail;
+}
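+
+// Sketch (an assumption, not part of the generated interface): a consumer of
+// the ptcl, such as a fine-rasterization kernel, would walk a tile's command
+// list roughly like this, following Cmd_Jump into continuation blocks:
+//
+//     CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
+//     while (true) {
+//         uint tag = Cmd_tag(cmd_ref);
+//         if (tag == Cmd_End) break;
+//         if (tag == Cmd_Jump) {
+//             cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
+//             continue;
+//         }
+//         // dispatch on tag, e.g. CmdCircle circle = Cmd_Circle_read(cmd_ref);
+//         cmd_ref = Cmd_index(cmd_ref, 1);
+//     }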
+
diff --git a/piet-gpu/shader/segment.h b/piet-gpu/shader/segment.h
new file mode 100644
index 0000000..517c115
--- /dev/null
+++ b/piet-gpu/shader/segment.h
@@ -0,0 +1,99 @@
+// Code auto-generated by piet-gpu-derive
+
+struct TileHeaderRef {
+    uint offset;
+};
+
+struct ItemHeaderRef {
+    uint offset;
+};
+
+struct SegmentRef {
+    uint offset;
+};
+
+struct TileHeader {
+    uint n;
+    ItemHeaderRef items;
+};
+
+#define TileHeader_size 8
+
+TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) {
+    return TileHeaderRef(ref.offset + index * TileHeader_size);
+}
+
+struct ItemHeader {
+    uint n;
+    SegmentRef segments;
+};
+
+#define ItemHeader_size 8
+
+ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) {
+    return ItemHeaderRef(ref.offset + index * ItemHeader_size);
+}
+
+struct Segment {
+    vec2 start;
+    vec2 end;
+};
+
+#define Segment_size 16
+
+SegmentRef Segment_index(SegmentRef ref, uint index) {
+    return SegmentRef(ref.offset + index * Segment_size);
+}
+
+TileHeader TileHeader_read(TileHeaderRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = segment[ix + 0];
+    uint raw1 = segment[ix + 1];
+    TileHeader s;
+    s.n = raw0;
+    s.items = ItemHeaderRef(raw1);
+    return s;
+}
+
+void TileHeader_write(TileHeaderRef ref, TileHeader s) {
+    uint ix = ref.offset >> 2;
+    segment[ix + 0] = s.n;
+    segment[ix + 1] = s.items.offset;
+}
+
+ItemHeader ItemHeader_read(ItemHeaderRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = segment[ix + 0];
+    uint raw1 = segment[ix + 1];
+    ItemHeader s;
+    s.n = raw0;
+    s.segments = SegmentRef(raw1);
+    return s;
+}
+
+void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) {
+    uint ix = ref.offset >> 2;
+    segment[ix + 0] = s.n;
+    segment[ix + 1] = s.segments.offset;
+}
+
+Segment Segment_read(SegmentRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = segment[ix + 0];
+    uint raw1 = segment[ix + 1];
+    uint raw2 = segment[ix + 2];
+    uint raw3 = segment[ix + 3];
+    Segment s;
+    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    return s;
+}
+
+void Segment_write(SegmentRef ref, Segment s) {
+    uint ix = ref.offset >> 2;
+    segment[ix + 0] = floatBitsToUint(s.start.x);
+    segment[ix + 1] = floatBitsToUint(s.start.y);
+    segment[ix + 2] = floatBitsToUint(s.end.x);
+    segment[ix + 3] = floatBitsToUint(s.end.y);
+}
+
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
new file mode 100644
index 0000000..a644dc0
--- /dev/null
+++ b/piet-gpu/shader/setup.h
@@ -0,0 +1,34 @@
+// Various constants for the sizes of groups and tiles.
+
+// Much of this will be made dynamic in various ways, but for now it's easiest
+// to hardcode these values and keep them all in one place.
+
+// TODO: make the image size dynamic.
+#define IMAGE_WIDTH 2048
+#define IMAGE_HEIGHT 1536
+
+// TODO: compute this
+#define WIDTH_IN_TILEGROUPS 4
+
+#define TILEGROUP_WIDTH_PX 512
+#define TILEGROUP_HEIGHT_PX 16
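+
+// For reference, WIDTH_IN_TILEGROUPS above is IMAGE_WIDTH / TILEGROUP_WIDTH_PX
+// (2048 / 512 = 4); the matching HEIGHT_IN_TILEGROUPS is hardcoded on the host
+// side in main.rs as IMAGE_HEIGHT / TILEGROUP_HEIGHT_PX (1536 / 16 = 96).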
+
+#define TILEGROUP_INITIAL_ALLOC 1024
+
+// Quick note on the layout of tilegroups (k1 output): in the base buffer,
+// there is a region of size TILEGROUP_STRIDE for each tilegroup.
+// At offset 0 is the main item list, encoded as TileGroup instances with
+// Jump for continuation. At offset TILEGROUP_STROKE_START is the stroke
+// item list, encoded as a linked list of Chunks (see tilegroup.h).
+#define TILEGROUP_STRIDE 2048
+#define TILEGROUP_STROKE_START 1024
+#define TILEGROUP_STROKE_ALLOC 1024
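+
+// For example (a sketch, assuming row-major tilegroup order; the shaders do
+// their own indexing): the region for the tilegroup covering pixel (x, y)
+// would be located as
+//     uint tg_ix = (y / TILEGROUP_HEIGHT_PX) * WIDTH_IN_TILEGROUPS
+//                + (x / TILEGROUP_WIDTH_PX);
+//     TileGroupRef main_ref = TileGroupRef(tg_ix * TILEGROUP_STRIDE);
+//     TileGroupRef stroke_ref = TileGroupRef(main_ref.offset + TILEGROUP_STROKE_START);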
+
+// TODO: compute all these
+
+#define WIDTH_IN_TILES 128
+#define TILEGROUP_WIDTH_TILES 32
+#define TILE_WIDTH_PX 16
+#define TILE_HEIGHT_PX 16
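+
+// These are consistent with the image and tilegroup sizes above:
+// WIDTH_IN_TILES = IMAGE_WIDTH / TILE_WIDTH_PX = 2048 / 16 = 128, and
+// TILEGROUP_WIDTH_TILES = TILEGROUP_WIDTH_PX / TILE_WIDTH_PX = 512 / 16 = 32.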
+
+#define PTCL_INITIAL_ALLOC 1024
diff --git a/piet-gpu/shader/tilegroup.h b/piet-gpu/shader/tilegroup.h
index f1d646f..213ddc3 100644
--- a/piet-gpu/shader/tilegroup.h
+++ b/piet-gpu/shader/tilegroup.h
@@ -4,6 +4,14 @@
     uint offset;
 };
 
+struct JumpRef {
+    uint offset;
+};
+
+struct ChunkRef {
+    uint offset;
+};
+
 struct TileGroupRef {
     uint offset;
 };
@@ -19,8 +27,30 @@
     return InstanceRef(ref.offset + index * Instance_size);
 }
 
+struct Jump {
+    TileGroupRef new_ref;
+};
+
+#define Jump_size 4
+
+JumpRef Jump_index(JumpRef ref, uint index) {
+    return JumpRef(ref.offset + index * Jump_size);
+}
+
+struct Chunk {
+    uint chunk_n;
+    ChunkRef next;
+};
+
+#define Chunk_size 8
+
+ChunkRef Chunk_index(ChunkRef ref, uint index) {
+    return ChunkRef(ref.offset + index * Chunk_size);
+}
+
 #define TileGroup_Instance 0
-#define TileGroup_End 1
+#define TileGroup_Jump 1
+#define TileGroup_End 2
 #define TileGroup_size 16
 
 TileGroupRef TileGroup_index(TileGroupRef ref, uint index) {
@@ -45,6 +75,35 @@
     tilegroup[ix + 2] = floatBitsToUint(s.offset.y);
 }
 
+Jump Jump_read(JumpRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = tilegroup[ix + 0];
+    Jump s;
+    s.new_ref = TileGroupRef(raw0);
+    return s;
+}
+
+void Jump_write(JumpRef ref, Jump s) {
+    uint ix = ref.offset >> 2;
+    tilegroup[ix + 0] = s.new_ref.offset;
+}
+
+Chunk Chunk_read(ChunkRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = tilegroup[ix + 0];
+    uint raw1 = tilegroup[ix + 1];
+    Chunk s;
+    s.chunk_n = raw0;
+    s.next = ChunkRef(raw1);
+    return s;
+}
+
+void Chunk_write(ChunkRef ref, Chunk s) {
+    uint ix = ref.offset >> 2;
+    tilegroup[ix + 0] = s.chunk_n;
+    tilegroup[ix + 1] = s.next.offset;
+}
+
 uint TileGroup_tag(TileGroupRef ref) {
     return tilegroup[ref.offset >> 2];
 }
@@ -53,11 +112,20 @@
     return Instance_read(InstanceRef(ref.offset + 4));
 }
 
+Jump TileGroup_Jump_read(TileGroupRef ref) {
+    return Jump_read(JumpRef(ref.offset + 4));
+}
+
 void TileGroup_Instance_write(TileGroupRef ref, Instance s) {
     tilegroup[ref.offset >> 2] = TileGroup_Instance;
     Instance_write(InstanceRef(ref.offset + 4), s);
 }
 
+void TileGroup_Jump_write(TileGroupRef ref, Jump s) {
+    tilegroup[ref.offset >> 2] = TileGroup_Jump;
+    Jump_write(JumpRef(ref.offset + 4), s);
+}
+
 void TileGroup_End_write(TileGroupRef ref) {
     tilegroup[ref.offset >> 2] = TileGroup_End;
 }
diff --git a/piet-gpu/src/main.rs b/piet-gpu/src/main.rs
index 72f0d3c..9f4f25f 100644
--- a/piet-gpu/src/main.rs
+++ b/piet-gpu/src/main.rs
@@ -4,11 +4,15 @@
 
 use rand::{Rng, RngCore};
 
+use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
+use piet::{Color, RenderContext};
+
 use piet_gpu_hal::vulkan::VkInstance;
 use piet_gpu_hal::{CmdBuf, Device, MemFlags};
 
-use piet_gpu_types::encoder::{Encode, Encoder};
-use piet_gpu_types::scene::{Bbox, PietCircle, PietItem, Point, SimpleGroup};
+mod render_ctx;
+
+use render_ctx::PietGpuRenderContext;
 
 const WIDTH: usize = 2048;
 const HEIGHT: usize = 1536;
@@ -16,52 +20,53 @@
 const TILE_W: usize = 16;
 const TILE_H: usize = 16;
 
-const N_CIRCLES: usize = 100;
+const WIDTH_IN_TILEGROUPS: usize = 4;
+const HEIGHT_IN_TILEGROUPS: usize = 96;
+const TILEGROUP_STRIDE: usize = 2048;
 
-fn make_scene() -> Vec<u8> {
+const WIDTH_IN_TILES: usize = 128;
+const HEIGHT_IN_TILES: usize = 96;
+const PTCL_INITIAL_ALLOC: usize = 1024;
+
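+// 8 bytes of kernel-2 output are reserved per tile (matching the 8-byte
+// per-tile header in segment.h); the k2s bump allocator below starts just
+// past this region.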
+const K2_PER_TILE_SIZE: usize = 8;
+
+const N_CIRCLES: usize = 1;
+
+fn render_scene(rc: &mut impl RenderContext) {
     let mut rng = rand::thread_rng();
-    let mut encoder = Encoder::new();
-    let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32);
-
-    let mut items = Vec::new();
-    let mut bboxes = Vec::new();
     for _ in 0..N_CIRCLES {
-        let circle = PietCircle {
-            rgba_color: rng.next_u32(),
-            center: Point {
-                xy: [
-                    rng.gen_range(0.0, WIDTH as f32),
-                    rng.gen_range(0.0, HEIGHT as f32),
-                ],
-            },
-            radius: rng.gen_range(0.0, 50.0),
-        };
-        let bbox = Bbox {
-            bbox: [
-                (circle.center.xy[0] - circle.radius).floor() as i16,
-                (circle.center.xy[1] - circle.radius).floor() as i16,
-                (circle.center.xy[0] + circle.radius).ceil() as i16,
-                (circle.center.xy[1] + circle.radius).ceil() as i16,
-            ],
-        };
-        items.push(PietItem::Circle(circle));
-        bboxes.push(bbox);
+        let color = Color::from_rgba32_u32(rng.next_u32());
+        let center = Point::new(
+            rng.gen_range(0.0, WIDTH as f64),
+            rng.gen_range(0.0, HEIGHT as f64),
+        );
+        let radius = rng.gen_range(0.0, 50.0);
+        let circle = Circle::new(center, radius);
+        rc.fill(circle, &color);
     }
+    rc.stroke(
+        Line::new((100.0, 100.0), (200.0, 150.0)),
+        &Color::WHITE,
+        5.0,
+    );
+    render_cardioid(rc);
+}
 
-    let n_items = bboxes.len() as u32;
-    let bboxes = bboxes.encode(&mut encoder).transmute();
-    let items = items.encode(&mut encoder).transmute();
-    let offset = Point { xy: [0.0, 0.0] };
-    let simple_group = SimpleGroup {
-        n_items,
-        bboxes,
-        items,
-        offset,
-    };
-    let root_item = PietItem::Group(simple_group);
-    root_item.encode_to(&mut encoder.buf_mut()[0..PietItem::fixed_size()]);
-    // We should avoid this clone.
-    encoder.buf().to_owned()
+fn render_cardioid(rc: &mut impl RenderContext) {
+    let n = 100;
+    let dth = std::f64::consts::PI * 2.0 / (n as f64);
+    let center = Point::new(1024.0, 768.0);
+    let r = 750.0;
+    let mut path = BezPath::new();
+    for i in 1..n {
+        let p0 = center + Vec2::from_angle(i as f64 * dth) * r;
+        let p1 = center + Vec2::from_angle(((i * 2) % n) as f64 * dth) * r;
+        rc.fill(&Circle::new(p0, 8.0), &Color::WHITE);
+        path.move_to(p0);
+        path.line_to(p1);
+        //rc.stroke(Line::new(p0, p1), &Color::BLACK, 2.0);
+    }
+    rc.stroke(&path, &Color::BLACK, 2.0);
 }
 
 #[allow(unused)]
@@ -73,6 +78,7 @@
     }
 }
 
+#[allow(unused)]
 fn dump_k1_data(k1_buf: &[u32]) {
     for i in 0..k1_buf.len() {
         if k1_buf[i] != 0 {
@@ -87,7 +93,9 @@
         let device = instance.device().unwrap();
         let host = MemFlags::host_coherent();
         let dev = MemFlags::device_local();
-        let scene = make_scene();
+        let mut ctx = PietGpuRenderContext::new();
+        render_scene(&mut ctx);
+        let scene = ctx.get_scene_buf();
         //dump_scene(&scene);
         let scene_buf = device
             .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
@@ -96,7 +104,9 @@
             .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev)
             .unwrap();
         device.write_buffer(&scene_buf, &scene).unwrap();
-        let tilegroup_buf = device.create_buffer(384 * 1024, host).unwrap();
+        let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev).unwrap();
+        let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev).unwrap();
+        let segment_buf = device.create_buffer(64 * 1024 * 1024, dev).unwrap();
         let image_buf = device
             .create_buffer((WIDTH * HEIGHT * 4) as u64, host)
             .unwrap();
@@ -104,23 +114,74 @@
             .create_buffer((WIDTH * HEIGHT * 4) as u64, dev)
             .unwrap();
 
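+        // Each kernel gets a tiny "alloc" buffer holding a single u32 bump-
+        // allocator cursor, seeded from the host to point just past the fixed
+        // per-tilegroup (or per-tile) region and presumably advanced
+        // atomically by the shader. The same pattern repeats for kernels 2
+        // and 3 below.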
+        let k1_alloc_buf_host = device.create_buffer(4, host).unwrap();
+        let k1_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
+        let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
+        device
+            .write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])
+            .unwrap();
         let k1_code = include_bytes!("../shader/kernel1.spv");
-        let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 2).unwrap();
+        let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3).unwrap();
         let k1_ds = device
-            .create_descriptor_set(&k1_pipeline, &[&scene_dev, &tilegroup_buf])
+            .create_descriptor_set(
+                &k1_pipeline,
+                &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
+            )
             .unwrap();
 
-        let code = include_bytes!("../shader/image.spv");
-        let pipeline = device.create_simple_compute_pipeline(code, 2).unwrap();
-        let descriptor_set = device
-            .create_descriptor_set(&pipeline, &[&scene_dev, &image_dev])
+        let k2s_alloc_buf_host = device.create_buffer(4, host).unwrap();
+        let k2s_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
+        let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
+        device
+            .write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])
             .unwrap();
-        let query_pool = device.create_query_pool(3).unwrap();
+        let k2s_code = include_bytes!("../shader/kernel2s.spv");
+        let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4).unwrap();
+        let k2s_ds = device
+            .create_descriptor_set(
+                &k2s_pipeline,
+                &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
+            )
+            .unwrap();
+
+        let k3_alloc_buf_host = device.create_buffer(4, host).unwrap();
+        let k3_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
+        let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
+        device
+            .write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
+            .unwrap();
+        let k3_code = include_bytes!("../shader/kernel3.spv");
+        let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 5).unwrap();
+        let k3_ds = device
+            .create_descriptor_set(
+                &k3_pipeline,
+                &[
+                    &scene_dev,
+                    &tilegroup_buf,
+                    &segment_buf,
+                    &ptcl_buf,
+                    &k3_alloc_buf_dev,
+                ],
+            )
+            .unwrap();
+
+        let k4_code = include_bytes!("../shader/kernel4.spv");
+        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3).unwrap();
+        let k4_ds = device
+            .create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &image_dev])
+            .unwrap();
+
+        let query_pool = device.create_query_pool(5).unwrap();
         let mut cmd_buf = device.create_cmd_buf().unwrap();
         cmd_buf.begin();
         cmd_buf.copy_buffer(&scene_buf, &scene_dev);
+        cmd_buf.copy_buffer(&k1_alloc_buf_host, &k1_alloc_buf_dev);
+        cmd_buf.copy_buffer(&k2s_alloc_buf_host, &k2s_alloc_buf_dev);
+        cmd_buf.copy_buffer(&k3_alloc_buf_host, &k3_alloc_buf_dev);
         cmd_buf.clear_buffer(&tilegroup_buf);
+        cmd_buf.clear_buffer(&ptcl_buf);
         cmd_buf.memory_barrier();
+        cmd_buf.reset_query_pool(&query_pool);
         cmd_buf.write_timestamp(&query_pool, 0);
         cmd_buf.dispatch(
             &k1_pipeline,
@@ -130,22 +191,49 @@
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
         cmd_buf.dispatch(
-            &pipeline,
-            &descriptor_set,
-            ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
+            &k2s_pipeline,
+            &k2s_ds,
+            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 2);
         cmd_buf.memory_barrier();
+        cmd_buf.dispatch(
+            &k3_pipeline,
+            &k3_ds,
+            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
+        );
+        cmd_buf.write_timestamp(&query_pool, 3);
+        cmd_buf.memory_barrier();
+        cmd_buf.dispatch(
+            &k4_pipeline,
+            &k4_ds,
+            ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
+        );
+        cmd_buf.write_timestamp(&query_pool, 4);
+        cmd_buf.memory_barrier();
         cmd_buf.copy_buffer(&image_dev, &image_buf);
         cmd_buf.finish();
         device.run_cmd_buf(&cmd_buf).unwrap();
         let timestamps = device.reap_query_pool(query_pool).unwrap();
         println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
-        println!("Render time: {:.3}ms", (timestamps[1] - timestamps[0]) * 1e3);
+        println!(
+            "Kernel 2 time: {:.3}ms",
+            (timestamps[1] - timestamps[0]) * 1e3
+        );
+        println!(
+            "Kernel 3 time: {:.3}ms",
+            (timestamps[2] - timestamps[1]) * 1e3
+        );
+        println!(
+            "Render time: {:.3}ms",
+            (timestamps[3] - timestamps[2]) * 1e3
+        );
 
+        /*
         let mut k1_data: Vec<u32> = Default::default();
-        device.read_buffer(&tilegroup_buf, &mut k1_data).unwrap();
+        device.read_buffer(&segment_buf, &mut k1_data).unwrap();
         dump_k1_data(&k1_data);
+        */
 
         let mut img_data: Vec<u8> = Default::default();
         // Note: because png can use a `&[u8]` slice, we could avoid an extra copy
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
new file mode 100644
index 0000000..f5b6897
--- /dev/null
+++ b/piet-gpu/src/render_ctx.rs
@@ -0,0 +1,356 @@
+use std::borrow::Cow;
+
+use piet_gpu_types::encoder::{Encode, Encoder, Ref};
+use piet_gpu_types::scene;
+use piet_gpu_types::scene::{Bbox, PietCircle, PietItem, PietStrokePolyLine, SimpleGroup};
+
+use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
+
+use piet::{
+    Color, Error, FixedGradient, Font, FontBuilder, HitTestPoint, HitTestTextPosition, ImageFormat,
+    InterpolationMode, IntoBrush, LineMetric, RenderContext, StrokeStyle, Text, TextLayout,
+    TextLayoutBuilder,
+};
+
+pub struct PietGpuImage;
+
+pub struct PietGpuFont;
+
+pub struct PietGpuFontBuilder;
+
+#[derive(Clone)]
+pub struct PietGpuTextLayout;
+
+pub struct PietGpuTextLayoutBuilder;
+
+pub struct PietGpuText;
+
+pub struct PietGpuRenderContext {
+    encoder: Encoder,
+    bboxes: Vec<Bbox>,
+    items: Vec<PietItem>,
+    // Will probably need direct access to the hal Device to create images etc.
+    inner_text: PietGpuText,
+}
+
+#[derive(Clone)]
+pub enum PietGpuBrush {
+    Solid(u32),
+    Gradient,
+}
+
+const TOLERANCE: f64 = 0.1;
+
+impl PietGpuRenderContext {
+    pub fn new() -> PietGpuRenderContext {
+        let mut encoder = Encoder::new();
+        let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32);
+        let bboxes = Vec::new();
+        let items = Vec::new();
+        let inner_text = PietGpuText;
+        PietGpuRenderContext {
+            encoder,
+            bboxes,
+            items,
+            inner_text,
+        }
+    }
+
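+    /// Encode the root group into the chunk reserved at offset 0 by `new()`,
+    /// then return the finished scene buffer.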
+    pub fn get_scene_buf(&mut self) -> &[u8] {
+        let n_items = self.bboxes.len() as u32;
+        let bboxes = self.bboxes.encode(&mut self.encoder).transmute();
+        let items = self.items.encode(&mut self.encoder).transmute();
+        let offset = scene::Point { xy: [0.0, 0.0] };
+        let simple_group = SimpleGroup {
+            n_items,
+            bboxes,
+            items,
+            offset,
+        };
+        let root_item = PietItem::Group(simple_group);
+        root_item.encode_to(&mut self.encoder.buf_mut()[0..PietItem::fixed_size()]);
+        self.encoder.buf()
+    }
+
+    fn push_item(&mut self, item: PietItem, bbox: Rect) {
+        let scene_bbox = Bbox {
+            bbox: [
+                bbox.x0.floor() as i16,
+                bbox.y0.floor() as i16,
+                bbox.x1.ceil() as i16,
+                bbox.y1.ceil() as i16,
+            ],
+        };
+        self.items.push(item);
+        self.bboxes.push(scene_bbox);
+    }
+}
+
+impl RenderContext for PietGpuRenderContext {
+    type Brush = PietGpuBrush;
+    type Image = PietGpuImage;
+    type Text = PietGpuText;
+    type TextLayout = PietGpuTextLayout;
+
+    fn status(&mut self) -> Result<(), Error> {
+        Ok(())
+    }
+
+    fn solid_brush(&mut self, color: Color) -> Self::Brush {
+        PietGpuBrush::Solid(color.as_rgba_u32())
+    }
+
+    fn gradient(&mut self, _gradient: impl Into<FixedGradient>) -> Result<Self::Brush, Error> {
+        Ok(Self::Brush::Gradient)
+    }
+
+    fn clear(&mut self, _color: Color) {}
+
+    fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
+        let bbox = shape.bounding_box();
+        let brush = brush.make_brush(self, || bbox).into_owned();
+        let path = shape.to_bez_path(TOLERANCE);
+        let (n_points, points) = flatten_shape(&mut self.encoder, path);
+        match brush {
+            PietGpuBrush::Solid(rgba_color) => {
+                let poly_line = PietStrokePolyLine {
+                    rgba_color,
+                    width: width as f32,
+                    n_points,
+                    points,
+                };
+                self.push_item(PietItem::Poly(poly_line), bbox);
+            }
+            _ => (),
+        }
+    }
+
+    fn stroke_styled(
+        &mut self,
+        _shape: impl Shape,
+        _brush: &impl IntoBrush<Self>,
+        _width: f64,
+        _style: &StrokeStyle,
+    ) {
+    }
+
+    fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
+        let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
+
+        match shape.as_circle() {
+            Some(circle) => match brush {
+                PietGpuBrush::Solid(rgba_color) => {
+                    let piet_circle = PietCircle {
+                        rgba_color,
+                        center: to_scene_point(circle.center),
+                        radius: circle.radius as f32,
+                    };
+                    let bbox = circle.bounding_box();
+                    self.push_item(PietItem::Circle(piet_circle), bbox);
+                }
+                _ => {}
+            },
+            None => {}
+        }
+    }
+
+    fn fill_even_odd(&mut self, _shape: impl Shape, _brush: &impl IntoBrush<Self>) {}
+
+    fn clip(&mut self, _shape: impl Shape) {}
+
+    fn text(&mut self) -> &mut Self::Text {
+        &mut self.inner_text
+    }
+
+    fn draw_text(
+        &mut self,
+        _layout: &Self::TextLayout,
+        pos: impl Into<Point>,
+        brush: &impl IntoBrush<Self>,
+    ) {
+        let _pos = pos.into();
+
+        let brush: PietGpuBrush = brush.make_brush(self, || Rect::ZERO).into_owned();
+
+        match brush {
+            PietGpuBrush::Solid(_rgba) => {
+                // TODO: draw text
+            }
+            _ => {}
+        }
+    }
+
+    fn save(&mut self) -> Result<(), Error> {
+        Ok(())
+    }
+    fn restore(&mut self) -> Result<(), Error> {
+        Ok(())
+    }
+    fn finish(&mut self) -> Result<(), Error> {
+        Ok(())
+    }
+    fn transform(&mut self, _transform: Affine) {}
+
+    fn make_image(
+        &mut self,
+        _width: usize,
+        _height: usize,
+        _buf: &[u8],
+        _format: ImageFormat,
+    ) -> Result<Self::Image, Error> {
+        Ok(PietGpuImage)
+    }
+
+    fn draw_image(
+        &mut self,
+        _image: &Self::Image,
+        _rect: impl Into<Rect>,
+        _interp: InterpolationMode,
+    ) {
+    }
+
+    fn draw_image_area(
+        &mut self,
+        _image: &Self::Image,
+        _src_rect: impl Into<Rect>,
+        _dst_rect: impl Into<Rect>,
+        _interp: InterpolationMode,
+    ) {
+    }
+
+    fn blurred_rect(&mut self, _rect: Rect, _blur_radius: f64, _brush: &impl IntoBrush<Self>) {}
+
+    fn current_transform(&self) -> Affine {
+        Default::default()
+    }
+}
+
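+// Flatten a path into the polyline encoding used by the scene format: curve
+// segments are subdivided into lines within TOLERANCE, and a NaN point is
+// pushed between subpaths (presumably read as a break by the stroke kernel).
+// Returns the number of points and a reference to their encoding.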
+fn flatten_shape(
+    encoder: &mut Encoder,
+    path: impl Iterator<Item = PathEl>,
+) -> (u32, Ref<scene::Point>) {
+    let mut points = Vec::new();
+    let mut start_pt = None;
+    let mut last_pt = None;
+    kurbo::flatten(path, TOLERANCE, |el| {
+        match el {
+            PathEl::MoveTo(p) => {
+                let scene_pt = to_scene_point(p);
+                start_pt = Some(clone_scene_pt(&scene_pt));
+                if !points.is_empty() {
+                    points.push(scene::Point {
+                        xy: [std::f32::NAN, std::f32::NAN],
+                    });
+                }
+                last_pt = Some(clone_scene_pt(&scene_pt));
+                points.push(scene_pt);
+            }
+            PathEl::LineTo(p) => {
+                let scene_pt = to_scene_point(p);
+                last_pt = Some(clone_scene_pt(&scene_pt));
+                points.push(scene_pt);
+            }
+            PathEl::ClosePath => {
+                if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
+                    if start.xy != last.xy {
+                        points.push(start);
+                    }
+                }
+            }
+            _ => (),
+        }
+        //println!("{:?}", el);
+    });
+    let n_points = points.len() as u32;
+    let points_ref = points.encode(encoder).transmute();
+    (n_points, points_ref)
+}
+
+impl Text for PietGpuText {
+    type Font = PietGpuFont;
+    type FontBuilder = PietGpuFontBuilder;
+    type TextLayout = PietGpuTextLayout;
+    type TextLayoutBuilder = PietGpuTextLayoutBuilder;
+
+    fn new_font_by_name(&mut self, _name: &str, _size: f64) -> Self::FontBuilder {
+        unimplemented!();
+    }
+
+    fn new_text_layout(
+        &mut self,
+        _font: &Self::Font,
+        _text: &str,
+        _width: f64,
+    ) -> Self::TextLayoutBuilder {
+        unimplemented!();
+    }
+}
+
+impl Font for PietGpuFont {}
+
+impl FontBuilder for PietGpuFontBuilder {
+    type Out = PietGpuFont;
+
+    fn build(self) -> Result<Self::Out, Error> {
+        unimplemented!();
+    }
+}
+
+impl TextLayoutBuilder for PietGpuTextLayoutBuilder {
+    type Out = PietGpuTextLayout;
+
+    fn build(self) -> Result<Self::Out, Error> {
+        unimplemented!()
+    }
+}
+
+impl TextLayout for PietGpuTextLayout {
+    fn width(&self) -> f64 {
+        0.0
+    }
+
+    fn update_width(&mut self, _new_width: f64) -> Result<(), Error> {
+        unimplemented!()
+    }
+
+    fn line_text(&self, _line_number: usize) -> Option<&str> {
+        unimplemented!()
+    }
+
+    fn line_metric(&self, _line_number: usize) -> Option<LineMetric> {
+        unimplemented!()
+    }
+
+    fn line_count(&self) -> usize {
+        unimplemented!()
+    }
+
+    fn hit_test_point(&self, _point: Point) -> HitTestPoint {
+        unimplemented!()
+    }
+
+    fn hit_test_text_position(&self, _text_position: usize) -> Option<HitTestTextPosition> {
+        unimplemented!()
+    }
+}
+
+impl IntoBrush<PietGpuRenderContext> for PietGpuBrush {
+    fn make_brush<'b>(
+        &'b self,
+        _piet: &mut PietGpuRenderContext,
+        _bbox: impl FnOnce() -> Rect,
+    ) -> std::borrow::Cow<'b, PietGpuBrush> {
+        Cow::Borrowed(self)
+    }
+}
+
+fn to_scene_point(point: Point) -> scene::Point {
+    scene::Point {
+        xy: [point.x as f32, point.y as f32],
+    }
+}
+
+// TODO: allow #[derive(Clone)] in piet-gpu-derive.
+fn clone_scene_pt(p: &scene::Point) -> scene::Point {
+    scene::Point { xy: p.xy }
+}