Beginnings of new element pipeline

This successfully renders the tiger; fills and strokes are supported.
Other parts of the imaging model, not yet.

Progress toward #119
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index c48f65f..60c9660 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -276,8 +276,8 @@
 
         /*
         let mut data: Vec<u32> = Default::default();
-        renderer.tile_buf.read(&mut data).unwrap();
-        piet_gpu::dump_k1_data(&data);
+        renderer.memory_buf_dev.read(&mut data).unwrap();
+        piet_gpu::dump_k1_data(&data[2..]);
         trace_ptcl(&data);
         */
 
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index 4dd01ed..a1ed332 100644
--- a/piet-gpu/shader/backdrop.spv
+++ b/piet-gpu/shader/backdrop.spv
Binary files differ
diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv
index b00e3cd..457cb02 100644
--- a/piet-gpu/shader/backdrop_lg.spv
+++ b/piet-gpu/shader/backdrop_lg.spv
Binary files differ
diff --git a/piet-gpu/shader/bbox_clear.comp b/piet-gpu/shader/bbox_clear.comp
index 4ac5062..c609642 100644
--- a/piet-gpu/shader/bbox_clear.comp
+++ b/piet-gpu/shader/bbox_clear.comp
@@ -19,8 +19,8 @@
 
 void main() {
     uint ix = gl_GlobalInvocationID.x;
-    if (ix < conf.n_elements) {
-        uint out_ix = (conf.bbox_alloc.offset >> 2) + 4 * ix;
+    if (ix < conf.n_path) {
+        uint out_ix = (conf.bbox_alloc.offset >> 2) + 6 * ix;
         memory[out_ix] = 0xffff;
         memory[out_ix + 1] = 0xffff;
         memory[out_ix + 2] = 0;
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 38d10b3..5ec7aec 100644
--- a/piet-gpu/shader/binning.spv
+++ b/piet-gpu/shader/binning.spv
Binary files differ
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 1df1876..497915c 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -25,7 +25,7 @@
 
 build elements.spv: glsl elements.comp | scene.h state.h annotated.h
 
-build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h
+build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h
 
 build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
 
@@ -90,7 +90,7 @@
 build gen/draw_root.dxil: dxil gen/draw_root.hlsl
 build gen/draw_root.msl: msl gen/draw_root.spv
 
-build gen/draw_leaf.spv: glsl draw_leaf.comp | scene.h drawtag.h setup.h mem.h
+build gen/draw_leaf.spv: glsl draw_leaf.comp | scene.h drawtag.h annotated.h setup.h mem.h
 build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
 build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
 build gen/draw_leaf.msl: msl gen/draw_leaf.spv
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index a2071ad..8d4f7c0 100644
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
index ec6a928..85d9528 100644
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -28,6 +28,7 @@
 #include "scene.h"
 #include "tile.h"
 #include "drawtag.h"
+#include "annotated.h"
 
 #define Monoid DrawMonoid
 
@@ -70,10 +71,93 @@
     if (gl_LocalInvocationID.x > 0) {
         row = combine_tag_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
     }
-    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + gl_GlobalInvocationID.x * 2 * N_ROWS;
+    uint out_ix = gl_GlobalInvocationID.x * N_ROWS;
+    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 2;
+    AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + out_ix * Annotated_size);
     for (uint i = 0; i < N_ROWS; i++) {
         Monoid m = combine_tag_monoid(row, local[i]);
         memory[out_base + i * 2] = m.path_ix;
         memory[out_base + i * 2 + 1] = m.clip_ix;
+
+        // For compatibility, we'll generate an Annotated object, same as old
+        // pipeline. However, going forward we'll get rid of that, and have
+        // later stages read scene + bbox etc.
+        ElementRef this_ref = Element_index(ref, i);
+        tag_word = Element_tag(this_ref).tag;
+        if (tag_word == Element_FillColor || tag_word == Element_FillLinGradient || tag_word == Element_FillImage) {
+            uint bbox_offset = (conf.bbox_alloc.offset >> 2) + 6 * (m.path_ix - 1);
+            float bbox_l = float(memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
+            float bbox_r = float(memory[bbox_offset + 2]) - 32768.0;
+            float bbox_b = float(memory[bbox_offset + 3]) - 32768.0;
+            vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
+            float linewidth = uintBitsToFloat(memory[bbox_offset + 4]);
+            uint fill_mode = uint(linewidth >= 0.0);
+            vec4 mat;
+            vec2 translate;
+            if (linewidth >= 0.0 || tag_word == Element_FillLinGradient) {
+                uint trans_ix = memory[bbox_offset + 5];
+                uint t = (conf.trans_alloc.offset >> 2) + 6 * trans_ix;
+                mat = uintBitsToFloat(uvec4(memory[t], memory[t + 1], memory[t + 2], memory[t + 3]));
+                if (tag_word == Element_FillLinGradient) {
+                    translate = uintBitsToFloat(uvec2(memory[t + 4], memory[t + 5]));
+                }
+            }
+            if (linewidth >= 0.0) {
+                // TODO: need to deal with anisotropic case
+                linewidth *= sqrt(abs(mat.x * mat.w - mat.y * mat.z));
+            }
+            linewidth = max(linewidth, 0.0);
+            switch (tag_word) {
+            case Element_FillColor:
+                FillColor fill = Element_FillColor_read(this_ref);
+                AnnoColor anno_fill;
+                anno_fill.bbox = bbox;
+                anno_fill.linewidth = linewidth;
+                anno_fill.rgba_color = fill.rgba_color;
+                Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
+                break;
+            case Element_FillLinGradient:
+                FillLinGradient lin = Element_FillLinGradient_read(this_ref);
+                AnnoLinGradient anno_lin;
+                anno_lin.bbox = bbox;
+                anno_lin.linewidth = linewidth;
+                anno_lin.index = lin.index;
+                vec2 p0 = mat.xy * lin.p0.x + mat.zw * lin.p0.y + translate;
+                vec2 p1 = mat.xy * lin.p1.x + mat.zw * lin.p1.y + translate;
+                vec2 dxy = p1 - p0;
+                float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y);
+                float line_x = dxy.x * scale;
+                float line_y = dxy.y * scale;
+                anno_lin.line_x = line_x;
+                anno_lin.line_y = line_y;
+                anno_lin.line_c = -(p0.x * line_x + p0.y * line_y);
+                Annotated_LinGradient_write(conf.anno_alloc, out_ref, fill_mode, anno_lin);
+                break;
+            case Element_FillImage:
+                FillImage fill_img = Element_FillImage_read(this_ref);
+                AnnoImage anno_img;
+                anno_img.bbox = bbox;
+                anno_img.linewidth = linewidth;
+                anno_img.index = fill_img.index;
+                anno_img.offset = fill_img.offset;
+                Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
+                break;
+            }
+        } else if (tag_word == Element_BeginClip) {
+            Clip begin_clip = Element_BeginClip_read(this_ref);
+            AnnoBeginClip anno_begin_clip;
+            // This is the absolute bbox, it's been transformed during encoding.
+            anno_begin_clip.bbox = begin_clip.bbox;
+            anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
+            Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip);
+        } else if (tag_word == Element_EndClip) {
+            Clip end_clip = Element_EndClip_read(this_ref);
+            AnnoEndClip anno_end_clip;
+            // This bbox is expected to be the same as the begin one.
+            anno_end_clip.bbox = end_clip.bbox;
+            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
+        }
+        out_ref.offset += Annotated_size;
     }
 }
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 873fc41..6f33544 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -445,7 +445,7 @@
                 vec2 lw = get_linewidth(st);
                 anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
             } else {
-                anno_fill.linewidth = 0.0;
+                anno_begin_clip.linewidth = 0.0;
             }
             out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
             Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 37cc051..f906dac 100644
--- a/piet-gpu/shader/elements.spv
+++ b/piet-gpu/shader/elements.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
index 8a46725..9ce0add 100644
--- a/piet-gpu/shader/gen/bbox_clear.dxil
+++ b/piet-gpu/shader/gen/bbox_clear.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl
index 7a4e86a..903a185 100644
--- a/piet-gpu/shader/gen/bbox_clear.hlsl
+++ b/piet-gpu/shader/gen/bbox_clear.hlsl
@@ -18,16 +18,17 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
 ByteAddressBuffer _21 : register(t1);
-RWByteAddressBuffer _44 : register(u0);
+RWByteAddressBuffer _45 : register(u0);
 
 static uint3 gl_GlobalInvocationID;
 struct SPIRV_Cross_Input
@@ -38,13 +39,13 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x;
-    if (ix < _21.Load(0))
+    if (ix < _21.Load(52))
     {
-        uint out_ix = (_21.Load(40) >> uint(2)) + (4u * ix);
-        _44.Store(out_ix * 4 + 8, 65535u);
-        _44.Store((out_ix + 1u) * 4 + 8, 65535u);
-        _44.Store((out_ix + 2u) * 4 + 8, 0u);
-        _44.Store((out_ix + 3u) * 4 + 8, 0u);
+        uint out_ix = (_21.Load(40) >> uint(2)) + (6u * ix);
+        _45.Store(out_ix * 4 + 8, 65535u);
+        _45.Store((out_ix + 1u) * 4 + 8, 65535u);
+        _45.Store((out_ix + 2u) * 4 + 8, 0u);
+        _45.Store((out_ix + 3u) * 4 + 8, 0u);
     }
 }
 
diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl
index 6f73531..9af5b11 100644
--- a/piet-gpu/shader/gen/bbox_clear.msl
+++ b/piet-gpu/shader/gen/bbox_clear.msl
@@ -23,9 +23,10 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
@@ -43,16 +44,16 @@
 
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
 
-kernel void main0(device Memory& _44 [[buffer(0)]], const device ConfigBuf& _21 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(device Memory& _45 [[buffer(0)]], const device ConfigBuf& _21 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
     uint ix = gl_GlobalInvocationID.x;
-    if (ix < _21.conf.n_elements)
+    if (ix < _21.conf.n_path)
     {
-        uint out_ix = (_21.conf.bbox_alloc.offset >> uint(2)) + (4u * ix);
-        _44.memory[out_ix] = 65535u;
-        _44.memory[out_ix + 1u] = 65535u;
-        _44.memory[out_ix + 2u] = 0u;
-        _44.memory[out_ix + 3u] = 0u;
+        uint out_ix = (_21.conf.bbox_alloc.offset >> uint(2)) + (6u * ix);
+        _45.memory[out_ix] = 65535u;
+        _45.memory[out_ix + 1u] = 65535u;
+        _45.memory[out_ix + 2u] = 0u;
+        _45.memory[out_ix + 3u] = 0u;
     }
 }
 
diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv
index 2b659f4..c459502 100644
--- a/piet-gpu/shader/gen/bbox_clear.spv
+++ b/piet-gpu/shader/gen/bbox_clear.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index 17bace7..f95e5bc 100644
--- a/piet-gpu/shader/gen/draw_leaf.dxil
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
index e5f50fd..0ef9538 100644
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -1,8 +1,56 @@
+struct Alloc
+{
+    uint offset;
+};
+
 struct ElementRef
 {
     uint offset;
 };
 
+struct FillColorRef
+{
+    uint offset;
+};
+
+struct FillColor
+{
+    uint rgba_color;
+};
+
+struct FillLinGradientRef
+{
+    uint offset;
+};
+
+struct FillLinGradient
+{
+    uint index;
+    float2 p0;
+    float2 p1;
+};
+
+struct FillImageRef
+{
+    uint offset;
+};
+
+struct FillImage
+{
+    uint index;
+    int2 offset;
+};
+
+struct ClipRef
+{
+    uint offset;
+};
+
+struct Clip
+{
+    float4 bbox;
+};
+
 struct ElementTag
 {
     uint tag;
@@ -15,7 +63,68 @@
     uint clip_ix;
 };
 
-struct Alloc
+struct AnnoImageRef
+{
+    uint offset;
+};
+
+struct AnnoImage
+{
+    float4 bbox;
+    float linewidth;
+    uint index;
+    int2 offset;
+};
+
+struct AnnoColorRef
+{
+    uint offset;
+};
+
+struct AnnoColor
+{
+    float4 bbox;
+    float linewidth;
+    uint rgba_color;
+};
+
+struct AnnoLinGradientRef
+{
+    uint offset;
+};
+
+struct AnnoLinGradient
+{
+    float4 bbox;
+    float linewidth;
+    uint index;
+    float line_x;
+    float line_y;
+    float line_c;
+};
+
+struct AnnoBeginClipRef
+{
+    uint offset;
+};
+
+struct AnnoBeginClip
+{
+    float4 bbox;
+    float linewidth;
+};
+
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+};
+
+struct AnnotatedRef
 {
     uint offset;
 };
@@ -35,23 +144,24 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-static const DrawMonoid _67 = { 0u, 0u };
-static const DrawMonoid _94 = { 1u, 0u };
-static const DrawMonoid _96 = { 1u, 1u };
-static const DrawMonoid _98 = { 0u, 1u };
+static const DrawMonoid _418 = { 0u, 0u };
+static const DrawMonoid _443 = { 1u, 0u };
+static const DrawMonoid _445 = { 1u, 1u };
+static const DrawMonoid _447 = { 0u, 1u };
 
-ByteAddressBuffer _49 : register(t2);
-ByteAddressBuffer _218 : register(t3);
-ByteAddressBuffer _248 : register(t1);
-RWByteAddressBuffer _277 : register(u0);
+RWByteAddressBuffer _201 : register(u0);
+ByteAddressBuffer _225 : register(t2);
+ByteAddressBuffer _1008 : register(t3);
+ByteAddressBuffer _1042 : register(t1);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -67,9 +177,9 @@
 
 ElementTag Element_tag(ElementRef ref)
 {
-    uint tag_and_flags = _49.Load((ref.offset >> uint(2)) * 4 + 0);
-    ElementTag _63 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _63;
+    uint tag_and_flags = _225.Load((ref.offset >> uint(2)) * 4 + 0);
+    ElementTag _375 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _375;
 }
 
 DrawMonoid map_tag(uint tag_word)
@@ -80,27 +190,27 @@
         case 5u:
         case 6u:
         {
-            return _94;
+            return _443;
         }
         case 9u:
         {
-            return _96;
+            return _445;
         }
         case 10u:
         {
-            return _98;
+            return _447;
         }
         default:
         {
-            return _67;
+            return _418;
         }
     }
 }
 
 ElementRef Element_index(ElementRef ref, uint index)
 {
-    ElementRef _42 = { ref.offset + (index * 36u) };
-    return _42;
+    ElementRef _214 = { ref.offset + (index * 36u) };
+    return _214;
 }
 
 DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
@@ -113,14 +223,326 @@
 
 DrawMonoid tag_monoid_identity()
 {
-    return _67;
+    return _418;
+}
+
+FillColor FillColor_read(FillColorRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _225.Load((ix + 0u) * 4 + 0);
+    FillColor s;
+    s.rgba_color = raw0;
+    return s;
+}
+
+FillColor Element_FillColor_read(ElementRef ref)
+{
+    FillColorRef _381 = { ref.offset + 4u };
+    FillColorRef param = _381;
+    return FillColor_read(param);
+}
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _201.Store(offset * 4 + 8, val);
+}
+
+void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = asuint(s.bbox.x);
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.bbox.y);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.bbox.z);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.bbox.w);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.linewidth);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.rgba_color;
+    write_mem(param_15, param_16, param_17);
+}
+
+void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = (flags << uint(16)) | 1u;
+    write_mem(param, param_1, param_2);
+    AnnoColorRef _808 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    AnnoColorRef param_4 = _808;
+    AnnoColor param_5 = s;
+    AnnoColor_write(param_3, param_4, param_5);
+}
+
+FillLinGradient FillLinGradient_read(FillLinGradientRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _225.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _225.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _225.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _225.Load((ix + 3u) * 4 + 0);
+    uint raw4 = _225.Load((ix + 4u) * 4 + 0);
+    FillLinGradient s;
+    s.index = raw0;
+    s.p0 = float2(asfloat(raw1), asfloat(raw2));
+    s.p1 = float2(asfloat(raw3), asfloat(raw4));
+    return s;
+}
+
+FillLinGradient Element_FillLinGradient_read(ElementRef ref)
+{
+    FillLinGradientRef _389 = { ref.offset + 4u };
+    FillLinGradientRef param = _389;
+    return FillLinGradient_read(param);
+}
+
+void AnnoLinGradient_write(Alloc a, AnnoLinGradientRef ref, AnnoLinGradient s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = asuint(s.bbox.x);
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.bbox.y);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.bbox.z);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.bbox.w);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.linewidth);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.index;
+    write_mem(param_15, param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = asuint(s.line_x);
+    write_mem(param_18, param_19, param_20);
+    Alloc param_21 = a;
+    uint param_22 = ix + 7u;
+    uint param_23 = asuint(s.line_y);
+    write_mem(param_21, param_22, param_23);
+    Alloc param_24 = a;
+    uint param_25 = ix + 8u;
+    uint param_26 = asuint(s.line_c);
+    write_mem(param_24, param_25, param_26);
+}
+
+void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinGradient s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = (flags << uint(16)) | 2u;
+    write_mem(param, param_1, param_2);
+    AnnoLinGradientRef _829 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    AnnoLinGradientRef param_4 = _829;
+    AnnoLinGradient param_5 = s;
+    AnnoLinGradient_write(param_3, param_4, param_5);
+}
+
+FillImage FillImage_read(FillImageRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _225.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _225.Load((ix + 1u) * 4 + 0);
+    FillImage s;
+    s.index = raw0;
+    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+    return s;
+}
+
+FillImage Element_FillImage_read(ElementRef ref)
+{
+    FillImageRef _397 = { ref.offset + 4u };
+    FillImageRef param = _397;
+    return FillImage_read(param);
+}
+
+void AnnoImage_write(Alloc a, AnnoImageRef ref, AnnoImage s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = asuint(s.bbox.x);
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.bbox.y);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.bbox.z);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.bbox.w);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.linewidth);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.index;
+    write_mem(param_15, param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
+    write_mem(param_18, param_19, param_20);
+}
+
+void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = (flags << uint(16)) | 3u;
+    write_mem(param, param_1, param_2);
+    AnnoImageRef _850 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    AnnoImageRef param_4 = _850;
+    AnnoImage param_5 = s;
+    AnnoImage_write(param_3, param_4, param_5);
+}
+
+Clip Clip_read(ClipRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _225.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _225.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _225.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _225.Load((ix + 3u) * 4 + 0);
+    Clip s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    return s;
+}
+
+Clip Element_BeginClip_read(ElementRef ref)
+{
+    ClipRef _405 = { ref.offset + 4u };
+    ClipRef param = _405;
+    return Clip_read(param);
+}
+
+void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = asuint(s.bbox.x);
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.bbox.y);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.bbox.z);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.bbox.w);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.linewidth);
+    write_mem(param_12, param_13, param_14);
+}
+
+void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginClip s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = (flags << uint(16)) | 4u;
+    write_mem(param, param_1, param_2);
+    AnnoBeginClipRef _871 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    AnnoBeginClipRef param_4 = _871;
+    AnnoBeginClip param_5 = s;
+    AnnoBeginClip_write(param_3, param_4, param_5);
+}
+
+Clip Element_EndClip_read(ElementRef ref)
+{
+    ClipRef _413 = { ref.offset + 4u };
+    ClipRef param = _413;
+    return Clip_read(param);
+}
+
+void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = asuint(s.bbox.x);
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.bbox.y);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.bbox.z);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.bbox.w);
+    write_mem(param_9, param_10, param_11);
+}
+
+void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 5u;
+    write_mem(param, param_1, param_2);
+    AnnoEndClipRef _889 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    AnnoEndClipRef param_4 = _889;
+    AnnoEndClip param_5 = s;
+    AnnoEndClip_write(param_3, param_4, param_5);
 }
 
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef _115 = { ix * 36u };
-    ElementRef ref = _115;
+    ElementRef _907 = { ix * 36u };
+    ElementRef ref = _907;
     ElementRef param = ref;
     uint tag_word = Element_tag(param).tag;
     uint param_1 = tag_word;
@@ -157,11 +579,11 @@
     DrawMonoid row = tag_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        DrawMonoid _224;
-        _224.path_ix = _218.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
-        _224.clip_ix = _218.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
-        row.path_ix = _224.path_ix;
-        row.clip_ix = _224.clip_ix;
+        DrawMonoid _1014;
+        _1014.path_ix = _1008.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
+        _1014.clip_ix = _1008.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
+        row.path_ix = _1014.path_ix;
+        row.clip_ix = _1014.clip_ix;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -169,14 +591,154 @@
         DrawMonoid param_11 = sh_scratch[gl_LocalInvocationID.x - 1u];
         row = combine_tag_monoid(param_10, param_11);
     }
-    uint out_base = (_248.Load(44) >> uint(2)) + ((gl_GlobalInvocationID.x * 2u) * 8u);
+    uint out_ix = gl_GlobalInvocationID.x * 8u;
+    uint out_base = (_1042.Load(44) >> uint(2)) + (out_ix * 2u);
+    AnnotatedRef _1058 = { _1042.Load(32) + (out_ix * 40u) };
+    AnnotatedRef out_ref = _1058;
+    float4 mat;
+    float2 translate;
+    AnnoColor anno_fill;
+    Alloc param_18;
+    AnnoLinGradient anno_lin;
+    Alloc param_23;
+    AnnoImage anno_img;
+    Alloc param_28;
+    AnnoBeginClip anno_begin_clip;
+    Alloc param_33;
+    AnnoEndClip anno_end_clip;
+    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid param_12 = row;
         DrawMonoid param_13 = local[i_2];
         DrawMonoid m = combine_tag_monoid(param_12, param_13);
-        _277.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
-        _277.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
+        _201.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
+        _201.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
+        ElementRef param_14 = ref;
+        uint param_15 = i_2;
+        ElementRef this_ref = Element_index(param_14, param_15);
+        ElementRef param_16 = this_ref;
+        tag_word = Element_tag(param_16).tag;
+        if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u))
+        {
+            uint bbox_offset = (_1042.Load(40) >> uint(2)) + (6u * (m.path_ix - 1u));
+            float bbox_l = float(_201.Load(bbox_offset * 4 + 8)) - 32768.0f;
+            float bbox_t = float(_201.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
+            float bbox_r = float(_201.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
+            float bbox_b = float(_201.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+            float linewidth = asfloat(_201.Load((bbox_offset + 4u) * 4 + 8));
+            uint fill_mode = uint(linewidth >= 0.0f);
+            if ((linewidth >= 0.0f) || (tag_word == 5u))
+            {
+                uint trans_ix = _201.Load((bbox_offset + 5u) * 4 + 8);
+                uint t = (_1042.Load(36) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_201.Load(t * 4 + 8), _201.Load((t + 1u) * 4 + 8), _201.Load((t + 2u) * 4 + 8), _201.Load((t + 3u) * 4 + 8)));
+                if (tag_word == 5u)
+                {
+                    translate = asfloat(uint2(_201.Load((t + 4u) * 4 + 8), _201.Load((t + 5u) * 4 + 8)));
+                }
+            }
+            if (linewidth >= 0.0f)
+            {
+                linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z)));
+            }
+            linewidth = max(linewidth, 0.0f);
+            switch (tag_word)
+            {
+                case 4u:
+                {
+                    ElementRef param_17 = this_ref;
+                    FillColor fill = Element_FillColor_read(param_17);
+                    anno_fill.bbox = bbox;
+                    anno_fill.linewidth = linewidth;
+                    anno_fill.rgba_color = fill.rgba_color;
+                    Alloc _1261;
+                    _1261.offset = _1042.Load(32);
+                    param_18.offset = _1261.offset;
+                    AnnotatedRef param_19 = out_ref;
+                    uint param_20 = fill_mode;
+                    AnnoColor param_21 = anno_fill;
+                    Annotated_Color_write(param_18, param_19, param_20, param_21);
+                    break;
+                }
+                case 5u:
+                {
+                    ElementRef param_22 = this_ref;
+                    FillLinGradient lin = Element_FillLinGradient_read(param_22);
+                    anno_lin.bbox = bbox;
+                    anno_lin.linewidth = linewidth;
+                    anno_lin.index = lin.index;
+                    float2 p0 = ((mat.xy * lin.p0.x) + (mat.zw * lin.p0.y)) + translate;
+                    float2 p1 = ((mat.xy * lin.p1.x) + (mat.zw * lin.p1.y)) + translate;
+                    float2 dxy = p1 - p0;
+                    float scale = 1.0f / ((dxy.x * dxy.x) + (dxy.y * dxy.y));
+                    float line_x = dxy.x * scale;
+                    float line_y = dxy.y * scale;
+                    anno_lin.line_x = line_x;
+                    anno_lin.line_y = line_y;
+                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
+                    Alloc _1357;
+                    _1357.offset = _1042.Load(32);
+                    param_23.offset = _1357.offset;
+                    AnnotatedRef param_24 = out_ref;
+                    uint param_25 = fill_mode;
+                    AnnoLinGradient param_26 = anno_lin;
+                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26);
+                    break;
+                }
+                case 6u:
+                {
+                    ElementRef param_27 = this_ref;
+                    FillImage fill_img = Element_FillImage_read(param_27);
+                    anno_img.bbox = bbox;
+                    anno_img.linewidth = linewidth;
+                    anno_img.index = fill_img.index;
+                    anno_img.offset = fill_img.offset;
+                    Alloc _1385;
+                    _1385.offset = _1042.Load(32);
+                    param_28.offset = _1385.offset;
+                    AnnotatedRef param_29 = out_ref;
+                    uint param_30 = fill_mode;
+                    AnnoImage param_31 = anno_img;
+                    Annotated_Image_write(param_28, param_29, param_30, param_31);
+                    break;
+                }
+            }
+        }
+        else
+        {
+            if (tag_word == 9u)
+            {
+                ElementRef param_32 = this_ref;
+                Clip begin_clip = Element_BeginClip_read(param_32);
+                anno_begin_clip.bbox = begin_clip.bbox;
+                anno_begin_clip.linewidth = 0.0f;
+                Alloc _1413;
+                _1413.offset = _1042.Load(32);
+                param_33.offset = _1413.offset;
+                AnnotatedRef param_34 = out_ref;
+                uint param_35 = 0u;
+                AnnoBeginClip param_36 = anno_begin_clip;
+                Annotated_BeginClip_write(param_33, param_34, param_35, param_36);
+            }
+            else
+            {
+                if (tag_word == 10u)
+                {
+                    ElementRef param_37 = this_ref;
+                    Clip end_clip = Element_EndClip_read(param_37);
+                    anno_end_clip.bbox = end_clip.bbox;
+                    Alloc _1438;
+                    _1438.offset = _1042.Load(32);
+                    param_38.offset = _1438.offset;
+                    AnnotatedRef param_39 = out_ref;
+                    AnnoEndClip param_40 = anno_end_clip;
+                    Annotated_EndClip_write(param_38, param_39, param_40);
+                }
+            }
+        }
+        out_ref.offset += 40u;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
index d52a560..f713186 100644
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -44,11 +44,59 @@
     }
 };
 
+struct Alloc
+{
+    uint offset;
+};
+
 struct ElementRef
 {
     uint offset;
 };
 
+struct FillColorRef
+{
+    uint offset;
+};
+
+struct FillColor
+{
+    uint rgba_color;
+};
+
+struct FillLinGradientRef
+{
+    uint offset;
+};
+
+struct FillLinGradient
+{
+    uint index;
+    float2 p0;
+    float2 p1;
+};
+
+struct FillImageRef
+{
+    uint offset;
+};
+
+struct FillImage
+{
+    uint index;
+    int2 offset;
+};
+
+struct ClipRef
+{
+    uint offset;
+};
+
+struct Clip
+{
+    float4 bbox;
+};
+
 struct ElementTag
 {
     uint tag;
@@ -61,6 +109,79 @@
     uint clip_ix;
 };
 
+struct AnnoImageRef
+{
+    uint offset;
+};
+
+struct AnnoImage
+{
+    float4 bbox;
+    float linewidth;
+    uint index;
+    int2 offset;
+};
+
+struct AnnoColorRef
+{
+    uint offset;
+};
+
+struct AnnoColor
+{
+    float4 bbox;
+    float linewidth;
+    uint rgba_color;
+};
+
+struct AnnoLinGradientRef
+{
+    uint offset;
+};
+
+struct AnnoLinGradient
+{
+    float4 bbox;
+    float linewidth;
+    uint index;
+    float line_x;
+    float line_y;
+    float line_c;
+};
+
+struct AnnoBeginClipRef
+{
+    uint offset;
+};
+
+struct AnnoBeginClip
+{
+    float4 bbox;
+    float linewidth;
+};
+
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
 struct SceneBuf
 {
     uint scene[1];
@@ -77,7 +198,7 @@
     DrawMonoid_1 parent[1];
 };
 
-struct Alloc
+struct Alloc_1
 {
     uint offset;
 };
@@ -88,18 +209,19 @@
     uint n_pathseg;
     uint width_in_tiles;
     uint height_in_tiles;
-    Alloc tile_alloc;
-    Alloc bin_alloc;
-    Alloc ptcl_alloc;
-    Alloc pathseg_alloc;
-    Alloc anno_alloc;
-    Alloc trans_alloc;
-    Alloc bbox_alloc;
-    Alloc drawmonoid_alloc;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
@@ -108,19 +230,12 @@
     Config conf;
 };
 
-struct Memory
-{
-    uint mem_offset;
-    uint mem_error;
-    uint memory[1];
-};
-
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_49)
+ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_225)
 {
-    uint tag_and_flags = v_49.scene[ref.offset >> uint(2)];
+    uint tag_and_flags = v_225.scene[ref.offset >> uint(2)];
     return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
@@ -171,13 +286,336 @@
     return DrawMonoid{ 0u, 0u };
 }
 
-kernel void main0(device Memory& _277 [[buffer(0)]], const device ConfigBuf& _248 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], const device ParentBuf& _218 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+static inline __attribute__((always_inline))
+FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_225)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_225.scene[ix + 0u];
+    FillColor s;
+    s.rgba_color = raw0;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_225)
+{
+    FillColorRef param = FillColorRef{ ref.offset + 4u };
+    return FillColor_read(param, v_225);
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_201)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_201.memory[offset] = val;
+}
+
+static inline __attribute__((always_inline))
+void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_201)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = as_type<uint>(s.bbox.x);
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.bbox.y);
+    write_mem(param_3, param_4, param_5, v_201);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.bbox.z);
+    write_mem(param_6, param_7, param_8, v_201);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.bbox.w);
+    write_mem(param_9, param_10, param_11, v_201);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.linewidth);
+    write_mem(param_12, param_13, param_14, v_201);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.rgba_color;
+    write_mem(param_15, param_16, param_17, v_201);
+}
+
+static inline __attribute__((always_inline))
+void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_201)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = (flags << uint(16)) | 1u;
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    AnnoColorRef param_4 = AnnoColorRef{ ref.offset + 4u };
+    AnnoColor param_5 = s;
+    AnnoColor_write(param_3, param_4, param_5, v_201);
+}
+
+static inline __attribute__((always_inline))
+FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_225)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_225.scene[ix + 0u];
+    uint raw1 = v_225.scene[ix + 1u];
+    uint raw2 = v_225.scene[ix + 2u];
+    uint raw3 = v_225.scene[ix + 3u];
+    uint raw4 = v_225.scene[ix + 4u];
+    FillLinGradient s;
+    s.index = raw0;
+    s.p0 = float2(as_type<float>(raw1), as_type<float>(raw2));
+    s.p1 = float2(as_type<float>(raw3), as_type<float>(raw4));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_225)
+{
+    FillLinGradientRef param = FillLinGradientRef{ ref.offset + 4u };
+    return FillLinGradient_read(param, v_225);
+}
+
+static inline __attribute__((always_inline))
+void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_201)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = as_type<uint>(s.bbox.x);
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.bbox.y);
+    write_mem(param_3, param_4, param_5, v_201);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.bbox.z);
+    write_mem(param_6, param_7, param_8, v_201);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.bbox.w);
+    write_mem(param_9, param_10, param_11, v_201);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.linewidth);
+    write_mem(param_12, param_13, param_14, v_201);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.index;
+    write_mem(param_15, param_16, param_17, v_201);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = as_type<uint>(s.line_x);
+    write_mem(param_18, param_19, param_20, v_201);
+    Alloc param_21 = a;
+    uint param_22 = ix + 7u;
+    uint param_23 = as_type<uint>(s.line_y);
+    write_mem(param_21, param_22, param_23, v_201);
+    Alloc param_24 = a;
+    uint param_25 = ix + 8u;
+    uint param_26 = as_type<uint>(s.line_c);
+    write_mem(param_24, param_25, param_26, v_201);
+}
+
+static inline __attribute__((always_inline))
+void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_201)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = (flags << uint(16)) | 2u;
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    AnnoLinGradientRef param_4 = AnnoLinGradientRef{ ref.offset + 4u };
+    AnnoLinGradient param_5 = s;
+    AnnoLinGradient_write(param_3, param_4, param_5, v_201);
+}
+
+static inline __attribute__((always_inline))
+FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_225)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_225.scene[ix + 0u];
+    uint raw1 = v_225.scene[ix + 1u];
+    FillImage s;
+    s.index = raw0;
+    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_225)
+{
+    FillImageRef param = FillImageRef{ ref.offset + 4u };
+    return FillImage_read(param, v_225);
+}
+
+static inline __attribute__((always_inline))
+void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_201)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = as_type<uint>(s.bbox.x);
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.bbox.y);
+    write_mem(param_3, param_4, param_5, v_201);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.bbox.z);
+    write_mem(param_6, param_7, param_8, v_201);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.bbox.w);
+    write_mem(param_9, param_10, param_11, v_201);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.linewidth);
+    write_mem(param_12, param_13, param_14, v_201);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.index;
+    write_mem(param_15, param_16, param_17, v_201);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
+    write_mem(param_18, param_19, param_20, v_201);
+}
+
+static inline __attribute__((always_inline))
+void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_201)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = (flags << uint(16)) | 3u;
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    AnnoImageRef param_4 = AnnoImageRef{ ref.offset + 4u };
+    AnnoImage param_5 = s;
+    AnnoImage_write(param_3, param_4, param_5, v_201);
+}
+
+static inline __attribute__((always_inline))
+Clip Clip_read(thread const ClipRef& ref, const device SceneBuf& v_225)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_225.scene[ix + 0u];
+    uint raw1 = v_225.scene[ix + 1u];
+    uint raw2 = v_225.scene[ix + 2u];
+    uint raw3 = v_225.scene[ix + 3u];
+    Clip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+Clip Element_BeginClip_read(thread const ElementRef& ref, const device SceneBuf& v_225)
+{
+    ClipRef param = ClipRef{ ref.offset + 4u };
+    return Clip_read(param, v_225);
+}
+
+static inline __attribute__((always_inline))
+void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_201)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = as_type<uint>(s.bbox.x);
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.bbox.y);
+    write_mem(param_3, param_4, param_5, v_201);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.bbox.z);
+    write_mem(param_6, param_7, param_8, v_201);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.bbox.w);
+    write_mem(param_9, param_10, param_11, v_201);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.linewidth);
+    write_mem(param_12, param_13, param_14, v_201);
+}
+
+static inline __attribute__((always_inline))
+void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_201)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = (flags << uint(16)) | 4u;
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    AnnoBeginClipRef param_4 = AnnoBeginClipRef{ ref.offset + 4u };
+    AnnoBeginClip param_5 = s;
+    AnnoBeginClip_write(param_3, param_4, param_5, v_201);
+}
+
+static inline __attribute__((always_inline))
+Clip Element_EndClip_read(thread const ElementRef& ref, const device SceneBuf& v_225)
+{
+    ClipRef param = ClipRef{ ref.offset + 4u };
+    return Clip_read(param, v_225);
+}
+
+static inline __attribute__((always_inline))
+void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_201)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = as_type<uint>(s.bbox.x);
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.bbox.y);
+    write_mem(param_3, param_4, param_5, v_201);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.bbox.z);
+    write_mem(param_6, param_7, param_8, v_201);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.bbox.w);
+    write_mem(param_9, param_10, param_11, v_201);
+}
+
+static inline __attribute__((always_inline))
+void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const AnnoEndClip& s, device Memory& v_201)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 5u;
+    write_mem(param, param_1, param_2, v_201);
+    Alloc param_3 = a;
+    AnnoEndClipRef param_4 = AnnoEndClipRef{ ref.offset + 4u };
+    AnnoEndClip param_5 = s;
+    AnnoEndClip_write(param_3, param_4, param_5, v_201);
+}
+
+kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1042 [[buffer(1)]], const device SceneBuf& v_225 [[buffer(2)]], const device ParentBuf& _1008 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[512];
     uint ix = gl_GlobalInvocationID.x * 8u;
     ElementRef ref = ElementRef{ ix * 36u };
     ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_49).tag;
+    uint tag_word = Element_tag(param, v_225).tag;
     uint param_1 = tag_word;
     DrawMonoid agg = map_tag(param_1);
     spvUnsafeArray<DrawMonoid, 8> local;
@@ -187,7 +625,7 @@
         ElementRef param_2 = ref;
         uint param_3 = i;
         ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_49).tag;
+        tag_word = Element_tag(param_4, v_225).tag;
         uint param_5 = tag_word;
         DrawMonoid param_6 = agg;
         DrawMonoid param_7 = map_tag(param_5);
@@ -212,9 +650,9 @@
     DrawMonoid row = tag_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _221 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _218.parent[_221].path_ix;
-        row.clip_ix = _218.parent[_221].clip_ix;
+        uint _1011 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _1008.parent[_1011].path_ix;
+        row.clip_ix = _1008.parent[_1011].clip_ix;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -222,14 +660,143 @@
         DrawMonoid param_11 = sh_scratch[gl_LocalInvocationID.x - 1u];
         row = combine_tag_monoid(param_10, param_11);
     }
-    uint out_base = (_248.conf.drawmonoid_alloc.offset >> uint(2)) + ((gl_GlobalInvocationID.x * 2u) * 8u);
+    uint out_ix = gl_GlobalInvocationID.x * 8u;
+    uint out_base = (_1042.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
+    AnnotatedRef out_ref = AnnotatedRef{ _1042.conf.anno_alloc.offset + (out_ix * 40u) };
+    float4 mat;
+    float2 translate;
+    AnnoColor anno_fill;
+    Alloc param_18;
+    AnnoLinGradient anno_lin;
+    Alloc param_23;
+    AnnoImage anno_img;
+    Alloc param_28;
+    AnnoBeginClip anno_begin_clip;
+    Alloc param_33;
+    AnnoEndClip anno_end_clip;
+    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid param_12 = row;
         DrawMonoid param_13 = local[i_2];
         DrawMonoid m = combine_tag_monoid(param_12, param_13);
-        _277.memory[out_base + (i_2 * 2u)] = m.path_ix;
-        _277.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
+        v_201.memory[out_base + (i_2 * 2u)] = m.path_ix;
+        v_201.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
+        ElementRef param_14 = ref;
+        uint param_15 = i_2;
+        ElementRef this_ref = Element_index(param_14, param_15);
+        ElementRef param_16 = this_ref;
+        tag_word = Element_tag(param_16, v_225).tag;
+        if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u))
+        {
+            uint bbox_offset = (_1042.conf.bbox_alloc.offset >> uint(2)) + (6u * (m.path_ix - 1u));
+            float bbox_l = float(v_201.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(v_201.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(v_201.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(v_201.memory[bbox_offset + 3u]) - 32768.0;
+            float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+            float linewidth = as_type<float>(v_201.memory[bbox_offset + 4u]);
+            uint fill_mode = uint(linewidth >= 0.0);
+            if ((linewidth >= 0.0) || (tag_word == 5u))
+            {
+                uint trans_ix = v_201.memory[bbox_offset + 5u];
+                uint t = (_1042.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(v_201.memory[t], v_201.memory[t + 1u], v_201.memory[t + 2u], v_201.memory[t + 3u]));
+                if (tag_word == 5u)
+                {
+                    translate = as_type<float2>(uint2(v_201.memory[t + 4u], v_201.memory[t + 5u]));
+                }
+            }
+            if (linewidth >= 0.0)
+            {
+                linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z)));
+            }
+            linewidth = fast::max(linewidth, 0.0);
+            switch (tag_word)
+            {
+                case 4u:
+                {
+                    ElementRef param_17 = this_ref;
+                    FillColor fill = Element_FillColor_read(param_17, v_225);
+                    anno_fill.bbox = bbox;
+                    anno_fill.linewidth = linewidth;
+                    anno_fill.rgba_color = fill.rgba_color;
+                    param_18.offset = _1042.conf.anno_alloc.offset;
+                    AnnotatedRef param_19 = out_ref;
+                    uint param_20 = fill_mode;
+                    AnnoColor param_21 = anno_fill;
+                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_201);
+                    break;
+                }
+                case 5u:
+                {
+                    ElementRef param_22 = this_ref;
+                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_225);
+                    anno_lin.bbox = bbox;
+                    anno_lin.linewidth = linewidth;
+                    anno_lin.index = lin.index;
+                    float2 p0 = ((mat.xy * lin.p0.x) + (mat.zw * lin.p0.y)) + translate;
+                    float2 p1 = ((mat.xy * lin.p1.x) + (mat.zw * lin.p1.y)) + translate;
+                    float2 dxy = p1 - p0;
+                    float scale = 1.0 / ((dxy.x * dxy.x) + (dxy.y * dxy.y));
+                    float line_x = dxy.x * scale;
+                    float line_y = dxy.y * scale;
+                    anno_lin.line_x = line_x;
+                    anno_lin.line_y = line_y;
+                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
+                    param_23.offset = _1042.conf.anno_alloc.offset;
+                    AnnotatedRef param_24 = out_ref;
+                    uint param_25 = fill_mode;
+                    AnnoLinGradient param_26 = anno_lin;
+                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_201);
+                    break;
+                }
+                case 6u:
+                {
+                    ElementRef param_27 = this_ref;
+                    FillImage fill_img = Element_FillImage_read(param_27, v_225);
+                    anno_img.bbox = bbox;
+                    anno_img.linewidth = linewidth;
+                    anno_img.index = fill_img.index;
+                    anno_img.offset = fill_img.offset;
+                    param_28.offset = _1042.conf.anno_alloc.offset;
+                    AnnotatedRef param_29 = out_ref;
+                    uint param_30 = fill_mode;
+                    AnnoImage param_31 = anno_img;
+                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_201);
+                    break;
+                }
+            }
+        }
+        else
+        {
+            if (tag_word == 9u)
+            {
+                ElementRef param_32 = this_ref;
+                Clip begin_clip = Element_BeginClip_read(param_32, v_225);
+                anno_begin_clip.bbox = begin_clip.bbox;
+                anno_begin_clip.linewidth = 0.0;
+                param_33.offset = _1042.conf.anno_alloc.offset;
+                AnnotatedRef param_34 = out_ref;
+                uint param_35 = 0u;
+                AnnoBeginClip param_36 = anno_begin_clip;
+                Annotated_BeginClip_write(param_33, param_34, param_35, param_36, v_201);
+            }
+            else
+            {
+                if (tag_word == 10u)
+                {
+                    ElementRef param_37 = this_ref;
+                    Clip end_clip = Element_EndClip_read(param_37, v_225);
+                    anno_end_clip.bbox = end_clip.bbox;
+                    param_38.offset = _1042.conf.anno_alloc.offset;
+                    AnnotatedRef param_39 = out_ref;
+                    AnnoEndClip param_40 = anno_end_clip;
+                    Annotated_EndClip_write(param_38, param_39, param_40, v_201);
+                }
+            }
+        }
+        out_ref.offset += 40u;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
index 30740a2..8fade68 100644
--- a/piet-gpu/shader/gen/draw_leaf.spv
+++ b/piet-gpu/shader/gen/draw_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
index 27c206a..b28c956 100644
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -37,9 +37,10 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl
index dd2f517..550cf8c 100644
--- a/piet-gpu/shader/gen/draw_reduce.msl
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@@ -67,9 +67,10 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv
index 286bd33..02ebc5d 100644
--- a/piet-gpu/shader/gen/draw_reduce.spv
+++ b/piet-gpu/shader/gen/draw_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 4464d9d..0ca0d18 100644
--- a/piet-gpu/shader/gen/pathseg.dxil
+++ b/piet-gpu/shader/gen/pathseg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl
index e29ddd3..c7f7df0 100644
--- a/piet-gpu/shader/gen/pathseg.hlsl
+++ b/piet-gpu/shader/gen/pathseg.hlsl
@@ -65,9 +65,10 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
@@ -79,7 +80,7 @@
 RWByteAddressBuffer _111 : register(u0);
 ByteAddressBuffer _574 : register(t2);
 ByteAddressBuffer _639 : register(t1);
-ByteAddressBuffer _710 : register(t3);
+ByteAddressBuffer _709 : register(t3);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -355,7 +356,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 4u;
-    uint tag_word = _574.Load(((_639.Load(56) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
+    uint tag_word = _574.Load(((_639.Load(64) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
     uint param = tag_word;
     TagMonoid local_tm = reduce_tag(param);
     sh_tag[gl_LocalInvocationID.x] = local_tm;
@@ -376,17 +377,17 @@
     TagMonoid tm = tag_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        TagMonoid _716;
-        _716.trans_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 0);
-        _716.linewidth_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 4);
-        _716.pathseg_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 8);
-        _716.path_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 12);
-        _716.pathseg_offset = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 16);
-        tm.trans_ix = _716.trans_ix;
-        tm.linewidth_ix = _716.linewidth_ix;
-        tm.pathseg_ix = _716.pathseg_ix;
-        tm.path_ix = _716.path_ix;
-        tm.pathseg_offset = _716.pathseg_offset;
+        TagMonoid _715;
+        _715.trans_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 0);
+        _715.linewidth_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 4);
+        _715.pathseg_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 8);
+        _715.path_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 12);
+        _715.pathseg_offset = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 16);
+        tm.trans_ix = _715.trans_ix;
+        tm.linewidth_ix = _715.linewidth_ix;
+        tm.pathseg_ix = _715.pathseg_ix;
+        tm.path_ix = _715.path_ix;
+        tm.pathseg_offset = _715.pathseg_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -394,13 +395,16 @@
         TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u];
         tm = combine_tag_monoid(param_3, param_4);
     }
-    uint ps_ix = (_639.Load(64) >> uint(2)) + tm.pathseg_offset;
+    uint ps_ix = (_639.Load(68) >> uint(2)) + tm.pathseg_offset;
     uint lw_ix = (_639.Load(60) >> uint(2)) + tm.linewidth_ix;
     uint save_path_ix = tm.path_ix;
-    TransformSegRef _768 = { _639.Load(36) + (tm.trans_ix * 24u) };
-    TransformSegRef trans_ref = _768;
-    PathSegRef _778 = { _639.Load(28) + (tm.pathseg_ix * 52u) };
-    PathSegRef ps_ref = _778;
+    uint trans_ix = tm.trans_ix;
+    TransformSegRef _770 = { _639.Load(36) + (trans_ix * 24u) };
+    TransformSegRef trans_ref = _770;
+    PathSegRef _780 = { _639.Load(28) + (tm.pathseg_ix * 52u) };
+    PathSegRef ps_ref = _780;
+    float linewidth[4];
+    uint save_trans_ix[4];
     float2 p0;
     float2 p1;
     float2 p2;
@@ -411,6 +415,8 @@
     Alloc param_15;
     for (uint i_1 = 0u; i_1 < 4u; i_1++)
     {
+        linewidth[i_1] = asfloat(_574.Load(lw_ix * 4 + 0));
+        save_trans_ix[i_1] = trans_ix;
         uint tag_byte = tag_word >> (i_1 * 8u);
         uint seg_type = tag_byte & 3u;
         if (seg_type != 0u)
@@ -449,10 +455,9 @@
                     }
                 }
             }
-            float linewidth = asfloat(_574.Load(lw_ix * 4 + 0));
-            Alloc _864;
-            _864.offset = _639.Load(36);
-            param_13.offset = _864.offset;
+            Alloc _876;
+            _876.offset = _639.Load(36);
+            param_13.offset = _876.offset;
             TransformSegRef param_14 = trans_ref;
             TransformSeg transform = TransformSeg_read(param_13, param_14);
             p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate;
@@ -461,25 +466,25 @@
             if (seg_type >= 2u)
             {
                 p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
-                float4 _934 = bbox;
-                float2 _937 = min(_934.xy, p2);
-                bbox.x = _937.x;
-                bbox.y = _937.y;
-                float4 _942 = bbox;
-                float2 _945 = max(_942.zw, p2);
-                bbox.z = _945.x;
-                bbox.w = _945.y;
+                float4 _946 = bbox;
+                float2 _949 = min(_946.xy, p2);
+                bbox.x = _949.x;
+                bbox.y = _949.y;
+                float4 _954 = bbox;
+                float2 _957 = max(_954.zw, p2);
+                bbox.z = _957.x;
+                bbox.w = _957.y;
                 if (seg_type == 3u)
                 {
                     p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
-                    float4 _970 = bbox;
-                    float2 _973 = min(_970.xy, p3);
-                    bbox.x = _973.x;
-                    bbox.y = _973.y;
-                    float4 _978 = bbox;
-                    float2 _981 = max(_978.zw, p3);
-                    bbox.z = _981.x;
-                    bbox.w = _981.y;
+                    float4 _982 = bbox;
+                    float2 _985 = min(_982.xy, p3);
+                    bbox.x = _985.x;
+                    bbox.y = _985.y;
+                    float4 _990 = bbox;
+                    float2 _993 = max(_990.zw, p3);
+                    bbox.z = _993.x;
+                    bbox.w = _993.y;
                 }
                 else
                 {
@@ -495,9 +500,9 @@
                 p1 = lerp(p0, p3, 0.3333333432674407958984375f.xx);
             }
             float2 stroke = 0.0f.xx;
-            if (linewidth >= 0.0f)
+            if (linewidth[i_1] >= 0.0f)
             {
-                stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5f * linewidth);
+                stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5f * linewidth[i_1]);
                 bbox += float4(-stroke, stroke);
             }
             local[i_1].bbox = bbox;
@@ -509,10 +514,10 @@
             cubic.path_ix = tm.path_ix;
             cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1;
             cubic.stroke = stroke;
-            uint fill_mode = uint(linewidth >= 0.0f);
-            Alloc _1070;
-            _1070.offset = _639.Load(28);
-            param_15.offset = _1070.offset;
+            uint fill_mode = uint(linewidth[i_1] >= 0.0f);
+            Alloc _1088;
+            _1088.offset = _639.Load(28);
+            param_15.offset = _1088.offset;
             PathSegRef param_16 = ps_ref;
             uint param_17 = fill_mode;
             PathCubic param_18 = cubic;
@@ -528,6 +533,7 @@
             uint is_path = (tag_byte >> uint(4)) & 1u;
             local[i_1].flags = is_path;
             tm.path_ix += is_path;
+            trans_ix += ((tag_byte >> uint(5)) & 1u);
             trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u);
             lw_ix += ((tag_byte >> uint(6)) & 1u);
         }
@@ -556,7 +562,7 @@
     }
     GroupMemoryBarrierWithGroupSync();
     uint path_ix = save_path_ix;
-    uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 4u);
+    uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 6u);
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -568,22 +574,24 @@
         Monoid param_24 = local[i_4];
         Monoid m = combine_monoid(param_23, param_24);
         bool do_atomic = false;
-        bool _1240 = i_4 == 3u;
-        bool _1247;
-        if (_1240)
+        bool _1263 = i_4 == 3u;
+        bool _1270;
+        if (_1263)
         {
-            _1247 = gl_LocalInvocationID.x == 511u;
+            _1270 = gl_LocalInvocationID.x == 511u;
         }
         else
         {
-            _1247 = _1240;
+            _1270 = _1263;
         }
-        if (_1247)
+        if (_1270)
         {
             do_atomic = true;
         }
         if ((m.flags & 1u) != 0u)
         {
+            _111.Store((bbox_out_ix + 4u) * 4 + 8, asuint(linewidth[i_4]));
+            _111.Store((bbox_out_ix + 5u) * 4 + 8, save_trans_ix[i_4]);
             if ((m.flags & 2u) == 0u)
             {
                 do_atomic = true;
@@ -598,38 +606,38 @@
                 _111.Store((bbox_out_ix + 2u) * 4 + 8, round_up(param_27));
                 float param_28 = m.bbox.w;
                 _111.Store((bbox_out_ix + 3u) * 4 + 8, round_up(param_28));
-                bbox_out_ix += 4u;
+                bbox_out_ix += 6u;
                 do_atomic = false;
             }
         }
         if (do_atomic)
         {
-            bool _1299 = m.bbox.z > m.bbox.x;
-            bool _1308;
-            if (!_1299)
+            bool _1335 = m.bbox.z > m.bbox.x;
+            bool _1344;
+            if (!_1335)
             {
-                _1308 = m.bbox.w > m.bbox.y;
+                _1344 = m.bbox.w > m.bbox.y;
             }
             else
             {
-                _1308 = _1299;
+                _1344 = _1335;
             }
-            if (_1308)
+            if (_1344)
             {
                 float param_29 = m.bbox.x;
-                uint _1317;
-                _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1317);
+                uint _1353;
+                _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353);
                 float param_30 = m.bbox.y;
-                uint _1325;
-                _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1325);
+                uint _1361;
+                _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361);
                 float param_31 = m.bbox.z;
-                uint _1333;
-                _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1333);
+                uint _1369;
+                _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369);
                 float param_32 = m.bbox.w;
-                uint _1341;
-                _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1341);
+                uint _1377;
+                _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377);
             }
-            bbox_out_ix += 4u;
+            bbox_out_ix += 6u;
         }
     }
 }
diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl
index 71299bd..0f60d4d 100644
--- a/piet-gpu/shader/gen/pathseg.msl
+++ b/piet-gpu/shader/gen/pathseg.msl
@@ -130,9 +130,10 @@
     Alloc_1 bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
@@ -429,7 +430,7 @@
     return uint(fast::min(65535.0, ceil(x) + 32768.0));
 }
 
-kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _710 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _709 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup TagMonoid sh_tag[512];
     threadgroup Monoid sh_scratch[512];
@@ -455,12 +456,12 @@
     TagMonoid tm = tag_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _713 = gl_WorkGroupID.x - 1u;
-        tm.trans_ix = _710.parent[_713].trans_ix;
-        tm.linewidth_ix = _710.parent[_713].linewidth_ix;
-        tm.pathseg_ix = _710.parent[_713].pathseg_ix;
-        tm.path_ix = _710.parent[_713].path_ix;
-        tm.pathseg_offset = _710.parent[_713].pathseg_offset;
+        uint _712 = gl_WorkGroupID.x - 1u;
+        tm.trans_ix = _709.parent[_712].trans_ix;
+        tm.linewidth_ix = _709.parent[_712].linewidth_ix;
+        tm.pathseg_ix = _709.parent[_712].pathseg_ix;
+        tm.path_ix = _709.parent[_712].path_ix;
+        tm.pathseg_offset = _709.parent[_712].pathseg_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -471,8 +472,11 @@
     uint ps_ix = (_639.conf.pathseg_offset >> uint(2)) + tm.pathseg_offset;
     uint lw_ix = (_639.conf.linewidth_offset >> uint(2)) + tm.linewidth_ix;
     uint save_path_ix = tm.path_ix;
-    TransformSegRef trans_ref = TransformSegRef{ _639.conf.trans_alloc.offset + (tm.trans_ix * 24u) };
+    uint trans_ix = tm.trans_ix;
+    TransformSegRef trans_ref = TransformSegRef{ _639.conf.trans_alloc.offset + (trans_ix * 24u) };
     PathSegRef ps_ref = PathSegRef{ _639.conf.pathseg_alloc.offset + (tm.pathseg_ix * 52u) };
+    spvUnsafeArray<float, 4> linewidth;
+    spvUnsafeArray<uint, 4> save_trans_ix;
     float2 p0;
     float2 p1;
     float2 p2;
@@ -483,6 +487,8 @@
     Alloc param_15;
     for (uint i_1 = 0u; i_1 < 4u; i_1++)
     {
+        linewidth[i_1] = as_type<float>(v_574.scene[lw_ix]);
+        save_trans_ix[i_1] = trans_ix;
         uint tag_byte = tag_word >> (i_1 * 8u);
         uint seg_type = tag_byte & 3u;
         if (seg_type != 0u)
@@ -521,7 +527,6 @@
                     }
                 }
             }
-            float linewidth = as_type<float>(v_574.scene[lw_ix]);
             param_13.offset = _639.conf.trans_alloc.offset;
             TransformSegRef param_14 = trans_ref;
             TransformSeg transform = TransformSeg_read(param_13, param_14, v_111);
@@ -531,25 +536,25 @@
             if (seg_type >= 2u)
             {
                 p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
-                float4 _934 = bbox;
-                float2 _937 = fast::min(_934.xy, p2);
-                bbox.x = _937.x;
-                bbox.y = _937.y;
-                float4 _942 = bbox;
-                float2 _945 = fast::max(_942.zw, p2);
-                bbox.z = _945.x;
-                bbox.w = _945.y;
+                float4 _946 = bbox;
+                float2 _949 = fast::min(_946.xy, p2);
+                bbox.x = _949.x;
+                bbox.y = _949.y;
+                float4 _954 = bbox;
+                float2 _957 = fast::max(_954.zw, p2);
+                bbox.z = _957.x;
+                bbox.w = _957.y;
                 if (seg_type == 3u)
                 {
                     p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
-                    float4 _970 = bbox;
-                    float2 _973 = fast::min(_970.xy, p3);
-                    bbox.x = _973.x;
-                    bbox.y = _973.y;
-                    float4 _978 = bbox;
-                    float2 _981 = fast::max(_978.zw, p3);
-                    bbox.z = _981.x;
-                    bbox.w = _981.y;
+                    float4 _982 = bbox;
+                    float2 _985 = fast::min(_982.xy, p3);
+                    bbox.x = _985.x;
+                    bbox.y = _985.y;
+                    float4 _990 = bbox;
+                    float2 _993 = fast::max(_990.zw, p3);
+                    bbox.z = _993.x;
+                    bbox.w = _993.y;
                 }
                 else
                 {
@@ -565,9 +570,9 @@
                 p1 = mix(p0, p3, float2(0.3333333432674407958984375));
             }
             float2 stroke = float2(0.0);
-            if (linewidth >= 0.0)
+            if (linewidth[i_1] >= 0.0)
             {
-                stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5 * linewidth);
+                stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5 * linewidth[i_1]);
                 bbox += float4(-stroke, stroke);
             }
             local[i_1].bbox = bbox;
@@ -579,7 +584,7 @@
             cubic.path_ix = tm.path_ix;
             cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1;
             cubic.stroke = stroke;
-            uint fill_mode = uint(linewidth >= 0.0);
+            uint fill_mode = uint(linewidth[i_1] >= 0.0);
             param_15.offset = _639.conf.pathseg_alloc.offset;
             PathSegRef param_16 = ps_ref;
             uint param_17 = fill_mode;
@@ -596,6 +601,7 @@
             uint is_path = (tag_byte >> uint(4)) & 1u;
             local[i_1].flags = is_path;
             tm.path_ix += is_path;
+            trans_ix += ((tag_byte >> uint(5)) & 1u);
             trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u);
             lw_ix += ((tag_byte >> uint(6)) & 1u);
         }
@@ -624,7 +630,7 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint path_ix = save_path_ix;
-    uint bbox_out_ix = (_639.conf.bbox_alloc.offset >> uint(2)) + (path_ix * 4u);
+    uint bbox_out_ix = (_639.conf.bbox_alloc.offset >> uint(2)) + (path_ix * 6u);
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -636,22 +642,24 @@
         Monoid param_24 = local[i_4];
         Monoid m = combine_monoid(param_23, param_24);
         bool do_atomic = false;
-        bool _1240 = i_4 == 3u;
-        bool _1247;
-        if (_1240)
+        bool _1263 = i_4 == 3u;
+        bool _1270;
+        if (_1263)
         {
-            _1247 = gl_LocalInvocationID.x == 511u;
+            _1270 = gl_LocalInvocationID.x == 511u;
         }
         else
         {
-            _1247 = _1240;
+            _1270 = _1263;
         }
-        if (_1247)
+        if (_1270)
         {
             do_atomic = true;
         }
         if ((m.flags & 1u) != 0u)
         {
+            v_111.memory[bbox_out_ix + 4u] = as_type<uint>(linewidth[i_4]);
+            v_111.memory[bbox_out_ix + 5u] = save_trans_ix[i_4];
             if ((m.flags & 2u) == 0u)
             {
                 do_atomic = true;
@@ -666,34 +674,34 @@
                 v_111.memory[bbox_out_ix + 2u] = round_up(param_27);
                 float param_28 = m.bbox.w;
                 v_111.memory[bbox_out_ix + 3u] = round_up(param_28);
-                bbox_out_ix += 4u;
+                bbox_out_ix += 6u;
                 do_atomic = false;
             }
         }
         if (do_atomic)
         {
-            bool _1299 = m.bbox.z > m.bbox.x;
-            bool _1308;
-            if (!_1299)
+            bool _1335 = m.bbox.z > m.bbox.x;
+            bool _1344;
+            if (!_1335)
             {
-                _1308 = m.bbox.w > m.bbox.y;
+                _1344 = m.bbox.w > m.bbox.y;
             }
             else
             {
-                _1308 = _1299;
+                _1344 = _1335;
             }
-            if (_1308)
+            if (_1344)
             {
                 float param_29 = m.bbox.x;
-                uint _1317 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed);
+                uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed);
                 float param_30 = m.bbox.y;
-                uint _1325 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed);
+                uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed);
                 float param_31 = m.bbox.z;
-                uint _1333 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed);
+                uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed);
                 float param_32 = m.bbox.w;
-                uint _1341 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed);
+                uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed);
             }
-            bbox_out_ix += 4u;
+            bbox_out_ix += 6u;
         }
     }
 }
diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv
index bc165ac..fc63eb5 100644
--- a/piet-gpu/shader/gen/pathseg.spv
+++ b/piet-gpu/shader/gen/pathseg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index 02a4750..d585c96 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.dxil
+++ b/piet-gpu/shader/gen/pathtag_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl
index 5e98362..dd7c611 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.hlsl
+++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl
@@ -27,18 +27,19 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u);
 
 ByteAddressBuffer _139 : register(t1);
-ByteAddressBuffer _151 : register(t2);
-RWByteAddressBuffer _239 : register(u3);
-RWByteAddressBuffer _259 : register(u0);
+ByteAddressBuffer _150 : register(t2);
+RWByteAddressBuffer _238 : register(u3);
+RWByteAddressBuffer _258 : register(u0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -82,13 +83,13 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 4u;
-    uint scene_ix = (_139.Load(56) >> uint(2)) + ix;
-    uint tag_word = _151.Load(scene_ix * 4 + 0);
+    uint scene_ix = (_139.Load(64) >> uint(2)) + ix;
+    uint tag_word = _150.Load(scene_ix * 4 + 0);
     uint param = tag_word;
     TagMonoid agg = reduce_tag(param);
     for (uint i = 1u; i < 4u; i++)
     {
-        tag_word = _151.Load((scene_ix + i) * 4 + 0);
+        tag_word = _150.Load((scene_ix + i) * 4 + 0);
         uint param_1 = tag_word;
         TagMonoid param_2 = agg;
         TagMonoid param_3 = reduce_tag(param_1);
@@ -110,11 +111,11 @@
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _239.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix);
-        _239.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix);
-        _239.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix);
-        _239.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix);
-        _239.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset);
+        _238.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix);
+        _238.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix);
+        _238.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix);
+        _238.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix);
+        _238.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset);
     }
 }
 
diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl
index 38451d4..e82577c 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.msl
+++ b/piet-gpu/shader/gen/pathtag_reduce.msl
@@ -34,9 +34,10 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
@@ -102,17 +103,17 @@
     return c;
 }
 
-kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _239 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _150 [[buffer(2)]], device OutBuf& _238 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup TagMonoid sh_scratch[128];
     uint ix = gl_GlobalInvocationID.x * 4u;
     uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix;
-    uint tag_word = _151.scene[scene_ix];
+    uint tag_word = _150.scene[scene_ix];
     uint param = tag_word;
     TagMonoid agg = reduce_tag(param);
     for (uint i = 1u; i < 4u; i++)
     {
-        tag_word = _151.scene[scene_ix + i];
+        tag_word = _150.scene[scene_ix + i];
         uint param_1 = tag_word;
         TagMonoid param_2 = agg;
         TagMonoid param_3 = reduce_tag(param_1);
@@ -134,11 +135,11 @@
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _239.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix;
-        _239.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix;
-        _239.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix;
-        _239.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
-        _239.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset;
+        _238.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix;
+        _238.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix;
+        _238.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix;
+        _238.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+        _238.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset;
     }
 }
 
diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv
index eef46a2..6dc35b8 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.spv
+++ b/piet-gpu/shader/gen/pathtag_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index dabc049..102d2f0 100644
--- a/piet-gpu/shader/gen/transform_leaf.dxil
+++ b/piet-gpu/shader/gen/transform_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl
index 2f0de05..6fa9267 100644
--- a/piet-gpu/shader/gen/transform_leaf.hlsl
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@@ -40,9 +40,10 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
@@ -149,7 +150,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _285 = { _278.Load(52) + (ix * 24u) };
+    TransformRef _285 = { _278.Load(56) + (ix * 24u) };
     TransformRef ref = _285;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl
index 3120b3d..9c7e6b7 100644
--- a/piet-gpu/shader/gen/transform_leaf.msl
+++ b/piet-gpu/shader/gen/transform_leaf.msl
@@ -103,9 +103,10 @@
     Alloc_1 bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv
index 01f047b..e561e9d 100644
--- a/piet-gpu/shader/gen/transform_leaf.spv
+++ b/piet-gpu/shader/gen/transform_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index 68997d0..1ed5e0e 100644
--- a/piet-gpu/shader/gen/transform_reduce.dxil
+++ b/piet-gpu/shader/gen/transform_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl
index 9d8a5d6..60addf3 100644
--- a/piet-gpu/shader/gen/transform_reduce.hlsl
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@@ -29,9 +29,10 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
@@ -86,7 +87,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _168 = { _161.Load(52) + (ix * 24u) };
+    TransformRef _168 = { _161.Load(56) + (ix * 24u) };
     TransformRef ref = _168;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl
index e61b602..ac586d9 100644
--- a/piet-gpu/shader/gen/transform_reduce.msl
+++ b/piet-gpu/shader/gen/transform_reduce.msl
@@ -41,9 +41,10 @@
     Alloc bbox_alloc;
     Alloc drawmonoid_alloc;
     uint n_trans;
+    uint n_path;
     uint trans_offset;
-    uint pathtag_offset;
     uint linewidth_offset;
+    uint pathtag_offset;
     uint pathseg_offset;
 };
 
diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv
index 77eadb2..5638afb 100644
--- a/piet-gpu/shader/gen/transform_reduce.spv
+++ b/piet-gpu/shader/gen/transform_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index 04b6364..0eb1e5a 100644
--- a/piet-gpu/shader/kernel4.spv
+++ b/piet-gpu/shader/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index ea525f5..1bd06f9 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -100,6 +100,8 @@
     case PathSeg_Cubic:
         PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
 
+        // Affine transform is now applied in pathseg
+        /*
         uint trans_ix = cubic.trans_ix;
         if (trans_ix > 0) {
             TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
@@ -109,6 +111,7 @@
             cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
             cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
         }
+        */
 
         vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
         float err = err_v.x * err_v.x + err_v.y * err_v.y;
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index 240f8f7..0da044f 100644
--- a/piet-gpu/shader/path_coarse.spv
+++ b/piet-gpu/shader/path_coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/pathseg.comp b/piet-gpu/shader/pathseg.comp
index 7b8f3f0..ec0a440 100644
--- a/piet-gpu/shader/pathseg.comp
+++ b/piet-gpu/shader/pathseg.comp
@@ -92,6 +92,8 @@
 
 void main() {
     Monoid local[N_SEQ];
+    float linewidth[N_SEQ];
+    uint save_trans_ix[N_SEQ];
 
     uint ix = gl_GlobalInvocationID.x * N_SEQ;
 
@@ -124,9 +126,12 @@
     uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset;
     uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix;
     uint save_path_ix = tm.path_ix;
-    TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + tm.trans_ix * TransformSeg_size);
+    uint trans_ix = tm.trans_ix;
+    TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + trans_ix * TransformSeg_size);
     PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size);
     for (uint i = 0; i < N_SEQ; i++) {
+        linewidth[i] = uintBitsToFloat(scene[lw_ix]);
+        save_trans_ix[i] = trans_ix;
         // if N_SEQ > 4, need to load tag_word from local if N_SEQ % 4 == 0
         uint tag_byte = tag_word >> (i * 8);
         uint seg_type = tag_byte & 3;
@@ -158,7 +163,6 @@
                     }
                 }
             }
-            float linewidth = uintBitsToFloat(scene[lw_ix]);
             TransformSeg transform = TransformSeg_read(conf.trans_alloc, trans_ref);
             p0 = transform.mat.xy * p0.x + transform.mat.zw * p0.y + transform.translate;
             p1 = transform.mat.xy * p1.x + transform.mat.zw * p1.y + transform.translate;
@@ -183,9 +187,9 @@
                 p1 = mix(p0, p3, 1.0 / 3.0);
             }
             vec2 stroke = vec2(0.0, 0.0);
-            if (linewidth >= 0.0) {
+            if (linewidth[i] >= 0.0) {
                 // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
-                stroke = 0.5 * linewidth * vec2(length(transform.mat.xz), length(transform.mat.yw));
+                stroke = 0.5 * linewidth[i] * vec2(length(transform.mat.xz), length(transform.mat.yw));
                 bbox += vec4(-stroke, stroke);
             }
             local[i].bbox = bbox;
@@ -201,7 +205,7 @@
             // Not needed, TODO remove from struct
             cubic.trans_ix = gl_GlobalInvocationID.x * 4 + i;
             cubic.stroke = stroke;
-            uint fill_mode = uint(linewidth >= 0.0);
+            uint fill_mode = uint(linewidth[i] >= 0.0);
             PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic);
 
             ps_ref.offset += PathSeg_size;
@@ -215,6 +219,7 @@
             // Relies on the fact that RESET_BBOX == 1
             local[i].flags = is_path;
             tm.path_ix += is_path;
+            trans_ix += (tag_byte >> 5) & 1;
             trans_ref.offset += ((tag_byte >> 5) & 1) * TransformSeg_size;
             lw_ix += (tag_byte >> 6) & 1;
         }
@@ -244,7 +249,7 @@
     
     barrier();
     uint path_ix = save_path_ix;
-    uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 4;
+    uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 6;
     // Write bboxes to paths; do atomic min/max if partial
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0) {
@@ -259,6 +264,8 @@
             do_atomic = true;
         }
         if ((m.flags & FLAG_RESET_BBOX) != 0) {
+            memory[bbox_out_ix + 4] = floatBitsToUint(linewidth[i]);
+            memory[bbox_out_ix + 5] = save_trans_ix[i];
             if ((m.flags & FLAG_SET_BBOX) == 0) {
                 do_atomic = true;
             } else {
@@ -266,7 +273,7 @@
                 memory[bbox_out_ix + 1] = round_down(m.bbox.y);
                 memory[bbox_out_ix + 2] = round_up(m.bbox.z);
                 memory[bbox_out_ix + 3] = round_up(m.bbox.w);
-                bbox_out_ix += 4;
+                bbox_out_ix += 6;
                 do_atomic = false;
             }
         }
@@ -278,7 +285,7 @@
                 atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z));
                 atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w));
             }
-            bbox_out_ix += 4;
+            bbox_out_ix += 6;
         }
     }
 }
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 3bb1fdd..5d4cc73 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -48,12 +48,14 @@
     // Number of transforms in scene
     // This is probably not needed.
     uint n_trans;
+    // This only counts actual paths, not EndClip.
+    uint n_path;
     // Offset (in bytes) of transform stream in scene buffer
     uint trans_offset;
-    // Offset (in bytes) of path tag stream in scene
-    uint pathtag_offset;
     // Offset (in bytes) of linewidth stream in scene
     uint linewidth_offset;
+    // Offset (in bytes) of path tag stream in scene
+    uint pathtag_offset;
     // Offset (in bytes) of path segment stream in scene
     uint pathseg_offset;
 };
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv
index 0de00e3..b443b03 100644
--- a/piet-gpu/shader/tile_alloc.spv
+++ b/piet-gpu/shader/tile_alloc.spv
Binary files differ
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
new file mode 100644
index 0000000..12e9db4
--- /dev/null
+++ b/piet-gpu/src/encoder.rs
@@ -0,0 +1,199 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Low-level scene encoding.
+
+use bytemuck::{Pod, Zeroable};
+use piet_gpu_hal::BufWrite;
+
+use crate::stages::{self, Config, PathEncoder, Transform};
+
+pub struct Encoder {
+    transform_stream: Vec<stages::Transform>,
+    tag_stream: Vec<u8>,
+    pathseg_stream: Vec<u8>,
+    linewidth_stream: Vec<f32>,
+    drawobj_stream: Vec<u8>,
+    n_path: u32,
+    n_pathseg: u32,
+}
+
+// Currently same as Element, but may change - should become packed.
+const DRAWOBJ_SIZE: usize = 36;
+const TRANSFORM_SIZE: usize = 24;
+const LINEWIDTH_SIZE: usize = 4;
+const PATHSEG_SIZE: usize = 52;
+const BBOX_SIZE: usize = 24;
+const DRAWMONOID_SIZE: usize = 8;
+const ANNOTATED_SIZE: usize = 40;
+
+// Maybe pull these from the relevant stages? In any case, they may depend
+// on runtime query of GPU (supported workgroup size).
+const TRANSFORM_PART_SIZE: usize = 4096;
+const PATHSEG_PART_SIZE: usize = 2048;
+const DRAWOBJ_PART_SIZE: usize = 4096;
+
+// These are bytemuck versions of elements currently defined in the
+// Element struct in piet-gpu-types; that's pretty much going away.
+
+const ELEMENT_FILLCOLOR: u32 = 4;
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillColor {
+    tag: u32,
+    rgba_color: u32,
+    padding: [u32; 7],
+}
+
+impl Encoder {
+    pub fn new() -> Encoder {
+        Encoder {
+            transform_stream: vec![Transform::IDENTITY],
+            tag_stream: Vec::new(),
+            pathseg_stream: Vec::new(),
+            linewidth_stream: vec![-1.0],
+            drawobj_stream: Vec::new(),
+            n_path: 0,
+            n_pathseg: 0,
+        }
+    }
+
+    pub fn path_encoder(&mut self) -> PathEncoder {
+        PathEncoder::new(&mut self.tag_stream, &mut self.pathseg_stream)
+    }
+
+    pub fn finish_path(&mut self, n_pathseg: u32) {
+        self.n_path += 1;
+        self.n_pathseg += n_pathseg;
+    }
+
+    pub fn transform(&mut self, transform: Transform) {
+        self.tag_stream.push(0x20);
+        self.transform_stream.push(transform);
+    }
+
+    // -1.0 means "fill"
+    pub fn linewidth(&mut self, linewidth: f32) {
+        self.tag_stream.push(0x40);
+        self.linewidth_stream.push(linewidth);
+    }
+
+    /// Encode a fill color draw object.
+    ///
+    /// This should be encoded after a path.
+    pub fn fill_color(&mut self, rgba_color: u32) {
+        let element = FillColor {
+            tag: ELEMENT_FILLCOLOR,
+            rgba_color,
+            ..Default::default()
+        };
+        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+    }
+
+    /// Return a config for the element processing pipeline.
+    ///
+    /// This does not include further pipeline processing. Also returns the
+    /// beginning of free memory.
+    pub fn stage_config(&self) -> (Config, usize) {
+        // Layout of scene buffer
+        let n_drawobj = self.n_drawobj();
+        let n_drawobj_padded = align_up(n_drawobj, DRAWOBJ_PART_SIZE);
+        let trans_offset = n_drawobj_padded * DRAWOBJ_SIZE;
+        let n_trans = self.transform_stream.len();
+        let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE);
+        let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
+        let n_linewidth = self.linewidth_stream.len();
+        let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
+        let n_pathtag = self.tag_stream.len();
+        let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE);
+        let pathseg_offset = pathtag_offset + n_pathtag_padded;
+
+        // Layout of memory
+        let mut alloc = 0;
+        let trans_alloc = alloc;
+        alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
+        let pathseg_alloc = alloc;
+        alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
+        let bbox_alloc = alloc;
+        let n_path = self.n_path as usize;
+        alloc += bbox_alloc + n_path * BBOX_SIZE;
+        let drawmonoid_alloc = alloc;
+        alloc += n_drawobj_padded * DRAWMONOID_SIZE;
+        let anno_alloc = alloc;
+        alloc += n_drawobj * ANNOTATED_SIZE;
+
+        let config = Config {
+            n_elements: n_drawobj as u32,
+            n_pathseg: self.n_pathseg,
+            pathseg_alloc: pathseg_alloc as u32,
+            anno_alloc: anno_alloc as u32,
+            trans_alloc: trans_alloc as u32,
+            bbox_alloc: bbox_alloc as u32,
+            drawmonoid_alloc: drawmonoid_alloc as u32,
+            n_trans: n_trans as u32,
+            n_path: self.n_path,
+            trans_offset: trans_offset as u32,
+            linewidth_offset: linewidth_offset as u32,
+            pathtag_offset: pathtag_offset as u32,
+            pathseg_offset: pathseg_offset as u32,
+            ..Default::default()
+        };
+        (config, alloc)
+    }
+
+    pub fn write_scene(&self, buf: &mut BufWrite) {
+        buf.extend_slice(&self.drawobj_stream);
+        let n_drawobj = self.drawobj_stream.len() / DRAWOBJ_SIZE;
+        buf.fill_zero(padding(n_drawobj, DRAWOBJ_PART_SIZE) * DRAWOBJ_SIZE);
+        buf.extend_slice(&self.transform_stream);
+        let n_trans = self.transform_stream.len();
+        buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE) * TRANSFORM_SIZE);
+        buf.extend_slice(&self.linewidth_stream);
+        buf.extend_slice(&self.tag_stream);
+        let n_pathtag = self.tag_stream.len();
+        buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE));
+        buf.extend_slice(&self.pathseg_stream);
+    }
+
+    /// The number of elements in the draw object stream.
+    pub(crate) fn n_drawobj(&self) -> usize {
+        self.drawobj_stream.len() / DRAWOBJ_SIZE
+    }
+
+    /// The number of paths.
+    pub(crate) fn n_path(&self) -> u32 {
+        self.n_path
+    }
+
+    /// The number of path segments.
+    pub(crate) fn n_pathseg(&self) -> u32 {
+        self.n_pathseg
+    }
+
+    pub(crate) fn n_transform(&self) -> usize {
+        self.transform_stream.len()
+    }
+}
+
+fn align_up(x: usize, align: usize) -> usize {
+    debug_assert!(align.is_power_of_two());
+    (x + align - 1) & !(align - 1)
+}
+
+fn padding(x: usize, align: usize) -> usize {
+    x.wrapping_neg() & (align - 1)
+}
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 2b45e7c..25627f6 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -1,3 +1,4 @@
+mod encoder;
 mod gradient;
 mod pico_svg;
 mod render_ctx;
@@ -12,16 +13,15 @@
 use piet::kurbo::Vec2;
 use piet::{ImageFormat, RenderContext};
 
-use piet_gpu_types::encoder::Encode;
-
 use piet_gpu_hal::{
     BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline,
     QueryPool, Session, ShaderCode,
 };
 
 use pico_svg::PicoSvg;
+use stages::{ElementBinding, ElementCode};
 
-use crate::stages::Config;
+use crate::stages::{Config, ElementStage};
 
 const TILE_W: usize = 16;
 const TILE_H: usize = 16;
@@ -70,8 +70,10 @@
     // Device config buf
     config_buf: Buffer,
 
-    el_pipeline: Pipeline,
-    el_ds: Vec<DescriptorSet>,
+    // New element pipeline
+    element_code: ElementCode,
+    element_stage: ElementStage,
+    element_bindings: Vec<ElementBinding>,
 
     tile_pipeline: Pipeline,
     tile_ds: DescriptorSet,
@@ -91,7 +93,8 @@
     k4_pipeline: Pipeline,
     k4_ds: DescriptorSet,
 
-    n_elements: usize,
+    n_transform: usize,
+    n_drawobj: usize,
     n_paths: usize,
     n_pathseg: usize,
 
@@ -120,7 +123,7 @@
         // TODO: separate staging buffer (if needed)
         let scene_bufs = (0..n_bufs)
             .map(|_| session.create_buffer(8 * 1024 * 1024, host_upload).unwrap())
-            .collect();
+            .collect::<Vec<_>>();
 
         let state_buf = session.create_buffer(1 * 1024 * 1024, dev)?;
         let image_dev = session.create_image2d(width as u32, height as u32)?;
@@ -142,23 +145,21 @@
             .collect();
         let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?;
 
-        let el_code = ShaderCode::Spv(include_bytes!("../shader/elements.spv"));
-        let el_pipeline = session.create_compute_pipeline(
-            el_code,
-            &[
-                BindType::Buffer,
-                BindType::Buffer,
-                BindType::Buffer,
-                BindType::Buffer,
-            ],
-        )?;
-        let mut el_ds = Vec::with_capacity(n_bufs);
-        for scene_buf in &scene_bufs {
-            el_ds.push(session.create_simple_descriptor_set(
-                &el_pipeline,
-                &[&memory_buf_dev, &config_buf, scene_buf, &state_buf],
-            )?);
-        }
+        let element_code = ElementCode::new(session);
+        let element_stage = ElementStage::new(session, &element_code);
+        let element_bindings = scene_bufs
+            .iter()
+            .zip(&config_bufs)
+            .map(|(scene_buf, config_buf)| {
+                element_stage.bind(
+                    session,
+                    &element_code,
+                    config_buf,
+                    scene_buf,
+                    &memory_buf_dev,
+                )
+            })
+            .collect();
 
         let tile_alloc_code = ShaderCode::Spv(include_bytes!("../shader/tile_alloc.spv"));
         let tile_pipeline = session
@@ -237,8 +238,9 @@
             config_buf,
             config_bufs,
             image_dev,
-            el_pipeline,
-            el_ds,
+            element_code,
+            element_stage,
+            element_bindings,
             tile_pipeline,
             tile_ds,
             path_pipeline,
@@ -251,7 +253,8 @@
             coarse_ds,
             k4_pipeline,
             k4_ds,
-            n_elements: 0,
+            n_transform: 0,
+            n_drawobj: 0,
             n_paths: 0,
             n_pathseg: 0,
             _bg_image: bg_image,
@@ -270,55 +273,38 @@
         render_ctx: &mut PietGpuRenderContext,
         buf_ix: usize,
     ) -> Result<(), Error> {
-        let n_paths = render_ctx.path_count();
-        let n_pathseg = render_ctx.pathseg_count();
-        let n_trans = render_ctx.trans_count();
-        self.n_paths = n_paths;
-        self.n_pathseg = n_pathseg;
+        let (mut config, mut alloc) = render_ctx.stage_config();
+        let n_drawobj = render_ctx.n_drawobj();
+        // TODO: be more consistent in size types
+        let n_path = render_ctx.n_path() as usize;
+        self.n_paths = n_path;
+        self.n_transform = render_ctx.n_transform();
+        self.n_drawobj = render_ctx.n_drawobj();
+        self.n_pathseg = render_ctx.n_pathseg() as usize;
 
         // These constants depend on encoding and may need to be updated.
         // Perhaps we can plumb these from piet-gpu-derive?
         const PATH_SIZE: usize = 12;
         const BIN_SIZE: usize = 8;
-        const PATHSEG_SIZE: usize = 52;
-        const ANNO_SIZE: usize = 40;
-        const TRANS_SIZE: usize = 24;
         let width_in_tiles = self.width / TILE_W;
         let height_in_tiles = self.height / TILE_H;
-        let mut alloc = 0;
         let tile_base = alloc;
-        alloc += ((n_paths + 3) & !3) * PATH_SIZE;
+        alloc += ((n_path + 3) & !3) * PATH_SIZE;
         let bin_base = alloc;
-        alloc += ((n_paths + 255) & !255) * BIN_SIZE;
+        alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
         let ptcl_base = alloc;
         alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
-        let pathseg_base = alloc;
-        alloc += (n_pathseg * PATHSEG_SIZE + 3) & !3;
-        let anno_base = alloc;
-        alloc += (n_paths * ANNO_SIZE + 3) & !3;
-        let trans_base = alloc;
-        alloc += (n_trans * TRANS_SIZE + 3) & !3;
-        let config = Config {
-            n_elements: n_paths as u32,
-            n_pathseg: n_pathseg as u32,
-            width_in_tiles: width_in_tiles as u32,
-            height_in_tiles: height_in_tiles as u32,
-            tile_alloc: tile_base as u32,
-            bin_alloc: bin_base as u32,
-            ptcl_alloc: ptcl_base as u32,
-            pathseg_alloc: pathseg_base as u32,
-            anno_alloc: anno_base as u32,
-            trans_alloc: trans_base as u32,
-            n_trans: n_trans as u32,
-            // We'll fill the rest of the fields in when we hook up the new element pipeline.
-            ..Default::default()
-        };
+        config.width_in_tiles = width_in_tiles as u32;
+        config.height_in_tiles = height_in_tiles as u32;
+        config.tile_alloc = tile_base as u32;
+        config.bin_alloc = bin_base as u32;
+        config.ptcl_alloc = ptcl_base as u32;
         unsafe {
-            let scene = render_ctx.get_scene_buf();
-            self.n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
             // TODO: reallocate scene buffer if size is inadequate
-            assert!(self.scene_bufs[buf_ix].size() as usize >= scene.len());
-            self.scene_bufs[buf_ix].write(scene)?;
+            {
+                let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
+                render_ctx.write_scene(&mut mapped_scene);
+            }
             self.config_bufs[buf_ix].write(&[config])?;
             self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
 
@@ -355,11 +341,14 @@
         cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
         cmd_buf.reset_query_pool(&query_pool);
         cmd_buf.write_timestamp(&query_pool, 0);
-        cmd_buf.dispatch(
-            &self.el_pipeline,
-            &self.el_ds[buf_ix],
-            (((self.n_elements + 127) / 128) as u32, 1, 1),
-            (128, 1, 1),
+        self.element_stage.record(
+            cmd_buf,
+            &self.element_code,
+            &self.element_bindings[buf_ix],
+            self.n_transform as u64,
+            self.n_paths as u32,
+            self.n_pathseg as u32,
+            self.n_drawobj as u64,
         );
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index f050c76..5b10fec 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -1,5 +1,6 @@
 use std::borrow::Cow;
 
+use crate::stages::Config;
 use crate::MAX_BLEND_STACK;
 use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
 use piet::{
@@ -7,11 +8,9 @@
     StrokeStyle,
 };
 
+use piet_gpu_hal::BufWrite;
 use piet_gpu_types::encoder::{Encode, Encoder};
-use piet_gpu_types::scene::{
-    Clip, CubicSeg, Element, FillColor, FillLinGradient, LineSeg, QuadSeg, SetFillMode,
-    SetLineWidth, Transform,
-};
+use piet_gpu_types::scene::{Clip, Element, FillColor, FillLinGradient, SetFillMode, Transform};
 
 use crate::gradient::{LinearGradient, RampCache};
 use crate::text::Font;
@@ -40,6 +39,10 @@
     clip_stack: Vec<ClipElement>,
 
     ramp_cache: RampCache,
+
+    // Fields for new element processing pipeline below
+    // TODO: delete old encoder, rename
+    new_encoder: crate::encoder::Encoder,
 }
 
 #[derive(Clone)]
@@ -81,7 +84,7 @@
         let elements = Vec::new();
         let font = Font::new();
         let inner_text = PietGpuText::new(font);
-        let stroke_width = 0.0;
+        let stroke_width = -1.0;
         PietGpuRenderContext {
             encoder,
             elements,
@@ -95,9 +98,40 @@
             state_stack: Vec::new(),
             clip_stack: Vec::new(),
             ramp_cache: RampCache::default(),
+            new_encoder: crate::encoder::Encoder::new(),
         }
     }
 
+    pub fn stage_config(&self) -> (Config, usize) {
+        self.new_encoder.stage_config()
+    }
+
+    /// Number of draw objects.
+    ///
+    /// This is for the new element processing pipeline. It's not necessarily the
+    /// same as the number of paths (as in the old pipeline), but it might take a
+    /// while to sort that out.
+    pub fn n_drawobj(&self) -> usize {
+        self.new_encoder.n_drawobj()
+    }
+
+    /// Number of paths.
+    pub fn n_path(&self) -> u32 {
+        self.new_encoder.n_path()
+    }
+
+    pub fn n_pathseg(&self) -> u32 {
+        self.new_encoder.n_pathseg()
+    }
+
+    pub fn n_transform(&self) -> usize {
+        self.new_encoder.n_transform()
+    }
+
+    pub fn write_scene(&self, buf: &mut BufWrite) {
+        self.new_encoder.write_scene(buf);
+    }
+
     pub fn get_scene_buf(&mut self) -> &[u8] {
         const ALIGN: usize = 128;
         let padded_size = (self.elements.len() + (ALIGN - 1)) & ALIGN.wrapping_neg();
@@ -171,13 +205,7 @@
     fn clear(&mut self, _color: Color) {}
 
     fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
-        let width_f32 = width as f32;
-        if self.stroke_width != width_f32 {
-            self.elements
-                .push(Element::SetLineWidth(SetLineWidth { width: width_f32 }));
-            self.stroke_width = width_f32;
-        }
-        self.set_fill_mode(FillMode::Stroke);
+        self.encode_linewidth(width.abs() as f32);
         let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
         // Note: the bbox contribution of stroke becomes more complicated with miter joins.
         self.accumulate_bbox(|| shape.bounding_box() + Insets::uniform(width * 0.5));
@@ -201,7 +229,7 @@
         // Perhaps that should be added to kurbo.
         self.accumulate_bbox(|| shape.bounding_box());
         let path = shape.path_elements(TOLERANCE);
-        self.set_fill_mode(FillMode::Nonzero);
+        self.encode_linewidth(-1.0);
         self.encode_path(path, true);
         self.encode_brush(&brush);
     }
@@ -318,21 +346,6 @@
 }
 
 impl PietGpuRenderContext {
-    fn encode_line_seg(&mut self, seg: LineSeg) {
-        self.elements.push(Element::Line(seg));
-        self.pathseg_count += 1;
-    }
-
-    fn encode_quad_seg(&mut self, seg: QuadSeg) {
-        self.elements.push(Element::Quad(seg));
-        self.pathseg_count += 1;
-    }
-
-    fn encode_cubic_seg(&mut self, seg: CubicSeg) {
-        self.elements.push(Element::Cubic(seg));
-        self.pathseg_count += 1;
-    }
-
     fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
         if is_fill {
             self.encode_path_inner(
@@ -352,99 +365,34 @@
     }
 
     fn encode_path_inner(&mut self, path: impl Iterator<Item = PathEl>) {
-        let flatten = false;
-        if flatten {
-            let mut start_pt = None;
-            let mut last_pt = None;
-            piet::kurbo::flatten(path, TOLERANCE, |el| {
-                match el {
-                    PathEl::MoveTo(p) => {
-                        let scene_pt = to_f32_2(p);
-                        start_pt = Some(scene_pt);
-                        last_pt = Some(scene_pt);
-                    }
-                    PathEl::LineTo(p) => {
-                        let scene_pt = to_f32_2(p);
-                        let seg = LineSeg {
-                            p0: last_pt.unwrap(),
-                            p1: scene_pt,
-                        };
-                        self.encode_line_seg(seg);
-                        last_pt = Some(scene_pt);
-                    }
-                    PathEl::ClosePath => {
-                        if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                            if last != start {
-                                let seg = LineSeg {
-                                    p0: last,
-                                    p1: start,
-                                };
-                                self.encode_line_seg(seg);
-                            }
-                        }
-                    }
-                    _ => (),
+        let mut pe = self.new_encoder.path_encoder();
+        for el in path {
+            match el {
+                PathEl::MoveTo(p) => {
+                    let p = to_f32_2(p);
+                    pe.move_to(p[0], p[1]);
                 }
-                //println!("{:?}", el);
-            });
-        } else {
-            let mut start_pt = None;
-            let mut last_pt = None;
-            for el in path {
-                match el {
-                    PathEl::MoveTo(p) => {
-                        let scene_pt = to_f32_2(p);
-                        start_pt = Some(scene_pt);
-                        last_pt = Some(scene_pt);
-                    }
-                    PathEl::LineTo(p) => {
-                        let scene_pt = to_f32_2(p);
-                        let seg = LineSeg {
-                            p0: last_pt.unwrap(),
-                            p1: scene_pt,
-                        };
-                        self.encode_line_seg(seg);
-                        last_pt = Some(scene_pt);
-                    }
-                    PathEl::QuadTo(p1, p2) => {
-                        let scene_p1 = to_f32_2(p1);
-                        let scene_p2 = to_f32_2(p2);
-                        let seg = QuadSeg {
-                            p0: last_pt.unwrap(),
-                            p1: scene_p1,
-                            p2: scene_p2,
-                        };
-                        self.encode_quad_seg(seg);
-                        last_pt = Some(scene_p2);
-                    }
-                    PathEl::CurveTo(p1, p2, p3) => {
-                        let scene_p1 = to_f32_2(p1);
-                        let scene_p2 = to_f32_2(p2);
-                        let scene_p3 = to_f32_2(p3);
-                        let seg = CubicSeg {
-                            p0: last_pt.unwrap(),
-                            p1: scene_p1,
-                            p2: scene_p2,
-                            p3: scene_p3,
-                        };
-                        self.encode_cubic_seg(seg);
-                        last_pt = Some(scene_p3);
-                    }
-                    PathEl::ClosePath => {
-                        if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                            if last != start {
-                                let seg = LineSeg {
-                                    p0: last,
-                                    p1: start,
-                                };
-                                self.encode_line_seg(seg);
-                            }
-                        }
-                    }
+                PathEl::LineTo(p) => {
+                    let p = to_f32_2(p);
+                    pe.line_to(p[0], p[1]);
                 }
-                //println!("{:?}", el);
+                PathEl::QuadTo(p1, p2) => {
+                    let p1 = to_f32_2(p1);
+                    let p2 = to_f32_2(p2);
+                    pe.quad_to(p1[0], p1[1], p2[0], p2[1]);
+                }
+                PathEl::CurveTo(p1, p2, p3) => {
+                    let p1 = to_f32_2(p1);
+                    let p2 = to_f32_2(p2);
+                    let p3 = to_f32_2(p3);
+                    pe.cubic_to(p1[0], p1[1], p2[0], p2[1], p3[0], p3[1]);
+                }
+                PathEl::ClosePath => pe.close_path(),
             }
         }
+        pe.path();
+        let n_pathseg = pe.n_pathseg();
+        self.new_encoder.finish_path(n_pathseg);
     }
 
     fn pop_clip(&mut self) {
@@ -511,14 +459,17 @@
         self.trans_count += 1;
     }
 
+    fn encode_linewidth(&mut self, linewidth: f32) {
+        if self.stroke_width != linewidth {
+            self.new_encoder.linewidth(linewidth);
+            self.stroke_width = linewidth;
+        }
+    }
+
     fn encode_brush(&mut self, brush: &PietGpuBrush) {
         match brush {
             PietGpuBrush::Solid(rgba_color) => {
-                let fill = FillColor {
-                    rgba_color: *rgba_color,
-                };
-                self.elements.push(Element::FillColor(fill));
-                self.path_count += 1;
+                self.new_encoder.fill_color(*rgba_color);
             }
             PietGpuBrush::LinGradient(lin) => {
                 let fill_lin = FillLinGradient {
diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs
index f4a086c..1683cac 100644
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@@ -24,13 +24,14 @@
 
 pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage};
 pub use path::{PathBinding, PathCode, PathEncoder, PathStage};
+use piet_gpu_hal::{Buffer, CmdBuf, Session};
 pub use transform::{Transform, TransformBinding, TransformCode, TransformStage};
 
 /// The configuration block passed to piet-gpu shaders.
 ///
 /// Note: this should be kept in sync with the version in setup.h.
 #[repr(C)]
-#[derive(Clone, Copy, Default, Zeroable, Pod)]
+#[derive(Clone, Copy, Default, Debug, Zeroable, Pod)]
 pub struct Config {
     pub n_elements: u32, // paths
     pub n_pathseg: u32,
@@ -45,8 +46,111 @@
     pub bbox_alloc: u32,
     pub drawmonoid_alloc: u32,
     pub n_trans: u32,
+    pub n_path: u32,
     pub trans_offset: u32,
-    pub pathtag_offset: u32,
     pub linewidth_offset: u32,
+    pub pathtag_offset: u32,
     pub pathseg_offset: u32,
 }
+
+// The "element" stage combines a number of stages for parts of the pipeline.
+
+pub struct ElementCode {
+    transform_code: TransformCode,
+    path_code: PathCode,
+    draw_code: DrawCode,
+}
+
+pub struct ElementStage {
+    transform_stage: TransformStage,
+    path_stage: PathStage,
+    draw_stage: DrawStage,
+}
+
+pub struct ElementBinding {
+    transform_binding: TransformBinding,
+    path_binding: PathBinding,
+    draw_binding: DrawBinding,
+}
+
+impl ElementCode {
+    pub unsafe fn new(session: &Session) -> ElementCode {
+        ElementCode {
+            transform_code: TransformCode::new(session),
+            path_code: PathCode::new(session),
+            draw_code: DrawCode::new(session),
+        }
+    }
+}
+
+impl ElementStage {
+    pub unsafe fn new(session: &Session, code: &ElementCode) -> ElementStage {
+        ElementStage {
+            transform_stage: TransformStage::new(session, &code.transform_code),
+            path_stage: PathStage::new(session, &code.path_code),
+            draw_stage: DrawStage::new(session, &code.draw_code),
+        }
+    }
+
+    pub unsafe fn bind(
+        &self,
+        session: &Session,
+        code: &ElementCode,
+        config_buf: &Buffer,
+        scene_buf: &Buffer,
+        memory_buf: &Buffer,
+    ) -> ElementBinding {
+        ElementBinding {
+            transform_binding: self.transform_stage.bind(
+                session,
+                &code.transform_code,
+                config_buf,
+                scene_buf,
+                memory_buf,
+            ),
+            path_binding: self.path_stage.bind(
+                session,
+                &code.path_code,
+                config_buf,
+                scene_buf,
+                memory_buf,
+            ),
+            draw_binding: self.draw_stage.bind(
+                session,
+                &code.draw_code,
+                config_buf,
+                scene_buf,
+                memory_buf,
+            ),
+        }
+    }
+
+    pub unsafe fn record(
+        &self,
+        cmd_buf: &mut CmdBuf,
+        code: &ElementCode,
+        binding: &ElementBinding,
+        n_transform: u64,
+        n_paths: u32,
+        n_tags: u32,
+        n_drawobj: u64,
+    ) {
+        self.transform_stage.record(
+            cmd_buf,
+            &code.transform_code,
+            &binding.transform_binding,
+            n_transform,
+        );
+        // No memory barrier needed here; path has at least one before pathseg
+        self.path_stage.record(
+            cmd_buf,
+            &code.path_code,
+            &binding.path_binding,
+            n_paths,
+            n_tags,
+        );
+        // No memory barrier needed here; draw has at least one before draw_leaf
+        self.draw_stage
+            .record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj);
+    }
+}
diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs
index d50c6cb..da773cf 100644
--- a/piet-gpu/src/stages/draw.rs
+++ b/piet-gpu/src/stages/draw.rs
@@ -151,8 +151,8 @@
                 (1, 1, 1),
                 (DRAW_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
         }
+        cmd_buf.memory_barrier();
         cmd_buf.dispatch(
             &code.leaf_pipeline,
             &binding.leaf_ds,
diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs
index e233c65..c9d2c60 100644
--- a/piet-gpu/src/stages/path.rs
+++ b/piet-gpu/src/stages/path.rs
@@ -258,11 +258,11 @@
         self.n_pathseg += 1;
     }
 
-    pub fn quad_to(&mut self, x0: f32, y0: f32, x1: f32, y1: f32) {
+    pub fn quad_to(&mut self, x1: f32, y1: f32, x2: f32, y2: f32) {
         if self.state == State::Start {
             return;
         }
-        let buf = [x0, y0, x1, y1];
+        let buf = [x1, y1, x2, y2];
         let bytes = bytemuck::bytes_of(&buf);
         self.pathseg_stream.extend_from_slice(bytes);
         self.tag_stream.push(10);
@@ -270,11 +270,11 @@
         self.n_pathseg += 1;
     }
 
-    pub fn cubic_to(&mut self, x0: f32, y0: f32, x1: f32, y1: f32, x2: f32, y2: f32) {
+    pub fn cubic_to(&mut self, x1: f32, y1: f32, x2: f32, y2: f32, x3: f32, y3: f32) {
         if self.state == State::Start {
             return;
         }
-        let buf = [x0, y0, x1, y1, x2, y2];
+        let buf = [x1, y1, x2, y2, x3, y3];
         let bytes = bytemuck::bytes_of(&buf);
         self.pathseg_stream.extend_from_slice(bytes);
         self.tag_stream.push(11);
@@ -288,6 +288,7 @@
             State::MoveTo => {
                 let new_len = self.pathseg_stream.len() - 8;
                 self.pathseg_stream.truncate(new_len);
+                self.state = State::Start;
                 return;
             }
             State::NonemptySubpath => (),
@@ -333,7 +334,9 @@
     ///
     /// This is the number of path segments that will be written by the
     /// path stage; use this for allocating the output buffer.
-    pub fn n_pathseg(&self) -> u32 {
+    ///
+    /// Also note: it takes `self` for lifetime reasons.
+    pub fn n_pathseg(self) -> u32 {
         self.n_pathseg
     }
 }
diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs
index 4fb5e9f..4383c14 100644
--- a/piet-gpu/src/stages/transform.rs
+++ b/piet-gpu/src/stages/transform.rs
@@ -167,6 +167,11 @@
 }
 
 impl Transform {
+    pub const IDENTITY: Transform = Transform {
+        mat: [1.0, 0.0, 0.0, 1.0],
+        translate: [0.0, 0.0],
+    };
+
     pub fn from_kurbo(a: Affine) -> Transform {
         let c = a.as_coeffs();
         Transform {
diff --git a/tests/src/draw.rs b/tests/src/draw.rs
index ca19312..2447444 100644
--- a/tests/src/draw.rs
+++ b/tests/src/draw.rs
@@ -24,6 +24,7 @@
 use piet_gpu::stages::{self, DrawCode, DrawMonoid, DrawStage};
 
 const ELEMENT_SIZE: usize = 36;
+const ANNOTATED_SIZE: usize = 40;
 
 const ELEMENT_FILLCOLOR: u32 = 4;
 const ELEMENT_FILLLINGRADIENT: u32 = 5;
@@ -99,16 +100,18 @@
 
         // Layout of memory
         let drawmonoid_alloc = 0;
+        let anno_alloc = drawmonoid_alloc + 8 * n_tags;
         let stage_config = stages::Config {
             n_elements: n_tags as u32,
-            drawmonoid_alloc,
+            anno_alloc: anno_alloc as u32,
+            drawmonoid_alloc: drawmonoid_alloc as u32,
             ..Default::default()
         };
         stage_config
     }
 
     fn memory_size(&self) -> u64 {
-        8 + self.tags.len() as u64 * 8
+        (8 + self.tags.len() * (8 + ANNOTATED_SIZE)) as u64
     }
 
     fn fill_scene(&self, buf: &mut BufWrite) {
diff --git a/tests/src/path.rs b/tests/src/path.rs
index 948bd6f..7c5388f 100644
--- a/tests/src/path.rs
+++ b/tests/src/path.rs
@@ -19,7 +19,7 @@
 use crate::{Config, Runner, TestResult};
 
 use bytemuck::{Pod, Zeroable};
-use piet_gpu::stages::{self, PathCode, PathEncoder, PathStage};
+use piet_gpu::stages::{self, PathCode, PathEncoder, PathStage, Transform};
 use piet_gpu_hal::{BufWrite, BufferUsage};
 use rand::{prelude::ThreadRng, Rng};
 
@@ -55,6 +55,8 @@
     top: u32,
     right: u32,
     bottom: u32,
+    linewidth: f32,
+    trans_ix: u32,
 }
 
 pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult {
@@ -206,11 +208,11 @@
         let pathseg_alloc = trans_alloc + n_trans * 24;
         let bbox_alloc = pathseg_alloc + self.n_pathseg * PATHSEG_SIZE;
         let stage_config = stages::Config {
-            n_elements: self.n_path,
             pathseg_alloc,
             trans_alloc,
             bbox_alloc,
             n_trans,
+            n_path: self.n_path,
             pathtag_offset,
             linewidth_offset,
             pathseg_offset,
@@ -236,7 +238,7 @@
     fn memory_full_size(&self) -> u64 {
         let mut size = self.memory_init_size();
         size += (self.n_pathseg * PATHSEG_SIZE) as u64;
-        size += (self.n_path * 16) as u64;
+        size += (self.n_path * 24) as u64;
         size
     }
 
@@ -246,7 +248,7 @@
         let mem_error = 0u32;
         let mem_init = [mem_offset, mem_error];
         buf.push(mem_init);
-        let trans = [1.0f32, 0.0, 0.0, 1.0, 0.0, 0.0];
+        let trans = Transform::IDENTITY;
         buf.push(trans);
     }
 
@@ -274,17 +276,15 @@
         }
         let begin_bbox = 32 + PATHSEG_SIZE * self.n_pathseg;
         for i in 0..self.n_path {
-            let offset = (begin_bbox + 16 * i) as usize;
-            let actual = bytemuck::from_bytes::<Bbox>(&memory[offset..offset + 16]);
+            let offset = (begin_bbox + 24 * i) as usize;
+            let actual = bytemuck::from_bytes::<Bbox>(&memory[offset..offset + 24]);
             let expected_f32 = self.bbox[i as usize];
-            let expected = Bbox {
-                left: round_down(expected_f32.0),
-                top: round_down(expected_f32.1),
-                right: round_up(expected_f32.2),
-                bottom: round_up(expected_f32.3),
-            };
-            if expected != *actual {
-                println!("{}: {:?} {:?}", i, actual, expected);
+            if round_down(expected_f32.0) != actual.left
+                || round_down(expected_f32.1) != actual.top
+                || round_up(expected_f32.2) != actual.right
+                || round_up(expected_f32.3) != actual.bottom
+            {
+                println!("{}: {:?} {:?}", i, actual, expected_f32);
                 return Some(format!("bbox mismatch at {}", i));
             }
         }