Variable size encoding of draw objects

This patch switches to a variable size encoding of draw objects.

In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of "arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying).

Prefix sums, computed in `DrawMonoid` track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style).

This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken.

Closes #152
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 0c698b1..4a45d28 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -38,7 +38,6 @@
     Config conf;
 };
 
-#include "annotated.h"
 #include "tile.h"
 
 shared uint sh_row_count[BACKDROP_WG];
@@ -48,39 +47,29 @@
 void main() {
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
 
     // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
     bool mem_ok = mem_error == NO_ERROR;
     if (gl_LocalInvocationID.y == 0) {
         if (element_ix < conf.n_elements) {
-            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
-            switch (tag.tag) {
-            case Annotated_Image:
-            case Annotated_LinGradient:
-            case Annotated_BeginClip:
-            case Annotated_Color:
-                if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
-                    break;
-                }
-                // Fall through.
-                PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
-                Path path = Path_read(conf.tile_alloc, path_ref);
-                sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                row_count = path.bbox.w - path.bbox.y;
-                // Paths that don't cross tile top edges don't have backdrops.
-                // Don't apply the optimization to paths that may cross the y = 0
-                // top edge, but clipped to 1 row.
-                if (row_count == 1 && path.bbox.y > 0) {
-                    // Note: this can probably be expanded to width = 2 as
-                    // long as it doesn't cross the left edge.
-                    row_count = 0;
-                }
-                Alloc path_alloc = new_alloc(
-                    path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
-                sh_row_alloc[th_ix] = path_alloc;
+            // Possible TODO: it's not necessary to process backdrops of stroked paths.
+            // We had logic for that but took it out because it used the Annotated struct.
+            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
+            Path path = Path_read(conf.tile_alloc, path_ref);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            // Paths that don't cross tile top edges don't have backdrops.
+            // Don't apply the optimization to paths that may cross the y = 0
+            // top edge, but clipped to 1 row.
+            if (row_count == 1 && path.bbox.y > 0) {
+                // Note: this can probably be expanded to width = 2 as
+                // long as it doesn't cross the left edge.
+                row_count = 0;
             }
+            Alloc path_alloc = new_alloc(
+                path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
diff --git a/piet-gpu/shader/bbox_clear.comp b/piet-gpu/shader/bbox_clear.comp
index c609642..52577f9 100644
--- a/piet-gpu/shader/bbox_clear.comp
+++ b/piet-gpu/shader/bbox_clear.comp
@@ -20,7 +20,7 @@
 void main() {
     uint ix = gl_GlobalInvocationID.x;
     if (ix < conf.n_path) {
-        uint out_ix = (conf.bbox_alloc.offset >> 2) + 6 * ix;
+        uint out_ix = (conf.path_bbox_alloc.offset >> 2) + 6 * ix;
         memory[out_ix] = 0xffff;
         memory[out_ix + 1] = 0xffff;
         memory[out_ix + 2] = 0;
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index 2304ea2..9b04400 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -18,7 +18,6 @@
     Config conf;
 };
 
-#include "annotated.h"
 #include "bins.h"
 #include "drawtag.h"
 
@@ -37,10 +36,12 @@
 shared bool sh_alloc_failed;
 
 DrawMonoid load_draw_monoid(uint element_ix) {
-    uint base = (conf.drawmonoid_alloc.offset >> 2) + 2 * element_ix;
+    uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
     uint path_ix = memory[base];
     uint clip_ix = memory[base + 1];
-    return DrawMonoid(path_ix, clip_ix);
+    uint scene_offset = memory[base + 2];
+    uint info_offset = memory[base + 3];
+    return DrawMonoid(path_ix, clip_ix, scene_offset, info_offset);
 }
 
 // Load bounding box computed by clip processing
@@ -60,7 +61,7 @@
 
 // Load path's bbox from bbox (as written by pathseg).
 vec4 load_path_bbox(uint path_ix) {
-    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
     float bbox_l = float(memory[base]) - 32768.0;
     float bbox_t = float(memory[base + 1]) - 32768.0;
     float bbox_r = float(memory[base + 2]) - 32768.0;
@@ -69,16 +70,15 @@
     return bbox;
 }
 
-void store_path_bbox(AnnotatedRef ref, vec4 bbox) {
-    uint ix = ref.offset >> 2;
-    memory[ix + 1] = floatBitsToUint(bbox.x);
-    memory[ix + 2] = floatBitsToUint(bbox.y);
-    memory[ix + 3] = floatBitsToUint(bbox.z);
-    memory[ix + 4] = floatBitsToUint(bbox.w);
+void store_draw_bbox(uint draw_ix, vec4 bbox) {
+    uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
+    memory[base] = floatBitsToUint(bbox.x);
+    memory[base + 1] = floatBitsToUint(bbox.y);
+    memory[base + 2] = floatBitsToUint(bbox.z);
+    memory[base + 3] = floatBitsToUint(bbox.w);
 }
 
 void main() {
-    uint my_n_elements = conf.n_elements;
     uint my_partition = gl_WorkGroupID.x;
 
     for (uint i = 0; i < N_SLICE; i++) {
@@ -91,18 +91,8 @@
 
     // Read inputs and determine coverage of bins
     uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-    uint tag = Annotated_Nop;
-    if (element_ix < my_n_elements) {
-        tag = Annotated_tag(conf.anno_alloc, ref).tag;
-    }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    switch (tag) {
-    case Annotated_Color:
-    case Annotated_LinGradient:
-    case Annotated_Image:
-    case Annotated_BeginClip:
-    case Annotated_EndClip:
+    if (element_ix < conf.n_elements) {
         DrawMonoid draw_monoid = load_draw_monoid(element_ix);
         uint path_ix = draw_monoid.path_ix;
         vec4 clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9);
@@ -119,12 +109,11 @@
         // Avoid negative-size bbox (is this necessary)?
         bbox.zw = max(bbox.xy, bbox.zw);
         // Store clip-intersected bbox for tile_alloc.
-        store_path_bbox(ref, bbox);
+        store_draw_bbox(element_ix, bbox);
         x0 = int(floor(bbox.x * SX));
         y0 = int(floor(bbox.y * SY));
         x1 = int(ceil(bbox.z * SX));
         y1 = int(ceil(bbox.w * SY));
-        break;
     }
 
     // At this point, we run an iterator over the coverage area,
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index ac4f3d7..785e016 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -22,43 +22,43 @@
 rule msl
   command = $spirv_cross --msl $in --output $out $msl_flags
 
-build gen/binning.spv: glsl binning.comp | annotated.h bins.h drawtag.h setup.h mem.h
+build gen/binning.spv: glsl binning.comp | bins.h drawtag.h setup.h mem.h
 build gen/binning.hlsl: hlsl gen/binning.spv
 build gen/binning.dxil: dxil gen/binning.hlsl
 build gen/binning.msl: msl gen/binning.spv
 
-build gen/tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
+build gen/tile_alloc.spv: glsl tile_alloc.comp | drawtag.h tile.h setup.h mem.h
 build gen/tile_alloc.hlsl: hlsl gen/tile_alloc.spv
 build gen/tile_alloc.dxil: dxil gen/tile_alloc.hlsl
 build gen/tile_alloc.msl: msl gen/tile_alloc.spv
 
-build gen/path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
+build gen/path_coarse.spv: glsl path_coarse.comp | pathseg.h tile.h setup.h mem.h
 build gen/path_coarse.hlsl: hlsl gen/path_coarse.spv
 build gen/path_coarse.dxil: dxil gen/path_coarse.hlsl
 build gen/path_coarse.msl: msl gen/path_coarse.spv
 
-build gen/backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+build gen/backdrop.spv: glsl backdrop.comp | tile.h setup.h mem.h
 build gen/backdrop.hlsl: hlsl gen/backdrop.spv
 build gen/backdrop.dxil: dxil gen/backdrop.hlsl
 build gen/backdrop.msl: msl gen/backdrop.spv
 
-build gen/backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+build gen/backdrop_lg.spv: glsl backdrop.comp | tile.h setup.h mem.h
   flags = -DBACKDROP_DIST_FACTOR=4
 build gen/backdrop_lg.hlsl: hlsl gen/backdrop_lg.spv
 build gen/backdrop_lg.dxil: dxil gen/backdrop_lg.hlsl
 build gen/backdrop_lg.msl: msl gen/backdrop_lg.spv
 
-build gen/coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
+build gen/coarse.spv: glsl coarse.comp | drawtag.h bins.h ptcl.h setup.h mem.h
 build gen/coarse.hlsl: hlsl gen/coarse.spv
 build gen/coarse.dxil: dxil gen/coarse.hlsl
 build gen/coarse.msl: msl gen/coarse.spv
 
-build gen/kernel4.spv: glsl kernel4.comp | blend.h ptcl.h setup.h
+build gen/kernel4.spv: glsl kernel4.comp | blend.h ptcl.h setup.h mem.h
 build gen/kernel4.hlsl: hlsl gen/kernel4.spv
 build gen/kernel4.dxil: dxil gen/kernel4.hlsl
 build gen/kernel4.msl: msl gen/kernel4.spv
 
-build gen/kernel4_gray.spv: glsl kernel4.comp | ptcl.h setup.h
+build gen/kernel4_gray.spv: glsl kernel4.comp | ptcl.h setup.h mem.h
   flags = -DGRAY
 build gen/kernel4_gray.hlsl: hlsl gen/kernel4_gray.spv
 build gen/kernel4_gray.dxil: dxil gen/kernel4_gray.hlsl
@@ -114,17 +114,17 @@
 build gen/draw_root.dxil: dxil gen/draw_root.hlsl
 build gen/draw_root.msl: msl gen/draw_root.spv
 
-build gen/draw_leaf.spv: glsl draw_leaf.comp | blend.h scene.h drawtag.h annotated.h setup.h mem.h
+build gen/draw_leaf.spv: glsl draw_leaf.comp | blend.h scene.h drawtag.h setup.h mem.h
 build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
 build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
 build gen/draw_leaf.msl: msl gen/draw_leaf.spv
 
-build gen/clip_reduce.spv: glsl clip_reduce.comp | mem.h setup.h annotated.h
+build gen/clip_reduce.spv: glsl clip_reduce.comp | mem.h setup.h
 build gen/clip_reduce.hlsl: hlsl gen/clip_reduce.spv
 build gen/clip_reduce.dxil: dxil gen/clip_reduce.hlsl
 build gen/clip_reduce.msl: msl gen/clip_reduce.spv
 
-build gen/clip_leaf.spv: glsl clip_leaf.comp | mem.h setup.h annotated.h
+build gen/clip_leaf.spv: glsl clip_leaf.comp | mem.h setup.h
 build gen/clip_leaf.hlsl: hlsl gen/clip_leaf.spv
 build gen/clip_leaf.dxil: dxil gen/clip_leaf.hlsl
 build gen/clip_leaf.msl: msl gen/clip_leaf.spv
diff --git a/piet-gpu/shader/clip_leaf.comp b/piet-gpu/shader/clip_leaf.comp
index 5f7e79b..5353b0b 100644
--- a/piet-gpu/shader/clip_leaf.comp
+++ b/piet-gpu/shader/clip_leaf.comp
@@ -18,8 +18,6 @@
     Config conf;
 };
 
-#include "annotated.h"
-
 // Some of this is cut'n'paste duplication with the reduce pass, and
 // arguably should be moved to a common .h file.
 // The bicyclic monoid
@@ -43,7 +41,7 @@
 
 // Load path's bbox from bbox (as written by pathseg).
 vec4 load_path_bbox(uint path_ix) {
-    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
     float bbox_l = float(memory[base]) - 32768.0;
     float bbox_t = float(memory[base + 1]) - 32768.0;
     float bbox_r = float(memory[base + 2]) - 32768.0;
@@ -271,7 +269,7 @@
         // and is in the ClipEl for cross-partition.
         // If not, can probably get rid of it in the stack intermediate buf.
         path_ix = load_path_ix(parent);
-        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * ~inp;
+        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
         // Fix up drawmonoid so path_ix at EndClip matches BeginClip
         memory[drawmonoid_out_base] = path_ix;
 
diff --git a/piet-gpu/shader/clip_reduce.comp b/piet-gpu/shader/clip_reduce.comp
index c62b239..8b247ab 100644
--- a/piet-gpu/shader/clip_reduce.comp
+++ b/piet-gpu/shader/clip_reduce.comp
@@ -31,8 +31,6 @@
     Config conf;
 };
 
-#include "annotated.h"
-
 // The intermediate state for clip processing.
 struct ClipEl {
     // index of parent node
@@ -59,7 +57,7 @@
 
 // Load path's bbox from bbox (as written by pathseg).
 vec4 load_path_bbox(uint path_ix) {
-    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
     float bbox_l = float(memory[base]) - 32768.0;
     float bbox_t = float(memory[base + 1]) - 32768.0;
     float bbox_r = float(memory[base + 2]) - 32768.0;
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index df306e0..961fc99 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -19,11 +19,15 @@
 
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+layout(binding = 1) readonly buffer ConfigBuf {
     Config conf;
 };
 
-#include "annotated.h"
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "drawtag.h"
 #include "bins.h"
 #include "tile.h"
 #include "ptcl.h"
@@ -92,8 +96,8 @@
     return true;
 }
 
-void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) {
-    if (fill_mode_from_flags(flags) == MODE_NONZERO) {
+void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
+    if (linewidth < 0.0) {
         if (tile.tile.offset != 0) {
             CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
             Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
@@ -146,6 +150,10 @@
     uint part_start_ix = 0;
     uint ready_ix = 0;
 
+    uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
+    uint drawtag_start = conf.drawtag_offset >> 2;
+    uint drawdata_start = conf.drawdata_offset >> 2;
+    uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
     bool mem_ok = mem_error == NO_ERROR;
     while (true) {
         for (uint i = 0; i < N_SLICE; i++) {
@@ -207,24 +215,22 @@
         // We've done the merge and filled the buffer.
 
         // Read one element, compute coverage.
-        uint tag = Annotated_Nop;
+        uint tag = Drawtag_Nop;
         uint element_ix;
-        AnnotatedRef ref;
         if (th_ix + rd_ix < wr_ix) {
             element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-            tag = Annotated_tag(conf.anno_alloc, ref).tag;
+            tag = scene[drawtag_start + element_ix];
         }
 
         // Bounding box of element in pixel coordinates.
         uint tile_count;
         switch (tag) {
-        case Annotated_Color:
-        case Annotated_Image:
-        case Annotated_LinGradient:
-        case Annotated_BeginClip:
-        case Annotated_EndClip:
-            uint drawmonoid_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * element_ix;
+        case Drawtag_FillColor:
+        case Drawtag_FillImage:
+        case Drawtag_FillLinGradient:
+        case Drawtag_BeginClip:
+        case Drawtag_EndClip:
+            uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
             uint path_ix = memory[drawmonoid_base];
             Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
             uint stride = path.bbox.z - path.bbox.x;
@@ -272,9 +278,7 @@
                     el_ix = probe;
                 }
             }
-            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
-            AnnotatedTag anno_tag = Annotated_tag(conf.anno_alloc, ref);
-            uint tag = anno_tag.tag;
+            uint tag = scene[drawtag_start + sh_elements[el_ix]];
             uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + seq_ix % width;
@@ -283,15 +287,16 @@
             if (mem_ok) {
                 Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
                                       TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
-                bool is_clip = tag == Annotated_BeginClip || tag == Annotated_EndClip;
+                bool is_clip = (tag & 1) != 0;
                 // Always include the tile if it contains a path segment.
                 // For draws, include the tile if it is solid.
                 // For clips, include the tile if it is empty - this way, logic
                 // below will suppress the drawing of inner elements.
                 // For blends, include the tile if
                 // (blend_mode, composition_mode) != (Normal, SrcOver)
+                bool is_blend = false; // TODO
                 include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
-                    || (is_clip && (anno_tag.flags & 0x2) != 0);
+                    || (is_clip && is_blend);
             }
             if (include_tile) {
                 uint el_slice = el_ix / 32;
@@ -302,8 +307,8 @@
 
         barrier();
 
-        // Output non-segment elements for this tile. The thread does a sequential walk
-        // through the non-segment elements.
+        // Output draw objects for this tile. The thread does a sequential walk
+        // through the draw objects.
         uint slice_ix = 0;
         uint bitmap = sh_bitmaps[0][th_ix];
         while (mem_ok) {
@@ -323,59 +328,55 @@
             // Clear LSB
             bitmap &= bitmap - 1;
 
-            // At this point, we read the element again from global memory.
-            // If that turns out to be expensive, maybe we can pack it into
-            // shared memory (or perhaps just the tag).
-            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
+            uint drawtag = scene[drawtag_start + element_ix];
 
             if (clip_zero_depth == 0) {
-                switch (tag.tag) {
-                case Annotated_Color:
-                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                          TileRef(sh_tile_base[element_ref_ix] +
-                                                  (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
+                Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                                        TileRef(sh_tile_base[element_ref_ix] +
+                                                (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
+                uint scene_offset = memory[drawmonoid_base + 2];
+                uint info_offset = memory[drawmonoid_base + 3];
+                uint dd = drawdata_start + (scene_offset >> 2);
+                uint di = drawinfo_start + (info_offset >> 2);
+                switch (drawtag) {
+                case Drawtag_FillColor:
+                    float linewidth = uintBitsToFloat(memory[di]);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
-                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
+                    uint rgba = scene[dd];
+                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
                     cmd_ref.offset += 4 + CmdColor_size;
                     break;
-                case Annotated_LinGradient:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                     TileRef(sh_tile_base[element_ref_ix] +
-                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoLinGradient lin = Annotated_LinGradient_read(conf.anno_alloc, ref);
+                case Drawtag_FillLinGradient:
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
+                    linewidth = uintBitsToFloat(memory[di]);
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                     CmdLinGrad cmd_lin;
-                    cmd_lin.index = lin.index;
-                    cmd_lin.line_x = lin.line_x;
-                    cmd_lin.line_y = lin.line_y;
-                    cmd_lin.line_c = lin.line_c;
+                    cmd_lin.index = scene[dd];
+                    cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
+                    cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
+                    cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
                     Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
                     cmd_ref.offset += 4 + CmdLinGrad_size;
                     break;
-                case Annotated_Image:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                     TileRef(sh_tile_base[element_ref_ix] +
-                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
+                case Drawtag_FillImage:
+                    linewidth = uintBitsToFloat(memory[di]);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill_img.linewidth);
-                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
+                    uint index = scene[dd];
+                    uint raw1 = scene[dd + 1];
+                    ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
+                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
                     cmd_ref.offset += 4 + CmdImage_size;
                     break;
-                case Annotated_BeginClip:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                     TileRef(sh_tile_base[element_ref_ix] +
-                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                case Drawtag_BeginClip:
                     if (tile.tile.offset == 0 && tile.backdrop == 0) {
                         clip_zero_depth = clip_depth + 1;
                     } else {
@@ -387,27 +388,24 @@
                     }
                     clip_depth++;
                     break;
-                case Annotated_EndClip:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                     TileRef(sh_tile_base[element_ref_ix] +
-                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoEndClip end_clip = Annotated_EndClip_read(conf.anno_alloc, ref);
+                case Drawtag_EndClip:
                     clip_depth--;
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    write_fill(cmd_alloc, cmd_ref, MODE_NONZERO, tile, 0.0);
-                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(end_clip.blend));
+                    write_fill(cmd_alloc, cmd_ref, tile, -1.0);
+                    uint blend = scene[dd];
+                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
                     cmd_ref.offset += 4 + CmdEndClip_size;
                     break;
                 }
             } else {
                 // In "clip zero" state, suppress all drawing
-                switch (tag.tag) {
-                case Annotated_BeginClip:
+                switch (drawtag) {
+                case Drawtag_BeginClip:
                     clip_depth++;
                     break;
-                case Annotated_EndClip:
+                case Drawtag_EndClip:
                     if (clip_depth == clip_zero_depth) {
                         clip_zero_depth = 0;
                     }
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
index 74fc2f8..1cee0ef 100644
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -27,7 +27,6 @@
 #include "scene.h"
 #include "tile.h"
 #include "drawtag.h"
-#include "annotated.h"
 #include "blend.h"
 
 #define Monoid DrawMonoid
@@ -42,14 +41,14 @@
     Monoid local[N_ROWS];
 
     uint ix = gl_GlobalInvocationID.x * N_ROWS;
-    ElementRef ref = ElementRef(ix * Element_size);
-    uint tag_word = Element_tag(ref).tag;
+    uint drawtag_base = conf.drawtag_offset >> 2;
+    uint tag_word = scene[drawtag_base + ix];
 
     Monoid agg = map_tag(tag_word);
     local[0] = agg;
     for (uint i = 1; i < N_ROWS; i++) {
-        tag_word = Element_tag(Element_index(ref, i)).tag;
-        agg = combine_tag_monoid(agg, map_tag(tag_word));
+        tag_word = scene[drawtag_base + ix + i];
+        agg = combine_draw_monoid(agg, map_tag(tag_word));
         local[i] = agg;
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -57,41 +56,47 @@
         barrier();
         if (gl_LocalInvocationID.x >= (1u << i)) {
             Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
-            agg = combine_tag_monoid(other, agg);
+            agg = combine_draw_monoid(other, agg);
         }
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
 
     barrier();
-    Monoid row = tag_monoid_identity();
+    Monoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0) {
         row = parent[gl_WorkGroupID.x - 1];
     }
     if (gl_LocalInvocationID.x > 0) {
-        row = combine_tag_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
+        row = combine_draw_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
     }
+    uint drawdata_base = conf.drawdata_offset >> 2;
+    uint drawinfo_base = conf.drawinfo_alloc.offset >> 2;
     uint out_ix = gl_GlobalInvocationID.x * N_ROWS;
-    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 2;
+    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 4;
     uint clip_out_base = conf.clip_alloc.offset >> 2;
-    AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + out_ix * Annotated_size);
     for (uint i = 0; i < N_ROWS; i++) {
         Monoid m = row;
         if (i > 0) {
-            m = combine_tag_monoid(m, local[i - 1]);
+            m = combine_draw_monoid(m, local[i - 1]);
         }
         // m now holds exclusive scan of draw monoid
-        memory[out_base + i * 2] = m.path_ix;
-        memory[out_base + i * 2 + 1] = m.clip_ix;
+        memory[out_base + i * 4] = m.path_ix;
+        memory[out_base + i * 4 + 1] = m.clip_ix;
+        memory[out_base + i * 4 + 2] = m.scene_offset;
+        memory[out_base + i * 4 + 3] = m.info_offset;
+
+        // u32 offset of drawobj data
+        uint dd = drawdata_base + (m.scene_offset >> 2);
+        uint di = drawinfo_base + (m.info_offset >> 2);
 
         // For compatibility, we'll generate an Annotated object, same as old
         // pipeline. However, going forward we'll get rid of that, and have
         // later stages read scene + bbox etc.
-        ElementRef this_ref = Element_index(ref, i);
-        tag_word = Element_tag(this_ref).tag;
-        if (tag_word == Element_FillColor || tag_word == Element_FillLinGradient || tag_word == Element_FillImage ||
-            tag_word == Element_BeginClip) {
-            uint bbox_offset = (conf.bbox_alloc.offset >> 2) + 6 * m.path_ix;
+        tag_word = scene[drawtag_base + ix + i];
+        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillImage ||
+            tag_word == Drawtag_BeginClip) {
+            uint bbox_offset = (conf.path_bbox_alloc.offset >> 2) + 6 * m.path_ix;
             float bbox_l = float(memory[bbox_offset]) - 32768.0;
             float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
             float bbox_r = float(memory[bbox_offset + 2]) - 32768.0;
@@ -101,11 +106,11 @@
             uint fill_mode = uint(linewidth >= 0.0);
             vec4 mat;
             vec2 translate;
-            if (linewidth >= 0.0 || tag_word == Element_FillLinGradient) {
+            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient) {
                 uint trans_ix = memory[bbox_offset + 5];
                 uint t = (conf.trans_alloc.offset >> 2) + 6 * trans_ix;
                 mat = uintBitsToFloat(uvec4(memory[t], memory[t + 1], memory[t + 2], memory[t + 3]));
-                if (tag_word == Element_FillLinGradient) {
+                if (tag_word == Drawtag_FillLinGradient) {
                     translate = uintBitsToFloat(uvec2(memory[t + 4], memory[t + 5]));
                 }
             }
@@ -113,69 +118,38 @@
                 // TODO: need to deal with anisotropic case
                 linewidth *= sqrt(abs(mat.x * mat.w - mat.y * mat.z));
             }
-            linewidth = max(linewidth, 0.0);
             switch (tag_word) {
-            case Element_FillColor:
-                FillColor fill = Element_FillColor_read(this_ref);
-                AnnoColor anno_fill;
-                anno_fill.bbox = bbox;
-                anno_fill.linewidth = linewidth;
-                anno_fill.rgba_color = fill.rgba_color;
-                Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
+            case Drawtag_FillColor:
+            case Drawtag_FillImage:
+                memory[di] = floatBitsToUint(linewidth);
                 break;
-            case Element_FillLinGradient:
-                FillLinGradient lin = Element_FillLinGradient_read(this_ref);
-                AnnoLinGradient anno_lin;
-                anno_lin.bbox = bbox;
-                anno_lin.linewidth = linewidth;
-                anno_lin.index = lin.index;
-                vec2 p0 = mat.xy * lin.p0.x + mat.zw * lin.p0.y + translate;
-                vec2 p1 = mat.xy * lin.p1.x + mat.zw * lin.p1.y + translate;
+            case Drawtag_FillLinGradient:
+                memory[di] = floatBitsToUint(linewidth);
+                uint index = scene[dd];
+                vec2 p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
+                vec2 p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
+                p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
+                p1 = mat.xy * p1.x + mat.zw * p1.y + translate;
                 vec2 dxy = p1 - p0;
                 float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y);
                 float line_x = dxy.x * scale;
                 float line_y = dxy.y * scale;
-                anno_lin.line_x = line_x;
-                anno_lin.line_y = line_y;
-                anno_lin.line_c = -(p0.x * line_x + p0.y * line_y);
-                Annotated_LinGradient_write(conf.anno_alloc, out_ref, fill_mode, anno_lin);
+                float line_c = -(p0.x * line_x + p0.y * line_y);
+                memory[di + 1] = floatBitsToUint(line_x);
+                memory[di + 2] = floatBitsToUint(line_y);
+                memory[di + 3] = floatBitsToUint(line_c);
                 break;
-            case Element_FillImage:
-                FillImage fill_img = Element_FillImage_read(this_ref);
-                AnnoImage anno_img;
-                anno_img.bbox = bbox;
-                anno_img.linewidth = linewidth;
-                anno_img.index = fill_img.index;
-                anno_img.offset = fill_img.offset;
-                Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
-                break;
-            case Element_BeginClip:
-                Clip begin_clip = Element_BeginClip_read(this_ref);
-                AnnoBeginClip anno_begin_clip;
-                anno_begin_clip.bbox = bbox;
-                anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
-                anno_begin_clip.blend = begin_clip.blend;
-                uint flags = uint(begin_clip.blend != BlendComp_default) << 1;
-                Annotated_BeginClip_write(conf.anno_alloc, out_ref, flags, anno_begin_clip);
+            case Drawtag_BeginClip:
                 break;
             }
-        } else if (tag_word == Element_EndClip) {
-            Clip end_clip = Element_BeginClip_read(this_ref);
-            AnnoEndClip anno_end_clip;
-            // The actual bbox will be reconstructed from clip stream output.
-            anno_end_clip.bbox = vec4(-1e9, -1e9, 1e9, 1e9);
-            anno_end_clip.blend = end_clip.blend;
-            uint flags = uint(end_clip.blend != BlendComp_default) << 1;
-            Annotated_EndClip_write(conf.anno_alloc, out_ref, flags, anno_end_clip);
         }
         // Generate clip stream.
-        if (tag_word == Element_BeginClip || tag_word == Element_EndClip) {
+        if (tag_word == Drawtag_BeginClip || tag_word == Drawtag_EndClip) {
             uint path_ix = ~(out_ix + i);
-            if (tag_word == Element_BeginClip) {
+            if (tag_word == Drawtag_BeginClip) {
                 path_ix = m.path_ix;
             }
             memory[clip_out_base + m.clip_ix] = path_ix;
         }
-        out_ref.offset += Annotated_size;
     }
 }
diff --git a/piet-gpu/shader/draw_reduce.comp b/piet-gpu/shader/draw_reduce.comp
index 68d43e9..d125d6e 100644
--- a/piet-gpu/shader/draw_reduce.comp
+++ b/piet-gpu/shader/draw_reduce.comp
@@ -36,13 +36,13 @@
 
 void main() {
     uint ix = gl_GlobalInvocationID.x * N_ROWS;
-    ElementRef ref = ElementRef(ix * Element_size);
-    uint tag_word = Element_tag(ref).tag;
+    uint drawtag_base = conf.drawtag_offset >> 2;
+    uint tag_word = scene[drawtag_base + ix];
 
     Monoid agg = map_tag(tag_word);
     for (uint i = 1; i < N_ROWS; i++) {
-        tag_word = Element_tag(Element_index(ref, i)).tag;
-        agg = combine_tag_monoid(agg, map_tag(tag_word));
+        uint tag_word = scene[drawtag_base + ix + i];
+        agg = combine_draw_monoid(agg, map_tag(tag_word));
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
     for (uint i = 0; i < LG_WG_SIZE; i++) {
@@ -50,7 +50,7 @@
         // We could make this predicate tighter, but would it help?
         if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
             Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
-            agg = combine_tag_monoid(agg, other);
+            agg = combine_draw_monoid(agg, other);
         }
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
diff --git a/piet-gpu/shader/draw_scan.comp b/piet-gpu/shader/draw_scan.comp
index 1c26c26..d285020 100644
--- a/piet-gpu/shader/draw_scan.comp
+++ b/piet-gpu/shader/draw_scan.comp
@@ -16,8 +16,8 @@
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;
 
 #define Monoid DrawMonoid
-#define combine_monoid combine_tag_monoid
-#define monoid_identity tag_monoid_identity
+#define combine_monoid combine_draw_monoid
+#define monoid_identity draw_monoid_identity
 
 layout(binding = 0) buffer DataBuf {
     Monoid[] data;
diff --git a/piet-gpu/shader/drawtag.h b/piet-gpu/shader/drawtag.h
index 17105f6..7f73546 100644
--- a/piet-gpu/shader/drawtag.h
+++ b/piet-gpu/shader/drawtag.h
@@ -2,36 +2,39 @@
 
 // Common data structures and functions for the draw tag stream.
 
+// Design of draw tag: & 0x1c gives scene size in bytes
+// & 1 gives clip
+// (tag >> 4) & 0x1c is info size in bytes
+
+#define Drawtag_Nop 0
+#define Drawtag_FillColor 0x44
+#define Drawtag_FillLinGradient 0x114
+#define Drawtag_FillImage 0x48
+#define Drawtag_BeginClip 0x05
+#define Drawtag_EndClip 0x25
+
 struct DrawMonoid {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
-DrawMonoid tag_monoid_identity() {
-    return DrawMonoid(0, 0);
+DrawMonoid draw_monoid_identity() {
+    return DrawMonoid(0, 0, 0, 0);
 }
 
-DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b) {
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
-#ifdef Element_size
 DrawMonoid map_tag(uint tag_word) {
-    switch (tag_word) {
-    case Element_FillColor:
-    case Element_FillLinGradient:
-    case Element_FillImage:
-        return DrawMonoid(1, 0);
-    case Element_BeginClip:
-    // TODO: endclip should be (0, 1), ie not generate a path. But for now
-    // we generate a dummy path.
-    case Element_EndClip:
-        return DrawMonoid(1, 1);
-    default:
-        return DrawMonoid(0, 0);
-    }
+    // TODO: at some point, EndClip should not generate a path
+    uint has_path = uint(tag_word != Drawtag_Nop);
+    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x1c);
 }
-#endif
diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
index 4ebcb1c..df2be88 100644
--- a/piet-gpu/shader/gen/backdrop.dxil
+++ b/piet-gpu/shader/gen/backdrop.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl
index a2e71a8..aba3cff 100644
--- a/piet-gpu/shader/gen/backdrop.hlsl
+++ b/piet-gpu/shader/gen/backdrop.hlsl
@@ -3,17 +3,6 @@
     uint offset;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -42,12 +31,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -55,12 +46,14 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _79 : register(u0, space0);
-ByteAddressBuffer _186 : register(t1, space0);
+RWByteAddressBuffer _67 : register(u0, space0);
+ByteAddressBuffer _166 : register(t1, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -89,24 +82,10 @@
     {
         return 0u;
     }
-    uint v = _79.Load(offset * 4 + 8);
+    uint v = _67.Load(offset * 4 + 8);
     return v;
 }
 
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _121;
-}
-
-uint fill_mode_from_flags(uint flags)
-{
-    return flags & 1u;
-}
-
 Path Path_read(Alloc a, PathRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -121,8 +100,8 @@
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _165 = { raw2 };
-    s.tiles = _165;
+    TileRef _134 = { raw2 };
+    s.tiles = _134;
     return s;
 }
 
@@ -141,88 +120,65 @@
     {
         return;
     }
-    _79.Store(offset * 4 + 8, val);
+    _67.Store(offset * 4 + 8, val);
 }
 
 void comp_main()
 {
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _194;
     uint row_count = 0u;
-    bool mem_ok = _79.Load(4) == 0u;
+    bool mem_ok = _67.Load(4) == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _186.Load(0))
+        if (element_ix < _166.Load(0))
         {
-            Alloc _217;
-            _217.offset = _186.Load(32);
+            PathRef _180 = { _166.Load(16) + (element_ix * 12u) };
+            PathRef path_ref = _180;
+            Alloc _185;
+            _185.offset = _166.Load(16);
             Alloc param;
-            param.offset = _217.offset;
-            AnnotatedRef param_1 = ref;
-            AnnotatedTag tag = Annotated_tag(param, param_1);
-            switch (tag.tag)
+            param.offset = _185.offset;
+            PathRef param_1 = path_ref;
+            Path path = Path_read(param, param_1);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            bool _210 = row_count == 1u;
+            bool _216;
+            if (_210)
             {
-                case 3u:
-                case 2u:
-                case 4u:
-                case 1u:
-                {
-                    uint param_2 = tag.flags;
-                    if (fill_mode_from_flags(param_2) != 0u)
-                    {
-                        break;
-                    }
-                    PathRef _243 = { _186.Load(16) + (element_ix * 12u) };
-                    PathRef path_ref = _243;
-                    Alloc _247;
-                    _247.offset = _186.Load(16);
-                    Alloc param_3;
-                    param_3.offset = _247.offset;
-                    PathRef param_4 = path_ref;
-                    Path path = Path_read(param_3, param_4);
-                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                    row_count = path.bbox.w - path.bbox.y;
-                    bool _272 = row_count == 1u;
-                    bool _278;
-                    if (_272)
-                    {
-                        _278 = path.bbox.y > 0u;
-                    }
-                    else
-                    {
-                        _278 = _272;
-                    }
-                    if (_278)
-                    {
-                        row_count = 0u;
-                    }
-                    uint param_5 = path.tiles.offset;
-                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                    bool param_7 = mem_ok;
-                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
-                    sh_row_alloc[th_ix] = path_alloc;
-                    break;
-                }
+                _216 = path.bbox.y > 0u;
             }
+            else
+            {
+                _216 = _210;
+            }
+            if (_216)
+            {
+                row_count = 0u;
+            }
+            uint param_2 = path.tiles.offset;
+            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_4 = mem_ok;
+            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
     for (uint i = 0u; i < 8u; i++)
     {
         GroupMemoryBarrierWithGroupSync();
-        bool _325 = gl_LocalInvocationID.y == 0u;
-        bool _332;
-        if (_325)
+        bool _262 = gl_LocalInvocationID.y == 0u;
+        bool _269;
+        if (_262)
         {
-            _332 = th_ix >= (1u << i);
+            _269 = th_ix >= (1u << i);
         }
         else
         {
-            _332 = _325;
+            _269 = _262;
         }
-        if (_332)
+        if (_269)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -234,7 +190,7 @@
     }
     GroupMemoryBarrierWithGroupSync();
     uint total_rows = sh_row_count[255];
-    uint _411;
+    uint _348;
     for (uint row = th_ix; row < total_rows; row += 256u)
     {
         uint el_ix = 0u;
@@ -252,27 +208,27 @@
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _411 = sh_row_count[el_ix - 1u];
+                _348 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _411 = 0u;
+                _348 = 0u;
             }
-            uint seq_ix = row - _411;
+            uint seq_ix = row - _348;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_8 = tiles_alloc;
-            uint param_9 = tile_el_ix;
-            uint sum = read_mem(param_8, param_9);
+            Alloc param_5 = tiles_alloc;
+            uint param_6 = tile_el_ix;
+            uint sum = read_mem(param_5, param_6);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_10 = tiles_alloc;
-                uint param_11 = tile_el_ix;
-                sum += read_mem(param_10, param_11);
-                Alloc param_12 = tiles_alloc;
-                uint param_13 = tile_el_ix;
-                uint param_14 = sum;
-                write_mem(param_12, param_13, param_14);
+                Alloc param_7 = tiles_alloc;
+                uint param_8 = tile_el_ix;
+                sum += read_mem(param_7, param_8);
+                Alloc param_9 = tiles_alloc;
+                uint param_10 = tile_el_ix;
+                uint param_11 = sum;
+                write_mem(param_9, param_10, param_11);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl
index be670fc..1c0a0bb 100644
--- a/piet-gpu/shader/gen/backdrop.msl
+++ b/piet-gpu/shader/gen/backdrop.msl
@@ -10,17 +10,6 @@
     uint offset;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -61,12 +50,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -74,6 +65,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -90,7 +83,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -98,38 +91,23 @@
     {
         return 0u;
     }
-    uint v = v_79.memory[offset];
+    uint v = v_67.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_79);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-uint fill_mode_from_flags(thread const uint& flags)
-{
-    return flags & 1u;
-}
-
-static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_79);
+    uint raw0 = read_mem(param, param_1, v_67);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_79);
+    uint raw1 = read_mem(param_2, param_3, v_67);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_79);
+    uint raw2 = read_mem(param_4, param_5, v_67);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -145,7 +123,7 @@
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -153,85 +131,65 @@
     {
         return;
     }
-    v_79.memory[offset] = val;
+    v_67.memory[offset] = val;
 }
 
-kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_row_width[256];
     threadgroup Alloc sh_row_alloc[256];
     threadgroup uint sh_row_count[256];
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
     uint row_count = 0u;
-    bool mem_ok = v_79.mem_error == 0u;
+    bool mem_ok = v_67.mem_error == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _186.conf.n_elements)
+        if (element_ix < _166.conf.n_elements)
         {
+            PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) };
             Alloc param;
-            param.offset = _186.conf.anno_alloc.offset;
-            AnnotatedRef param_1 = ref;
-            AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
-            switch (tag.tag)
+            param.offset = _166.conf.tile_alloc.offset;
+            PathRef param_1 = path_ref;
+            Path path = Path_read(param, param_1, v_67);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            bool _210 = row_count == 1u;
+            bool _216;
+            if (_210)
             {
-                case 3u:
-                case 2u:
-                case 4u:
-                case 1u:
-                {
-                    uint param_2 = tag.flags;
-                    if (fill_mode_from_flags(param_2) != 0u)
-                    {
-                        break;
-                    }
-                    PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
-                    Alloc param_3;
-                    param_3.offset = _186.conf.tile_alloc.offset;
-                    PathRef param_4 = path_ref;
-                    Path path = Path_read(param_3, param_4, v_79);
-                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                    row_count = path.bbox.w - path.bbox.y;
-                    bool _272 = row_count == 1u;
-                    bool _278;
-                    if (_272)
-                    {
-                        _278 = path.bbox.y > 0u;
-                    }
-                    else
-                    {
-                        _278 = _272;
-                    }
-                    if (_278)
-                    {
-                        row_count = 0u;
-                    }
-                    uint param_5 = path.tiles.offset;
-                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                    bool param_7 = mem_ok;
-                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
-                    sh_row_alloc[th_ix] = path_alloc;
-                    break;
-                }
+                _216 = path.bbox.y > 0u;
             }
+            else
+            {
+                _216 = _210;
+            }
+            if (_216)
+            {
+                row_count = 0u;
+            }
+            uint param_2 = path.tiles.offset;
+            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_4 = mem_ok;
+            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
     for (uint i = 0u; i < 8u; i++)
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        bool _325 = gl_LocalInvocationID.y == 0u;
-        bool _332;
-        if (_325)
+        bool _262 = gl_LocalInvocationID.y == 0u;
+        bool _269;
+        if (_262)
         {
-            _332 = th_ix >= (1u << i);
+            _269 = th_ix >= (1u << i);
         }
         else
         {
-            _332 = _325;
+            _269 = _262;
         }
-        if (_332)
+        if (_269)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -243,7 +201,7 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint total_rows = sh_row_count[255];
-    uint _411;
+    uint _348;
     for (uint row = th_ix; row < total_rows; row += 256u)
     {
         uint el_ix = 0u;
@@ -261,27 +219,27 @@
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _411 = sh_row_count[el_ix - 1u];
+                _348 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _411 = 0u;
+                _348 = 0u;
             }
-            uint seq_ix = row - _411;
+            uint seq_ix = row - _348;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_8 = tiles_alloc;
-            uint param_9 = tile_el_ix;
-            uint sum = read_mem(param_8, param_9, v_79);
+            Alloc param_5 = tiles_alloc;
+            uint param_6 = tile_el_ix;
+            uint sum = read_mem(param_5, param_6, v_67);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_10 = tiles_alloc;
-                uint param_11 = tile_el_ix;
-                sum += read_mem(param_10, param_11, v_79);
-                Alloc param_12 = tiles_alloc;
-                uint param_13 = tile_el_ix;
-                uint param_14 = sum;
-                write_mem(param_12, param_13, param_14, v_79);
+                Alloc param_7 = tiles_alloc;
+                uint param_8 = tile_el_ix;
+                sum += read_mem(param_7, param_8, v_67);
+                Alloc param_9 = tiles_alloc;
+                uint param_10 = tile_el_ix;
+                uint param_11 = sum;
+                write_mem(param_9, param_10, param_11, v_67);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv
index f90bf6e..2bd17d8 100644
--- a/piet-gpu/shader/gen/backdrop.spv
+++ b/piet-gpu/shader/gen/backdrop.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
index e6b2f1a..81f9b65 100644
--- a/piet-gpu/shader/gen/backdrop_lg.dxil
+++ b/piet-gpu/shader/gen/backdrop_lg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl
index 5071af2..c506403 100644
--- a/piet-gpu/shader/gen/backdrop_lg.hlsl
+++ b/piet-gpu/shader/gen/backdrop_lg.hlsl
@@ -3,17 +3,6 @@
     uint offset;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -42,12 +31,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -55,12 +46,14 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u);
 
-RWByteAddressBuffer _79 : register(u0, space0);
-ByteAddressBuffer _186 : register(t1, space0);
+RWByteAddressBuffer _67 : register(u0, space0);
+ByteAddressBuffer _166 : register(t1, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -89,24 +82,10 @@
     {
         return 0u;
     }
-    uint v = _79.Load(offset * 4 + 8);
+    uint v = _67.Load(offset * 4 + 8);
     return v;
 }
 
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _121;
-}
-
-uint fill_mode_from_flags(uint flags)
-{
-    return flags & 1u;
-}
-
 Path Path_read(Alloc a, PathRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -121,8 +100,8 @@
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _165 = { raw2 };
-    s.tiles = _165;
+    TileRef _134 = { raw2 };
+    s.tiles = _134;
     return s;
 }
 
@@ -141,88 +120,65 @@
     {
         return;
     }
-    _79.Store(offset * 4 + 8, val);
+    _67.Store(offset * 4 + 8, val);
 }
 
 void comp_main()
 {
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _194;
     uint row_count = 0u;
-    bool mem_ok = _79.Load(4) == 0u;
+    bool mem_ok = _67.Load(4) == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _186.Load(0))
+        if (element_ix < _166.Load(0))
         {
-            Alloc _217;
-            _217.offset = _186.Load(32);
+            PathRef _180 = { _166.Load(16) + (element_ix * 12u) };
+            PathRef path_ref = _180;
+            Alloc _185;
+            _185.offset = _166.Load(16);
             Alloc param;
-            param.offset = _217.offset;
-            AnnotatedRef param_1 = ref;
-            AnnotatedTag tag = Annotated_tag(param, param_1);
-            switch (tag.tag)
+            param.offset = _185.offset;
+            PathRef param_1 = path_ref;
+            Path path = Path_read(param, param_1);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            bool _210 = row_count == 1u;
+            bool _216;
+            if (_210)
             {
-                case 3u:
-                case 2u:
-                case 4u:
-                case 1u:
-                {
-                    uint param_2 = tag.flags;
-                    if (fill_mode_from_flags(param_2) != 0u)
-                    {
-                        break;
-                    }
-                    PathRef _243 = { _186.Load(16) + (element_ix * 12u) };
-                    PathRef path_ref = _243;
-                    Alloc _247;
-                    _247.offset = _186.Load(16);
-                    Alloc param_3;
-                    param_3.offset = _247.offset;
-                    PathRef param_4 = path_ref;
-                    Path path = Path_read(param_3, param_4);
-                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                    row_count = path.bbox.w - path.bbox.y;
-                    bool _272 = row_count == 1u;
-                    bool _278;
-                    if (_272)
-                    {
-                        _278 = path.bbox.y > 0u;
-                    }
-                    else
-                    {
-                        _278 = _272;
-                    }
-                    if (_278)
-                    {
-                        row_count = 0u;
-                    }
-                    uint param_5 = path.tiles.offset;
-                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                    bool param_7 = mem_ok;
-                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
-                    sh_row_alloc[th_ix] = path_alloc;
-                    break;
-                }
+                _216 = path.bbox.y > 0u;
             }
+            else
+            {
+                _216 = _210;
+            }
+            if (_216)
+            {
+                row_count = 0u;
+            }
+            uint param_2 = path.tiles.offset;
+            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_4 = mem_ok;
+            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
     for (uint i = 0u; i < 8u; i++)
     {
         GroupMemoryBarrierWithGroupSync();
-        bool _325 = gl_LocalInvocationID.y == 0u;
-        bool _332;
-        if (_325)
+        bool _262 = gl_LocalInvocationID.y == 0u;
+        bool _269;
+        if (_262)
         {
-            _332 = th_ix >= (1u << i);
+            _269 = th_ix >= (1u << i);
         }
         else
         {
-            _332 = _325;
+            _269 = _262;
         }
-        if (_332)
+        if (_269)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -234,7 +190,7 @@
     }
     GroupMemoryBarrierWithGroupSync();
     uint total_rows = sh_row_count[255];
-    uint _411;
+    uint _348;
     for (uint row = th_ix; row < total_rows; row += 1024u)
     {
         uint el_ix = 0u;
@@ -252,27 +208,27 @@
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _411 = sh_row_count[el_ix - 1u];
+                _348 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _411 = 0u;
+                _348 = 0u;
             }
-            uint seq_ix = row - _411;
+            uint seq_ix = row - _348;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_8 = tiles_alloc;
-            uint param_9 = tile_el_ix;
-            uint sum = read_mem(param_8, param_9);
+            Alloc param_5 = tiles_alloc;
+            uint param_6 = tile_el_ix;
+            uint sum = read_mem(param_5, param_6);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_10 = tiles_alloc;
-                uint param_11 = tile_el_ix;
-                sum += read_mem(param_10, param_11);
-                Alloc param_12 = tiles_alloc;
-                uint param_13 = tile_el_ix;
-                uint param_14 = sum;
-                write_mem(param_12, param_13, param_14);
+                Alloc param_7 = tiles_alloc;
+                uint param_8 = tile_el_ix;
+                sum += read_mem(param_7, param_8);
+                Alloc param_9 = tiles_alloc;
+                uint param_10 = tile_el_ix;
+                uint param_11 = sum;
+                write_mem(param_9, param_10, param_11);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl
index 31cd6cd..de43ebe 100644
--- a/piet-gpu/shader/gen/backdrop_lg.msl
+++ b/piet-gpu/shader/gen/backdrop_lg.msl
@@ -10,17 +10,6 @@
     uint offset;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -61,12 +50,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -74,6 +65,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -90,7 +83,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -98,38 +91,23 @@
     {
         return 0u;
     }
-    uint v = v_79.memory[offset];
+    uint v = v_67.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_79);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-uint fill_mode_from_flags(thread const uint& flags)
-{
-    return flags & 1u;
-}
-
-static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_79);
+    uint raw0 = read_mem(param, param_1, v_67);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_79);
+    uint raw1 = read_mem(param_2, param_3, v_67);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_79);
+    uint raw2 = read_mem(param_4, param_5, v_67);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -145,7 +123,7 @@
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -153,85 +131,65 @@
     {
         return;
     }
-    v_79.memory[offset] = val;
+    v_67.memory[offset] = val;
 }
 
-kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_row_width[256];
     threadgroup Alloc sh_row_alloc[256];
     threadgroup uint sh_row_count[256];
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
     uint row_count = 0u;
-    bool mem_ok = v_79.mem_error == 0u;
+    bool mem_ok = v_67.mem_error == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _186.conf.n_elements)
+        if (element_ix < _166.conf.n_elements)
         {
+            PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) };
             Alloc param;
-            param.offset = _186.conf.anno_alloc.offset;
-            AnnotatedRef param_1 = ref;
-            AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
-            switch (tag.tag)
+            param.offset = _166.conf.tile_alloc.offset;
+            PathRef param_1 = path_ref;
+            Path path = Path_read(param, param_1, v_67);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            bool _210 = row_count == 1u;
+            bool _216;
+            if (_210)
             {
-                case 3u:
-                case 2u:
-                case 4u:
-                case 1u:
-                {
-                    uint param_2 = tag.flags;
-                    if (fill_mode_from_flags(param_2) != 0u)
-                    {
-                        break;
-                    }
-                    PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
-                    Alloc param_3;
-                    param_3.offset = _186.conf.tile_alloc.offset;
-                    PathRef param_4 = path_ref;
-                    Path path = Path_read(param_3, param_4, v_79);
-                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                    row_count = path.bbox.w - path.bbox.y;
-                    bool _272 = row_count == 1u;
-                    bool _278;
-                    if (_272)
-                    {
-                        _278 = path.bbox.y > 0u;
-                    }
-                    else
-                    {
-                        _278 = _272;
-                    }
-                    if (_278)
-                    {
-                        row_count = 0u;
-                    }
-                    uint param_5 = path.tiles.offset;
-                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                    bool param_7 = mem_ok;
-                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
-                    sh_row_alloc[th_ix] = path_alloc;
-                    break;
-                }
+                _216 = path.bbox.y > 0u;
             }
+            else
+            {
+                _216 = _210;
+            }
+            if (_216)
+            {
+                row_count = 0u;
+            }
+            uint param_2 = path.tiles.offset;
+            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_4 = mem_ok;
+            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
     for (uint i = 0u; i < 8u; i++)
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        bool _325 = gl_LocalInvocationID.y == 0u;
-        bool _332;
-        if (_325)
+        bool _262 = gl_LocalInvocationID.y == 0u;
+        bool _269;
+        if (_262)
         {
-            _332 = th_ix >= (1u << i);
+            _269 = th_ix >= (1u << i);
         }
         else
         {
-            _332 = _325;
+            _269 = _262;
         }
-        if (_332)
+        if (_269)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -243,7 +201,7 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint total_rows = sh_row_count[255];
-    uint _411;
+    uint _348;
     for (uint row = th_ix; row < total_rows; row += 1024u)
     {
         uint el_ix = 0u;
@@ -261,27 +219,27 @@
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _411 = sh_row_count[el_ix - 1u];
+                _348 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _411 = 0u;
+                _348 = 0u;
             }
-            uint seq_ix = row - _411;
+            uint seq_ix = row - _348;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_8 = tiles_alloc;
-            uint param_9 = tile_el_ix;
-            uint sum = read_mem(param_8, param_9, v_79);
+            Alloc param_5 = tiles_alloc;
+            uint param_6 = tile_el_ix;
+            uint sum = read_mem(param_5, param_6, v_67);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_10 = tiles_alloc;
-                uint param_11 = tile_el_ix;
-                sum += read_mem(param_10, param_11, v_79);
-                Alloc param_12 = tiles_alloc;
-                uint param_13 = tile_el_ix;
-                uint param_14 = sum;
-                write_mem(param_12, param_13, param_14, v_79);
+                Alloc param_7 = tiles_alloc;
+                uint param_8 = tile_el_ix;
+                sum += read_mem(param_7, param_8, v_67);
+                Alloc param_9 = tiles_alloc;
+                uint param_10 = tile_el_ix;
+                uint param_11 = sum;
+                write_mem(param_9, param_10, param_11, v_67);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv
index 3f90d2e..ff2b1d7 100644
--- a/piet-gpu/shader/gen/backdrop_lg.spv
+++ b/piet-gpu/shader/gen/backdrop_lg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
index 26b5f16..6b3efaf 100644
--- a/piet-gpu/shader/gen/bbox_clear.dxil
+++ b/piet-gpu/shader/gen/bbox_clear.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl
index 903d84c..8a884d3 100644
--- a/piet-gpu/shader/gen/bbox_clear.hlsl
+++ b/piet-gpu/shader/gen/bbox_clear.hlsl
@@ -15,12 +15,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -28,6 +30,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
@@ -44,7 +48,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x;
-    if (ix < _21.Load(68))
+    if (ix < _21.Load(76))
     {
         uint out_ix = (_21.Load(40) >> uint(2)) + (6u * ix);
         _45.Store(out_ix * 4 + 8, 65535u);
diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl
index e80f15e..c278c68 100644
--- a/piet-gpu/shader/gen/bbox_clear.msl
+++ b/piet-gpu/shader/gen/bbox_clear.msl
@@ -20,12 +20,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -33,6 +35,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -54,7 +58,7 @@
     uint ix = gl_GlobalInvocationID.x;
     if (ix < _21.conf.n_path)
     {
-        uint out_ix = (_21.conf.bbox_alloc.offset >> uint(2)) + (6u * ix);
+        uint out_ix = (_21.conf.path_bbox_alloc.offset >> uint(2)) + (6u * ix);
         _45.memory[out_ix] = 65535u;
         _45.memory[out_ix + 1u] = 65535u;
         _45.memory[out_ix + 2u] = 0u;
diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv
index e3e88d7..58a270e 100644
--- a/piet-gpu/shader/gen/bbox_clear.spv
+++ b/piet-gpu/shader/gen/bbox_clear.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil
index fe53f27..4a4f073 100644
--- a/piet-gpu/shader/gen/binning.dxil
+++ b/piet-gpu/shader/gen/binning.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl
index e4de1b9..986f42b 100644
--- a/piet-gpu/shader/gen/binning.hlsl
+++ b/piet-gpu/shader/gen/binning.hlsl
@@ -9,17 +9,6 @@
     bool failed;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -34,6 +23,8 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct Config
@@ -48,12 +39,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -61,12 +54,14 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _94 : register(u0, space0);
-ByteAddressBuffer _202 : register(t1, space0);
+RWByteAddressBuffer _81 : register(u0, space0);
+ByteAddressBuffer _156 : register(t1, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -81,59 +76,35 @@
 groupshared uint count[8][256];
 groupshared Alloc sh_chunk_alloc[256];
 
-bool touch_mem(Alloc alloc, uint offset)
-{
-    return true;
-}
-
-uint read_mem(Alloc alloc, uint offset)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return 0u;
-    }
-    uint v = _94.Load(offset * 4 + 8);
-    return v;
-}
-
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _181 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _181;
-}
-
 DrawMonoid load_draw_monoid(uint element_ix)
 {
-    uint base = (_202.Load(44) >> uint(2)) + (2u * element_ix);
-    uint path_ix = _94.Load(base * 4 + 8);
-    uint clip_ix = _94.Load((base + 1u) * 4 + 8);
-    DrawMonoid _222 = { path_ix, clip_ix };
-    return _222;
+    uint base = (_156.Load(44) >> uint(2)) + (4u * element_ix);
+    uint path_ix = _81.Load(base * 4 + 8);
+    uint clip_ix = _81.Load((base + 1u) * 4 + 8);
+    uint scene_offset = _81.Load((base + 2u) * 4 + 8);
+    uint info_offset = _81.Load((base + 3u) * 4 + 8);
+    DrawMonoid _190 = { path_ix, clip_ix, scene_offset, info_offset };
+    return _190;
 }
 
 float4 load_clip_bbox(uint clip_ix)
 {
-    uint base = (_202.Load(60) >> uint(2)) + (4u * clip_ix);
-    float x0 = asfloat(_94.Load(base * 4 + 8));
-    float y0 = asfloat(_94.Load((base + 1u) * 4 + 8));
-    float x1 = asfloat(_94.Load((base + 2u) * 4 + 8));
-    float y1 = asfloat(_94.Load((base + 3u) * 4 + 8));
+    uint base = (_156.Load(60) >> uint(2)) + (4u * clip_ix);
+    float x0 = asfloat(_81.Load(base * 4 + 8));
+    float y0 = asfloat(_81.Load((base + 1u) * 4 + 8));
+    float x1 = asfloat(_81.Load((base + 2u) * 4 + 8));
+    float y1 = asfloat(_81.Load((base + 3u) * 4 + 8));
     float4 bbox = float4(x0, y0, x1, y1);
     return bbox;
 }
 
 float4 load_path_bbox(uint path_ix)
 {
-    uint base = (_202.Load(40) >> uint(2)) + (6u * path_ix);
-    float bbox_l = float(_94.Load(base * 4 + 8)) - 32768.0f;
-    float bbox_t = float(_94.Load((base + 1u) * 4 + 8)) - 32768.0f;
-    float bbox_r = float(_94.Load((base + 2u) * 4 + 8)) - 32768.0f;
-    float bbox_b = float(_94.Load((base + 3u) * 4 + 8)) - 32768.0f;
+    uint base = (_156.Load(40) >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(_81.Load(base * 4 + 8)) - 32768.0f;
+    float bbox_t = float(_81.Load((base + 1u) * 4 + 8)) - 32768.0f;
+    float bbox_r = float(_81.Load((base + 2u) * 4 + 8)) - 32768.0f;
+    float bbox_b = float(_81.Load((base + 3u) * 4 + 8)) - 32768.0f;
     float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
     return bbox;
 }
@@ -143,13 +114,13 @@
     return float4(max(a.xy, b.xy), min(a.zw, b.zw));
 }
 
-void store_path_bbox(AnnotatedRef ref, float4 bbox)
+void store_draw_bbox(uint draw_ix, float4 bbox)
 {
-    uint ix = ref.offset >> uint(2);
-    _94.Store((ix + 1u) * 4 + 8, asuint(bbox.x));
-    _94.Store((ix + 2u) * 4 + 8, asuint(bbox.y));
-    _94.Store((ix + 3u) * 4 + 8, asuint(bbox.z));
-    _94.Store((ix + 4u) * 4 + 8, asuint(bbox.w));
+    uint base = (_156.Load(64) >> uint(2)) + (4u * draw_ix);
+    _81.Store(base * 4 + 8, asuint(bbox.x));
+    _81.Store((base + 1u) * 4 + 8, asuint(bbox.y));
+    _81.Store((base + 2u) * 4 + 8, asuint(bbox.z));
+    _81.Store((base + 3u) * 4 + 8, asuint(bbox.w));
 }
 
 Alloc new_alloc(uint offset, uint size, bool mem_ok)
@@ -161,27 +132,32 @@
 
 MallocResult malloc(uint size)
 {
-    uint _100;
-    _94.InterlockedAdd(0, size, _100);
-    uint offset = _100;
-    uint _107;
-    _94.GetDimensions(_107);
-    _107 = (_107 - 8) / 4;
+    uint _87;
+    _81.InterlockedAdd(0, size, _87);
+    uint offset = _87;
+    uint _94;
+    _81.GetDimensions(_94);
+    _94 = (_94 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_107) * 4);
+    r.failed = (offset + size) > uint(int(_94) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _129;
-        _94.InterlockedMax(4, 1u, _129);
+        uint _116;
+        _81.InterlockedMax(4, 1u, _116);
         return r;
     }
     return r;
 }
 
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
 void write_mem(Alloc alloc, uint offset, uint val)
 {
     Alloc param = alloc;
@@ -190,7 +166,7 @@
     {
         return;
     }
-    _94.Store(offset * 4 + 8, val);
+    _81.Store(offset * 4 + 8, val);
 }
 
 void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s)
@@ -204,7 +180,6 @@
 
 void comp_main()
 {
-    uint my_n_elements = _202.Load(0);
     uint my_partition = gl_WorkGroupID.x;
     for (uint i = 0u; i < 8u; i++)
     {
@@ -216,62 +191,42 @@
     }
     GroupMemoryBarrierWithGroupSync();
     uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
-    AnnotatedRef _415 = { _202.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _415;
-    uint tag = 0u;
-    if (element_ix < my_n_elements)
-    {
-        Alloc _425;
-        _425.offset = _202.Load(32);
-        Alloc param;
-        param.offset = _425.offset;
-        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1).tag;
-    }
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    switch (tag)
+    if (element_ix < _156.Load(0))
     {
-        case 1u:
-        case 2u:
-        case 3u:
-        case 4u:
-        case 5u:
+        uint param = element_ix;
+        DrawMonoid draw_monoid = load_draw_monoid(param);
+        uint path_ix = draw_monoid.path_ix;
+        float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
+        uint clip_ix = draw_monoid.clip_ix;
+        if (clip_ix > 0u)
         {
-            uint param_2 = element_ix;
-            DrawMonoid draw_monoid = load_draw_monoid(param_2);
-            uint path_ix = draw_monoid.path_ix;
-            float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
-            uint clip_ix = draw_monoid.clip_ix;
-            if (clip_ix > 0u)
-            {
-                uint param_3 = clip_ix - 1u;
-                clip_bbox = load_clip_bbox(param_3);
-            }
-            uint param_4 = path_ix;
-            float4 path_bbox = load_path_bbox(param_4);
-            float4 param_5 = path_bbox;
-            float4 param_6 = clip_bbox;
-            float4 bbox = bbox_intersect(param_5, param_6);
-            float4 _473 = bbox;
-            float4 _475 = bbox;
-            float2 _477 = max(_473.xy, _475.zw);
-            bbox.z = _477.x;
-            bbox.w = _477.y;
-            AnnotatedRef param_7 = ref;
-            float4 param_8 = bbox;
-            store_path_bbox(param_7, param_8);
-            x0 = int(floor(bbox.x * 0.00390625f));
-            y0 = int(floor(bbox.y * 0.00390625f));
-            x1 = int(ceil(bbox.z * 0.00390625f));
-            y1 = int(ceil(bbox.w * 0.00390625f));
-            break;
+            uint param_1 = clip_ix - 1u;
+            clip_bbox = load_clip_bbox(param_1);
         }
+        uint param_2 = path_ix;
+        float4 path_bbox = load_path_bbox(param_2);
+        float4 param_3 = path_bbox;
+        float4 param_4 = clip_bbox;
+        float4 bbox = bbox_intersect(param_3, param_4);
+        float4 _417 = bbox;
+        float4 _419 = bbox;
+        float2 _421 = max(_417.xy, _419.zw);
+        bbox.z = _421.x;
+        bbox.w = _421.y;
+        uint param_5 = element_ix;
+        float4 param_6 = bbox;
+        store_draw_bbox(param_5, param_6);
+        x0 = int(floor(bbox.x * 0.00390625f));
+        y0 = int(floor(bbox.y * 0.00390625f));
+        x1 = int(ceil(bbox.z * 0.00390625f));
+        y1 = int(ceil(bbox.w * 0.00390625f));
     }
-    uint width_in_bins = ((_202.Load(8) + 16u) - 1u) / 16u;
-    uint height_in_bins = ((_202.Load(12) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_156.Load(8) + 16u) - 1u) / 16u;
+    uint height_in_bins = ((_156.Load(12) + 16u) - 1u) / 16u;
     x0 = clamp(x0, 0, int(width_in_bins));
     x1 = clamp(x1, x0, int(width_in_bins));
     y0 = clamp(y0, 0, int(height_in_bins));
@@ -286,8 +241,8 @@
     uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
     while (y < y1)
     {
-        uint _581;
-        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _581);
+        uint _523;
+        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _523);
         x++;
         if (x == x1)
         {
@@ -302,15 +257,15 @@
         element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x])));
         count[i_1][gl_LocalInvocationID.x] = element_count;
     }
-    uint param_9 = 0u;
-    uint param_10 = 0u;
-    bool param_11 = true;
-    Alloc chunk_alloc = new_alloc(param_9, param_10, param_11);
+    uint param_7 = 0u;
+    uint param_8 = 0u;
+    bool param_9 = true;
+    Alloc chunk_alloc = new_alloc(param_7, param_8, param_9);
     if (element_count != 0u)
     {
-        uint param_12 = element_count * 4u;
-        MallocResult _631 = malloc(param_12);
-        MallocResult chunk = _631;
+        uint param_10 = element_count * 4u;
+        MallocResult _573 = malloc(param_10);
+        MallocResult chunk = _573;
         chunk_alloc = chunk.alloc;
         sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
         if (chunk.failed)
@@ -318,32 +273,32 @@
             sh_alloc_failed = true;
         }
     }
-    uint out_ix = (_202.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
-    Alloc _660;
-    _660.offset = _202.Load(20);
-    Alloc param_13;
-    param_13.offset = _660.offset;
-    uint param_14 = out_ix;
-    uint param_15 = element_count;
-    write_mem(param_13, param_14, param_15);
-    Alloc _672;
-    _672.offset = _202.Load(20);
-    Alloc param_16;
-    param_16.offset = _672.offset;
-    uint param_17 = out_ix + 1u;
-    uint param_18 = chunk_alloc.offset;
-    write_mem(param_16, param_17, param_18);
+    uint out_ix = (_156.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc _603;
+    _603.offset = _156.Load(20);
+    Alloc param_11;
+    param_11.offset = _603.offset;
+    uint param_12 = out_ix;
+    uint param_13 = element_count;
+    write_mem(param_11, param_12, param_13);
+    Alloc _615;
+    _615.offset = _156.Load(20);
+    Alloc param_14;
+    param_14.offset = _615.offset;
+    uint param_15 = out_ix + 1u;
+    uint param_16 = chunk_alloc.offset;
+    write_mem(param_14, param_15, param_16);
     GroupMemoryBarrierWithGroupSync();
-    bool _687;
+    bool _630;
     if (!sh_alloc_failed)
     {
-        _687 = _94.Load(4) != 0u;
+        _630 = _81.Load(4) != 0u;
     }
     else
     {
-        _687 = sh_alloc_failed;
+        _630 = sh_alloc_failed;
     }
-    if (_687)
+    if (_630)
     {
         return;
     }
@@ -362,12 +317,12 @@
             }
             Alloc out_alloc = sh_chunk_alloc[bin_ix];
             uint out_offset = out_alloc.offset + (idx * 4u);
-            BinInstanceRef _749 = { out_offset };
-            BinInstance _751 = { element_ix };
-            Alloc param_19 = out_alloc;
-            BinInstanceRef param_20 = _749;
-            BinInstance param_21 = _751;
-            BinInstance_write(param_19, param_20, param_21);
+            BinInstanceRef _692 = { out_offset };
+            BinInstance _694 = { element_ix };
+            Alloc param_17 = out_alloc;
+            BinInstanceRef param_18 = _692;
+            BinInstance param_19 = _694;
+            BinInstance_write(param_17, param_18, param_19);
         }
         x++;
         if (x == x1)
diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl
index 0e3b6c8..2ee5168 100644
--- a/piet-gpu/shader/gen/binning.msl
+++ b/piet-gpu/shader/gen/binning.msl
@@ -18,17 +18,6 @@
     bool failed;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -43,6 +32,8 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct Memory
@@ -69,12 +60,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -82,6 +75,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -92,62 +87,36 @@
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
 {
-    return true;
+    uint base = (v_156.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix);
+    uint path_ix = v_81.memory[base];
+    uint clip_ix = v_81.memory[base + 1u];
+    uint scene_offset = v_81.memory[base + 2u];
+    uint info_offset = v_81.memory[base + 3u];
+    return DrawMonoid{ path_ix, clip_ix, scene_offset, info_offset };
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_94, constant uint& v_94BufferSize)
+float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
 {
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return 0u;
-    }
-    uint v = v_94.memory[offset];
-    return v;
-}
-
-static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_94, constant uint& v_94BufferSize)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_94, v_94BufferSize);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
-{
-    uint base = (v_202.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
-    uint path_ix = v_94.memory[base];
-    uint clip_ix = v_94.memory[base + 1u];
-    return DrawMonoid{ path_ix, clip_ix };
-}
-
-static inline __attribute__((always_inline))
-float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
-{
-    uint base = (v_202.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix);
-    float x0 = as_type<float>(v_94.memory[base]);
-    float y0 = as_type<float>(v_94.memory[base + 1u]);
-    float x1 = as_type<float>(v_94.memory[base + 2u]);
-    float y1 = as_type<float>(v_94.memory[base + 3u]);
+    uint base = (v_156.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix);
+    float x0 = as_type<float>(v_81.memory[base]);
+    float y0 = as_type<float>(v_81.memory[base + 1u]);
+    float x1 = as_type<float>(v_81.memory[base + 2u]);
+    float y1 = as_type<float>(v_81.memory[base + 3u]);
     float4 bbox = float4(x0, y0, x1, y1);
     return bbox;
 }
 
 static inline __attribute__((always_inline))
-float4 load_path_bbox(thread const uint& path_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
+float4 load_path_bbox(thread const uint& path_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
 {
-    uint base = (v_202.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
-    float bbox_l = float(v_94.memory[base]) - 32768.0;
-    float bbox_t = float(v_94.memory[base + 1u]) - 32768.0;
-    float bbox_r = float(v_94.memory[base + 2u]) - 32768.0;
-    float bbox_b = float(v_94.memory[base + 3u]) - 32768.0;
+    uint base = (v_156.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(v_81.memory[base]) - 32768.0;
+    float bbox_t = float(v_81.memory[base + 1u]) - 32768.0;
+    float bbox_r = float(v_81.memory[base + 2u]) - 32768.0;
+    float bbox_b = float(v_81.memory[base + 3u]) - 32768.0;
     float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
     return bbox;
 }
@@ -159,13 +128,13 @@
 }
 
 static inline __attribute__((always_inline))
-void store_path_bbox(thread const AnnotatedRef& ref, thread const float4& bbox, device Memory& v_94, constant uint& v_94BufferSize)
+void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
 {
-    uint ix = ref.offset >> uint(2);
-    v_94.memory[ix + 1u] = as_type<uint>(bbox.x);
-    v_94.memory[ix + 2u] = as_type<uint>(bbox.y);
-    v_94.memory[ix + 3u] = as_type<uint>(bbox.z);
-    v_94.memory[ix + 4u] = as_type<uint>(bbox.w);
+    uint base = (v_156.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
+    v_81.memory[base] = as_type<uint>(bbox.x);
+    v_81.memory[base + 1u] = as_type<uint>(bbox.y);
+    v_81.memory[base + 2u] = as_type<uint>(bbox.z);
+    v_81.memory[base + 3u] = as_type<uint>(bbox.w);
 }
 
 static inline __attribute__((always_inline))
@@ -177,26 +146,32 @@
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_94, constant uint& v_94BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_81, constant uint& v_81BufferSize)
 {
-    uint _100 = atomic_fetch_add_explicit((device atomic_uint*)&v_94.mem_offset, size, memory_order_relaxed);
-    uint offset = _100;
+    uint _87 = atomic_fetch_add_explicit((device atomic_uint*)&v_81.mem_offset, size, memory_order_relaxed);
+    uint offset = _87;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_94BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_81BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _129 = atomic_fetch_max_explicit((device atomic_uint*)&v_94.mem_error, 1u, memory_order_relaxed);
+        uint _116 = atomic_fetch_max_explicit((device atomic_uint*)&v_81.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_94, constant uint& v_94BufferSize)
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_81, constant uint& v_81BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -204,27 +179,26 @@
     {
         return;
     }
-    v_94.memory[offset] = val;
+    v_81.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_94, constant uint& v_94BufferSize)
+void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_81, constant uint& v_81BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.element_ix;
-    write_mem(param, param_1, param_2, v_94, v_94BufferSize);
+    write_mem(param, param_1, param_2, v_81, v_81BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_94 [[buffer(0)]], const device ConfigBuf& v_202 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_81 [[buffer(0)]], const device ConfigBuf& v_156 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint bitmaps[8][256];
     threadgroup short sh_alloc_failed;
     threadgroup uint count[8][256];
     threadgroup Alloc sh_chunk_alloc[256];
-    constant uint& v_94BufferSize = spvBufferSizeConstants[0];
-    uint my_n_elements = v_202.conf.n_elements;
+    constant uint& v_81BufferSize = spvBufferSizeConstants[0];
     uint my_partition = gl_WorkGroupID.x;
     for (uint i = 0u; i < 8u; i++)
     {
@@ -236,59 +210,42 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef{ v_202.conf.anno_alloc.offset + (element_ix * 40u) };
-    uint tag = 0u;
-    if (element_ix < my_n_elements)
-    {
-        Alloc param;
-        param.offset = v_202.conf.anno_alloc.offset;
-        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1, v_94, v_94BufferSize).tag;
-    }
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    switch (tag)
+    if (element_ix < v_156.conf.n_elements)
     {
-        case 1u:
-        case 2u:
-        case 3u:
-        case 4u:
-        case 5u:
+        uint param = element_ix;
+        DrawMonoid draw_monoid = load_draw_monoid(param, v_81, v_81BufferSize, v_156);
+        uint path_ix = draw_monoid.path_ix;
+        float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
+        uint clip_ix = draw_monoid.clip_ix;
+        if (clip_ix > 0u)
         {
-            uint param_2 = element_ix;
-            DrawMonoid draw_monoid = load_draw_monoid(param_2, v_94, v_94BufferSize, v_202);
-            uint path_ix = draw_monoid.path_ix;
-            float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
-            uint clip_ix = draw_monoid.clip_ix;
-            if (clip_ix > 0u)
-            {
-                uint param_3 = clip_ix - 1u;
-                clip_bbox = load_clip_bbox(param_3, v_94, v_94BufferSize, v_202);
-            }
-            uint param_4 = path_ix;
-            float4 path_bbox = load_path_bbox(param_4, v_94, v_94BufferSize, v_202);
-            float4 param_5 = path_bbox;
-            float4 param_6 = clip_bbox;
-            float4 bbox = bbox_intersect(param_5, param_6);
-            float4 _473 = bbox;
-            float4 _475 = bbox;
-            float2 _477 = fast::max(_473.xy, _475.zw);
-            bbox.z = _477.x;
-            bbox.w = _477.y;
-            AnnotatedRef param_7 = ref;
-            float4 param_8 = bbox;
-            store_path_bbox(param_7, param_8, v_94, v_94BufferSize);
-            x0 = int(floor(bbox.x * 0.00390625));
-            y0 = int(floor(bbox.y * 0.00390625));
-            x1 = int(ceil(bbox.z * 0.00390625));
-            y1 = int(ceil(bbox.w * 0.00390625));
-            break;
+            uint param_1 = clip_ix - 1u;
+            clip_bbox = load_clip_bbox(param_1, v_81, v_81BufferSize, v_156);
         }
+        uint param_2 = path_ix;
+        float4 path_bbox = load_path_bbox(param_2, v_81, v_81BufferSize, v_156);
+        float4 param_3 = path_bbox;
+        float4 param_4 = clip_bbox;
+        float4 bbox = bbox_intersect(param_3, param_4);
+        float4 _417 = bbox;
+        float4 _419 = bbox;
+        float2 _421 = fast::max(_417.xy, _419.zw);
+        bbox.z = _421.x;
+        bbox.w = _421.y;
+        uint param_5 = element_ix;
+        float4 param_6 = bbox;
+        store_draw_bbox(param_5, param_6, v_81, v_81BufferSize, v_156);
+        x0 = int(floor(bbox.x * 0.00390625));
+        y0 = int(floor(bbox.y * 0.00390625));
+        x1 = int(ceil(bbox.z * 0.00390625));
+        y1 = int(ceil(bbox.w * 0.00390625));
     }
-    uint width_in_bins = ((v_202.conf.width_in_tiles + 16u) - 1u) / 16u;
-    uint height_in_bins = ((v_202.conf.height_in_tiles + 16u) - 1u) / 16u;
+    uint width_in_bins = ((v_156.conf.width_in_tiles + 16u) - 1u) / 16u;
+    uint height_in_bins = ((v_156.conf.height_in_tiles + 16u) - 1u) / 16u;
     x0 = clamp(x0, 0, int(width_in_bins));
     x1 = clamp(x1, x0, int(width_in_bins));
     y0 = clamp(y0, 0, int(height_in_bins));
@@ -303,7 +260,7 @@
     uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
     while (y < y1)
     {
-        uint _581 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
+        uint _523 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
         x++;
         if (x == x1)
         {
@@ -318,15 +275,15 @@
         element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
         count[i_1][gl_LocalInvocationID.x] = element_count;
     }
-    uint param_9 = 0u;
-    uint param_10 = 0u;
-    bool param_11 = true;
-    Alloc chunk_alloc = new_alloc(param_9, param_10, param_11);
+    uint param_7 = 0u;
+    uint param_8 = 0u;
+    bool param_9 = true;
+    Alloc chunk_alloc = new_alloc(param_7, param_8, param_9);
     if (element_count != 0u)
     {
-        uint param_12 = element_count * 4u;
-        MallocResult _631 = malloc(param_12, v_94, v_94BufferSize);
-        MallocResult chunk = _631;
+        uint param_10 = element_count * 4u;
+        MallocResult _573 = malloc(param_10, v_81, v_81BufferSize);
+        MallocResult chunk = _573;
         chunk_alloc = chunk.alloc;
         sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
         if (chunk.failed)
@@ -334,28 +291,28 @@
             sh_alloc_failed = short(true);
         }
     }
-    uint out_ix = (v_202.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
-    Alloc param_13;
-    param_13.offset = v_202.conf.bin_alloc.offset;
-    uint param_14 = out_ix;
-    uint param_15 = element_count;
-    write_mem(param_13, param_14, param_15, v_94, v_94BufferSize);
-    Alloc param_16;
-    param_16.offset = v_202.conf.bin_alloc.offset;
-    uint param_17 = out_ix + 1u;
-    uint param_18 = chunk_alloc.offset;
-    write_mem(param_16, param_17, param_18, v_94, v_94BufferSize);
+    uint out_ix = (v_156.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc param_11;
+    param_11.offset = v_156.conf.bin_alloc.offset;
+    uint param_12 = out_ix;
+    uint param_13 = element_count;
+    write_mem(param_11, param_12, param_13, v_81, v_81BufferSize);
+    Alloc param_14;
+    param_14.offset = v_156.conf.bin_alloc.offset;
+    uint param_15 = out_ix + 1u;
+    uint param_16 = chunk_alloc.offset;
+    write_mem(param_14, param_15, param_16, v_81, v_81BufferSize);
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    bool _687;
+    bool _630;
     if (!bool(sh_alloc_failed))
     {
-        _687 = v_94.mem_error != 0u;
+        _630 = v_81.mem_error != 0u;
     }
     else
     {
-        _687 = bool(sh_alloc_failed);
+        _630 = bool(sh_alloc_failed);
     }
-    if (_687)
+    if (_630)
     {
         return;
     }
@@ -374,10 +331,10 @@
             }
             Alloc out_alloc = sh_chunk_alloc[bin_ix];
             uint out_offset = out_alloc.offset + (idx * 4u);
-            Alloc param_19 = out_alloc;
-            BinInstanceRef param_20 = BinInstanceRef{ out_offset };
-            BinInstance param_21 = BinInstance{ element_ix };
-            BinInstance_write(param_19, param_20, param_21, v_94, v_94BufferSize);
+            Alloc param_17 = out_alloc;
+            BinInstanceRef param_18 = BinInstanceRef{ out_offset };
+            BinInstance param_19 = BinInstance{ element_ix };
+            BinInstance_write(param_17, param_18, param_19, v_81, v_81BufferSize);
         }
         x++;
         if (x == x1)
diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv
index eca0692..30eacd6 100644
--- a/piet-gpu/shader/gen/binning.spv
+++ b/piet-gpu/shader/gen/binning.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil
index fe87a11..b681a65 100644
--- a/piet-gpu/shader/gen/clip_leaf.dxil
+++ b/piet-gpu/shader/gen/clip_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_leaf.hlsl b/piet-gpu/shader/gen/clip_leaf.hlsl
index d570420..ed45bf1 100644
--- a/piet-gpu/shader/gen/clip_leaf.hlsl
+++ b/piet-gpu/shader/gen/clip_leaf.hlsl
@@ -27,12 +27,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -40,6 +42,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
@@ -99,7 +103,7 @@
 
 uint load_path_ix(uint ix)
 {
-    if (ix < _80.Load(72))
+    if (ix < _80.Load(80))
     {
         return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 8);
     }
@@ -324,7 +328,7 @@
     bool _725;
     if (_717)
     {
-        _725 = gl_GlobalInvocationID.x < _80.Load(72);
+        _725 = gl_GlobalInvocationID.x < _80.Load(80);
     }
     else
     {
@@ -334,7 +338,7 @@
     {
         uint param_15 = parent;
         path_ix = load_path_ix(param_15);
-        uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (2u * (~inp));
+        uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (4u * (~inp));
         _96.Store(drawmonoid_out_base * 4 + 8, path_ix);
         if (int(grandparent) >= 0)
         {
diff --git a/piet-gpu/shader/gen/clip_leaf.msl b/piet-gpu/shader/gen/clip_leaf.msl
index 4e2d059..5f5e0a7 100644
--- a/piet-gpu/shader/gen/clip_leaf.msl
+++ b/piet-gpu/shader/gen/clip_leaf.msl
@@ -34,12 +34,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -47,6 +49,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -112,7 +116,7 @@
 static inline __attribute__((always_inline))
 float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96)
 {
-    uint base = (v_80.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
+    uint base = (v_80.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
     float bbox_l = float(v_96.memory[base]) - 32768.0;
     float bbox_t = float(v_96.memory[base + 1u]) - 32768.0;
     float bbox_r = float(v_96.memory[base + 2u]) - 32768.0;
@@ -341,7 +345,7 @@
     {
         uint param_15 = parent;
         path_ix = load_path_ix(param_15, v_80, v_96);
-        uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * (~inp));
+        uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * (~inp));
         v_96.memory[drawmonoid_out_base] = path_ix;
         if (int(grandparent) >= 0)
         {
diff --git a/piet-gpu/shader/gen/clip_leaf.spv b/piet-gpu/shader/gen/clip_leaf.spv
index 7c4c174..beac64b 100644
--- a/piet-gpu/shader/gen/clip_leaf.spv
+++ b/piet-gpu/shader/gen/clip_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil
index 0dff71b..0ccaac9 100644
--- a/piet-gpu/shader/gen/clip_reduce.dxil
+++ b/piet-gpu/shader/gen/clip_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_reduce.hlsl b/piet-gpu/shader/gen/clip_reduce.hlsl
index 6851e63..1276b5f 100644
--- a/piet-gpu/shader/gen/clip_reduce.hlsl
+++ b/piet-gpu/shader/gen/clip_reduce.hlsl
@@ -27,12 +27,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -40,6 +42,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
diff --git a/piet-gpu/shader/gen/clip_reduce.msl b/piet-gpu/shader/gen/clip_reduce.msl
index 5845676..26214f1 100644
--- a/piet-gpu/shader/gen/clip_reduce.msl
+++ b/piet-gpu/shader/gen/clip_reduce.msl
@@ -34,12 +34,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -47,6 +49,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -81,7 +85,7 @@
 static inline __attribute__((always_inline))
 float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_64, device Memory& v_80)
 {
-    uint base = (v_64.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
+    uint base = (v_64.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
     float bbox_l = float(v_80.memory[base]) - 32768.0;
     float bbox_t = float(v_80.memory[base + 1u]) - 32768.0;
     float bbox_r = float(v_80.memory[base + 2u]) - 32768.0;
diff --git a/piet-gpu/shader/gen/clip_reduce.spv b/piet-gpu/shader/gen/clip_reduce.spv
index cbe8c1f..ce0b9bb 100644
--- a/piet-gpu/shader/gen/clip_reduce.spv
+++ b/piet-gpu/shader/gen/clip_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index c7e1682..2339e7d 100644
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index f00eabe..db1e496 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -9,68 +9,6 @@
     bool failed;
 };
 
-struct AnnoImageRef
-{
-    uint offset;
-};
-
-struct AnnoImage
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    int2 offset;
-};
-
-struct AnnoColorRef
-{
-    uint offset;
-};
-
-struct AnnoColor
-{
-    float4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-struct AnnoLinGradientRef
-{
-    uint offset;
-};
-
-struct AnnoLinGradient
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    float line_x;
-    float line_y;
-    float line_c;
-};
-
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -201,12 +139,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -214,12 +154,15 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _308 : register(u0, space0);
-ByteAddressBuffer _1283 : register(t1, space0);
+RWByteAddressBuffer _242 : register(u0, space0);
+ByteAddressBuffer _854 : register(t1, space0);
+ByteAddressBuffer _1222 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -242,8 +185,8 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _385 = { a.offset + offset };
-    return _385;
+    Alloc _319 = { a.offset + offset };
+    return _319;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -259,7 +202,7 @@
     {
         return 0u;
     }
-    uint v = _308.Load(offset * 4 + 8);
+    uint v = _242.Load(offset * 4 + 8);
     return v;
 }
 
@@ -272,8 +215,8 @@
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _765 = { ref.offset + (index * 4u) };
-    return _765;
+    BinInstanceRef _328 = { ref.offset + (index * 4u) };
+    return _328;
 }
 
 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@@ -287,15 +230,6 @@
     return s;
 }
 
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _717 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _717;
-}
-
 Path Path_read(Alloc a, PathRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -310,8 +244,8 @@
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _825 = { raw2 };
-    s.tiles = _825;
+    TileRef _391 = { raw2 };
+    s.tiles = _391;
     return s;
 }
 
@@ -321,11 +255,11 @@
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok)
 {
-    uint _1169;
-    _308.GetDimensions(_1169);
-    _1169 = (_1169 - 8) / 4;
+    uint _741;
+    _242.GetDimensions(_741);
+    _741 = (_741 - 8) / 4;
     uint param = 0u;
-    uint param_1 = uint(int(_1169) * 4);
+    uint param_1 = uint(int(_741) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
@@ -339,67 +273,31 @@
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
     uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _850 = { raw0 };
+    TileSegRef _416 = { raw0 };
     Tile s;
-    s.tile = _850;
+    s.tile = _416;
     s.backdrop = int(raw1);
     return s;
 }
 
-AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11);
-    AnnoColor s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.linewidth = asfloat(raw4);
-    s.rgba_color = raw5;
-    return s;
-}
-
-AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoColorRef _723 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoColorRef param_1 = _723;
-    return AnnoColor_read(param, param_1);
-}
-
 MallocResult malloc(uint size)
 {
-    uint _314;
-    _308.InterlockedAdd(0, size, _314);
-    uint offset = _314;
-    uint _321;
-    _308.GetDimensions(_321);
-    _321 = (_321 - 8) / 4;
+    uint _248;
+    _242.InterlockedAdd(0, size, _248);
+    uint offset = _248;
+    uint _255;
+    _242.GetDimensions(_255);
+    _255 = (_255 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_321) * 4);
+    r.failed = (offset + size) > uint(int(_255) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _343;
-        _308.InterlockedMax(4, 1u, _343);
+        uint _277;
+        _242.InterlockedMax(4, 1u, _277);
         return r;
     }
     return r;
@@ -413,7 +311,7 @@
     {
         return;
     }
-    _308.Store(offset * 4 + 8, val);
+    _242.Store(offset * 4 + 8, val);
 }
 
 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@@ -431,9 +329,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
     write_mem(param, param_1, param_2);
-    CmdJumpRef _1162 = { ref.offset + 4u };
+    CmdJumpRef _734 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdJumpRef param_4 = _1162;
+    CmdJumpRef param_4 = _734;
     CmdJump param_5 = s;
     CmdJump_write(param_3, param_4, param_5);
 }
@@ -445,30 +343,25 @@
         return true;
     }
     uint param = 1024u;
-    MallocResult _1190 = malloc(param);
-    MallocResult new_cmd = _1190;
+    MallocResult _762 = malloc(param);
+    MallocResult new_cmd = _762;
     if (new_cmd.failed)
     {
         return false;
     }
-    CmdJump _1200 = { new_cmd.alloc.offset };
-    CmdJump jump = _1200;
+    CmdJump _772 = { new_cmd.alloc.offset };
+    CmdJump jump = _772;
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
     Cmd_Jump_write(param_1, param_2, param_3);
     cmd_alloc = new_cmd.alloc;
-    CmdRef _1212 = { cmd_alloc.offset };
-    cmd_ref = _1212;
+    CmdRef _784 = { cmd_alloc.offset };
+    cmd_ref = _784;
     cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
     return true;
 }
 
-uint fill_mode_from_flags(uint flags)
-{
-    return flags & 1u;
-}
-
 void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s)
 {
     uint ix = ref.offset >> uint(2);
@@ -488,9 +381,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
     write_mem(param, param_1, param_2);
-    CmdFillRef _1036 = { ref.offset + 4u };
+    CmdFillRef _604 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdFillRef param_4 = _1036;
+    CmdFillRef param_4 = _604;
     CmdFill param_5 = s;
     CmdFill_write(param_3, param_4, param_5);
 }
@@ -522,44 +415,43 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
     write_mem(param, param_1, param_2);
-    CmdStrokeRef _1054 = { ref.offset + 4u };
+    CmdStrokeRef _622 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdStrokeRef param_4 = _1054;
+    CmdStrokeRef param_4 = _622;
     CmdStroke param_5 = s;
     CmdStroke_write(param_3, param_4, param_5);
 }
 
-void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth)
+void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
 {
-    uint param = flags;
-    if (fill_mode_from_flags(param) == 0u)
+    if (linewidth < 0.0f)
     {
         if (tile.tile.offset != 0u)
         {
-            CmdFill _1236 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _1236;
-            Alloc param_1 = alloc;
-            CmdRef param_2 = cmd_ref;
-            CmdFill param_3 = cmd_fill;
-            Cmd_Fill_write(param_1, param_2, param_3);
+            CmdFill _807 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _807;
+            Alloc param = alloc;
+            CmdRef param_1 = cmd_ref;
+            CmdFill param_2 = cmd_fill;
+            Cmd_Fill_write(param, param_1, param_2);
             cmd_ref.offset += 12u;
         }
         else
         {
-            Alloc param_4 = alloc;
-            CmdRef param_5 = cmd_ref;
-            Cmd_Solid_write(param_4, param_5);
+            Alloc param_3 = alloc;
+            CmdRef param_4 = cmd_ref;
+            Cmd_Solid_write(param_3, param_4);
             cmd_ref.offset += 4u;
         }
     }
     else
     {
-        CmdStroke _1266 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _1266;
-        Alloc param_6 = alloc;
-        CmdRef param_7 = cmd_ref;
-        CmdStroke param_8 = cmd_stroke;
-        Cmd_Stroke_write(param_6, param_7, param_8);
+        CmdStroke _837 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _837;
+        Alloc param_5 = alloc;
+        CmdRef param_6 = cmd_ref;
+        CmdStroke param_7 = cmd_stroke;
+        Cmd_Stroke_write(param_5, param_6, param_7);
         cmd_ref.offset += 12u;
     }
 }
@@ -579,61 +471,13 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
     write_mem(param, param_1, param_2);
-    CmdColorRef _1080 = { ref.offset + 4u };
+    CmdColorRef _649 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdColorRef param_4 = _1080;
+    CmdColorRef param_4 = _649;
     CmdColor param_5 = s;
     CmdColor_write(param_3, param_4, param_5);
 }
 
-AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13);
-    Alloc param_14 = a;
-    uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15);
-    Alloc param_16 = a;
-    uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17);
-    AnnoLinGradient s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.linewidth = asfloat(raw4);
-    s.index = raw5;
-    s.line_x = asfloat(raw6);
-    s.line_y = asfloat(raw7);
-    s.line_c = asfloat(raw8);
-    return s;
-}
-
-AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoLinGradientRef _733 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoLinGradientRef param_1 = _733;
-    return AnnoLinGradient_read(param, param_1);
-}
-
 void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s)
 {
     uint ix = ref.offset >> uint(2);
@@ -661,53 +505,13 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
     write_mem(param, param_1, param_2);
-    CmdLinGradRef _1098 = { ref.offset + 4u };
+    CmdLinGradRef _668 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdLinGradRef param_4 = _1098;
+    CmdLinGradRef param_4 = _668;
     CmdLinGrad param_5 = s;
     CmdLinGrad_write(param_3, param_4, param_5);
 }
 
-AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13);
-    AnnoImage s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.linewidth = asfloat(raw4);
-    s.index = raw5;
-    s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16);
-    return s;
-}
-
-AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoImageRef _743 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoImageRef param_1 = _743;
-    return AnnoImage_read(param, param_1);
-}
-
 void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s)
 {
     uint ix = ref.offset >> uint(2);
@@ -727,9 +531,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
     write_mem(param, param_1, param_2);
-    CmdImageRef _1116 = { ref.offset + 4u };
+    CmdImageRef _687 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdImageRef param_4 = _1116;
+    CmdImageRef param_4 = _687;
     CmdImage param_5 = s;
     CmdImage_write(param_3, param_4, param_5);
 }
@@ -742,38 +546,6 @@
     write_mem(param, param_1, param_2);
 }
 
-AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    AnnoEndClip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.blend = raw4;
-    return s;
-}
-
-AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoEndClipRef _753 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoEndClipRef param_1 = _753;
-    return AnnoEndClip_read(param, param_1);
-}
-
 void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s)
 {
     uint ix = ref.offset >> uint(2);
@@ -789,9 +561,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
     write_mem(param, param_1, param_2);
-    CmdEndClipRef _1143 = { ref.offset + 4u };
+    CmdEndClipRef _715 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdEndClipRef param_4 = _1143;
+    CmdEndClipRef param_4 = _715;
     CmdEndClip param_5 = s;
     CmdEndClip_write(param_3, param_4, param_5);
 }
@@ -806,25 +578,25 @@
 
 void comp_main()
 {
-    uint width_in_bins = ((_1283.Load(8) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_854.Load(8) + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1283.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_854.Load(0) + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1283.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _1348;
-    _1348.offset = _1283.Load(24);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _919;
+    _919.offset = _854.Load(24);
     Alloc param;
-    param.offset = _1348.offset;
+    param.offset = _919.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1357 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1357;
+    CmdRef _928 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _928;
     uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
@@ -832,59 +604,54 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    bool mem_ok = _308.Load(4) == 0u;
+    uint drawmonoid_start = _854.Load(44) >> uint(2);
+    uint drawtag_start = _854.Load(100) >> uint(2);
+    uint drawdata_start = _854.Load(104) >> uint(2);
+    uint drawinfo_start = _854.Load(68) >> uint(2);
+    bool mem_ok = _242.Load(4) == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1562;
+    uint _1154;
     uint element_ix;
-    AnnotatedRef ref;
     Alloc param_14;
-    Alloc param_16;
     uint tile_count;
-    Alloc param_23;
-    uint _1887;
-    Alloc param_29;
-    Tile tile_1;
-    AnnoColor fill;
-    Alloc param_35;
-    Alloc param_52;
+    uint _1453;
+    float linewidth;
     CmdLinGrad cmd_lin;
-    Alloc param_69;
-    Alloc param_95;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1614;
+        bool _1206;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1412 = th_ix < 256u;
-                bool _1420;
-                if (_1412)
+                bool _1003 = th_ix < 256u;
+                bool _1011;
+                if (_1003)
                 {
-                    _1420 = (partition_ix + th_ix) < n_partitions;
+                    _1011 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1420 = _1412;
+                    _1011 = _1003;
                 }
-                if (_1420)
+                if (_1011)
                 {
-                    uint in_ix = (_1283.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1437;
-                    _1437.offset = _1283.Load(20);
-                    param_3.offset = _1437.offset;
+                    uint in_ix = (_854.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1029;
+                    _1029.offset = _854.Load(20);
+                    param_3.offset = _1029.offset;
                     uint param_4 = in_ix;
                     count = read_mem(param_3, param_4);
-                    Alloc _1448;
-                    _1448.offset = _1283.Load(20);
-                    param_5.offset = _1448.offset;
+                    Alloc _1040;
+                    _1040.offset = _854.Load(20);
+                    param_5.offset = _1040.offset;
                     uint param_6 = in_ix + 1u;
                     uint offset = read_mem(param_5, param_6);
                     uint param_7 = offset;
@@ -930,16 +697,16 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1562 = sh_part_count[part_ix - 1u];
+                    _1154 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1562 = part_start_ix;
+                    _1154 = part_start_ix;
                 }
-                ix -= _1562;
+                ix -= _1154;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1581 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1581;
+                BinInstanceRef _1173 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1173;
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
@@ -949,16 +716,16 @@
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1604 = (wr_ix - rd_ix) < 256u;
-            if (_1604)
+            bool _1196 = (wr_ix - rd_ix) < 256u;
+            if (_1196)
             {
-                _1614 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1614 = _1604;
+                _1206 = _1196;
             }
-            if (_1614)
+            if (_1206)
             {
                 continue;
             }
@@ -971,30 +738,24 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            AnnotatedRef _1635 = { _1283.Load(32) + (element_ix * 40u) };
-            ref = _1635;
-            Alloc _1638;
-            _1638.offset = _1283.Load(32);
-            param_14.offset = _1638.offset;
-            AnnotatedRef param_15 = ref;
-            tag = Annotated_tag(param_14, param_15).tag;
+            tag = _1222.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
-            case 1u:
-            case 3u:
-            case 2u:
-            case 4u:
+            case 68u:
+            case 72u:
+            case 276u:
             case 5u:
+            case 37u:
             {
-                uint drawmonoid_base = (_1283.Load(44) >> uint(2)) + (2u * element_ix);
-                uint path_ix = _308.Load(drawmonoid_base * 4 + 8);
-                PathRef _1667 = { _1283.Load(16) + (path_ix * 12u) };
-                Alloc _1670;
-                _1670.offset = _1283.Load(16);
-                param_16.offset = _1670.offset;
-                PathRef param_17 = _1667;
-                Path path = Path_read(param_16, param_17);
+                uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
+                uint path_ix = _242.Load(drawmonoid_base * 4 + 8);
+                PathRef _1247 = { _854.Load(16) + (path_ix * 12u) };
+                Alloc _1250;
+                _1250.offset = _854.Load(16);
+                param_14.offset = _1250.offset;
+                PathRef param_15 = _1247;
+                Path path = Path_read(param_14, param_15);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -1009,13 +770,13 @@
                 tile_count = uint(x1 - x0) * uint(y1 - y0);
                 uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                 sh_tile_base[th_ix] = base;
-                uint param_18 = path.tiles.offset;
-                uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                bool param_20 = mem_ok;
-                Alloc path_alloc = new_alloc(param_18, param_19, param_20);
-                uint param_21 = th_ix;
-                Alloc param_22 = path_alloc;
-                write_tile_alloc(param_21, param_22);
+                uint param_16 = path.tiles.offset;
+                uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_18 = mem_ok;
+                Alloc path_alloc = new_alloc(param_16, param_17, param_18);
+                uint param_19 = th_ix;
+                Alloc param_20 = path_alloc;
+                write_tile_alloc(param_19, param_20);
                 break;
             }
             default:
@@ -1048,72 +809,57 @@
                     el_ix = probe_1;
                 }
             }
-            AnnotatedRef _1869 = { _1283.Load(32) + (sh_elements[el_ix] * 40u) };
-            AnnotatedRef ref_1 = _1869;
-            Alloc _1874;
-            _1874.offset = _1283.Load(32);
-            param_23.offset = _1874.offset;
-            AnnotatedRef param_24 = ref_1;
-            AnnotatedTag anno_tag = Annotated_tag(param_23, param_24);
-            uint tag_1 = anno_tag.tag;
+            uint tag_1 = _1222.Load((drawtag_start + sh_elements[el_ix]) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1887 = sh_tile_count[el_ix - 1u];
+                _1453 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1887 = 0u;
+                _1453 = 0u;
             }
-            uint seq_ix = ix_1 - _1887;
+            uint seq_ix = ix_1 - _1453;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
             bool include_tile = false;
             if (mem_ok)
             {
-                uint param_25 = el_ix;
-                bool param_26 = mem_ok;
-                TileRef _1939 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1939;
-                Tile tile = Tile_read(param_27, param_28);
-                bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
-                bool _1951 = tile.tile.offset != 0u;
-                bool _1960;
-                if (!_1951)
+                uint param_21 = el_ix;
+                bool param_22 = mem_ok;
+                TileRef _1505 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                Alloc param_23 = read_tile_alloc(param_21, param_22);
+                TileRef param_24 = _1505;
+                Tile tile = Tile_read(param_23, param_24);
+                bool is_clip = (tag_1 & 1u) != 0u;
+                bool is_blend = false;
+                bool _1516 = tile.tile.offset != 0u;
+                bool _1525;
+                if (!_1516)
                 {
-                    _1960 = (tile.backdrop == 0) == is_clip;
+                    _1525 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1960 = _1951;
+                    _1525 = _1516;
                 }
-                bool _1972;
-                if (!_1960)
+                bool _1532;
+                if (!_1525)
                 {
-                    bool _1971;
-                    if (is_clip)
-                    {
-                        _1971 = (anno_tag.flags & 2u) != 0u;
-                    }
-                    else
-                    {
-                        _1971 = is_clip;
-                    }
-                    _1972 = _1971;
+                    _1532 = is_clip && is_blend;
                 }
                 else
                 {
-                    _1972 = _1960;
+                    _1532 = _1525;
                 }
-                include_tile = _1972;
+                include_tile = _1532;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1992;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1992);
+                uint _1552;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1552);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -1137,217 +883,176 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_1 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            AnnotatedRef _2046 = { _1283.Load(32) + (element_ix_1 * 40u) };
-            ref = _2046;
-            Alloc _2050;
-            _2050.offset = _1283.Load(32);
-            param_29.offset = _2050.offset;
-            AnnotatedRef param_30 = ref;
-            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30);
+            uint drawtag = _1222.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
-                switch (tag_2.tag)
+                uint param_25 = element_ref_ix;
+                bool param_26 = mem_ok;
+                TileRef _1629 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                Alloc param_27 = read_tile_alloc(param_25, param_26);
+                TileRef param_28 = _1629;
+                Tile tile_1 = Tile_read(param_27, param_28);
+                uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
+                uint scene_offset = _242.Load((drawmonoid_base_1 + 2u) * 4 + 8);
+                uint info_offset = _242.Load((drawmonoid_base_1 + 3u) * 4 + 8);
+                uint dd = drawdata_start + (scene_offset >> uint(2));
+                uint di = drawinfo_start + (info_offset >> uint(2));
+                switch (drawtag)
                 {
-                    case 1u:
+                    case 68u:
                     {
-                        uint param_31 = element_ref_ix;
-                        bool param_32 = mem_ok;
-                        TileRef _2086 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_33 = read_tile_alloc(param_31, param_32);
-                        TileRef param_34 = _2086;
-                        tile_1 = Tile_read(param_33, param_34);
-                        Alloc _2093;
-                        _2093.offset = _1283.Load(32);
-                        param_35.offset = _2093.offset;
-                        AnnotatedRef param_36 = ref;
-                        fill = Annotated_Color_read(param_35, param_36);
-                        Alloc param_37 = cmd_alloc;
-                        CmdRef param_38 = cmd_ref;
-                        uint param_39 = cmd_limit;
-                        bool _2105 = alloc_cmd(param_37, param_38, param_39);
-                        cmd_alloc = param_37;
-                        cmd_ref = param_38;
-                        cmd_limit = param_39;
-                        if (!_2105)
+                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        Alloc param_29 = cmd_alloc;
+                        CmdRef param_30 = cmd_ref;
+                        uint param_31 = cmd_limit;
+                        bool _1676 = alloc_cmd(param_29, param_30, param_31);
+                        cmd_alloc = param_29;
+                        cmd_ref = param_30;
+                        cmd_limit = param_31;
+                        if (!_1676)
                         {
                             break;
                         }
-                        Alloc param_40 = cmd_alloc;
-                        CmdRef param_41 = cmd_ref;
-                        uint param_42 = tag_2.flags;
-                        Tile param_43 = tile_1;
-                        float param_44 = fill.linewidth;
-                        write_fill(param_40, param_41, param_42, param_43, param_44);
-                        cmd_ref = param_41;
-                        CmdColor _2129 = { fill.rgba_color };
-                        Alloc param_45 = cmd_alloc;
-                        CmdRef param_46 = cmd_ref;
-                        CmdColor param_47 = _2129;
-                        Cmd_Color_write(param_45, param_46, param_47);
+                        Alloc param_32 = cmd_alloc;
+                        CmdRef param_33 = cmd_ref;
+                        Tile param_34 = tile_1;
+                        float param_35 = linewidth;
+                        write_fill(param_32, param_33, param_34, param_35);
+                        cmd_ref = param_33;
+                        uint rgba = _1222.Load(dd * 4 + 0);
+                        CmdColor _1699 = { rgba };
+                        Alloc param_36 = cmd_alloc;
+                        CmdRef param_37 = cmd_ref;
+                        CmdColor param_38 = _1699;
+                        Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
                     }
-                    case 2u:
+                    case 276u:
                     {
-                        uint param_48 = element_ref_ix;
-                        bool param_49 = mem_ok;
-                        TileRef _2158 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_50 = read_tile_alloc(param_48, param_49);
-                        TileRef param_51 = _2158;
-                        tile_1 = Tile_read(param_50, param_51);
-                        Alloc _2165;
-                        _2165.offset = _1283.Load(32);
-                        param_52.offset = _2165.offset;
-                        AnnotatedRef param_53 = ref;
-                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53);
-                        Alloc param_54 = cmd_alloc;
-                        CmdRef param_55 = cmd_ref;
-                        uint param_56 = cmd_limit;
-                        bool _2177 = alloc_cmd(param_54, param_55, param_56);
-                        cmd_alloc = param_54;
-                        cmd_ref = param_55;
-                        cmd_limit = param_56;
-                        if (!_2177)
+                        Alloc param_39 = cmd_alloc;
+                        CmdRef param_40 = cmd_ref;
+                        uint param_41 = cmd_limit;
+                        bool _1717 = alloc_cmd(param_39, param_40, param_41);
+                        cmd_alloc = param_39;
+                        cmd_ref = param_40;
+                        cmd_limit = param_41;
+                        if (!_1717)
                         {
                             break;
                         }
-                        Alloc param_57 = cmd_alloc;
-                        CmdRef param_58 = cmd_ref;
-                        uint param_59 = tag_2.flags;
-                        Tile param_60 = tile_1;
-                        float param_61 = fill.linewidth;
-                        write_fill(param_57, param_58, param_59, param_60, param_61);
-                        cmd_ref = param_58;
-                        cmd_lin.index = lin.index;
-                        cmd_lin.line_x = lin.line_x;
-                        cmd_lin.line_y = lin.line_y;
-                        cmd_lin.line_c = lin.line_c;
-                        Alloc param_62 = cmd_alloc;
-                        CmdRef param_63 = cmd_ref;
-                        CmdLinGrad param_64 = cmd_lin;
-                        Cmd_LinGrad_write(param_62, param_63, param_64);
+                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        Alloc param_42 = cmd_alloc;
+                        CmdRef param_43 = cmd_ref;
+                        Tile param_44 = tile_1;
+                        float param_45 = linewidth;
+                        write_fill(param_42, param_43, param_44, param_45);
+                        cmd_ref = param_43;
+                        cmd_lin.index = _1222.Load(dd * 4 + 0);
+                        cmd_lin.line_x = asfloat(_242.Load((di + 1u) * 4 + 8));
+                        cmd_lin.line_y = asfloat(_242.Load((di + 2u) * 4 + 8));
+                        cmd_lin.line_c = asfloat(_242.Load((di + 3u) * 4 + 8));
+                        Alloc param_46 = cmd_alloc;
+                        CmdRef param_47 = cmd_ref;
+                        CmdLinGrad param_48 = cmd_lin;
+                        Cmd_LinGrad_write(param_46, param_47, param_48);
                         cmd_ref.offset += 20u;
                         break;
                     }
-                    case 3u:
+                    case 72u:
                     {
-                        uint param_65 = element_ref_ix;
-                        bool param_66 = mem_ok;
-                        TileRef _2242 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_67 = read_tile_alloc(param_65, param_66);
-                        TileRef param_68 = _2242;
-                        tile_1 = Tile_read(param_67, param_68);
-                        Alloc _2249;
-                        _2249.offset = _1283.Load(32);
-                        param_69.offset = _2249.offset;
-                        AnnotatedRef param_70 = ref;
-                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70);
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        uint param_73 = cmd_limit;
-                        bool _2261 = alloc_cmd(param_71, param_72, param_73);
-                        cmd_alloc = param_71;
-                        cmd_ref = param_72;
-                        cmd_limit = param_73;
-                        if (!_2261)
+                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        Alloc param_49 = cmd_alloc;
+                        CmdRef param_50 = cmd_ref;
+                        uint param_51 = cmd_limit;
+                        bool _1785 = alloc_cmd(param_49, param_50, param_51);
+                        cmd_alloc = param_49;
+                        cmd_ref = param_50;
+                        cmd_limit = param_51;
+                        if (!_1785)
                         {
                             break;
                         }
-                        Alloc param_74 = cmd_alloc;
-                        CmdRef param_75 = cmd_ref;
-                        uint param_76 = tag_2.flags;
-                        Tile param_77 = tile_1;
-                        float param_78 = fill_img.linewidth;
-                        write_fill(param_74, param_75, param_76, param_77, param_78);
-                        cmd_ref = param_75;
-                        CmdImage _2287 = { fill_img.index, fill_img.offset };
-                        Alloc param_79 = cmd_alloc;
-                        CmdRef param_80 = cmd_ref;
-                        CmdImage param_81 = _2287;
-                        Cmd_Image_write(param_79, param_80, param_81);
+                        Alloc param_52 = cmd_alloc;
+                        CmdRef param_53 = cmd_ref;
+                        Tile param_54 = tile_1;
+                        float param_55 = linewidth;
+                        write_fill(param_52, param_53, param_54, param_55);
+                        cmd_ref = param_53;
+                        uint index = _1222.Load(dd * 4 + 0);
+                        uint raw1 = _1222.Load((dd + 1u) * 4 + 0);
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        CmdImage _1824 = { index, offset_1 };
+                        Alloc param_56 = cmd_alloc;
+                        CmdRef param_57 = cmd_ref;
+                        CmdImage param_58 = _1824;
+                        Cmd_Image_write(param_56, param_57, param_58);
                         cmd_ref.offset += 12u;
                         break;
                     }
-                    case 4u:
+                    case 5u:
                     {
-                        uint param_82 = element_ref_ix;
-                        bool param_83 = mem_ok;
-                        TileRef _2316 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_84 = read_tile_alloc(param_82, param_83);
-                        TileRef param_85 = _2316;
-                        tile_1 = Tile_read(param_84, param_85);
-                        bool _2322 = tile_1.tile.offset == 0u;
-                        bool _2328;
-                        if (_2322)
+                        bool _1838 = tile_1.tile.offset == 0u;
+                        bool _1844;
+                        if (_1838)
                         {
-                            _2328 = tile_1.backdrop == 0;
+                            _1844 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2328 = _2322;
+                            _1844 = _1838;
                         }
-                        if (_2328)
+                        if (_1844)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_86 = cmd_alloc;
-                            CmdRef param_87 = cmd_ref;
-                            uint param_88 = cmd_limit;
-                            bool _2340 = alloc_cmd(param_86, param_87, param_88);
-                            cmd_alloc = param_86;
-                            cmd_ref = param_87;
-                            cmd_limit = param_88;
-                            if (!_2340)
+                            Alloc param_59 = cmd_alloc;
+                            CmdRef param_60 = cmd_ref;
+                            uint param_61 = cmd_limit;
+                            bool _1856 = alloc_cmd(param_59, param_60, param_61);
+                            cmd_alloc = param_59;
+                            cmd_ref = param_60;
+                            cmd_limit = param_61;
+                            if (!_1856)
                             {
                                 break;
                             }
-                            Alloc param_89 = cmd_alloc;
-                            CmdRef param_90 = cmd_ref;
-                            Cmd_BeginClip_write(param_89, param_90);
+                            Alloc param_62 = cmd_alloc;
+                            CmdRef param_63 = cmd_ref;
+                            Cmd_BeginClip_write(param_62, param_63);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
                         break;
                     }
-                    case 5u:
+                    case 37u:
                     {
-                        uint param_91 = element_ref_ix;
-                        bool param_92 = mem_ok;
-                        TileRef _2377 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_93 = read_tile_alloc(param_91, param_92);
-                        TileRef param_94 = _2377;
-                        tile_1 = Tile_read(param_93, param_94);
-                        Alloc _2384;
-                        _2384.offset = _1283.Load(32);
-                        param_95.offset = _2384.offset;
-                        AnnotatedRef param_96 = ref;
-                        AnnoEndClip end_clip = Annotated_EndClip_read(param_95, param_96);
                         clip_depth--;
-                        Alloc param_97 = cmd_alloc;
-                        CmdRef param_98 = cmd_ref;
-                        uint param_99 = cmd_limit;
-                        bool _2398 = alloc_cmd(param_97, param_98, param_99);
-                        cmd_alloc = param_97;
-                        cmd_ref = param_98;
-                        cmd_limit = param_99;
-                        if (!_2398)
+                        Alloc param_64 = cmd_alloc;
+                        CmdRef param_65 = cmd_ref;
+                        uint param_66 = cmd_limit;
+                        bool _1884 = alloc_cmd(param_64, param_65, param_66);
+                        cmd_alloc = param_64;
+                        cmd_ref = param_65;
+                        cmd_limit = param_66;
+                        if (!_1884)
                         {
                             break;
                         }
-                        Alloc param_100 = cmd_alloc;
-                        CmdRef param_101 = cmd_ref;
-                        uint param_102 = 0u;
-                        Tile param_103 = tile_1;
-                        float param_104 = 0.0f;
-                        write_fill(param_100, param_101, param_102, param_103, param_104);
-                        cmd_ref = param_101;
-                        CmdEndClip _2419 = { end_clip.blend };
-                        Alloc param_105 = cmd_alloc;
-                        CmdRef param_106 = cmd_ref;
-                        CmdEndClip param_107 = _2419;
-                        Cmd_EndClip_write(param_105, param_106, param_107);
+                        Alloc param_67 = cmd_alloc;
+                        CmdRef param_68 = cmd_ref;
+                        Tile param_69 = tile_1;
+                        float param_70 = -1.0f;
+                        write_fill(param_67, param_68, param_69, param_70);
+                        cmd_ref = param_68;
+                        uint blend = _1222.Load(dd * 4 + 0);
+                        CmdEndClip _1907 = { blend };
+                        Alloc param_71 = cmd_alloc;
+                        CmdRef param_72 = cmd_ref;
+                        CmdEndClip param_73 = _1907;
+                        Cmd_EndClip_write(param_71, param_72, param_73);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1355,14 +1060,14 @@
             }
             else
             {
-                switch (tag_2.tag)
+                switch (drawtag)
                 {
-                    case 4u:
+                    case 5u:
                     {
                         clip_depth++;
                         break;
                     }
-                    case 5u:
+                    case 37u:
                     {
                         if (clip_depth == clip_zero_depth)
                         {
@@ -1381,21 +1086,21 @@
             break;
         }
     }
-    bool _2467 = (bin_tile_x + tile_x) < _1283.Load(8);
-    bool _2476;
-    if (_2467)
+    bool _1954 = (bin_tile_x + tile_x) < _854.Load(8);
+    bool _1963;
+    if (_1954)
     {
-        _2476 = (bin_tile_y + tile_y) < _1283.Load(12);
+        _1963 = (bin_tile_y + tile_y) < _854.Load(12);
     }
     else
     {
-        _2476 = _2467;
+        _1963 = _1954;
     }
-    if (_2476)
+    if (_1963)
     {
-        Alloc param_108 = cmd_alloc;
-        CmdRef param_109 = cmd_ref;
-        Cmd_End_write(param_108, param_109);
+        Alloc param_74 = cmd_alloc;
+        CmdRef param_75 = cmd_ref;
+        Cmd_End_write(param_74, param_75);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 1422ff1..96f1026 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -25,68 +25,6 @@
     bool failed;
 };
 
-struct AnnoImageRef
-{
-    uint offset;
-};
-
-struct AnnoImage
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    int2 offset;
-};
-
-struct AnnoColorRef
-{
-    uint offset;
-};
-
-struct AnnoColor
-{
-    float4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-struct AnnoLinGradientRef
-{
-    uint offset;
-};
-
-struct AnnoLinGradient
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    float line_x;
-    float line_y;
-    float line_c;
-};
-
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -229,12 +167,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -242,6 +182,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -249,6 +191,11 @@
     Config conf;
 };
 
+struct SceneBuf
+{
+    uint scene[1];
+};
+
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
@@ -264,7 +211,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_308, constant uint& v_308BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -272,7 +219,7 @@
     {
         return 0u;
     }
-    uint v = v_308.memory[offset];
+    uint v = v_242.memory[offset];
     return v;
 }
 
@@ -291,39 +238,30 @@
 }
 
 static inline __attribute__((always_inline))
-BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
+    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_308, v_308BufferSize);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
+    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_242, v_242BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_242, v_242BufferSize);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -336,24 +274,24 @@
 }
 
 static inline __attribute__((always_inline))
-Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_308, constant uint& v_308BufferSize)
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint param = 0u;
-    uint param_1 = uint(int((v_308BufferSize - 8) / 4) * 4);
+    uint param_1 = uint(int((v_242BufferSize - 8) / 4) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
 
 static inline __attribute__((always_inline))
-Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
+    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_242, v_242BufferSize);
     Tile s;
     s.tile = TileSegRef{ raw0 };
     s.backdrop = int(raw1);
@@ -361,63 +299,26 @@
 }
 
 static inline __attribute__((always_inline))
-AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_242, constant uint& v_242BufferSize)
 {
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
-    AnnoColor s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.linewidth = as_type<float>(raw4);
-    s.rgba_color = raw5;
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u };
-    return AnnoColor_read(param, param_1, v_308, v_308BufferSize);
-}
-
-static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    uint _314 = atomic_fetch_add_explicit((device atomic_uint*)&v_308.mem_offset, size, memory_order_relaxed);
-    uint offset = _314;
+    uint _248 = atomic_fetch_add_explicit((device atomic_uint*)&v_242.mem_offset, size, memory_order_relaxed);
+    uint offset = _248;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_308BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_242BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _343 = atomic_fetch_max_explicit((device atomic_uint*)&v_308.mem_error, 1u, memory_order_relaxed);
+        uint _277 = atomic_fetch_max_explicit((device atomic_uint*)&v_242.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_308, constant uint& v_308BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -425,42 +326,42 @@
     {
         return;
     }
-    v_308.memory[offset] = val;
+    v_242.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.new_ref;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
     CmdJump param_5 = s;
-    CmdJump_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdJump_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_308, constant uint& v_308BufferSize)
+bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_242, constant uint& v_242BufferSize)
 {
     if (cmd_ref.offset < cmd_limit)
     {
         return true;
     }
     uint param = 1024u;
-    MallocResult _1190 = malloc(param, v_308, v_308BufferSize);
-    MallocResult new_cmd = _1190;
+    MallocResult _762 = malloc(param, v_242, v_242BufferSize);
+    MallocResult new_cmd = _762;
     if (new_cmd.failed)
     {
         return false;
@@ -469,7 +370,7 @@
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3, v_308, v_308BufferSize);
+    Cmd_Jump_write(param_1, param_2, param_3, v_242, v_242BufferSize);
     cmd_alloc = new_cmd.alloc;
     cmd_ref = CmdRef{ cmd_alloc.offset };
     cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
@@ -477,358 +378,228 @@
 }
 
 static inline __attribute__((always_inline))
-uint fill_mode_from_flags(thread const uint& flags)
-{
-    return flags & 1u;
-}
-
-static inline __attribute__((always_inline))
-void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = uint(s.backdrop);
-    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
+    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
     CmdFill param_5 = s;
-    CmdFill_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdFill_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 3u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.half_width);
-    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
+    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
     CmdStroke param_5 = s;
-    CmdStroke_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdStroke_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_308, constant uint& v_308BufferSize)
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_242, constant uint& v_242BufferSize)
 {
-    uint param = flags;
-    if (fill_mode_from_flags(param) == 0u)
+    if (linewidth < 0.0)
     {
         if (tile.tile.offset != 0u)
         {
             CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop };
-            Alloc param_1 = alloc;
-            CmdRef param_2 = cmd_ref;
-            CmdFill param_3 = cmd_fill;
-            Cmd_Fill_write(param_1, param_2, param_3, v_308, v_308BufferSize);
+            Alloc param = alloc;
+            CmdRef param_1 = cmd_ref;
+            CmdFill param_2 = cmd_fill;
+            Cmd_Fill_write(param, param_1, param_2, v_242, v_242BufferSize);
             cmd_ref.offset += 12u;
         }
         else
         {
-            Alloc param_4 = alloc;
-            CmdRef param_5 = cmd_ref;
-            Cmd_Solid_write(param_4, param_5, v_308, v_308BufferSize);
+            Alloc param_3 = alloc;
+            CmdRef param_4 = cmd_ref;
+            Cmd_Solid_write(param_3, param_4, v_242, v_242BufferSize);
             cmd_ref.offset += 4u;
         }
     }
     else
     {
         CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth };
-        Alloc param_6 = alloc;
-        CmdRef param_7 = cmd_ref;
-        CmdStroke param_8 = cmd_stroke;
-        Cmd_Stroke_write(param_6, param_7, param_8, v_308, v_308BufferSize);
+        Alloc param_5 = alloc;
+        CmdRef param_6 = cmd_ref;
+        CmdStroke param_7 = cmd_stroke;
+        Cmd_Stroke_write(param_5, param_6, param_7, v_242, v_242BufferSize);
         cmd_ref.offset += 12u;
     }
 }
 
 static inline __attribute__((always_inline))
-void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.rgba_color;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
     CmdColor param_5 = s;
-    CmdColor_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdColor_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
-    Alloc param_12 = a;
-    uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_308, v_308BufferSize);
-    Alloc param_14 = a;
-    uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_308, v_308BufferSize);
-    Alloc param_16 = a;
-    uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_308, v_308BufferSize);
-    AnnoLinGradient s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.linewidth = as_type<float>(raw4);
-    s.index = raw5;
-    s.line_x = as_type<float>(raw6);
-    s.line_y = as_type<float>(raw7);
-    s.line_c = as_type<float>(raw8);
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u };
-    return AnnoLinGradient_read(param, param_1, v_308, v_308BufferSize);
-}
-
-static inline __attribute__((always_inline))
-void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.line_x);
-    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
+    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.line_y);
-    write_mem(param_6, param_7, param_8, v_308, v_308BufferSize);
+    write_mem(param_6, param_7, param_8, v_242, v_242BufferSize);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.line_c);
-    write_mem(param_9, param_10, param_11, v_308, v_308BufferSize);
+    write_mem(param_9, param_10, param_11, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
     CmdLinGrad param_5 = s;
-    CmdLinGrad_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdLinGrad_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
-    Alloc param_12 = a;
-    uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_308, v_308BufferSize);
-    AnnoImage s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.linewidth = as_type<float>(raw4);
-    s.index = raw5;
-    s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16);
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u };
-    return AnnoImage_read(param, param_1, v_308, v_308BufferSize);
-}
-
-static inline __attribute__((always_inline))
-void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
+    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
     CmdImage param_5 = s;
-    CmdImage_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdImage_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
-    AnnoEndClip s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.blend = raw4;
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
-    return AnnoEndClip_read(param, param_1, v_308, v_308BufferSize);
-}
-
-static inline __attribute__((always_inline))
-void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.blend;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u };
     CmdEndClip param_5 = s;
-    CmdEndClip_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdEndClip_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 0u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_308 [[buffer(0)]], const device ConfigBuf& _1283 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_242 [[buffer(0)]], const device ConfigBuf& _854 [[buffer(1)]], const device SceneBuf& _1222 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -840,19 +611,19 @@
     threadgroup uint sh_tile_y0[256];
     threadgroup uint sh_tile_base[256];
     threadgroup uint sh_tile_count[256];
-    constant uint& v_308BufferSize = spvBufferSizeConstants[0];
-    uint width_in_bins = ((_1283.conf.width_in_tiles + 16u) - 1u) / 16u;
+    constant uint& v_242BufferSize = spvBufferSizeConstants[0];
+    uint width_in_bins = ((_854.conf.width_in_tiles + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1283.conf.n_elements + 256u) - 1u) / 256u;
+    uint n_partitions = ((_854.conf.n_elements + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1283.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.conf.width_in_tiles) + bin_tile_x) + tile_x;
     Alloc param;
-    param.offset = _1283.conf.ptcl_alloc.offset;
+    param.offset = _854.conf.ptcl_alloc.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -864,57 +635,52 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    bool mem_ok = v_308.mem_error == 0u;
+    uint drawmonoid_start = _854.conf.drawmonoid_alloc.offset >> uint(2);
+    uint drawtag_start = _854.conf.drawtag_offset >> uint(2);
+    uint drawdata_start = _854.conf.drawdata_offset >> uint(2);
+    uint drawinfo_start = _854.conf.drawinfo_alloc.offset >> uint(2);
+    bool mem_ok = v_242.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1562;
+    uint _1154;
     uint element_ix;
-    AnnotatedRef ref;
     Alloc param_14;
-    Alloc param_16;
     uint tile_count;
-    Alloc param_23;
-    uint _1887;
-    Alloc param_29;
-    Tile tile_1;
-    AnnoColor fill;
-    Alloc param_35;
-    Alloc param_52;
+    uint _1453;
+    float linewidth;
     CmdLinGrad cmd_lin;
-    Alloc param_69;
-    Alloc param_95;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1614;
+        bool _1206;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1412 = th_ix < 256u;
-                bool _1420;
-                if (_1412)
+                bool _1003 = th_ix < 256u;
+                bool _1011;
+                if (_1003)
                 {
-                    _1420 = (partition_ix + th_ix) < n_partitions;
+                    _1011 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1420 = _1412;
+                    _1011 = _1003;
                 }
-                if (_1420)
+                if (_1011)
                 {
-                    uint in_ix = (_1283.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    param_3.offset = _1283.conf.bin_alloc.offset;
+                    uint in_ix = (_854.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_3.offset = _854.conf.bin_alloc.offset;
                     uint param_4 = in_ix;
-                    count = read_mem(param_3, param_4, v_308, v_308BufferSize);
-                    param_5.offset = _1283.conf.bin_alloc.offset;
+                    count = read_mem(param_3, param_4, v_242, v_242BufferSize);
+                    param_5.offset = _854.conf.bin_alloc.offset;
                     uint param_6 = in_ix + 1u;
-                    uint offset = read_mem(param_5, param_6, v_308, v_308BufferSize);
+                    uint offset = read_mem(param_5, param_6, v_242, v_242BufferSize);
                     uint param_7 = offset;
                     uint param_8 = count * 4u;
                     bool param_9 = mem_ok;
@@ -958,34 +724,34 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1562 = sh_part_count[part_ix - 1u];
+                    _1154 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1562 = part_start_ix;
+                    _1154 = part_start_ix;
                 }
-                ix -= _1562;
+                ix -= _1154;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
                 BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
-                BinInstance inst = BinInstance_read(param_12, param_13, v_308, v_308BufferSize);
+                BinInstance inst = BinInstance_read(param_12, param_13, v_242, v_242BufferSize);
                 sh_elements[th_ix] = inst.element_ix;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1604 = (wr_ix - rd_ix) < 256u;
-            if (_1604)
+            bool _1196 = (wr_ix - rd_ix) < 256u;
+            if (_1196)
             {
-                _1614 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1614 = _1604;
+                _1206 = _1196;
             }
-            if (_1614)
+            if (_1206)
             {
                 continue;
             }
@@ -998,24 +764,21 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef{ _1283.conf.anno_alloc.offset + (element_ix * 40u) };
-            param_14.offset = _1283.conf.anno_alloc.offset;
-            AnnotatedRef param_15 = ref;
-            tag = Annotated_tag(param_14, param_15, v_308, v_308BufferSize).tag;
+            tag = _1222.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
-            case 1u:
-            case 3u:
-            case 2u:
-            case 4u:
+            case 68u:
+            case 72u:
+            case 276u:
             case 5u:
+            case 37u:
             {
-                uint drawmonoid_base = (_1283.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
-                uint path_ix = v_308.memory[drawmonoid_base];
-                param_16.offset = _1283.conf.tile_alloc.offset;
-                PathRef param_17 = PathRef{ _1283.conf.tile_alloc.offset + (path_ix * 12u) };
-                Path path = Path_read(param_16, param_17, v_308, v_308BufferSize);
+                uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
+                uint path_ix = v_242.memory[drawmonoid_base];
+                param_14.offset = _854.conf.tile_alloc.offset;
+                PathRef param_15 = PathRef{ _854.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_14, param_15, v_242, v_242BufferSize);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -1030,13 +793,13 @@
                 tile_count = uint(x1 - x0) * uint(y1 - y0);
                 uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                 sh_tile_base[th_ix] = base;
-                uint param_18 = path.tiles.offset;
-                uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                bool param_20 = mem_ok;
-                Alloc path_alloc = new_alloc(param_18, param_19, param_20);
-                uint param_21 = th_ix;
-                Alloc param_22 = path_alloc;
-                write_tile_alloc(param_21, param_22);
+                uint param_16 = path.tiles.offset;
+                uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_18 = mem_ok;
+                Alloc path_alloc = new_alloc(param_16, param_17, param_18);
+                uint param_19 = th_ix;
+                Alloc param_20 = path_alloc;
+                write_tile_alloc(param_19, param_20);
                 break;
             }
             default:
@@ -1069,67 +832,55 @@
                     el_ix = probe_1;
                 }
             }
-            AnnotatedRef ref_1 = AnnotatedRef{ _1283.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) };
-            param_23.offset = _1283.conf.anno_alloc.offset;
-            AnnotatedRef param_24 = ref_1;
-            AnnotatedTag anno_tag = Annotated_tag(param_23, param_24, v_308, v_308BufferSize);
-            uint tag_1 = anno_tag.tag;
+            uint tag_1 = _1222.scene[drawtag_start + sh_elements[el_ix]];
             if (el_ix > 0u)
             {
-                _1887 = sh_tile_count[el_ix - 1u];
+                _1453 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1887 = 0u;
+                _1453 = 0u;
             }
-            uint seq_ix = ix_1 - _1887;
+            uint seq_ix = ix_1 - _1453;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
             bool include_tile = false;
             if (mem_ok)
             {
-                uint param_25 = el_ix;
-                bool param_26 = mem_ok;
-                Alloc param_27 = read_tile_alloc(param_25, param_26, v_308, v_308BufferSize);
-                TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Tile tile = Tile_read(param_27, param_28, v_308, v_308BufferSize);
-                bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
-                bool _1951 = tile.tile.offset != 0u;
-                bool _1960;
-                if (!_1951)
+                uint param_21 = el_ix;
+                bool param_22 = mem_ok;
+                Alloc param_23 = read_tile_alloc(param_21, param_22, v_242, v_242BufferSize);
+                TileRef param_24 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                Tile tile = Tile_read(param_23, param_24, v_242, v_242BufferSize);
+                bool is_clip = (tag_1 & 1u) != 0u;
+                bool is_blend = false;
+                bool _1516 = tile.tile.offset != 0u;
+                bool _1525;
+                if (!_1516)
                 {
-                    _1960 = (tile.backdrop == 0) == is_clip;
+                    _1525 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1960 = _1951;
+                    _1525 = _1516;
                 }
-                bool _1972;
-                if (!_1960)
+                bool _1532;
+                if (!_1525)
                 {
-                    bool _1971;
-                    if (is_clip)
-                    {
-                        _1971 = (anno_tag.flags & 2u) != 0u;
-                    }
-                    else
-                    {
-                        _1971 = is_clip;
-                    }
-                    _1972 = _1971;
+                    _1532 = is_clip && is_blend;
                 }
                 else
                 {
-                    _1972 = _1960;
+                    _1532 = _1525;
                 }
-                include_tile = _1972;
+                include_tile = _1532;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1992 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1552 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -1153,198 +904,172 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_1 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            ref = AnnotatedRef{ _1283.conf.anno_alloc.offset + (element_ix_1 * 40u) };
-            param_29.offset = _1283.conf.anno_alloc.offset;
-            AnnotatedRef param_30 = ref;
-            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_308, v_308BufferSize);
+            uint drawtag = _1222.scene[drawtag_start + element_ix_1];
             if (clip_zero_depth == 0u)
             {
-                switch (tag_2.tag)
+                uint param_25 = element_ref_ix;
+                bool param_26 = mem_ok;
+                Alloc param_27 = read_tile_alloc(param_25, param_26, v_242, v_242BufferSize);
+                TileRef param_28 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                Tile tile_1 = Tile_read(param_27, param_28, v_242, v_242BufferSize);
+                uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
+                uint scene_offset = v_242.memory[drawmonoid_base_1 + 2u];
+                uint info_offset = v_242.memory[drawmonoid_base_1 + 3u];
+                uint dd = drawdata_start + (scene_offset >> uint(2));
+                uint di = drawinfo_start + (info_offset >> uint(2));
+                switch (drawtag)
                 {
-                    case 1u:
+                    case 68u:
                     {
-                        uint param_31 = element_ref_ix;
-                        bool param_32 = mem_ok;
-                        Alloc param_33 = read_tile_alloc(param_31, param_32, v_308, v_308BufferSize);
-                        TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_33, param_34, v_308, v_308BufferSize);
-                        param_35.offset = _1283.conf.anno_alloc.offset;
-                        AnnotatedRef param_36 = ref;
-                        fill = Annotated_Color_read(param_35, param_36, v_308, v_308BufferSize);
-                        Alloc param_37 = cmd_alloc;
-                        CmdRef param_38 = cmd_ref;
-                        uint param_39 = cmd_limit;
-                        bool _2105 = alloc_cmd(param_37, param_38, param_39, v_308, v_308BufferSize);
-                        cmd_alloc = param_37;
-                        cmd_ref = param_38;
-                        cmd_limit = param_39;
-                        if (!_2105)
+                        linewidth = as_type<float>(v_242.memory[di]);
+                        Alloc param_29 = cmd_alloc;
+                        CmdRef param_30 = cmd_ref;
+                        uint param_31 = cmd_limit;
+                        bool _1676 = alloc_cmd(param_29, param_30, param_31, v_242, v_242BufferSize);
+                        cmd_alloc = param_29;
+                        cmd_ref = param_30;
+                        cmd_limit = param_31;
+                        if (!_1676)
                         {
                             break;
                         }
-                        Alloc param_40 = cmd_alloc;
-                        CmdRef param_41 = cmd_ref;
-                        uint param_42 = tag_2.flags;
-                        Tile param_43 = tile_1;
-                        float param_44 = fill.linewidth;
-                        write_fill(param_40, param_41, param_42, param_43, param_44, v_308, v_308BufferSize);
-                        cmd_ref = param_41;
-                        Alloc param_45 = cmd_alloc;
-                        CmdRef param_46 = cmd_ref;
-                        CmdColor param_47 = CmdColor{ fill.rgba_color };
-                        Cmd_Color_write(param_45, param_46, param_47, v_308, v_308BufferSize);
+                        Alloc param_32 = cmd_alloc;
+                        CmdRef param_33 = cmd_ref;
+                        Tile param_34 = tile_1;
+                        float param_35 = linewidth;
+                        write_fill(param_32, param_33, param_34, param_35, v_242, v_242BufferSize);
+                        cmd_ref = param_33;
+                        uint rgba = _1222.scene[dd];
+                        Alloc param_36 = cmd_alloc;
+                        CmdRef param_37 = cmd_ref;
+                        CmdColor param_38 = CmdColor{ rgba };
+                        Cmd_Color_write(param_36, param_37, param_38, v_242, v_242BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
-                    case 2u:
+                    case 276u:
                     {
-                        uint param_48 = element_ref_ix;
-                        bool param_49 = mem_ok;
-                        Alloc param_50 = read_tile_alloc(param_48, param_49, v_308, v_308BufferSize);
-                        TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_50, param_51, v_308, v_308BufferSize);
-                        param_52.offset = _1283.conf.anno_alloc.offset;
-                        AnnotatedRef param_53 = ref;
-                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_308, v_308BufferSize);
-                        Alloc param_54 = cmd_alloc;
-                        CmdRef param_55 = cmd_ref;
-                        uint param_56 = cmd_limit;
-                        bool _2177 = alloc_cmd(param_54, param_55, param_56, v_308, v_308BufferSize);
-                        cmd_alloc = param_54;
-                        cmd_ref = param_55;
-                        cmd_limit = param_56;
-                        if (!_2177)
+                        Alloc param_39 = cmd_alloc;
+                        CmdRef param_40 = cmd_ref;
+                        uint param_41 = cmd_limit;
+                        bool _1717 = alloc_cmd(param_39, param_40, param_41, v_242, v_242BufferSize);
+                        cmd_alloc = param_39;
+                        cmd_ref = param_40;
+                        cmd_limit = param_41;
+                        if (!_1717)
                         {
                             break;
                         }
-                        Alloc param_57 = cmd_alloc;
-                        CmdRef param_58 = cmd_ref;
-                        uint param_59 = tag_2.flags;
-                        Tile param_60 = tile_1;
-                        float param_61 = fill.linewidth;
-                        write_fill(param_57, param_58, param_59, param_60, param_61, v_308, v_308BufferSize);
-                        cmd_ref = param_58;
-                        cmd_lin.index = lin.index;
-                        cmd_lin.line_x = lin.line_x;
-                        cmd_lin.line_y = lin.line_y;
-                        cmd_lin.line_c = lin.line_c;
-                        Alloc param_62 = cmd_alloc;
-                        CmdRef param_63 = cmd_ref;
-                        CmdLinGrad param_64 = cmd_lin;
-                        Cmd_LinGrad_write(param_62, param_63, param_64, v_308, v_308BufferSize);
+                        linewidth = as_type<float>(v_242.memory[di]);
+                        Alloc param_42 = cmd_alloc;
+                        CmdRef param_43 = cmd_ref;
+                        Tile param_44 = tile_1;
+                        float param_45 = linewidth;
+                        write_fill(param_42, param_43, param_44, param_45, v_242, v_242BufferSize);
+                        cmd_ref = param_43;
+                        cmd_lin.index = _1222.scene[dd];
+                        cmd_lin.line_x = as_type<float>(v_242.memory[di + 1u]);
+                        cmd_lin.line_y = as_type<float>(v_242.memory[di + 2u]);
+                        cmd_lin.line_c = as_type<float>(v_242.memory[di + 3u]);
+                        Alloc param_46 = cmd_alloc;
+                        CmdRef param_47 = cmd_ref;
+                        CmdLinGrad param_48 = cmd_lin;
+                        Cmd_LinGrad_write(param_46, param_47, param_48, v_242, v_242BufferSize);
                         cmd_ref.offset += 20u;
                         break;
                     }
-                    case 3u:
+                    case 72u:
                     {
-                        uint param_65 = element_ref_ix;
-                        bool param_66 = mem_ok;
-                        Alloc param_67 = read_tile_alloc(param_65, param_66, v_308, v_308BufferSize);
-                        TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_67, param_68, v_308, v_308BufferSize);
-                        param_69.offset = _1283.conf.anno_alloc.offset;
-                        AnnotatedRef param_70 = ref;
-                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_308, v_308BufferSize);
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        uint param_73 = cmd_limit;
-                        bool _2261 = alloc_cmd(param_71, param_72, param_73, v_308, v_308BufferSize);
-                        cmd_alloc = param_71;
-                        cmd_ref = param_72;
-                        cmd_limit = param_73;
-                        if (!_2261)
+                        linewidth = as_type<float>(v_242.memory[di]);
+                        Alloc param_49 = cmd_alloc;
+                        CmdRef param_50 = cmd_ref;
+                        uint param_51 = cmd_limit;
+                        bool _1785 = alloc_cmd(param_49, param_50, param_51, v_242, v_242BufferSize);
+                        cmd_alloc = param_49;
+                        cmd_ref = param_50;
+                        cmd_limit = param_51;
+                        if (!_1785)
                         {
                             break;
                         }
-                        Alloc param_74 = cmd_alloc;
-                        CmdRef param_75 = cmd_ref;
-                        uint param_76 = tag_2.flags;
-                        Tile param_77 = tile_1;
-                        float param_78 = fill_img.linewidth;
-                        write_fill(param_74, param_75, param_76, param_77, param_78, v_308, v_308BufferSize);
-                        cmd_ref = param_75;
-                        Alloc param_79 = cmd_alloc;
-                        CmdRef param_80 = cmd_ref;
-                        CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset };
-                        Cmd_Image_write(param_79, param_80, param_81, v_308, v_308BufferSize);
+                        Alloc param_52 = cmd_alloc;
+                        CmdRef param_53 = cmd_ref;
+                        Tile param_54 = tile_1;
+                        float param_55 = linewidth;
+                        write_fill(param_52, param_53, param_54, param_55, v_242, v_242BufferSize);
+                        cmd_ref = param_53;
+                        uint index = _1222.scene[dd];
+                        uint raw1 = _1222.scene[dd + 1u];
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        Alloc param_56 = cmd_alloc;
+                        CmdRef param_57 = cmd_ref;
+                        CmdImage param_58 = CmdImage{ index, offset_1 };
+                        Cmd_Image_write(param_56, param_57, param_58, v_242, v_242BufferSize);
                         cmd_ref.offset += 12u;
                         break;
                     }
-                    case 4u:
+                    case 5u:
                     {
-                        uint param_82 = element_ref_ix;
-                        bool param_83 = mem_ok;
-                        Alloc param_84 = read_tile_alloc(param_82, param_83, v_308, v_308BufferSize);
-                        TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_84, param_85, v_308, v_308BufferSize);
-                        bool _2322 = tile_1.tile.offset == 0u;
-                        bool _2328;
-                        if (_2322)
+                        bool _1838 = tile_1.tile.offset == 0u;
+                        bool _1844;
+                        if (_1838)
                         {
-                            _2328 = tile_1.backdrop == 0;
+                            _1844 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2328 = _2322;
+                            _1844 = _1838;
                         }
-                        if (_2328)
+                        if (_1844)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_86 = cmd_alloc;
-                            CmdRef param_87 = cmd_ref;
-                            uint param_88 = cmd_limit;
-                            bool _2340 = alloc_cmd(param_86, param_87, param_88, v_308, v_308BufferSize);
-                            cmd_alloc = param_86;
-                            cmd_ref = param_87;
-                            cmd_limit = param_88;
-                            if (!_2340)
+                            Alloc param_59 = cmd_alloc;
+                            CmdRef param_60 = cmd_ref;
+                            uint param_61 = cmd_limit;
+                            bool _1856 = alloc_cmd(param_59, param_60, param_61, v_242, v_242BufferSize);
+                            cmd_alloc = param_59;
+                            cmd_ref = param_60;
+                            cmd_limit = param_61;
+                            if (!_1856)
                             {
                                 break;
                             }
-                            Alloc param_89 = cmd_alloc;
-                            CmdRef param_90 = cmd_ref;
-                            Cmd_BeginClip_write(param_89, param_90, v_308, v_308BufferSize);
+                            Alloc param_62 = cmd_alloc;
+                            CmdRef param_63 = cmd_ref;
+                            Cmd_BeginClip_write(param_62, param_63, v_242, v_242BufferSize);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
                         break;
                     }
-                    case 5u:
+                    case 37u:
                     {
-                        uint param_91 = element_ref_ix;
-                        bool param_92 = mem_ok;
-                        Alloc param_93 = read_tile_alloc(param_91, param_92, v_308, v_308BufferSize);
-                        TileRef param_94 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_93, param_94, v_308, v_308BufferSize);
-                        param_95.offset = _1283.conf.anno_alloc.offset;
-                        AnnotatedRef param_96 = ref;
-                        AnnoEndClip end_clip = Annotated_EndClip_read(param_95, param_96, v_308, v_308BufferSize);
                         clip_depth--;
-                        Alloc param_97 = cmd_alloc;
-                        CmdRef param_98 = cmd_ref;
-                        uint param_99 = cmd_limit;
-                        bool _2398 = alloc_cmd(param_97, param_98, param_99, v_308, v_308BufferSize);
-                        cmd_alloc = param_97;
-                        cmd_ref = param_98;
-                        cmd_limit = param_99;
-                        if (!_2398)
+                        Alloc param_64 = cmd_alloc;
+                        CmdRef param_65 = cmd_ref;
+                        uint param_66 = cmd_limit;
+                        bool _1884 = alloc_cmd(param_64, param_65, param_66, v_242, v_242BufferSize);
+                        cmd_alloc = param_64;
+                        cmd_ref = param_65;
+                        cmd_limit = param_66;
+                        if (!_1884)
                         {
                             break;
                         }
-                        Alloc param_100 = cmd_alloc;
-                        CmdRef param_101 = cmd_ref;
-                        uint param_102 = 0u;
-                        Tile param_103 = tile_1;
-                        float param_104 = 0.0;
-                        write_fill(param_100, param_101, param_102, param_103, param_104, v_308, v_308BufferSize);
-                        cmd_ref = param_101;
-                        Alloc param_105 = cmd_alloc;
-                        CmdRef param_106 = cmd_ref;
-                        CmdEndClip param_107 = CmdEndClip{ end_clip.blend };
-                        Cmd_EndClip_write(param_105, param_106, param_107, v_308, v_308BufferSize);
+                        Alloc param_67 = cmd_alloc;
+                        CmdRef param_68 = cmd_ref;
+                        Tile param_69 = tile_1;
+                        float param_70 = -1.0;
+                        write_fill(param_67, param_68, param_69, param_70, v_242, v_242BufferSize);
+                        cmd_ref = param_68;
+                        uint blend = _1222.scene[dd];
+                        Alloc param_71 = cmd_alloc;
+                        CmdRef param_72 = cmd_ref;
+                        CmdEndClip param_73 = CmdEndClip{ blend };
+                        Cmd_EndClip_write(param_71, param_72, param_73, v_242, v_242BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1352,14 +1077,14 @@
             }
             else
             {
-                switch (tag_2.tag)
+                switch (drawtag)
                 {
-                    case 4u:
+                    case 5u:
                     {
                         clip_depth++;
                         break;
                     }
-                    case 5u:
+                    case 37u:
                     {
                         if (clip_depth == clip_zero_depth)
                         {
@@ -1378,21 +1103,21 @@
             break;
         }
     }
-    bool _2467 = (bin_tile_x + tile_x) < _1283.conf.width_in_tiles;
-    bool _2476;
-    if (_2467)
+    bool _1954 = (bin_tile_x + tile_x) < _854.conf.width_in_tiles;
+    bool _1963;
+    if (_1954)
     {
-        _2476 = (bin_tile_y + tile_y) < _1283.conf.height_in_tiles;
+        _1963 = (bin_tile_y + tile_y) < _854.conf.height_in_tiles;
     }
     else
     {
-        _2476 = _2467;
+        _1963 = _1954;
     }
-    if (_2476)
+    if (_1963)
     {
-        Alloc param_108 = cmd_alloc;
-        CmdRef param_109 = cmd_ref;
-        Cmd_End_write(param_108, param_109, v_308, v_308BufferSize);
+        Alloc param_74 = cmd_alloc;
+        CmdRef param_75 = cmd_ref;
+        Cmd_End_write(param_74, param_75, v_242, v_242BufferSize);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index 1fef2d7..bc27230 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index d1567c9..6ce3b8c 100644
--- a/piet-gpu/shader/gen/draw_leaf.dxil
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
index 1f2f78b..f812f52 100644
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -1,133 +1,12 @@
-struct Alloc
-{
-    uint offset;
-};
-
-struct ElementRef
-{
-    uint offset;
-};
-
-struct FillColorRef
-{
-    uint offset;
-};
-
-struct FillColor
-{
-    uint rgba_color;
-};
-
-struct FillLinGradientRef
-{
-    uint offset;
-};
-
-struct FillLinGradient
-{
-    uint index;
-    float2 p0;
-    float2 p1;
-};
-
-struct FillImageRef
-{
-    uint offset;
-};
-
-struct FillImage
-{
-    uint index;
-    int2 offset;
-};
-
-struct ClipRef
-{
-    uint offset;
-};
-
-struct Clip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct ElementTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct DrawMonoid
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
-struct AnnoImageRef
-{
-    uint offset;
-};
-
-struct AnnoImage
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    int2 offset;
-};
-
-struct AnnoColorRef
-{
-    uint offset;
-};
-
-struct AnnoColor
-{
-    float4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-struct AnnoLinGradientRef
-{
-    uint offset;
-};
-
-struct AnnoLinGradient
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    float line_x;
-    float line_y;
-    float line_c;
-};
-
-struct AnnoBeginClipRef
-{
-    uint offset;
-};
-
-struct AnnoBeginClip
-{
-    float4 bbox;
-    float linewidth;
-    uint blend;
-};
-
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
+struct Alloc
 {
     uint offset;
 };
@@ -144,12 +23,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -157,18 +38,18 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-static const DrawMonoid _413 = { 0u, 0u };
-static const DrawMonoid _437 = { 1u, 0u };
-static const DrawMonoid _439 = { 1u, 1u };
+static const DrawMonoid _23 = { 0u, 0u, 0u, 0u };
 
-RWByteAddressBuffer _199 : register(u0, space0);
-ByteAddressBuffer _223 : register(t2, space0);
-ByteAddressBuffer _1020 : register(t3, space0);
-ByteAddressBuffer _1054 : register(t1, space0);
+ByteAddressBuffer _92 : register(t1, space0);
+ByteAddressBuffer _102 : register(t2, space0);
+ByteAddressBuffer _202 : register(t3, space0);
+RWByteAddressBuffer _284 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -182,389 +63,44 @@
 
 groupshared DrawMonoid sh_scratch[256];
 
-ElementTag Element_tag(ElementRef ref)
-{
-    uint tag_and_flags = _223.Load((ref.offset >> uint(2)) * 4 + 0);
-    ElementTag _378 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _378;
-}
-
 DrawMonoid map_tag(uint tag_word)
 {
-    switch (tag_word)
-    {
-        case 4u:
-        case 5u:
-        case 6u:
-        {
-            return _437;
-        }
-        case 9u:
-        case 10u:
-        {
-            return _439;
-        }
-        default:
-        {
-            return _413;
-        }
-    }
+    uint has_path = uint(tag_word != 0u);
+    DrawMonoid _75 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return _75;
 }
 
-ElementRef Element_index(ElementRef ref, uint index)
-{
-    ElementRef _212 = { ref.offset + (index * 36u) };
-    return _212;
-}
-
-DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
-DrawMonoid tag_monoid_identity()
+DrawMonoid draw_monoid_identity()
 {
-    return _413;
-}
-
-FillColor FillColor_read(FillColorRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
-    FillColor s;
-    s.rgba_color = raw0;
-    return s;
-}
-
-FillColor Element_FillColor_read(ElementRef ref)
-{
-    FillColorRef _384 = { ref.offset + 4u };
-    FillColorRef param = _384;
-    return FillColor_read(param);
-}
-
-bool touch_mem(Alloc alloc, uint offset)
-{
-    return true;
-}
-
-void write_mem(Alloc alloc, uint offset, uint val)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return;
-    }
-    _199.Store(offset * 4 + 8, val);
-}
-
-void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = asuint(s.linewidth);
-    write_mem(param_12, param_13, param_14);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.rgba_color;
-    write_mem(param_15, param_16, param_17);
-}
-
-void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 1u;
-    write_mem(param, param_1, param_2);
-    AnnoColorRef _818 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoColorRef param_4 = _818;
-    AnnoColor param_5 = s;
-    AnnoColor_write(param_3, param_4, param_5);
-}
-
-FillLinGradient FillLinGradient_read(FillLinGradientRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
-    uint raw2 = _223.Load((ix + 2u) * 4 + 0);
-    uint raw3 = _223.Load((ix + 3u) * 4 + 0);
-    uint raw4 = _223.Load((ix + 4u) * 4 + 0);
-    FillLinGradient s;
-    s.index = raw0;
-    s.p0 = float2(asfloat(raw1), asfloat(raw2));
-    s.p1 = float2(asfloat(raw3), asfloat(raw4));
-    return s;
-}
-
-FillLinGradient Element_FillLinGradient_read(ElementRef ref)
-{
-    FillLinGradientRef _392 = { ref.offset + 4u };
-    FillLinGradientRef param = _392;
-    return FillLinGradient_read(param);
-}
-
-void AnnoLinGradient_write(Alloc a, AnnoLinGradientRef ref, AnnoLinGradient s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = asuint(s.linewidth);
-    write_mem(param_12, param_13, param_14);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17);
-    Alloc param_18 = a;
-    uint param_19 = ix + 6u;
-    uint param_20 = asuint(s.line_x);
-    write_mem(param_18, param_19, param_20);
-    Alloc param_21 = a;
-    uint param_22 = ix + 7u;
-    uint param_23 = asuint(s.line_y);
-    write_mem(param_21, param_22, param_23);
-    Alloc param_24 = a;
-    uint param_25 = ix + 8u;
-    uint param_26 = asuint(s.line_c);
-    write_mem(param_24, param_25, param_26);
-}
-
-void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinGradient s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 2u;
-    write_mem(param, param_1, param_2);
-    AnnoLinGradientRef _839 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoLinGradientRef param_4 = _839;
-    AnnoLinGradient param_5 = s;
-    AnnoLinGradient_write(param_3, param_4, param_5);
-}
-
-FillImage FillImage_read(FillImageRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
-    FillImage s;
-    s.index = raw0;
-    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-    return s;
-}
-
-FillImage Element_FillImage_read(ElementRef ref)
-{
-    FillImageRef _400 = { ref.offset + 4u };
-    FillImageRef param = _400;
-    return FillImage_read(param);
-}
-
-void AnnoImage_write(Alloc a, AnnoImageRef ref, AnnoImage s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = asuint(s.linewidth);
-    write_mem(param_12, param_13, param_14);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17);
-    Alloc param_18 = a;
-    uint param_19 = ix + 6u;
-    uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_18, param_19, param_20);
-}
-
-void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 3u;
-    write_mem(param, param_1, param_2);
-    AnnoImageRef _860 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoImageRef param_4 = _860;
-    AnnoImage param_5 = s;
-    AnnoImage_write(param_3, param_4, param_5);
-}
-
-Clip Clip_read(ClipRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
-    uint raw2 = _223.Load((ix + 2u) * 4 + 0);
-    uint raw3 = _223.Load((ix + 3u) * 4 + 0);
-    Clip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.blend = _223.Load((ix + 4u) * 4 + 0);
-    return s;
-}
-
-Clip Element_BeginClip_read(ElementRef ref)
-{
-    ClipRef _408 = { ref.offset + 4u };
-    ClipRef param = _408;
-    return Clip_read(param);
-}
-
-void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = asuint(s.linewidth);
-    write_mem(param_12, param_13, param_14);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.blend;
-    write_mem(param_15, param_16, param_17);
-}
-
-void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginClip s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 4u;
-    write_mem(param, param_1, param_2);
-    AnnoBeginClipRef _881 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoBeginClipRef param_4 = _881;
-    AnnoBeginClip param_5 = s;
-    AnnoBeginClip_write(param_3, param_4, param_5);
-}
-
-void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = s.blend;
-    write_mem(param_12, param_13, param_14);
-}
-
-void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoEndClip s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 5u;
-    write_mem(param, param_1, param_2);
-    AnnoEndClipRef _902 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoEndClipRef param_4 = _902;
-    AnnoEndClip param_5 = s;
-    AnnoEndClip_write(param_3, param_4, param_5);
+    return _23;
 }
 
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef _920 = { ix * 36u };
-    ElementRef ref = _920;
-    ElementRef param = ref;
-    uint tag_word = Element_tag(param).tag;
-    uint param_1 = tag_word;
-    DrawMonoid agg = map_tag(param_1);
+    uint drawtag_base = _92.Load(100) >> uint(2);
+    uint tag_word = _102.Load((drawtag_base + ix) * 4 + 0);
+    uint param = tag_word;
+    DrawMonoid agg = map_tag(param);
     DrawMonoid local[8];
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
     {
-        ElementRef param_2 = ref;
-        uint param_3 = i;
-        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4).tag;
-        uint param_5 = tag_word;
-        DrawMonoid param_6 = agg;
-        DrawMonoid param_7 = map_tag(param_5);
-        agg = combine_tag_monoid(param_6, param_7);
+        tag_word = _102.Load(((drawtag_base + ix) + i) * 4 + 0);
+        uint param_1 = tag_word;
+        DrawMonoid param_2 = agg;
+        DrawMonoid param_3 = map_tag(param_1);
+        agg = combine_draw_monoid(param_2, param_3);
         local[i] = agg;
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -574,194 +110,121 @@
         if (gl_LocalInvocationID.x >= (1u << i_1))
         {
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
-            DrawMonoid param_8 = other;
-            DrawMonoid param_9 = agg;
-            agg = combine_tag_monoid(param_8, param_9);
+            DrawMonoid param_4 = other;
+            DrawMonoid param_5 = agg;
+            agg = combine_draw_monoid(param_4, param_5);
         }
         GroupMemoryBarrierWithGroupSync();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     GroupMemoryBarrierWithGroupSync();
-    DrawMonoid row = tag_monoid_identity();
+    DrawMonoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        DrawMonoid _1026;
-        _1026.path_ix = _1020.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
-        _1026.clip_ix = _1020.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
-        row.path_ix = _1026.path_ix;
-        row.clip_ix = _1026.clip_ix;
+        DrawMonoid _208;
+        _208.path_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
+        _208.clip_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
+        _208.scene_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
+        _208.info_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
+        row.path_ix = _208.path_ix;
+        row.clip_ix = _208.clip_ix;
+        row.scene_offset = _208.scene_offset;
+        row.info_offset = _208.info_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
-        DrawMonoid param_10 = row;
-        DrawMonoid param_11 = sh_scratch[gl_LocalInvocationID.x - 1u];
-        row = combine_tag_monoid(param_10, param_11);
+        DrawMonoid param_6 = row;
+        DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_draw_monoid(param_6, param_7);
     }
+    uint drawdata_base = _92.Load(104) >> uint(2);
+    uint drawinfo_base = _92.Load(68) >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_1054.Load(44) >> uint(2)) + (out_ix * 2u);
-    uint clip_out_base = _1054.Load(48) >> uint(2);
-    AnnotatedRef _1075 = { _1054.Load(32) + (out_ix * 40u) };
-    AnnotatedRef out_ref = _1075;
+    uint out_base = (_92.Load(44) >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _92.Load(48) >> uint(2);
     float4 mat;
     float2 translate;
-    AnnoColor anno_fill;
-    Alloc param_18;
-    AnnoLinGradient anno_lin;
-    Alloc param_23;
-    AnnoImage anno_img;
-    Alloc param_28;
-    AnnoBeginClip anno_begin_clip;
-    Alloc param_33;
-    AnnoEndClip anno_end_clip;
-    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
         if (i_2 > 0u)
         {
-            DrawMonoid param_12 = m;
-            DrawMonoid param_13 = local[i_2 - 1u];
-            m = combine_tag_monoid(param_12, param_13);
+            DrawMonoid param_8 = m;
+            DrawMonoid param_9 = local[i_2 - 1u];
+            m = combine_draw_monoid(param_8, param_9);
         }
-        _199.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
-        _199.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
-        ElementRef param_14 = ref;
-        uint param_15 = i_2;
-        ElementRef this_ref = Element_index(param_14, param_15);
-        ElementRef param_16 = this_ref;
-        tag_word = Element_tag(param_16).tag;
-        if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
+        _284.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
+        _284.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
+        _284.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
+        _284.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
+        uint dd = drawdata_base + (m.scene_offset >> uint(2));
+        uint di = drawinfo_base + (m.info_offset >> uint(2));
+        tag_word = _102.Load(((drawtag_base + ix) + i_2) * 4 + 0);
+        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_1054.Load(40) >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_199.Load(bbox_offset * 4 + 8)) - 32768.0f;
-            float bbox_t = float(_199.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
-            float bbox_r = float(_199.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
-            float bbox_b = float(_199.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            uint bbox_offset = (_92.Load(40) >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_284.Load(bbox_offset * 4 + 8)) - 32768.0f;
+            float bbox_t = float(_284.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
+            float bbox_r = float(_284.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
+            float bbox_b = float(_284.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = asfloat(_199.Load((bbox_offset + 4u) * 4 + 8));
+            float linewidth = asfloat(_284.Load((bbox_offset + 4u) * 4 + 8));
             uint fill_mode = uint(linewidth >= 0.0f);
-            if ((linewidth >= 0.0f) || (tag_word == 5u))
+            if ((linewidth >= 0.0f) || (tag_word == 276u))
             {
-                uint trans_ix = _199.Load((bbox_offset + 5u) * 4 + 8);
-                uint t = (_1054.Load(36) >> uint(2)) + (6u * trans_ix);
-                mat = asfloat(uint4(_199.Load(t * 4 + 8), _199.Load((t + 1u) * 4 + 8), _199.Load((t + 2u) * 4 + 8), _199.Load((t + 3u) * 4 + 8)));
-                if (tag_word == 5u)
+                uint trans_ix = _284.Load((bbox_offset + 5u) * 4 + 8);
+                uint t = (_92.Load(36) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_284.Load(t * 4 + 8), _284.Load((t + 1u) * 4 + 8), _284.Load((t + 2u) * 4 + 8), _284.Load((t + 3u) * 4 + 8)));
+                if (tag_word == 276u)
                 {
-                    translate = asfloat(uint2(_199.Load((t + 4u) * 4 + 8), _199.Load((t + 5u) * 4 + 8)));
+                    translate = asfloat(uint2(_284.Load((t + 4u) * 4 + 8), _284.Load((t + 5u) * 4 + 8)));
                 }
             }
             if (linewidth >= 0.0f)
             {
                 linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z)));
             }
-            linewidth = max(linewidth, 0.0f);
             switch (tag_word)
             {
-                case 4u:
+                case 68u:
+                case 72u:
                 {
-                    ElementRef param_17 = this_ref;
-                    FillColor fill = Element_FillColor_read(param_17);
-                    anno_fill.bbox = bbox;
-                    anno_fill.linewidth = linewidth;
-                    anno_fill.rgba_color = fill.rgba_color;
-                    Alloc _1288;
-                    _1288.offset = _1054.Load(32);
-                    param_18.offset = _1288.offset;
-                    AnnotatedRef param_19 = out_ref;
-                    uint param_20 = fill_mode;
-                    AnnoColor param_21 = anno_fill;
-                    Annotated_Color_write(param_18, param_19, param_20, param_21);
+                    _284.Store(di * 4 + 8, asuint(linewidth));
                     break;
                 }
-                case 5u:
+                case 276u:
                 {
-                    ElementRef param_22 = this_ref;
-                    FillLinGradient lin = Element_FillLinGradient_read(param_22);
-                    anno_lin.bbox = bbox;
-                    anno_lin.linewidth = linewidth;
-                    anno_lin.index = lin.index;
-                    float2 p0 = ((mat.xy * lin.p0.x) + (mat.zw * lin.p0.y)) + translate;
-                    float2 p1 = ((mat.xy * lin.p1.x) + (mat.zw * lin.p1.y)) + translate;
+                    _284.Store(di * 4 + 8, asuint(linewidth));
+                    uint index = _102.Load(dd * 4 + 0);
+                    float2 p0 = asfloat(uint2(_102.Load((dd + 1u) * 4 + 0), _102.Load((dd + 2u) * 4 + 0)));
+                    float2 p1 = asfloat(uint2(_102.Load((dd + 3u) * 4 + 0), _102.Load((dd + 4u) * 4 + 0)));
+                    p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
+                    p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                     float2 dxy = p1 - p0;
                     float scale = 1.0f / ((dxy.x * dxy.x) + (dxy.y * dxy.y));
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
-                    anno_lin.line_x = line_x;
-                    anno_lin.line_y = line_y;
-                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    Alloc _1384;
-                    _1384.offset = _1054.Load(32);
-                    param_23.offset = _1384.offset;
-                    AnnotatedRef param_24 = out_ref;
-                    uint param_25 = fill_mode;
-                    AnnoLinGradient param_26 = anno_lin;
-                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26);
+                    float line_c = -((p0.x * line_x) + (p0.y * line_y));
+                    _284.Store((di + 1u) * 4 + 8, asuint(line_x));
+                    _284.Store((di + 2u) * 4 + 8, asuint(line_y));
+                    _284.Store((di + 3u) * 4 + 8, asuint(line_c));
                     break;
                 }
-                case 6u:
+                case 5u:
                 {
-                    ElementRef param_27 = this_ref;
-                    FillImage fill_img = Element_FillImage_read(param_27);
-                    anno_img.bbox = bbox;
-                    anno_img.linewidth = linewidth;
-                    anno_img.index = fill_img.index;
-                    anno_img.offset = fill_img.offset;
-                    Alloc _1412;
-                    _1412.offset = _1054.Load(32);
-                    param_28.offset = _1412.offset;
-                    AnnotatedRef param_29 = out_ref;
-                    uint param_30 = fill_mode;
-                    AnnoImage param_31 = anno_img;
-                    Annotated_Image_write(param_28, param_29, param_30, param_31);
-                    break;
-                }
-                case 9u:
-                {
-                    ElementRef param_32 = this_ref;
-                    Clip begin_clip = Element_BeginClip_read(param_32);
-                    anno_begin_clip.bbox = bbox;
-                    anno_begin_clip.linewidth = 0.0f;
-                    anno_begin_clip.blend = begin_clip.blend;
-                    uint flags = uint(begin_clip.blend != 3u) << uint(1);
-                    Alloc _1442;
-                    _1442.offset = _1054.Load(32);
-                    param_33.offset = _1442.offset;
-                    AnnotatedRef param_34 = out_ref;
-                    uint param_35 = flags;
-                    AnnoBeginClip param_36 = anno_begin_clip;
-                    Annotated_BeginClip_write(param_33, param_34, param_35, param_36);
                     break;
                 }
             }
         }
-        else
-        {
-            if (tag_word == 10u)
-            {
-                ElementRef param_37 = this_ref;
-                Clip end_clip = Element_BeginClip_read(param_37);
-                anno_end_clip.bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
-                anno_end_clip.blend = end_clip.blend;
-                uint flags_1 = uint(end_clip.blend != 3u) << uint(1);
-                Alloc _1480;
-                _1480.offset = _1054.Load(32);
-                param_38.offset = _1480.offset;
-                AnnotatedRef param_39 = out_ref;
-                uint param_40 = flags_1;
-                AnnoEndClip param_41 = anno_end_clip;
-                Annotated_EndClip_write(param_38, param_39, param_40, param_41);
-            }
-        }
-        if ((tag_word == 9u) || (tag_word == 10u))
+        if ((tag_word == 5u) || (tag_word == 37u))
         {
             uint path_ix = ~(out_ix + i_2);
-            if (tag_word == 9u)
+            if (tag_word == 5u)
             {
                 path_ix = m.path_ix;
             }
-            _199.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
+            _284.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
         }
-        out_ref.offset += 40u;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
index 5b9ecc6..a8516ae 100644
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -44,145 +44,53 @@
     }
 };
 
+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
+};
+
 struct Alloc
 {
     uint offset;
 };
 
-struct ElementRef
+struct Config
 {
-    uint offset;
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc path_bbox_alloc;
+    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
+    uint n_trans;
+    uint n_path;
+    uint n_clip;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
-struct FillColorRef
+struct ConfigBuf
 {
-    uint offset;
-};
-
-struct FillColor
-{
-    uint rgba_color;
-};
-
-struct FillLinGradientRef
-{
-    uint offset;
-};
-
-struct FillLinGradient
-{
-    uint index;
-    float2 p0;
-    float2 p1;
-};
-
-struct FillImageRef
-{
-    uint offset;
-};
-
-struct FillImage
-{
-    uint index;
-    int2 offset;
-};
-
-struct ClipRef
-{
-    uint offset;
-};
-
-struct Clip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct ElementTag
-{
-    uint tag;
-    uint flags;
-};
-
-struct DrawMonoid
-{
-    uint path_ix;
-    uint clip_ix;
-};
-
-struct AnnoImageRef
-{
-    uint offset;
-};
-
-struct AnnoImage
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    int2 offset;
-};
-
-struct AnnoColorRef
-{
-    uint offset;
-};
-
-struct AnnoColor
-{
-    float4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-struct AnnoLinGradientRef
-{
-    uint offset;
-};
-
-struct AnnoLinGradient
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    float line_x;
-    float line_y;
-    float line_c;
-};
-
-struct AnnoBeginClipRef
-{
-    uint offset;
-};
-
-struct AnnoBeginClip
-{
-    float4 bbox;
-    float linewidth;
-    uint blend;
-};
-
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct Memory
-{
-    uint mem_offset;
-    uint mem_error;
-    uint memory[1];
+    Config conf;
 };
 
 struct SceneBuf
@@ -194,6 +102,8 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct ParentBuf
@@ -201,442 +111,56 @@
     DrawMonoid_1 parent[1];
 };
 
-struct Alloc_1
+struct Memory
 {
-    uint offset;
-};
-
-struct Config
-{
-    uint n_elements;
-    uint n_pathseg;
-    uint width_in_tiles;
-    uint height_in_tiles;
-    Alloc_1 tile_alloc;
-    Alloc_1 bin_alloc;
-    Alloc_1 ptcl_alloc;
-    Alloc_1 pathseg_alloc;
-    Alloc_1 anno_alloc;
-    Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
-    Alloc_1 drawmonoid_alloc;
-    Alloc_1 clip_alloc;
-    Alloc_1 clip_bic_alloc;
-    Alloc_1 clip_stack_alloc;
-    Alloc_1 clip_bbox_alloc;
-    uint n_trans;
-    uint n_path;
-    uint n_clip;
-    uint trans_offset;
-    uint linewidth_offset;
-    uint pathtag_offset;
-    uint pathseg_offset;
-};
-
-struct ConfigBuf
-{
-    Config conf;
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
 };
 
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    uint tag_and_flags = v_223.scene[ref.offset >> uint(2)];
-    return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
 DrawMonoid map_tag(thread const uint& tag_word)
 {
-    switch (tag_word)
-    {
-        case 4u:
-        case 5u:
-        case 6u:
-        {
-            return DrawMonoid{ 1u, 0u };
-        }
-        case 9u:
-        case 10u:
-        {
-            return DrawMonoid{ 1u, 1u };
-        }
-        default:
-        {
-            return DrawMonoid{ 0u, 0u };
-        }
-    }
+    uint has_path = uint(tag_word != 0u);
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
 }
 
 static inline __attribute__((always_inline))
-ElementRef Element_index(thread const ElementRef& ref, thread const uint& index)
-{
-    return ElementRef{ ref.offset + (index * 36u) };
-}
-
-static inline __attribute__((always_inline))
-DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
 static inline __attribute__((always_inline))
-DrawMonoid tag_monoid_identity()
+DrawMonoid draw_monoid_identity()
 {
-    return DrawMonoid{ 0u, 0u };
+    return DrawMonoid{ 0u, 0u, 0u, 0u };
 }
 
-static inline __attribute__((always_inline))
-FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_223)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_223.scene[ix + 0u];
-    FillColor s;
-    s.rgba_color = raw0;
-    return s;
-}
-
-static inline __attribute__((always_inline))
-FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    FillColorRef param = FillColorRef{ ref.offset + 4u };
-    return FillColor_read(param, v_223);
-}
-
-static inline __attribute__((always_inline))
-bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
-{
-    return true;
-}
-
-static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_199)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return;
-    }
-    v_199.memory[offset] = val;
-}
-
-static inline __attribute__((always_inline))
-void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_199);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.rgba_color;
-    write_mem(param_15, param_16, param_17, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 1u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoColorRef param_4 = AnnoColorRef{ ref.offset + 4u };
-    AnnoColor param_5 = s;
-    AnnoColor_write(param_3, param_4, param_5, v_199);
-}
-
-static inline __attribute__((always_inline))
-FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_223)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_223.scene[ix + 0u];
-    uint raw1 = v_223.scene[ix + 1u];
-    uint raw2 = v_223.scene[ix + 2u];
-    uint raw3 = v_223.scene[ix + 3u];
-    uint raw4 = v_223.scene[ix + 4u];
-    FillLinGradient s;
-    s.index = raw0;
-    s.p0 = float2(as_type<float>(raw1), as_type<float>(raw2));
-    s.p1 = float2(as_type<float>(raw3), as_type<float>(raw4));
-    return s;
-}
-
-static inline __attribute__((always_inline))
-FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    FillLinGradientRef param = FillLinGradientRef{ ref.offset + 4u };
-    return FillLinGradient_read(param, v_223);
-}
-
-static inline __attribute__((always_inline))
-void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_199);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_199);
-    Alloc param_18 = a;
-    uint param_19 = ix + 6u;
-    uint param_20 = as_type<uint>(s.line_x);
-    write_mem(param_18, param_19, param_20, v_199);
-    Alloc param_21 = a;
-    uint param_22 = ix + 7u;
-    uint param_23 = as_type<uint>(s.line_y);
-    write_mem(param_21, param_22, param_23, v_199);
-    Alloc param_24 = a;
-    uint param_25 = ix + 8u;
-    uint param_26 = as_type<uint>(s.line_c);
-    write_mem(param_24, param_25, param_26, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 2u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoLinGradientRef param_4 = AnnoLinGradientRef{ ref.offset + 4u };
-    AnnoLinGradient param_5 = s;
-    AnnoLinGradient_write(param_3, param_4, param_5, v_199);
-}
-
-static inline __attribute__((always_inline))
-FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_223)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_223.scene[ix + 0u];
-    uint raw1 = v_223.scene[ix + 1u];
-    FillImage s;
-    s.index = raw0;
-    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-    return s;
-}
-
-static inline __attribute__((always_inline))
-FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    FillImageRef param = FillImageRef{ ref.offset + 4u };
-    return FillImage_read(param, v_223);
-}
-
-static inline __attribute__((always_inline))
-void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_199);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_199);
-    Alloc param_18 = a;
-    uint param_19 = ix + 6u;
-    uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_18, param_19, param_20, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 3u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoImageRef param_4 = AnnoImageRef{ ref.offset + 4u };
-    AnnoImage param_5 = s;
-    AnnoImage_write(param_3, param_4, param_5, v_199);
-}
-
-static inline __attribute__((always_inline))
-Clip Clip_read(thread const ClipRef& ref, const device SceneBuf& v_223)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_223.scene[ix + 0u];
-    uint raw1 = v_223.scene[ix + 1u];
-    uint raw2 = v_223.scene[ix + 2u];
-    uint raw3 = v_223.scene[ix + 3u];
-    Clip s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.blend = v_223.scene[ix + 4u];
-    return s;
-}
-
-static inline __attribute__((always_inline))
-Clip Element_BeginClip_read(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    ClipRef param = ClipRef{ ref.offset + 4u };
-    return Clip_read(param, v_223);
-}
-
-static inline __attribute__((always_inline))
-void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_199);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.blend;
-    write_mem(param_15, param_16, param_17, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 4u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoBeginClipRef param_4 = AnnoBeginClipRef{ ref.offset + 4u };
-    AnnoBeginClip param_5 = s;
-    AnnoBeginClip_write(param_3, param_4, param_5, v_199);
-}
-
-static inline __attribute__((always_inline))
-void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = s.blend;
-    write_mem(param_12, param_13, param_14, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoEndClip& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 5u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoEndClipRef param_4 = AnnoEndClipRef{ ref.offset + 4u };
-    AnnoEndClip param_5 = s;
-    AnnoEndClip_write(param_3, param_4, param_5, v_199);
-}
-
-kernel void main0(device Memory& v_199 [[buffer(0)]], const device ConfigBuf& _1054 [[buffer(1)]], const device SceneBuf& v_223 [[buffer(2)]], const device ParentBuf& _1020 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92 [[buffer(1)]], const device SceneBuf& _102 [[buffer(2)]], const device ParentBuf& _202 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef ref = ElementRef{ ix * 36u };
-    ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_223).tag;
-    uint param_1 = tag_word;
-    DrawMonoid agg = map_tag(param_1);
+    uint drawtag_base = _92.conf.drawtag_offset >> uint(2);
+    uint tag_word = _102.scene[drawtag_base + ix];
+    uint param = tag_word;
+    DrawMonoid agg = map_tag(param);
     spvUnsafeArray<DrawMonoid, 8> local;
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
     {
-        ElementRef param_2 = ref;
-        uint param_3 = i;
-        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_223).tag;
-        uint param_5 = tag_word;
-        DrawMonoid param_6 = agg;
-        DrawMonoid param_7 = map_tag(param_5);
-        agg = combine_tag_monoid(param_6, param_7);
+        tag_word = _102.scene[(drawtag_base + ix) + i];
+        uint param_1 = tag_word;
+        DrawMonoid param_2 = agg;
+        DrawMonoid param_3 = map_tag(param_1);
+        agg = combine_draw_monoid(param_2, param_3);
         local[i] = agg;
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -646,181 +170,117 @@
         if (gl_LocalInvocationID.x >= (1u << i_1))
         {
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
-            DrawMonoid param_8 = other;
-            DrawMonoid param_9 = agg;
-            agg = combine_tag_monoid(param_8, param_9);
+            DrawMonoid param_4 = other;
+            DrawMonoid param_5 = agg;
+            agg = combine_draw_monoid(param_4, param_5);
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    DrawMonoid row = tag_monoid_identity();
+    DrawMonoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _1023 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _1020.parent[_1023].path_ix;
-        row.clip_ix = _1020.parent[_1023].clip_ix;
+        uint _205 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _202.parent[_205].path_ix;
+        row.clip_ix = _202.parent[_205].clip_ix;
+        row.scene_offset = _202.parent[_205].scene_offset;
+        row.info_offset = _202.parent[_205].info_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
-        DrawMonoid param_10 = row;
-        DrawMonoid param_11 = sh_scratch[gl_LocalInvocationID.x - 1u];
-        row = combine_tag_monoid(param_10, param_11);
+        DrawMonoid param_6 = row;
+        DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_draw_monoid(param_6, param_7);
     }
+    uint drawdata_base = _92.conf.drawdata_offset >> uint(2);
+    uint drawinfo_base = _92.conf.drawinfo_alloc.offset >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_1054.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
-    uint clip_out_base = _1054.conf.clip_alloc.offset >> uint(2);
-    AnnotatedRef out_ref = AnnotatedRef{ _1054.conf.anno_alloc.offset + (out_ix * 40u) };
+    uint out_base = (_92.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _92.conf.clip_alloc.offset >> uint(2);
     float4 mat;
     float2 translate;
-    AnnoColor anno_fill;
-    Alloc param_18;
-    AnnoLinGradient anno_lin;
-    Alloc param_23;
-    AnnoImage anno_img;
-    Alloc param_28;
-    AnnoBeginClip anno_begin_clip;
-    Alloc param_33;
-    AnnoEndClip anno_end_clip;
-    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
         if (i_2 > 0u)
         {
-            DrawMonoid param_12 = m;
-            DrawMonoid param_13 = local[i_2 - 1u];
-            m = combine_tag_monoid(param_12, param_13);
+            DrawMonoid param_8 = m;
+            DrawMonoid param_9 = local[i_2 - 1u];
+            m = combine_draw_monoid(param_8, param_9);
         }
-        v_199.memory[out_base + (i_2 * 2u)] = m.path_ix;
-        v_199.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
-        ElementRef param_14 = ref;
-        uint param_15 = i_2;
-        ElementRef this_ref = Element_index(param_14, param_15);
-        ElementRef param_16 = this_ref;
-        tag_word = Element_tag(param_16, v_223).tag;
-        if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
+        _284.memory[out_base + (i_2 * 4u)] = m.path_ix;
+        _284.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
+        _284.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
+        _284.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
+        uint dd = drawdata_base + (m.scene_offset >> uint(2));
+        uint di = drawinfo_base + (m.info_offset >> uint(2));
+        tag_word = _102.scene[(drawtag_base + ix) + i_2];
+        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_1054.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(v_199.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(v_199.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(v_199.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(v_199.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_92.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_284.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(_284.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(_284.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(_284.memory[bbox_offset + 3u]) - 32768.0;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(v_199.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(_284.memory[bbox_offset + 4u]);
             uint fill_mode = uint(linewidth >= 0.0);
-            if ((linewidth >= 0.0) || (tag_word == 5u))
+            if ((linewidth >= 0.0) || (tag_word == 276u))
             {
-                uint trans_ix = v_199.memory[bbox_offset + 5u];
-                uint t = (_1054.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(v_199.memory[t], v_199.memory[t + 1u], v_199.memory[t + 2u], v_199.memory[t + 3u]));
-                if (tag_word == 5u)
+                uint trans_ix = _284.memory[bbox_offset + 5u];
+                uint t = (_92.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(_284.memory[t], _284.memory[t + 1u], _284.memory[t + 2u], _284.memory[t + 3u]));
+                if (tag_word == 276u)
                 {
-                    translate = as_type<float2>(uint2(v_199.memory[t + 4u], v_199.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(_284.memory[t + 4u], _284.memory[t + 5u]));
                 }
             }
             if (linewidth >= 0.0)
             {
                 linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z)));
             }
-            linewidth = fast::max(linewidth, 0.0);
             switch (tag_word)
             {
-                case 4u:
+                case 68u:
+                case 72u:
                 {
-                    ElementRef param_17 = this_ref;
-                    FillColor fill = Element_FillColor_read(param_17, v_223);
-                    anno_fill.bbox = bbox;
-                    anno_fill.linewidth = linewidth;
-                    anno_fill.rgba_color = fill.rgba_color;
-                    param_18.offset = _1054.conf.anno_alloc.offset;
-                    AnnotatedRef param_19 = out_ref;
-                    uint param_20 = fill_mode;
-                    AnnoColor param_21 = anno_fill;
-                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_199);
+                    _284.memory[di] = as_type<uint>(linewidth);
                     break;
                 }
-                case 5u:
+                case 276u:
                 {
-                    ElementRef param_22 = this_ref;
-                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_223);
-                    anno_lin.bbox = bbox;
-                    anno_lin.linewidth = linewidth;
-                    anno_lin.index = lin.index;
-                    float2 p0 = ((mat.xy * lin.p0.x) + (mat.zw * lin.p0.y)) + translate;
-                    float2 p1 = ((mat.xy * lin.p1.x) + (mat.zw * lin.p1.y)) + translate;
+                    _284.memory[di] = as_type<uint>(linewidth);
+                    uint index = _102.scene[dd];
+                    float2 p0 = as_type<float2>(uint2(_102.scene[dd + 1u], _102.scene[dd + 2u]));
+                    float2 p1 = as_type<float2>(uint2(_102.scene[dd + 3u], _102.scene[dd + 4u]));
+                    p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
+                    p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                     float2 dxy = p1 - p0;
                     float scale = 1.0 / ((dxy.x * dxy.x) + (dxy.y * dxy.y));
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
-                    anno_lin.line_x = line_x;
-                    anno_lin.line_y = line_y;
-                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    param_23.offset = _1054.conf.anno_alloc.offset;
-                    AnnotatedRef param_24 = out_ref;
-                    uint param_25 = fill_mode;
-                    AnnoLinGradient param_26 = anno_lin;
-                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_199);
+                    float line_c = -((p0.x * line_x) + (p0.y * line_y));
+                    _284.memory[di + 1u] = as_type<uint>(line_x);
+                    _284.memory[di + 2u] = as_type<uint>(line_y);
+                    _284.memory[di + 3u] = as_type<uint>(line_c);
                     break;
                 }
-                case 6u:
+                case 5u:
                 {
-                    ElementRef param_27 = this_ref;
-                    FillImage fill_img = Element_FillImage_read(param_27, v_223);
-                    anno_img.bbox = bbox;
-                    anno_img.linewidth = linewidth;
-                    anno_img.index = fill_img.index;
-                    anno_img.offset = fill_img.offset;
-                    param_28.offset = _1054.conf.anno_alloc.offset;
-                    AnnotatedRef param_29 = out_ref;
-                    uint param_30 = fill_mode;
-                    AnnoImage param_31 = anno_img;
-                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_199);
-                    break;
-                }
-                case 9u:
-                {
-                    ElementRef param_32 = this_ref;
-                    Clip begin_clip = Element_BeginClip_read(param_32, v_223);
-                    anno_begin_clip.bbox = bbox;
-                    anno_begin_clip.linewidth = 0.0;
-                    anno_begin_clip.blend = begin_clip.blend;
-                    uint flags = uint(begin_clip.blend != 3u) << uint(1);
-                    param_33.offset = _1054.conf.anno_alloc.offset;
-                    AnnotatedRef param_34 = out_ref;
-                    uint param_35 = flags;
-                    AnnoBeginClip param_36 = anno_begin_clip;
-                    Annotated_BeginClip_write(param_33, param_34, param_35, param_36, v_199);
                     break;
                 }
             }
         }
-        else
-        {
-            if (tag_word == 10u)
-            {
-                ElementRef param_37 = this_ref;
-                Clip end_clip = Element_BeginClip_read(param_37, v_223);
-                anno_end_clip.bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
-                anno_end_clip.blend = end_clip.blend;
-                uint flags_1 = uint(end_clip.blend != 3u) << uint(1);
-                param_38.offset = _1054.conf.anno_alloc.offset;
-                AnnotatedRef param_39 = out_ref;
-                uint param_40 = flags_1;
-                AnnoEndClip param_41 = anno_end_clip;
-                Annotated_EndClip_write(param_38, param_39, param_40, param_41, v_199);
-            }
-        }
-        if ((tag_word == 9u) || (tag_word == 10u))
+        if ((tag_word == 5u) || (tag_word == 37u))
         {
             uint path_ix = ~(out_ix + i_2);
-            if (tag_word == 9u)
+            if (tag_word == 5u)
             {
                 path_ix = m.path_ix;
             }
-            v_199.memory[clip_out_base + m.clip_ix] = path_ix;
+            _284.memory[clip_out_base + m.clip_ix] = path_ix;
         }
-        out_ref.offset += 40u;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
index bdbdb0c..d18b287 100644
--- a/piet-gpu/shader/gen/draw_leaf.spv
+++ b/piet-gpu/shader/gen/draw_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
index 9b1b0fd..bfafc5f 100644
--- a/piet-gpu/shader/gen/draw_reduce.dxil
+++ b/piet-gpu/shader/gen/draw_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
index 0553c9b..7220b7e 100644
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -1,22 +1,11 @@
-struct ElementRef
-{
-    uint offset;
-};
-
-struct ElementTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct DrawMonoid
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
-static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
-
 struct Alloc
 {
     uint offset;
@@ -34,12 +23,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -47,16 +38,16 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
-static const DrawMonoid _87 = { 1u, 0u };
-static const DrawMonoid _89 = { 1u, 1u };
-static const DrawMonoid _91 = { 0u, 0u };
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-ByteAddressBuffer _46 : register(t2, space0);
-RWByteAddressBuffer _199 : register(u3, space0);
-RWByteAddressBuffer _213 : register(u0, space0);
-ByteAddressBuffer _219 : register(t1, space0);
+ByteAddressBuffer _86 : register(t1, space0);
+ByteAddressBuffer _96 : register(t2, space0);
+RWByteAddressBuffer _187 : register(u3, space0);
+RWByteAddressBuffer _205 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -70,68 +61,37 @@
 
 groupshared DrawMonoid sh_scratch[256];
 
-ElementTag Element_tag(ElementRef ref)
-{
-    uint tag_and_flags = _46.Load((ref.offset >> uint(2)) * 4 + 0);
-    ElementTag _60 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _60;
-}
-
 DrawMonoid map_tag(uint tag_word)
 {
-    switch (tag_word)
-    {
-        case 4u:
-        case 5u:
-        case 6u:
-        {
-            return _87;
-        }
-        case 9u:
-        case 10u:
-        {
-            return _89;
-        }
-        default:
-        {
-            return _91;
-        }
-    }
+    uint has_path = uint(tag_word != 0u);
+    DrawMonoid _69 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return _69;
 }
 
-ElementRef Element_index(ElementRef ref, uint index)
-{
-    ElementRef _39 = { ref.offset + (index * 36u) };
-    return _39;
-}
-
-DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef _107 = { ix * 36u };
-    ElementRef ref = _107;
-    ElementRef param = ref;
-    uint tag_word = Element_tag(param).tag;
-    uint param_1 = tag_word;
-    DrawMonoid agg = map_tag(param_1);
+    uint drawtag_base = _86.Load(100) >> uint(2);
+    uint tag_word = _96.Load((drawtag_base + ix) * 4 + 0);
+    uint param = tag_word;
+    DrawMonoid agg = map_tag(param);
     for (uint i = 1u; i < 8u; i++)
     {
-        ElementRef param_2 = ref;
-        uint param_3 = i;
-        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4).tag;
-        uint param_5 = tag_word;
-        DrawMonoid param_6 = agg;
-        DrawMonoid param_7 = map_tag(param_5);
-        agg = combine_tag_monoid(param_6, param_7);
+        uint tag_word_1 = _96.Load(((drawtag_base + ix) + i) * 4 + 0);
+        uint param_1 = tag_word_1;
+        DrawMonoid param_2 = agg;
+        DrawMonoid param_3 = map_tag(param_1);
+        agg = combine_draw_monoid(param_2, param_3);
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
@@ -140,17 +100,19 @@
         if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u)
         {
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
-            DrawMonoid param_8 = agg;
-            DrawMonoid param_9 = other;
-            agg = combine_tag_monoid(param_8, param_9);
+            DrawMonoid param_4 = agg;
+            DrawMonoid param_5 = other;
+            agg = combine_draw_monoid(param_4, param_5);
         }
         GroupMemoryBarrierWithGroupSync();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _199.Store(gl_WorkGroupID.x * 8 + 0, agg.path_ix);
-        _199.Store(gl_WorkGroupID.x * 8 + 4, agg.clip_ix);
+        _187.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix);
+        _187.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
+        _187.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
+        _187.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl
index 064c515..8e409a8 100644
--- a/piet-gpu/shader/gen/draw_reduce.msl
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@@ -5,48 +5,14 @@
 
 using namespace metal;
 
-struct ElementRef
-{
-    uint offset;
-};
-
-struct ElementTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct DrawMonoid
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
-struct SceneBuf
-{
-    uint scene[1];
-};
-
-struct DrawMonoid_1
-{
-    uint path_ix;
-    uint clip_ix;
-};
-
-struct OutBuf
-{
-    DrawMonoid_1 outbuf[1];
-};
-
-struct Memory
-{
-    uint mem_offset;
-    uint mem_error;
-    uint memory[1];
-};
-
-constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
-
 struct Alloc
 {
     uint offset;
@@ -64,12 +30,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -77,6 +45,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -84,70 +54,66 @@
     Config conf;
 };
 
-static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_46)
+struct SceneBuf
 {
-    uint tag_and_flags = v_46.scene[ref.offset >> uint(2)];
-    return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
+    uint scene[1];
+};
+
+struct DrawMonoid_1
+{
+    uint path_ix;
+    uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
+};
+
+struct OutBuf
+{
+    DrawMonoid_1 outbuf[1];
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
 DrawMonoid map_tag(thread const uint& tag_word)
 {
-    switch (tag_word)
-    {
-        case 4u:
-        case 5u:
-        case 6u:
-        {
-            return DrawMonoid{ 1u, 0u };
-        }
-        case 9u:
-        case 10u:
-        {
-            return DrawMonoid{ 1u, 1u };
-        }
-        default:
-        {
-            return DrawMonoid{ 0u, 0u };
-        }
-    }
+    uint has_path = uint(tag_word != 0u);
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
 }
 
 static inline __attribute__((always_inline))
-ElementRef Element_index(thread const ElementRef& ref, thread const uint& index)
-{
-    return ElementRef{ ref.offset + (index * 36u) };
-}
-
-static inline __attribute__((always_inline))
-DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
-kernel void main0(const device SceneBuf& v_46 [[buffer(2)]], device OutBuf& _199 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device ConfigBuf& _86 [[buffer(1)]], const device SceneBuf& _96 [[buffer(2)]], device OutBuf& _187 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef ref = ElementRef{ ix * 36u };
-    ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_46).tag;
-    uint param_1 = tag_word;
-    DrawMonoid agg = map_tag(param_1);
+    uint drawtag_base = _86.conf.drawtag_offset >> uint(2);
+    uint tag_word = _96.scene[drawtag_base + ix];
+    uint param = tag_word;
+    DrawMonoid agg = map_tag(param);
     for (uint i = 1u; i < 8u; i++)
     {
-        ElementRef param_2 = ref;
-        uint param_3 = i;
-        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_46).tag;
-        uint param_5 = tag_word;
-        DrawMonoid param_6 = agg;
-        DrawMonoid param_7 = map_tag(param_5);
-        agg = combine_tag_monoid(param_6, param_7);
+        uint tag_word_1 = _96.scene[(drawtag_base + ix) + i];
+        uint param_1 = tag_word_1;
+        DrawMonoid param_2 = agg;
+        DrawMonoid param_3 = map_tag(param_1);
+        agg = combine_draw_monoid(param_2, param_3);
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
@@ -156,17 +122,19 @@
         if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u)
         {
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
-            DrawMonoid param_8 = agg;
-            DrawMonoid param_9 = other;
-            agg = combine_tag_monoid(param_8, param_9);
+            DrawMonoid param_4 = agg;
+            DrawMonoid param_5 = other;
+            agg = combine_draw_monoid(param_4, param_5);
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _199.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
-        _199.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+        _187.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+        _187.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+        _187.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
+        _187.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv
index a45627d..4daf43a 100644
--- a/piet-gpu/shader/gen/draw_reduce.spv
+++ b/piet-gpu/shader/gen/draw_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil
index a84fd4a..873fa29 100644
--- a/piet-gpu/shader/gen/draw_root.dxil
+++ b/piet-gpu/shader/gen/draw_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl
index 56b513f..b4cb7e4 100644
--- a/piet-gpu/shader/gen/draw_root.hlsl
+++ b/piet-gpu/shader/gen/draw_root.hlsl
@@ -2,13 +2,15 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-static const DrawMonoid _18 = { 0u, 0u };
+static const DrawMonoid _18 = { 0u, 0u, 0u, 0u };
 
-RWByteAddressBuffer _57 : register(u0, space0);
+RWByteAddressBuffer _71 : register(u0, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -20,15 +22,17 @@
 
 groupshared DrawMonoid sh_scratch[256];
 
-DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
-DrawMonoid tag_monoid_identity()
+DrawMonoid draw_monoid_identity()
 {
     return _18;
 }
@@ -36,22 +40,30 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    DrawMonoid _61;
-    _61.path_ix = _57.Load(ix * 8 + 0);
-    _61.clip_ix = _57.Load(ix * 8 + 4);
+    DrawMonoid _75;
+    _75.path_ix = _71.Load(ix * 16 + 0);
+    _75.clip_ix = _71.Load(ix * 16 + 4);
+    _75.scene_offset = _71.Load(ix * 16 + 8);
+    _75.info_offset = _71.Load(ix * 16 + 12);
     DrawMonoid local[8];
-    local[0].path_ix = _61.path_ix;
-    local[0].clip_ix = _61.clip_ix;
+    local[0].path_ix = _75.path_ix;
+    local[0].clip_ix = _75.clip_ix;
+    local[0].scene_offset = _75.scene_offset;
+    local[0].info_offset = _75.info_offset;
     DrawMonoid param_1;
     for (uint i = 1u; i < 8u; i++)
     {
         DrawMonoid param = local[i - 1u];
-        DrawMonoid _88;
-        _88.path_ix = _57.Load((ix + i) * 8 + 0);
-        _88.clip_ix = _57.Load((ix + i) * 8 + 4);
-        param_1.path_ix = _88.path_ix;
-        param_1.clip_ix = _88.clip_ix;
-        local[i] = combine_tag_monoid(param, param_1);
+        DrawMonoid _106;
+        _106.path_ix = _71.Load((ix + i) * 16 + 0);
+        _106.clip_ix = _71.Load((ix + i) * 16 + 4);
+        _106.scene_offset = _71.Load((ix + i) * 16 + 8);
+        _106.info_offset = _71.Load((ix + i) * 16 + 12);
+        param_1.path_ix = _106.path_ix;
+        param_1.clip_ix = _106.clip_ix;
+        param_1.scene_offset = _106.scene_offset;
+        param_1.info_offset = _106.info_offset;
+        local[i] = combine_draw_monoid(param, param_1);
     }
     DrawMonoid agg = local[7];
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -63,13 +75,13 @@
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
             DrawMonoid param_2 = other;
             DrawMonoid param_3 = agg;
-            agg = combine_tag_monoid(param_2, param_3);
+            agg = combine_draw_monoid(param_2, param_3);
         }
         GroupMemoryBarrierWithGroupSync();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     GroupMemoryBarrierWithGroupSync();
-    DrawMonoid row = tag_monoid_identity();
+    DrawMonoid row = draw_monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
         row = sh_scratch[gl_LocalInvocationID.x - 1u];
@@ -78,10 +90,12 @@
     {
         DrawMonoid param_4 = row;
         DrawMonoid param_5 = local[i_2];
-        DrawMonoid m = combine_tag_monoid(param_4, param_5);
-        uint _177 = ix + i_2;
-        _57.Store(_177 * 8 + 0, m.path_ix);
-        _57.Store(_177 * 8 + 4, m.clip_ix);
+        DrawMonoid m = combine_draw_monoid(param_4, param_5);
+        uint _199 = ix + i_2;
+        _71.Store(_199 * 16 + 0, m.path_ix);
+        _71.Store(_199 * 16 + 4, m.clip_ix);
+        _71.Store(_199 * 16 + 8, m.scene_offset);
+        _71.Store(_199 * 16 + 12, m.info_offset);
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_root.msl b/piet-gpu/shader/gen/draw_root.msl
index 0d22e4b..9ee8cfe 100644
--- a/piet-gpu/shader/gen/draw_root.msl
+++ b/piet-gpu/shader/gen/draw_root.msl
@@ -48,12 +48,16 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct DrawMonoid_1
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct DataBuf
@@ -64,35 +68,41 @@
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
 static inline __attribute__((always_inline))
-DrawMonoid tag_monoid_identity()
+DrawMonoid draw_monoid_identity()
 {
-    return DrawMonoid{ 0u, 0u };
+    return DrawMonoid{ 0u, 0u, 0u, 0u };
 }
 
-kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device DataBuf& _71 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
     spvUnsafeArray<DrawMonoid, 8> local;
-    local[0].path_ix = _57.data[ix].path_ix;
-    local[0].clip_ix = _57.data[ix].clip_ix;
+    local[0].path_ix = _71.data[ix].path_ix;
+    local[0].clip_ix = _71.data[ix].clip_ix;
+    local[0].scene_offset = _71.data[ix].scene_offset;
+    local[0].info_offset = _71.data[ix].info_offset;
     DrawMonoid param_1;
     for (uint i = 1u; i < 8u; i++)
     {
-        uint _82 = ix + i;
+        uint _100 = ix + i;
         DrawMonoid param = local[i - 1u];
-        param_1.path_ix = _57.data[_82].path_ix;
-        param_1.clip_ix = _57.data[_82].clip_ix;
-        local[i] = combine_tag_monoid(param, param_1);
+        param_1.path_ix = _71.data[_100].path_ix;
+        param_1.clip_ix = _71.data[_100].clip_ix;
+        param_1.scene_offset = _71.data[_100].scene_offset;
+        param_1.info_offset = _71.data[_100].info_offset;
+        local[i] = combine_draw_monoid(param, param_1);
     }
     DrawMonoid agg = local[7];
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -104,13 +114,13 @@
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
             DrawMonoid param_2 = other;
             DrawMonoid param_3 = agg;
-            agg = combine_tag_monoid(param_2, param_3);
+            agg = combine_draw_monoid(param_2, param_3);
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    DrawMonoid row = tag_monoid_identity();
+    DrawMonoid row = draw_monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
         row = sh_scratch[gl_LocalInvocationID.x - 1u];
@@ -119,10 +129,12 @@
     {
         DrawMonoid param_4 = row;
         DrawMonoid param_5 = local[i_2];
-        DrawMonoid m = combine_tag_monoid(param_4, param_5);
-        uint _177 = ix + i_2;
-        _57.data[_177].path_ix = m.path_ix;
-        _57.data[_177].clip_ix = m.clip_ix;
+        DrawMonoid m = combine_draw_monoid(param_4, param_5);
+        uint _199 = ix + i_2;
+        _71.data[_199].path_ix = m.path_ix;
+        _71.data[_199].clip_ix = m.clip_ix;
+        _71.data[_199].scene_offset = m.scene_offset;
+        _71.data[_199].info_offset = m.info_offset;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_root.spv b/piet-gpu/shader/gen/draw_root.spv
index 1c11414..e6a53e5 100644
--- a/piet-gpu/shader/gen/draw_root.spv
+++ b/piet-gpu/shader/gen/draw_root.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index c0c27c9..bc0b8e8 100644
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 21bd083..f17b240 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -125,12 +125,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -138,6 +140,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 9a8fbd0..c1f41af 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -183,12 +183,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -196,6 +198,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index 4d205ce..91272da 100644
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index 18c4b7e..7e39c73 100644
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index 7dc2e01..de95771 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -125,12 +125,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -138,6 +140,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 38506dd..5128e99 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -183,12 +183,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -196,6 +198,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 305facd..791b76c 100644
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil
index 9fd593c..b6c9398 100644
--- a/piet-gpu/shader/gen/path_coarse.dxil
+++ b/piet-gpu/shader/gen/path_coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl
index 59cd7a6..93ee8f0 100644
--- a/piet-gpu/shader/gen/path_coarse.hlsl
+++ b/piet-gpu/shader/gen/path_coarse.hlsl
@@ -84,12 +84,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -97,6 +99,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u);
diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl
index f3cead8..26aa33a 100644
--- a/piet-gpu/shader/gen/path_coarse.msl
+++ b/piet-gpu/shader/gen/path_coarse.msl
@@ -144,12 +144,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -157,6 +159,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv
index 7c452bf..5e6beda 100644
--- a/piet-gpu/shader/gen/path_coarse.spv
+++ b/piet-gpu/shader/gen/path_coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 3895131..7ce4684 100644
--- a/piet-gpu/shader/gen/pathseg.dxil
+++ b/piet-gpu/shader/gen/pathseg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl
index 0501f6f..578417f 100644
--- a/piet-gpu/shader/gen/pathseg.hlsl
+++ b/piet-gpu/shader/gen/pathseg.hlsl
@@ -62,12 +62,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -75,6 +77,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
@@ -361,7 +365,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 4u;
-    uint tag_word = _574.Load(((_639.Load(84) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
+    uint tag_word = _574.Load(((_639.Load(92) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
     uint param = tag_word;
     TagMonoid local_tm = reduce_tag(param);
     sh_tag[gl_LocalInvocationID.x] = local_tm;
@@ -400,8 +404,8 @@
         TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u];
         tm = combine_tag_monoid(param_3, param_4);
     }
-    uint ps_ix = (_639.Load(88) >> uint(2)) + tm.pathseg_offset;
-    uint lw_ix = (_639.Load(80) >> uint(2)) + tm.linewidth_ix;
+    uint ps_ix = (_639.Load(96) >> uint(2)) + tm.pathseg_offset;
+    uint lw_ix = (_639.Load(88) >> uint(2)) + tm.linewidth_ix;
     uint save_path_ix = tm.path_ix;
     uint trans_ix = tm.trans_ix;
     TransformSegRef _771 = { _639.Load(36) + (trans_ix * 24u) };
diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl
index 0e97d68..9f6328e 100644
--- a/piet-gpu/shader/gen/pathseg.msl
+++ b/piet-gpu/shader/gen/pathseg.msl
@@ -127,12 +127,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -140,6 +142,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -635,7 +639,7 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint path_ix = save_path_ix;
-    uint bbox_out_ix = (_639.conf.bbox_alloc.offset >> uint(2)) + (path_ix * 6u);
+    uint bbox_out_ix = (_639.conf.path_bbox_alloc.offset >> uint(2)) + (path_ix * 6u);
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv
index a1f223c..4e2e9d5 100644
--- a/piet-gpu/shader/gen/pathseg.spv
+++ b/piet-gpu/shader/gen/pathseg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index 89fb562..ff544b8 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.dxil
+++ b/piet-gpu/shader/gen/pathtag_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl
index 754e6e9..5f7d125 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.hlsl
+++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl
@@ -24,12 +24,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -37,6 +39,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u);
@@ -88,7 +92,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 2u;
-    uint scene_ix = (_139.Load(84) >> uint(2)) + ix;
+    uint scene_ix = (_139.Load(92) >> uint(2)) + ix;
     uint tag_word = _151.Load(scene_ix * 4 + 0);
     uint param = tag_word;
     TagMonoid agg = reduce_tag(param);
diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl
index 83a8208..91e0cca 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.msl
+++ b/piet-gpu/shader/gen/pathtag_reduce.msl
@@ -31,12 +31,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -44,6 +46,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv
index feaad0a..f1d8679 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.spv
+++ b/piet-gpu/shader/gen/pathtag_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil
index 77f12e6..48584bd 100644
--- a/piet-gpu/shader/gen/pathtag_root.dxil
+++ b/piet-gpu/shader/gen/pathtag_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
index fdc60a1..d4d0b49 100644
--- a/piet-gpu/shader/gen/tile_alloc.dxil
+++ b/piet-gpu/shader/gen/tile_alloc.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl
index 1c9d04b..9e370ed 100644
--- a/piet-gpu/shader/gen/tile_alloc.hlsl
+++ b/piet-gpu/shader/gen/tile_alloc.hlsl
@@ -9,28 +9,6 @@
     bool failed;
 };
 
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -59,12 +37,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -72,12 +52,15 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _92 : register(u0, space0);
-ByteAddressBuffer _314 : register(t1, space0);
+RWByteAddressBuffer _70 : register(u0, space0);
+ByteAddressBuffer _181 : register(t1, space0);
+ByteAddressBuffer _257 : register(t2, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -90,62 +73,15 @@
 groupshared uint sh_tile_count[256];
 groupshared MallocResult sh_tile_alloc;
 
-bool touch_mem(Alloc alloc, uint offset)
+float4 load_draw_bbox(uint draw_ix)
 {
-    return true;
-}
-
-uint read_mem(Alloc alloc, uint offset)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return 0u;
-    }
-    uint v = _92.Load(offset * 4 + 8);
-    return v;
-}
-
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _246 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _246;
-}
-
-AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    AnnoEndClip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.blend = raw4;
-    return s;
-}
-
-AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoEndClipRef _252 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoEndClipRef param_1 = _252;
-    return AnnoEndClip_read(param, param_1);
+    uint base = (_181.Load(64) >> uint(2)) + (4u * draw_ix);
+    float x0 = asfloat(_70.Load(base * 4 + 8));
+    float y0 = asfloat(_70.Load((base + 1u) * 4 + 8));
+    float x1 = asfloat(_70.Load((base + 2u) * 4 + 8));
+    float y1 = asfloat(_70.Load((base + 3u) * 4 + 8));
+    float4 bbox = float4(x0, y0, x1, y1);
+    return bbox;
 }
 
 Alloc new_alloc(uint offset, uint size, bool mem_ok)
@@ -157,22 +93,22 @@
 
 MallocResult malloc(uint size)
 {
-    uint _98;
-    _92.InterlockedAdd(0, size, _98);
-    uint offset = _98;
-    uint _105;
-    _92.GetDimensions(_105);
-    _105 = (_105 - 8) / 4;
+    uint _76;
+    _70.InterlockedAdd(0, size, _76);
+    uint offset = _76;
+    uint _83;
+    _70.GetDimensions(_83);
+    _83 = (_83 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_105) * 4);
+    r.failed = (offset + size) > uint(int(_83) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _127;
-        _92.InterlockedMax(4, 1u, _127);
+        uint _105;
+        _70.InterlockedMax(4, 1u, _105);
         return r;
     }
     return r;
@@ -180,8 +116,13 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _169 = { a.offset + offset };
-    return _169;
+    Alloc _131 = { a.offset + offset };
+    return _131;
+}
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
 }
 
 void write_mem(Alloc alloc, uint offset, uint val)
@@ -192,7 +133,7 @@
     {
         return;
     }
-    _92.Store(offset * 4 + 8, val);
+    _70.Store(offset * 4 + 8, val);
 }
 
 void Path_write(Alloc a, PathRef ref, Path s)
@@ -216,53 +157,35 @@
 {
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef _321 = { _314.Load(16) + (element_ix * 12u) };
-    PathRef path_ref = _321;
-    AnnotatedRef _330 = { _314.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _330;
-    uint tag = 0u;
-    if (element_ix < _314.Load(0))
+    PathRef _241 = { _181.Load(16) + (element_ix * 12u) };
+    PathRef path_ref = _241;
+    uint drawtag_base = _181.Load(100) >> uint(2);
+    uint drawtag = 0u;
+    if (element_ix < _181.Load(0))
     {
-        Alloc _341;
-        _341.offset = _314.Load(32);
-        Alloc param;
-        param.offset = _341.offset;
-        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1).tag;
+        drawtag = _257.Load((drawtag_base + element_ix) * 4 + 0);
     }
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    switch (tag)
+    if ((drawtag != 0u) && (drawtag != 37u))
     {
-        case 1u:
-        case 2u:
-        case 3u:
-        case 4u:
-        case 5u:
-        {
-            Alloc _359;
-            _359.offset = _314.Load(32);
-            Alloc param_2;
-            param_2.offset = _359.offset;
-            AnnotatedRef param_3 = ref;
-            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3);
-            x0 = int(floor(clip.bbox.x * 0.0625f));
-            y0 = int(floor(clip.bbox.y * 0.0625f));
-            x1 = int(ceil(clip.bbox.z * 0.0625f));
-            y1 = int(ceil(clip.bbox.w * 0.0625f));
-            break;
-        }
+        uint param = element_ix;
+        float4 bbox = load_draw_bbox(param);
+        x0 = int(floor(bbox.x * 0.0625f));
+        y0 = int(floor(bbox.y * 0.0625f));
+        x1 = int(ceil(bbox.z * 0.0625f));
+        y1 = int(ceil(bbox.w * 0.0625f));
     }
-    x0 = clamp(x0, 0, int(_314.Load(8)));
-    y0 = clamp(y0, 0, int(_314.Load(12)));
-    x1 = clamp(x1, 0, int(_314.Load(8)));
-    y1 = clamp(y1, 0, int(_314.Load(12)));
+    x0 = clamp(x0, 0, int(_181.Load(8)));
+    y0 = clamp(y0, 0, int(_181.Load(12)));
+    x1 = clamp(x1, 0, int(_181.Load(8)));
+    y1 = clamp(y1, 0, int(_181.Load(12)));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
-    if (tag == 5u)
+    if (drawtag == 37u)
     {
         tile_count = 0u;
     }
@@ -280,59 +203,59 @@
     }
     if (th_ix == 255u)
     {
-        uint param_4 = total_tile_count * 8u;
-        MallocResult _485 = malloc(param_4);
-        sh_tile_alloc = _485;
+        uint param_1 = total_tile_count * 8u;
+        MallocResult _396 = malloc(param_1);
+        sh_tile_alloc = _396;
     }
     GroupMemoryBarrierWithGroupSync();
     MallocResult alloc_start = sh_tile_alloc;
-    bool _496;
+    bool _407;
     if (!alloc_start.failed)
     {
-        _496 = _92.Load(4) != 0u;
+        _407 = _70.Load(4) != 0u;
     }
     else
     {
-        _496 = alloc_start.failed;
+        _407 = alloc_start.failed;
     }
-    if (_496)
+    if (_407)
     {
         return;
     }
-    if (element_ix < _314.Load(0))
+    if (element_ix < _181.Load(0))
     {
-        uint _509;
+        uint _420;
         if (th_ix > 0u)
         {
-            _509 = sh_tile_count[th_ix - 1u];
+            _420 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _509 = 0u;
+            _420 = 0u;
         }
-        uint tile_subix = _509;
-        Alloc param_5 = alloc_start.alloc;
-        uint param_6 = 8u * tile_subix;
-        uint param_7 = 8u * tile_count;
-        Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
-        TileRef _531 = { tiles_alloc.offset };
-        path.tiles = _531;
-        Alloc _536;
-        _536.offset = _314.Load(16);
-        Alloc param_8;
-        param_8.offset = _536.offset;
-        PathRef param_9 = path_ref;
-        Path param_10 = path;
-        Path_write(param_8, param_9, param_10);
+        uint tile_subix = _420;
+        Alloc param_2 = alloc_start.alloc;
+        uint param_3 = 8u * tile_subix;
+        uint param_4 = 8u * tile_count;
+        Alloc tiles_alloc = slice_mem(param_2, param_3, param_4);
+        TileRef _442 = { tiles_alloc.offset };
+        path.tiles = _442;
+        Alloc _448;
+        _448.offset = _181.Load(16);
+        Alloc param_5;
+        param_5.offset = _448.offset;
+        PathRef param_6 = path_ref;
+        Path param_7 = path;
+        Path_write(param_5, param_6, param_7);
     }
     uint total_count = sh_tile_count[255] * 2u;
     uint start_ix = alloc_start.alloc.offset >> uint(2);
     for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
     {
-        Alloc param_11 = alloc_start.alloc;
-        uint param_12 = start_ix + i_1;
-        uint param_13 = 0u;
-        write_mem(param_11, param_12, param_13);
+        Alloc param_8 = alloc_start.alloc;
+        uint param_9 = start_ix + i_1;
+        uint param_10 = 0u;
+        write_mem(param_8, param_9, param_10);
     }
 }
 
diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl
index c03e830..e6f486b 100644
--- a/piet-gpu/shader/gen/tile_alloc.msl
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@@ -18,28 +18,6 @@
     bool failed;
 };
 
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -80,12 +58,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -93,6 +73,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -100,67 +82,23 @@
     Config conf;
 };
 
+struct SceneBuf
+{
+    uint scene[1];
+};
+
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_70, constant uint& v_70BufferSize, const device ConfigBuf& v_181)
 {
-    return true;
-}
-
-static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_92, constant uint& v_92BufferSize)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return 0u;
-    }
-    uint v = v_92.memory[offset];
-    return v;
-}
-
-static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_92, v_92BufferSize);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_92, v_92BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_92, v_92BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_92, v_92BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_92, v_92BufferSize);
-    AnnoEndClip s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.blend = raw4;
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
-{
-    Alloc param = a;
-    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
-    return AnnoEndClip_read(param, param_1, v_92, v_92BufferSize);
+    uint base = (v_181.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
+    float x0 = as_type<float>(v_70.memory[base]);
+    float y0 = as_type<float>(v_70.memory[base + 1u]);
+    float x1 = as_type<float>(v_70.memory[base + 2u]);
+    float y1 = as_type<float>(v_70.memory[base + 3u]);
+    float4 bbox = float4(x0, y0, x1, y1);
+    return bbox;
 }
 
 static inline __attribute__((always_inline))
@@ -172,19 +110,19 @@
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_92, constant uint& v_92BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_70, constant uint& v_70BufferSize)
 {
-    uint _98 = atomic_fetch_add_explicit((device atomic_uint*)&v_92.mem_offset, size, memory_order_relaxed);
-    uint offset = _98;
+    uint _76 = atomic_fetch_add_explicit((device atomic_uint*)&v_70.mem_offset, size, memory_order_relaxed);
+    uint offset = _76;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_92BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_70BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _127 = atomic_fetch_max_explicit((device atomic_uint*)&v_92.mem_error, 1u, memory_order_relaxed);
+        uint _105 = atomic_fetch_max_explicit((device atomic_uint*)&v_70.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
@@ -197,7 +135,13 @@
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_92, constant uint& v_92BufferSize)
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_70, constant uint& v_70BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -205,75 +149,62 @@
     {
         return;
     }
-    v_92.memory[offset] = val;
+    v_70.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_92, constant uint& v_92BufferSize)
+void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_70, constant uint& v_70BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.bbox.x | (s.bbox.y << uint(16));
-    write_mem(param, param_1, param_2, v_92, v_92BufferSize);
+    write_mem(param, param_1, param_2, v_70, v_70BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = s.bbox.z | (s.bbox.w << uint(16));
-    write_mem(param_3, param_4, param_5, v_92, v_92BufferSize);
+    write_mem(param_3, param_4, param_5, v_70, v_70BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = s.tiles.offset;
-    write_mem(param_6, param_7, param_8, v_92, v_92BufferSize);
+    write_mem(param_6, param_7, param_8, v_70, v_70BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _314 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_70 [[buffer(0)]], const device ConfigBuf& v_181 [[buffer(1)]], const device SceneBuf& _257 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
     threadgroup uint sh_tile_count[256];
     threadgroup MallocResult sh_tile_alloc;
-    constant uint& v_92BufferSize = spvBufferSizeConstants[0];
+    constant uint& v_70BufferSize = spvBufferSizeConstants[0];
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef{ _314.conf.tile_alloc.offset + (element_ix * 12u) };
-    AnnotatedRef ref = AnnotatedRef{ _314.conf.anno_alloc.offset + (element_ix * 40u) };
-    uint tag = 0u;
-    if (element_ix < _314.conf.n_elements)
+    PathRef path_ref = PathRef{ v_181.conf.tile_alloc.offset + (element_ix * 12u) };
+    uint drawtag_base = v_181.conf.drawtag_offset >> uint(2);
+    uint drawtag = 0u;
+    if (element_ix < v_181.conf.n_elements)
     {
-        Alloc param;
-        param.offset = _314.conf.anno_alloc.offset;
-        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag;
+        drawtag = _257.scene[drawtag_base + element_ix];
     }
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    switch (tag)
+    if ((drawtag != 0u) && (drawtag != 37u))
     {
-        case 1u:
-        case 2u:
-        case 3u:
-        case 4u:
-        case 5u:
-        {
-            Alloc param_2;
-            param_2.offset = _314.conf.anno_alloc.offset;
-            AnnotatedRef param_3 = ref;
-            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize);
-            x0 = int(floor(clip.bbox.x * 0.0625));
-            y0 = int(floor(clip.bbox.y * 0.0625));
-            x1 = int(ceil(clip.bbox.z * 0.0625));
-            y1 = int(ceil(clip.bbox.w * 0.0625));
-            break;
-        }
+        uint param = element_ix;
+        float4 bbox = load_draw_bbox(param, v_70, v_70BufferSize, v_181);
+        x0 = int(floor(bbox.x * 0.0625));
+        y0 = int(floor(bbox.y * 0.0625));
+        x1 = int(ceil(bbox.z * 0.0625));
+        y1 = int(ceil(bbox.w * 0.0625));
     }
-    x0 = clamp(x0, 0, int(_314.conf.width_in_tiles));
-    y0 = clamp(y0, 0, int(_314.conf.height_in_tiles));
-    x1 = clamp(x1, 0, int(_314.conf.width_in_tiles));
-    y1 = clamp(y1, 0, int(_314.conf.height_in_tiles));
+    x0 = clamp(x0, 0, int(v_181.conf.width_in_tiles));
+    y0 = clamp(y0, 0, int(v_181.conf.height_in_tiles));
+    x1 = clamp(x1, 0, int(v_181.conf.width_in_tiles));
+    y1 = clamp(y1, 0, int(v_181.conf.height_in_tiles));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
-    if (tag == 5u)
+    if (drawtag == 37u)
     {
         tile_count = 0u;
     }
@@ -291,56 +222,56 @@
     }
     if (th_ix == 255u)
     {
-        uint param_4 = total_tile_count * 8u;
-        MallocResult _485 = malloc(param_4, v_92, v_92BufferSize);
-        sh_tile_alloc = _485;
+        uint param_1 = total_tile_count * 8u;
+        MallocResult _396 = malloc(param_1, v_70, v_70BufferSize);
+        sh_tile_alloc = _396;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     MallocResult alloc_start = sh_tile_alloc;
-    bool _496;
+    bool _407;
     if (!alloc_start.failed)
     {
-        _496 = v_92.mem_error != 0u;
+        _407 = v_70.mem_error != 0u;
     }
     else
     {
-        _496 = alloc_start.failed;
+        _407 = alloc_start.failed;
     }
-    if (_496)
+    if (_407)
     {
         return;
     }
-    if (element_ix < _314.conf.n_elements)
+    if (element_ix < v_181.conf.n_elements)
     {
-        uint _509;
+        uint _420;
         if (th_ix > 0u)
         {
-            _509 = sh_tile_count[th_ix - 1u];
+            _420 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _509 = 0u;
+            _420 = 0u;
         }
-        uint tile_subix = _509;
-        Alloc param_5 = alloc_start.alloc;
-        uint param_6 = 8u * tile_subix;
-        uint param_7 = 8u * tile_count;
-        Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
+        uint tile_subix = _420;
+        Alloc param_2 = alloc_start.alloc;
+        uint param_3 = 8u * tile_subix;
+        uint param_4 = 8u * tile_count;
+        Alloc tiles_alloc = slice_mem(param_2, param_3, param_4);
         path.tiles = TileRef{ tiles_alloc.offset };
-        Alloc param_8;
-        param_8.offset = _314.conf.tile_alloc.offset;
-        PathRef param_9 = path_ref;
-        Path param_10 = path;
-        Path_write(param_8, param_9, param_10, v_92, v_92BufferSize);
+        Alloc param_5;
+        param_5.offset = v_181.conf.tile_alloc.offset;
+        PathRef param_6 = path_ref;
+        Path param_7 = path;
+        Path_write(param_5, param_6, param_7, v_70, v_70BufferSize);
     }
     uint total_count = sh_tile_count[255] * 2u;
     uint start_ix = alloc_start.alloc.offset >> uint(2);
     for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
     {
-        Alloc param_11 = alloc_start.alloc;
-        uint param_12 = start_ix + i_1;
-        uint param_13 = 0u;
-        write_mem(param_11, param_12, param_13, v_92, v_92BufferSize);
+        Alloc param_8 = alloc_start.alloc;
+        uint param_9 = start_ix + i_1;
+        uint param_10 = 0u;
+        write_mem(param_8, param_9, param_10, v_70, v_70BufferSize);
     }
 }
 
diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv
index cf2f01c..3ae1073 100644
--- a/piet-gpu/shader/gen/tile_alloc.spv
+++ b/piet-gpu/shader/gen/tile_alloc.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index 8a3200b..0c1e376 100644
--- a/piet-gpu/shader/gen/transform_leaf.dxil
+++ b/piet-gpu/shader/gen/transform_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl
index 3528d6c..8a3b3d5 100644
--- a/piet-gpu/shader/gen/transform_leaf.hlsl
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@@ -37,12 +37,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -50,6 +52,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
@@ -155,7 +159,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _285 = { _278.Load(76) + (ix * 24u) };
+    TransformRef _285 = { _278.Load(84) + (ix * 24u) };
     TransformRef ref = _285;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl
index 6a99fae..fe45438 100644
--- a/piet-gpu/shader/gen/transform_leaf.msl
+++ b/piet-gpu/shader/gen/transform_leaf.msl
@@ -100,12 +100,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -113,6 +115,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv
index b9a0a83..b739099 100644
--- a/piet-gpu/shader/gen/transform_leaf.spv
+++ b/piet-gpu/shader/gen/transform_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index 6c94a3a..fc3a311 100644
--- a/piet-gpu/shader/gen/transform_reduce.dxil
+++ b/piet-gpu/shader/gen/transform_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl
index cce1a22..bd14f79 100644
--- a/piet-gpu/shader/gen/transform_reduce.hlsl
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@@ -26,12 +26,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -39,6 +41,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
@@ -92,7 +96,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _168 = { _161.Load(76) + (ix * 24u) };
+    TransformRef _168 = { _161.Load(84) + (ix * 24u) };
     TransformRef ref = _168;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl
index 3695563..62da531 100644
--- a/piet-gpu/shader/gen/transform_reduce.msl
+++ b/piet-gpu/shader/gen/transform_reduce.msl
@@ -38,12 +38,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -51,6 +53,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv
index e74cb8d..6aa6b94 100644
--- a/piet-gpu/shader/gen/transform_reduce.spv
+++ b/piet-gpu/shader/gen/transform_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil
index 5b4f059..a33ff7f 100644
--- a/piet-gpu/shader/gen/transform_root.dxil
+++ b/piet-gpu/shader/gen/transform_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/pathseg.comp b/piet-gpu/shader/pathseg.comp
index a2ea86e..ce4ab84 100644
--- a/piet-gpu/shader/pathseg.comp
+++ b/piet-gpu/shader/pathseg.comp
@@ -248,7 +248,7 @@
 
     barrier();
     uint path_ix = save_path_ix;
-    uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 6;
+    uint bbox_out_ix = (conf.path_bbox_alloc.offset >> 2) + path_ix * 6;
     // Write bboxes to paths; do atomic min/max if partial
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0) {
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 0dccecb..ec17188 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -42,7 +42,7 @@
     // new element pipeline stuff follows
 
     // Bounding boxes of paths, stored as int (so atomics work)
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     // Monoid for draw objects
     Alloc drawmonoid_alloc;
 
@@ -54,6 +54,10 @@
     Alloc clip_stack_alloc;
     // Clip processing results (path_ix + bbox)
     Alloc clip_bbox_alloc;
+    // Bounding box per draw object
+    Alloc draw_bbox_alloc;
+    // Info computed in draw stage, per draw object
+    Alloc drawinfo_alloc;
 
     // Number of transforms in scene
     // This is probably not needed.
@@ -63,6 +67,10 @@
     uint n_path;
     // Total number of BeginClip and EndClip draw objects.
     uint n_clip;
+
+    // Note: one of these offsets *could* be hardcoded to zero (as was the
+    // original element stream), but for now retain flexibility.
+
     // Offset (in bytes) of transform stream in scene buffer
     uint trans_offset;
     // Offset (in bytes) of linewidth stream in scene
@@ -71,6 +79,10 @@
     uint pathtag_offset;
     // Offset (in bytes) of path segment stream in scene
     uint pathseg_offset;
+    // Offset (in bytes) of draw object tag stream in scene; see drawtag.h
+    uint drawtag_offset;
+    // Offset (in bytes) of draw payload stream in scene
+    uint drawdata_offset;
 };
 #endif
 
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
index 024f499..0fec2ce 100644
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -17,7 +17,11 @@
     Config conf;
 };
 
-#include "annotated.h"
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "drawtag.h"
 #include "tile.h"
 
 // scale factors useful for converting coordinates to tiles
@@ -27,31 +31,39 @@
 shared uint sh_tile_count[TILE_ALLOC_WG];
 shared MallocResult sh_tile_alloc;
 
+vec4 load_draw_bbox(uint draw_ix) {
+    uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
+    float x0 = uintBitsToFloat(memory[base]);
+    float y0 = uintBitsToFloat(memory[base + 1]);
+    float x1 = uintBitsToFloat(memory[base + 2]);
+    float y1 = uintBitsToFloat(memory[base + 3]);
+    vec4 bbox = vec4(x0, y0, x1, y1);
+    return bbox;
+}
+
 void main() {
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
+    // At the moment, element_ix == path_ix. The clip-intersected bounding boxes
+    // for elements (draw objects) are computed in the binning stage, but at some
+    // point we'll probably want to break that correspondence. Tiles should be
+    // allocated for paths, not draw objs. EndClip doesn't need an allocation.
     PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+    uint drawtag_base = conf.drawtag_offset >> 2;
 
-    uint tag = Annotated_Nop;
+    uint drawtag = Drawtag_Nop;
     if (element_ix < conf.n_elements) {
-        tag = Annotated_tag(conf.anno_alloc, ref).tag;
+        drawtag = scene[drawtag_base + element_ix];
     }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    switch (tag) {
-    case Annotated_Color:
-    case Annotated_LinGradient:
-    case Annotated_Image:
-    case Annotated_BeginClip:
-    case Annotated_EndClip:
-        // Note: we take advantage of the fact that fills, strokes, and
-        // clips have compatible layout.
-        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
-        x0 = int(floor(clip.bbox.x * SX));
-        y0 = int(floor(clip.bbox.y * SY));
-        x1 = int(ceil(clip.bbox.z * SX));
-        y1 = int(ceil(clip.bbox.w * SY));
-        break;
+    // Allocate an empty path for EndClip; at some point we'll change
+    // this to be per path rather than per draw object.
+    if (drawtag != Drawtag_Nop && drawtag != Drawtag_EndClip) {
+        vec4 bbox = load_draw_bbox(element_ix);
+        x0 = int(floor(bbox.x * SX));
+        y0 = int(floor(bbox.y * SY));
+        x1 = int(ceil(bbox.z * SX));
+        y1 = int(ceil(bbox.w * SY));
     }
     x0 = clamp(x0, 0, int(conf.width_in_tiles));
     y0 = clamp(y0, 0, int(conf.height_in_tiles));
@@ -61,11 +73,6 @@
     Path path;
     path.bbox = uvec4(x0, y0, x1, y1);
     uint tile_count = (x1 - x0) * (y1 - y0);
-    if (tag == Annotated_EndClip) {
-        // Don't actually allocate tiles for an end clip, but we do want
-        // the path structure (especially bbox) allocated for it.
-        tile_count = 0;
-    }
 
     sh_tile_count[th_ix] = tile_count;
     uint total_tile_count = tile_count;
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index c24615e..c25c260 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -30,7 +30,8 @@
     tag_stream: Vec<u8>,
     pathseg_stream: Vec<u8>,
     linewidth_stream: Vec<f32>,
-    drawobj_stream: Vec<u8>,
+    drawtag_stream: Vec<u32>,
+    drawdata_stream: Vec<u8>,
     n_path: u32,
     n_pathseg: u32,
     n_clip: u32,
@@ -43,53 +44,54 @@
 pub struct GlyphEncoder {
     tag_stream: Vec<u8>,
     pathseg_stream: Vec<u8>,
-    drawobj_stream: Vec<u8>,
+    drawtag_stream: Vec<u32>,
+    drawdata_stream: Vec<u8>,
     n_path: u32,
     n_pathseg: u32,
 }
 
-// Currently same as Element, but may change - should become packed.
-const DRAWOBJ_SIZE: usize = 36;
 const TRANSFORM_SIZE: usize = 24;
 const LINEWIDTH_SIZE: usize = 4;
 const PATHSEG_SIZE: usize = 52;
-const BBOX_SIZE: usize = 24;
-const DRAWMONOID_SIZE: usize = 8;
+const PATH_BBOX_SIZE: usize = 24;
+const DRAWMONOID_SIZE: usize = 16;
+const DRAW_BBOX_SIZE: usize = 16;
+const DRAWTAG_SIZE: usize = 4;
 const ANNOTATED_SIZE: usize = 40;
 
-// These are bytemuck versions of elements currently defined in the
-// Element struct in piet-gpu-types; that's pretty much going away.
-
-const ELEMENT_FILLCOLOR: u32 = 4;
-const ELEMENT_FILLLINGRADIENT: u32 = 5;
-const ELEMENT_BEGINCLIP: u32 = 9;
-const ELEMENT_ENDCLIP: u32 = 10;
+// Tags for draw objects. See shader/drawtag.h for the authoritative source.
+const DRAWTAG_FILLCOLOR: u32 = 0x44;
+const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
+const DRAWTAG_BEGINCLIP: u32 = 0x05;
+const DRAWTAG_ENDCLIP: u32 = 0x25;
 
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 pub struct FillColor {
-    tag: u32,
     rgba_color: u32,
-    padding: [u32; 7],
 }
 
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 pub struct FillLinGradient {
-    tag: u32,
     index: u32,
     p0: [f32; 2],
     p1: [f32; 2],
-    padding: [u32; 3],
+}
+
+#[allow(unused)]
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillImage {
+    index: u32,
+    // [i16; 2]
+    offset: u32,
 }
 
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 pub struct Clip {
-    tag: u32,
-    bbox: [f32; 4],
     blend: u32,
-    padding: [u32; 3],
 }
 
 impl Encoder {
@@ -99,7 +101,8 @@
             tag_stream: Vec::new(),
             pathseg_stream: Vec::new(),
             linewidth_stream: vec![-1.0],
-            drawobj_stream: Vec::new(),
+            drawtag_stream: Vec::new(),
+            drawdata_stream: Vec::new(),
             n_path: 0,
             n_pathseg: 0,
             n_clip: 0,
@@ -130,51 +133,42 @@
     ///
     /// This should be encoded after a path.
     pub fn fill_color(&mut self, rgba_color: u32) {
+        self.drawtag_stream.push(DRAWTAG_FILLCOLOR);
         let element = FillColor {
-            tag: ELEMENT_FILLCOLOR,
             rgba_color,
-            ..Default::default()
         };
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
     /// Encode a fill linear gradient draw object.
     ///
     /// This should be encoded after a path.
     pub fn fill_lin_gradient(&mut self, index: u32, p0: [f32; 2], p1: [f32; 2]) {
+        self.drawtag_stream.push(DRAWTAG_FILLLINGRADIENT);
         let element = FillLinGradient {
-            tag: ELEMENT_FILLLINGRADIENT,
             index,
             p0,
             p1,
-            ..Default::default()
         };
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
-    /// Start a clip and return a save point to be filled in later.
-    pub fn begin_clip(&mut self, blend: Option<Blend>) -> usize {
-        let saved = self.drawobj_stream.len();
+    /// Start a clip.
+    pub fn begin_clip(&mut self, blend: Option<Blend>) {
+        self.drawtag_stream.push(DRAWTAG_BEGINCLIP);
         let element = Clip {
-            tag: ELEMENT_BEGINCLIP,
             blend: blend.unwrap_or(Blend::default()).pack(),
-            ..Default::default()
         };
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
         self.n_clip += 1;
-        saved
     }
 
-    pub fn end_clip(&mut self, bbox: [f32; 4], blend: Option<Blend>, save_point: usize) {
+    pub fn end_clip(&mut self, blend: Option<Blend>) {
+        self.drawtag_stream.push(DRAWTAG_ENDCLIP);
         let element = Clip {
-            tag: ELEMENT_ENDCLIP,
-            bbox,
             blend: blend.unwrap_or(Blend::default()).pack(),
-            ..Default::default()
         };
-        self.drawobj_stream[save_point + 4..save_point + 20]
-            .clone_from_slice(bytemuck::bytes_of(&bbox));
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
         // This is a dummy path, and will go away with the new clip impl.
         self.tag_stream.push(0x10);
         self.n_path += 1;
@@ -187,9 +181,11 @@
     /// beginning of free memory.
     pub fn stage_config(&self) -> (Config, usize) {
         // Layout of scene buffer
+        let drawtag_offset = 0;
         let n_drawobj = self.n_drawobj();
         let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
-        let trans_offset = n_drawobj_padded * DRAWOBJ_SIZE;
+        let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
+        let trans_offset = drawdata_offset + self.drawdata_stream.len();
         let n_trans = self.transform_stream.len();
         let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
         let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
@@ -205,9 +201,9 @@
         alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
         let pathseg_alloc = alloc;
         alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
-        let bbox_alloc = alloc;
+        let path_bbox_alloc = alloc;
         let n_path = self.n_path as usize;
-        alloc += bbox_alloc + n_path * BBOX_SIZE;
+        alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
         let drawmonoid_alloc = alloc;
         alloc += n_drawobj_padded * DRAWMONOID_SIZE;
         let anno_alloc = alloc;
@@ -226,6 +222,12 @@
         let clip_bbox_alloc = alloc;
         const CLIP_BBOX_SIZE: usize = 16;
         alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
+        let draw_bbox_alloc = alloc;
+        alloc += n_drawobj * DRAW_BBOX_SIZE;
+        let drawinfo_alloc = alloc;
+        // TODO: not optimized; it can be accumulated during encoding or summed from drawtags
+        const MAX_DRAWINFO_SIZE: usize = 16;
+        alloc += n_drawobj * MAX_DRAWINFO_SIZE;
 
         let config = Config {
             n_elements: n_drawobj as u32,
@@ -233,12 +235,14 @@
             pathseg_alloc: pathseg_alloc as u32,
             anno_alloc: anno_alloc as u32,
             trans_alloc: trans_alloc as u32,
-            bbox_alloc: bbox_alloc as u32,
+            path_bbox_alloc: path_bbox_alloc as u32,
             drawmonoid_alloc: drawmonoid_alloc as u32,
             clip_alloc: clip_alloc as u32,
             clip_bic_alloc: clip_bic_alloc as u32,
             clip_stack_alloc: clip_stack_alloc as u32,
             clip_bbox_alloc: clip_bbox_alloc as u32,
+            draw_bbox_alloc: draw_bbox_alloc as u32,
+            drawinfo_alloc: drawinfo_alloc as u32,
             n_trans: n_trans as u32,
             n_path: self.n_path,
             n_clip: self.n_clip,
@@ -246,15 +250,18 @@
             linewidth_offset: linewidth_offset as u32,
             pathtag_offset: pathtag_offset as u32,
             pathseg_offset: pathseg_offset as u32,
+            drawtag_offset: drawtag_offset as u32,
+            drawdata_offset: drawdata_offset as u32,
             ..Default::default()
         };
         (config, alloc)
     }
 
     pub fn write_scene(&self, buf: &mut BufWrite) {
-        buf.extend_slice(&self.drawobj_stream);
-        let n_drawobj = self.drawobj_stream.len() / DRAWOBJ_SIZE;
-        buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWOBJ_SIZE);
+        buf.extend_slice(&self.drawtag_stream);
+        let n_drawobj = self.drawtag_stream.len();
+        buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE);
+        buf.extend_slice(&self.drawdata_stream);
         buf.extend_slice(&self.transform_stream);
         let n_trans = self.transform_stream.len();
         buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE);
@@ -265,9 +272,9 @@
         buf.extend_slice(&self.pathseg_stream);
     }
 
-    /// The number of elements in the draw object stream.
+    /// The number of draw objects in the draw object stream.
     pub(crate) fn n_drawobj(&self) -> usize {
-        self.drawobj_stream.len() / DRAWOBJ_SIZE
+        self.drawtag_stream.len()
     }
 
     /// The number of paths.
@@ -296,7 +303,8 @@
     pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
         self.tag_stream.extend(&glyph.tag_stream);
         self.pathseg_stream.extend(&glyph.pathseg_stream);
-        self.drawobj_stream.extend(&glyph.drawobj_stream);
+        self.drawtag_stream.extend(&glyph.drawtag_stream);
+        self.drawdata_stream.extend(&glyph.drawdata_stream);
         self.n_path += glyph.n_path;
         self.n_pathseg += glyph.n_pathseg;
     }
@@ -325,15 +333,14 @@
     ///
     /// This should be encoded after a path.
     pub(crate) fn fill_color(&mut self, rgba_color: u32) {
+        self.drawtag_stream.push(DRAWTAG_FILLCOLOR);
         let element = FillColor {
-            tag: ELEMENT_FILLCOLOR,
             rgba_color,
-            ..Default::default()
         };
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
     pub(crate) fn is_color(&self) -> bool {
-        !self.drawobj_stream.is_empty()
+        !self.drawtag_stream.is_empty()
     }
 }
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index bd26d45..5c5c78a 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -92,7 +92,7 @@
     clip_binding: ClipBinding,
 
     tile_pipeline: Pipeline,
-    tile_ds: DescriptorSet,
+    tile_ds: Vec<DescriptorSet>,
 
     path_pipeline: Pipeline,
     path_ds: DescriptorSet,
@@ -105,7 +105,7 @@
     bin_ds: DescriptorSet,
 
     coarse_pipeline: Pipeline,
-    coarse_ds: DescriptorSet,
+    coarse_ds: Vec<DescriptorSet>,
 
     k4_pipeline: Pipeline,
     k4_ds: DescriptorSet,
@@ -176,10 +176,8 @@
         };
         let image_dev = session.create_image2d(width as u32, height as u32, image_format)?;
 
-        // Note: this must be updated when the config struct size changes.
         const CONFIG_BUFFER_SIZE: u64 = std::mem::size_of::<Config>() as u64;
         let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap();
-        // TODO: separate staging buffer (if needed)
         let config_bufs = (0..n_bufs)
             .map(|_| {
                 session
@@ -197,7 +195,7 @@
         let element_stage = ElementStage::new(session, &element_code);
         let element_bindings = scene_bufs
             .iter()
-            .map(|scene_buf| {
+            .map(|scene_buf|
                 element_stage.bind(
                     session,
                     &element_code,
@@ -205,7 +203,7 @@
                     scene_buf,
                     &memory_buf_dev,
                 )
-            })
+            )
             .collect();
 
         let clip_code = ClipCode::new(session);
@@ -214,8 +212,15 @@
         let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc");
         let tile_pipeline = session
             .create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::BufReadOnly])?;
-        let tile_ds = session
-            .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?;
+        let tile_ds = scene_bufs
+            .iter()
+            .map(|scene_buf| {
+                session.create_simple_descriptor_set(
+                    &tile_pipeline,
+                    &[&memory_buf_dev, &config_buf, scene_buf],
+                )
+            })
+            .collect::<Result<Vec<_>, _>>()?;
 
         let path_alloc_code = include_shader!(session, "../shader/gen/path_coarse");
         let path_pipeline = session
@@ -243,10 +248,23 @@
             session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         let coarse_code = include_shader!(session, "../shader/gen/coarse");
-        let coarse_pipeline = session
-            .create_compute_pipeline(coarse_code, &[BindType::Buffer, BindType::BufReadOnly])?;
-        let coarse_ds = session
-            .create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf])?;
+        let coarse_pipeline = session.create_compute_pipeline(
+            coarse_code,
+            &[
+                BindType::Buffer,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+            ],
+        )?;
+        let coarse_ds = scene_bufs
+            .iter()
+            .map(|scene_buf| {
+                session.create_simple_descriptor_set(
+                    &coarse_pipeline,
+                    &[&memory_buf_dev, &config_buf, scene_buf],
+                )
+            })
+            .collect::<Result<Vec<_>, _>>()?;
 
         let bg_image = Self::make_test_bg_image(&session);
 
@@ -430,7 +448,7 @@
         cmd_buf.begin_debug_label("Tile allocation");
         cmd_buf.dispatch(
             &self.tile_pipeline,
-            &self.tile_ds,
+            &self.tile_ds[buf_ix],
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
@@ -462,7 +480,7 @@
         cmd_buf.begin_debug_label("Coarse raster");
         cmd_buf.dispatch(
             &self.coarse_pipeline,
-            &self.coarse_ds,
+            &self.coarse_ds[buf_ix],
             (
                 (self.width as u32 + 255) / 256,
                 (self.height as u32 + 255) / 256,
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 1fe1ce9..e9a24fa 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -64,9 +64,6 @@
 }
 
 struct ClipElement {
-    /// Byte offset of BeginClip element in element vec, for bbox fixup.
-    save_point: usize,
-    bbox: Option<Rect>,
     blend: Option<Blend>,
 }
 
@@ -199,8 +196,6 @@
     fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
         self.encode_linewidth(width.abs() as f32);
         let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
-        // Note: the bbox contribution of stroke becomes more complicated with miter joins.
-        self.accumulate_bbox(|| shape.bounding_box() + Insets::uniform(width * 0.5));
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, false);
         self.encode_brush(&brush);
@@ -217,9 +212,6 @@
 
     fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
         let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
-        // Note: we might get a good speedup from using an approximate bounding box.
-        // Perhaps that should be added to kurbo.
-        self.accumulate_bbox(|| shape.bounding_box());
         let path = shape.path_elements(TOLERANCE);
         self.encode_linewidth(-1.0);
         self.encode_path(path, true);
@@ -232,13 +224,11 @@
         self.encode_linewidth(-1.0);
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
-        let save_point = self.new_encoder.begin_clip(None);
+        self.new_encoder.begin_clip(None);
         if self.clip_stack.len() >= MAX_BLEND_STACK {
             panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
         }
         self.clip_stack.push(ClipElement {
-            bbox: None,
-            save_point,
             blend: None,
         });
         if let Some(tos) = self.state_stack.last_mut() {
@@ -340,16 +330,13 @@
         self.encode_linewidth(-1.0);
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
-        let save_point = self.new_encoder.begin_clip(Some(blend));
+        self.new_encoder.begin_clip(Some(blend));
         if self.clip_stack.len() >= MAX_BLEND_STACK {
             panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
         }
         self.clip_stack.push(ClipElement {
-            bbox: None,
-            save_point,
             blend: Some(blend),
         });
-        self.accumulate_bbox(|| shape.bounding_box());
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
         }
@@ -406,37 +393,7 @@
 
     fn pop_clip(&mut self) {
         let tos = self.clip_stack.pop().unwrap();
-        let bbox = tos.bbox.unwrap_or_default();
-        let bbox_f32_4 = rect_to_f32_4(bbox);
-        self.new_encoder.end_clip(bbox_f32_4, tos.blend, tos.save_point);
-        if let Some(bbox) = tos.bbox {
-            self.union_bbox(bbox);
-        }
-    }
-
-    /// Accumulate a bbox.
-    ///
-    /// The bbox is given lazily as a closure, relative to the current transform.
-    /// It's lazy because we don't need to compute it unless we're inside a clip.
-    fn accumulate_bbox(&mut self, f: impl FnOnce() -> Rect) {
-        if !self.clip_stack.is_empty() {
-            let bbox = f();
-            let bbox = self.cur_transform.transform_rect_bbox(bbox);
-            self.union_bbox(bbox);
-        }
-    }
-
-    /// Accumulate an absolute bbox.
-    ///
-    /// The bbox is given already transformed into surface coordinates.
-    fn union_bbox(&mut self, bbox: Rect) {
-        if let Some(tos) = self.clip_stack.last_mut() {
-            tos.bbox = if let Some(old_bbox) = tos.bbox {
-                Some(old_bbox.union(bbox))
-            } else {
-                Some(bbox)
-            };
-        }
+        self.new_encoder.end_clip(tos.blend);
     }
 
     pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs
index e155c50..52b8bf1 100644
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@@ -47,12 +47,14 @@
     pub pathseg_alloc: u32,
     pub anno_alloc: u32,
     pub trans_alloc: u32,
-    pub bbox_alloc: u32,
+    pub path_bbox_alloc: u32,
     pub drawmonoid_alloc: u32,
     pub clip_alloc: u32,
     pub clip_bic_alloc: u32,
     pub clip_stack_alloc: u32,
     pub clip_bbox_alloc: u32,
+    pub draw_bbox_alloc: u32,
+    pub drawinfo_alloc: u32,
     pub n_trans: u32,
     pub n_path: u32,
     pub n_clip: u32,
@@ -60,6 +62,8 @@
     pub linewidth_offset: u32,
     pub pathtag_offset: u32,
     pub pathseg_offset: u32,
+    pub drawtag_offset: u32,
+    pub drawdata_offset: u32,
 }
 
 // The "element" stage combines a number of stages for parts of the pipeline.
diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs
index 5328a84..21312a4 100644
--- a/piet-gpu/src/stages/draw.rs
+++ b/piet-gpu/src/stages/draw.rs
@@ -28,6 +28,8 @@
 pub struct DrawMonoid {
     pub path_ix: u32,
     pub clip_ix: u32,
+    pub scene_offset: u32,
+    pub info_offset: u32,
 }
 
 const DRAW_WG: u64 = 256;
@@ -93,7 +95,7 @@
     pub unsafe fn new(session: &Session, code: &DrawCode) -> DrawStage {
         // We're limited to DRAW_PART_SIZE^2
         // Also note: size here allows padding
-        let root_buf_size = DRAW_PART_SIZE * 8;
+        let root_buf_size = DRAW_PART_SIZE * 16;
         let root_buf = session
             .create_buffer(root_buf_size, BufferUsage::STORAGE)
             .unwrap();
diff --git a/tests/src/clip.rs b/tests/src/clip.rs
index cfd8a35..4a38949 100644
--- a/tests/src/clip.rs
+++ b/tests/src/clip.rs
@@ -163,8 +163,7 @@
         let clip_bbox_alloc = clip_stack_alloc + 20 * n_clip;
         stages::Config {
             clip_alloc: clip_alloc as u32,
-            // TODO: this wants to be renamed to path_bbox_alloc
-            bbox_alloc: path_bbox_alloc as u32,
+            path_bbox_alloc: path_bbox_alloc as u32,
             drawmonoid_alloc: drawmonoid_alloc as u32,
             clip_bic_alloc: clip_bic_alloc as u32,
             clip_stack_alloc: clip_stack_alloc as u32,
@@ -194,7 +193,7 @@
         let clip_range = clip_bbox_start..(clip_bbox_start + n_clip * 16);
         let clip_result = bytemuck::cast_slice::<u8, [f32; 4]>(&buf[clip_range]);
         let draw_start = 8 + n_clip * 4 + n_path * 24;
-        let draw_range = draw_start..(draw_start + n_clip * 8);
+        let draw_range = draw_start..(draw_start + n_clip * 16);
         let draw_result = bytemuck::cast_slice::<u8, DrawMonoid>(&buf[draw_range]);
         let mut bbox_stack = Vec::new();
         let mut parent_stack = Vec::new();
diff --git a/tests/src/draw.rs b/tests/src/draw.rs
index 7b264d4..692f943 100644
--- a/tests/src/draw.rs
+++ b/tests/src/draw.rs
@@ -17,20 +17,23 @@
 //! Tests for the piet-gpu draw object stage.
 
 use piet_gpu_hal::{BufWrite, BufferUsage};
-use rand::Rng;
+use rand::{Rng, seq::SliceRandom};
 
 use crate::{Config, Runner, TestResult};
 
 use piet_gpu::stages::{self, DrawCode, DrawMonoid, DrawStage};
 
-const ELEMENT_SIZE: usize = 36;
+const DRAWTAG_SIZE: usize = 4;
 const ANNOTATED_SIZE: usize = 40;
 
-const ELEMENT_FILLCOLOR: u32 = 4;
-const ELEMENT_FILLLINGRADIENT: u32 = 5;
-const ELEMENT_FILLIMAGE: u32 = 6;
-const ELEMENT_BEGINCLIP: u32 = 9;
-const ELEMENT_ENDCLIP: u32 = 10;
+// Tags for draw objects. See shader/drawtag.h for the authoritative source.
+const DRAWTAG_FILLCOLOR: u32 = 4;
+const DRAWTAG_FILLLINGRADIENT: u32 = 20;
+const DRAWTAG_FILLIMAGE: u32 = 8;
+const DRAWTAG_BEGINCLIP: u32 = 5;
+const DRAWTAG_ENDCLIP: u32 = 37;
+
+const TAGS: &[u32] = &[DRAWTAG_FILLCOLOR, DRAWTAG_FILLLINGRADIENT, DRAWTAG_FILLIMAGE, DRAWTAG_BEGINCLIP, DRAWTAG_ENDCLIP];
 
 struct DrawTestData {
     tags: Vec<u32>,
@@ -47,7 +50,7 @@
         .session
         .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE)
         .unwrap();
-    let scene_size = n_tag * ELEMENT_SIZE as u64;
+    let scene_size = n_tag * DRAWTAG_SIZE as u64;
     let scene_buf = runner
         .session
         .create_buffer_with(scene_size, |b| data.fill_scene(b), BufferUsage::STORAGE)
@@ -92,7 +95,7 @@
 impl DrawTestData {
     fn new(n: u64) -> DrawTestData {
         let mut rng = rand::thread_rng();
-        let tags = (0..n).map(|_| rng.gen_range(0, 12)).collect();
+        let tags = (0..n).map(|_| *TAGS.choose(&mut rng).unwrap()).collect();
         DrawTestData { tags }
     }
 
@@ -101,13 +104,14 @@
 
         // Layout of memory
         let drawmonoid_alloc = 0;
-        let anno_alloc = drawmonoid_alloc + 8 * n_tags;
+        let anno_alloc = drawmonoid_alloc + 16 * n_tags;
         let clip_alloc = anno_alloc + ANNOTATED_SIZE * n_tags;
         let stage_config = stages::Config {
             n_elements: n_tags as u32,
             anno_alloc: anno_alloc as u32,
             drawmonoid_alloc: drawmonoid_alloc as u32,
             clip_alloc: clip_alloc as u32,
+            drawtag_offset: 0,
             ..Default::default()
         };
         stage_config
@@ -116,37 +120,35 @@
     fn memory_size(&self) -> u64 {
         // Note: this overallocates the clip buf a bit - only needed for the
         // total number of begin_clip and end_clip tags.
-        (8 + self.tags.len() * (8 + 4 + ANNOTATED_SIZE)) as u64
+        (8 + self.tags.len() * (16 + 4 + ANNOTATED_SIZE)) as u64
     }
 
     fn fill_scene(&self, buf: &mut BufWrite) {
-        let mut element = [0u32; ELEMENT_SIZE / 4];
-        for tag in &self.tags {
-            element[0] = *tag;
-            buf.push(element);
-        }
+        buf.extend_slice(&self.tags);
     }
 
     fn verify(&self, buf: &[u8]) -> Option<String> {
-        let size = self.tags.len() * 8;
+        let size = self.tags.len() * 16;
         let actual = bytemuck::cast_slice::<u8, DrawMonoid>(&buf[8..8 + size]);
         let mut expected = DrawMonoid::default();
         for (i, (tag, actual)) in self.tags.iter().zip(actual).enumerate() {
             // Verify exclusive prefix sum.
             let (path_ix, clip_ix) = Self::reduce_tag(*tag);
             if *actual != expected {
+                println!("{:?} {:?}", actual, expected);
                 return Some(format!("draw mismatch at {}", i));
             }
             expected.path_ix += path_ix;
             expected.clip_ix += clip_ix;
+            expected.scene_offset += tag & 28;
         }
         None
     }
 
     fn reduce_tag(tag: u32) -> (u32, u32) {
         match tag {
-            ELEMENT_FILLCOLOR | ELEMENT_FILLLINGRADIENT | ELEMENT_FILLIMAGE => (1, 0),
-            ELEMENT_BEGINCLIP | ELEMENT_ENDCLIP => (1, 1),
+            DRAWTAG_FILLCOLOR | DRAWTAG_FILLLINGRADIENT | DRAWTAG_FILLIMAGE => (1, 0),
+            DRAWTAG_BEGINCLIP | DRAWTAG_ENDCLIP => (1, 1),
             // TODO: ENDCLIP will become (0, 1)
             _ => (0, 0),
         }
diff --git a/tests/src/path.rs b/tests/src/path.rs
index 6f1f61a..bf72c68 100644
--- a/tests/src/path.rs
+++ b/tests/src/path.rs
@@ -207,11 +207,11 @@
         // Layout of memory
         let trans_alloc = 0;
         let pathseg_alloc = trans_alloc + n_trans * 24;
-        let bbox_alloc = pathseg_alloc + self.n_pathseg * PATHSEG_SIZE;
+        let path_bbox_alloc = pathseg_alloc + self.n_pathseg * PATHSEG_SIZE;
         let stage_config = stages::Config {
             pathseg_alloc,
             trans_alloc,
-            bbox_alloc,
+            path_bbox_alloc,
             n_trans,
             n_path: self.n_path,
             pathtag_offset,