Merge pull request #159 from linebender/varenc

Variable size encoding of draw objects
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 0c698b1..4a45d28 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -38,7 +38,6 @@
     Config conf;
 };
 
-#include "annotated.h"
 #include "tile.h"
 
 shared uint sh_row_count[BACKDROP_WG];
@@ -48,39 +47,29 @@
 void main() {
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
 
     // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
     bool mem_ok = mem_error == NO_ERROR;
     if (gl_LocalInvocationID.y == 0) {
         if (element_ix < conf.n_elements) {
-            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
-            switch (tag.tag) {
-            case Annotated_Image:
-            case Annotated_LinGradient:
-            case Annotated_BeginClip:
-            case Annotated_Color:
-                if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
-                    break;
-                }
-                // Fall through.
-                PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
-                Path path = Path_read(conf.tile_alloc, path_ref);
-                sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                row_count = path.bbox.w - path.bbox.y;
-                // Paths that don't cross tile top edges don't have backdrops.
-                // Don't apply the optimization to paths that may cross the y = 0
-                // top edge, but clipped to 1 row.
-                if (row_count == 1 && path.bbox.y > 0) {
-                    // Note: this can probably be expanded to width = 2 as
-                    // long as it doesn't cross the left edge.
-                    row_count = 0;
-                }
-                Alloc path_alloc = new_alloc(
-                    path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
-                sh_row_alloc[th_ix] = path_alloc;
+            // Possible TODO: it's not necessary to process backdrops of stroked paths.
+            // We had logic for that but took it out because it used the Annotated struct.
+            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
+            Path path = Path_read(conf.tile_alloc, path_ref);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            // Paths that don't cross tile top edges don't have backdrops.
+            // Don't apply the optimization to paths that may cross the y = 0
+            // top edge, but clipped to 1 row.
+            if (row_count == 1 && path.bbox.y > 0) {
+                // Note: this can probably be expanded to width = 2 as
+                // long as it doesn't cross the left edge.
+                row_count = 0;
             }
+            Alloc path_alloc = new_alloc(
+                path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
diff --git a/piet-gpu/shader/bbox_clear.comp b/piet-gpu/shader/bbox_clear.comp
index c609642..52577f9 100644
--- a/piet-gpu/shader/bbox_clear.comp
+++ b/piet-gpu/shader/bbox_clear.comp
@@ -20,7 +20,7 @@
 void main() {
     uint ix = gl_GlobalInvocationID.x;
     if (ix < conf.n_path) {
-        uint out_ix = (conf.bbox_alloc.offset >> 2) + 6 * ix;
+        uint out_ix = (conf.path_bbox_alloc.offset >> 2) + 6 * ix;
         memory[out_ix] = 0xffff;
         memory[out_ix + 1] = 0xffff;
         memory[out_ix + 2] = 0;
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index 2304ea2..9b04400 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -18,7 +18,6 @@
     Config conf;
 };
 
-#include "annotated.h"
 #include "bins.h"
 #include "drawtag.h"
 
@@ -37,10 +36,12 @@
 shared bool sh_alloc_failed;
 
 DrawMonoid load_draw_monoid(uint element_ix) {
-    uint base = (conf.drawmonoid_alloc.offset >> 2) + 2 * element_ix;
+    uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
     uint path_ix = memory[base];
     uint clip_ix = memory[base + 1];
-    return DrawMonoid(path_ix, clip_ix);
+    uint scene_offset = memory[base + 2];
+    uint info_offset = memory[base + 3];
+    return DrawMonoid(path_ix, clip_ix, scene_offset, info_offset);
 }
 
 // Load bounding box computed by clip processing
@@ -60,7 +61,7 @@
 
 // Load path's bbox from bbox (as written by pathseg).
 vec4 load_path_bbox(uint path_ix) {
-    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
     float bbox_l = float(memory[base]) - 32768.0;
     float bbox_t = float(memory[base + 1]) - 32768.0;
     float bbox_r = float(memory[base + 2]) - 32768.0;
@@ -69,16 +70,15 @@
     return bbox;
 }
 
-void store_path_bbox(AnnotatedRef ref, vec4 bbox) {
-    uint ix = ref.offset >> 2;
-    memory[ix + 1] = floatBitsToUint(bbox.x);
-    memory[ix + 2] = floatBitsToUint(bbox.y);
-    memory[ix + 3] = floatBitsToUint(bbox.z);
-    memory[ix + 4] = floatBitsToUint(bbox.w);
+void store_draw_bbox(uint draw_ix, vec4 bbox) {
+    uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
+    memory[base] = floatBitsToUint(bbox.x);
+    memory[base + 1] = floatBitsToUint(bbox.y);
+    memory[base + 2] = floatBitsToUint(bbox.z);
+    memory[base + 3] = floatBitsToUint(bbox.w);
 }
 
 void main() {
-    uint my_n_elements = conf.n_elements;
     uint my_partition = gl_WorkGroupID.x;
 
     for (uint i = 0; i < N_SLICE; i++) {
@@ -91,18 +91,8 @@
 
     // Read inputs and determine coverage of bins
     uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-    uint tag = Annotated_Nop;
-    if (element_ix < my_n_elements) {
-        tag = Annotated_tag(conf.anno_alloc, ref).tag;
-    }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    switch (tag) {
-    case Annotated_Color:
-    case Annotated_LinGradient:
-    case Annotated_Image:
-    case Annotated_BeginClip:
-    case Annotated_EndClip:
+    if (element_ix < conf.n_elements) {
         DrawMonoid draw_monoid = load_draw_monoid(element_ix);
         uint path_ix = draw_monoid.path_ix;
         vec4 clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9);
@@ -119,12 +109,11 @@
         // Avoid negative-size bbox (is this necessary)?
         bbox.zw = max(bbox.xy, bbox.zw);
         // Store clip-intersected bbox for tile_alloc.
-        store_path_bbox(ref, bbox);
+        store_draw_bbox(element_ix, bbox);
         x0 = int(floor(bbox.x * SX));
         y0 = int(floor(bbox.y * SY));
         x1 = int(ceil(bbox.z * SX));
         y1 = int(ceil(bbox.w * SY));
-        break;
     }
 
     // At this point, we run an iterator over the coverage area,
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index ac4f3d7..60e5582 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -22,43 +22,43 @@
 rule msl
   command = $spirv_cross --msl $in --output $out $msl_flags
 
-build gen/binning.spv: glsl binning.comp | annotated.h bins.h drawtag.h setup.h mem.h
+build gen/binning.spv: glsl binning.comp | bins.h drawtag.h setup.h mem.h
 build gen/binning.hlsl: hlsl gen/binning.spv
 build gen/binning.dxil: dxil gen/binning.hlsl
 build gen/binning.msl: msl gen/binning.spv
 
-build gen/tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
+build gen/tile_alloc.spv: glsl tile_alloc.comp | drawtag.h tile.h setup.h mem.h
 build gen/tile_alloc.hlsl: hlsl gen/tile_alloc.spv
 build gen/tile_alloc.dxil: dxil gen/tile_alloc.hlsl
 build gen/tile_alloc.msl: msl gen/tile_alloc.spv
 
-build gen/path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
+build gen/path_coarse.spv: glsl path_coarse.comp | pathseg.h tile.h setup.h mem.h
 build gen/path_coarse.hlsl: hlsl gen/path_coarse.spv
 build gen/path_coarse.dxil: dxil gen/path_coarse.hlsl
 build gen/path_coarse.msl: msl gen/path_coarse.spv
 
-build gen/backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+build gen/backdrop.spv: glsl backdrop.comp | tile.h setup.h mem.h
 build gen/backdrop.hlsl: hlsl gen/backdrop.spv
 build gen/backdrop.dxil: dxil gen/backdrop.hlsl
 build gen/backdrop.msl: msl gen/backdrop.spv
 
-build gen/backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+build gen/backdrop_lg.spv: glsl backdrop.comp | tile.h setup.h mem.h
   flags = -DBACKDROP_DIST_FACTOR=4
 build gen/backdrop_lg.hlsl: hlsl gen/backdrop_lg.spv
 build gen/backdrop_lg.dxil: dxil gen/backdrop_lg.hlsl
 build gen/backdrop_lg.msl: msl gen/backdrop_lg.spv
 
-build gen/coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
+build gen/coarse.spv: glsl coarse.comp | drawtag.h bins.h tile.h ptcl.h blend.h setup.h mem.h
 build gen/coarse.hlsl: hlsl gen/coarse.spv
 build gen/coarse.dxil: dxil gen/coarse.hlsl
 build gen/coarse.msl: msl gen/coarse.spv
 
-build gen/kernel4.spv: glsl kernel4.comp | blend.h ptcl.h setup.h
+build gen/kernel4.spv: glsl kernel4.comp | blend.h ptcl.h setup.h mem.h
 build gen/kernel4.hlsl: hlsl gen/kernel4.spv
 build gen/kernel4.dxil: dxil gen/kernel4.hlsl
 build gen/kernel4.msl: msl gen/kernel4.spv
 
-build gen/kernel4_gray.spv: glsl kernel4.comp | ptcl.h setup.h
+build gen/kernel4_gray.spv: glsl kernel4.comp | blend.h ptcl.h setup.h mem.h
   flags = -DGRAY
 build gen/kernel4_gray.hlsl: hlsl gen/kernel4_gray.spv
 build gen/kernel4_gray.dxil: dxil gen/kernel4_gray.hlsl
@@ -114,17 +114,17 @@
 build gen/draw_root.dxil: dxil gen/draw_root.hlsl
 build gen/draw_root.msl: msl gen/draw_root.spv
 
-build gen/draw_leaf.spv: glsl draw_leaf.comp | blend.h scene.h drawtag.h annotated.h setup.h mem.h
+build gen/draw_leaf.spv: glsl draw_leaf.comp | blend.h scene.h tile.h drawtag.h setup.h mem.h
 build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
 build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
 build gen/draw_leaf.msl: msl gen/draw_leaf.spv
 
-build gen/clip_reduce.spv: glsl clip_reduce.comp | mem.h setup.h annotated.h
+build gen/clip_reduce.spv: glsl clip_reduce.comp | mem.h setup.h
 build gen/clip_reduce.hlsl: hlsl gen/clip_reduce.spv
 build gen/clip_reduce.dxil: dxil gen/clip_reduce.hlsl
 build gen/clip_reduce.msl: msl gen/clip_reduce.spv
 
-build gen/clip_leaf.spv: glsl clip_leaf.comp | mem.h setup.h annotated.h
+build gen/clip_leaf.spv: glsl clip_leaf.comp | mem.h setup.h
 build gen/clip_leaf.hlsl: hlsl gen/clip_leaf.spv
 build gen/clip_leaf.dxil: dxil gen/clip_leaf.hlsl
 build gen/clip_leaf.msl: msl gen/clip_leaf.spv
diff --git a/piet-gpu/shader/clip_leaf.comp b/piet-gpu/shader/clip_leaf.comp
index 5f7e79b..5353b0b 100644
--- a/piet-gpu/shader/clip_leaf.comp
+++ b/piet-gpu/shader/clip_leaf.comp
@@ -18,8 +18,6 @@
     Config conf;
 };
 
-#include "annotated.h"
-
 // Some of this is cut'n'paste duplication with the reduce pass, and
 // arguably should be moved to a common .h file.
 // The bicyclic monoid
@@ -43,7 +41,7 @@
 
 // Load path's bbox from bbox (as written by pathseg).
 vec4 load_path_bbox(uint path_ix) {
-    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
     float bbox_l = float(memory[base]) - 32768.0;
     float bbox_t = float(memory[base + 1]) - 32768.0;
     float bbox_r = float(memory[base + 2]) - 32768.0;
@@ -271,7 +269,7 @@
         // and is in the ClipEl for cross-partition.
         // If not, can probably get rid of it in the stack intermediate buf.
         path_ix = load_path_ix(parent);
-        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * ~inp;
+        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
         // Fix up drawmonoid so path_ix at EndClip matches BeginClip
         memory[drawmonoid_out_base] = path_ix;
 
diff --git a/piet-gpu/shader/clip_reduce.comp b/piet-gpu/shader/clip_reduce.comp
index c62b239..8b247ab 100644
--- a/piet-gpu/shader/clip_reduce.comp
+++ b/piet-gpu/shader/clip_reduce.comp
@@ -31,8 +31,6 @@
     Config conf;
 };
 
-#include "annotated.h"
-
 // The intermediate state for clip processing.
 struct ClipEl {
     // index of parent node
@@ -59,7 +57,7 @@
 
 // Load path's bbox from bbox (as written by pathseg).
 vec4 load_path_bbox(uint path_ix) {
-    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
     float bbox_l = float(memory[base]) - 32768.0;
     float bbox_t = float(memory[base + 1]) - 32768.0;
     float bbox_r = float(memory[base + 2]) - 32768.0;
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index df306e0..454371c 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -19,14 +19,19 @@
 
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+layout(binding = 1) readonly buffer ConfigBuf {
     Config conf;
 };
 
-#include "annotated.h"
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "drawtag.h"
 #include "bins.h"
 #include "tile.h"
 #include "ptcl.h"
+#include "blend.h"
 
 #define LG_N_PART_READ (7 + LG_WG_FACTOR)
 #define N_PART_READ (1 << LG_N_PART_READ)
@@ -92,8 +97,8 @@
     return true;
 }
 
-void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) {
-    if (fill_mode_from_flags(flags) == MODE_NONZERO) {
+void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
+    if (linewidth < 0.0) {
         if (tile.tile.offset != 0) {
             CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
             Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
@@ -146,6 +151,10 @@
     uint part_start_ix = 0;
     uint ready_ix = 0;
 
+    uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
+    uint drawtag_start = conf.drawtag_offset >> 2;
+    uint drawdata_start = conf.drawdata_offset >> 2;
+    uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
     bool mem_ok = mem_error == NO_ERROR;
     while (true) {
         for (uint i = 0; i < N_SLICE; i++) {
@@ -207,24 +216,22 @@
         // We've done the merge and filled the buffer.
 
         // Read one element, compute coverage.
-        uint tag = Annotated_Nop;
+        uint tag = Drawtag_Nop;
         uint element_ix;
-        AnnotatedRef ref;
         if (th_ix + rd_ix < wr_ix) {
             element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-            tag = Annotated_tag(conf.anno_alloc, ref).tag;
+            tag = scene[drawtag_start + element_ix];
         }
 
         // Bounding box of element in pixel coordinates.
         uint tile_count;
         switch (tag) {
-        case Annotated_Color:
-        case Annotated_Image:
-        case Annotated_LinGradient:
-        case Annotated_BeginClip:
-        case Annotated_EndClip:
-            uint drawmonoid_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * element_ix;
+        case Drawtag_FillColor:
+        case Drawtag_FillImage:
+        case Drawtag_FillLinGradient:
+        case Drawtag_BeginClip:
+        case Drawtag_EndClip:
+            uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
             uint path_ix = memory[drawmonoid_base];
             Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
             uint stride = path.bbox.z - path.bbox.x;
@@ -272,9 +279,8 @@
                     el_ix = probe;
                 }
             }
-            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
-            AnnotatedTag anno_tag = Annotated_tag(conf.anno_alloc, ref);
-            uint tag = anno_tag.tag;
+            uint element_ix = sh_elements[el_ix];
+            uint tag = scene[drawtag_start + element_ix];
             uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + seq_ix % width;
@@ -283,15 +289,23 @@
             if (mem_ok) {
                 Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
                                       TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
-                bool is_clip = tag == Annotated_BeginClip || tag == Annotated_EndClip;
+                bool is_clip = (tag & 1) != 0;
                 // Always include the tile if it contains a path segment.
                 // For draws, include the tile if it is solid.
                 // For clips, include the tile if it is empty - this way, logic
                 // below will suppress the drawing of inner elements.
                 // For blends, include the tile if
                 // (blend_mode, composition_mode) != (Normal, SrcOver)
+                bool is_blend = false;
+                if (is_clip) {
+                    uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
+                    uint scene_offset = memory[drawmonoid_base + 2];
+                    uint dd = drawdata_start + (scene_offset >> 2);
+                    uint blend = scene[dd];
+                    is_blend = (blend != BlendComp_default);
+                }
                 include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
-                    || (is_clip && (anno_tag.flags & 0x2) != 0);
+                    || is_blend;
             }
             if (include_tile) {
                 uint el_slice = el_ix / 32;
@@ -302,8 +316,8 @@
 
         barrier();
 
-        // Output non-segment elements for this tile. The thread does a sequential walk
-        // through the non-segment elements.
+        // Output draw objects for this tile. The thread does a sequential walk
+        // through the draw objects.
         uint slice_ix = 0;
         uint bitmap = sh_bitmaps[0][th_ix];
         while (mem_ok) {
@@ -323,59 +337,55 @@
             // Clear LSB
             bitmap &= bitmap - 1;
 
-            // At this point, we read the element again from global memory.
-            // If that turns out to be expensive, maybe we can pack it into
-            // shared memory (or perhaps just the tag).
-            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
+            uint drawtag = scene[drawtag_start + element_ix];
 
             if (clip_zero_depth == 0) {
-                switch (tag.tag) {
-                case Annotated_Color:
-                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                          TileRef(sh_tile_base[element_ref_ix] +
-                                                  (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
+                Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                                        TileRef(sh_tile_base[element_ref_ix] +
+                                                (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
+                uint scene_offset = memory[drawmonoid_base + 2];
+                uint info_offset = memory[drawmonoid_base + 3];
+                uint dd = drawdata_start + (scene_offset >> 2);
+                uint di = drawinfo_start + (info_offset >> 2);
+                switch (drawtag) {
+                case Drawtag_FillColor:
+                    float linewidth = uintBitsToFloat(memory[di]);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
-                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
+                    uint rgba = scene[dd];
+                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
                     cmd_ref.offset += 4 + CmdColor_size;
                     break;
-                case Annotated_LinGradient:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                     TileRef(sh_tile_base[element_ref_ix] +
-                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoLinGradient lin = Annotated_LinGradient_read(conf.anno_alloc, ref);
+                case Drawtag_FillLinGradient:
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
+                    linewidth = uintBitsToFloat(memory[di]);
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                     CmdLinGrad cmd_lin;
-                    cmd_lin.index = lin.index;
-                    cmd_lin.line_x = lin.line_x;
-                    cmd_lin.line_y = lin.line_y;
-                    cmd_lin.line_c = lin.line_c;
+                    cmd_lin.index = scene[dd];
+                    cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
+                    cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
+                    cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
                     Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
                     cmd_ref.offset += 4 + CmdLinGrad_size;
                     break;
-                case Annotated_Image:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                     TileRef(sh_tile_base[element_ref_ix] +
-                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
+                case Drawtag_FillImage:
+                    linewidth = uintBitsToFloat(memory[di]);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill_img.linewidth);
-                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
+                    uint index = scene[dd];
+                    uint raw1 = scene[dd + 1];
+                    ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
+                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
                     cmd_ref.offset += 4 + CmdImage_size;
                     break;
-                case Annotated_BeginClip:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                     TileRef(sh_tile_base[element_ref_ix] +
-                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                case Drawtag_BeginClip:
                     if (tile.tile.offset == 0 && tile.backdrop == 0) {
                         clip_zero_depth = clip_depth + 1;
                     } else {
@@ -387,27 +397,24 @@
                     }
                     clip_depth++;
                     break;
-                case Annotated_EndClip:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
-                                     TileRef(sh_tile_base[element_ref_ix] +
-                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoEndClip end_clip = Annotated_EndClip_read(conf.anno_alloc, ref);
+                case Drawtag_EndClip:
                     clip_depth--;
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    write_fill(cmd_alloc, cmd_ref, MODE_NONZERO, tile, 0.0);
-                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(end_clip.blend));
+                    write_fill(cmd_alloc, cmd_ref, tile, -1.0);
+                    uint blend = scene[dd];
+                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
                     cmd_ref.offset += 4 + CmdEndClip_size;
                     break;
                 }
             } else {
                 // In "clip zero" state, suppress all drawing
-                switch (tag.tag) {
-                case Annotated_BeginClip:
+                switch (drawtag) {
+                case Drawtag_BeginClip:
                     clip_depth++;
                     break;
-                case Annotated_EndClip:
+                case Drawtag_EndClip:
                     if (clip_depth == clip_zero_depth) {
                         clip_zero_depth = 0;
                     }
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
index 74fc2f8..1cee0ef 100644
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -27,7 +27,6 @@
 #include "scene.h"
 #include "tile.h"
 #include "drawtag.h"
-#include "annotated.h"
 #include "blend.h"
 
 #define Monoid DrawMonoid
@@ -42,14 +41,14 @@
     Monoid local[N_ROWS];
 
     uint ix = gl_GlobalInvocationID.x * N_ROWS;
-    ElementRef ref = ElementRef(ix * Element_size);
-    uint tag_word = Element_tag(ref).tag;
+    uint drawtag_base = conf.drawtag_offset >> 2;
+    uint tag_word = scene[drawtag_base + ix];
 
     Monoid agg = map_tag(tag_word);
     local[0] = agg;
     for (uint i = 1; i < N_ROWS; i++) {
-        tag_word = Element_tag(Element_index(ref, i)).tag;
-        agg = combine_tag_monoid(agg, map_tag(tag_word));
+        tag_word = scene[drawtag_base + ix + i];
+        agg = combine_draw_monoid(agg, map_tag(tag_word));
         local[i] = agg;
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -57,41 +56,47 @@
         barrier();
         if (gl_LocalInvocationID.x >= (1u << i)) {
             Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
-            agg = combine_tag_monoid(other, agg);
+            agg = combine_draw_monoid(other, agg);
         }
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
 
     barrier();
-    Monoid row = tag_monoid_identity();
+    Monoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0) {
         row = parent[gl_WorkGroupID.x - 1];
     }
     if (gl_LocalInvocationID.x > 0) {
-        row = combine_tag_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
+        row = combine_draw_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
     }
+    uint drawdata_base = conf.drawdata_offset >> 2;
+    uint drawinfo_base = conf.drawinfo_alloc.offset >> 2;
     uint out_ix = gl_GlobalInvocationID.x * N_ROWS;
-    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 2;
+    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 4;
     uint clip_out_base = conf.clip_alloc.offset >> 2;
-    AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + out_ix * Annotated_size);
     for (uint i = 0; i < N_ROWS; i++) {
         Monoid m = row;
         if (i > 0) {
-            m = combine_tag_monoid(m, local[i - 1]);
+            m = combine_draw_monoid(m, local[i - 1]);
         }
         // m now holds exclusive scan of draw monoid
-        memory[out_base + i * 2] = m.path_ix;
-        memory[out_base + i * 2 + 1] = m.clip_ix;
+        memory[out_base + i * 4] = m.path_ix;
+        memory[out_base + i * 4 + 1] = m.clip_ix;
+        memory[out_base + i * 4 + 2] = m.scene_offset;
+        memory[out_base + i * 4 + 3] = m.info_offset;
+
+        // u32 indices of this draw object's data: dd into scene[], di into memory[]
+        uint dd = drawdata_base + (m.scene_offset >> 2);
+        uint di = drawinfo_base + (m.info_offset >> 2);
 
         // For compatibility, we'll generate an Annotated object, same as old
         // pipeline. However, going forward we'll get rid of that, and have
         // later stages read scene + bbox etc.
-        ElementRef this_ref = Element_index(ref, i);
-        tag_word = Element_tag(this_ref).tag;
-        if (tag_word == Element_FillColor || tag_word == Element_FillLinGradient || tag_word == Element_FillImage ||
-            tag_word == Element_BeginClip) {
-            uint bbox_offset = (conf.bbox_alloc.offset >> 2) + 6 * m.path_ix;
+        tag_word = scene[drawtag_base + ix + i];
+        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillImage ||
+            tag_word == Drawtag_BeginClip) {
+            uint bbox_offset = (conf.path_bbox_alloc.offset >> 2) + 6 * m.path_ix;
             float bbox_l = float(memory[bbox_offset]) - 32768.0;
             float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
             float bbox_r = float(memory[bbox_offset + 2]) - 32768.0;
@@ -101,11 +106,11 @@
             uint fill_mode = uint(linewidth >= 0.0);
             vec4 mat;
             vec2 translate;
-            if (linewidth >= 0.0 || tag_word == Element_FillLinGradient) {
+            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient) {
                 uint trans_ix = memory[bbox_offset + 5];
                 uint t = (conf.trans_alloc.offset >> 2) + 6 * trans_ix;
                 mat = uintBitsToFloat(uvec4(memory[t], memory[t + 1], memory[t + 2], memory[t + 3]));
-                if (tag_word == Element_FillLinGradient) {
+                if (tag_word == Drawtag_FillLinGradient) {
                     translate = uintBitsToFloat(uvec2(memory[t + 4], memory[t + 5]));
                 }
             }
@@ -113,69 +118,38 @@
                 // TODO: need to deal with anisotropic case
                 linewidth *= sqrt(abs(mat.x * mat.w - mat.y * mat.z));
             }
-            linewidth = max(linewidth, 0.0);
             switch (tag_word) {
-            case Element_FillColor:
-                FillColor fill = Element_FillColor_read(this_ref);
-                AnnoColor anno_fill;
-                anno_fill.bbox = bbox;
-                anno_fill.linewidth = linewidth;
-                anno_fill.rgba_color = fill.rgba_color;
-                Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
+            case Drawtag_FillColor:
+            case Drawtag_FillImage:
+                memory[di] = floatBitsToUint(linewidth);
                 break;
-            case Element_FillLinGradient:
-                FillLinGradient lin = Element_FillLinGradient_read(this_ref);
-                AnnoLinGradient anno_lin;
-                anno_lin.bbox = bbox;
-                anno_lin.linewidth = linewidth;
-                anno_lin.index = lin.index;
-                vec2 p0 = mat.xy * lin.p0.x + mat.zw * lin.p0.y + translate;
-                vec2 p1 = mat.xy * lin.p1.x + mat.zw * lin.p1.y + translate;
+            case Drawtag_FillLinGradient:
+                memory[di] = floatBitsToUint(linewidth);
+                uint index = scene[dd];
+                vec2 p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
+                vec2 p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
+                p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
+                p1 = mat.xy * p1.x + mat.zw * p1.y + translate;
                 vec2 dxy = p1 - p0;
                 float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y);
                 float line_x = dxy.x * scale;
                 float line_y = dxy.y * scale;
-                anno_lin.line_x = line_x;
-                anno_lin.line_y = line_y;
-                anno_lin.line_c = -(p0.x * line_x + p0.y * line_y);
-                Annotated_LinGradient_write(conf.anno_alloc, out_ref, fill_mode, anno_lin);
+                float line_c = -(p0.x * line_x + p0.y * line_y);
+                memory[di + 1] = floatBitsToUint(line_x);
+                memory[di + 2] = floatBitsToUint(line_y);
+                memory[di + 3] = floatBitsToUint(line_c);
                 break;
-            case Element_FillImage:
-                FillImage fill_img = Element_FillImage_read(this_ref);
-                AnnoImage anno_img;
-                anno_img.bbox = bbox;
-                anno_img.linewidth = linewidth;
-                anno_img.index = fill_img.index;
-                anno_img.offset = fill_img.offset;
-                Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
-                break;
-            case Element_BeginClip:
-                Clip begin_clip = Element_BeginClip_read(this_ref);
-                AnnoBeginClip anno_begin_clip;
-                anno_begin_clip.bbox = bbox;
-                anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
-                anno_begin_clip.blend = begin_clip.blend;
-                uint flags = uint(begin_clip.blend != BlendComp_default) << 1;
-                Annotated_BeginClip_write(conf.anno_alloc, out_ref, flags, anno_begin_clip);
+            case Drawtag_BeginClip:
                 break;
             }
-        } else if (tag_word == Element_EndClip) {
-            Clip end_clip = Element_BeginClip_read(this_ref);
-            AnnoEndClip anno_end_clip;
-            // The actual bbox will be reconstructed from clip stream output.
-            anno_end_clip.bbox = vec4(-1e9, -1e9, 1e9, 1e9);
-            anno_end_clip.blend = end_clip.blend;
-            uint flags = uint(end_clip.blend != BlendComp_default) << 1;
-            Annotated_EndClip_write(conf.anno_alloc, out_ref, flags, anno_end_clip);
         }
         // Generate clip stream.
-        if (tag_word == Element_BeginClip || tag_word == Element_EndClip) {
+        if (tag_word == Drawtag_BeginClip || tag_word == Drawtag_EndClip) {
             uint path_ix = ~(out_ix + i);
-            if (tag_word == Element_BeginClip) {
+            if (tag_word == Drawtag_BeginClip) {
                 path_ix = m.path_ix;
             }
             memory[clip_out_base + m.clip_ix] = path_ix;
         }
-        out_ref.offset += Annotated_size;
     }
 }
diff --git a/piet-gpu/shader/draw_reduce.comp b/piet-gpu/shader/draw_reduce.comp
index 68d43e9..d125d6e 100644
--- a/piet-gpu/shader/draw_reduce.comp
+++ b/piet-gpu/shader/draw_reduce.comp
@@ -36,13 +36,13 @@
 
 void main() {
     uint ix = gl_GlobalInvocationID.x * N_ROWS;
-    ElementRef ref = ElementRef(ix * Element_size);
-    uint tag_word = Element_tag(ref).tag;
+    uint drawtag_base = conf.drawtag_offset >> 2;
+    uint tag_word = scene[drawtag_base + ix];
 
     Monoid agg = map_tag(tag_word);
     for (uint i = 1; i < N_ROWS; i++) {
-        tag_word = Element_tag(Element_index(ref, i)).tag;
-        agg = combine_tag_monoid(agg, map_tag(tag_word));
+        uint tag_word = scene[drawtag_base + ix + i];
+        agg = combine_draw_monoid(agg, map_tag(tag_word));
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
     for (uint i = 0; i < LG_WG_SIZE; i++) {
@@ -50,7 +50,7 @@
         // We could make this predicate tighter, but would it help?
         if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
             Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
-            agg = combine_tag_monoid(agg, other);
+            agg = combine_draw_monoid(agg, other);
         }
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
diff --git a/piet-gpu/shader/draw_scan.comp b/piet-gpu/shader/draw_scan.comp
index 1c26c26..d285020 100644
--- a/piet-gpu/shader/draw_scan.comp
+++ b/piet-gpu/shader/draw_scan.comp
@@ -16,8 +16,8 @@
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;
 
 #define Monoid DrawMonoid
-#define combine_monoid combine_tag_monoid
-#define monoid_identity tag_monoid_identity
+#define combine_monoid combine_draw_monoid
+#define monoid_identity draw_monoid_identity
 
 layout(binding = 0) buffer DataBuf {
     Monoid[] data;
diff --git a/piet-gpu/shader/drawtag.h b/piet-gpu/shader/drawtag.h
index 17105f6..7f73546 100644
--- a/piet-gpu/shader/drawtag.h
+++ b/piet-gpu/shader/drawtag.h
@@ -2,36 +2,39 @@
 
 // Common data structures and functions for the draw tag stream.
 
+// Draw tag bit layout: (tag & 0x1c) is the size of the scene data in bytes,
+// (tag & 1) is set for tags that take part in the clip stream, and
+// ((tag >> 4) & 0x1c) is the size of the info data in bytes.
+
+#define Drawtag_Nop 0 // no path, no clip, no scene or info bytes
+#define Drawtag_FillColor 0x44 // 4 scene bytes, 4 info bytes
+#define Drawtag_FillLinGradient 0x114 // 20 scene bytes, 16 info bytes
+#define Drawtag_FillImage 0x48 // 8 scene bytes, 4 info bytes
+#define Drawtag_BeginClip 0x05 // clip bit set, 4 scene bytes, no info bytes
+#define Drawtag_EndClip 0x25 // clip bit set, 4 scene bytes, no info bytes
+
 struct DrawMonoid {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset; // sum of (tag & 0x1c): byte offset into the draw data stream
+    uint info_offset; // sum of ((tag >> 4) & 0x1c): byte offset into the draw info buffer
 };
 
-DrawMonoid tag_monoid_identity() {
-    return DrawMonoid(0, 0);
+DrawMonoid draw_monoid_identity() { // identity element for combine_draw_monoid
+    return DrawMonoid(0, 0, 0, 0); // path_ix, clip_ix, scene_offset, info_offset
 }
 
-DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b) {
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) { // returns the field-wise sum of a and b
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset; // byte counts accumulate like the indices above
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
-#ifdef Element_size
 DrawMonoid map_tag(uint tag_word) {
-    switch (tag_word) {
-    case Element_FillColor:
-    case Element_FillLinGradient:
-    case Element_FillImage:
-        return DrawMonoid(1, 0);
-    case Element_BeginClip:
-    // TODO: endclip should be (0, 1), ie not generate a path. But for now
-    // we generate a dummy path.
-    case Element_EndClip:
-        return DrawMonoid(1, 1);
-    default:
-        return DrawMonoid(0, 0);
-    }
+    // TODO: at some point, EndClip should not generate a path
+    uint has_path = uint(tag_word != Drawtag_Nop); // every tag other than Nop counts as one path
+    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x1c); // clip flag, scene bytes, info bytes
 }
-#endif
diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
index 4ebcb1c..0fb9622 100644
--- a/piet-gpu/shader/gen/backdrop.dxil
+++ b/piet-gpu/shader/gen/backdrop.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl
index a2e71a8..aba3cff 100644
--- a/piet-gpu/shader/gen/backdrop.hlsl
+++ b/piet-gpu/shader/gen/backdrop.hlsl
@@ -3,17 +3,6 @@
     uint offset;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -42,12 +31,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -55,12 +46,14 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _79 : register(u0, space0);
-ByteAddressBuffer _186 : register(t1, space0);
+RWByteAddressBuffer _67 : register(u0, space0);
+ByteAddressBuffer _166 : register(t1, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -89,24 +82,10 @@
     {
         return 0u;
     }
-    uint v = _79.Load(offset * 4 + 8);
+    uint v = _67.Load(offset * 4 + 8);
     return v;
 }
 
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _121;
-}
-
-uint fill_mode_from_flags(uint flags)
-{
-    return flags & 1u;
-}
-
 Path Path_read(Alloc a, PathRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -121,8 +100,8 @@
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _165 = { raw2 };
-    s.tiles = _165;
+    TileRef _134 = { raw2 };
+    s.tiles = _134;
     return s;
 }
 
@@ -141,88 +120,65 @@
     {
         return;
     }
-    _79.Store(offset * 4 + 8, val);
+    _67.Store(offset * 4 + 8, val);
 }
 
 void comp_main()
 {
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _194;
     uint row_count = 0u;
-    bool mem_ok = _79.Load(4) == 0u;
+    bool mem_ok = _67.Load(4) == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _186.Load(0))
+        if (element_ix < _166.Load(0))
         {
-            Alloc _217;
-            _217.offset = _186.Load(32);
+            PathRef _180 = { _166.Load(16) + (element_ix * 12u) };
+            PathRef path_ref = _180;
+            Alloc _185;
+            _185.offset = _166.Load(16);
             Alloc param;
-            param.offset = _217.offset;
-            AnnotatedRef param_1 = ref;
-            AnnotatedTag tag = Annotated_tag(param, param_1);
-            switch (tag.tag)
+            param.offset = _185.offset;
+            PathRef param_1 = path_ref;
+            Path path = Path_read(param, param_1);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            bool _210 = row_count == 1u;
+            bool _216;
+            if (_210)
             {
-                case 3u:
-                case 2u:
-                case 4u:
-                case 1u:
-                {
-                    uint param_2 = tag.flags;
-                    if (fill_mode_from_flags(param_2) != 0u)
-                    {
-                        break;
-                    }
-                    PathRef _243 = { _186.Load(16) + (element_ix * 12u) };
-                    PathRef path_ref = _243;
-                    Alloc _247;
-                    _247.offset = _186.Load(16);
-                    Alloc param_3;
-                    param_3.offset = _247.offset;
-                    PathRef param_4 = path_ref;
-                    Path path = Path_read(param_3, param_4);
-                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                    row_count = path.bbox.w - path.bbox.y;
-                    bool _272 = row_count == 1u;
-                    bool _278;
-                    if (_272)
-                    {
-                        _278 = path.bbox.y > 0u;
-                    }
-                    else
-                    {
-                        _278 = _272;
-                    }
-                    if (_278)
-                    {
-                        row_count = 0u;
-                    }
-                    uint param_5 = path.tiles.offset;
-                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                    bool param_7 = mem_ok;
-                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
-                    sh_row_alloc[th_ix] = path_alloc;
-                    break;
-                }
+                _216 = path.bbox.y > 0u;
             }
+            else
+            {
+                _216 = _210;
+            }
+            if (_216)
+            {
+                row_count = 0u;
+            }
+            uint param_2 = path.tiles.offset;
+            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_4 = mem_ok;
+            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
     for (uint i = 0u; i < 8u; i++)
     {
         GroupMemoryBarrierWithGroupSync();
-        bool _325 = gl_LocalInvocationID.y == 0u;
-        bool _332;
-        if (_325)
+        bool _262 = gl_LocalInvocationID.y == 0u;
+        bool _269;
+        if (_262)
         {
-            _332 = th_ix >= (1u << i);
+            _269 = th_ix >= (1u << i);
         }
         else
         {
-            _332 = _325;
+            _269 = _262;
         }
-        if (_332)
+        if (_269)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -234,7 +190,7 @@
     }
     GroupMemoryBarrierWithGroupSync();
     uint total_rows = sh_row_count[255];
-    uint _411;
+    uint _348;
     for (uint row = th_ix; row < total_rows; row += 256u)
     {
         uint el_ix = 0u;
@@ -252,27 +208,27 @@
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _411 = sh_row_count[el_ix - 1u];
+                _348 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _411 = 0u;
+                _348 = 0u;
             }
-            uint seq_ix = row - _411;
+            uint seq_ix = row - _348;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_8 = tiles_alloc;
-            uint param_9 = tile_el_ix;
-            uint sum = read_mem(param_8, param_9);
+            Alloc param_5 = tiles_alloc;
+            uint param_6 = tile_el_ix;
+            uint sum = read_mem(param_5, param_6);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_10 = tiles_alloc;
-                uint param_11 = tile_el_ix;
-                sum += read_mem(param_10, param_11);
-                Alloc param_12 = tiles_alloc;
-                uint param_13 = tile_el_ix;
-                uint param_14 = sum;
-                write_mem(param_12, param_13, param_14);
+                Alloc param_7 = tiles_alloc;
+                uint param_8 = tile_el_ix;
+                sum += read_mem(param_7, param_8);
+                Alloc param_9 = tiles_alloc;
+                uint param_10 = tile_el_ix;
+                uint param_11 = sum;
+                write_mem(param_9, param_10, param_11);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl
index be670fc..1c0a0bb 100644
--- a/piet-gpu/shader/gen/backdrop.msl
+++ b/piet-gpu/shader/gen/backdrop.msl
@@ -10,17 +10,6 @@
     uint offset;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -61,12 +50,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -74,6 +65,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -90,7 +83,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -98,38 +91,23 @@
     {
         return 0u;
     }
-    uint v = v_79.memory[offset];
+    uint v = v_67.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_79);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-uint fill_mode_from_flags(thread const uint& flags)
-{
-    return flags & 1u;
-}
-
-static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_79);
+    uint raw0 = read_mem(param, param_1, v_67);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_79);
+    uint raw1 = read_mem(param_2, param_3, v_67);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_79);
+    uint raw2 = read_mem(param_4, param_5, v_67);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -145,7 +123,7 @@
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -153,85 +131,65 @@
     {
         return;
     }
-    v_79.memory[offset] = val;
+    v_67.memory[offset] = val;
 }
 
-kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_row_width[256];
     threadgroup Alloc sh_row_alloc[256];
     threadgroup uint sh_row_count[256];
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
     uint row_count = 0u;
-    bool mem_ok = v_79.mem_error == 0u;
+    bool mem_ok = v_67.mem_error == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _186.conf.n_elements)
+        if (element_ix < _166.conf.n_elements)
         {
+            PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) };
             Alloc param;
-            param.offset = _186.conf.anno_alloc.offset;
-            AnnotatedRef param_1 = ref;
-            AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
-            switch (tag.tag)
+            param.offset = _166.conf.tile_alloc.offset;
+            PathRef param_1 = path_ref;
+            Path path = Path_read(param, param_1, v_67);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            bool _210 = row_count == 1u;
+            bool _216;
+            if (_210)
             {
-                case 3u:
-                case 2u:
-                case 4u:
-                case 1u:
-                {
-                    uint param_2 = tag.flags;
-                    if (fill_mode_from_flags(param_2) != 0u)
-                    {
-                        break;
-                    }
-                    PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
-                    Alloc param_3;
-                    param_3.offset = _186.conf.tile_alloc.offset;
-                    PathRef param_4 = path_ref;
-                    Path path = Path_read(param_3, param_4, v_79);
-                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                    row_count = path.bbox.w - path.bbox.y;
-                    bool _272 = row_count == 1u;
-                    bool _278;
-                    if (_272)
-                    {
-                        _278 = path.bbox.y > 0u;
-                    }
-                    else
-                    {
-                        _278 = _272;
-                    }
-                    if (_278)
-                    {
-                        row_count = 0u;
-                    }
-                    uint param_5 = path.tiles.offset;
-                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                    bool param_7 = mem_ok;
-                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
-                    sh_row_alloc[th_ix] = path_alloc;
-                    break;
-                }
+                _216 = path.bbox.y > 0u;
             }
+            else
+            {
+                _216 = _210;
+            }
+            if (_216)
+            {
+                row_count = 0u;
+            }
+            uint param_2 = path.tiles.offset;
+            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_4 = mem_ok;
+            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
     for (uint i = 0u; i < 8u; i++)
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        bool _325 = gl_LocalInvocationID.y == 0u;
-        bool _332;
-        if (_325)
+        bool _262 = gl_LocalInvocationID.y == 0u;
+        bool _269;
+        if (_262)
         {
-            _332 = th_ix >= (1u << i);
+            _269 = th_ix >= (1u << i);
         }
         else
         {
-            _332 = _325;
+            _269 = _262;
         }
-        if (_332)
+        if (_269)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -243,7 +201,7 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint total_rows = sh_row_count[255];
-    uint _411;
+    uint _348;
     for (uint row = th_ix; row < total_rows; row += 256u)
     {
         uint el_ix = 0u;
@@ -261,27 +219,27 @@
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _411 = sh_row_count[el_ix - 1u];
+                _348 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _411 = 0u;
+                _348 = 0u;
             }
-            uint seq_ix = row - _411;
+            uint seq_ix = row - _348;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_8 = tiles_alloc;
-            uint param_9 = tile_el_ix;
-            uint sum = read_mem(param_8, param_9, v_79);
+            Alloc param_5 = tiles_alloc;
+            uint param_6 = tile_el_ix;
+            uint sum = read_mem(param_5, param_6, v_67);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_10 = tiles_alloc;
-                uint param_11 = tile_el_ix;
-                sum += read_mem(param_10, param_11, v_79);
-                Alloc param_12 = tiles_alloc;
-                uint param_13 = tile_el_ix;
-                uint param_14 = sum;
-                write_mem(param_12, param_13, param_14, v_79);
+                Alloc param_7 = tiles_alloc;
+                uint param_8 = tile_el_ix;
+                sum += read_mem(param_7, param_8, v_67);
+                Alloc param_9 = tiles_alloc;
+                uint param_10 = tile_el_ix;
+                uint param_11 = sum;
+                write_mem(param_9, param_10, param_11, v_67);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv
index f90bf6e..2bd17d8 100644
--- a/piet-gpu/shader/gen/backdrop.spv
+++ b/piet-gpu/shader/gen/backdrop.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
index e6b2f1a..e24a6d3 100644
--- a/piet-gpu/shader/gen/backdrop_lg.dxil
+++ b/piet-gpu/shader/gen/backdrop_lg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl
index 5071af2..c506403 100644
--- a/piet-gpu/shader/gen/backdrop_lg.hlsl
+++ b/piet-gpu/shader/gen/backdrop_lg.hlsl
@@ -3,17 +3,6 @@
     uint offset;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -42,12 +31,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -55,12 +46,14 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u);
 
-RWByteAddressBuffer _79 : register(u0, space0);
-ByteAddressBuffer _186 : register(t1, space0);
+RWByteAddressBuffer _67 : register(u0, space0);
+ByteAddressBuffer _166 : register(t1, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -89,24 +82,10 @@
     {
         return 0u;
     }
-    uint v = _79.Load(offset * 4 + 8);
+    uint v = _67.Load(offset * 4 + 8);
     return v;
 }
 
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _121;
-}
-
-uint fill_mode_from_flags(uint flags)
-{
-    return flags & 1u;
-}
-
 Path Path_read(Alloc a, PathRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -121,8 +100,8 @@
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _165 = { raw2 };
-    s.tiles = _165;
+    TileRef _134 = { raw2 };
+    s.tiles = _134;
     return s;
 }
 
@@ -141,88 +120,65 @@
     {
         return;
     }
-    _79.Store(offset * 4 + 8, val);
+    _67.Store(offset * 4 + 8, val);
 }
 
 void comp_main()
 {
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _194;
     uint row_count = 0u;
-    bool mem_ok = _79.Load(4) == 0u;
+    bool mem_ok = _67.Load(4) == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _186.Load(0))
+        if (element_ix < _166.Load(0))
         {
-            Alloc _217;
-            _217.offset = _186.Load(32);
+            PathRef _180 = { _166.Load(16) + (element_ix * 12u) };
+            PathRef path_ref = _180;
+            Alloc _185;
+            _185.offset = _166.Load(16);
             Alloc param;
-            param.offset = _217.offset;
-            AnnotatedRef param_1 = ref;
-            AnnotatedTag tag = Annotated_tag(param, param_1);
-            switch (tag.tag)
+            param.offset = _185.offset;
+            PathRef param_1 = path_ref;
+            Path path = Path_read(param, param_1);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            bool _210 = row_count == 1u;
+            bool _216;
+            if (_210)
             {
-                case 3u:
-                case 2u:
-                case 4u:
-                case 1u:
-                {
-                    uint param_2 = tag.flags;
-                    if (fill_mode_from_flags(param_2) != 0u)
-                    {
-                        break;
-                    }
-                    PathRef _243 = { _186.Load(16) + (element_ix * 12u) };
-                    PathRef path_ref = _243;
-                    Alloc _247;
-                    _247.offset = _186.Load(16);
-                    Alloc param_3;
-                    param_3.offset = _247.offset;
-                    PathRef param_4 = path_ref;
-                    Path path = Path_read(param_3, param_4);
-                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                    row_count = path.bbox.w - path.bbox.y;
-                    bool _272 = row_count == 1u;
-                    bool _278;
-                    if (_272)
-                    {
-                        _278 = path.bbox.y > 0u;
-                    }
-                    else
-                    {
-                        _278 = _272;
-                    }
-                    if (_278)
-                    {
-                        row_count = 0u;
-                    }
-                    uint param_5 = path.tiles.offset;
-                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                    bool param_7 = mem_ok;
-                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
-                    sh_row_alloc[th_ix] = path_alloc;
-                    break;
-                }
+                _216 = path.bbox.y > 0u;
             }
+            else
+            {
+                _216 = _210;
+            }
+            if (_216)
+            {
+                row_count = 0u;
+            }
+            uint param_2 = path.tiles.offset;
+            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_4 = mem_ok;
+            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
     for (uint i = 0u; i < 8u; i++)
     {
         GroupMemoryBarrierWithGroupSync();
-        bool _325 = gl_LocalInvocationID.y == 0u;
-        bool _332;
-        if (_325)
+        bool _262 = gl_LocalInvocationID.y == 0u;
+        bool _269;
+        if (_262)
         {
-            _332 = th_ix >= (1u << i);
+            _269 = th_ix >= (1u << i);
         }
         else
         {
-            _332 = _325;
+            _269 = _262;
         }
-        if (_332)
+        if (_269)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -234,7 +190,7 @@
     }
     GroupMemoryBarrierWithGroupSync();
     uint total_rows = sh_row_count[255];
-    uint _411;
+    uint _348;
     for (uint row = th_ix; row < total_rows; row += 1024u)
     {
         uint el_ix = 0u;
@@ -252,27 +208,27 @@
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _411 = sh_row_count[el_ix - 1u];
+                _348 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _411 = 0u;
+                _348 = 0u;
             }
-            uint seq_ix = row - _411;
+            uint seq_ix = row - _348;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_8 = tiles_alloc;
-            uint param_9 = tile_el_ix;
-            uint sum = read_mem(param_8, param_9);
+            Alloc param_5 = tiles_alloc;
+            uint param_6 = tile_el_ix;
+            uint sum = read_mem(param_5, param_6);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_10 = tiles_alloc;
-                uint param_11 = tile_el_ix;
-                sum += read_mem(param_10, param_11);
-                Alloc param_12 = tiles_alloc;
-                uint param_13 = tile_el_ix;
-                uint param_14 = sum;
-                write_mem(param_12, param_13, param_14);
+                Alloc param_7 = tiles_alloc;
+                uint param_8 = tile_el_ix;
+                sum += read_mem(param_7, param_8);
+                Alloc param_9 = tiles_alloc;
+                uint param_10 = tile_el_ix;
+                uint param_11 = sum;
+                write_mem(param_9, param_10, param_11);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl
index 31cd6cd..de43ebe 100644
--- a/piet-gpu/shader/gen/backdrop_lg.msl
+++ b/piet-gpu/shader/gen/backdrop_lg.msl
@@ -10,17 +10,6 @@
     uint offset;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -61,12 +50,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -74,6 +65,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -90,7 +83,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -98,38 +91,23 @@
     {
         return 0u;
     }
-    uint v = v_79.memory[offset];
+    uint v = v_67.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_79);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-uint fill_mode_from_flags(thread const uint& flags)
-{
-    return flags & 1u;
-}
-
-static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_79);
+    uint raw0 = read_mem(param, param_1, v_67);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_79);
+    uint raw1 = read_mem(param_2, param_3, v_67);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_79);
+    uint raw2 = read_mem(param_4, param_5, v_67);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -145,7 +123,7 @@
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -153,85 +131,65 @@
     {
         return;
     }
-    v_79.memory[offset] = val;
+    v_67.memory[offset] = val;
 }
 
-kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_row_width[256];
     threadgroup Alloc sh_row_alloc[256];
     threadgroup uint sh_row_count[256];
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
     uint row_count = 0u;
-    bool mem_ok = v_79.mem_error == 0u;
+    bool mem_ok = v_67.mem_error == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _186.conf.n_elements)
+        if (element_ix < _166.conf.n_elements)
         {
+            PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) };
             Alloc param;
-            param.offset = _186.conf.anno_alloc.offset;
-            AnnotatedRef param_1 = ref;
-            AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
-            switch (tag.tag)
+            param.offset = _166.conf.tile_alloc.offset;
+            PathRef param_1 = path_ref;
+            Path path = Path_read(param, param_1, v_67);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            bool _210 = row_count == 1u;
+            bool _216;
+            if (_210)
             {
-                case 3u:
-                case 2u:
-                case 4u:
-                case 1u:
-                {
-                    uint param_2 = tag.flags;
-                    if (fill_mode_from_flags(param_2) != 0u)
-                    {
-                        break;
-                    }
-                    PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
-                    Alloc param_3;
-                    param_3.offset = _186.conf.tile_alloc.offset;
-                    PathRef param_4 = path_ref;
-                    Path path = Path_read(param_3, param_4, v_79);
-                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-                    row_count = path.bbox.w - path.bbox.y;
-                    bool _272 = row_count == 1u;
-                    bool _278;
-                    if (_272)
-                    {
-                        _278 = path.bbox.y > 0u;
-                    }
-                    else
-                    {
-                        _278 = _272;
-                    }
-                    if (_278)
-                    {
-                        row_count = 0u;
-                    }
-                    uint param_5 = path.tiles.offset;
-                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                    bool param_7 = mem_ok;
-                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
-                    sh_row_alloc[th_ix] = path_alloc;
-                    break;
-                }
+                _216 = path.bbox.y > 0u;
             }
+            else
+            {
+                _216 = _210;
+            }
+            if (_216)
+            {
+                row_count = 0u;
+            }
+            uint param_2 = path.tiles.offset;
+            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_4 = mem_ok;
+            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
     }
     for (uint i = 0u; i < 8u; i++)
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        bool _325 = gl_LocalInvocationID.y == 0u;
-        bool _332;
-        if (_325)
+        bool _262 = gl_LocalInvocationID.y == 0u;
+        bool _269;
+        if (_262)
         {
-            _332 = th_ix >= (1u << i);
+            _269 = th_ix >= (1u << i);
         }
         else
         {
-            _332 = _325;
+            _269 = _262;
         }
-        if (_332)
+        if (_269)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -243,7 +201,7 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint total_rows = sh_row_count[255];
-    uint _411;
+    uint _348;
     for (uint row = th_ix; row < total_rows; row += 1024u)
     {
         uint el_ix = 0u;
@@ -261,27 +219,27 @@
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _411 = sh_row_count[el_ix - 1u];
+                _348 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _411 = 0u;
+                _348 = 0u;
             }
-            uint seq_ix = row - _411;
+            uint seq_ix = row - _348;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_8 = tiles_alloc;
-            uint param_9 = tile_el_ix;
-            uint sum = read_mem(param_8, param_9, v_79);
+            Alloc param_5 = tiles_alloc;
+            uint param_6 = tile_el_ix;
+            uint sum = read_mem(param_5, param_6, v_67);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_10 = tiles_alloc;
-                uint param_11 = tile_el_ix;
-                sum += read_mem(param_10, param_11, v_79);
-                Alloc param_12 = tiles_alloc;
-                uint param_13 = tile_el_ix;
-                uint param_14 = sum;
-                write_mem(param_12, param_13, param_14, v_79);
+                Alloc param_7 = tiles_alloc;
+                uint param_8 = tile_el_ix;
+                sum += read_mem(param_7, param_8, v_67);
+                Alloc param_9 = tiles_alloc;
+                uint param_10 = tile_el_ix;
+                uint param_11 = sum;
+                write_mem(param_9, param_10, param_11, v_67);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv
index 3f90d2e..ff2b1d7 100644
--- a/piet-gpu/shader/gen/backdrop_lg.spv
+++ b/piet-gpu/shader/gen/backdrop_lg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
index 26b5f16..6655b7f 100644
--- a/piet-gpu/shader/gen/bbox_clear.dxil
+++ b/piet-gpu/shader/gen/bbox_clear.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl
index 903d84c..8a884d3 100644
--- a/piet-gpu/shader/gen/bbox_clear.hlsl
+++ b/piet-gpu/shader/gen/bbox_clear.hlsl
@@ -15,12 +15,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -28,6 +30,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
@@ -44,7 +48,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x;
-    if (ix < _21.Load(68))
+    if (ix < _21.Load(76))
     {
         uint out_ix = (_21.Load(40) >> uint(2)) + (6u * ix);
         _45.Store(out_ix * 4 + 8, 65535u);
diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl
index e80f15e..c278c68 100644
--- a/piet-gpu/shader/gen/bbox_clear.msl
+++ b/piet-gpu/shader/gen/bbox_clear.msl
@@ -20,12 +20,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -33,6 +35,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -54,7 +58,7 @@
     uint ix = gl_GlobalInvocationID.x;
     if (ix < _21.conf.n_path)
     {
-        uint out_ix = (_21.conf.bbox_alloc.offset >> uint(2)) + (6u * ix);
+        uint out_ix = (_21.conf.path_bbox_alloc.offset >> uint(2)) + (6u * ix);
         _45.memory[out_ix] = 65535u;
         _45.memory[out_ix + 1u] = 65535u;
         _45.memory[out_ix + 2u] = 0u;
diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv
index e3e88d7..58a270e 100644
--- a/piet-gpu/shader/gen/bbox_clear.spv
+++ b/piet-gpu/shader/gen/bbox_clear.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil
index fe53f27..3050aa8 100644
--- a/piet-gpu/shader/gen/binning.dxil
+++ b/piet-gpu/shader/gen/binning.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl
index e4de1b9..986f42b 100644
--- a/piet-gpu/shader/gen/binning.hlsl
+++ b/piet-gpu/shader/gen/binning.hlsl
@@ -9,17 +9,6 @@
     bool failed;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -34,6 +23,8 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct Config
@@ -48,12 +39,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -61,12 +54,14 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _94 : register(u0, space0);
-ByteAddressBuffer _202 : register(t1, space0);
+RWByteAddressBuffer _81 : register(u0, space0);
+ByteAddressBuffer _156 : register(t1, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -81,59 +76,35 @@
 groupshared uint count[8][256];
 groupshared Alloc sh_chunk_alloc[256];
 
-bool touch_mem(Alloc alloc, uint offset)
-{
-    return true;
-}
-
-uint read_mem(Alloc alloc, uint offset)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return 0u;
-    }
-    uint v = _94.Load(offset * 4 + 8);
-    return v;
-}
-
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _181 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _181;
-}
-
 DrawMonoid load_draw_monoid(uint element_ix)
 {
-    uint base = (_202.Load(44) >> uint(2)) + (2u * element_ix);
-    uint path_ix = _94.Load(base * 4 + 8);
-    uint clip_ix = _94.Load((base + 1u) * 4 + 8);
-    DrawMonoid _222 = { path_ix, clip_ix };
-    return _222;
+    uint base = (_156.Load(44) >> uint(2)) + (4u * element_ix);
+    uint path_ix = _81.Load(base * 4 + 8);
+    uint clip_ix = _81.Load((base + 1u) * 4 + 8);
+    uint scene_offset = _81.Load((base + 2u) * 4 + 8);
+    uint info_offset = _81.Load((base + 3u) * 4 + 8);
+    DrawMonoid _190 = { path_ix, clip_ix, scene_offset, info_offset };
+    return _190;
 }
 
 float4 load_clip_bbox(uint clip_ix)
 {
-    uint base = (_202.Load(60) >> uint(2)) + (4u * clip_ix);
-    float x0 = asfloat(_94.Load(base * 4 + 8));
-    float y0 = asfloat(_94.Load((base + 1u) * 4 + 8));
-    float x1 = asfloat(_94.Load((base + 2u) * 4 + 8));
-    float y1 = asfloat(_94.Load((base + 3u) * 4 + 8));
+    uint base = (_156.Load(60) >> uint(2)) + (4u * clip_ix);
+    float x0 = asfloat(_81.Load(base * 4 + 8));
+    float y0 = asfloat(_81.Load((base + 1u) * 4 + 8));
+    float x1 = asfloat(_81.Load((base + 2u) * 4 + 8));
+    float y1 = asfloat(_81.Load((base + 3u) * 4 + 8));
     float4 bbox = float4(x0, y0, x1, y1);
     return bbox;
 }
 
 float4 load_path_bbox(uint path_ix)
 {
-    uint base = (_202.Load(40) >> uint(2)) + (6u * path_ix);
-    float bbox_l = float(_94.Load(base * 4 + 8)) - 32768.0f;
-    float bbox_t = float(_94.Load((base + 1u) * 4 + 8)) - 32768.0f;
-    float bbox_r = float(_94.Load((base + 2u) * 4 + 8)) - 32768.0f;
-    float bbox_b = float(_94.Load((base + 3u) * 4 + 8)) - 32768.0f;
+    uint base = (_156.Load(40) >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(_81.Load(base * 4 + 8)) - 32768.0f;
+    float bbox_t = float(_81.Load((base + 1u) * 4 + 8)) - 32768.0f;
+    float bbox_r = float(_81.Load((base + 2u) * 4 + 8)) - 32768.0f;
+    float bbox_b = float(_81.Load((base + 3u) * 4 + 8)) - 32768.0f;
     float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
     return bbox;
 }
@@ -143,13 +114,13 @@
     return float4(max(a.xy, b.xy), min(a.zw, b.zw));
 }
 
-void store_path_bbox(AnnotatedRef ref, float4 bbox)
+void store_draw_bbox(uint draw_ix, float4 bbox)
 {
-    uint ix = ref.offset >> uint(2);
-    _94.Store((ix + 1u) * 4 + 8, asuint(bbox.x));
-    _94.Store((ix + 2u) * 4 + 8, asuint(bbox.y));
-    _94.Store((ix + 3u) * 4 + 8, asuint(bbox.z));
-    _94.Store((ix + 4u) * 4 + 8, asuint(bbox.w));
+    uint base = (_156.Load(64) >> uint(2)) + (4u * draw_ix);
+    _81.Store(base * 4 + 8, asuint(bbox.x));
+    _81.Store((base + 1u) * 4 + 8, asuint(bbox.y));
+    _81.Store((base + 2u) * 4 + 8, asuint(bbox.z));
+    _81.Store((base + 3u) * 4 + 8, asuint(bbox.w));
 }
 
 Alloc new_alloc(uint offset, uint size, bool mem_ok)
@@ -161,27 +132,32 @@
 
 MallocResult malloc(uint size)
 {
-    uint _100;
-    _94.InterlockedAdd(0, size, _100);
-    uint offset = _100;
-    uint _107;
-    _94.GetDimensions(_107);
-    _107 = (_107 - 8) / 4;
+    uint _87;
+    _81.InterlockedAdd(0, size, _87);
+    uint offset = _87;
+    uint _94;
+    _81.GetDimensions(_94);
+    _94 = (_94 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_107) * 4);
+    r.failed = (offset + size) > uint(int(_94) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _129;
-        _94.InterlockedMax(4, 1u, _129);
+        uint _116;
+        _81.InterlockedMax(4, 1u, _116);
         return r;
     }
     return r;
 }
 
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
 void write_mem(Alloc alloc, uint offset, uint val)
 {
     Alloc param = alloc;
@@ -190,7 +166,7 @@
     {
         return;
     }
-    _94.Store(offset * 4 + 8, val);
+    _81.Store(offset * 4 + 8, val);
 }
 
 void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s)
@@ -204,7 +180,6 @@
 
 void comp_main()
 {
-    uint my_n_elements = _202.Load(0);
     uint my_partition = gl_WorkGroupID.x;
     for (uint i = 0u; i < 8u; i++)
     {
@@ -216,62 +191,42 @@
     }
     GroupMemoryBarrierWithGroupSync();
     uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
-    AnnotatedRef _415 = { _202.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _415;
-    uint tag = 0u;
-    if (element_ix < my_n_elements)
-    {
-        Alloc _425;
-        _425.offset = _202.Load(32);
-        Alloc param;
-        param.offset = _425.offset;
-        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1).tag;
-    }
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    switch (tag)
+    if (element_ix < _156.Load(0))
     {
-        case 1u:
-        case 2u:
-        case 3u:
-        case 4u:
-        case 5u:
+        uint param = element_ix;
+        DrawMonoid draw_monoid = load_draw_monoid(param);
+        uint path_ix = draw_monoid.path_ix;
+        float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
+        uint clip_ix = draw_monoid.clip_ix;
+        if (clip_ix > 0u)
         {
-            uint param_2 = element_ix;
-            DrawMonoid draw_monoid = load_draw_monoid(param_2);
-            uint path_ix = draw_monoid.path_ix;
-            float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
-            uint clip_ix = draw_monoid.clip_ix;
-            if (clip_ix > 0u)
-            {
-                uint param_3 = clip_ix - 1u;
-                clip_bbox = load_clip_bbox(param_3);
-            }
-            uint param_4 = path_ix;
-            float4 path_bbox = load_path_bbox(param_4);
-            float4 param_5 = path_bbox;
-            float4 param_6 = clip_bbox;
-            float4 bbox = bbox_intersect(param_5, param_6);
-            float4 _473 = bbox;
-            float4 _475 = bbox;
-            float2 _477 = max(_473.xy, _475.zw);
-            bbox.z = _477.x;
-            bbox.w = _477.y;
-            AnnotatedRef param_7 = ref;
-            float4 param_8 = bbox;
-            store_path_bbox(param_7, param_8);
-            x0 = int(floor(bbox.x * 0.00390625f));
-            y0 = int(floor(bbox.y * 0.00390625f));
-            x1 = int(ceil(bbox.z * 0.00390625f));
-            y1 = int(ceil(bbox.w * 0.00390625f));
-            break;
+            uint param_1 = clip_ix - 1u;
+            clip_bbox = load_clip_bbox(param_1);
         }
+        uint param_2 = path_ix;
+        float4 path_bbox = load_path_bbox(param_2);
+        float4 param_3 = path_bbox;
+        float4 param_4 = clip_bbox;
+        float4 bbox = bbox_intersect(param_3, param_4);
+        float4 _417 = bbox;
+        float4 _419 = bbox;
+        float2 _421 = max(_417.xy, _419.zw);
+        bbox.z = _421.x;
+        bbox.w = _421.y;
+        uint param_5 = element_ix;
+        float4 param_6 = bbox;
+        store_draw_bbox(param_5, param_6);
+        x0 = int(floor(bbox.x * 0.00390625f));
+        y0 = int(floor(bbox.y * 0.00390625f));
+        x1 = int(ceil(bbox.z * 0.00390625f));
+        y1 = int(ceil(bbox.w * 0.00390625f));
     }
-    uint width_in_bins = ((_202.Load(8) + 16u) - 1u) / 16u;
-    uint height_in_bins = ((_202.Load(12) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_156.Load(8) + 16u) - 1u) / 16u;
+    uint height_in_bins = ((_156.Load(12) + 16u) - 1u) / 16u;
     x0 = clamp(x0, 0, int(width_in_bins));
     x1 = clamp(x1, x0, int(width_in_bins));
     y0 = clamp(y0, 0, int(height_in_bins));
@@ -286,8 +241,8 @@
     uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
     while (y < y1)
     {
-        uint _581;
-        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _581);
+        uint _523;
+        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _523);
         x++;
         if (x == x1)
         {
@@ -302,15 +257,15 @@
         element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x])));
         count[i_1][gl_LocalInvocationID.x] = element_count;
     }
-    uint param_9 = 0u;
-    uint param_10 = 0u;
-    bool param_11 = true;
-    Alloc chunk_alloc = new_alloc(param_9, param_10, param_11);
+    uint param_7 = 0u;
+    uint param_8 = 0u;
+    bool param_9 = true;
+    Alloc chunk_alloc = new_alloc(param_7, param_8, param_9);
     if (element_count != 0u)
     {
-        uint param_12 = element_count * 4u;
-        MallocResult _631 = malloc(param_12);
-        MallocResult chunk = _631;
+        uint param_10 = element_count * 4u;
+        MallocResult _573 = malloc(param_10);
+        MallocResult chunk = _573;
         chunk_alloc = chunk.alloc;
         sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
         if (chunk.failed)
@@ -318,32 +273,32 @@
             sh_alloc_failed = true;
         }
     }
-    uint out_ix = (_202.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
-    Alloc _660;
-    _660.offset = _202.Load(20);
-    Alloc param_13;
-    param_13.offset = _660.offset;
-    uint param_14 = out_ix;
-    uint param_15 = element_count;
-    write_mem(param_13, param_14, param_15);
-    Alloc _672;
-    _672.offset = _202.Load(20);
-    Alloc param_16;
-    param_16.offset = _672.offset;
-    uint param_17 = out_ix + 1u;
-    uint param_18 = chunk_alloc.offset;
-    write_mem(param_16, param_17, param_18);
+    uint out_ix = (_156.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc _603;
+    _603.offset = _156.Load(20);
+    Alloc param_11;
+    param_11.offset = _603.offset;
+    uint param_12 = out_ix;
+    uint param_13 = element_count;
+    write_mem(param_11, param_12, param_13);
+    Alloc _615;
+    _615.offset = _156.Load(20);
+    Alloc param_14;
+    param_14.offset = _615.offset;
+    uint param_15 = out_ix + 1u;
+    uint param_16 = chunk_alloc.offset;
+    write_mem(param_14, param_15, param_16);
     GroupMemoryBarrierWithGroupSync();
-    bool _687;
+    bool _630;
     if (!sh_alloc_failed)
     {
-        _687 = _94.Load(4) != 0u;
+        _630 = _81.Load(4) != 0u;
     }
     else
     {
-        _687 = sh_alloc_failed;
+        _630 = sh_alloc_failed;
     }
-    if (_687)
+    if (_630)
     {
         return;
     }
@@ -362,12 +317,12 @@
             }
             Alloc out_alloc = sh_chunk_alloc[bin_ix];
             uint out_offset = out_alloc.offset + (idx * 4u);
-            BinInstanceRef _749 = { out_offset };
-            BinInstance _751 = { element_ix };
-            Alloc param_19 = out_alloc;
-            BinInstanceRef param_20 = _749;
-            BinInstance param_21 = _751;
-            BinInstance_write(param_19, param_20, param_21);
+            BinInstanceRef _692 = { out_offset };
+            BinInstance _694 = { element_ix };
+            Alloc param_17 = out_alloc;
+            BinInstanceRef param_18 = _692;
+            BinInstance param_19 = _694;
+            BinInstance_write(param_17, param_18, param_19);
         }
         x++;
         if (x == x1)
diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl
index 0e3b6c8..2ee5168 100644
--- a/piet-gpu/shader/gen/binning.msl
+++ b/piet-gpu/shader/gen/binning.msl
@@ -18,17 +18,6 @@
     bool failed;
 };
 
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -43,6 +32,8 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct Memory
@@ -69,12 +60,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -82,6 +75,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -92,62 +87,36 @@
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
 {
-    return true;
+    uint base = (v_156.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix);
+    uint path_ix = v_81.memory[base];
+    uint clip_ix = v_81.memory[base + 1u];
+    uint scene_offset = v_81.memory[base + 2u];
+    uint info_offset = v_81.memory[base + 3u];
+    return DrawMonoid{ path_ix, clip_ix, scene_offset, info_offset };
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_94, constant uint& v_94BufferSize)
+float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
 {
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return 0u;
-    }
-    uint v = v_94.memory[offset];
-    return v;
-}
-
-static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_94, constant uint& v_94BufferSize)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_94, v_94BufferSize);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
-{
-    uint base = (v_202.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
-    uint path_ix = v_94.memory[base];
-    uint clip_ix = v_94.memory[base + 1u];
-    return DrawMonoid{ path_ix, clip_ix };
-}
-
-static inline __attribute__((always_inline))
-float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
-{
-    uint base = (v_202.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix);
-    float x0 = as_type<float>(v_94.memory[base]);
-    float y0 = as_type<float>(v_94.memory[base + 1u]);
-    float x1 = as_type<float>(v_94.memory[base + 2u]);
-    float y1 = as_type<float>(v_94.memory[base + 3u]);
+    uint base = (v_156.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix);
+    float x0 = as_type<float>(v_81.memory[base]);
+    float y0 = as_type<float>(v_81.memory[base + 1u]);
+    float x1 = as_type<float>(v_81.memory[base + 2u]);
+    float y1 = as_type<float>(v_81.memory[base + 3u]);
     float4 bbox = float4(x0, y0, x1, y1);
     return bbox;
 }
 
 static inline __attribute__((always_inline))
-float4 load_path_bbox(thread const uint& path_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
+float4 load_path_bbox(thread const uint& path_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
 {
-    uint base = (v_202.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
-    float bbox_l = float(v_94.memory[base]) - 32768.0;
-    float bbox_t = float(v_94.memory[base + 1u]) - 32768.0;
-    float bbox_r = float(v_94.memory[base + 2u]) - 32768.0;
-    float bbox_b = float(v_94.memory[base + 3u]) - 32768.0;
+    uint base = (v_156.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(v_81.memory[base]) - 32768.0;
+    float bbox_t = float(v_81.memory[base + 1u]) - 32768.0;
+    float bbox_r = float(v_81.memory[base + 2u]) - 32768.0;
+    float bbox_b = float(v_81.memory[base + 3u]) - 32768.0;
     float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
     return bbox;
 }
@@ -159,13 +128,13 @@
 }
 
 static inline __attribute__((always_inline))
-void store_path_bbox(thread const AnnotatedRef& ref, thread const float4& bbox, device Memory& v_94, constant uint& v_94BufferSize)
+void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
 {
-    uint ix = ref.offset >> uint(2);
-    v_94.memory[ix + 1u] = as_type<uint>(bbox.x);
-    v_94.memory[ix + 2u] = as_type<uint>(bbox.y);
-    v_94.memory[ix + 3u] = as_type<uint>(bbox.z);
-    v_94.memory[ix + 4u] = as_type<uint>(bbox.w);
+    uint base = (v_156.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
+    v_81.memory[base] = as_type<uint>(bbox.x);
+    v_81.memory[base + 1u] = as_type<uint>(bbox.y);
+    v_81.memory[base + 2u] = as_type<uint>(bbox.z);
+    v_81.memory[base + 3u] = as_type<uint>(bbox.w);
 }
 
 static inline __attribute__((always_inline))
@@ -177,26 +146,32 @@
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_94, constant uint& v_94BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_81, constant uint& v_81BufferSize)
 {
-    uint _100 = atomic_fetch_add_explicit((device atomic_uint*)&v_94.mem_offset, size, memory_order_relaxed);
-    uint offset = _100;
+    uint _87 = atomic_fetch_add_explicit((device atomic_uint*)&v_81.mem_offset, size, memory_order_relaxed);
+    uint offset = _87;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_94BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_81BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _129 = atomic_fetch_max_explicit((device atomic_uint*)&v_94.mem_error, 1u, memory_order_relaxed);
+        uint _116 = atomic_fetch_max_explicit((device atomic_uint*)&v_81.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_94, constant uint& v_94BufferSize)
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_81, constant uint& v_81BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -204,27 +179,26 @@
     {
         return;
     }
-    v_94.memory[offset] = val;
+    v_81.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_94, constant uint& v_94BufferSize)
+void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_81, constant uint& v_81BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.element_ix;
-    write_mem(param, param_1, param_2, v_94, v_94BufferSize);
+    write_mem(param, param_1, param_2, v_81, v_81BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_94 [[buffer(0)]], const device ConfigBuf& v_202 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_81 [[buffer(0)]], const device ConfigBuf& v_156 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint bitmaps[8][256];
     threadgroup short sh_alloc_failed;
     threadgroup uint count[8][256];
     threadgroup Alloc sh_chunk_alloc[256];
-    constant uint& v_94BufferSize = spvBufferSizeConstants[0];
-    uint my_n_elements = v_202.conf.n_elements;
+    constant uint& v_81BufferSize = spvBufferSizeConstants[0];
     uint my_partition = gl_WorkGroupID.x;
     for (uint i = 0u; i < 8u; i++)
     {
@@ -236,59 +210,42 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef{ v_202.conf.anno_alloc.offset + (element_ix * 40u) };
-    uint tag = 0u;
-    if (element_ix < my_n_elements)
-    {
-        Alloc param;
-        param.offset = v_202.conf.anno_alloc.offset;
-        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1, v_94, v_94BufferSize).tag;
-    }
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    switch (tag)
+    if (element_ix < v_156.conf.n_elements)
     {
-        case 1u:
-        case 2u:
-        case 3u:
-        case 4u:
-        case 5u:
+        uint param = element_ix;
+        DrawMonoid draw_monoid = load_draw_monoid(param, v_81, v_81BufferSize, v_156);
+        uint path_ix = draw_monoid.path_ix;
+        float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
+        uint clip_ix = draw_monoid.clip_ix;
+        if (clip_ix > 0u)
         {
-            uint param_2 = element_ix;
-            DrawMonoid draw_monoid = load_draw_monoid(param_2, v_94, v_94BufferSize, v_202);
-            uint path_ix = draw_monoid.path_ix;
-            float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
-            uint clip_ix = draw_monoid.clip_ix;
-            if (clip_ix > 0u)
-            {
-                uint param_3 = clip_ix - 1u;
-                clip_bbox = load_clip_bbox(param_3, v_94, v_94BufferSize, v_202);
-            }
-            uint param_4 = path_ix;
-            float4 path_bbox = load_path_bbox(param_4, v_94, v_94BufferSize, v_202);
-            float4 param_5 = path_bbox;
-            float4 param_6 = clip_bbox;
-            float4 bbox = bbox_intersect(param_5, param_6);
-            float4 _473 = bbox;
-            float4 _475 = bbox;
-            float2 _477 = fast::max(_473.xy, _475.zw);
-            bbox.z = _477.x;
-            bbox.w = _477.y;
-            AnnotatedRef param_7 = ref;
-            float4 param_8 = bbox;
-            store_path_bbox(param_7, param_8, v_94, v_94BufferSize);
-            x0 = int(floor(bbox.x * 0.00390625));
-            y0 = int(floor(bbox.y * 0.00390625));
-            x1 = int(ceil(bbox.z * 0.00390625));
-            y1 = int(ceil(bbox.w * 0.00390625));
-            break;
+            uint param_1 = clip_ix - 1u;
+            clip_bbox = load_clip_bbox(param_1, v_81, v_81BufferSize, v_156);
         }
+        uint param_2 = path_ix;
+        float4 path_bbox = load_path_bbox(param_2, v_81, v_81BufferSize, v_156);
+        float4 param_3 = path_bbox;
+        float4 param_4 = clip_bbox;
+        float4 bbox = bbox_intersect(param_3, param_4);
+        float4 _417 = bbox;
+        float4 _419 = bbox;
+        float2 _421 = fast::max(_417.xy, _419.zw);
+        bbox.z = _421.x;
+        bbox.w = _421.y;
+        uint param_5 = element_ix;
+        float4 param_6 = bbox;
+        store_draw_bbox(param_5, param_6, v_81, v_81BufferSize, v_156);
+        x0 = int(floor(bbox.x * 0.00390625));
+        y0 = int(floor(bbox.y * 0.00390625));
+        x1 = int(ceil(bbox.z * 0.00390625));
+        y1 = int(ceil(bbox.w * 0.00390625));
     }
-    uint width_in_bins = ((v_202.conf.width_in_tiles + 16u) - 1u) / 16u;
-    uint height_in_bins = ((v_202.conf.height_in_tiles + 16u) - 1u) / 16u;
+    uint width_in_bins = ((v_156.conf.width_in_tiles + 16u) - 1u) / 16u;
+    uint height_in_bins = ((v_156.conf.height_in_tiles + 16u) - 1u) / 16u;
     x0 = clamp(x0, 0, int(width_in_bins));
     x1 = clamp(x1, x0, int(width_in_bins));
     y0 = clamp(y0, 0, int(height_in_bins));
@@ -303,7 +260,7 @@
     uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
     while (y < y1)
     {
-        uint _581 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
+        uint _523 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
         x++;
         if (x == x1)
         {
@@ -318,15 +275,15 @@
         element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
         count[i_1][gl_LocalInvocationID.x] = element_count;
     }
-    uint param_9 = 0u;
-    uint param_10 = 0u;
-    bool param_11 = true;
-    Alloc chunk_alloc = new_alloc(param_9, param_10, param_11);
+    uint param_7 = 0u;
+    uint param_8 = 0u;
+    bool param_9 = true;
+    Alloc chunk_alloc = new_alloc(param_7, param_8, param_9);
     if (element_count != 0u)
     {
-        uint param_12 = element_count * 4u;
-        MallocResult _631 = malloc(param_12, v_94, v_94BufferSize);
-        MallocResult chunk = _631;
+        uint param_10 = element_count * 4u;
+        MallocResult _573 = malloc(param_10, v_81, v_81BufferSize);
+        MallocResult chunk = _573;
         chunk_alloc = chunk.alloc;
         sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
         if (chunk.failed)
@@ -334,28 +291,28 @@
             sh_alloc_failed = short(true);
         }
     }
-    uint out_ix = (v_202.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
-    Alloc param_13;
-    param_13.offset = v_202.conf.bin_alloc.offset;
-    uint param_14 = out_ix;
-    uint param_15 = element_count;
-    write_mem(param_13, param_14, param_15, v_94, v_94BufferSize);
-    Alloc param_16;
-    param_16.offset = v_202.conf.bin_alloc.offset;
-    uint param_17 = out_ix + 1u;
-    uint param_18 = chunk_alloc.offset;
-    write_mem(param_16, param_17, param_18, v_94, v_94BufferSize);
+    uint out_ix = (v_156.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc param_11;
+    param_11.offset = v_156.conf.bin_alloc.offset;
+    uint param_12 = out_ix;
+    uint param_13 = element_count;
+    write_mem(param_11, param_12, param_13, v_81, v_81BufferSize);
+    Alloc param_14;
+    param_14.offset = v_156.conf.bin_alloc.offset;
+    uint param_15 = out_ix + 1u;
+    uint param_16 = chunk_alloc.offset;
+    write_mem(param_14, param_15, param_16, v_81, v_81BufferSize);
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    bool _687;
+    bool _630;
     if (!bool(sh_alloc_failed))
     {
-        _687 = v_94.mem_error != 0u;
+        _630 = v_81.mem_error != 0u;
     }
     else
     {
-        _687 = bool(sh_alloc_failed);
+        _630 = bool(sh_alloc_failed);
     }
-    if (_687)
+    if (_630)
     {
         return;
     }
@@ -374,10 +331,10 @@
             }
             Alloc out_alloc = sh_chunk_alloc[bin_ix];
             uint out_offset = out_alloc.offset + (idx * 4u);
-            Alloc param_19 = out_alloc;
-            BinInstanceRef param_20 = BinInstanceRef{ out_offset };
-            BinInstance param_21 = BinInstance{ element_ix };
-            BinInstance_write(param_19, param_20, param_21, v_94, v_94BufferSize);
+            Alloc param_17 = out_alloc;
+            BinInstanceRef param_18 = BinInstanceRef{ out_offset };
+            BinInstance param_19 = BinInstance{ element_ix };
+            BinInstance_write(param_17, param_18, param_19, v_81, v_81BufferSize);
         }
         x++;
         if (x == x1)
diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv
index eca0692..30eacd6 100644
--- a/piet-gpu/shader/gen/binning.spv
+++ b/piet-gpu/shader/gen/binning.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil
index fe87a11..29a158e 100644
--- a/piet-gpu/shader/gen/clip_leaf.dxil
+++ b/piet-gpu/shader/gen/clip_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_leaf.hlsl b/piet-gpu/shader/gen/clip_leaf.hlsl
index d570420..ed45bf1 100644
--- a/piet-gpu/shader/gen/clip_leaf.hlsl
+++ b/piet-gpu/shader/gen/clip_leaf.hlsl
@@ -27,12 +27,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -40,6 +42,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
@@ -99,7 +103,7 @@
 
 uint load_path_ix(uint ix)
 {
-    if (ix < _80.Load(72))
+    if (ix < _80.Load(80))
     {
         return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 8);
     }
@@ -324,7 +328,7 @@
     bool _725;
     if (_717)
     {
-        _725 = gl_GlobalInvocationID.x < _80.Load(72);
+        _725 = gl_GlobalInvocationID.x < _80.Load(80);
     }
     else
     {
@@ -334,7 +338,7 @@
     {
         uint param_15 = parent;
         path_ix = load_path_ix(param_15);
-        uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (2u * (~inp));
+        uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (4u * (~inp));
         _96.Store(drawmonoid_out_base * 4 + 8, path_ix);
         if (int(grandparent) >= 0)
         {
diff --git a/piet-gpu/shader/gen/clip_leaf.msl b/piet-gpu/shader/gen/clip_leaf.msl
index 4e2d059..5f5e0a7 100644
--- a/piet-gpu/shader/gen/clip_leaf.msl
+++ b/piet-gpu/shader/gen/clip_leaf.msl
@@ -34,12 +34,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -47,6 +49,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -112,7 +116,7 @@
 static inline __attribute__((always_inline))
 float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96)
 {
-    uint base = (v_80.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
+    uint base = (v_80.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
     float bbox_l = float(v_96.memory[base]) - 32768.0;
     float bbox_t = float(v_96.memory[base + 1u]) - 32768.0;
     float bbox_r = float(v_96.memory[base + 2u]) - 32768.0;
@@ -341,7 +345,7 @@
     {
         uint param_15 = parent;
         path_ix = load_path_ix(param_15, v_80, v_96);
-        uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * (~inp));
+        uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * (~inp));
         v_96.memory[drawmonoid_out_base] = path_ix;
         if (int(grandparent) >= 0)
         {
diff --git a/piet-gpu/shader/gen/clip_leaf.spv b/piet-gpu/shader/gen/clip_leaf.spv
index 7c4c174..beac64b 100644
--- a/piet-gpu/shader/gen/clip_leaf.spv
+++ b/piet-gpu/shader/gen/clip_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_reduce.hlsl b/piet-gpu/shader/gen/clip_reduce.hlsl
index 6851e63..1276b5f 100644
--- a/piet-gpu/shader/gen/clip_reduce.hlsl
+++ b/piet-gpu/shader/gen/clip_reduce.hlsl
@@ -27,12 +27,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -40,6 +42,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
diff --git a/piet-gpu/shader/gen/clip_reduce.msl b/piet-gpu/shader/gen/clip_reduce.msl
index 5845676..26214f1 100644
--- a/piet-gpu/shader/gen/clip_reduce.msl
+++ b/piet-gpu/shader/gen/clip_reduce.msl
@@ -34,12 +34,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -47,6 +49,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -81,7 +85,7 @@
 static inline __attribute__((always_inline))
 float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_64, device Memory& v_80)
 {
-    uint base = (v_64.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
+    uint base = (v_64.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
     float bbox_l = float(v_80.memory[base]) - 32768.0;
     float bbox_t = float(v_80.memory[base + 1u]) - 32768.0;
     float bbox_r = float(v_80.memory[base + 2u]) - 32768.0;
diff --git a/piet-gpu/shader/gen/clip_reduce.spv b/piet-gpu/shader/gen/clip_reduce.spv
index cbe8c1f..ce0b9bb 100644
--- a/piet-gpu/shader/gen/clip_reduce.spv
+++ b/piet-gpu/shader/gen/clip_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index c7e1682..12e88dd 100644
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index f00eabe..a702df5 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -9,68 +9,6 @@
     bool failed;
 };
 
-struct AnnoImageRef
-{
-    uint offset;
-};
-
-struct AnnoImage
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    int2 offset;
-};
-
-struct AnnoColorRef
-{
-    uint offset;
-};
-
-struct AnnoColor
-{
-    float4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-struct AnnoLinGradientRef
-{
-    uint offset;
-};
-
-struct AnnoLinGradient
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    float line_x;
-    float line_y;
-    float line_c;
-};
-
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -201,12 +139,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -214,12 +154,15 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _308 : register(u0, space0);
-ByteAddressBuffer _1283 : register(t1, space0);
+RWByteAddressBuffer _242 : register(u0, space0);
+ByteAddressBuffer _854 : register(t1, space0);
+ByteAddressBuffer _1222 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -242,8 +185,8 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _385 = { a.offset + offset };
-    return _385;
+    Alloc _319 = { a.offset + offset };
+    return _319;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -259,7 +202,7 @@
     {
         return 0u;
     }
-    uint v = _308.Load(offset * 4 + 8);
+    uint v = _242.Load(offset * 4 + 8);
     return v;
 }
 
@@ -272,8 +215,8 @@
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _765 = { ref.offset + (index * 4u) };
-    return _765;
+    BinInstanceRef _328 = { ref.offset + (index * 4u) };
+    return _328;
 }
 
 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@@ -287,15 +230,6 @@
     return s;
 }
 
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _717 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _717;
-}
-
 Path Path_read(Alloc a, PathRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -310,8 +244,8 @@
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _825 = { raw2 };
-    s.tiles = _825;
+    TileRef _391 = { raw2 };
+    s.tiles = _391;
     return s;
 }
 
@@ -321,11 +255,11 @@
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok)
 {
-    uint _1169;
-    _308.GetDimensions(_1169);
-    _1169 = (_1169 - 8) / 4;
+    uint _741;
+    _242.GetDimensions(_741);
+    _741 = (_741 - 8) / 4;
     uint param = 0u;
-    uint param_1 = uint(int(_1169) * 4);
+    uint param_1 = uint(int(_741) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
@@ -339,67 +273,31 @@
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
     uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _850 = { raw0 };
+    TileSegRef _416 = { raw0 };
     Tile s;
-    s.tile = _850;
+    s.tile = _416;
     s.backdrop = int(raw1);
     return s;
 }
 
-AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11);
-    AnnoColor s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.linewidth = asfloat(raw4);
-    s.rgba_color = raw5;
-    return s;
-}
-
-AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoColorRef _723 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoColorRef param_1 = _723;
-    return AnnoColor_read(param, param_1);
-}
-
 MallocResult malloc(uint size)
 {
-    uint _314;
-    _308.InterlockedAdd(0, size, _314);
-    uint offset = _314;
-    uint _321;
-    _308.GetDimensions(_321);
-    _321 = (_321 - 8) / 4;
+    uint _248;
+    _242.InterlockedAdd(0, size, _248);
+    uint offset = _248;
+    uint _255;
+    _242.GetDimensions(_255);
+    _255 = (_255 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_321) * 4);
+    r.failed = (offset + size) > uint(int(_255) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _343;
-        _308.InterlockedMax(4, 1u, _343);
+        uint _277;
+        _242.InterlockedMax(4, 1u, _277);
         return r;
     }
     return r;
@@ -413,7 +311,7 @@
     {
         return;
     }
-    _308.Store(offset * 4 + 8, val);
+    _242.Store(offset * 4 + 8, val);
 }
 
 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@@ -431,9 +329,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
     write_mem(param, param_1, param_2);
-    CmdJumpRef _1162 = { ref.offset + 4u };
+    CmdJumpRef _734 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdJumpRef param_4 = _1162;
+    CmdJumpRef param_4 = _734;
     CmdJump param_5 = s;
     CmdJump_write(param_3, param_4, param_5);
 }
@@ -445,30 +343,25 @@
         return true;
     }
     uint param = 1024u;
-    MallocResult _1190 = malloc(param);
-    MallocResult new_cmd = _1190;
+    MallocResult _762 = malloc(param);
+    MallocResult new_cmd = _762;
     if (new_cmd.failed)
     {
         return false;
     }
-    CmdJump _1200 = { new_cmd.alloc.offset };
-    CmdJump jump = _1200;
+    CmdJump _772 = { new_cmd.alloc.offset };
+    CmdJump jump = _772;
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
     Cmd_Jump_write(param_1, param_2, param_3);
     cmd_alloc = new_cmd.alloc;
-    CmdRef _1212 = { cmd_alloc.offset };
-    cmd_ref = _1212;
+    CmdRef _784 = { cmd_alloc.offset };
+    cmd_ref = _784;
     cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
     return true;
 }
 
-uint fill_mode_from_flags(uint flags)
-{
-    return flags & 1u;
-}
-
 void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s)
 {
     uint ix = ref.offset >> uint(2);
@@ -488,9 +381,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
     write_mem(param, param_1, param_2);
-    CmdFillRef _1036 = { ref.offset + 4u };
+    CmdFillRef _604 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdFillRef param_4 = _1036;
+    CmdFillRef param_4 = _604;
     CmdFill param_5 = s;
     CmdFill_write(param_3, param_4, param_5);
 }
@@ -522,44 +415,43 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
     write_mem(param, param_1, param_2);
-    CmdStrokeRef _1054 = { ref.offset + 4u };
+    CmdStrokeRef _622 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdStrokeRef param_4 = _1054;
+    CmdStrokeRef param_4 = _622;
     CmdStroke param_5 = s;
     CmdStroke_write(param_3, param_4, param_5);
 }
 
-void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth)
+void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
 {
-    uint param = flags;
-    if (fill_mode_from_flags(param) == 0u)
+    if (linewidth < 0.0f)
     {
         if (tile.tile.offset != 0u)
         {
-            CmdFill _1236 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _1236;
-            Alloc param_1 = alloc;
-            CmdRef param_2 = cmd_ref;
-            CmdFill param_3 = cmd_fill;
-            Cmd_Fill_write(param_1, param_2, param_3);
+            CmdFill _807 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _807;
+            Alloc param = alloc;
+            CmdRef param_1 = cmd_ref;
+            CmdFill param_2 = cmd_fill;
+            Cmd_Fill_write(param, param_1, param_2);
             cmd_ref.offset += 12u;
         }
         else
         {
-            Alloc param_4 = alloc;
-            CmdRef param_5 = cmd_ref;
-            Cmd_Solid_write(param_4, param_5);
+            Alloc param_3 = alloc;
+            CmdRef param_4 = cmd_ref;
+            Cmd_Solid_write(param_3, param_4);
             cmd_ref.offset += 4u;
         }
     }
     else
     {
-        CmdStroke _1266 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _1266;
-        Alloc param_6 = alloc;
-        CmdRef param_7 = cmd_ref;
-        CmdStroke param_8 = cmd_stroke;
-        Cmd_Stroke_write(param_6, param_7, param_8);
+        CmdStroke _837 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _837;
+        Alloc param_5 = alloc;
+        CmdRef param_6 = cmd_ref;
+        CmdStroke param_7 = cmd_stroke;
+        Cmd_Stroke_write(param_5, param_6, param_7);
         cmd_ref.offset += 12u;
     }
 }
@@ -579,61 +471,13 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
     write_mem(param, param_1, param_2);
-    CmdColorRef _1080 = { ref.offset + 4u };
+    CmdColorRef _649 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdColorRef param_4 = _1080;
+    CmdColorRef param_4 = _649;
     CmdColor param_5 = s;
     CmdColor_write(param_3, param_4, param_5);
 }
 
-AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13);
-    Alloc param_14 = a;
-    uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15);
-    Alloc param_16 = a;
-    uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17);
-    AnnoLinGradient s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.linewidth = asfloat(raw4);
-    s.index = raw5;
-    s.line_x = asfloat(raw6);
-    s.line_y = asfloat(raw7);
-    s.line_c = asfloat(raw8);
-    return s;
-}
-
-AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoLinGradientRef _733 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoLinGradientRef param_1 = _733;
-    return AnnoLinGradient_read(param, param_1);
-}
-
 void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s)
 {
     uint ix = ref.offset >> uint(2);
@@ -661,53 +505,13 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
     write_mem(param, param_1, param_2);
-    CmdLinGradRef _1098 = { ref.offset + 4u };
+    CmdLinGradRef _668 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdLinGradRef param_4 = _1098;
+    CmdLinGradRef param_4 = _668;
     CmdLinGrad param_5 = s;
     CmdLinGrad_write(param_3, param_4, param_5);
 }
 
-AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13);
-    AnnoImage s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.linewidth = asfloat(raw4);
-    s.index = raw5;
-    s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16);
-    return s;
-}
-
-AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoImageRef _743 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoImageRef param_1 = _743;
-    return AnnoImage_read(param, param_1);
-}
-
 void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s)
 {
     uint ix = ref.offset >> uint(2);
@@ -727,9 +531,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
     write_mem(param, param_1, param_2);
-    CmdImageRef _1116 = { ref.offset + 4u };
+    CmdImageRef _687 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdImageRef param_4 = _1116;
+    CmdImageRef param_4 = _687;
     CmdImage param_5 = s;
     CmdImage_write(param_3, param_4, param_5);
 }
@@ -742,38 +546,6 @@
     write_mem(param, param_1, param_2);
 }
 
-AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    AnnoEndClip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.blend = raw4;
-    return s;
-}
-
-AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoEndClipRef _753 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoEndClipRef param_1 = _753;
-    return AnnoEndClip_read(param, param_1);
-}
-
 void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s)
 {
     uint ix = ref.offset >> uint(2);
@@ -789,9 +561,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
     write_mem(param, param_1, param_2);
-    CmdEndClipRef _1143 = { ref.offset + 4u };
+    CmdEndClipRef _715 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdEndClipRef param_4 = _1143;
+    CmdEndClipRef param_4 = _715;
     CmdEndClip param_5 = s;
     CmdEndClip_write(param_3, param_4, param_5);
 }
@@ -806,25 +578,25 @@
 
 void comp_main()
 {
-    uint width_in_bins = ((_1283.Load(8) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_854.Load(8) + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1283.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_854.Load(0) + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1283.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _1348;
-    _1348.offset = _1283.Load(24);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _919;
+    _919.offset = _854.Load(24);
     Alloc param;
-    param.offset = _1348.offset;
+    param.offset = _919.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1357 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1357;
+    CmdRef _928 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _928;
     uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
@@ -832,59 +604,54 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    bool mem_ok = _308.Load(4) == 0u;
+    uint drawmonoid_start = _854.Load(44) >> uint(2);
+    uint drawtag_start = _854.Load(100) >> uint(2);
+    uint drawdata_start = _854.Load(104) >> uint(2);
+    uint drawinfo_start = _854.Load(68) >> uint(2);
+    bool mem_ok = _242.Load(4) == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1562;
+    uint _1154;
     uint element_ix;
-    AnnotatedRef ref;
     Alloc param_14;
-    Alloc param_16;
     uint tile_count;
-    Alloc param_23;
-    uint _1887;
-    Alloc param_29;
-    Tile tile_1;
-    AnnoColor fill;
-    Alloc param_35;
-    Alloc param_52;
+    uint _1455;
+    float linewidth;
     CmdLinGrad cmd_lin;
-    Alloc param_69;
-    Alloc param_95;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1614;
+        bool _1206;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1412 = th_ix < 256u;
-                bool _1420;
-                if (_1412)
+                bool _1003 = th_ix < 256u;
+                bool _1011;
+                if (_1003)
                 {
-                    _1420 = (partition_ix + th_ix) < n_partitions;
+                    _1011 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1420 = _1412;
+                    _1011 = _1003;
                 }
-                if (_1420)
+                if (_1011)
                 {
-                    uint in_ix = (_1283.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1437;
-                    _1437.offset = _1283.Load(20);
-                    param_3.offset = _1437.offset;
+                    uint in_ix = (_854.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1029;
+                    _1029.offset = _854.Load(20);
+                    param_3.offset = _1029.offset;
                     uint param_4 = in_ix;
                     count = read_mem(param_3, param_4);
-                    Alloc _1448;
-                    _1448.offset = _1283.Load(20);
-                    param_5.offset = _1448.offset;
+                    Alloc _1040;
+                    _1040.offset = _854.Load(20);
+                    param_5.offset = _1040.offset;
                     uint param_6 = in_ix + 1u;
                     uint offset = read_mem(param_5, param_6);
                     uint param_7 = offset;
@@ -930,16 +697,16 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1562 = sh_part_count[part_ix - 1u];
+                    _1154 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1562 = part_start_ix;
+                    _1154 = part_start_ix;
                 }
-                ix -= _1562;
+                ix -= _1154;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1581 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1581;
+                BinInstanceRef _1173 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1173;
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
@@ -949,16 +716,16 @@
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1604 = (wr_ix - rd_ix) < 256u;
-            if (_1604)
+            bool _1196 = (wr_ix - rd_ix) < 256u;
+            if (_1196)
             {
-                _1614 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1614 = _1604;
+                _1206 = _1196;
             }
-            if (_1614)
+            if (_1206)
             {
                 continue;
             }
@@ -971,30 +738,24 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            AnnotatedRef _1635 = { _1283.Load(32) + (element_ix * 40u) };
-            ref = _1635;
-            Alloc _1638;
-            _1638.offset = _1283.Load(32);
-            param_14.offset = _1638.offset;
-            AnnotatedRef param_15 = ref;
-            tag = Annotated_tag(param_14, param_15).tag;
+            tag = _1222.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
-            case 1u:
-            case 3u:
-            case 2u:
-            case 4u:
+            case 68u:
+            case 72u:
+            case 276u:
             case 5u:
+            case 37u:
             {
-                uint drawmonoid_base = (_1283.Load(44) >> uint(2)) + (2u * element_ix);
-                uint path_ix = _308.Load(drawmonoid_base * 4 + 8);
-                PathRef _1667 = { _1283.Load(16) + (path_ix * 12u) };
-                Alloc _1670;
-                _1670.offset = _1283.Load(16);
-                param_16.offset = _1670.offset;
-                PathRef param_17 = _1667;
-                Path path = Path_read(param_16, param_17);
+                uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
+                uint path_ix = _242.Load(drawmonoid_base * 4 + 8);
+                PathRef _1247 = { _854.Load(16) + (path_ix * 12u) };
+                Alloc _1250;
+                _1250.offset = _854.Load(16);
+                param_14.offset = _1250.offset;
+                PathRef param_15 = _1247;
+                Path path = Path_read(param_14, param_15);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -1009,13 +770,13 @@
                 tile_count = uint(x1 - x0) * uint(y1 - y0);
                 uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                 sh_tile_base[th_ix] = base;
-                uint param_18 = path.tiles.offset;
-                uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                bool param_20 = mem_ok;
-                Alloc path_alloc = new_alloc(param_18, param_19, param_20);
-                uint param_21 = th_ix;
-                Alloc param_22 = path_alloc;
-                write_tile_alloc(param_21, param_22);
+                uint param_16 = path.tiles.offset;
+                uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_18 = mem_ok;
+                Alloc path_alloc = new_alloc(param_16, param_17, param_18);
+                uint param_19 = th_ix;
+                Alloc param_20 = path_alloc;
+                write_tile_alloc(param_19, param_20);
                 break;
             }
             default:
@@ -1048,72 +809,57 @@
                     el_ix = probe_1;
                 }
             }
-            AnnotatedRef _1869 = { _1283.Load(32) + (sh_elements[el_ix] * 40u) };
-            AnnotatedRef ref_1 = _1869;
-            Alloc _1874;
-            _1874.offset = _1283.Load(32);
-            param_23.offset = _1874.offset;
-            AnnotatedRef param_24 = ref_1;
-            AnnotatedTag anno_tag = Annotated_tag(param_23, param_24);
-            uint tag_1 = anno_tag.tag;
+            uint element_ix_1 = sh_elements[el_ix];
+            uint tag_1 = _1222.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1887 = sh_tile_count[el_ix - 1u];
+                _1455 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1887 = 0u;
+                _1455 = 0u;
             }
-            uint seq_ix = ix_1 - _1887;
+            uint seq_ix = ix_1 - _1455;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
             bool include_tile = false;
             if (mem_ok)
             {
-                uint param_25 = el_ix;
-                bool param_26 = mem_ok;
-                TileRef _1939 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1939;
-                Tile tile = Tile_read(param_27, param_28);
-                bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
-                bool _1951 = tile.tile.offset != 0u;
-                bool _1960;
-                if (!_1951)
+                uint param_21 = el_ix;
+                bool param_22 = mem_ok;
+                TileRef _1507 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                Alloc param_23 = read_tile_alloc(param_21, param_22);
+                TileRef param_24 = _1507;
+                Tile tile = Tile_read(param_23, param_24);
+                bool is_clip = (tag_1 & 1u) != 0u;
+                bool is_blend = false;
+                if (is_clip)
                 {
-                    _1960 = (tile.backdrop == 0) == is_clip;
+                    uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
+                    uint scene_offset = _242.Load((drawmonoid_base_1 + 2u) * 4 + 8);
+                    uint dd = drawdata_start + (scene_offset >> uint(2));
+                    uint blend = _1222.Load(dd * 4 + 0);
+                    is_blend = blend != 3u;
+                }
+                bool _1542 = tile.tile.offset != 0u;
+                bool _1551;
+                if (!_1542)
+                {
+                    _1551 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1960 = _1951;
+                    _1551 = _1542;
                 }
-                bool _1972;
-                if (!_1960)
-                {
-                    bool _1971;
-                    if (is_clip)
-                    {
-                        _1971 = (anno_tag.flags & 2u) != 0u;
-                    }
-                    else
-                    {
-                        _1971 = is_clip;
-                    }
-                    _1972 = _1971;
-                }
-                else
-                {
-                    _1972 = _1960;
-                }
-                include_tile = _1972;
+                include_tile = _1551 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1992;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1992);
+                uint _1573;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1573);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -1135,219 +881,178 @@
                 }
             }
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
-            uint element_ix_1 = sh_elements[element_ref_ix];
+            uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            AnnotatedRef _2046 = { _1283.Load(32) + (element_ix_1 * 40u) };
-            ref = _2046;
-            Alloc _2050;
-            _2050.offset = _1283.Load(32);
-            param_29.offset = _2050.offset;
-            AnnotatedRef param_30 = ref;
-            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30);
+            uint drawtag = _1222.Load((drawtag_start + element_ix_2) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
-                switch (tag_2.tag)
+                uint param_25 = element_ref_ix;
+                bool param_26 = mem_ok;
+                TileRef _1650 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                Alloc param_27 = read_tile_alloc(param_25, param_26);
+                TileRef param_28 = _1650;
+                Tile tile_1 = Tile_read(param_27, param_28);
+                uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
+                uint scene_offset_1 = _242.Load((drawmonoid_base_2 + 2u) * 4 + 8);
+                uint info_offset = _242.Load((drawmonoid_base_2 + 3u) * 4 + 8);
+                uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
+                uint di = drawinfo_start + (info_offset >> uint(2));
+                switch (drawtag)
                 {
-                    case 1u:
+                    case 68u:
                     {
-                        uint param_31 = element_ref_ix;
-                        bool param_32 = mem_ok;
-                        TileRef _2086 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_33 = read_tile_alloc(param_31, param_32);
-                        TileRef param_34 = _2086;
-                        tile_1 = Tile_read(param_33, param_34);
-                        Alloc _2093;
-                        _2093.offset = _1283.Load(32);
-                        param_35.offset = _2093.offset;
-                        AnnotatedRef param_36 = ref;
-                        fill = Annotated_Color_read(param_35, param_36);
-                        Alloc param_37 = cmd_alloc;
-                        CmdRef param_38 = cmd_ref;
-                        uint param_39 = cmd_limit;
-                        bool _2105 = alloc_cmd(param_37, param_38, param_39);
-                        cmd_alloc = param_37;
-                        cmd_ref = param_38;
-                        cmd_limit = param_39;
-                        if (!_2105)
+                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        Alloc param_29 = cmd_alloc;
+                        CmdRef param_30 = cmd_ref;
+                        uint param_31 = cmd_limit;
+                        bool _1697 = alloc_cmd(param_29, param_30, param_31);
+                        cmd_alloc = param_29;
+                        cmd_ref = param_30;
+                        cmd_limit = param_31;
+                        if (!_1697)
                         {
                             break;
                         }
-                        Alloc param_40 = cmd_alloc;
-                        CmdRef param_41 = cmd_ref;
-                        uint param_42 = tag_2.flags;
-                        Tile param_43 = tile_1;
-                        float param_44 = fill.linewidth;
-                        write_fill(param_40, param_41, param_42, param_43, param_44);
-                        cmd_ref = param_41;
-                        CmdColor _2129 = { fill.rgba_color };
-                        Alloc param_45 = cmd_alloc;
-                        CmdRef param_46 = cmd_ref;
-                        CmdColor param_47 = _2129;
-                        Cmd_Color_write(param_45, param_46, param_47);
+                        Alloc param_32 = cmd_alloc;
+                        CmdRef param_33 = cmd_ref;
+                        Tile param_34 = tile_1;
+                        float param_35 = linewidth;
+                        write_fill(param_32, param_33, param_34, param_35);
+                        cmd_ref = param_33;
+                        uint rgba = _1222.Load(dd_1 * 4 + 0);
+                        CmdColor _1720 = { rgba };
+                        Alloc param_36 = cmd_alloc;
+                        CmdRef param_37 = cmd_ref;
+                        CmdColor param_38 = _1720;
+                        Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
                     }
-                    case 2u:
+                    case 276u:
                     {
-                        uint param_48 = element_ref_ix;
-                        bool param_49 = mem_ok;
-                        TileRef _2158 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_50 = read_tile_alloc(param_48, param_49);
-                        TileRef param_51 = _2158;
-                        tile_1 = Tile_read(param_50, param_51);
-                        Alloc _2165;
-                        _2165.offset = _1283.Load(32);
-                        param_52.offset = _2165.offset;
-                        AnnotatedRef param_53 = ref;
-                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53);
-                        Alloc param_54 = cmd_alloc;
-                        CmdRef param_55 = cmd_ref;
-                        uint param_56 = cmd_limit;
-                        bool _2177 = alloc_cmd(param_54, param_55, param_56);
-                        cmd_alloc = param_54;
-                        cmd_ref = param_55;
-                        cmd_limit = param_56;
-                        if (!_2177)
+                        Alloc param_39 = cmd_alloc;
+                        CmdRef param_40 = cmd_ref;
+                        uint param_41 = cmd_limit;
+                        bool _1738 = alloc_cmd(param_39, param_40, param_41);
+                        cmd_alloc = param_39;
+                        cmd_ref = param_40;
+                        cmd_limit = param_41;
+                        if (!_1738)
                         {
                             break;
                         }
-                        Alloc param_57 = cmd_alloc;
-                        CmdRef param_58 = cmd_ref;
-                        uint param_59 = tag_2.flags;
-                        Tile param_60 = tile_1;
-                        float param_61 = fill.linewidth;
-                        write_fill(param_57, param_58, param_59, param_60, param_61);
-                        cmd_ref = param_58;
-                        cmd_lin.index = lin.index;
-                        cmd_lin.line_x = lin.line_x;
-                        cmd_lin.line_y = lin.line_y;
-                        cmd_lin.line_c = lin.line_c;
-                        Alloc param_62 = cmd_alloc;
-                        CmdRef param_63 = cmd_ref;
-                        CmdLinGrad param_64 = cmd_lin;
-                        Cmd_LinGrad_write(param_62, param_63, param_64);
+                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        Alloc param_42 = cmd_alloc;
+                        CmdRef param_43 = cmd_ref;
+                        Tile param_44 = tile_1;
+                        float param_45 = linewidth;
+                        write_fill(param_42, param_43, param_44, param_45);
+                        cmd_ref = param_43;
+                        cmd_lin.index = _1222.Load(dd_1 * 4 + 0);
+                        cmd_lin.line_x = asfloat(_242.Load((di + 1u) * 4 + 8));
+                        cmd_lin.line_y = asfloat(_242.Load((di + 2u) * 4 + 8));
+                        cmd_lin.line_c = asfloat(_242.Load((di + 3u) * 4 + 8));
+                        Alloc param_46 = cmd_alloc;
+                        CmdRef param_47 = cmd_ref;
+                        CmdLinGrad param_48 = cmd_lin;
+                        Cmd_LinGrad_write(param_46, param_47, param_48);
                         cmd_ref.offset += 20u;
                         break;
                     }
-                    case 3u:
+                    case 72u:
                     {
-                        uint param_65 = element_ref_ix;
-                        bool param_66 = mem_ok;
-                        TileRef _2242 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_67 = read_tile_alloc(param_65, param_66);
-                        TileRef param_68 = _2242;
-                        tile_1 = Tile_read(param_67, param_68);
-                        Alloc _2249;
-                        _2249.offset = _1283.Load(32);
-                        param_69.offset = _2249.offset;
-                        AnnotatedRef param_70 = ref;
-                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70);
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        uint param_73 = cmd_limit;
-                        bool _2261 = alloc_cmd(param_71, param_72, param_73);
-                        cmd_alloc = param_71;
-                        cmd_ref = param_72;
-                        cmd_limit = param_73;
-                        if (!_2261)
+                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        Alloc param_49 = cmd_alloc;
+                        CmdRef param_50 = cmd_ref;
+                        uint param_51 = cmd_limit;
+                        bool _1806 = alloc_cmd(param_49, param_50, param_51);
+                        cmd_alloc = param_49;
+                        cmd_ref = param_50;
+                        cmd_limit = param_51;
+                        if (!_1806)
                         {
                             break;
                         }
-                        Alloc param_74 = cmd_alloc;
-                        CmdRef param_75 = cmd_ref;
-                        uint param_76 = tag_2.flags;
-                        Tile param_77 = tile_1;
-                        float param_78 = fill_img.linewidth;
-                        write_fill(param_74, param_75, param_76, param_77, param_78);
-                        cmd_ref = param_75;
-                        CmdImage _2287 = { fill_img.index, fill_img.offset };
-                        Alloc param_79 = cmd_alloc;
-                        CmdRef param_80 = cmd_ref;
-                        CmdImage param_81 = _2287;
-                        Cmd_Image_write(param_79, param_80, param_81);
+                        Alloc param_52 = cmd_alloc;
+                        CmdRef param_53 = cmd_ref;
+                        Tile param_54 = tile_1;
+                        float param_55 = linewidth;
+                        write_fill(param_52, param_53, param_54, param_55);
+                        cmd_ref = param_53;
+                        uint index = _1222.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1222.Load((dd_1 + 1u) * 4 + 0);
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        CmdImage _1845 = { index, offset_1 };
+                        Alloc param_56 = cmd_alloc;
+                        CmdRef param_57 = cmd_ref;
+                        CmdImage param_58 = _1845;
+                        Cmd_Image_write(param_56, param_57, param_58);
                         cmd_ref.offset += 12u;
                         break;
                     }
-                    case 4u:
+                    case 5u:
                     {
-                        uint param_82 = element_ref_ix;
-                        bool param_83 = mem_ok;
-                        TileRef _2316 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_84 = read_tile_alloc(param_82, param_83);
-                        TileRef param_85 = _2316;
-                        tile_1 = Tile_read(param_84, param_85);
-                        bool _2322 = tile_1.tile.offset == 0u;
-                        bool _2328;
-                        if (_2322)
+                        bool _1859 = tile_1.tile.offset == 0u;
+                        bool _1865;
+                        if (_1859)
                         {
-                            _2328 = tile_1.backdrop == 0;
+                            _1865 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2328 = _2322;
+                            _1865 = _1859;
                         }
-                        if (_2328)
+                        if (_1865)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_86 = cmd_alloc;
-                            CmdRef param_87 = cmd_ref;
-                            uint param_88 = cmd_limit;
-                            bool _2340 = alloc_cmd(param_86, param_87, param_88);
-                            cmd_alloc = param_86;
-                            cmd_ref = param_87;
-                            cmd_limit = param_88;
-                            if (!_2340)
+                            Alloc param_59 = cmd_alloc;
+                            CmdRef param_60 = cmd_ref;
+                            uint param_61 = cmd_limit;
+                            bool _1877 = alloc_cmd(param_59, param_60, param_61);
+                            cmd_alloc = param_59;
+                            cmd_ref = param_60;
+                            cmd_limit = param_61;
+                            if (!_1877)
                             {
                                 break;
                             }
-                            Alloc param_89 = cmd_alloc;
-                            CmdRef param_90 = cmd_ref;
-                            Cmd_BeginClip_write(param_89, param_90);
+                            Alloc param_62 = cmd_alloc;
+                            CmdRef param_63 = cmd_ref;
+                            Cmd_BeginClip_write(param_62, param_63);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
                         break;
                     }
-                    case 5u:
+                    case 37u:
                     {
-                        uint param_91 = element_ref_ix;
-                        bool param_92 = mem_ok;
-                        TileRef _2377 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        Alloc param_93 = read_tile_alloc(param_91, param_92);
-                        TileRef param_94 = _2377;
-                        tile_1 = Tile_read(param_93, param_94);
-                        Alloc _2384;
-                        _2384.offset = _1283.Load(32);
-                        param_95.offset = _2384.offset;
-                        AnnotatedRef param_96 = ref;
-                        AnnoEndClip end_clip = Annotated_EndClip_read(param_95, param_96);
                         clip_depth--;
-                        Alloc param_97 = cmd_alloc;
-                        CmdRef param_98 = cmd_ref;
-                        uint param_99 = cmd_limit;
-                        bool _2398 = alloc_cmd(param_97, param_98, param_99);
-                        cmd_alloc = param_97;
-                        cmd_ref = param_98;
-                        cmd_limit = param_99;
-                        if (!_2398)
+                        Alloc param_64 = cmd_alloc;
+                        CmdRef param_65 = cmd_ref;
+                        uint param_66 = cmd_limit;
+                        bool _1905 = alloc_cmd(param_64, param_65, param_66);
+                        cmd_alloc = param_64;
+                        cmd_ref = param_65;
+                        cmd_limit = param_66;
+                        if (!_1905)
                         {
                             break;
                         }
-                        Alloc param_100 = cmd_alloc;
-                        CmdRef param_101 = cmd_ref;
-                        uint param_102 = 0u;
-                        Tile param_103 = tile_1;
-                        float param_104 = 0.0f;
-                        write_fill(param_100, param_101, param_102, param_103, param_104);
-                        cmd_ref = param_101;
-                        CmdEndClip _2419 = { end_clip.blend };
-                        Alloc param_105 = cmd_alloc;
-                        CmdRef param_106 = cmd_ref;
-                        CmdEndClip param_107 = _2419;
-                        Cmd_EndClip_write(param_105, param_106, param_107);
+                        Alloc param_67 = cmd_alloc;
+                        CmdRef param_68 = cmd_ref;
+                        Tile param_69 = tile_1;
+                        float param_70 = -1.0f;
+                        write_fill(param_67, param_68, param_69, param_70);
+                        cmd_ref = param_68;
+                        uint blend_1 = _1222.Load(dd_1 * 4 + 0);
+                        CmdEndClip _1928 = { blend_1 };
+                        Alloc param_71 = cmd_alloc;
+                        CmdRef param_72 = cmd_ref;
+                        CmdEndClip param_73 = _1928;
+                        Cmd_EndClip_write(param_71, param_72, param_73);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1355,14 +1060,14 @@
             }
             else
             {
-                switch (tag_2.tag)
+                switch (drawtag)
                 {
-                    case 4u:
+                    case 5u:
                     {
                         clip_depth++;
                         break;
                     }
-                    case 5u:
+                    case 37u:
                     {
                         if (clip_depth == clip_zero_depth)
                         {
@@ -1381,21 +1086,21 @@
             break;
         }
     }
-    bool _2467 = (bin_tile_x + tile_x) < _1283.Load(8);
-    bool _2476;
-    if (_2467)
+    bool _1975 = (bin_tile_x + tile_x) < _854.Load(8);
+    bool _1984;
+    if (_1975)
     {
-        _2476 = (bin_tile_y + tile_y) < _1283.Load(12);
+        _1984 = (bin_tile_y + tile_y) < _854.Load(12);
     }
     else
     {
-        _2476 = _2467;
+        _1984 = _1975;
     }
-    if (_2476)
+    if (_1984)
     {
-        Alloc param_108 = cmd_alloc;
-        CmdRef param_109 = cmd_ref;
-        Cmd_End_write(param_108, param_109);
+        Alloc param_74 = cmd_alloc;
+        CmdRef param_75 = cmd_ref;
+        Cmd_End_write(param_74, param_75);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 1422ff1..4226352 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -25,68 +25,6 @@
     bool failed;
 };
 
-struct AnnoImageRef
-{
-    uint offset;
-};
-
-struct AnnoImage
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    int2 offset;
-};
-
-struct AnnoColorRef
-{
-    uint offset;
-};
-
-struct AnnoColor
-{
-    float4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-struct AnnoLinGradientRef
-{
-    uint offset;
-};
-
-struct AnnoLinGradient
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    float line_x;
-    float line_y;
-    float line_c;
-};
-
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -229,12 +167,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -242,6 +182,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -249,6 +191,11 @@
     Config conf;
 };
 
+struct SceneBuf
+{
+    uint scene[1];
+};
+
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
@@ -264,7 +211,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_308, constant uint& v_308BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -272,7 +219,7 @@
     {
         return 0u;
     }
-    uint v = v_308.memory[offset];
+    uint v = v_242.memory[offset];
     return v;
 }
 
@@ -291,39 +238,30 @@
 }
 
 static inline __attribute__((always_inline))
-BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
+    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_308, v_308BufferSize);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
+    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_242, v_242BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_242, v_242BufferSize);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -336,24 +274,24 @@
 }
 
 static inline __attribute__((always_inline))
-Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_308, constant uint& v_308BufferSize)
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint param = 0u;
-    uint param_1 = uint(int((v_308BufferSize - 8) / 4) * 4);
+    uint param_1 = uint(int((v_242BufferSize - 8) / 4) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
 
 static inline __attribute__((always_inline))
-Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
+    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_242, v_242BufferSize);
     Tile s;
     s.tile = TileSegRef{ raw0 };
     s.backdrop = int(raw1);
@@ -361,63 +299,26 @@
 }
 
 static inline __attribute__((always_inline))
-AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_242, constant uint& v_242BufferSize)
 {
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
-    AnnoColor s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.linewidth = as_type<float>(raw4);
-    s.rgba_color = raw5;
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u };
-    return AnnoColor_read(param, param_1, v_308, v_308BufferSize);
-}
-
-static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    uint _314 = atomic_fetch_add_explicit((device atomic_uint*)&v_308.mem_offset, size, memory_order_relaxed);
-    uint offset = _314;
+    uint _248 = atomic_fetch_add_explicit((device atomic_uint*)&v_242.mem_offset, size, memory_order_relaxed);
+    uint offset = _248;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_308BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_242BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _343 = atomic_fetch_max_explicit((device atomic_uint*)&v_308.mem_error, 1u, memory_order_relaxed);
+        uint _277 = atomic_fetch_max_explicit((device atomic_uint*)&v_242.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_308, constant uint& v_308BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -425,42 +326,42 @@
     {
         return;
     }
-    v_308.memory[offset] = val;
+    v_242.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.new_ref;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
     CmdJump param_5 = s;
-    CmdJump_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdJump_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_308, constant uint& v_308BufferSize)
+bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_242, constant uint& v_242BufferSize)
 {
     if (cmd_ref.offset < cmd_limit)
     {
         return true;
     }
     uint param = 1024u;
-    MallocResult _1190 = malloc(param, v_308, v_308BufferSize);
-    MallocResult new_cmd = _1190;
+    MallocResult _762 = malloc(param, v_242, v_242BufferSize);
+    MallocResult new_cmd = _762;
     if (new_cmd.failed)
     {
         return false;
@@ -469,7 +370,7 @@
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3, v_308, v_308BufferSize);
+    Cmd_Jump_write(param_1, param_2, param_3, v_242, v_242BufferSize);
     cmd_alloc = new_cmd.alloc;
     cmd_ref = CmdRef{ cmd_alloc.offset };
     cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
@@ -477,358 +378,228 @@
 }
 
 static inline __attribute__((always_inline))
-uint fill_mode_from_flags(thread const uint& flags)
-{
-    return flags & 1u;
-}
-
-static inline __attribute__((always_inline))
-void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = uint(s.backdrop);
-    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
+    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
     CmdFill param_5 = s;
-    CmdFill_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdFill_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 3u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.half_width);
-    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
+    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
     CmdStroke param_5 = s;
-    CmdStroke_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdStroke_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_308, constant uint& v_308BufferSize)
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_242, constant uint& v_242BufferSize)
 {
-    uint param = flags;
-    if (fill_mode_from_flags(param) == 0u)
+    if (linewidth < 0.0)
     {
         if (tile.tile.offset != 0u)
         {
             CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop };
-            Alloc param_1 = alloc;
-            CmdRef param_2 = cmd_ref;
-            CmdFill param_3 = cmd_fill;
-            Cmd_Fill_write(param_1, param_2, param_3, v_308, v_308BufferSize);
+            Alloc param = alloc;
+            CmdRef param_1 = cmd_ref;
+            CmdFill param_2 = cmd_fill;
+            Cmd_Fill_write(param, param_1, param_2, v_242, v_242BufferSize);
             cmd_ref.offset += 12u;
         }
         else
         {
-            Alloc param_4 = alloc;
-            CmdRef param_5 = cmd_ref;
-            Cmd_Solid_write(param_4, param_5, v_308, v_308BufferSize);
+            Alloc param_3 = alloc;
+            CmdRef param_4 = cmd_ref;
+            Cmd_Solid_write(param_3, param_4, v_242, v_242BufferSize);
             cmd_ref.offset += 4u;
         }
     }
     else
     {
         CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth };
-        Alloc param_6 = alloc;
-        CmdRef param_7 = cmd_ref;
-        CmdStroke param_8 = cmd_stroke;
-        Cmd_Stroke_write(param_6, param_7, param_8, v_308, v_308BufferSize);
+        Alloc param_5 = alloc;
+        CmdRef param_6 = cmd_ref;
+        CmdStroke param_7 = cmd_stroke;
+        Cmd_Stroke_write(param_5, param_6, param_7, v_242, v_242BufferSize);
         cmd_ref.offset += 12u;
     }
 }
 
 static inline __attribute__((always_inline))
-void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.rgba_color;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
     CmdColor param_5 = s;
-    CmdColor_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdColor_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
-    Alloc param_12 = a;
-    uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_308, v_308BufferSize);
-    Alloc param_14 = a;
-    uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_308, v_308BufferSize);
-    Alloc param_16 = a;
-    uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_308, v_308BufferSize);
-    AnnoLinGradient s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.linewidth = as_type<float>(raw4);
-    s.index = raw5;
-    s.line_x = as_type<float>(raw6);
-    s.line_y = as_type<float>(raw7);
-    s.line_c = as_type<float>(raw8);
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u };
-    return AnnoLinGradient_read(param, param_1, v_308, v_308BufferSize);
-}
-
-static inline __attribute__((always_inline))
-void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.line_x);
-    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
+    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.line_y);
-    write_mem(param_6, param_7, param_8, v_308, v_308BufferSize);
+    write_mem(param_6, param_7, param_8, v_242, v_242BufferSize);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.line_c);
-    write_mem(param_9, param_10, param_11, v_308, v_308BufferSize);
+    write_mem(param_9, param_10, param_11, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
     CmdLinGrad param_5 = s;
-    CmdLinGrad_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdLinGrad_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
-    Alloc param_10 = a;
-    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
-    Alloc param_12 = a;
-    uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_308, v_308BufferSize);
-    AnnoImage s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.linewidth = as_type<float>(raw4);
-    s.index = raw5;
-    s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16);
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u };
-    return AnnoImage_read(param, param_1, v_308, v_308BufferSize);
-}
-
-static inline __attribute__((always_inline))
-void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
+    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
     CmdImage param_5 = s;
-    CmdImage_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdImage_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
-    AnnoEndClip s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.blend = raw4;
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
-{
-    Alloc param = a;
-    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
-    return AnnoEndClip_read(param, param_1, v_308, v_308BufferSize);
-}
-
-static inline __attribute__((always_inline))
-void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_308, constant uint& v_308BufferSize)
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.blend;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
     Alloc param_3 = a;
     CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u };
     CmdEndClip param_5 = s;
-    CmdEndClip_write(param_3, param_4, param_5, v_308, v_308BufferSize);
+    CmdEndClip_write(param_3, param_4, param_5, v_242, v_242BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 0u;
-    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_308 [[buffer(0)]], const device ConfigBuf& _1283 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_242 [[buffer(0)]], const device ConfigBuf& _854 [[buffer(1)]], const device SceneBuf& _1222 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -840,19 +611,19 @@
     threadgroup uint sh_tile_y0[256];
     threadgroup uint sh_tile_base[256];
     threadgroup uint sh_tile_count[256];
-    constant uint& v_308BufferSize = spvBufferSizeConstants[0];
-    uint width_in_bins = ((_1283.conf.width_in_tiles + 16u) - 1u) / 16u;
+    constant uint& v_242BufferSize = spvBufferSizeConstants[0];
+    uint width_in_bins = ((_854.conf.width_in_tiles + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1283.conf.n_elements + 256u) - 1u) / 256u;
+    uint n_partitions = ((_854.conf.n_elements + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1283.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.conf.width_in_tiles) + bin_tile_x) + tile_x;
     Alloc param;
-    param.offset = _1283.conf.ptcl_alloc.offset;
+    param.offset = _854.conf.ptcl_alloc.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -864,57 +635,52 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    bool mem_ok = v_308.mem_error == 0u;
+    uint drawmonoid_start = _854.conf.drawmonoid_alloc.offset >> uint(2);
+    uint drawtag_start = _854.conf.drawtag_offset >> uint(2);
+    uint drawdata_start = _854.conf.drawdata_offset >> uint(2);
+    uint drawinfo_start = _854.conf.drawinfo_alloc.offset >> uint(2);
+    bool mem_ok = v_242.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1562;
+    uint _1154;
     uint element_ix;
-    AnnotatedRef ref;
     Alloc param_14;
-    Alloc param_16;
     uint tile_count;
-    Alloc param_23;
-    uint _1887;
-    Alloc param_29;
-    Tile tile_1;
-    AnnoColor fill;
-    Alloc param_35;
-    Alloc param_52;
+    uint _1455;
+    float linewidth;
     CmdLinGrad cmd_lin;
-    Alloc param_69;
-    Alloc param_95;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1614;
+        bool _1206;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1412 = th_ix < 256u;
-                bool _1420;
-                if (_1412)
+                bool _1003 = th_ix < 256u;
+                bool _1011;
+                if (_1003)
                 {
-                    _1420 = (partition_ix + th_ix) < n_partitions;
+                    _1011 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1420 = _1412;
+                    _1011 = _1003;
                 }
-                if (_1420)
+                if (_1011)
                 {
-                    uint in_ix = (_1283.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    param_3.offset = _1283.conf.bin_alloc.offset;
+                    uint in_ix = (_854.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_3.offset = _854.conf.bin_alloc.offset;
                     uint param_4 = in_ix;
-                    count = read_mem(param_3, param_4, v_308, v_308BufferSize);
-                    param_5.offset = _1283.conf.bin_alloc.offset;
+                    count = read_mem(param_3, param_4, v_242, v_242BufferSize);
+                    param_5.offset = _854.conf.bin_alloc.offset;
                     uint param_6 = in_ix + 1u;
-                    uint offset = read_mem(param_5, param_6, v_308, v_308BufferSize);
+                    uint offset = read_mem(param_5, param_6, v_242, v_242BufferSize);
                     uint param_7 = offset;
                     uint param_8 = count * 4u;
                     bool param_9 = mem_ok;
@@ -958,34 +724,34 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1562 = sh_part_count[part_ix - 1u];
+                    _1154 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1562 = part_start_ix;
+                    _1154 = part_start_ix;
                 }
-                ix -= _1562;
+                ix -= _1154;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
                 BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
-                BinInstance inst = BinInstance_read(param_12, param_13, v_308, v_308BufferSize);
+                BinInstance inst = BinInstance_read(param_12, param_13, v_242, v_242BufferSize);
                 sh_elements[th_ix] = inst.element_ix;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1604 = (wr_ix - rd_ix) < 256u;
-            if (_1604)
+            bool _1196 = (wr_ix - rd_ix) < 256u;
+            if (_1196)
             {
-                _1614 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1614 = _1604;
+                _1206 = _1196;
             }
-            if (_1614)
+            if (_1206)
             {
                 continue;
             }
@@ -998,24 +764,21 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef{ _1283.conf.anno_alloc.offset + (element_ix * 40u) };
-            param_14.offset = _1283.conf.anno_alloc.offset;
-            AnnotatedRef param_15 = ref;
-            tag = Annotated_tag(param_14, param_15, v_308, v_308BufferSize).tag;
+            tag = _1222.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
-            case 1u:
-            case 3u:
-            case 2u:
-            case 4u:
+            case 68u:
+            case 72u:
+            case 276u:
             case 5u:
+            case 37u:
             {
-                uint drawmonoid_base = (_1283.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
-                uint path_ix = v_308.memory[drawmonoid_base];
-                param_16.offset = _1283.conf.tile_alloc.offset;
-                PathRef param_17 = PathRef{ _1283.conf.tile_alloc.offset + (path_ix * 12u) };
-                Path path = Path_read(param_16, param_17, v_308, v_308BufferSize);
+                uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
+                uint path_ix = v_242.memory[drawmonoid_base];
+                param_14.offset = _854.conf.tile_alloc.offset;
+                PathRef param_15 = PathRef{ _854.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_14, param_15, v_242, v_242BufferSize);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -1030,13 +793,13 @@
                 tile_count = uint(x1 - x0) * uint(y1 - y0);
                 uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                 sh_tile_base[th_ix] = base;
-                uint param_18 = path.tiles.offset;
-                uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                bool param_20 = mem_ok;
-                Alloc path_alloc = new_alloc(param_18, param_19, param_20);
-                uint param_21 = th_ix;
-                Alloc param_22 = path_alloc;
-                write_tile_alloc(param_21, param_22);
+                uint param_16 = path.tiles.offset;
+                uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_18 = mem_ok;
+                Alloc path_alloc = new_alloc(param_16, param_17, param_18);
+                uint param_19 = th_ix;
+                Alloc param_20 = path_alloc;
+                write_tile_alloc(param_19, param_20);
                 break;
             }
             default:
@@ -1069,67 +832,55 @@
                     el_ix = probe_1;
                 }
             }
-            AnnotatedRef ref_1 = AnnotatedRef{ _1283.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) };
-            param_23.offset = _1283.conf.anno_alloc.offset;
-            AnnotatedRef param_24 = ref_1;
-            AnnotatedTag anno_tag = Annotated_tag(param_23, param_24, v_308, v_308BufferSize);
-            uint tag_1 = anno_tag.tag;
+            uint element_ix_1 = sh_elements[el_ix];
+            uint tag_1 = _1222.scene[drawtag_start + element_ix_1];
             if (el_ix > 0u)
             {
-                _1887 = sh_tile_count[el_ix - 1u];
+                _1455 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1887 = 0u;
+                _1455 = 0u;
             }
-            uint seq_ix = ix_1 - _1887;
+            uint seq_ix = ix_1 - _1455;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
             bool include_tile = false;
             if (mem_ok)
             {
-                uint param_25 = el_ix;
-                bool param_26 = mem_ok;
-                Alloc param_27 = read_tile_alloc(param_25, param_26, v_308, v_308BufferSize);
-                TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Tile tile = Tile_read(param_27, param_28, v_308, v_308BufferSize);
-                bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
-                bool _1951 = tile.tile.offset != 0u;
-                bool _1960;
-                if (!_1951)
+                uint param_21 = el_ix;
+                bool param_22 = mem_ok;
+                Alloc param_23 = read_tile_alloc(param_21, param_22, v_242, v_242BufferSize);
+                TileRef param_24 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                Tile tile = Tile_read(param_23, param_24, v_242, v_242BufferSize);
+                bool is_clip = (tag_1 & 1u) != 0u;
+                bool is_blend = false;
+                if (is_clip)
                 {
-                    _1960 = (tile.backdrop == 0) == is_clip;
+                    uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
+                    uint scene_offset = v_242.memory[drawmonoid_base_1 + 2u];
+                    uint dd = drawdata_start + (scene_offset >> uint(2));
+                    uint blend = _1222.scene[dd];
+                    is_blend = blend != 3u;
+                }
+                bool _1542 = tile.tile.offset != 0u;
+                bool _1551;
+                if (!_1542)
+                {
+                    _1551 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1960 = _1951;
+                    _1551 = _1542;
                 }
-                bool _1972;
-                if (!_1960)
-                {
-                    bool _1971;
-                    if (is_clip)
-                    {
-                        _1971 = (anno_tag.flags & 2u) != 0u;
-                    }
-                    else
-                    {
-                        _1971 = is_clip;
-                    }
-                    _1972 = _1971;
-                }
-                else
-                {
-                    _1972 = _1960;
-                }
-                include_tile = _1972;
+                include_tile = _1551 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1992 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1573 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -1151,200 +902,174 @@
                 }
             }
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
-            uint element_ix_1 = sh_elements[element_ref_ix];
+            uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            ref = AnnotatedRef{ _1283.conf.anno_alloc.offset + (element_ix_1 * 40u) };
-            param_29.offset = _1283.conf.anno_alloc.offset;
-            AnnotatedRef param_30 = ref;
-            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_308, v_308BufferSize);
+            uint drawtag = _1222.scene[drawtag_start + element_ix_2];
             if (clip_zero_depth == 0u)
             {
-                switch (tag_2.tag)
+                uint param_25 = element_ref_ix;
+                bool param_26 = mem_ok;
+                Alloc param_27 = read_tile_alloc(param_25, param_26, v_242, v_242BufferSize);
+                TileRef param_28 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                Tile tile_1 = Tile_read(param_27, param_28, v_242, v_242BufferSize);
+                uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
+                uint scene_offset_1 = v_242.memory[drawmonoid_base_2 + 2u];
+                uint info_offset = v_242.memory[drawmonoid_base_2 + 3u];
+                uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
+                uint di = drawinfo_start + (info_offset >> uint(2));
+                switch (drawtag)
                 {
-                    case 1u:
+                    case 68u:
                     {
-                        uint param_31 = element_ref_ix;
-                        bool param_32 = mem_ok;
-                        Alloc param_33 = read_tile_alloc(param_31, param_32, v_308, v_308BufferSize);
-                        TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_33, param_34, v_308, v_308BufferSize);
-                        param_35.offset = _1283.conf.anno_alloc.offset;
-                        AnnotatedRef param_36 = ref;
-                        fill = Annotated_Color_read(param_35, param_36, v_308, v_308BufferSize);
-                        Alloc param_37 = cmd_alloc;
-                        CmdRef param_38 = cmd_ref;
-                        uint param_39 = cmd_limit;
-                        bool _2105 = alloc_cmd(param_37, param_38, param_39, v_308, v_308BufferSize);
-                        cmd_alloc = param_37;
-                        cmd_ref = param_38;
-                        cmd_limit = param_39;
-                        if (!_2105)
+                        linewidth = as_type<float>(v_242.memory[di]);
+                        Alloc param_29 = cmd_alloc;
+                        CmdRef param_30 = cmd_ref;
+                        uint param_31 = cmd_limit;
+                        bool _1697 = alloc_cmd(param_29, param_30, param_31, v_242, v_242BufferSize);
+                        cmd_alloc = param_29;
+                        cmd_ref = param_30;
+                        cmd_limit = param_31;
+                        if (!_1697)
                         {
                             break;
                         }
-                        Alloc param_40 = cmd_alloc;
-                        CmdRef param_41 = cmd_ref;
-                        uint param_42 = tag_2.flags;
-                        Tile param_43 = tile_1;
-                        float param_44 = fill.linewidth;
-                        write_fill(param_40, param_41, param_42, param_43, param_44, v_308, v_308BufferSize);
-                        cmd_ref = param_41;
-                        Alloc param_45 = cmd_alloc;
-                        CmdRef param_46 = cmd_ref;
-                        CmdColor param_47 = CmdColor{ fill.rgba_color };
-                        Cmd_Color_write(param_45, param_46, param_47, v_308, v_308BufferSize);
+                        Alloc param_32 = cmd_alloc;
+                        CmdRef param_33 = cmd_ref;
+                        Tile param_34 = tile_1;
+                        float param_35 = linewidth;
+                        write_fill(param_32, param_33, param_34, param_35, v_242, v_242BufferSize);
+                        cmd_ref = param_33;
+                        uint rgba = _1222.scene[dd_1];
+                        Alloc param_36 = cmd_alloc;
+                        CmdRef param_37 = cmd_ref;
+                        CmdColor param_38 = CmdColor{ rgba };
+                        Cmd_Color_write(param_36, param_37, param_38, v_242, v_242BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
-                    case 2u:
+                    case 276u:
                     {
-                        uint param_48 = element_ref_ix;
-                        bool param_49 = mem_ok;
-                        Alloc param_50 = read_tile_alloc(param_48, param_49, v_308, v_308BufferSize);
-                        TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_50, param_51, v_308, v_308BufferSize);
-                        param_52.offset = _1283.conf.anno_alloc.offset;
-                        AnnotatedRef param_53 = ref;
-                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_308, v_308BufferSize);
-                        Alloc param_54 = cmd_alloc;
-                        CmdRef param_55 = cmd_ref;
-                        uint param_56 = cmd_limit;
-                        bool _2177 = alloc_cmd(param_54, param_55, param_56, v_308, v_308BufferSize);
-                        cmd_alloc = param_54;
-                        cmd_ref = param_55;
-                        cmd_limit = param_56;
-                        if (!_2177)
+                        Alloc param_39 = cmd_alloc;
+                        CmdRef param_40 = cmd_ref;
+                        uint param_41 = cmd_limit;
+                        bool _1738 = alloc_cmd(param_39, param_40, param_41, v_242, v_242BufferSize);
+                        cmd_alloc = param_39;
+                        cmd_ref = param_40;
+                        cmd_limit = param_41;
+                        if (!_1738)
                         {
                             break;
                         }
-                        Alloc param_57 = cmd_alloc;
-                        CmdRef param_58 = cmd_ref;
-                        uint param_59 = tag_2.flags;
-                        Tile param_60 = tile_1;
-                        float param_61 = fill.linewidth;
-                        write_fill(param_57, param_58, param_59, param_60, param_61, v_308, v_308BufferSize);
-                        cmd_ref = param_58;
-                        cmd_lin.index = lin.index;
-                        cmd_lin.line_x = lin.line_x;
-                        cmd_lin.line_y = lin.line_y;
-                        cmd_lin.line_c = lin.line_c;
-                        Alloc param_62 = cmd_alloc;
-                        CmdRef param_63 = cmd_ref;
-                        CmdLinGrad param_64 = cmd_lin;
-                        Cmd_LinGrad_write(param_62, param_63, param_64, v_308, v_308BufferSize);
+                        linewidth = as_type<float>(v_242.memory[di]);
+                        Alloc param_42 = cmd_alloc;
+                        CmdRef param_43 = cmd_ref;
+                        Tile param_44 = tile_1;
+                        float param_45 = linewidth;
+                        write_fill(param_42, param_43, param_44, param_45, v_242, v_242BufferSize);
+                        cmd_ref = param_43;
+                        cmd_lin.index = _1222.scene[dd_1];
+                        cmd_lin.line_x = as_type<float>(v_242.memory[di + 1u]);
+                        cmd_lin.line_y = as_type<float>(v_242.memory[di + 2u]);
+                        cmd_lin.line_c = as_type<float>(v_242.memory[di + 3u]);
+                        Alloc param_46 = cmd_alloc;
+                        CmdRef param_47 = cmd_ref;
+                        CmdLinGrad param_48 = cmd_lin;
+                        Cmd_LinGrad_write(param_46, param_47, param_48, v_242, v_242BufferSize);
                         cmd_ref.offset += 20u;
                         break;
                     }
-                    case 3u:
+                    case 72u:
                     {
-                        uint param_65 = element_ref_ix;
-                        bool param_66 = mem_ok;
-                        Alloc param_67 = read_tile_alloc(param_65, param_66, v_308, v_308BufferSize);
-                        TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_67, param_68, v_308, v_308BufferSize);
-                        param_69.offset = _1283.conf.anno_alloc.offset;
-                        AnnotatedRef param_70 = ref;
-                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_308, v_308BufferSize);
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        uint param_73 = cmd_limit;
-                        bool _2261 = alloc_cmd(param_71, param_72, param_73, v_308, v_308BufferSize);
-                        cmd_alloc = param_71;
-                        cmd_ref = param_72;
-                        cmd_limit = param_73;
-                        if (!_2261)
+                        linewidth = as_type<float>(v_242.memory[di]);
+                        Alloc param_49 = cmd_alloc;
+                        CmdRef param_50 = cmd_ref;
+                        uint param_51 = cmd_limit;
+                        bool _1806 = alloc_cmd(param_49, param_50, param_51, v_242, v_242BufferSize);
+                        cmd_alloc = param_49;
+                        cmd_ref = param_50;
+                        cmd_limit = param_51;
+                        if (!_1806)
                         {
                             break;
                         }
-                        Alloc param_74 = cmd_alloc;
-                        CmdRef param_75 = cmd_ref;
-                        uint param_76 = tag_2.flags;
-                        Tile param_77 = tile_1;
-                        float param_78 = fill_img.linewidth;
-                        write_fill(param_74, param_75, param_76, param_77, param_78, v_308, v_308BufferSize);
-                        cmd_ref = param_75;
-                        Alloc param_79 = cmd_alloc;
-                        CmdRef param_80 = cmd_ref;
-                        CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset };
-                        Cmd_Image_write(param_79, param_80, param_81, v_308, v_308BufferSize);
+                        Alloc param_52 = cmd_alloc;
+                        CmdRef param_53 = cmd_ref;
+                        Tile param_54 = tile_1;
+                        float param_55 = linewidth;
+                        write_fill(param_52, param_53, param_54, param_55, v_242, v_242BufferSize);
+                        cmd_ref = param_53;
+                        uint index = _1222.scene[dd_1];
+                        uint raw1 = _1222.scene[dd_1 + 1u];
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        Alloc param_56 = cmd_alloc;
+                        CmdRef param_57 = cmd_ref;
+                        CmdImage param_58 = CmdImage{ index, offset_1 };
+                        Cmd_Image_write(param_56, param_57, param_58, v_242, v_242BufferSize);
                         cmd_ref.offset += 12u;
                         break;
                     }
-                    case 4u:
+                    case 5u:
                     {
-                        uint param_82 = element_ref_ix;
-                        bool param_83 = mem_ok;
-                        Alloc param_84 = read_tile_alloc(param_82, param_83, v_308, v_308BufferSize);
-                        TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_84, param_85, v_308, v_308BufferSize);
-                        bool _2322 = tile_1.tile.offset == 0u;
-                        bool _2328;
-                        if (_2322)
+                        bool _1859 = tile_1.tile.offset == 0u;
+                        bool _1865;
+                        if (_1859)
                         {
-                            _2328 = tile_1.backdrop == 0;
+                            _1865 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2328 = _2322;
+                            _1865 = _1859;
                         }
-                        if (_2328)
+                        if (_1865)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_86 = cmd_alloc;
-                            CmdRef param_87 = cmd_ref;
-                            uint param_88 = cmd_limit;
-                            bool _2340 = alloc_cmd(param_86, param_87, param_88, v_308, v_308BufferSize);
-                            cmd_alloc = param_86;
-                            cmd_ref = param_87;
-                            cmd_limit = param_88;
-                            if (!_2340)
+                            Alloc param_59 = cmd_alloc;
+                            CmdRef param_60 = cmd_ref;
+                            uint param_61 = cmd_limit;
+                            bool _1877 = alloc_cmd(param_59, param_60, param_61, v_242, v_242BufferSize);
+                            cmd_alloc = param_59;
+                            cmd_ref = param_60;
+                            cmd_limit = param_61;
+                            if (!_1877)
                             {
                                 break;
                             }
-                            Alloc param_89 = cmd_alloc;
-                            CmdRef param_90 = cmd_ref;
-                            Cmd_BeginClip_write(param_89, param_90, v_308, v_308BufferSize);
+                            Alloc param_62 = cmd_alloc;
+                            CmdRef param_63 = cmd_ref;
+                            Cmd_BeginClip_write(param_62, param_63, v_242, v_242BufferSize);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
                         break;
                     }
-                    case 5u:
+                    case 37u:
                     {
-                        uint param_91 = element_ref_ix;
-                        bool param_92 = mem_ok;
-                        Alloc param_93 = read_tile_alloc(param_91, param_92, v_308, v_308BufferSize);
-                        TileRef param_94 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_93, param_94, v_308, v_308BufferSize);
-                        param_95.offset = _1283.conf.anno_alloc.offset;
-                        AnnotatedRef param_96 = ref;
-                        AnnoEndClip end_clip = Annotated_EndClip_read(param_95, param_96, v_308, v_308BufferSize);
                         clip_depth--;
-                        Alloc param_97 = cmd_alloc;
-                        CmdRef param_98 = cmd_ref;
-                        uint param_99 = cmd_limit;
-                        bool _2398 = alloc_cmd(param_97, param_98, param_99, v_308, v_308BufferSize);
-                        cmd_alloc = param_97;
-                        cmd_ref = param_98;
-                        cmd_limit = param_99;
-                        if (!_2398)
+                        Alloc param_64 = cmd_alloc;
+                        CmdRef param_65 = cmd_ref;
+                        uint param_66 = cmd_limit;
+                        bool _1905 = alloc_cmd(param_64, param_65, param_66, v_242, v_242BufferSize);
+                        cmd_alloc = param_64;
+                        cmd_ref = param_65;
+                        cmd_limit = param_66;
+                        if (!_1905)
                         {
                             break;
                         }
-                        Alloc param_100 = cmd_alloc;
-                        CmdRef param_101 = cmd_ref;
-                        uint param_102 = 0u;
-                        Tile param_103 = tile_1;
-                        float param_104 = 0.0;
-                        write_fill(param_100, param_101, param_102, param_103, param_104, v_308, v_308BufferSize);
-                        cmd_ref = param_101;
-                        Alloc param_105 = cmd_alloc;
-                        CmdRef param_106 = cmd_ref;
-                        CmdEndClip param_107 = CmdEndClip{ end_clip.blend };
-                        Cmd_EndClip_write(param_105, param_106, param_107, v_308, v_308BufferSize);
+                        Alloc param_67 = cmd_alloc;
+                        CmdRef param_68 = cmd_ref;
+                        Tile param_69 = tile_1;
+                        float param_70 = -1.0;
+                        write_fill(param_67, param_68, param_69, param_70, v_242, v_242BufferSize);
+                        cmd_ref = param_68;
+                        uint blend_1 = _1222.scene[dd_1];
+                        Alloc param_71 = cmd_alloc;
+                        CmdRef param_72 = cmd_ref;
+                        CmdEndClip param_73 = CmdEndClip{ blend_1 };
+                        Cmd_EndClip_write(param_71, param_72, param_73, v_242, v_242BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1352,14 +1077,14 @@
             }
             else
             {
-                switch (tag_2.tag)
+                switch (drawtag)
                 {
-                    case 4u:
+                    case 5u:
                     {
                         clip_depth++;
                         break;
                     }
-                    case 5u:
+                    case 37u:
                     {
                         if (clip_depth == clip_zero_depth)
                         {
@@ -1378,21 +1103,21 @@
             break;
         }
     }
-    bool _2467 = (bin_tile_x + tile_x) < _1283.conf.width_in_tiles;
-    bool _2476;
-    if (_2467)
+    bool _1975 = (bin_tile_x + tile_x) < _854.conf.width_in_tiles;
+    bool _1984;
+    if (_1975)
     {
-        _2476 = (bin_tile_y + tile_y) < _1283.conf.height_in_tiles;
+        _1984 = (bin_tile_y + tile_y) < _854.conf.height_in_tiles;
     }
     else
     {
-        _2476 = _2467;
+        _1984 = _1975;
     }
-    if (_2476)
+    if (_1984)
     {
-        Alloc param_108 = cmd_alloc;
-        CmdRef param_109 = cmd_ref;
-        Cmd_End_write(param_108, param_109, v_308, v_308BufferSize);
+        Alloc param_74 = cmd_alloc;
+        CmdRef param_75 = cmd_ref;
+        Cmd_End_write(param_74, param_75, v_242, v_242BufferSize);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index 1fef2d7..b85fd8c 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index d1567c9..77396c1 100644
--- a/piet-gpu/shader/gen/draw_leaf.dxil
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
index 1f2f78b..f812f52 100644
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -1,133 +1,12 @@
-struct Alloc
-{
-    uint offset;
-};
-
-struct ElementRef
-{
-    uint offset;
-};
-
-struct FillColorRef
-{
-    uint offset;
-};
-
-struct FillColor
-{
-    uint rgba_color;
-};
-
-struct FillLinGradientRef
-{
-    uint offset;
-};
-
-struct FillLinGradient
-{
-    uint index;
-    float2 p0;
-    float2 p1;
-};
-
-struct FillImageRef
-{
-    uint offset;
-};
-
-struct FillImage
-{
-    uint index;
-    int2 offset;
-};
-
-struct ClipRef
-{
-    uint offset;
-};
-
-struct Clip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct ElementTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct DrawMonoid
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
-struct AnnoImageRef
-{
-    uint offset;
-};
-
-struct AnnoImage
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    int2 offset;
-};
-
-struct AnnoColorRef
-{
-    uint offset;
-};
-
-struct AnnoColor
-{
-    float4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-struct AnnoLinGradientRef
-{
-    uint offset;
-};
-
-struct AnnoLinGradient
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    float line_x;
-    float line_y;
-    float line_c;
-};
-
-struct AnnoBeginClipRef
-{
-    uint offset;
-};
-
-struct AnnoBeginClip
-{
-    float4 bbox;
-    float linewidth;
-    uint blend;
-};
-
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
+struct Alloc
 {
     uint offset;
 };
@@ -144,12 +23,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -157,18 +38,18 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-static const DrawMonoid _413 = { 0u, 0u };
-static const DrawMonoid _437 = { 1u, 0u };
-static const DrawMonoid _439 = { 1u, 1u };
+static const DrawMonoid _23 = { 0u, 0u, 0u, 0u };
 
-RWByteAddressBuffer _199 : register(u0, space0);
-ByteAddressBuffer _223 : register(t2, space0);
-ByteAddressBuffer _1020 : register(t3, space0);
-ByteAddressBuffer _1054 : register(t1, space0);
+ByteAddressBuffer _92 : register(t1, space0);
+ByteAddressBuffer _102 : register(t2, space0);
+ByteAddressBuffer _202 : register(t3, space0);
+RWByteAddressBuffer _284 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -182,389 +63,44 @@
 
 groupshared DrawMonoid sh_scratch[256];
 
-ElementTag Element_tag(ElementRef ref)
-{
-    uint tag_and_flags = _223.Load((ref.offset >> uint(2)) * 4 + 0);
-    ElementTag _378 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _378;
-}
-
 DrawMonoid map_tag(uint tag_word)
 {
-    switch (tag_word)
-    {
-        case 4u:
-        case 5u:
-        case 6u:
-        {
-            return _437;
-        }
-        case 9u:
-        case 10u:
-        {
-            return _439;
-        }
-        default:
-        {
-            return _413;
-        }
-    }
+    uint has_path = uint(tag_word != 0u);
+    DrawMonoid _75 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return _75;
 }
 
-ElementRef Element_index(ElementRef ref, uint index)
-{
-    ElementRef _212 = { ref.offset + (index * 36u) };
-    return _212;
-}
-
-DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
-DrawMonoid tag_monoid_identity()
+DrawMonoid draw_monoid_identity()
 {
-    return _413;
-}
-
-FillColor FillColor_read(FillColorRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
-    FillColor s;
-    s.rgba_color = raw0;
-    return s;
-}
-
-FillColor Element_FillColor_read(ElementRef ref)
-{
-    FillColorRef _384 = { ref.offset + 4u };
-    FillColorRef param = _384;
-    return FillColor_read(param);
-}
-
-bool touch_mem(Alloc alloc, uint offset)
-{
-    return true;
-}
-
-void write_mem(Alloc alloc, uint offset, uint val)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return;
-    }
-    _199.Store(offset * 4 + 8, val);
-}
-
-void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = asuint(s.linewidth);
-    write_mem(param_12, param_13, param_14);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.rgba_color;
-    write_mem(param_15, param_16, param_17);
-}
-
-void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 1u;
-    write_mem(param, param_1, param_2);
-    AnnoColorRef _818 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoColorRef param_4 = _818;
-    AnnoColor param_5 = s;
-    AnnoColor_write(param_3, param_4, param_5);
-}
-
-FillLinGradient FillLinGradient_read(FillLinGradientRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
-    uint raw2 = _223.Load((ix + 2u) * 4 + 0);
-    uint raw3 = _223.Load((ix + 3u) * 4 + 0);
-    uint raw4 = _223.Load((ix + 4u) * 4 + 0);
-    FillLinGradient s;
-    s.index = raw0;
-    s.p0 = float2(asfloat(raw1), asfloat(raw2));
-    s.p1 = float2(asfloat(raw3), asfloat(raw4));
-    return s;
-}
-
-FillLinGradient Element_FillLinGradient_read(ElementRef ref)
-{
-    FillLinGradientRef _392 = { ref.offset + 4u };
-    FillLinGradientRef param = _392;
-    return FillLinGradient_read(param);
-}
-
-void AnnoLinGradient_write(Alloc a, AnnoLinGradientRef ref, AnnoLinGradient s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = asuint(s.linewidth);
-    write_mem(param_12, param_13, param_14);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17);
-    Alloc param_18 = a;
-    uint param_19 = ix + 6u;
-    uint param_20 = asuint(s.line_x);
-    write_mem(param_18, param_19, param_20);
-    Alloc param_21 = a;
-    uint param_22 = ix + 7u;
-    uint param_23 = asuint(s.line_y);
-    write_mem(param_21, param_22, param_23);
-    Alloc param_24 = a;
-    uint param_25 = ix + 8u;
-    uint param_26 = asuint(s.line_c);
-    write_mem(param_24, param_25, param_26);
-}
-
-void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinGradient s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 2u;
-    write_mem(param, param_1, param_2);
-    AnnoLinGradientRef _839 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoLinGradientRef param_4 = _839;
-    AnnoLinGradient param_5 = s;
-    AnnoLinGradient_write(param_3, param_4, param_5);
-}
-
-FillImage FillImage_read(FillImageRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
-    FillImage s;
-    s.index = raw0;
-    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-    return s;
-}
-
-FillImage Element_FillImage_read(ElementRef ref)
-{
-    FillImageRef _400 = { ref.offset + 4u };
-    FillImageRef param = _400;
-    return FillImage_read(param);
-}
-
-void AnnoImage_write(Alloc a, AnnoImageRef ref, AnnoImage s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = asuint(s.linewidth);
-    write_mem(param_12, param_13, param_14);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17);
-    Alloc param_18 = a;
-    uint param_19 = ix + 6u;
-    uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_18, param_19, param_20);
-}
-
-void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 3u;
-    write_mem(param, param_1, param_2);
-    AnnoImageRef _860 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoImageRef param_4 = _860;
-    AnnoImage param_5 = s;
-    AnnoImage_write(param_3, param_4, param_5);
-}
-
-Clip Clip_read(ClipRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
-    uint raw2 = _223.Load((ix + 2u) * 4 + 0);
-    uint raw3 = _223.Load((ix + 3u) * 4 + 0);
-    Clip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.blend = _223.Load((ix + 4u) * 4 + 0);
-    return s;
-}
-
-Clip Element_BeginClip_read(ElementRef ref)
-{
-    ClipRef _408 = { ref.offset + 4u };
-    ClipRef param = _408;
-    return Clip_read(param);
-}
-
-void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = asuint(s.linewidth);
-    write_mem(param_12, param_13, param_14);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.blend;
-    write_mem(param_15, param_16, param_17);
-}
-
-void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginClip s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 4u;
-    write_mem(param, param_1, param_2);
-    AnnoBeginClipRef _881 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoBeginClipRef param_4 = _881;
-    AnnoBeginClip param_5 = s;
-    AnnoBeginClip_write(param_3, param_4, param_5);
-}
-
-void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = asuint(s.bbox.x);
-    write_mem(param, param_1, param_2);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = asuint(s.bbox.y);
-    write_mem(param_3, param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = asuint(s.bbox.z);
-    write_mem(param_6, param_7, param_8);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = asuint(s.bbox.w);
-    write_mem(param_9, param_10, param_11);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = s.blend;
-    write_mem(param_12, param_13, param_14);
-}
-
-void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoEndClip s)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 5u;
-    write_mem(param, param_1, param_2);
-    AnnoEndClipRef _902 = { ref.offset + 4u };
-    Alloc param_3 = a;
-    AnnoEndClipRef param_4 = _902;
-    AnnoEndClip param_5 = s;
-    AnnoEndClip_write(param_3, param_4, param_5);
+    return _23;
 }
 
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef _920 = { ix * 36u };
-    ElementRef ref = _920;
-    ElementRef param = ref;
-    uint tag_word = Element_tag(param).tag;
-    uint param_1 = tag_word;
-    DrawMonoid agg = map_tag(param_1);
+    uint drawtag_base = _92.Load(100) >> uint(2);
+    uint tag_word = _102.Load((drawtag_base + ix) * 4 + 0);
+    uint param = tag_word;
+    DrawMonoid agg = map_tag(param);
     DrawMonoid local[8];
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
     {
-        ElementRef param_2 = ref;
-        uint param_3 = i;
-        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4).tag;
-        uint param_5 = tag_word;
-        DrawMonoid param_6 = agg;
-        DrawMonoid param_7 = map_tag(param_5);
-        agg = combine_tag_monoid(param_6, param_7);
+        tag_word = _102.Load(((drawtag_base + ix) + i) * 4 + 0);
+        uint param_1 = tag_word;
+        DrawMonoid param_2 = agg;
+        DrawMonoid param_3 = map_tag(param_1);
+        agg = combine_draw_monoid(param_2, param_3);
         local[i] = agg;
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -574,194 +110,121 @@
         if (gl_LocalInvocationID.x >= (1u << i_1))
         {
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
-            DrawMonoid param_8 = other;
-            DrawMonoid param_9 = agg;
-            agg = combine_tag_monoid(param_8, param_9);
+            DrawMonoid param_4 = other;
+            DrawMonoid param_5 = agg;
+            agg = combine_draw_monoid(param_4, param_5);
         }
         GroupMemoryBarrierWithGroupSync();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     GroupMemoryBarrierWithGroupSync();
-    DrawMonoid row = tag_monoid_identity();
+    DrawMonoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        DrawMonoid _1026;
-        _1026.path_ix = _1020.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
-        _1026.clip_ix = _1020.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
-        row.path_ix = _1026.path_ix;
-        row.clip_ix = _1026.clip_ix;
+        DrawMonoid _208;
+        _208.path_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
+        _208.clip_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
+        _208.scene_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
+        _208.info_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
+        row.path_ix = _208.path_ix;
+        row.clip_ix = _208.clip_ix;
+        row.scene_offset = _208.scene_offset;
+        row.info_offset = _208.info_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
-        DrawMonoid param_10 = row;
-        DrawMonoid param_11 = sh_scratch[gl_LocalInvocationID.x - 1u];
-        row = combine_tag_monoid(param_10, param_11);
+        DrawMonoid param_6 = row;
+        DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_draw_monoid(param_6, param_7);
     }
+    uint drawdata_base = _92.Load(104) >> uint(2);
+    uint drawinfo_base = _92.Load(68) >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_1054.Load(44) >> uint(2)) + (out_ix * 2u);
-    uint clip_out_base = _1054.Load(48) >> uint(2);
-    AnnotatedRef _1075 = { _1054.Load(32) + (out_ix * 40u) };
-    AnnotatedRef out_ref = _1075;
+    uint out_base = (_92.Load(44) >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _92.Load(48) >> uint(2);
     float4 mat;
     float2 translate;
-    AnnoColor anno_fill;
-    Alloc param_18;
-    AnnoLinGradient anno_lin;
-    Alloc param_23;
-    AnnoImage anno_img;
-    Alloc param_28;
-    AnnoBeginClip anno_begin_clip;
-    Alloc param_33;
-    AnnoEndClip anno_end_clip;
-    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
         if (i_2 > 0u)
         {
-            DrawMonoid param_12 = m;
-            DrawMonoid param_13 = local[i_2 - 1u];
-            m = combine_tag_monoid(param_12, param_13);
+            DrawMonoid param_8 = m;
+            DrawMonoid param_9 = local[i_2 - 1u];
+            m = combine_draw_monoid(param_8, param_9);
         }
-        _199.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
-        _199.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
-        ElementRef param_14 = ref;
-        uint param_15 = i_2;
-        ElementRef this_ref = Element_index(param_14, param_15);
-        ElementRef param_16 = this_ref;
-        tag_word = Element_tag(param_16).tag;
-        if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
+        _284.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
+        _284.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
+        _284.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
+        _284.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
+        uint dd = drawdata_base + (m.scene_offset >> uint(2));
+        uint di = drawinfo_base + (m.info_offset >> uint(2));
+        tag_word = _102.Load(((drawtag_base + ix) + i_2) * 4 + 0);
+        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_1054.Load(40) >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_199.Load(bbox_offset * 4 + 8)) - 32768.0f;
-            float bbox_t = float(_199.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
-            float bbox_r = float(_199.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
-            float bbox_b = float(_199.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            uint bbox_offset = (_92.Load(40) >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_284.Load(bbox_offset * 4 + 8)) - 32768.0f;
+            float bbox_t = float(_284.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
+            float bbox_r = float(_284.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
+            float bbox_b = float(_284.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = asfloat(_199.Load((bbox_offset + 4u) * 4 + 8));
+            float linewidth = asfloat(_284.Load((bbox_offset + 4u) * 4 + 8));
             uint fill_mode = uint(linewidth >= 0.0f);
-            if ((linewidth >= 0.0f) || (tag_word == 5u))
+            if ((linewidth >= 0.0f) || (tag_word == 276u))
             {
-                uint trans_ix = _199.Load((bbox_offset + 5u) * 4 + 8);
-                uint t = (_1054.Load(36) >> uint(2)) + (6u * trans_ix);
-                mat = asfloat(uint4(_199.Load(t * 4 + 8), _199.Load((t + 1u) * 4 + 8), _199.Load((t + 2u) * 4 + 8), _199.Load((t + 3u) * 4 + 8)));
-                if (tag_word == 5u)
+                uint trans_ix = _284.Load((bbox_offset + 5u) * 4 + 8);
+                uint t = (_92.Load(36) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_284.Load(t * 4 + 8), _284.Load((t + 1u) * 4 + 8), _284.Load((t + 2u) * 4 + 8), _284.Load((t + 3u) * 4 + 8)));
+                if (tag_word == 276u)
                 {
-                    translate = asfloat(uint2(_199.Load((t + 4u) * 4 + 8), _199.Load((t + 5u) * 4 + 8)));
+                    translate = asfloat(uint2(_284.Load((t + 4u) * 4 + 8), _284.Load((t + 5u) * 4 + 8)));
                 }
             }
             if (linewidth >= 0.0f)
             {
                 linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z)));
             }
-            linewidth = max(linewidth, 0.0f);
             switch (tag_word)
             {
-                case 4u:
+                case 68u:
+                case 72u:
                 {
-                    ElementRef param_17 = this_ref;
-                    FillColor fill = Element_FillColor_read(param_17);
-                    anno_fill.bbox = bbox;
-                    anno_fill.linewidth = linewidth;
-                    anno_fill.rgba_color = fill.rgba_color;
-                    Alloc _1288;
-                    _1288.offset = _1054.Load(32);
-                    param_18.offset = _1288.offset;
-                    AnnotatedRef param_19 = out_ref;
-                    uint param_20 = fill_mode;
-                    AnnoColor param_21 = anno_fill;
-                    Annotated_Color_write(param_18, param_19, param_20, param_21);
+                    _284.Store(di * 4 + 8, asuint(linewidth));
                     break;
                 }
-                case 5u:
+                case 276u:
                 {
-                    ElementRef param_22 = this_ref;
-                    FillLinGradient lin = Element_FillLinGradient_read(param_22);
-                    anno_lin.bbox = bbox;
-                    anno_lin.linewidth = linewidth;
-                    anno_lin.index = lin.index;
-                    float2 p0 = ((mat.xy * lin.p0.x) + (mat.zw * lin.p0.y)) + translate;
-                    float2 p1 = ((mat.xy * lin.p1.x) + (mat.zw * lin.p1.y)) + translate;
+                    _284.Store(di * 4 + 8, asuint(linewidth));
+                    uint index = _102.Load(dd * 4 + 0);
+                    float2 p0 = asfloat(uint2(_102.Load((dd + 1u) * 4 + 0), _102.Load((dd + 2u) * 4 + 0)));
+                    float2 p1 = asfloat(uint2(_102.Load((dd + 3u) * 4 + 0), _102.Load((dd + 4u) * 4 + 0)));
+                    p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
+                    p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                     float2 dxy = p1 - p0;
                     float scale = 1.0f / ((dxy.x * dxy.x) + (dxy.y * dxy.y));
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
-                    anno_lin.line_x = line_x;
-                    anno_lin.line_y = line_y;
-                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    Alloc _1384;
-                    _1384.offset = _1054.Load(32);
-                    param_23.offset = _1384.offset;
-                    AnnotatedRef param_24 = out_ref;
-                    uint param_25 = fill_mode;
-                    AnnoLinGradient param_26 = anno_lin;
-                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26);
+                    float line_c = -((p0.x * line_x) + (p0.y * line_y));
+                    _284.Store((di + 1u) * 4 + 8, asuint(line_x));
+                    _284.Store((di + 2u) * 4 + 8, asuint(line_y));
+                    _284.Store((di + 3u) * 4 + 8, asuint(line_c));
                     break;
                 }
-                case 6u:
+                case 5u:
                 {
-                    ElementRef param_27 = this_ref;
-                    FillImage fill_img = Element_FillImage_read(param_27);
-                    anno_img.bbox = bbox;
-                    anno_img.linewidth = linewidth;
-                    anno_img.index = fill_img.index;
-                    anno_img.offset = fill_img.offset;
-                    Alloc _1412;
-                    _1412.offset = _1054.Load(32);
-                    param_28.offset = _1412.offset;
-                    AnnotatedRef param_29 = out_ref;
-                    uint param_30 = fill_mode;
-                    AnnoImage param_31 = anno_img;
-                    Annotated_Image_write(param_28, param_29, param_30, param_31);
-                    break;
-                }
-                case 9u:
-                {
-                    ElementRef param_32 = this_ref;
-                    Clip begin_clip = Element_BeginClip_read(param_32);
-                    anno_begin_clip.bbox = bbox;
-                    anno_begin_clip.linewidth = 0.0f;
-                    anno_begin_clip.blend = begin_clip.blend;
-                    uint flags = uint(begin_clip.blend != 3u) << uint(1);
-                    Alloc _1442;
-                    _1442.offset = _1054.Load(32);
-                    param_33.offset = _1442.offset;
-                    AnnotatedRef param_34 = out_ref;
-                    uint param_35 = flags;
-                    AnnoBeginClip param_36 = anno_begin_clip;
-                    Annotated_BeginClip_write(param_33, param_34, param_35, param_36);
                     break;
                 }
             }
         }
-        else
-        {
-            if (tag_word == 10u)
-            {
-                ElementRef param_37 = this_ref;
-                Clip end_clip = Element_BeginClip_read(param_37);
-                anno_end_clip.bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
-                anno_end_clip.blend = end_clip.blend;
-                uint flags_1 = uint(end_clip.blend != 3u) << uint(1);
-                Alloc _1480;
-                _1480.offset = _1054.Load(32);
-                param_38.offset = _1480.offset;
-                AnnotatedRef param_39 = out_ref;
-                uint param_40 = flags_1;
-                AnnoEndClip param_41 = anno_end_clip;
-                Annotated_EndClip_write(param_38, param_39, param_40, param_41);
-            }
-        }
-        if ((tag_word == 9u) || (tag_word == 10u))
+        if ((tag_word == 5u) || (tag_word == 37u))
         {
             uint path_ix = ~(out_ix + i_2);
-            if (tag_word == 9u)
+            if (tag_word == 5u)
             {
                 path_ix = m.path_ix;
             }
-            _199.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
+            _284.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
         }
-        out_ref.offset += 40u;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
index 5b9ecc6..a8516ae 100644
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -44,145 +44,53 @@
     }
 };
 
+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
+};
+
 struct Alloc
 {
     uint offset;
 };
 
-struct ElementRef
+struct Config
 {
-    uint offset;
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc path_bbox_alloc;
+    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
+    uint n_trans;
+    uint n_path;
+    uint n_clip;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
-struct FillColorRef
+struct ConfigBuf
 {
-    uint offset;
-};
-
-struct FillColor
-{
-    uint rgba_color;
-};
-
-struct FillLinGradientRef
-{
-    uint offset;
-};
-
-struct FillLinGradient
-{
-    uint index;
-    float2 p0;
-    float2 p1;
-};
-
-struct FillImageRef
-{
-    uint offset;
-};
-
-struct FillImage
-{
-    uint index;
-    int2 offset;
-};
-
-struct ClipRef
-{
-    uint offset;
-};
-
-struct Clip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct ElementTag
-{
-    uint tag;
-    uint flags;
-};
-
-struct DrawMonoid
-{
-    uint path_ix;
-    uint clip_ix;
-};
-
-struct AnnoImageRef
-{
-    uint offset;
-};
-
-struct AnnoImage
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    int2 offset;
-};
-
-struct AnnoColorRef
-{
-    uint offset;
-};
-
-struct AnnoColor
-{
-    float4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-struct AnnoLinGradientRef
-{
-    uint offset;
-};
-
-struct AnnoLinGradient
-{
-    float4 bbox;
-    float linewidth;
-    uint index;
-    float line_x;
-    float line_y;
-    float line_c;
-};
-
-struct AnnoBeginClipRef
-{
-    uint offset;
-};
-
-struct AnnoBeginClip
-{
-    float4 bbox;
-    float linewidth;
-    uint blend;
-};
-
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct Memory
-{
-    uint mem_offset;
-    uint mem_error;
-    uint memory[1];
+    Config conf;
 };
 
 struct SceneBuf
@@ -194,6 +102,8 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct ParentBuf
@@ -201,442 +111,56 @@
     DrawMonoid_1 parent[1];
 };
 
-struct Alloc_1
+struct Memory
 {
-    uint offset;
-};
-
-struct Config
-{
-    uint n_elements;
-    uint n_pathseg;
-    uint width_in_tiles;
-    uint height_in_tiles;
-    Alloc_1 tile_alloc;
-    Alloc_1 bin_alloc;
-    Alloc_1 ptcl_alloc;
-    Alloc_1 pathseg_alloc;
-    Alloc_1 anno_alloc;
-    Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
-    Alloc_1 drawmonoid_alloc;
-    Alloc_1 clip_alloc;
-    Alloc_1 clip_bic_alloc;
-    Alloc_1 clip_stack_alloc;
-    Alloc_1 clip_bbox_alloc;
-    uint n_trans;
-    uint n_path;
-    uint n_clip;
-    uint trans_offset;
-    uint linewidth_offset;
-    uint pathtag_offset;
-    uint pathseg_offset;
-};
-
-struct ConfigBuf
-{
-    Config conf;
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
 };
 
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    uint tag_and_flags = v_223.scene[ref.offset >> uint(2)];
-    return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
 DrawMonoid map_tag(thread const uint& tag_word)
 {
-    switch (tag_word)
-    {
-        case 4u:
-        case 5u:
-        case 6u:
-        {
-            return DrawMonoid{ 1u, 0u };
-        }
-        case 9u:
-        case 10u:
-        {
-            return DrawMonoid{ 1u, 1u };
-        }
-        default:
-        {
-            return DrawMonoid{ 0u, 0u };
-        }
-    }
+    uint has_path = uint(tag_word != 0u);
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
 }
 
 static inline __attribute__((always_inline))
-ElementRef Element_index(thread const ElementRef& ref, thread const uint& index)
-{
-    return ElementRef{ ref.offset + (index * 36u) };
-}
-
-static inline __attribute__((always_inline))
-DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
 static inline __attribute__((always_inline))
-DrawMonoid tag_monoid_identity()
+DrawMonoid draw_monoid_identity()
 {
-    return DrawMonoid{ 0u, 0u };
+    return DrawMonoid{ 0u, 0u, 0u, 0u };
 }
 
-static inline __attribute__((always_inline))
-FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_223)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_223.scene[ix + 0u];
-    FillColor s;
-    s.rgba_color = raw0;
-    return s;
-}
-
-static inline __attribute__((always_inline))
-FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    FillColorRef param = FillColorRef{ ref.offset + 4u };
-    return FillColor_read(param, v_223);
-}
-
-static inline __attribute__((always_inline))
-bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
-{
-    return true;
-}
-
-static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_199)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return;
-    }
-    v_199.memory[offset] = val;
-}
-
-static inline __attribute__((always_inline))
-void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_199);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.rgba_color;
-    write_mem(param_15, param_16, param_17, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 1u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoColorRef param_4 = AnnoColorRef{ ref.offset + 4u };
-    AnnoColor param_5 = s;
-    AnnoColor_write(param_3, param_4, param_5, v_199);
-}
-
-static inline __attribute__((always_inline))
-FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_223)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_223.scene[ix + 0u];
-    uint raw1 = v_223.scene[ix + 1u];
-    uint raw2 = v_223.scene[ix + 2u];
-    uint raw3 = v_223.scene[ix + 3u];
-    uint raw4 = v_223.scene[ix + 4u];
-    FillLinGradient s;
-    s.index = raw0;
-    s.p0 = float2(as_type<float>(raw1), as_type<float>(raw2));
-    s.p1 = float2(as_type<float>(raw3), as_type<float>(raw4));
-    return s;
-}
-
-static inline __attribute__((always_inline))
-FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    FillLinGradientRef param = FillLinGradientRef{ ref.offset + 4u };
-    return FillLinGradient_read(param, v_223);
-}
-
-static inline __attribute__((always_inline))
-void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_199);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_199);
-    Alloc param_18 = a;
-    uint param_19 = ix + 6u;
-    uint param_20 = as_type<uint>(s.line_x);
-    write_mem(param_18, param_19, param_20, v_199);
-    Alloc param_21 = a;
-    uint param_22 = ix + 7u;
-    uint param_23 = as_type<uint>(s.line_y);
-    write_mem(param_21, param_22, param_23, v_199);
-    Alloc param_24 = a;
-    uint param_25 = ix + 8u;
-    uint param_26 = as_type<uint>(s.line_c);
-    write_mem(param_24, param_25, param_26, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 2u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoLinGradientRef param_4 = AnnoLinGradientRef{ ref.offset + 4u };
-    AnnoLinGradient param_5 = s;
-    AnnoLinGradient_write(param_3, param_4, param_5, v_199);
-}
-
-static inline __attribute__((always_inline))
-FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_223)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_223.scene[ix + 0u];
-    uint raw1 = v_223.scene[ix + 1u];
-    FillImage s;
-    s.index = raw0;
-    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-    return s;
-}
-
-static inline __attribute__((always_inline))
-FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    FillImageRef param = FillImageRef{ ref.offset + 4u };
-    return FillImage_read(param, v_223);
-}
-
-static inline __attribute__((always_inline))
-void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_199);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_199);
-    Alloc param_18 = a;
-    uint param_19 = ix + 6u;
-    uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_18, param_19, param_20, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 3u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoImageRef param_4 = AnnoImageRef{ ref.offset + 4u };
-    AnnoImage param_5 = s;
-    AnnoImage_write(param_3, param_4, param_5, v_199);
-}
-
-static inline __attribute__((always_inline))
-Clip Clip_read(thread const ClipRef& ref, const device SceneBuf& v_223)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_223.scene[ix + 0u];
-    uint raw1 = v_223.scene[ix + 1u];
-    uint raw2 = v_223.scene[ix + 2u];
-    uint raw3 = v_223.scene[ix + 3u];
-    Clip s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.blend = v_223.scene[ix + 4u];
-    return s;
-}
-
-static inline __attribute__((always_inline))
-Clip Element_BeginClip_read(thread const ElementRef& ref, const device SceneBuf& v_223)
-{
-    ClipRef param = ClipRef{ ref.offset + 4u };
-    return Clip_read(param, v_223);
-}
-
-static inline __attribute__((always_inline))
-void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_199);
-    Alloc param_15 = a;
-    uint param_16 = ix + 5u;
-    uint param_17 = s.blend;
-    write_mem(param_15, param_16, param_17, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 4u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoBeginClipRef param_4 = AnnoBeginClipRef{ ref.offset + 4u };
-    AnnoBeginClip param_5 = s;
-    AnnoBeginClip_write(param_3, param_4, param_5, v_199);
-}
-
-static inline __attribute__((always_inline))
-void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_199)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    uint param_4 = ix + 1u;
-    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_199);
-    Alloc param_6 = a;
-    uint param_7 = ix + 2u;
-    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_199);
-    Alloc param_9 = a;
-    uint param_10 = ix + 3u;
-    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_199);
-    Alloc param_12 = a;
-    uint param_13 = ix + 4u;
-    uint param_14 = s.blend;
-    write_mem(param_12, param_13, param_14, v_199);
-}
-
-static inline __attribute__((always_inline))
-void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoEndClip& s, device Memory& v_199)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = (flags << uint(16)) | 5u;
-    write_mem(param, param_1, param_2, v_199);
-    Alloc param_3 = a;
-    AnnoEndClipRef param_4 = AnnoEndClipRef{ ref.offset + 4u };
-    AnnoEndClip param_5 = s;
-    AnnoEndClip_write(param_3, param_4, param_5, v_199);
-}
-
-kernel void main0(device Memory& v_199 [[buffer(0)]], const device ConfigBuf& _1054 [[buffer(1)]], const device SceneBuf& v_223 [[buffer(2)]], const device ParentBuf& _1020 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92 [[buffer(1)]], const device SceneBuf& _102 [[buffer(2)]], const device ParentBuf& _202 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef ref = ElementRef{ ix * 36u };
-    ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_223).tag;
-    uint param_1 = tag_word;
-    DrawMonoid agg = map_tag(param_1);
+    uint drawtag_base = _92.conf.drawtag_offset >> uint(2);
+    uint tag_word = _102.scene[drawtag_base + ix];
+    uint param = tag_word;
+    DrawMonoid agg = map_tag(param);
     spvUnsafeArray<DrawMonoid, 8> local;
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
     {
-        ElementRef param_2 = ref;
-        uint param_3 = i;
-        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_223).tag;
-        uint param_5 = tag_word;
-        DrawMonoid param_6 = agg;
-        DrawMonoid param_7 = map_tag(param_5);
-        agg = combine_tag_monoid(param_6, param_7);
+        tag_word = _102.scene[(drawtag_base + ix) + i];
+        uint param_1 = tag_word;
+        DrawMonoid param_2 = agg;
+        DrawMonoid param_3 = map_tag(param_1);
+        agg = combine_draw_monoid(param_2, param_3);
         local[i] = agg;
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -646,181 +170,117 @@
         if (gl_LocalInvocationID.x >= (1u << i_1))
         {
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
-            DrawMonoid param_8 = other;
-            DrawMonoid param_9 = agg;
-            agg = combine_tag_monoid(param_8, param_9);
+            DrawMonoid param_4 = other;
+            DrawMonoid param_5 = agg;
+            agg = combine_draw_monoid(param_4, param_5);
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    DrawMonoid row = tag_monoid_identity();
+    DrawMonoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _1023 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _1020.parent[_1023].path_ix;
-        row.clip_ix = _1020.parent[_1023].clip_ix;
+        uint _205 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _202.parent[_205].path_ix;
+        row.clip_ix = _202.parent[_205].clip_ix;
+        row.scene_offset = _202.parent[_205].scene_offset;
+        row.info_offset = _202.parent[_205].info_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
-        DrawMonoid param_10 = row;
-        DrawMonoid param_11 = sh_scratch[gl_LocalInvocationID.x - 1u];
-        row = combine_tag_monoid(param_10, param_11);
+        DrawMonoid param_6 = row;
+        DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_draw_monoid(param_6, param_7);
     }
+    uint drawdata_base = _92.conf.drawdata_offset >> uint(2);
+    uint drawinfo_base = _92.conf.drawinfo_alloc.offset >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_1054.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
-    uint clip_out_base = _1054.conf.clip_alloc.offset >> uint(2);
-    AnnotatedRef out_ref = AnnotatedRef{ _1054.conf.anno_alloc.offset + (out_ix * 40u) };
+    uint out_base = (_92.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _92.conf.clip_alloc.offset >> uint(2);
     float4 mat;
     float2 translate;
-    AnnoColor anno_fill;
-    Alloc param_18;
-    AnnoLinGradient anno_lin;
-    Alloc param_23;
-    AnnoImage anno_img;
-    Alloc param_28;
-    AnnoBeginClip anno_begin_clip;
-    Alloc param_33;
-    AnnoEndClip anno_end_clip;
-    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
         if (i_2 > 0u)
         {
-            DrawMonoid param_12 = m;
-            DrawMonoid param_13 = local[i_2 - 1u];
-            m = combine_tag_monoid(param_12, param_13);
+            DrawMonoid param_8 = m;
+            DrawMonoid param_9 = local[i_2 - 1u];
+            m = combine_draw_monoid(param_8, param_9);
         }
-        v_199.memory[out_base + (i_2 * 2u)] = m.path_ix;
-        v_199.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
-        ElementRef param_14 = ref;
-        uint param_15 = i_2;
-        ElementRef this_ref = Element_index(param_14, param_15);
-        ElementRef param_16 = this_ref;
-        tag_word = Element_tag(param_16, v_223).tag;
-        if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
+        _284.memory[out_base + (i_2 * 4u)] = m.path_ix;
+        _284.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
+        _284.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
+        _284.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
+        uint dd = drawdata_base + (m.scene_offset >> uint(2));
+        uint di = drawinfo_base + (m.info_offset >> uint(2));
+        tag_word = _102.scene[(drawtag_base + ix) + i_2];
+        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_1054.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(v_199.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(v_199.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(v_199.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(v_199.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_92.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_284.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(_284.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(_284.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(_284.memory[bbox_offset + 3u]) - 32768.0;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(v_199.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(_284.memory[bbox_offset + 4u]);
             uint fill_mode = uint(linewidth >= 0.0);
-            if ((linewidth >= 0.0) || (tag_word == 5u))
+            if ((linewidth >= 0.0) || (tag_word == 276u))
             {
-                uint trans_ix = v_199.memory[bbox_offset + 5u];
-                uint t = (_1054.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(v_199.memory[t], v_199.memory[t + 1u], v_199.memory[t + 2u], v_199.memory[t + 3u]));
-                if (tag_word == 5u)
+                uint trans_ix = _284.memory[bbox_offset + 5u];
+                uint t = (_92.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(_284.memory[t], _284.memory[t + 1u], _284.memory[t + 2u], _284.memory[t + 3u]));
+                if (tag_word == 276u)
                 {
-                    translate = as_type<float2>(uint2(v_199.memory[t + 4u], v_199.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(_284.memory[t + 4u], _284.memory[t + 5u]));
                 }
             }
             if (linewidth >= 0.0)
             {
                 linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z)));
             }
-            linewidth = fast::max(linewidth, 0.0);
             switch (tag_word)
             {
-                case 4u:
+                case 68u:
+                case 72u:
                 {
-                    ElementRef param_17 = this_ref;
-                    FillColor fill = Element_FillColor_read(param_17, v_223);
-                    anno_fill.bbox = bbox;
-                    anno_fill.linewidth = linewidth;
-                    anno_fill.rgba_color = fill.rgba_color;
-                    param_18.offset = _1054.conf.anno_alloc.offset;
-                    AnnotatedRef param_19 = out_ref;
-                    uint param_20 = fill_mode;
-                    AnnoColor param_21 = anno_fill;
-                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_199);
+                    _284.memory[di] = as_type<uint>(linewidth);
                     break;
                 }
-                case 5u:
+                case 276u:
                 {
-                    ElementRef param_22 = this_ref;
-                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_223);
-                    anno_lin.bbox = bbox;
-                    anno_lin.linewidth = linewidth;
-                    anno_lin.index = lin.index;
-                    float2 p0 = ((mat.xy * lin.p0.x) + (mat.zw * lin.p0.y)) + translate;
-                    float2 p1 = ((mat.xy * lin.p1.x) + (mat.zw * lin.p1.y)) + translate;
+                    _284.memory[di] = as_type<uint>(linewidth);
+                    uint index = _102.scene[dd];
+                    float2 p0 = as_type<float2>(uint2(_102.scene[dd + 1u], _102.scene[dd + 2u]));
+                    float2 p1 = as_type<float2>(uint2(_102.scene[dd + 3u], _102.scene[dd + 4u]));
+                    p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
+                    p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                     float2 dxy = p1 - p0;
                     float scale = 1.0 / ((dxy.x * dxy.x) + (dxy.y * dxy.y));
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
-                    anno_lin.line_x = line_x;
-                    anno_lin.line_y = line_y;
-                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    param_23.offset = _1054.conf.anno_alloc.offset;
-                    AnnotatedRef param_24 = out_ref;
-                    uint param_25 = fill_mode;
-                    AnnoLinGradient param_26 = anno_lin;
-                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_199);
+                    float line_c = -((p0.x * line_x) + (p0.y * line_y));
+                    _284.memory[di + 1u] = as_type<uint>(line_x);
+                    _284.memory[di + 2u] = as_type<uint>(line_y);
+                    _284.memory[di + 3u] = as_type<uint>(line_c);
                     break;
                 }
-                case 6u:
+                case 5u:
                 {
-                    ElementRef param_27 = this_ref;
-                    FillImage fill_img = Element_FillImage_read(param_27, v_223);
-                    anno_img.bbox = bbox;
-                    anno_img.linewidth = linewidth;
-                    anno_img.index = fill_img.index;
-                    anno_img.offset = fill_img.offset;
-                    param_28.offset = _1054.conf.anno_alloc.offset;
-                    AnnotatedRef param_29 = out_ref;
-                    uint param_30 = fill_mode;
-                    AnnoImage param_31 = anno_img;
-                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_199);
-                    break;
-                }
-                case 9u:
-                {
-                    ElementRef param_32 = this_ref;
-                    Clip begin_clip = Element_BeginClip_read(param_32, v_223);
-                    anno_begin_clip.bbox = bbox;
-                    anno_begin_clip.linewidth = 0.0;
-                    anno_begin_clip.blend = begin_clip.blend;
-                    uint flags = uint(begin_clip.blend != 3u) << uint(1);
-                    param_33.offset = _1054.conf.anno_alloc.offset;
-                    AnnotatedRef param_34 = out_ref;
-                    uint param_35 = flags;
-                    AnnoBeginClip param_36 = anno_begin_clip;
-                    Annotated_BeginClip_write(param_33, param_34, param_35, param_36, v_199);
                     break;
                 }
             }
         }
-        else
-        {
-            if (tag_word == 10u)
-            {
-                ElementRef param_37 = this_ref;
-                Clip end_clip = Element_BeginClip_read(param_37, v_223);
-                anno_end_clip.bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
-                anno_end_clip.blend = end_clip.blend;
-                uint flags_1 = uint(end_clip.blend != 3u) << uint(1);
-                param_38.offset = _1054.conf.anno_alloc.offset;
-                AnnotatedRef param_39 = out_ref;
-                uint param_40 = flags_1;
-                AnnoEndClip param_41 = anno_end_clip;
-                Annotated_EndClip_write(param_38, param_39, param_40, param_41, v_199);
-            }
-        }
-        if ((tag_word == 9u) || (tag_word == 10u))
+        if ((tag_word == 5u) || (tag_word == 37u))
         {
             uint path_ix = ~(out_ix + i_2);
-            if (tag_word == 9u)
+            if (tag_word == 5u)
             {
                 path_ix = m.path_ix;
             }
-            v_199.memory[clip_out_base + m.clip_ix] = path_ix;
+            _284.memory[clip_out_base + m.clip_ix] = path_ix;
         }
-        out_ref.offset += 40u;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
index bdbdb0c..d18b287 100644
--- a/piet-gpu/shader/gen/draw_leaf.spv
+++ b/piet-gpu/shader/gen/draw_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
index 9b1b0fd..4df0ec5 100644
--- a/piet-gpu/shader/gen/draw_reduce.dxil
+++ b/piet-gpu/shader/gen/draw_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
index 0553c9b..7220b7e 100644
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -1,22 +1,11 @@
-struct ElementRef
-{
-    uint offset;
-};
-
-struct ElementTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct DrawMonoid
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
-static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
-
 struct Alloc
 {
     uint offset;
@@ -34,12 +23,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -47,16 +38,16 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
-static const DrawMonoid _87 = { 1u, 0u };
-static const DrawMonoid _89 = { 1u, 1u };
-static const DrawMonoid _91 = { 0u, 0u };
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-ByteAddressBuffer _46 : register(t2, space0);
-RWByteAddressBuffer _199 : register(u3, space0);
-RWByteAddressBuffer _213 : register(u0, space0);
-ByteAddressBuffer _219 : register(t1, space0);
+ByteAddressBuffer _86 : register(t1, space0);
+ByteAddressBuffer _96 : register(t2, space0);
+RWByteAddressBuffer _187 : register(u3, space0);
+RWByteAddressBuffer _205 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -70,68 +61,37 @@
 
 groupshared DrawMonoid sh_scratch[256];
 
-ElementTag Element_tag(ElementRef ref)
-{
-    uint tag_and_flags = _46.Load((ref.offset >> uint(2)) * 4 + 0);
-    ElementTag _60 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _60;
-}
-
 DrawMonoid map_tag(uint tag_word)
 {
-    switch (tag_word)
-    {
-        case 4u:
-        case 5u:
-        case 6u:
-        {
-            return _87;
-        }
-        case 9u:
-        case 10u:
-        {
-            return _89;
-        }
-        default:
-        {
-            return _91;
-        }
-    }
+    uint has_path = uint(tag_word != 0u);
+    DrawMonoid _69 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return _69;
 }
 
-ElementRef Element_index(ElementRef ref, uint index)
-{
-    ElementRef _39 = { ref.offset + (index * 36u) };
-    return _39;
-}
-
-DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef _107 = { ix * 36u };
-    ElementRef ref = _107;
-    ElementRef param = ref;
-    uint tag_word = Element_tag(param).tag;
-    uint param_1 = tag_word;
-    DrawMonoid agg = map_tag(param_1);
+    uint drawtag_base = _86.Load(100) >> uint(2);
+    uint tag_word = _96.Load((drawtag_base + ix) * 4 + 0);
+    uint param = tag_word;
+    DrawMonoid agg = map_tag(param);
     for (uint i = 1u; i < 8u; i++)
     {
-        ElementRef param_2 = ref;
-        uint param_3 = i;
-        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4).tag;
-        uint param_5 = tag_word;
-        DrawMonoid param_6 = agg;
-        DrawMonoid param_7 = map_tag(param_5);
-        agg = combine_tag_monoid(param_6, param_7);
+        uint tag_word_1 = _96.Load(((drawtag_base + ix) + i) * 4 + 0);
+        uint param_1 = tag_word_1;
+        DrawMonoid param_2 = agg;
+        DrawMonoid param_3 = map_tag(param_1);
+        agg = combine_draw_monoid(param_2, param_3);
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
@@ -140,17 +100,19 @@
         if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u)
         {
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
-            DrawMonoid param_8 = agg;
-            DrawMonoid param_9 = other;
-            agg = combine_tag_monoid(param_8, param_9);
+            DrawMonoid param_4 = agg;
+            DrawMonoid param_5 = other;
+            agg = combine_draw_monoid(param_4, param_5);
         }
         GroupMemoryBarrierWithGroupSync();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _199.Store(gl_WorkGroupID.x * 8 + 0, agg.path_ix);
-        _199.Store(gl_WorkGroupID.x * 8 + 4, agg.clip_ix);
+        _187.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix);
+        _187.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
+        _187.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
+        _187.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl
index 064c515..8e409a8 100644
--- a/piet-gpu/shader/gen/draw_reduce.msl
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@@ -5,48 +5,14 @@
 
 using namespace metal;
 
-struct ElementRef
-{
-    uint offset;
-};
-
-struct ElementTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct DrawMonoid
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
-struct SceneBuf
-{
-    uint scene[1];
-};
-
-struct DrawMonoid_1
-{
-    uint path_ix;
-    uint clip_ix;
-};
-
-struct OutBuf
-{
-    DrawMonoid_1 outbuf[1];
-};
-
-struct Memory
-{
-    uint mem_offset;
-    uint mem_error;
-    uint memory[1];
-};
-
-constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
-
 struct Alloc
 {
     uint offset;
@@ -64,12 +30,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -77,6 +45,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -84,70 +54,66 @@
     Config conf;
 };
 
-static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_46)
+struct SceneBuf
 {
-    uint tag_and_flags = v_46.scene[ref.offset >> uint(2)];
-    return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
+    uint scene[1];
+};
+
+struct DrawMonoid_1
+{
+    uint path_ix;
+    uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
+};
+
+struct OutBuf
+{
+    DrawMonoid_1 outbuf[1];
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
 DrawMonoid map_tag(thread const uint& tag_word)
 {
-    switch (tag_word)
-    {
-        case 4u:
-        case 5u:
-        case 6u:
-        {
-            return DrawMonoid{ 1u, 0u };
-        }
-        case 9u:
-        case 10u:
-        {
-            return DrawMonoid{ 1u, 1u };
-        }
-        default:
-        {
-            return DrawMonoid{ 0u, 0u };
-        }
-    }
+    uint has_path = uint(tag_word != 0u);
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
 }
 
 static inline __attribute__((always_inline))
-ElementRef Element_index(thread const ElementRef& ref, thread const uint& index)
-{
-    return ElementRef{ ref.offset + (index * 36u) };
-}
-
-static inline __attribute__((always_inline))
-DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
-kernel void main0(const device SceneBuf& v_46 [[buffer(2)]], device OutBuf& _199 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device ConfigBuf& _86 [[buffer(1)]], const device SceneBuf& _96 [[buffer(2)]], device OutBuf& _187 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef ref = ElementRef{ ix * 36u };
-    ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_46).tag;
-    uint param_1 = tag_word;
-    DrawMonoid agg = map_tag(param_1);
+    uint drawtag_base = _86.conf.drawtag_offset >> uint(2);
+    uint tag_word = _96.scene[drawtag_base + ix];
+    uint param = tag_word;
+    DrawMonoid agg = map_tag(param);
     for (uint i = 1u; i < 8u; i++)
     {
-        ElementRef param_2 = ref;
-        uint param_3 = i;
-        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_46).tag;
-        uint param_5 = tag_word;
-        DrawMonoid param_6 = agg;
-        DrawMonoid param_7 = map_tag(param_5);
-        agg = combine_tag_monoid(param_6, param_7);
+        uint tag_word_1 = _96.scene[(drawtag_base + ix) + i];
+        uint param_1 = tag_word_1;
+        DrawMonoid param_2 = agg;
+        DrawMonoid param_3 = map_tag(param_1);
+        agg = combine_draw_monoid(param_2, param_3);
     }
     sh_scratch[gl_LocalInvocationID.x] = agg;
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
@@ -156,17 +122,19 @@
         if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u)
         {
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
-            DrawMonoid param_8 = agg;
-            DrawMonoid param_9 = other;
-            agg = combine_tag_monoid(param_8, param_9);
+            DrawMonoid param_4 = agg;
+            DrawMonoid param_5 = other;
+            agg = combine_draw_monoid(param_4, param_5);
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _199.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
-        _199.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+        _187.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+        _187.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+        _187.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
+        _187.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv
index a45627d..4daf43a 100644
--- a/piet-gpu/shader/gen/draw_reduce.spv
+++ b/piet-gpu/shader/gen/draw_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil
index a84fd4a..4ea23f7 100644
--- a/piet-gpu/shader/gen/draw_root.dxil
+++ b/piet-gpu/shader/gen/draw_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl
index 56b513f..b4cb7e4 100644
--- a/piet-gpu/shader/gen/draw_root.hlsl
+++ b/piet-gpu/shader/gen/draw_root.hlsl
@@ -2,13 +2,15 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-static const DrawMonoid _18 = { 0u, 0u };
+static const DrawMonoid _18 = { 0u, 0u, 0u, 0u };
 
-RWByteAddressBuffer _57 : register(u0, space0);
+RWByteAddressBuffer _71 : register(u0, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -20,15 +22,17 @@
 
 groupshared DrawMonoid sh_scratch[256];
 
-DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
-DrawMonoid tag_monoid_identity()
+DrawMonoid draw_monoid_identity()
 {
     return _18;
 }
@@ -36,22 +40,30 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    DrawMonoid _61;
-    _61.path_ix = _57.Load(ix * 8 + 0);
-    _61.clip_ix = _57.Load(ix * 8 + 4);
+    DrawMonoid _75;
+    _75.path_ix = _71.Load(ix * 16 + 0);
+    _75.clip_ix = _71.Load(ix * 16 + 4);
+    _75.scene_offset = _71.Load(ix * 16 + 8);
+    _75.info_offset = _71.Load(ix * 16 + 12);
     DrawMonoid local[8];
-    local[0].path_ix = _61.path_ix;
-    local[0].clip_ix = _61.clip_ix;
+    local[0].path_ix = _75.path_ix;
+    local[0].clip_ix = _75.clip_ix;
+    local[0].scene_offset = _75.scene_offset;
+    local[0].info_offset = _75.info_offset;
     DrawMonoid param_1;
     for (uint i = 1u; i < 8u; i++)
     {
         DrawMonoid param = local[i - 1u];
-        DrawMonoid _88;
-        _88.path_ix = _57.Load((ix + i) * 8 + 0);
-        _88.clip_ix = _57.Load((ix + i) * 8 + 4);
-        param_1.path_ix = _88.path_ix;
-        param_1.clip_ix = _88.clip_ix;
-        local[i] = combine_tag_monoid(param, param_1);
+        DrawMonoid _106;
+        _106.path_ix = _71.Load((ix + i) * 16 + 0);
+        _106.clip_ix = _71.Load((ix + i) * 16 + 4);
+        _106.scene_offset = _71.Load((ix + i) * 16 + 8);
+        _106.info_offset = _71.Load((ix + i) * 16 + 12);
+        param_1.path_ix = _106.path_ix;
+        param_1.clip_ix = _106.clip_ix;
+        param_1.scene_offset = _106.scene_offset;
+        param_1.info_offset = _106.info_offset;
+        local[i] = combine_draw_monoid(param, param_1);
     }
     DrawMonoid agg = local[7];
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -63,13 +75,13 @@
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
             DrawMonoid param_2 = other;
             DrawMonoid param_3 = agg;
-            agg = combine_tag_monoid(param_2, param_3);
+            agg = combine_draw_monoid(param_2, param_3);
         }
         GroupMemoryBarrierWithGroupSync();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     GroupMemoryBarrierWithGroupSync();
-    DrawMonoid row = tag_monoid_identity();
+    DrawMonoid row = draw_monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
         row = sh_scratch[gl_LocalInvocationID.x - 1u];
@@ -78,10 +90,12 @@
     {
         DrawMonoid param_4 = row;
         DrawMonoid param_5 = local[i_2];
-        DrawMonoid m = combine_tag_monoid(param_4, param_5);
-        uint _177 = ix + i_2;
-        _57.Store(_177 * 8 + 0, m.path_ix);
-        _57.Store(_177 * 8 + 4, m.clip_ix);
+        DrawMonoid m = combine_draw_monoid(param_4, param_5);
+        uint _199 = ix + i_2;
+        _71.Store(_199 * 16 + 0, m.path_ix);
+        _71.Store(_199 * 16 + 4, m.clip_ix);
+        _71.Store(_199 * 16 + 8, m.scene_offset);
+        _71.Store(_199 * 16 + 12, m.info_offset);
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_root.msl b/piet-gpu/shader/gen/draw_root.msl
index 0d22e4b..9ee8cfe 100644
--- a/piet-gpu/shader/gen/draw_root.msl
+++ b/piet-gpu/shader/gen/draw_root.msl
@@ -48,12 +48,16 @@
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct DrawMonoid_1
 {
     uint path_ix;
     uint clip_ix;
+    uint scene_offset;
+    uint info_offset;
 };
 
 struct DataBuf
@@ -64,35 +68,41 @@
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
 {
     DrawMonoid c;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
     return c;
 }
 
 static inline __attribute__((always_inline))
-DrawMonoid tag_monoid_identity()
+DrawMonoid draw_monoid_identity()
 {
-    return DrawMonoid{ 0u, 0u };
+    return DrawMonoid{ 0u, 0u, 0u, 0u };
 }
 
-kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device DataBuf& _71 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
     spvUnsafeArray<DrawMonoid, 8> local;
-    local[0].path_ix = _57.data[ix].path_ix;
-    local[0].clip_ix = _57.data[ix].clip_ix;
+    local[0].path_ix = _71.data[ix].path_ix;
+    local[0].clip_ix = _71.data[ix].clip_ix;
+    local[0].scene_offset = _71.data[ix].scene_offset;
+    local[0].info_offset = _71.data[ix].info_offset;
     DrawMonoid param_1;
     for (uint i = 1u; i < 8u; i++)
     {
-        uint _82 = ix + i;
+        uint _100 = ix + i;
         DrawMonoid param = local[i - 1u];
-        param_1.path_ix = _57.data[_82].path_ix;
-        param_1.clip_ix = _57.data[_82].clip_ix;
-        local[i] = combine_tag_monoid(param, param_1);
+        param_1.path_ix = _71.data[_100].path_ix;
+        param_1.clip_ix = _71.data[_100].clip_ix;
+        param_1.scene_offset = _71.data[_100].scene_offset;
+        param_1.info_offset = _71.data[_100].info_offset;
+        local[i] = combine_draw_monoid(param, param_1);
     }
     DrawMonoid agg = local[7];
     sh_scratch[gl_LocalInvocationID.x] = agg;
@@ -104,13 +114,13 @@
             DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
             DrawMonoid param_2 = other;
             DrawMonoid param_3 = agg;
-            agg = combine_tag_monoid(param_2, param_3);
+            agg = combine_draw_monoid(param_2, param_3);
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    DrawMonoid row = tag_monoid_identity();
+    DrawMonoid row = draw_monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
         row = sh_scratch[gl_LocalInvocationID.x - 1u];
@@ -119,10 +129,12 @@
     {
         DrawMonoid param_4 = row;
         DrawMonoid param_5 = local[i_2];
-        DrawMonoid m = combine_tag_monoid(param_4, param_5);
-        uint _177 = ix + i_2;
-        _57.data[_177].path_ix = m.path_ix;
-        _57.data[_177].clip_ix = m.clip_ix;
+        DrawMonoid m = combine_draw_monoid(param_4, param_5);
+        uint _199 = ix + i_2;
+        _71.data[_199].path_ix = m.path_ix;
+        _71.data[_199].clip_ix = m.clip_ix;
+        _71.data[_199].scene_offset = m.scene_offset;
+        _71.data[_199].info_offset = m.info_offset;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_root.spv b/piet-gpu/shader/gen/draw_root.spv
index 1c11414..e6a53e5 100644
--- a/piet-gpu/shader/gen/draw_root.spv
+++ b/piet-gpu/shader/gen/draw_root.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 21bd083..f17b240 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -125,12 +125,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -138,6 +140,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 9a8fbd0..c1f41af 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -183,12 +183,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -196,6 +198,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index 4d205ce..91272da 100644
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index 7dc2e01..de95771 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -125,12 +125,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -138,6 +140,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 38506dd..5128e99 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -183,12 +183,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -196,6 +198,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 305facd..791b76c 100644
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl
index 59cd7a6..93ee8f0 100644
--- a/piet-gpu/shader/gen/path_coarse.hlsl
+++ b/piet-gpu/shader/gen/path_coarse.hlsl
@@ -84,12 +84,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -97,6 +99,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u);
diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl
index f3cead8..26aa33a 100644
--- a/piet-gpu/shader/gen/path_coarse.msl
+++ b/piet-gpu/shader/gen/path_coarse.msl
@@ -144,12 +144,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -157,6 +159,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv
index 7c452bf..5e6beda 100644
--- a/piet-gpu/shader/gen/path_coarse.spv
+++ b/piet-gpu/shader/gen/path_coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 3895131..6130712 100644
--- a/piet-gpu/shader/gen/pathseg.dxil
+++ b/piet-gpu/shader/gen/pathseg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl
index 0501f6f..578417f 100644
--- a/piet-gpu/shader/gen/pathseg.hlsl
+++ b/piet-gpu/shader/gen/pathseg.hlsl
@@ -62,12 +62,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -75,6 +77,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
@@ -361,7 +365,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 4u;
-    uint tag_word = _574.Load(((_639.Load(84) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
+    uint tag_word = _574.Load(((_639.Load(92) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
     uint param = tag_word;
     TagMonoid local_tm = reduce_tag(param);
     sh_tag[gl_LocalInvocationID.x] = local_tm;
@@ -400,8 +404,8 @@
         TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u];
         tm = combine_tag_monoid(param_3, param_4);
     }
-    uint ps_ix = (_639.Load(88) >> uint(2)) + tm.pathseg_offset;
-    uint lw_ix = (_639.Load(80) >> uint(2)) + tm.linewidth_ix;
+    uint ps_ix = (_639.Load(96) >> uint(2)) + tm.pathseg_offset;
+    uint lw_ix = (_639.Load(88) >> uint(2)) + tm.linewidth_ix;
     uint save_path_ix = tm.path_ix;
     uint trans_ix = tm.trans_ix;
     TransformSegRef _771 = { _639.Load(36) + (trans_ix * 24u) };
diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl
index 0e97d68..9f6328e 100644
--- a/piet-gpu/shader/gen/pathseg.msl
+++ b/piet-gpu/shader/gen/pathseg.msl
@@ -127,12 +127,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -140,6 +142,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -635,7 +639,7 @@
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint path_ix = save_path_ix;
-    uint bbox_out_ix = (_639.conf.bbox_alloc.offset >> uint(2)) + (path_ix * 6u);
+    uint bbox_out_ix = (_639.conf.path_bbox_alloc.offset >> uint(2)) + (path_ix * 6u);
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv
index a1f223c..4e2e9d5 100644
--- a/piet-gpu/shader/gen/pathseg.spv
+++ b/piet-gpu/shader/gen/pathseg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index 89fb562..4c2bd23 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.dxil
+++ b/piet-gpu/shader/gen/pathtag_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl
index 754e6e9..5f7d125 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.hlsl
+++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl
@@ -24,12 +24,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -37,6 +39,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u);
@@ -88,7 +92,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 2u;
-    uint scene_ix = (_139.Load(84) >> uint(2)) + ix;
+    uint scene_ix = (_139.Load(92) >> uint(2)) + ix;
     uint tag_word = _151.Load(scene_ix * 4 + 0);
     uint param = tag_word;
     TagMonoid agg = reduce_tag(param);
diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl
index 83a8208..91e0cca 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.msl
+++ b/piet-gpu/shader/gen/pathtag_reduce.msl
@@ -31,12 +31,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -44,6 +46,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv
index feaad0a..f1d8679 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.spv
+++ b/piet-gpu/shader/gen/pathtag_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
index fdc60a1..7759910 100644
--- a/piet-gpu/shader/gen/tile_alloc.dxil
+++ b/piet-gpu/shader/gen/tile_alloc.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl
index 1c9d04b..73e0a8e 100644
--- a/piet-gpu/shader/gen/tile_alloc.hlsl
+++ b/piet-gpu/shader/gen/tile_alloc.hlsl
@@ -9,28 +9,6 @@
     bool failed;
 };
 
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -59,12 +37,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -72,12 +52,15 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _92 : register(u0, space0);
-ByteAddressBuffer _314 : register(t1, space0);
+RWByteAddressBuffer _70 : register(u0, space0);
+ByteAddressBuffer _181 : register(t1, space0);
+ByteAddressBuffer _257 : register(t2, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -90,62 +73,15 @@
 groupshared uint sh_tile_count[256];
 groupshared MallocResult sh_tile_alloc;
 
-bool touch_mem(Alloc alloc, uint offset)
+float4 load_draw_bbox(uint draw_ix)
 {
-    return true;
-}
-
-uint read_mem(Alloc alloc, uint offset)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return 0u;
-    }
-    uint v = _92.Load(offset * 4 + 8);
-    return v;
-}
-
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _246 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _246;
-}
-
-AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    AnnoEndClip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.blend = raw4;
-    return s;
-}
-
-AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoEndClipRef _252 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoEndClipRef param_1 = _252;
-    return AnnoEndClip_read(param, param_1);
+    uint base = (_181.Load(64) >> uint(2)) + (4u * draw_ix);
+    float x0 = asfloat(_70.Load(base * 4 + 8));
+    float y0 = asfloat(_70.Load((base + 1u) * 4 + 8));
+    float x1 = asfloat(_70.Load((base + 2u) * 4 + 8));
+    float y1 = asfloat(_70.Load((base + 3u) * 4 + 8));
+    float4 bbox = float4(x0, y0, x1, y1);
+    return bbox;
 }
 
 Alloc new_alloc(uint offset, uint size, bool mem_ok)
@@ -157,22 +93,22 @@
 
 MallocResult malloc(uint size)
 {
-    uint _98;
-    _92.InterlockedAdd(0, size, _98);
-    uint offset = _98;
-    uint _105;
-    _92.GetDimensions(_105);
-    _105 = (_105 - 8) / 4;
+    uint _76;
+    _70.InterlockedAdd(0, size, _76);
+    uint offset = _76;
+    uint _83;
+    _70.GetDimensions(_83);
+    _83 = (_83 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_105) * 4);
+    r.failed = (offset + size) > uint(int(_83) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _127;
-        _92.InterlockedMax(4, 1u, _127);
+        uint _105;
+        _70.InterlockedMax(4, 1u, _105);
         return r;
     }
     return r;
@@ -180,8 +116,13 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _169 = { a.offset + offset };
-    return _169;
+    Alloc _131 = { a.offset + offset };
+    return _131;
+}
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
 }
 
 void write_mem(Alloc alloc, uint offset, uint val)
@@ -192,7 +133,7 @@
     {
         return;
     }
-    _92.Store(offset * 4 + 8, val);
+    _70.Store(offset * 4 + 8, val);
 }
 
 void Path_write(Alloc a, PathRef ref, Path s)
@@ -216,56 +157,34 @@
 {
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef _321 = { _314.Load(16) + (element_ix * 12u) };
-    PathRef path_ref = _321;
-    AnnotatedRef _330 = { _314.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _330;
-    uint tag = 0u;
-    if (element_ix < _314.Load(0))
+    PathRef _241 = { _181.Load(16) + (element_ix * 12u) };
+    PathRef path_ref = _241;
+    uint drawtag_base = _181.Load(100) >> uint(2);
+    uint drawtag = 0u;
+    if (element_ix < _181.Load(0))
     {
-        Alloc _341;
-        _341.offset = _314.Load(32);
-        Alloc param;
-        param.offset = _341.offset;
-        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1).tag;
+        drawtag = _257.Load((drawtag_base + element_ix) * 4 + 0);
     }
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    switch (tag)
+    if ((drawtag != 0u) && (drawtag != 37u))
     {
-        case 1u:
-        case 2u:
-        case 3u:
-        case 4u:
-        case 5u:
-        {
-            Alloc _359;
-            _359.offset = _314.Load(32);
-            Alloc param_2;
-            param_2.offset = _359.offset;
-            AnnotatedRef param_3 = ref;
-            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3);
-            x0 = int(floor(clip.bbox.x * 0.0625f));
-            y0 = int(floor(clip.bbox.y * 0.0625f));
-            x1 = int(ceil(clip.bbox.z * 0.0625f));
-            y1 = int(ceil(clip.bbox.w * 0.0625f));
-            break;
-        }
+        uint param = element_ix;
+        float4 bbox = load_draw_bbox(param);
+        x0 = int(floor(bbox.x * 0.0625f));
+        y0 = int(floor(bbox.y * 0.0625f));
+        x1 = int(ceil(bbox.z * 0.0625f));
+        y1 = int(ceil(bbox.w * 0.0625f));
     }
-    x0 = clamp(x0, 0, int(_314.Load(8)));
-    y0 = clamp(y0, 0, int(_314.Load(12)));
-    x1 = clamp(x1, 0, int(_314.Load(8)));
-    y1 = clamp(y1, 0, int(_314.Load(12)));
+    x0 = clamp(x0, 0, int(_181.Load(8)));
+    y0 = clamp(y0, 0, int(_181.Load(12)));
+    x1 = clamp(x1, 0, int(_181.Load(8)));
+    y1 = clamp(y1, 0, int(_181.Load(12)));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
-    if (tag == 5u)
-    {
-        tile_count = 0u;
-    }
     sh_tile_count[th_ix] = tile_count;
     uint total_tile_count = tile_count;
     for (uint i = 0u; i < 8u; i++)
@@ -280,59 +199,59 @@
     }
     if (th_ix == 255u)
     {
-        uint param_4 = total_tile_count * 8u;
-        MallocResult _485 = malloc(param_4);
-        sh_tile_alloc = _485;
+        uint param_1 = total_tile_count * 8u;
+        MallocResult _392 = malloc(param_1);
+        sh_tile_alloc = _392;
     }
     GroupMemoryBarrierWithGroupSync();
     MallocResult alloc_start = sh_tile_alloc;
-    bool _496;
+    bool _403;
     if (!alloc_start.failed)
     {
-        _496 = _92.Load(4) != 0u;
+        _403 = _70.Load(4) != 0u;
     }
     else
     {
-        _496 = alloc_start.failed;
+        _403 = alloc_start.failed;
     }
-    if (_496)
+    if (_403)
     {
         return;
     }
-    if (element_ix < _314.Load(0))
+    if (element_ix < _181.Load(0))
     {
-        uint _509;
+        uint _416;
         if (th_ix > 0u)
         {
-            _509 = sh_tile_count[th_ix - 1u];
+            _416 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _509 = 0u;
+            _416 = 0u;
         }
-        uint tile_subix = _509;
-        Alloc param_5 = alloc_start.alloc;
-        uint param_6 = 8u * tile_subix;
-        uint param_7 = 8u * tile_count;
-        Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
-        TileRef _531 = { tiles_alloc.offset };
-        path.tiles = _531;
-        Alloc _536;
-        _536.offset = _314.Load(16);
-        Alloc param_8;
-        param_8.offset = _536.offset;
-        PathRef param_9 = path_ref;
-        Path param_10 = path;
-        Path_write(param_8, param_9, param_10);
+        uint tile_subix = _416;
+        Alloc param_2 = alloc_start.alloc;
+        uint param_3 = 8u * tile_subix;
+        uint param_4 = 8u * tile_count;
+        Alloc tiles_alloc = slice_mem(param_2, param_3, param_4);
+        TileRef _438 = { tiles_alloc.offset };
+        path.tiles = _438;
+        Alloc _444;
+        _444.offset = _181.Load(16);
+        Alloc param_5;
+        param_5.offset = _444.offset;
+        PathRef param_6 = path_ref;
+        Path param_7 = path;
+        Path_write(param_5, param_6, param_7);
     }
     uint total_count = sh_tile_count[255] * 2u;
     uint start_ix = alloc_start.alloc.offset >> uint(2);
     for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
     {
-        Alloc param_11 = alloc_start.alloc;
-        uint param_12 = start_ix + i_1;
-        uint param_13 = 0u;
-        write_mem(param_11, param_12, param_13);
+        Alloc param_8 = alloc_start.alloc;
+        uint param_9 = start_ix + i_1;
+        uint param_10 = 0u;
+        write_mem(param_8, param_9, param_10);
     }
 }
 
diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl
index c03e830..961be50 100644
--- a/piet-gpu/shader/gen/tile_alloc.msl
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@@ -18,28 +18,6 @@
     bool failed;
 };
 
-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-    uint blend;
-};
-
-struct AnnotatedRef
-{
-    uint offset;
-};
-
-struct AnnotatedTag
-{
-    uint tag;
-    uint flags;
-};
-
 struct PathRef
 {
     uint offset;
@@ -80,12 +58,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -93,6 +73,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
@@ -100,67 +82,23 @@
     Config conf;
 };
 
+struct SceneBuf
+{
+    uint scene[1];
+};
+
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_70, constant uint& v_70BufferSize, const device ConfigBuf& v_181)
 {
-    return true;
-}
-
-static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_92, constant uint& v_92BufferSize)
-{
-    Alloc param = alloc;
-    uint param_1 = offset;
-    if (!touch_mem(param, param_1))
-    {
-        return 0u;
-    }
-    uint v = v_92.memory[offset];
-    return v;
-}
-
-static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
-{
-    Alloc param = a;
-    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_92, v_92BufferSize);
-    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-}
-
-static inline __attribute__((always_inline))
-AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_92, v_92BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_92, v_92BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_92, v_92BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_92, v_92BufferSize);
-    AnnoEndClip s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    s.blend = raw4;
-    return s;
-}
-
-static inline __attribute__((always_inline))
-AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
-{
-    Alloc param = a;
-    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
-    return AnnoEndClip_read(param, param_1, v_92, v_92BufferSize);
+    uint base = (v_181.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
+    float x0 = as_type<float>(v_70.memory[base]);
+    float y0 = as_type<float>(v_70.memory[base + 1u]);
+    float x1 = as_type<float>(v_70.memory[base + 2u]);
+    float y1 = as_type<float>(v_70.memory[base + 3u]);
+    float4 bbox = float4(x0, y0, x1, y1);
+    return bbox;
 }
 
 static inline __attribute__((always_inline))
@@ -172,19 +110,19 @@
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_92, constant uint& v_92BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_70, constant uint& v_70BufferSize)
 {
-    uint _98 = atomic_fetch_add_explicit((device atomic_uint*)&v_92.mem_offset, size, memory_order_relaxed);
-    uint offset = _98;
+    uint _76 = atomic_fetch_add_explicit((device atomic_uint*)&v_70.mem_offset, size, memory_order_relaxed);
+    uint offset = _76;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_92BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_70BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _127 = atomic_fetch_max_explicit((device atomic_uint*)&v_92.mem_error, 1u, memory_order_relaxed);
+        uint _105 = atomic_fetch_max_explicit((device atomic_uint*)&v_70.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
@@ -197,7 +135,13 @@
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_92, constant uint& v_92BufferSize)
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_70, constant uint& v_70BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -205,78 +149,61 @@
     {
         return;
     }
-    v_92.memory[offset] = val;
+    v_70.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_92, constant uint& v_92BufferSize)
+void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_70, constant uint& v_70BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.bbox.x | (s.bbox.y << uint(16));
-    write_mem(param, param_1, param_2, v_92, v_92BufferSize);
+    write_mem(param, param_1, param_2, v_70, v_70BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = s.bbox.z | (s.bbox.w << uint(16));
-    write_mem(param_3, param_4, param_5, v_92, v_92BufferSize);
+    write_mem(param_3, param_4, param_5, v_70, v_70BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = s.tiles.offset;
-    write_mem(param_6, param_7, param_8, v_92, v_92BufferSize);
+    write_mem(param_6, param_7, param_8, v_70, v_70BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _314 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_70 [[buffer(0)]], const device ConfigBuf& v_181 [[buffer(1)]], const device SceneBuf& _257 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
     threadgroup uint sh_tile_count[256];
     threadgroup MallocResult sh_tile_alloc;
-    constant uint& v_92BufferSize = spvBufferSizeConstants[0];
+    constant uint& v_70BufferSize = spvBufferSizeConstants[0];
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef{ _314.conf.tile_alloc.offset + (element_ix * 12u) };
-    AnnotatedRef ref = AnnotatedRef{ _314.conf.anno_alloc.offset + (element_ix * 40u) };
-    uint tag = 0u;
-    if (element_ix < _314.conf.n_elements)
+    PathRef path_ref = PathRef{ v_181.conf.tile_alloc.offset + (element_ix * 12u) };
+    uint drawtag_base = v_181.conf.drawtag_offset >> uint(2);
+    uint drawtag = 0u;
+    if (element_ix < v_181.conf.n_elements)
     {
-        Alloc param;
-        param.offset = _314.conf.anno_alloc.offset;
-        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag;
+        drawtag = _257.scene[drawtag_base + element_ix];
     }
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    switch (tag)
+    if ((drawtag != 0u) && (drawtag != 37u))
     {
-        case 1u:
-        case 2u:
-        case 3u:
-        case 4u:
-        case 5u:
-        {
-            Alloc param_2;
-            param_2.offset = _314.conf.anno_alloc.offset;
-            AnnotatedRef param_3 = ref;
-            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize);
-            x0 = int(floor(clip.bbox.x * 0.0625));
-            y0 = int(floor(clip.bbox.y * 0.0625));
-            x1 = int(ceil(clip.bbox.z * 0.0625));
-            y1 = int(ceil(clip.bbox.w * 0.0625));
-            break;
-        }
+        uint param = element_ix;
+        float4 bbox = load_draw_bbox(param, v_70, v_70BufferSize, v_181);
+        x0 = int(floor(bbox.x * 0.0625));
+        y0 = int(floor(bbox.y * 0.0625));
+        x1 = int(ceil(bbox.z * 0.0625));
+        y1 = int(ceil(bbox.w * 0.0625));
     }
-    x0 = clamp(x0, 0, int(_314.conf.width_in_tiles));
-    y0 = clamp(y0, 0, int(_314.conf.height_in_tiles));
-    x1 = clamp(x1, 0, int(_314.conf.width_in_tiles));
-    y1 = clamp(y1, 0, int(_314.conf.height_in_tiles));
+    x0 = clamp(x0, 0, int(v_181.conf.width_in_tiles));
+    y0 = clamp(y0, 0, int(v_181.conf.height_in_tiles));
+    x1 = clamp(x1, 0, int(v_181.conf.width_in_tiles));
+    y1 = clamp(y1, 0, int(v_181.conf.height_in_tiles));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
-    if (tag == 5u)
-    {
-        tile_count = 0u;
-    }
     sh_tile_count[th_ix] = tile_count;
     uint total_tile_count = tile_count;
     for (uint i = 0u; i < 8u; i++)
@@ -291,56 +218,56 @@
     }
     if (th_ix == 255u)
     {
-        uint param_4 = total_tile_count * 8u;
-        MallocResult _485 = malloc(param_4, v_92, v_92BufferSize);
-        sh_tile_alloc = _485;
+        uint param_1 = total_tile_count * 8u;
+        MallocResult _392 = malloc(param_1, v_70, v_70BufferSize);
+        sh_tile_alloc = _392;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     MallocResult alloc_start = sh_tile_alloc;
-    bool _496;
+    bool _403;
     if (!alloc_start.failed)
     {
-        _496 = v_92.mem_error != 0u;
+        _403 = v_70.mem_error != 0u;
     }
     else
     {
-        _496 = alloc_start.failed;
+        _403 = alloc_start.failed;
     }
-    if (_496)
+    if (_403)
     {
         return;
     }
-    if (element_ix < _314.conf.n_elements)
+    if (element_ix < v_181.conf.n_elements)
     {
-        uint _509;
+        uint _416;
         if (th_ix > 0u)
         {
-            _509 = sh_tile_count[th_ix - 1u];
+            _416 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _509 = 0u;
+            _416 = 0u;
         }
-        uint tile_subix = _509;
-        Alloc param_5 = alloc_start.alloc;
-        uint param_6 = 8u * tile_subix;
-        uint param_7 = 8u * tile_count;
-        Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
+        uint tile_subix = _416;
+        Alloc param_2 = alloc_start.alloc;
+        uint param_3 = 8u * tile_subix;
+        uint param_4 = 8u * tile_count;
+        Alloc tiles_alloc = slice_mem(param_2, param_3, param_4);
         path.tiles = TileRef{ tiles_alloc.offset };
-        Alloc param_8;
-        param_8.offset = _314.conf.tile_alloc.offset;
-        PathRef param_9 = path_ref;
-        Path param_10 = path;
-        Path_write(param_8, param_9, param_10, v_92, v_92BufferSize);
+        Alloc param_5;
+        param_5.offset = v_181.conf.tile_alloc.offset;
+        PathRef param_6 = path_ref;
+        Path param_7 = path;
+        Path_write(param_5, param_6, param_7, v_70, v_70BufferSize);
     }
     uint total_count = sh_tile_count[255] * 2u;
     uint start_ix = alloc_start.alloc.offset >> uint(2);
     for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
     {
-        Alloc param_11 = alloc_start.alloc;
-        uint param_12 = start_ix + i_1;
-        uint param_13 = 0u;
-        write_mem(param_11, param_12, param_13, v_92, v_92BufferSize);
+        Alloc param_8 = alloc_start.alloc;
+        uint param_9 = start_ix + i_1;
+        uint param_10 = 0u;
+        write_mem(param_8, param_9, param_10, v_70, v_70BufferSize);
     }
 }
 
diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv
index cf2f01c..dbc02a8 100644
--- a/piet-gpu/shader/gen/tile_alloc.spv
+++ b/piet-gpu/shader/gen/tile_alloc.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index 8a3200b..f9f31e6 100644
--- a/piet-gpu/shader/gen/transform_leaf.dxil
+++ b/piet-gpu/shader/gen/transform_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl
index 3528d6c..8a3b3d5 100644
--- a/piet-gpu/shader/gen/transform_leaf.hlsl
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@@ -37,12 +37,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -50,6 +52,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
@@ -155,7 +159,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _285 = { _278.Load(76) + (ix * 24u) };
+    TransformRef _285 = { _278.Load(84) + (ix * 24u) };
     TransformRef ref = _285;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl
index 6a99fae..fe45438 100644
--- a/piet-gpu/shader/gen/transform_leaf.msl
+++ b/piet-gpu/shader/gen/transform_leaf.msl
@@ -100,12 +100,14 @@
     Alloc_1 pathseg_alloc;
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
-    Alloc_1 bbox_alloc;
+    Alloc_1 path_bbox_alloc;
     Alloc_1 drawmonoid_alloc;
     Alloc_1 clip_alloc;
     Alloc_1 clip_bic_alloc;
     Alloc_1 clip_stack_alloc;
     Alloc_1 clip_bbox_alloc;
+    Alloc_1 draw_bbox_alloc;
+    Alloc_1 drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -113,6 +115,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv
index b9a0a83..b739099 100644
--- a/piet-gpu/shader/gen/transform_leaf.spv
+++ b/piet-gpu/shader/gen/transform_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index 6c94a3a..978dd98 100644
--- a/piet-gpu/shader/gen/transform_reduce.dxil
+++ b/piet-gpu/shader/gen/transform_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl
index cce1a22..bd14f79 100644
--- a/piet-gpu/shader/gen/transform_reduce.hlsl
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@@ -26,12 +26,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -39,6 +41,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
@@ -92,7 +96,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _168 = { _161.Load(76) + (ix * 24u) };
+    TransformRef _168 = { _161.Load(84) + (ix * 24u) };
     TransformRef ref = _168;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl
index 3695563..62da531 100644
--- a/piet-gpu/shader/gen/transform_reduce.msl
+++ b/piet-gpu/shader/gen/transform_reduce.msl
@@ -38,12 +38,14 @@
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     Alloc drawmonoid_alloc;
     Alloc clip_alloc;
     Alloc clip_bic_alloc;
     Alloc clip_stack_alloc;
     Alloc clip_bbox_alloc;
+    Alloc draw_bbox_alloc;
+    Alloc drawinfo_alloc;
     uint n_trans;
     uint n_path;
     uint n_clip;
@@ -51,6 +53,8 @@
     uint linewidth_offset;
     uint pathtag_offset;
     uint pathseg_offset;
+    uint drawtag_offset;
+    uint drawdata_offset;
 };
 
 struct ConfigBuf
diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv
index e74cb8d..6aa6b94 100644
--- a/piet-gpu/shader/gen/transform_reduce.spv
+++ b/piet-gpu/shader/gen/transform_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/pathseg.comp b/piet-gpu/shader/pathseg.comp
index a2ea86e..ce4ab84 100644
--- a/piet-gpu/shader/pathseg.comp
+++ b/piet-gpu/shader/pathseg.comp
@@ -248,7 +248,7 @@
 
     barrier();
     uint path_ix = save_path_ix;
-    uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 6;
+    uint bbox_out_ix = (conf.path_bbox_alloc.offset >> 2) + path_ix * 6;
     // Write bboxes to paths; do atomic min/max if partial
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0) {
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 0dccecb..ec17188 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -42,7 +42,7 @@
     // new element pipeline stuff follows
 
     // Bounding boxes of paths, stored as int (so atomics work)
-    Alloc bbox_alloc;
+    Alloc path_bbox_alloc;
     // Monoid for draw objects
     Alloc drawmonoid_alloc;
 
@@ -54,6 +54,10 @@
     Alloc clip_stack_alloc;
     // Clip processing results (path_ix + bbox)
     Alloc clip_bbox_alloc;
+    // Bounding box per draw object
+    Alloc draw_bbox_alloc;
+    // Info computed in draw stage, per draw object
+    Alloc drawinfo_alloc;
 
     // Number of transforms in scene
     // This is probably not needed.
@@ -63,6 +67,10 @@
     uint n_path;
     // Total number of BeginClip and EndClip draw objects.
     uint n_clip;
+
+    // Note: one of these offsets *could* be hardcoded to zero (as was the
+    // original element stream), but for now retain flexibility.
+
     // Offset (in bytes) of transform stream in scene buffer
     uint trans_offset;
     // Offset (in bytes) of linewidth stream in scene
@@ -71,6 +79,10 @@
     uint pathtag_offset;
     // Offset (in bytes) of path segment stream in scene
     uint pathseg_offset;
+    // Offset (in bytes) of draw object tag stream in scene; see drawtag.h
+    uint drawtag_offset;
+    // Offset (in bytes) of draw payload stream in scene
+    uint drawdata_offset;
 };
 #endif
 
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
index 024f499..0fec2ce 100644
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -17,7 +17,11 @@
     Config conf;
 };
 
-#include "annotated.h"
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "drawtag.h"
 #include "tile.h"
 
 // scale factors useful for converting coordinates to tiles
@@ -27,31 +31,39 @@
 shared uint sh_tile_count[TILE_ALLOC_WG];
 shared MallocResult sh_tile_alloc;
 
+vec4 load_draw_bbox(uint draw_ix) { // Returns the bounding box stored for draw object `draw_ix`, as (x0, y0, x1, y1).
+    uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix; // `>> 2` turns the alloc offset (presumably bytes) into an index into memory[]; each draw object owns 4 consecutive words
+    float x0 = uintBitsToFloat(memory[base]); // words hold float bit patterns: reinterpret the bits, do not convert the integer value
+    float y0 = uintBitsToFloat(memory[base + 1]);
+    float x1 = uintBitsToFloat(memory[base + 2]);
+    float y1 = uintBitsToFloat(memory[base + 3]);
+    vec4 bbox = vec4(x0, y0, x1, y1); // component order matches the .x/.y/.z/.w reads in main()
+    return bbox;
+}
+
 void main() {
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
+    // At the moment, element_ix == path_ix. The clip-intersected bounding boxes
+    // for elements (draw objects) are computed in the binning stage, but at some
+    // point we'll probably want to break that correspondence. Tiles should be
+    // allocated for paths, not draw objs. EndClip doesn't need an allocation.
     PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+    uint drawtag_base = conf.drawtag_offset >> 2;
 
-    uint tag = Annotated_Nop;
+    uint drawtag = Drawtag_Nop;
     if (element_ix < conf.n_elements) {
-        tag = Annotated_tag(conf.anno_alloc, ref).tag;
+        drawtag = scene[drawtag_base + element_ix];
     }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    switch (tag) {
-    case Annotated_Color:
-    case Annotated_LinGradient:
-    case Annotated_Image:
-    case Annotated_BeginClip:
-    case Annotated_EndClip:
-        // Note: we take advantage of the fact that fills, strokes, and
-        // clips have compatible layout.
-        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
-        x0 = int(floor(clip.bbox.x * SX));
-        y0 = int(floor(clip.bbox.y * SY));
-        x1 = int(ceil(clip.bbox.z * SX));
-        y1 = int(ceil(clip.bbox.w * SY));
-        break;
+    // Allocate an empty path for EndClip; at some point we'll change
+    // this to be per path rather than per draw object.
+    if (drawtag != Drawtag_Nop && drawtag != Drawtag_EndClip) {
+        vec4 bbox = load_draw_bbox(element_ix);
+        x0 = int(floor(bbox.x * SX));
+        y0 = int(floor(bbox.y * SY));
+        x1 = int(ceil(bbox.z * SX));
+        y1 = int(ceil(bbox.w * SY));
     }
     x0 = clamp(x0, 0, int(conf.width_in_tiles));
     y0 = clamp(y0, 0, int(conf.height_in_tiles));
@@ -61,11 +73,6 @@
     Path path;
     path.bbox = uvec4(x0, y0, x1, y1);
     uint tile_count = (x1 - x0) * (y1 - y0);
-    if (tag == Annotated_EndClip) {
-        // Don't actually allocate tiles for an end clip, but we do want
-        // the path structure (especially bbox) allocated for it.
-        tile_count = 0;
-    }
 
     sh_tile_count[th_ix] = tile_count;
     uint total_tile_count = tile_count;
diff --git a/piet-gpu/src/blend.rs b/piet-gpu/src/blend.rs
index 6f1e791..aacf597 100644
--- a/piet-gpu/src/blend.rs
+++ b/piet-gpu/src/blend.rs
@@ -63,7 +63,10 @@
 
 impl Blend {
     pub fn new(mode: BlendMode, composition_mode: CompositionMode) -> Self {
-        Self { mode, composition_mode }
+        Self {
+            mode,
+            composition_mode,
+        }
     }
 
     pub(crate) fn pack(&self) -> u32 {
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index c24615e..62c59c4 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -30,7 +30,8 @@
     tag_stream: Vec<u8>,
     pathseg_stream: Vec<u8>,
     linewidth_stream: Vec<f32>,
-    drawobj_stream: Vec<u8>,
+    drawtag_stream: Vec<u32>,
+    drawdata_stream: Vec<u8>,
     n_path: u32,
     n_pathseg: u32,
     n_clip: u32,
@@ -43,53 +44,54 @@
 pub struct GlyphEncoder {
     tag_stream: Vec<u8>,
     pathseg_stream: Vec<u8>,
-    drawobj_stream: Vec<u8>,
+    drawtag_stream: Vec<u32>,
+    drawdata_stream: Vec<u8>,
     n_path: u32,
     n_pathseg: u32,
 }
 
-// Currently same as Element, but may change - should become packed.
-const DRAWOBJ_SIZE: usize = 36;
 const TRANSFORM_SIZE: usize = 24;
 const LINEWIDTH_SIZE: usize = 4;
 const PATHSEG_SIZE: usize = 52;
-const BBOX_SIZE: usize = 24;
-const DRAWMONOID_SIZE: usize = 8;
+const PATH_BBOX_SIZE: usize = 24;
+const DRAWMONOID_SIZE: usize = 16;
+const DRAW_BBOX_SIZE: usize = 16;
+const DRAWTAG_SIZE: usize = 4;
 const ANNOTATED_SIZE: usize = 40;
 
-// These are bytemuck versions of elements currently defined in the
-// Element struct in piet-gpu-types; that's pretty much going away.
-
-const ELEMENT_FILLCOLOR: u32 = 4;
-const ELEMENT_FILLLINGRADIENT: u32 = 5;
-const ELEMENT_BEGINCLIP: u32 = 9;
-const ELEMENT_ENDCLIP: u32 = 10;
+// Tags for draw objects. See shader/drawtag.h for the authoritative source.
+const DRAWTAG_FILLCOLOR: u32 = 0x44;
+const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
+const DRAWTAG_BEGINCLIP: u32 = 0x05;
+const DRAWTAG_ENDCLIP: u32 = 0x25;
 
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 pub struct FillColor {
-    tag: u32,
     rgba_color: u32,
-    padding: [u32; 7],
 }
 
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 pub struct FillLinGradient {
-    tag: u32,
     index: u32,
     p0: [f32; 2],
     p1: [f32; 2],
-    padding: [u32; 3],
+}
+
+#[allow(unused)] // NOTE(review): nothing in this part of the diff constructs FillImage yet; presumably why the lint is silenced (confirm)
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillImage { // Payload struct without a leading tag word, like FillColor / FillLinGradient / Clip after this change.
+    index: u32,
+    // [i16; 2] (presumably two i16 components packed into a single u32; confirm against the shader-side reader)
+    offset: u32,
 }
 
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 pub struct Clip {
-    tag: u32,
-    bbox: [f32; 4],
     blend: u32,
-    padding: [u32; 3],
 }
 
 impl Encoder {
@@ -99,7 +101,8 @@
             tag_stream: Vec::new(),
             pathseg_stream: Vec::new(),
             linewidth_stream: vec![-1.0],
-            drawobj_stream: Vec::new(),
+            drawtag_stream: Vec::new(),
+            drawdata_stream: Vec::new(),
             n_path: 0,
             n_pathseg: 0,
             n_clip: 0,
@@ -130,51 +133,36 @@
     ///
     /// This should be encoded after a path.
     pub fn fill_color(&mut self, rgba_color: u32) {
-        let element = FillColor {
-            tag: ELEMENT_FILLCOLOR,
-            rgba_color,
-            ..Default::default()
-        };
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawtag_stream.push(DRAWTAG_FILLCOLOR);
+        let element = FillColor { rgba_color };
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
     /// Encode a fill linear gradient draw object.
     ///
     /// This should be encoded after a path.
     pub fn fill_lin_gradient(&mut self, index: u32, p0: [f32; 2], p1: [f32; 2]) {
-        let element = FillLinGradient {
-            tag: ELEMENT_FILLLINGRADIENT,
-            index,
-            p0,
-            p1,
-            ..Default::default()
-        };
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawtag_stream.push(DRAWTAG_FILLLINGRADIENT);
+        let element = FillLinGradient { index, p0, p1 };
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
-    /// Start a clip and return a save point to be filled in later.
-    pub fn begin_clip(&mut self, blend: Option<Blend>) -> usize {
-        let saved = self.drawobj_stream.len();
+    /// Start a clip by encoding a begin clip draw object (default blend if `None`).
+    pub fn begin_clip(&mut self, blend: Option<Blend>) {
+        self.drawtag_stream.push(DRAWTAG_BEGINCLIP);
         let element = Clip {
-            tag: ELEMENT_BEGINCLIP,
             blend: blend.unwrap_or(Blend::default()).pack(),
-            ..Default::default()
         };
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
         self.n_clip += 1;
-        saved
     }
 
-    pub fn end_clip(&mut self, bbox: [f32; 4], blend: Option<Blend>, save_point: usize) {
+    pub fn end_clip(&mut self, blend: Option<Blend>) {
+        self.drawtag_stream.push(DRAWTAG_ENDCLIP);
         let element = Clip {
-            tag: ELEMENT_ENDCLIP,
-            bbox,
             blend: blend.unwrap_or(Blend::default()).pack(),
-            ..Default::default()
         };
-        self.drawobj_stream[save_point + 4..save_point + 20]
-            .clone_from_slice(bytemuck::bytes_of(&bbox));
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
         // This is a dummy path, and will go away with the new clip impl.
         self.tag_stream.push(0x10);
         self.n_path += 1;
@@ -187,9 +175,11 @@
     /// beginning of free memory.
     pub fn stage_config(&self) -> (Config, usize) {
         // Layout of scene buffer
+        let drawtag_offset = 0;
         let n_drawobj = self.n_drawobj();
         let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
-        let trans_offset = n_drawobj_padded * DRAWOBJ_SIZE;
+        let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
+        let trans_offset = drawdata_offset + self.drawdata_stream.len();
         let n_trans = self.transform_stream.len();
         let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
         let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
@@ -205,9 +195,9 @@
         alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
         let pathseg_alloc = alloc;
         alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
-        let bbox_alloc = alloc;
+        let path_bbox_alloc = alloc;
         let n_path = self.n_path as usize;
-        alloc += bbox_alloc + n_path * BBOX_SIZE;
+        alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
         let drawmonoid_alloc = alloc;
         alloc += n_drawobj_padded * DRAWMONOID_SIZE;
         let anno_alloc = alloc;
@@ -226,6 +216,12 @@
         let clip_bbox_alloc = alloc;
         const CLIP_BBOX_SIZE: usize = 16;
         alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
+        let draw_bbox_alloc = alloc;
+        alloc += n_drawobj * DRAW_BBOX_SIZE;
+        let drawinfo_alloc = alloc;
+        // TODO: worst-case size; could be accumulated during encoding or summed from drawtags
+        const MAX_DRAWINFO_SIZE: usize = 16;
+        alloc += n_drawobj * MAX_DRAWINFO_SIZE;
 
         let config = Config {
             n_elements: n_drawobj as u32,
@@ -233,12 +229,14 @@
             pathseg_alloc: pathseg_alloc as u32,
             anno_alloc: anno_alloc as u32,
             trans_alloc: trans_alloc as u32,
-            bbox_alloc: bbox_alloc as u32,
+            path_bbox_alloc: path_bbox_alloc as u32,
             drawmonoid_alloc: drawmonoid_alloc as u32,
             clip_alloc: clip_alloc as u32,
             clip_bic_alloc: clip_bic_alloc as u32,
             clip_stack_alloc: clip_stack_alloc as u32,
             clip_bbox_alloc: clip_bbox_alloc as u32,
+            draw_bbox_alloc: draw_bbox_alloc as u32,
+            drawinfo_alloc: drawinfo_alloc as u32,
             n_trans: n_trans as u32,
             n_path: self.n_path,
             n_clip: self.n_clip,
@@ -246,15 +244,18 @@
             linewidth_offset: linewidth_offset as u32,
             pathtag_offset: pathtag_offset as u32,
             pathseg_offset: pathseg_offset as u32,
+            drawtag_offset: drawtag_offset as u32,
+            drawdata_offset: drawdata_offset as u32,
             ..Default::default()
         };
         (config, alloc)
     }
 
     pub fn write_scene(&self, buf: &mut BufWrite) {
-        buf.extend_slice(&self.drawobj_stream);
-        let n_drawobj = self.drawobj_stream.len() / DRAWOBJ_SIZE;
-        buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWOBJ_SIZE);
+        buf.extend_slice(&self.drawtag_stream);
+        let n_drawobj = self.drawtag_stream.len();
+        buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE);
+        buf.extend_slice(&self.drawdata_stream);
         buf.extend_slice(&self.transform_stream);
         let n_trans = self.transform_stream.len();
         buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE);
@@ -265,9 +266,9 @@
         buf.extend_slice(&self.pathseg_stream);
     }
 
-    /// The number of elements in the draw object stream.
+    /// The number of draw objects, i.e. the number of tags in the draw tag stream.
     pub(crate) fn n_drawobj(&self) -> usize {
-        self.drawobj_stream.len() / DRAWOBJ_SIZE
+        self.drawtag_stream.len()
     }
 
     /// The number of paths.
@@ -296,7 +297,8 @@
     pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
         self.tag_stream.extend(&glyph.tag_stream);
         self.pathseg_stream.extend(&glyph.pathseg_stream);
-        self.drawobj_stream.extend(&glyph.drawobj_stream);
+        self.drawtag_stream.extend(&glyph.drawtag_stream);
+        self.drawdata_stream.extend(&glyph.drawdata_stream);
         self.n_path += glyph.n_path;
         self.n_pathseg += glyph.n_pathseg;
     }
@@ -325,15 +327,12 @@
     ///
     /// This should be encoded after a path.
     pub(crate) fn fill_color(&mut self, rgba_color: u32) {
-        let element = FillColor {
-            tag: ELEMENT_FILLCOLOR,
-            rgba_color,
-            ..Default::default()
-        };
-        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.drawtag_stream.push(DRAWTAG_FILLCOLOR);
+        let element = FillColor { rgba_color };
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
     pub(crate) fn is_color(&self) -> bool {
-        !self.drawobj_stream.is_empty()
+        !self.drawtag_stream.is_empty()
     }
 }
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index bd26d45..e12f824 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -92,7 +92,7 @@
     clip_binding: ClipBinding,
 
     tile_pipeline: Pipeline,
-    tile_ds: DescriptorSet,
+    tile_ds: Vec<DescriptorSet>,
 
     path_pipeline: Pipeline,
     path_ds: DescriptorSet,
@@ -105,7 +105,7 @@
     bin_ds: DescriptorSet,
 
     coarse_pipeline: Pipeline,
-    coarse_ds: DescriptorSet,
+    coarse_ds: Vec<DescriptorSet>,
 
     k4_pipeline: Pipeline,
     k4_ds: DescriptorSet,
@@ -176,10 +176,8 @@
         };
         let image_dev = session.create_image2d(width as u32, height as u32, image_format)?;
 
-        // Note: this must be updated when the config struct size changes.
         const CONFIG_BUFFER_SIZE: u64 = std::mem::size_of::<Config>() as u64;
         let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap();
-        // TODO: separate staging buffer (if needed)
         let config_bufs = (0..n_bufs)
             .map(|_| {
                 session
@@ -212,10 +210,23 @@
         let clip_binding = ClipBinding::new(session, &clip_code, &config_buf, &memory_buf_dev);
 
         let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc");
-        let tile_pipeline = session
-            .create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::BufReadOnly])?;
-        let tile_ds = session
-            .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?;
+        let tile_pipeline = session.create_compute_pipeline(
+            tile_alloc_code,
+            &[
+                BindType::Buffer,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+            ],
+        )?;
+        let tile_ds = scene_bufs
+            .iter()
+            .map(|scene_buf| {
+                session.create_simple_descriptor_set(
+                    &tile_pipeline,
+                    &[&memory_buf_dev, &config_buf, scene_buf],
+                )
+            })
+            .collect::<Result<Vec<_>, _>>()?;
 
         let path_alloc_code = include_shader!(session, "../shader/gen/path_coarse");
         let path_pipeline = session
@@ -243,11 +254,23 @@
             session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         let coarse_code = include_shader!(session, "../shader/gen/coarse");
-        let coarse_pipeline = session
-            .create_compute_pipeline(coarse_code, &[BindType::Buffer, BindType::BufReadOnly])?;
-        let coarse_ds = session
-            .create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf])?;
-
+        let coarse_pipeline = session.create_compute_pipeline(
+            coarse_code,
+            &[
+                BindType::Buffer,
+                BindType::BufReadOnly,
+                BindType::BufReadOnly,
+            ],
+        )?;
+        let coarse_ds = scene_bufs
+            .iter()
+            .map(|scene_buf| {
+                session.create_simple_descriptor_set(
+                    &coarse_pipeline,
+                    &[&memory_buf_dev, &config_buf, scene_buf],
+                )
+            })
+            .collect::<Result<Vec<_>, _>>()?;
         let bg_image = Self::make_test_bg_image(&session);
 
         const GRADIENT_BUF_SIZE: usize =
@@ -430,7 +453,7 @@
         cmd_buf.begin_debug_label("Tile allocation");
         cmd_buf.dispatch(
             &self.tile_pipeline,
-            &self.tile_ds,
+            &self.tile_ds[buf_ix],
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
@@ -462,7 +485,7 @@
         cmd_buf.begin_debug_label("Coarse raster");
         cmd_buf.dispatch(
             &self.coarse_pipeline,
-            &self.coarse_ds,
+            &self.coarse_ds[buf_ix],
             (
                 (self.width as u32 + 255) / 256,
                 (self.height as u32 + 255) / 256,
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 1fe1ce9..024dd2b 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -64,9 +64,6 @@
 }
 
 struct ClipElement {
-    /// Byte offset of BeginClip element in element vec, for bbox fixup.
-    save_point: usize,
-    bbox: Option<Rect>,
     blend: Option<Blend>,
 }
 
@@ -199,8 +196,6 @@
     fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
         self.encode_linewidth(width.abs() as f32);
         let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
-        // Note: the bbox contribution of stroke becomes more complicated with miter joins.
-        self.accumulate_bbox(|| shape.bounding_box() + Insets::uniform(width * 0.5));
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, false);
         self.encode_brush(&brush);
@@ -217,9 +212,6 @@
 
     fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
         let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
-        // Note: we might get a good speedup from using an approximate bounding box.
-        // Perhaps that should be added to kurbo.
-        self.accumulate_bbox(|| shape.bounding_box());
         let path = shape.path_elements(TOLERANCE);
         self.encode_linewidth(-1.0);
         self.encode_path(path, true);
@@ -232,15 +224,11 @@
         self.encode_linewidth(-1.0);
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
-        let save_point = self.new_encoder.begin_clip(None);
+        self.new_encoder.begin_clip(None);
         if self.clip_stack.len() >= MAX_BLEND_STACK {
             panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
         }
-        self.clip_stack.push(ClipElement {
-            bbox: None,
-            save_point,
-            blend: None,
-        });
+        self.clip_stack.push(ClipElement { blend: None });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
         }
@@ -340,16 +328,11 @@
         self.encode_linewidth(-1.0);
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
-        let save_point = self.new_encoder.begin_clip(Some(blend));
+        self.new_encoder.begin_clip(Some(blend));
         if self.clip_stack.len() >= MAX_BLEND_STACK {
             panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
         }
-        self.clip_stack.push(ClipElement {
-            bbox: None,
-            save_point,
-            blend: Some(blend),
-        });
-        self.accumulate_bbox(|| shape.bounding_box());
+        self.clip_stack.push(ClipElement { blend: Some(blend) });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
         }
@@ -406,37 +389,7 @@
 
     fn pop_clip(&mut self) {
         let tos = self.clip_stack.pop().unwrap();
-        let bbox = tos.bbox.unwrap_or_default();
-        let bbox_f32_4 = rect_to_f32_4(bbox);
-        self.new_encoder.end_clip(bbox_f32_4, tos.blend, tos.save_point);
-        if let Some(bbox) = tos.bbox {
-            self.union_bbox(bbox);
-        }
-    }
-
-    /// Accumulate a bbox.
-    ///
-    /// The bbox is given lazily as a closure, relative to the current transform.
-    /// It's lazy because we don't need to compute it unless we're inside a clip.
-    fn accumulate_bbox(&mut self, f: impl FnOnce() -> Rect) {
-        if !self.clip_stack.is_empty() {
-            let bbox = f();
-            let bbox = self.cur_transform.transform_rect_bbox(bbox);
-            self.union_bbox(bbox);
-        }
-    }
-
-    /// Accumulate an absolute bbox.
-    ///
-    /// The bbox is given already transformed into surface coordinates.
-    fn union_bbox(&mut self, bbox: Rect) {
-        if let Some(tos) = self.clip_stack.last_mut() {
-            tos.bbox = if let Some(old_bbox) = tos.bbox {
-                Some(old_bbox.union(bbox))
-            } else {
-                Some(bbox)
-            };
-        }
+        self.new_encoder.end_clip(tos.blend);
     }
 
     pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs
index e155c50..52b8bf1 100644
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@@ -47,12 +47,14 @@
     pub pathseg_alloc: u32,
     pub anno_alloc: u32,
     pub trans_alloc: u32,
-    pub bbox_alloc: u32,
+    pub path_bbox_alloc: u32,
     pub drawmonoid_alloc: u32,
     pub clip_alloc: u32,
     pub clip_bic_alloc: u32,
     pub clip_stack_alloc: u32,
     pub clip_bbox_alloc: u32,
+    pub draw_bbox_alloc: u32,
+    pub drawinfo_alloc: u32,
     pub n_trans: u32,
     pub n_path: u32,
     pub n_clip: u32,
@@ -60,6 +62,8 @@
     pub linewidth_offset: u32,
     pub pathtag_offset: u32,
     pub pathseg_offset: u32,
+    pub drawtag_offset: u32,
+    pub drawdata_offset: u32,
 }
 
 // The "element" stage combines a number of stages for parts of the pipeline.
diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs
index 5328a84..21312a4 100644
--- a/piet-gpu/src/stages/draw.rs
+++ b/piet-gpu/src/stages/draw.rs
@@ -28,6 +28,8 @@
 pub struct DrawMonoid {
     pub path_ix: u32,
     pub clip_ix: u32,
+    pub scene_offset: u32,
+    pub info_offset: u32,
 }
 
 const DRAW_WG: u64 = 256;
@@ -93,7 +95,7 @@
     pub unsafe fn new(session: &Session, code: &DrawCode) -> DrawStage {
         // We're limited to DRAW_PART_SIZE^2
         // Also note: size here allows padding
-        let root_buf_size = DRAW_PART_SIZE * 8;
+        let root_buf_size = DRAW_PART_SIZE * 16;
         let root_buf = session
             .create_buffer(root_buf_size, BufferUsage::STORAGE)
             .unwrap();
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index 118b727..ee5839d 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -2,7 +2,7 @@
 
 use rand::{Rng, RngCore};
 
-use crate::{PietGpuRenderContext, Blend, BlendMode, CompositionMode};
+use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext};
 use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
     Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
@@ -13,10 +13,7 @@
 const N_CIRCLES: usize = 0;
 
 pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend) {
-    rc.fill(
-        Rect::new(400., 400., 800., 800.),
-        &Color::rgb8(0, 0, 200),
-    );
+    rc.fill(Rect::new(400., 400., 800., 800.), &Color::rgb8(0, 0, 200));
     rc.save().unwrap();
     rc.blend(Rect::new(0., 0., 1000., 1000.), blend);
     rc.transform(Affine::translate(Vec2::new(600., 600.)) * Affine::rotate(0.01 * i as f64));
diff --git a/tests/src/clip.rs b/tests/src/clip.rs
index cfd8a35..4a38949 100644
--- a/tests/src/clip.rs
+++ b/tests/src/clip.rs
@@ -163,8 +163,7 @@
         let clip_bbox_alloc = clip_stack_alloc + 20 * n_clip;
         stages::Config {
             clip_alloc: clip_alloc as u32,
-            // TODO: this wants to be renamed to path_bbox_alloc
-            bbox_alloc: path_bbox_alloc as u32,
+            path_bbox_alloc: path_bbox_alloc as u32,
             drawmonoid_alloc: drawmonoid_alloc as u32,
             clip_bic_alloc: clip_bic_alloc as u32,
             clip_stack_alloc: clip_stack_alloc as u32,
@@ -194,7 +193,7 @@
         let clip_range = clip_bbox_start..(clip_bbox_start + n_clip * 16);
         let clip_result = bytemuck::cast_slice::<u8, [f32; 4]>(&buf[clip_range]);
         let draw_start = 8 + n_clip * 4 + n_path * 24;
-        let draw_range = draw_start..(draw_start + n_clip * 8);
+        let draw_range = draw_start..(draw_start + n_clip * 16);
         let draw_result = bytemuck::cast_slice::<u8, DrawMonoid>(&buf[draw_range]);
         let mut bbox_stack = Vec::new();
         let mut parent_stack = Vec::new();
diff --git a/tests/src/draw.rs b/tests/src/draw.rs
index 7b264d4..4372da4 100644
--- a/tests/src/draw.rs
+++ b/tests/src/draw.rs
@@ -17,20 +17,29 @@
 //! Tests for the piet-gpu draw object stage.
 
 use piet_gpu_hal::{BufWrite, BufferUsage};
-use rand::Rng;
+use rand::{seq::SliceRandom, Rng};
 
 use crate::{Config, Runner, TestResult};
 
 use piet_gpu::stages::{self, DrawCode, DrawMonoid, DrawStage};
 
-const ELEMENT_SIZE: usize = 36;
+const DRAWTAG_SIZE: usize = 4;
 const ANNOTATED_SIZE: usize = 40;
 
-const ELEMENT_FILLCOLOR: u32 = 4;
-const ELEMENT_FILLLINGRADIENT: u32 = 5;
-const ELEMENT_FILLIMAGE: u32 = 6;
-const ELEMENT_BEGINCLIP: u32 = 9;
-const ELEMENT_ENDCLIP: u32 = 10;
+// Test draw tags; fills differ from the encoder's 0x44/0x114 (NOTE(review): confirm vs shader/drawtag.h).
+const DRAWTAG_FILLCOLOR: u32 = 4;
+const DRAWTAG_FILLLINGRADIENT: u32 = 20;
+const DRAWTAG_FILLIMAGE: u32 = 8;
+const DRAWTAG_BEGINCLIP: u32 = 5;
+const DRAWTAG_ENDCLIP: u32 = 37;
+
+const TAGS: &[u32] = &[
+    DRAWTAG_FILLCOLOR,
+    DRAWTAG_FILLLINGRADIENT,
+    DRAWTAG_FILLIMAGE,
+    DRAWTAG_BEGINCLIP,
+    DRAWTAG_ENDCLIP,
+];
 
 struct DrawTestData {
     tags: Vec<u32>,
@@ -47,7 +56,7 @@
         .session
         .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE)
         .unwrap();
-    let scene_size = n_tag * ELEMENT_SIZE as u64;
+    let scene_size = n_tag * DRAWTAG_SIZE as u64;
     let scene_buf = runner
         .session
         .create_buffer_with(scene_size, |b| data.fill_scene(b), BufferUsage::STORAGE)
@@ -92,7 +101,7 @@
 impl DrawTestData {
     fn new(n: u64) -> DrawTestData {
         let mut rng = rand::thread_rng();
-        let tags = (0..n).map(|_| rng.gen_range(0, 12)).collect();
+        let tags = (0..n).map(|_| *TAGS.choose(&mut rng).unwrap()).collect();
         DrawTestData { tags }
     }
 
@@ -101,13 +110,14 @@
 
         // Layout of memory
         let drawmonoid_alloc = 0;
-        let anno_alloc = drawmonoid_alloc + 8 * n_tags;
+        let anno_alloc = drawmonoid_alloc + 16 * n_tags;
         let clip_alloc = anno_alloc + ANNOTATED_SIZE * n_tags;
         let stage_config = stages::Config {
             n_elements: n_tags as u32,
             anno_alloc: anno_alloc as u32,
             drawmonoid_alloc: drawmonoid_alloc as u32,
             clip_alloc: clip_alloc as u32,
+            drawtag_offset: 0,
             ..Default::default()
         };
         stage_config
@@ -116,37 +126,35 @@
     fn memory_size(&self) -> u64 {
         // Note: this overallocates the clip buf a bit - only needed for the
         // total number of begin_clip and end_clip tags.
-        (8 + self.tags.len() * (8 + 4 + ANNOTATED_SIZE)) as u64
+        (8 + self.tags.len() * (16 + 4 + ANNOTATED_SIZE)) as u64
     }
 
     fn fill_scene(&self, buf: &mut BufWrite) {
-        let mut element = [0u32; ELEMENT_SIZE / 4];
-        for tag in &self.tags {
-            element[0] = *tag;
-            buf.push(element);
-        }
+        buf.extend_slice(&self.tags);
     }
 
     fn verify(&self, buf: &[u8]) -> Option<String> {
-        let size = self.tags.len() * 8;
+        let size = self.tags.len() * 16;
         let actual = bytemuck::cast_slice::<u8, DrawMonoid>(&buf[8..8 + size]);
         let mut expected = DrawMonoid::default();
         for (i, (tag, actual)) in self.tags.iter().zip(actual).enumerate() {
             // Verify exclusive prefix sum.
             let (path_ix, clip_ix) = Self::reduce_tag(*tag);
             if *actual != expected {
+                println!("{:?} {:?}", actual, expected);
                 return Some(format!("draw mismatch at {}", i));
             }
             expected.path_ix += path_ix;
             expected.clip_ix += clip_ix;
+            expected.scene_offset += tag & 28;
         }
         None
     }
 
     fn reduce_tag(tag: u32) -> (u32, u32) {
         match tag {
-            ELEMENT_FILLCOLOR | ELEMENT_FILLLINGRADIENT | ELEMENT_FILLIMAGE => (1, 0),
-            ELEMENT_BEGINCLIP | ELEMENT_ENDCLIP => (1, 1),
+            DRAWTAG_FILLCOLOR | DRAWTAG_FILLLINGRADIENT | DRAWTAG_FILLIMAGE => (1, 0),
+            DRAWTAG_BEGINCLIP | DRAWTAG_ENDCLIP => (1, 1),
             // TODO: ENDCLIP will become (0, 1)
             _ => (0, 0),
         }
diff --git a/tests/src/path.rs b/tests/src/path.rs
index 6f1f61a..bf72c68 100644
--- a/tests/src/path.rs
+++ b/tests/src/path.rs
@@ -207,11 +207,11 @@
         // Layout of memory
         let trans_alloc = 0;
         let pathseg_alloc = trans_alloc + n_trans * 24;
-        let bbox_alloc = pathseg_alloc + self.n_pathseg * PATHSEG_SIZE;
+        let path_bbox_alloc = pathseg_alloc + self.n_pathseg * PATHSEG_SIZE;
         let stage_config = stages::Config {
             pathseg_alloc,
             trans_alloc,
-            bbox_alloc,
+            path_bbox_alloc,
             n_trans,
             n_path: self.n_path,
             pathtag_offset,