Make shaders cross-platform

Translate all piet-gpu shaders into DXIL and MSL; move generated files
into the shader/gen directory.
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index d544417..e4140cd 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -87,8 +87,8 @@
     // Prefix sum of sh_row_count
     for (uint i = 0; i < LG_BACKDROP_WG; i++) {
         barrier();
-        if (gl_LocalInvocationID.y == 0 && th_ix >= (1 << i)) {
-            row_count += sh_row_count[th_ix - (1 << i)];
+        if (gl_LocalInvocationID.y == 0 && th_ix >= (1u << i)) {
+            row_count += sh_row_count[th_ix - (1u << i)];
         }
         barrier();
         if (gl_LocalInvocationID.y == 0) {
@@ -102,7 +102,7 @@
         // Binary search to find element
         uint el_ix = 0;
         for (uint i = 0; i < LG_BACKDROP_WG; i++) {
-            uint probe = el_ix + ((BACKDROP_WG / 2) >> i);
+            uint probe = el_ix + (uint(BACKDROP_WG / 2) >> i);
             if (row >= sh_row_count[probe - 1]) {
                 el_ix = probe;
             }
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 497915c..6ed2140 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -14,7 +14,7 @@
   command = $glslang_validator $flags -V -o $out $in
 
 rule hlsl
-  command = $spirv_cross --hlsl $in --output $out
+  command = $spirv_cross --hlsl --shader-model 60 $in --output $out
 
 rule dxil
   command = $dxc -T cs_6_0 $in -Fo $out
@@ -22,23 +22,41 @@
 rule msl
   command = $spirv_cross --msl $in --output $out $msl_flags
 
+build gen/binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h
+build gen/binning.hlsl: hlsl gen/binning.spv
+build gen/binning.dxil: dxil gen/binning.hlsl
+build gen/binning.msl: msl gen/binning.spv
 
-build elements.spv: glsl elements.comp | scene.h state.h annotated.h
+build gen/tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
+build gen/tile_alloc.hlsl: hlsl gen/tile_alloc.spv
+build gen/tile_alloc.dxil: dxil gen/tile_alloc.hlsl
+build gen/tile_alloc.msl: msl gen/tile_alloc.spv
 
-build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h
+build gen/path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
+build gen/path_coarse.hlsl: hlsl gen/path_coarse.spv
+build gen/path_coarse.dxil: dxil gen/path_coarse.hlsl
+build gen/path_coarse.msl: msl gen/path_coarse.spv
 
-build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
+build gen/backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+build gen/backdrop.hlsl: hlsl gen/backdrop.spv
+build gen/backdrop.dxil: dxil gen/backdrop.hlsl
+build gen/backdrop.msl: msl gen/backdrop.spv
 
-build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
-
-build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
-
-build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+build gen/backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
   flags = -DBACKDROP_DIST_FACTOR=4
+build gen/backdrop_lg.hlsl: hlsl gen/backdrop_lg.spv
+build gen/backdrop_lg.dxil: dxil gen/backdrop_lg.hlsl
+build gen/backdrop_lg.msl: msl gen/backdrop_lg.spv
 
-build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
+build gen/coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
+build gen/coarse.hlsl: hlsl gen/coarse.spv
+build gen/coarse.dxil: dxil gen/coarse.hlsl
+build gen/coarse.msl: msl gen/coarse.spv
 
-build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
+build gen/kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
+build gen/kernel4.hlsl: hlsl gen/kernel4.spv
+build gen/kernel4.dxil: dxil gen/kernel4.hlsl
+build gen/kernel4.msl: msl gen/kernel4.spv
 
 # New element pipeline follows
 
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index b541893..31a64e4 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -172,8 +172,8 @@
                     }
                     barrier();
                     if (th_ix < N_PART_READ) {
-                        if (th_ix >= (1 << i)) {
-                            count += sh_part_count[th_ix - (1 << i)];
+                        if (th_ix >= (1u << i)) {
+                            count += sh_part_count[th_ix - (1u << i)];
                         }
                     }
                     barrier();
@@ -190,7 +190,7 @@
             if (ix >= wr_ix && ix < ready_ix && mem_ok) {
                 uint part_ix = 0;
                 for (uint i = 0; i < LG_N_PART_READ; i++) {
-                    uint probe = part_ix + ((N_PART_READ / 2) >> i);
+                    uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
                     if (ix >= sh_part_count[probe - 1]) {
                         part_ix = probe;
                     }
@@ -257,8 +257,8 @@
         sh_tile_count[th_ix] = tile_count;
         for (uint i = 0; i < LG_N_TILE; i++) {
             barrier();
-            if (th_ix >= (1 << i)) {
-                tile_count += sh_tile_count[th_ix - (1 << i)];
+            if (th_ix >= (1u << i)) {
+                tile_count += sh_tile_count[th_ix - (1u << i)];
             }
             barrier();
             sh_tile_count[th_ix] = tile_count;
@@ -269,7 +269,7 @@
             // Binary search to find element
             uint el_ix = 0;
             for (uint i = 0; i < LG_N_TILE; i++) {
-                uint probe = el_ix + ((N_TILE / 2) >> i);
+                uint probe = el_ix + (uint(N_TILE / 2) >> i);
                 if (ix >= sh_tile_count[probe - 1]) {
                     el_ix = probe;
                 }
@@ -292,7 +292,7 @@
             }
             if (include_tile) {
                 uint el_slice = el_ix / 32;
-                uint el_mask = 1 << (el_ix & 31);
+                uint el_mask = 1u << (el_ix & 31);
                 atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
             }
         }
@@ -372,7 +372,7 @@
                     if (tile.tile.offset == 0 && tile.backdrop == 0) {
                         clip_zero_depth = clip_depth + 1;
                     } else if (tile.tile.offset == 0 && clip_depth < 32) {
-                        clip_one_mask |= (1 << clip_depth);
+                        clip_one_mask |= (1u << clip_depth);
                     } else {
                         AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
                         if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
@@ -382,14 +382,14 @@
                         Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                         cmd_ref.offset += 4;
                         if (clip_depth < 32) {
-                            clip_one_mask &= ~(1 << clip_depth);
+                            clip_one_mask &= ~(1u << clip_depth);
                         }
                     }
                     clip_depth++;
                     break;
                 case Annotated_EndClip:
                     clip_depth--;
-                    if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
+                    if (clip_depth >= 32 || (clip_one_mask & (1u << clip_depth)) == 0) {
                         if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                             break;
                         }
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
deleted file mode 100644
index 6f33544..0000000
--- a/piet-gpu/shader/elements.comp
+++ /dev/null
@@ -1,467 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// The element processing stage, first in the pipeline.
-//
-// This stage is primarily about applying transforms and computing bounding
-// boxes. It is organized as a scan over the input elements, producing
-// annotated output elements.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-#include "mem.h"
-#include "setup.h"
-
-#define N_ROWS 4
-#define WG_SIZE 32
-#define LG_WG_SIZE 5
-#define PARTITION_SIZE (WG_SIZE * N_ROWS)
-
-layout(local_size_x = WG_SIZE, local_size_y = 1) in;
-
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
-    Config conf;
-};
-
-layout(set = 0, binding = 2) readonly buffer SceneBuf {
-    uint[] scene;
-};
-
-// It would be better to use the Vulkan memory model than
-// "volatile" but shooting for compatibility here rather
-// than doing things right.
-layout(set = 0, binding = 3) volatile buffer StateBuf {
-    uint part_counter;
-    uint[] state;
-};
-
-#include "scene.h"
-#include "state.h"
-#include "annotated.h"
-#include "pathseg.h"
-#include "tile.h"
-
-#define StateBuf_stride (4 + 2 * State_size)
-
-StateRef state_aggregate_ref(uint partition_ix) {
-    return StateRef(4 + partition_ix * StateBuf_stride);
-}
-
-StateRef state_prefix_ref(uint partition_ix) {
-    return StateRef(4 + partition_ix * StateBuf_stride + State_size);
-}
-
-uint state_flag_index(uint partition_ix) {
-    return partition_ix * (StateBuf_stride / 4);
-}
-
-// These correspond to X, A, P respectively in the prefix sum paper.
-#define FLAG_NOT_READY 0
-#define FLAG_AGGREGATE_READY 1
-#define FLAG_PREFIX_READY 2
-
-#define FLAG_SET_LINEWIDTH 1
-#define FLAG_SET_BBOX 2
-#define FLAG_RESET_BBOX 4
-#define FLAG_SET_FILL_MODE 8
-// Fill modes take up the next bit. Non-zero fill is 0, stroke is 1.
-#define LG_FILL_MODE 4
-#define FILL_MODE_BITS 1
-#define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE)
-
-// This is almost like a monoid (the interaction between transformation and
-// bounding boxes is approximate)
-State combine_state(State a, State b) {
-    State c;
-    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
-    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
-    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
-    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
-    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
-        c.bbox = a.bbox;
-    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
-        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
-    {
-        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
-        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
-    }
-    // It would be more concise to cast to matrix types; ah well.
-    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
-    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
-    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
-    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
-    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
-    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
-    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
-    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags;
-    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
-    uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags;
-    fill_mode &= FILL_MODE_MASK;
-    c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode;
-    c.path_count = a.path_count + b.path_count;
-    c.pathseg_count = a.pathseg_count + b.pathseg_count;
-    c.trans_count = a.trans_count + b.trans_count;
-    return c;
-}
-
-State map_element(ElementRef ref) {
-    // TODO: it would *probably* be more efficient to make the memory read patterns less
-    // divergent, though it would be more wasted memory.
-    uint tag = Element_tag(ref).tag;
-    State c;
-    c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
-    c.mat = vec4(1.0, 0.0, 0.0, 1.0);
-    c.translate = vec2(0.0, 0.0);
-    c.linewidth = 1.0; // TODO should be 0.0
-    c.flags = 0;
-    c.path_count = 0;
-    c.pathseg_count = 0;
-    c.trans_count = 0;
-    switch (tag) {
-    case Element_Line:
-        LineSeg line = Element_Line_read(ref);
-        c.bbox.xy = min(line.p0, line.p1);
-        c.bbox.zw = max(line.p0, line.p1);
-        c.pathseg_count = 1;
-        break;
-    case Element_Quad:
-        QuadSeg quad = Element_Quad_read(ref);
-        c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
-        c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
-        c.pathseg_count = 1;
-        break;
-    case Element_Cubic:
-        CubicSeg cubic = Element_Cubic_read(ref);
-        c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
-        c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
-        c.pathseg_count = 1;
-        break;
-    case Element_FillColor:
-    case Element_FillLinGradient:
-    case Element_FillImage:
-    case Element_BeginClip:
-        c.flags = FLAG_RESET_BBOX;
-        c.path_count = 1;
-        break;
-    case Element_EndClip:
-        c.path_count = 1;
-        break;
-    case Element_SetLineWidth:
-        SetLineWidth lw = Element_SetLineWidth_read(ref);
-        c.linewidth = lw.width;
-        c.flags = FLAG_SET_LINEWIDTH;
-        break;
-    case Element_Transform:
-        Transform t = Element_Transform_read(ref);
-        c.mat = t.mat;
-        c.translate = t.translate;
-        c.trans_count = 1;
-        break;
-    case Element_SetFillMode:
-        SetFillMode fm = Element_SetFillMode_read(ref);
-        c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE);
-        break;
-    }
-    return c;
-}
-
-// Get the bounding box of a circle transformed by the matrix into an ellipse.
-vec2 get_linewidth(State st) {
-    // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
-    return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
-}
-
-shared State sh_state[WG_SIZE];
-
-shared uint sh_part_ix;
-shared State sh_prefix;
-shared uint sh_flag;
-
-void main() {
-    State th_state[N_ROWS];
-    // Determine partition to process by atomic counter (described in Section
-    // 4.4 of prefix sum paper).
-    if (gl_LocalInvocationID.x == 0) {
-        sh_part_ix = atomicAdd(part_counter, 1);
-    }
-    barrier();
-    uint part_ix = sh_part_ix;
-
-    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
-    ElementRef ref = ElementRef(ix * Element_size);
-
-    th_state[0] = map_element(ref);
-    for (uint i = 1; i < N_ROWS; i++) {
-        // discussion question: would it be faster to load using more coherent patterns
-        // into thread memory? This is kinda strided.
-        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
-    }
-    State agg = th_state[N_ROWS - 1];
-    sh_state[gl_LocalInvocationID.x] = agg;
-    for (uint i = 0; i < LG_WG_SIZE; i++) {
-        barrier();
-        if (gl_LocalInvocationID.x >= (1 << i)) {
-            State other = sh_state[gl_LocalInvocationID.x - (1 << i)];
-            agg = combine_state(other, agg);
-        }
-        barrier();
-        sh_state[gl_LocalInvocationID.x] = agg;
-    }
-
-    State exclusive;
-    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
-    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
-    exclusive.translate = vec2(0.0, 0.0);
-    exclusive.linewidth = 1.0; //TODO should be 0.0
-    exclusive.flags = 0;
-    exclusive.path_count = 0;
-    exclusive.pathseg_count = 0;
-    exclusive.trans_count = 0;
-
-    // Publish aggregate for this partition
-    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-        State_write(state_aggregate_ref(part_ix), agg);
-        if (part_ix == 0) {
-            State_write(state_prefix_ref(part_ix), agg);
-        }
-    }
-    // Write flag with release semantics; this is done portably with a barrier.
-    memoryBarrierBuffer();
-    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-        uint flag = FLAG_AGGREGATE_READY;
-        if (part_ix == 0) {
-            flag = FLAG_PREFIX_READY;
-        }
-        state[state_flag_index(part_ix)] = flag;
-    }
-    if (part_ix != 0) {
-        // step 4 of paper: decoupled lookback
-        uint look_back_ix = part_ix - 1;
-
-        State their_agg;
-        uint their_ix = 0;
-        while (true) {
-            // Read flag with acquire semantics.
-            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                sh_flag = state[state_flag_index(look_back_ix)];
-            }
-            // The flag load is done only in the last thread. However, because the
-            // translation of memoryBarrierBuffer to Metal requires uniform control
-            // flow, we broadcast it to all threads.
-            memoryBarrierBuffer();
-            barrier();
-            uint flag = sh_flag;
-            barrier();
-
-            if (flag == FLAG_PREFIX_READY) {
-                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
-                    exclusive = combine_state(their_prefix, exclusive);
-                }
-                break;
-            } else if (flag == FLAG_AGGREGATE_READY) {
-                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                    their_agg = State_read(state_aggregate_ref(look_back_ix));
-                    exclusive = combine_state(their_agg, exclusive);
-                }
-                look_back_ix--;
-                their_ix = 0;
-                continue;
-            }
-            // else spin
-
-            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                // Unfortunately there's no guarantee of forward progress of other
-                // workgroups, so compute a bit of the aggregate before trying again.
-                // In the worst case, spinning stops when the aggregate is complete.
-                ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
-                State s = map_element(ref);
-                if (their_ix == 0) {
-                    their_agg = s;
-                } else {
-                    their_agg = combine_state(their_agg, s);
-                }
-                their_ix++;
-                if (their_ix == PARTITION_SIZE) {
-                    exclusive = combine_state(their_agg, exclusive);
-                    if (look_back_ix == 0) {
-                        sh_flag = FLAG_PREFIX_READY;
-                    } else {
-                        look_back_ix--;
-                        their_ix = 0;
-                    }
-                }
-            }
-            barrier();
-            flag = sh_flag;
-            barrier();
-            if (flag == FLAG_PREFIX_READY) {
-                break;
-            }
-        }
-        // step 5 of paper: compute inclusive prefix
-        if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-            State inclusive_prefix = combine_state(exclusive, agg);
-            sh_prefix = exclusive;
-            State_write(state_prefix_ref(part_ix), inclusive_prefix);
-        }
-        memoryBarrierBuffer();
-        if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-            state[state_flag_index(part_ix)] = FLAG_PREFIX_READY;
-        }
-    }
-    barrier();
-    if (part_ix != 0) {
-        exclusive = sh_prefix;
-    }
-
-    State row = exclusive;
-    if (gl_LocalInvocationID.x > 0) {
-        State other = sh_state[gl_LocalInvocationID.x - 1];
-        row = combine_state(row, other);
-    }
-    for (uint i = 0; i < N_ROWS; i++) {
-        State st = combine_state(row, th_state[i]);
-
-        // Here we read again from the original scene. There may be
-        // gains to be had from stashing in shared memory or possibly
-        // registers (though register pressure is an issue).
-        ElementRef this_ref = Element_index(ref, i);
-        ElementTag tag = Element_tag(this_ref);
-        uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE);
-        bool is_stroke = fill_mode == MODE_STROKE;
-        switch (tag.tag) {
-        case Element_Line:
-            LineSeg line = Element_Line_read(this_ref);
-            PathCubic path_cubic;
-            path_cubic.p0 = line.p0;
-            path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0);
-            path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0);
-            path_cubic.p3 = line.p1;
-            path_cubic.path_ix = st.path_count;
-            path_cubic.trans_ix = st.trans_count;
-            if (is_stroke) {
-                path_cubic.stroke = get_linewidth(st);
-            } else {
-                path_cubic.stroke = vec2(0.0);
-            }
-            PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
-            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
-            break;
-        case Element_Quad:
-            QuadSeg quad = Element_Quad_read(this_ref);
-            path_cubic.p0 = quad.p0;
-            path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0);
-            path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0);
-            path_cubic.p3 = quad.p2;
-            path_cubic.path_ix = st.path_count;
-            path_cubic.trans_ix = st.trans_count;
-            if (is_stroke) {
-                path_cubic.stroke = get_linewidth(st);
-            } else {
-                path_cubic.stroke = vec2(0.0);
-            }
-            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
-            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
-            break;
-        case Element_Cubic:
-            CubicSeg cubic = Element_Cubic_read(this_ref);
-            path_cubic.p0 = cubic.p0;
-            path_cubic.p1 = cubic.p1;
-            path_cubic.p2 = cubic.p2;
-            path_cubic.p3 = cubic.p3;
-            path_cubic.path_ix = st.path_count;
-            path_cubic.trans_ix = st.trans_count;
-            if (is_stroke) {
-                path_cubic.stroke = get_linewidth(st);
-            } else {
-                path_cubic.stroke = vec2(0.0);
-            }
-            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
-            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
-            break;
-        case Element_FillColor:
-            FillColor fill = Element_FillColor_read(this_ref);
-            AnnoColor anno_fill;
-            anno_fill.rgba_color = fill.rgba_color;
-            if (is_stroke) {
-                vec2 lw = get_linewidth(st);
-                anno_fill.bbox = st.bbox + vec4(-lw, lw);
-                anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            } else {
-                anno_fill.bbox = st.bbox;
-                anno_fill.linewidth = 0.0;
-            }
-            AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
-            break;
-        case Element_FillLinGradient:
-            FillLinGradient lin = Element_FillLinGradient_read(this_ref);
-            AnnoLinGradient anno_lin;
-            anno_lin.index = lin.index;
-            vec2 p0 = st.mat.xy * lin.p0.x + st.mat.zw * lin.p0.y + st.translate;
-            vec2 p1 = st.mat.xy * lin.p1.x + st.mat.zw * lin.p1.y + st.translate;
-            vec2 dxy = p1 - p0;
-            float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y);
-            float line_x = dxy.x * scale;
-            float line_y = dxy.y * scale;
-            anno_lin.line_x = line_x;
-            anno_lin.line_y = line_y;
-            anno_lin.line_c = -(p0.x * line_x + p0.y * line_y);
-            // TODO: consider consolidating bbox calculation
-            if (is_stroke) {
-                vec2 lw = get_linewidth(st);
-                anno_lin.bbox = st.bbox + vec4(-lw, lw);
-                anno_lin.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            } else {
-                anno_lin.bbox = st.bbox;
-                anno_lin.linewidth = 0.0;
-            }
-            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_LinGradient_write(conf.anno_alloc, out_ref, fill_mode, anno_lin);
-            break;
-        case Element_FillImage:
-            FillImage fill_img = Element_FillImage_read(this_ref);
-            AnnoImage anno_img;
-            anno_img.index = fill_img.index;
-            anno_img.offset = fill_img.offset;
-            if (is_stroke) {
-                vec2 lw = get_linewidth(st);
-                anno_img.bbox = st.bbox + vec4(-lw, lw);
-                anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            } else {
-                anno_img.bbox = st.bbox;
-                anno_img.linewidth = 0.0;
-            }
-            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
-            break;
-        case Element_BeginClip:
-            Clip begin_clip = Element_BeginClip_read(this_ref);
-            AnnoBeginClip anno_begin_clip;
-            // This is the absolute bbox, it's been transformed during encoding.
-            anno_begin_clip.bbox = begin_clip.bbox;
-            if (is_stroke) {
-                vec2 lw = get_linewidth(st);
-                anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            } else {
-                anno_begin_clip.linewidth = 0.0;
-            }
-            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
-            break;
-        case Element_EndClip:
-            Clip end_clip = Element_EndClip_read(this_ref);
-            // This bbox is expected to be the same as the begin one.
-            AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox);
-            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
-            break;
-        case Element_Transform:
-            TransformSeg transform = TransformSeg(st.mat, st.translate);
-            TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size);
-            TransformSeg_write(conf.trans_alloc, trans_ref, transform);
-            break;
-        }
-    }
-}
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
deleted file mode 100644
index f906dac..0000000
--- a/piet-gpu/shader/elements.spv
+++ /dev/null
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
new file mode 100644
index 0000000..4ebcb1c
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl
new file mode 100644
index 0000000..65b969d
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop.hlsl
@@ -0,0 +1,283 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
+
+RWByteAddressBuffer _79 : register(u0, space0);
+ByteAddressBuffer _186 : register(t1, space0);
+
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+static uint gl_LocalInvocationIndex;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+    uint gl_LocalInvocationIndex : SV_GroupIndex;
+};
+
+groupshared uint sh_row_width[256];
+groupshared Alloc sh_row_alloc[256];
+groupshared uint sh_row_count[256];
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = _79.Load(offset * 4 + 8);
+    return v;
+}
+
+AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1);
+    AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _121;
+}
+
+uint fill_mode_from_flags(uint flags)
+{
+    return flags & 1u;
+}
+
+Path Path_read(Alloc a, PathRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Path s;
+    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
+    TileRef _165 = { raw2 };
+    s.tiles = _165;
+    return s;
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _79.Store(offset * 4 + 8, val);
+}
+
+void comp_main()
+{
+    uint th_ix = gl_LocalInvocationIndex;
+    uint element_ix = gl_GlobalInvocationID.x;
+    AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) };
+    AnnotatedRef ref = _194;
+    uint row_count = 0u;
+    bool mem_ok = _79.Load(4) == 0u;
+    if (gl_LocalInvocationID.y == 0u)
+    {
+        if (element_ix < _186.Load(0))
+        {
+            Alloc _217;
+            _217.offset = _186.Load(32);
+            Alloc param;
+            param.offset = _217.offset;
+            AnnotatedRef param_1 = ref;
+            AnnotatedTag tag = Annotated_tag(param, param_1);
+            switch (tag.tag)
+            {
+                case 3u:
+                case 2u:
+                case 4u:
+                case 1u:
+                {
+                    uint param_2 = tag.flags;
+                    if (fill_mode_from_flags(param_2) != 0u)
+                    {
+                        break;
+                    }
+                    PathRef _243 = { _186.Load(16) + (element_ix * 12u) };
+                    PathRef path_ref = _243;
+                    Alloc _247;
+                    _247.offset = _186.Load(16);
+                    Alloc param_3;
+                    param_3.offset = _247.offset;
+                    PathRef param_4 = path_ref;
+                    Path path = Path_read(param_3, param_4);
+                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+                    row_count = path.bbox.w - path.bbox.y;
+                    bool _272 = row_count == 1u;
+                    bool _278;
+                    if (_272)
+                    {
+                        _278 = path.bbox.y > 0u;
+                    }
+                    else
+                    {
+                        _278 = _272;
+                    }
+                    if (_278)
+                    {
+                        row_count = 0u;
+                    }
+                    uint param_5 = path.tiles.offset;
+                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                    bool param_7 = mem_ok;
+                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
+                    sh_row_alloc[th_ix] = path_alloc;
+                    break;
+                }
+            }
+        }
+        sh_row_count[th_ix] = row_count;
+    }
+    for (uint i = 0u; i < 8u; i++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        bool _325 = gl_LocalInvocationID.y == 0u;
+        bool _332;
+        if (_325)
+        {
+            _332 = th_ix >= (1u << i);
+        }
+        else
+        {
+            _332 = _325;
+        }
+        if (_332)
+        {
+            row_count += sh_row_count[th_ix - (1u << i)];
+        }
+        GroupMemoryBarrierWithGroupSync();
+        if (gl_LocalInvocationID.y == 0u)
+        {
+            sh_row_count[th_ix] = row_count;
+        }
+    }
+    GroupMemoryBarrierWithGroupSync();
+    uint total_rows = sh_row_count[255];
+    uint _411;
+    for (uint row = th_ix; row < total_rows; row += 256u)
+    {
+        uint el_ix = 0u;
+        for (uint i_1 = 0u; i_1 < 8u; i_1++)
+        {
+            uint probe = el_ix + (128u >> i_1);
+            if (row >= sh_row_count[probe - 1u])
+            {
+                el_ix = probe;
+            }
+        }
+        uint width = sh_row_width[el_ix];
+        if ((width > 0u) && mem_ok)
+        {
+            Alloc tiles_alloc = sh_row_alloc[el_ix];
+            if (el_ix > 0u)
+            {
+                _411 = sh_row_count[el_ix - 1u];
+            }
+            else
+            {
+                _411 = 0u;
+            }
+            uint seq_ix = row - _411;
+            uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
+            Alloc param_8 = tiles_alloc;
+            uint param_9 = tile_el_ix;
+            uint sum = read_mem(param_8, param_9);
+            for (uint x = 1u; x < width; x++)
+            {
+                tile_el_ix += 2u;
+                Alloc param_10 = tiles_alloc;
+                uint param_11 = tile_el_ix;
+                sum += read_mem(param_10, param_11);
+                Alloc param_12 = tiles_alloc;
+                uint param_13 = tile_el_ix;
+                uint param_14 = sum;
+                write_mem(param_12, param_13, param_14);
+            }
+        }
+    }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl
new file mode 100644
index 0000000..7640ed0
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop.msl
@@ -0,0 +1,284 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = v_79.memory[offset];
+    return v;
+}
+
+static inline __attribute__((always_inline))
+AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1, v_79);
+    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+uint fill_mode_from_flags(thread const uint& flags)
+{
+    return flags & 1u;
+}
+
+static inline __attribute__((always_inline))
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_79);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_79);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_79);
+    Path s;
+    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
+    s.tiles = TileRef{ raw2 };
+    return s;
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_79.memory[offset] = val;
+}
+
+kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+    threadgroup uint sh_row_width[256];
+    threadgroup Alloc sh_row_alloc[256];
+    threadgroup uint sh_row_count[256];
+    uint th_ix = gl_LocalInvocationIndex;
+    uint element_ix = gl_GlobalInvocationID.x;
+    AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
+    uint row_count = 0u;
+    bool mem_ok = v_79.mem_error == 0u;
+    if (gl_LocalInvocationID.y == 0u)
+    {
+        if (element_ix < _186.conf.n_elements)
+        {
+            Alloc param;
+            param.offset = _186.conf.anno_alloc.offset;
+            AnnotatedRef param_1 = ref;
+            AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
+            switch (tag.tag)
+            {
+                case 3u:
+                case 2u:
+                case 4u:
+                case 1u:
+                {
+                    uint param_2 = tag.flags;
+                    if (fill_mode_from_flags(param_2) != 0u)
+                    {
+                        break;
+                    }
+                    PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
+                    Alloc param_3;
+                    param_3.offset = _186.conf.tile_alloc.offset;
+                    PathRef param_4 = path_ref;
+                    Path path = Path_read(param_3, param_4, v_79);
+                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+                    row_count = path.bbox.w - path.bbox.y;
+                    bool _272 = row_count == 1u;
+                    bool _278;
+                    if (_272)
+                    {
+                        _278 = path.bbox.y > 0u;
+                    }
+                    else
+                    {
+                        _278 = _272;
+                    }
+                    if (_278)
+                    {
+                        row_count = 0u;
+                    }
+                    uint param_5 = path.tiles.offset;
+                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                    bool param_7 = mem_ok;
+                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
+                    sh_row_alloc[th_ix] = path_alloc;
+                    break;
+                }
+            }
+        }
+        sh_row_count[th_ix] = row_count;
+    }
+    for (uint i = 0u; i < 8u; i++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        bool _325 = gl_LocalInvocationID.y == 0u;
+        bool _332;
+        if (_325)
+        {
+            _332 = th_ix >= (1u << i);
+        }
+        else
+        {
+            _332 = _325;
+        }
+        if (_332)
+        {
+            row_count += sh_row_count[th_ix - (1u << i)];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (gl_LocalInvocationID.y == 0u)
+        {
+            sh_row_count[th_ix] = row_count;
+        }
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    uint total_rows = sh_row_count[255];
+    uint _411;
+    for (uint row = th_ix; row < total_rows; row += 256u)
+    {
+        uint el_ix = 0u;
+        for (uint i_1 = 0u; i_1 < 8u; i_1++)
+        {
+            uint probe = el_ix + (128u >> i_1);
+            if (row >= sh_row_count[probe - 1u])
+            {
+                el_ix = probe;
+            }
+        }
+        uint width = sh_row_width[el_ix];
+        if ((width > 0u) && mem_ok)
+        {
+            Alloc tiles_alloc = sh_row_alloc[el_ix];
+            if (el_ix > 0u)
+            {
+                _411 = sh_row_count[el_ix - 1u];
+            }
+            else
+            {
+                _411 = 0u;
+            }
+            uint seq_ix = row - _411;
+            uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
+            Alloc param_8 = tiles_alloc;
+            uint param_9 = tile_el_ix;
+            uint sum = read_mem(param_8, param_9, v_79);
+            for (uint x = 1u; x < width; x++)
+            {
+                tile_el_ix += 2u;
+                Alloc param_10 = tiles_alloc;
+                uint param_11 = tile_el_ix;
+                sum += read_mem(param_10, param_11, v_79);
+                Alloc param_12 = tiles_alloc;
+                uint param_13 = tile_el_ix;
+                uint param_14 = sum;
+                write_mem(param_12, param_13, param_14, v_79);
+            }
+        }
+    }
+}
+
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv
similarity index 71%
rename from piet-gpu/shader/backdrop.spv
rename to piet-gpu/shader/gen/backdrop.spv
index a1ed332..f3a7824 100644
--- a/piet-gpu/shader/backdrop.spv
+++ b/piet-gpu/shader/gen/backdrop.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
new file mode 100644
index 0000000..e6b2f1a
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop_lg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl
new file mode 100644
index 0000000..57bb6d3
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop_lg.hlsl
@@ -0,0 +1,283 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u);
+
+RWByteAddressBuffer _79 : register(u0, space0);
+ByteAddressBuffer _186 : register(t1, space0);
+
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+static uint gl_LocalInvocationIndex;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+    uint gl_LocalInvocationIndex : SV_GroupIndex;
+};
+
+groupshared uint sh_row_width[256];
+groupshared Alloc sh_row_alloc[256];
+groupshared uint sh_row_count[256];
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = _79.Load(offset * 4 + 8);
+    return v;
+}
+
+AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1);
+    AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _121;
+}
+
+uint fill_mode_from_flags(uint flags)
+{
+    return flags & 1u;
+}
+
+Path Path_read(Alloc a, PathRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Path s;
+    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
+    TileRef _165 = { raw2 };
+    s.tiles = _165;
+    return s;
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _79.Store(offset * 4 + 8, val);
+}
+
+void comp_main()
+{
+    uint th_ix = gl_LocalInvocationIndex;
+    uint element_ix = gl_GlobalInvocationID.x;
+    AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) };
+    AnnotatedRef ref = _194;
+    uint row_count = 0u;
+    bool mem_ok = _79.Load(4) == 0u;
+    if (gl_LocalInvocationID.y == 0u)
+    {
+        if (element_ix < _186.Load(0))
+        {
+            Alloc _217;
+            _217.offset = _186.Load(32);
+            Alloc param;
+            param.offset = _217.offset;
+            AnnotatedRef param_1 = ref;
+            AnnotatedTag tag = Annotated_tag(param, param_1);
+            switch (tag.tag)
+            {
+                case 3u:
+                case 2u:
+                case 4u:
+                case 1u:
+                {
+                    uint param_2 = tag.flags;
+                    if (fill_mode_from_flags(param_2) != 0u)
+                    {
+                        break;
+                    }
+                    PathRef _243 = { _186.Load(16) + (element_ix * 12u) };
+                    PathRef path_ref = _243;
+                    Alloc _247;
+                    _247.offset = _186.Load(16);
+                    Alloc param_3;
+                    param_3.offset = _247.offset;
+                    PathRef param_4 = path_ref;
+                    Path path = Path_read(param_3, param_4);
+                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+                    row_count = path.bbox.w - path.bbox.y;
+                    bool _272 = row_count == 1u;
+                    bool _278;
+                    if (_272)
+                    {
+                        _278 = path.bbox.y > 0u;
+                    }
+                    else
+                    {
+                        _278 = _272;
+                    }
+                    if (_278)
+                    {
+                        row_count = 0u;
+                    }
+                    uint param_5 = path.tiles.offset;
+                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                    bool param_7 = mem_ok;
+                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
+                    sh_row_alloc[th_ix] = path_alloc;
+                    break;
+                }
+            }
+        }
+        sh_row_count[th_ix] = row_count;
+    }
+    for (uint i = 0u; i < 8u; i++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        bool _325 = gl_LocalInvocationID.y == 0u;
+        bool _332;
+        if (_325)
+        {
+            _332 = th_ix >= (1u << i);
+        }
+        else
+        {
+            _332 = _325;
+        }
+        if (_332)
+        {
+            row_count += sh_row_count[th_ix - (1u << i)];
+        }
+        GroupMemoryBarrierWithGroupSync();
+        if (gl_LocalInvocationID.y == 0u)
+        {
+            sh_row_count[th_ix] = row_count;
+        }
+    }
+    GroupMemoryBarrierWithGroupSync();
+    uint total_rows = sh_row_count[255];
+    uint _411;
+    for (uint row = th_ix; row < total_rows; row += 1024u)
+    {
+        uint el_ix = 0u;
+        for (uint i_1 = 0u; i_1 < 8u; i_1++)
+        {
+            uint probe = el_ix + (128u >> i_1);
+            if (row >= sh_row_count[probe - 1u])
+            {
+                el_ix = probe;
+            }
+        }
+        uint width = sh_row_width[el_ix];
+        if ((width > 0u) && mem_ok)
+        {
+            Alloc tiles_alloc = sh_row_alloc[el_ix];
+            if (el_ix > 0u)
+            {
+                _411 = sh_row_count[el_ix - 1u];
+            }
+            else
+            {
+                _411 = 0u;
+            }
+            uint seq_ix = row - _411;
+            uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
+            Alloc param_8 = tiles_alloc;
+            uint param_9 = tile_el_ix;
+            uint sum = read_mem(param_8, param_9);
+            for (uint x = 1u; x < width; x++)
+            {
+                tile_el_ix += 2u;
+                Alloc param_10 = tiles_alloc;
+                uint param_11 = tile_el_ix;
+                sum += read_mem(param_10, param_11);
+                Alloc param_12 = tiles_alloc;
+                uint param_13 = tile_el_ix;
+                uint param_14 = sum;
+                write_mem(param_12, param_13, param_14);
+            }
+        }
+    }
+}
+
+[numthreads(256, 4, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl
new file mode 100644
index 0000000..1c68980
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop_lg.msl
@@ -0,0 +1,284 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u);
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = v_79.memory[offset];
+    return v;
+}
+
+static inline __attribute__((always_inline))
+AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1, v_79);
+    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+uint fill_mode_from_flags(thread const uint& flags)
+{
+    return flags & 1u;
+}
+
+static inline __attribute__((always_inline))
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_79);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_79);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_79);
+    Path s;
+    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
+    s.tiles = TileRef{ raw2 };
+    return s;
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_79.memory[offset] = val;
+}
+
+kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+    threadgroup uint sh_row_width[256];
+    threadgroup Alloc sh_row_alloc[256];
+    threadgroup uint sh_row_count[256];
+    uint th_ix = gl_LocalInvocationIndex;
+    uint element_ix = gl_GlobalInvocationID.x;
+    AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
+    uint row_count = 0u;
+    bool mem_ok = v_79.mem_error == 0u;
+    if (gl_LocalInvocationID.y == 0u)
+    {
+        if (element_ix < _186.conf.n_elements)
+        {
+            Alloc param;
+            param.offset = _186.conf.anno_alloc.offset;
+            AnnotatedRef param_1 = ref;
+            AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
+            switch (tag.tag)
+            {
+                case 3u:
+                case 2u:
+                case 4u:
+                case 1u:
+                {
+                    uint param_2 = tag.flags;
+                    if (fill_mode_from_flags(param_2) != 0u)
+                    {
+                        break;
+                    }
+                    PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
+                    Alloc param_3;
+                    param_3.offset = _186.conf.tile_alloc.offset;
+                    PathRef param_4 = path_ref;
+                    Path path = Path_read(param_3, param_4, v_79);
+                    sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+                    row_count = path.bbox.w - path.bbox.y;
+                    bool _272 = row_count == 1u;
+                    bool _278;
+                    if (_272)
+                    {
+                        _278 = path.bbox.y > 0u;
+                    }
+                    else
+                    {
+                        _278 = _272;
+                    }
+                    if (_278)
+                    {
+                        row_count = 0u;
+                    }
+                    uint param_5 = path.tiles.offset;
+                    uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                    bool param_7 = mem_ok;
+                    Alloc path_alloc = new_alloc(param_5, param_6, param_7);
+                    sh_row_alloc[th_ix] = path_alloc;
+                    break;
+                }
+            }
+        }
+        sh_row_count[th_ix] = row_count;
+    }
+    for (uint i = 0u; i < 8u; i++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        bool _325 = gl_LocalInvocationID.y == 0u;
+        bool _332;
+        if (_325)
+        {
+            _332 = th_ix >= (1u << i);
+        }
+        else
+        {
+            _332 = _325;
+        }
+        if (_332)
+        {
+            row_count += sh_row_count[th_ix - (1u << i)];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (gl_LocalInvocationID.y == 0u)
+        {
+            sh_row_count[th_ix] = row_count;
+        }
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    uint total_rows = sh_row_count[255];
+    uint _411;
+    for (uint row = th_ix; row < total_rows; row += 1024u)
+    {
+        uint el_ix = 0u;
+        for (uint i_1 = 0u; i_1 < 8u; i_1++)
+        {
+            uint probe = el_ix + (128u >> i_1);
+            if (row >= sh_row_count[probe - 1u])
+            {
+                el_ix = probe;
+            }
+        }
+        uint width = sh_row_width[el_ix];
+        if ((width > 0u) && mem_ok)
+        {
+            Alloc tiles_alloc = sh_row_alloc[el_ix];
+            if (el_ix > 0u)
+            {
+                _411 = sh_row_count[el_ix - 1u];
+            }
+            else
+            {
+                _411 = 0u;
+            }
+            uint seq_ix = row - _411;
+            uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
+            Alloc param_8 = tiles_alloc;
+            uint param_9 = tile_el_ix;
+            uint sum = read_mem(param_8, param_9, v_79);
+            for (uint x = 1u; x < width; x++)
+            {
+                tile_el_ix += 2u;
+                Alloc param_10 = tiles_alloc;
+                uint param_11 = tile_el_ix;
+                sum += read_mem(param_10, param_11, v_79);
+                Alloc param_12 = tiles_alloc;
+                uint param_13 = tile_el_ix;
+                uint param_14 = sum;
+                write_mem(param_12, param_13, param_14, v_79);
+            }
+        }
+    }
+}
+
diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv
similarity index 70%
rename from piet-gpu/shader/backdrop_lg.spv
rename to piet-gpu/shader/gen/backdrop_lg.spv
index 457cb02..a77d46d 100644
--- a/piet-gpu/shader/backdrop_lg.spv
+++ b/piet-gpu/shader/gen/backdrop_lg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl
index 903a185..64b109f 100644
--- a/piet-gpu/shader/gen/bbox_clear.hlsl
+++ b/piet-gpu/shader/gen/bbox_clear.hlsl
@@ -27,8 +27,8 @@
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-ByteAddressBuffer _21 : register(t1);
-RWByteAddressBuffer _45 : register(u0);
+ByteAddressBuffer _21 : register(t1, space0);
+RWByteAddressBuffer _45 : register(u0, space0);
 
 static uint3 gl_GlobalInvocationID;
 struct SPIRV_Cross_Input
diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil
new file mode 100644
index 0000000..50034cc
--- /dev/null
+++ b/piet-gpu/shader/gen/binning.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl
new file mode 100644
index 0000000..2b0901e
--- /dev/null
+++ b/piet-gpu/shader/gen/binning.hlsl
@@ -0,0 +1,352 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct MallocResult
+{
+    Alloc alloc;
+    bool failed;
+};
+
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct BinInstanceRef
+{
+    uint offset;
+};
+
+struct BinInstance
+{
+    uint element_ix;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
+
+RWByteAddressBuffer _84 : register(u0, space0);
+ByteAddressBuffer _253 : register(t1, space0);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+};
+
+groupshared uint bitmaps[8][256];
+groupshared bool sh_alloc_failed;
+groupshared uint count[8][256];
+groupshared Alloc sh_chunk_alloc[256];
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = _84.Load(offset * 4 + 8);
+    return v;
+}
+
+AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1);
+    AnnotatedTag _221 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _221;
+}
+
+AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    AnnoEndClip s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    return s;
+}
+
+AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
+{
+    AnnoEndClipRef _228 = { ref.offset + 4u };
+    Alloc param = a;
+    AnnoEndClipRef param_1 = _228;
+    return AnnoEndClip_read(param, param_1);
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+MallocResult malloc(uint size)
+{
+    uint _90;
+    _84.InterlockedAdd(0, size, _90);
+    uint offset = _90;
+    uint _97;
+    _84.GetDimensions(_97);
+    _97 = (_97 - 8) / 4;
+    MallocResult r;
+    r.failed = (offset + size) > uint(int(_97) * 4);
+    uint param = offset;
+    uint param_1 = size;
+    bool param_2 = !r.failed;
+    r.alloc = new_alloc(param, param_1, param_2);
+    if (r.failed)
+    {
+        uint _119;
+        _84.InterlockedMax(4, 1u, _119);
+        return r;
+    }
+    return r;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _84.Store(offset * 4 + 8, val);
+}
+
+void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.element_ix;
+    write_mem(param, param_1, param_2);
+}
+
+void comp_main()
+{
+    uint my_n_elements = _253.Load(0);
+    uint my_partition = gl_WorkGroupID.x;
+    for (uint i = 0u; i < 8u; i++)
+    {
+        bitmaps[i][gl_LocalInvocationID.x] = 0u;
+    }
+    if (gl_LocalInvocationID.x == 0u)
+    {
+        sh_alloc_failed = false;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
+    AnnotatedRef _308 = { _253.Load(32) + (element_ix * 40u) };
+    AnnotatedRef ref = _308;
+    uint tag = 0u;
+    if (element_ix < my_n_elements)
+    {
+        Alloc _318;
+        _318.offset = _253.Load(32);
+        Alloc param;
+        param.offset = _318.offset;
+        AnnotatedRef param_1 = ref;
+        tag = Annotated_tag(param, param_1).tag;
+    }
+    int x0 = 0;
+    int y0 = 0;
+    int x1 = 0;
+    int y1 = 0;
+    switch (tag)
+    {
+        case 1u:
+        case 2u:
+        case 3u:
+        case 4u:
+        case 5u:
+        {
+            Alloc _336;
+            _336.offset = _253.Load(32);
+            Alloc param_2;
+            param_2.offset = _336.offset;
+            AnnotatedRef param_3 = ref;
+            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3);
+            x0 = int(floor(clip.bbox.x * 0.00390625f));
+            y0 = int(floor(clip.bbox.y * 0.00390625f));
+            x1 = int(ceil(clip.bbox.z * 0.00390625f));
+            y1 = int(ceil(clip.bbox.w * 0.00390625f));
+            break;
+        }
+    }
+    uint width_in_bins = ((_253.Load(8) + 16u) - 1u) / 16u;
+    uint height_in_bins = ((_253.Load(12) + 16u) - 1u) / 16u;
+    x0 = clamp(x0, 0, int(width_in_bins));
+    x1 = clamp(x1, x0, int(width_in_bins));
+    y0 = clamp(y0, 0, int(height_in_bins));
+    y1 = clamp(y1, y0, int(height_in_bins));
+    if (x0 == x1)
+    {
+        y1 = y0;
+    }
+    int x = x0;
+    int y = y0;
+    uint my_slice = gl_LocalInvocationID.x / 32u;
+    uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u));
+    while (y < y1)
+    {
+        uint _438;
+        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _438);
+        x++;
+        if (x == x1)
+        {
+            x = x0;
+            y++;
+        }
+    }
+    GroupMemoryBarrierWithGroupSync();
+    uint element_count = 0u;
+    for (uint i_1 = 0u; i_1 < 8u; i_1++)
+    {
+        element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x])));
+        count[i_1][gl_LocalInvocationID.x] = element_count;
+    }
+    uint param_4 = 0u;
+    uint param_5 = 0u;
+    bool param_6 = true;
+    Alloc chunk_alloc = new_alloc(param_4, param_5, param_6);
+    if (element_count != 0u)
+    {
+        uint param_7 = element_count * 4u;
+        MallocResult _488 = malloc(param_7);
+        MallocResult chunk = _488;
+        chunk_alloc = chunk.alloc;
+        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
+        if (chunk.failed)
+        {
+            sh_alloc_failed = true;
+        }
+    }
+    uint out_ix = (_253.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc _517;
+    _517.offset = _253.Load(20);
+    Alloc param_8;
+    param_8.offset = _517.offset;
+    uint param_9 = out_ix;
+    uint param_10 = element_count;
+    write_mem(param_8, param_9, param_10);
+    Alloc _529;
+    _529.offset = _253.Load(20);
+    Alloc param_11;
+    param_11.offset = _529.offset;
+    uint param_12 = out_ix + 1u;
+    uint param_13 = chunk_alloc.offset;
+    write_mem(param_11, param_12, param_13);
+    GroupMemoryBarrierWithGroupSync();
+    bool _544;
+    if (!sh_alloc_failed)
+    {
+        _544 = _84.Load(4) != 0u;
+    }
+    else
+    {
+        _544 = sh_alloc_failed;
+    }
+    if (_544)
+    {
+        return;
+    }
+    x = x0;
+    y = y0;
+    while (y < y1)
+    {
+        uint bin_ix = (uint(y) * width_in_bins) + uint(x);
+        uint out_mask = bitmaps[my_slice][bin_ix];
+        if ((out_mask & my_mask) != 0u)
+        {
+            uint idx = uint(int(countbits(out_mask & (my_mask - 1u))));
+            if (my_slice > 0u)
+            {
+                idx += count[my_slice - 1u][bin_ix];
+            }
+            Alloc out_alloc = sh_chunk_alloc[bin_ix];
+            uint out_offset = out_alloc.offset + (idx * 4u);
+            BinInstanceRef _606 = { out_offset };
+            BinInstance _608 = { element_ix };
+            Alloc param_14 = out_alloc;
+            BinInstanceRef param_15 = _606;
+            BinInstance param_16 = _608;
+            BinInstance_write(param_14, param_15, param_16);
+        }
+        x++;
+        if (x == x1)
+        {
+            x = x0;
+            y++;
+        }
+    }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl
new file mode 100644
index 0000000..f6e0505
--- /dev/null
+++ b/piet-gpu/shader/gen/binning.msl
@@ -0,0 +1,350 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct MallocResult
+{
+    Alloc alloc;
+    bool failed;
+};
+
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct BinInstanceRef
+{
+    uint offset;
+};
+
+struct BinInstance
+{
+    uint element_ix;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_84, constant uint& v_84BufferSize)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = v_84.memory[offset];
+    return v;
+}
+
+static inline __attribute__((always_inline))
+AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1, v_84, v_84BufferSize);
+    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_84, v_84BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_84, v_84BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_84, v_84BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_84, v_84BufferSize);
+    AnnoEndClip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
+{
+    Alloc param = a;
+    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
+    return AnnoEndClip_read(param, param_1, v_84, v_84BufferSize);
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+static inline __attribute__((always_inline))
+MallocResult malloc(thread const uint& size, device Memory& v_84, constant uint& v_84BufferSize)
+{
+    uint _90 = atomic_fetch_add_explicit((device atomic_uint*)&v_84.mem_offset, size, memory_order_relaxed);
+    uint offset = _90;
+    MallocResult r;
+    r.failed = (offset + size) > uint(int((v_84BufferSize - 8) / 4) * 4);
+    uint param = offset;
+    uint param_1 = size;
+    bool param_2 = !r.failed;
+    r.alloc = new_alloc(param, param_1, param_2);
+    if (r.failed)
+    {
+        uint _119 = atomic_fetch_max_explicit((device atomic_uint*)&v_84.mem_error, 1u, memory_order_relaxed);
+        return r;
+    }
+    return r;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_84, constant uint& v_84BufferSize)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_84.memory[offset] = val;
+}
+
+static inline __attribute__((always_inline))
+void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_84, constant uint& v_84BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.element_ix;
+    write_mem(param, param_1, param_2, v_84, v_84BufferSize);
+}
+
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_84 [[buffer(0)]], const device ConfigBuf& _253 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+    threadgroup uint bitmaps[8][256];
+    threadgroup short sh_alloc_failed;
+    threadgroup uint count[8][256];
+    threadgroup Alloc sh_chunk_alloc[256];
+    constant uint& v_84BufferSize = spvBufferSizeConstants[0];
+    uint my_n_elements = _253.conf.n_elements;
+    uint my_partition = gl_WorkGroupID.x;
+    for (uint i = 0u; i < 8u; i++)
+    {
+        bitmaps[i][gl_LocalInvocationID.x] = 0u;
+    }
+    if (gl_LocalInvocationID.x == 0u)
+    {
+        sh_alloc_failed = short(false);
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
+    AnnotatedRef ref = AnnotatedRef{ _253.conf.anno_alloc.offset + (element_ix * 40u) };
+    uint tag = 0u;
+    if (element_ix < my_n_elements)
+    {
+        Alloc param;
+        param.offset = _253.conf.anno_alloc.offset;
+        AnnotatedRef param_1 = ref;
+        tag = Annotated_tag(param, param_1, v_84, v_84BufferSize).tag;
+    }
+    int x0 = 0;
+    int y0 = 0;
+    int x1 = 0;
+    int y1 = 0;
+    switch (tag)
+    {
+        case 1u:
+        case 2u:
+        case 3u:
+        case 4u:
+        case 5u:
+        {
+            Alloc param_2;
+            param_2.offset = _253.conf.anno_alloc.offset;
+            AnnotatedRef param_3 = ref;
+            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_84, v_84BufferSize);
+            x0 = int(floor(clip.bbox.x * 0.00390625));
+            y0 = int(floor(clip.bbox.y * 0.00390625));
+            x1 = int(ceil(clip.bbox.z * 0.00390625));
+            y1 = int(ceil(clip.bbox.w * 0.00390625));
+            break;
+        }
+    }
+    uint width_in_bins = ((_253.conf.width_in_tiles + 16u) - 1u) / 16u;
+    uint height_in_bins = ((_253.conf.height_in_tiles + 16u) - 1u) / 16u;
+    x0 = clamp(x0, 0, int(width_in_bins));
+    x1 = clamp(x1, x0, int(width_in_bins));
+    y0 = clamp(y0, 0, int(height_in_bins));
+    y1 = clamp(y1, y0, int(height_in_bins));
+    if (x0 == x1)
+    {
+        y1 = y0;
+    }
+    int x = x0;
+    int y = y0;
+    uint my_slice = gl_LocalInvocationID.x / 32u;
+    uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u));
+    while (y < y1)
+    {
+        uint _438 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
+        x++;
+        if (x == x1)
+        {
+            x = x0;
+            y++;
+        }
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    uint element_count = 0u;
+    for (uint i_1 = 0u; i_1 < 8u; i_1++)
+    {
+        element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
+        count[i_1][gl_LocalInvocationID.x] = element_count;
+    }
+    uint param_4 = 0u;
+    uint param_5 = 0u;
+    bool param_6 = true;
+    Alloc chunk_alloc = new_alloc(param_4, param_5, param_6);
+    if (element_count != 0u)
+    {
+        uint param_7 = element_count * 4u;
+        MallocResult _488 = malloc(param_7, v_84, v_84BufferSize);
+        MallocResult chunk = _488;
+        chunk_alloc = chunk.alloc;
+        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
+        if (chunk.failed)
+        {
+            sh_alloc_failed = short(true);
+        }
+    }
+    uint out_ix = (_253.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc param_8;
+    param_8.offset = _253.conf.bin_alloc.offset;
+    uint param_9 = out_ix;
+    uint param_10 = element_count;
+    write_mem(param_8, param_9, param_10, v_84, v_84BufferSize);
+    Alloc param_11;
+    param_11.offset = _253.conf.bin_alloc.offset;
+    uint param_12 = out_ix + 1u;
+    uint param_13 = chunk_alloc.offset;
+    write_mem(param_11, param_12, param_13, v_84, v_84BufferSize);
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    bool _544;
+    if (!bool(sh_alloc_failed))
+    {
+        _544 = v_84.mem_error != 0u;
+    }
+    else
+    {
+        _544 = bool(sh_alloc_failed);
+    }
+    if (_544)
+    {
+        return;
+    }
+    x = x0;
+    y = y0;
+    while (y < y1)
+    {
+        uint bin_ix = (uint(y) * width_in_bins) + uint(x);
+        uint out_mask = bitmaps[my_slice][bin_ix];
+        if ((out_mask & my_mask) != 0u)
+        {
+            uint idx = uint(int(popcount(out_mask & (my_mask - 1u))));
+            if (my_slice > 0u)
+            {
+                idx += count[my_slice - 1u][bin_ix];
+            }
+            Alloc out_alloc = sh_chunk_alloc[bin_ix];
+            uint out_offset = out_alloc.offset + (idx * 4u);
+            Alloc param_14 = out_alloc;
+            BinInstanceRef param_15 = BinInstanceRef{ out_offset };
+            BinInstance param_16 = BinInstance{ element_ix };
+            BinInstance_write(param_14, param_15, param_16, v_84, v_84BufferSize);
+        }
+        x++;
+        if (x == x1)
+        {
+            x = x0;
+            y++;
+        }
+    }
+}
+
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/gen/binning.spv
similarity index 100%
rename from piet-gpu/shader/binning.spv
rename to piet-gpu/shader/gen/binning.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
new file mode 100644
index 0000000..16d47ce
--- /dev/null
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
new file mode 100644
index 0000000..bc96cea
--- /dev/null
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -0,0 +1,1386 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct MallocResult
+{
+    Alloc alloc;
+    bool failed;
+};
+
+struct AnnoImageRef
+{
+    uint offset;
+};
+
+struct AnnoImage
+{
+    float4 bbox;
+    float linewidth;
+    uint index;
+    int2 offset;
+};
+
+struct AnnoColorRef
+{
+    uint offset;
+};
+
+struct AnnoColor
+{
+    float4 bbox;
+    float linewidth;
+    uint rgba_color;
+};
+
+struct AnnoLinGradientRef
+{
+    uint offset;
+};
+
+struct AnnoLinGradient
+{
+    float4 bbox;
+    float linewidth;
+    uint index;
+    float line_x;
+    float line_y;
+    float line_c;
+};
+
+struct AnnoBeginClipRef
+{
+    uint offset;
+};
+
+struct AnnoBeginClip
+{
+    float4 bbox;
+    float linewidth;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct BinInstanceRef
+{
+    uint offset;
+};
+
+struct BinInstance
+{
+    uint element_ix;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct TileSegRef
+{
+    uint offset;
+};
+
+struct Tile
+{
+    TileSegRef tile;
+    int backdrop;
+};
+
+struct CmdStrokeRef
+{
+    uint offset;
+};
+
+struct CmdStroke
+{
+    uint tile_ref;
+    float half_width;
+};
+
+struct CmdFillRef
+{
+    uint offset;
+};
+
+struct CmdFill
+{
+    uint tile_ref;
+    int backdrop;
+};
+
+struct CmdColorRef
+{
+    uint offset;
+};
+
+struct CmdColor
+{
+    uint rgba_color;
+};
+
+struct CmdLinGradRef
+{
+    uint offset;
+};
+
+struct CmdLinGrad
+{
+    uint index;
+    float line_x;
+    float line_y;
+    float line_c;
+};
+
+struct CmdImageRef
+{
+    uint offset;
+};
+
+struct CmdImage
+{
+    uint index;
+    int2 offset;
+};
+
+struct CmdJumpRef
+{
+    uint offset;
+};
+
+struct CmdJump
+{
+    uint new_ref;
+};
+
+struct CmdRef
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
+
+RWByteAddressBuffer _296 : register(u0, space0);
+ByteAddressBuffer _1249 : register(t1, space0);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+};
+
+groupshared uint sh_bitmaps[8][256];
+groupshared Alloc sh_part_elements[256];
+groupshared uint sh_part_count[256];
+groupshared uint sh_elements[256];
+groupshared uint sh_tile_stride[256];
+groupshared uint sh_tile_width[256];
+groupshared uint sh_tile_x0[256];
+groupshared uint sh_tile_y0[256];
+groupshared uint sh_tile_base[256];
+groupshared uint sh_tile_count[256];
+
+Alloc slice_mem(Alloc a, uint offset, uint size)
+{
+    Alloc _373 = { a.offset + offset };
+    return _373;
+}
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = _296.Load(offset * 4 + 8);
+    return v;
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
+{
+    BinInstanceRef _754 = { ref.offset + (index * 4u) };
+    return _754;
+}
+
+BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    BinInstance s;
+    s.element_ix = raw0;
+    return s;
+}
+
+AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1);
+    AnnotatedTag _706 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _706;
+}
+
+Path Path_read(Alloc a, PathRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Path s;
+    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
+    TileRef _814 = { raw2 };
+    s.tiles = _814;
+    return s;
+}
+
+void write_tile_alloc(uint el_ix, Alloc a)
+{
+}
+
+Alloc read_tile_alloc(uint el_ix, bool mem_ok)
+{
+    uint _1135;
+    _296.GetDimensions(_1135);
+    _1135 = (_1135 - 8) / 4;
+    uint param = 0u;
+    uint param_1 = uint(int(_1135) * 4);
+    bool param_2 = mem_ok;
+    return new_alloc(param, param_1, param_2);
+}
+
+Tile Tile_read(Alloc a, TileRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    TileSegRef _839 = { raw0 };
+    Tile s;
+    s.tile = _839;
+    s.backdrop = int(raw1);
+    return s;
+}
+
+AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    AnnoColor s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.linewidth = asfloat(raw4);
+    s.rgba_color = raw5;
+    return s;
+}
+
+AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref)
+{
+    AnnoColorRef _712 = { ref.offset + 4u };
+    Alloc param = a;
+    AnnoColorRef param_1 = _712;
+    return AnnoColor_read(param, param_1);
+}
+
+MallocResult malloc(uint size)
+{
+    uint _302;
+    _296.InterlockedAdd(0, size, _302);
+    uint offset = _302;
+    uint _309;
+    _296.GetDimensions(_309);
+    _309 = (_309 - 8) / 4;
+    MallocResult r;
+    r.failed = (offset + size) > uint(int(_309) * 4);
+    uint param = offset;
+    uint param_1 = size;
+    bool param_2 = !r.failed;
+    r.alloc = new_alloc(param, param_1, param_2);
+    if (r.failed)
+    {
+        uint _331;
+        _296.InterlockedMax(4, 1u, _331);
+        return r;
+    }
+    return r;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _296.Store(offset * 4 + 8, val);
+}
+
+void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.new_ref;
+    write_mem(param, param_1, param_2);
+}
+
+void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 10u;
+    write_mem(param, param_1, param_2);
+    CmdJumpRef _1128 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdJumpRef param_4 = _1128;
+    CmdJump param_5 = s;
+    CmdJump_write(param_3, param_4, param_5);
+}
+
+bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit)
+{
+    if (cmd_ref.offset < cmd_limit)
+    {
+        return true;
+    }
+    uint param = 1024u;
+    MallocResult _1156 = malloc(param);
+    MallocResult new_cmd = _1156;
+    if (new_cmd.failed)
+    {
+        return false;
+    }
+    CmdJump _1166 = { new_cmd.alloc.offset };
+    CmdJump jump = _1166;
+    Alloc param_1 = cmd_alloc;
+    CmdRef param_2 = cmd_ref;
+    CmdJump param_3 = jump;
+    Cmd_Jump_write(param_1, param_2, param_3);
+    cmd_alloc = new_cmd.alloc;
+    CmdRef _1178 = { cmd_alloc.offset };
+    cmd_ref = _1178;
+    cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
+    return true;
+}
+
+uint fill_mode_from_flags(uint flags)
+{
+    return flags & 1u;
+}
+
+void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.tile_ref;
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = uint(s.backdrop);
+    write_mem(param_3, param_4, param_5);
+}
+
+void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 1u;
+    write_mem(param, param_1, param_2);
+    CmdFillRef _1012 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdFillRef param_4 = _1012;
+    CmdFill param_5 = s;
+    CmdFill_write(param_3, param_4, param_5);
+}
+
+void Cmd_Solid_write(Alloc a, CmdRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 3u;
+    write_mem(param, param_1, param_2);
+}
+
+void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.tile_ref;
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.half_width);
+    write_mem(param_3, param_4, param_5);
+}
+
+void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 2u;
+    write_mem(param, param_1, param_2);
+    CmdStrokeRef _1030 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdStrokeRef param_4 = _1030;
+    CmdStroke param_5 = s;
+    CmdStroke_write(param_3, param_4, param_5);
+}
+
+void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth)
+{
+    uint param = flags;
+    if (fill_mode_from_flags(param) == 0u)
+    {
+        if (tile.tile.offset != 0u)
+        {
+            CmdFill _1202 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _1202;
+            Alloc param_1 = alloc;
+            CmdRef param_2 = cmd_ref;
+            CmdFill param_3 = cmd_fill;
+            Cmd_Fill_write(param_1, param_2, param_3);
+            cmd_ref.offset += 12u;
+        }
+        else
+        {
+            Alloc param_4 = alloc;
+            CmdRef param_5 = cmd_ref;
+            Cmd_Solid_write(param_4, param_5);
+            cmd_ref.offset += 4u;
+        }
+    }
+    else
+    {
+        CmdStroke _1232 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _1232;
+        Alloc param_6 = alloc;
+        CmdRef param_7 = cmd_ref;
+        CmdStroke param_8 = cmd_stroke;
+        Cmd_Stroke_write(param_6, param_7, param_8);
+        cmd_ref.offset += 12u;
+    }
+}
+
+void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.rgba_color;
+    write_mem(param, param_1, param_2);
+}
+
+void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 5u;
+    write_mem(param, param_1, param_2);
+    CmdColorRef _1056 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdColorRef param_4 = _1056;
+    CmdColor param_5 = s;
+    CmdColor_write(param_3, param_4, param_5);
+}
+
+AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17);
+    AnnoLinGradient s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.linewidth = asfloat(raw4);
+    s.index = raw5;
+    s.line_x = asfloat(raw6);
+    s.line_y = asfloat(raw7);
+    s.line_c = asfloat(raw8);
+    return s;
+}
+
+AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref)
+{
+    AnnoLinGradientRef _722 = { ref.offset + 4u };
+    Alloc param = a;
+    AnnoLinGradientRef param_1 = _722;
+    return AnnoLinGradient_read(param, param_1);
+}
+
+void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.line_x);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.line_y);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.line_c);
+    write_mem(param_9, param_10, param_11);
+}
+
+void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 6u;
+    write_mem(param, param_1, param_2);
+    CmdLinGradRef _1074 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdLinGradRef param_4 = _1074;
+    CmdLinGrad param_5 = s;
+    CmdLinGrad_write(param_3, param_4, param_5);
+}
+
+AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    AnnoImage s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.linewidth = asfloat(raw4);
+    s.index = raw5;
+    s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16);
+    return s;
+}
+
+AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref)
+{
+    AnnoImageRef _732 = { ref.offset + 4u };
+    Alloc param = a;
+    AnnoImageRef param_1 = _732;
+    return AnnoImage_read(param, param_1);
+}
+
+void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
+    write_mem(param_3, param_4, param_5);
+}
+
+void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 7u;
+    write_mem(param, param_1, param_2);
+    CmdImageRef _1092 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdImageRef param_4 = _1092;
+    CmdImage param_5 = s;
+    CmdImage_write(param_3, param_4, param_5);
+}
+
+AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    AnnoBeginClip s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.linewidth = asfloat(raw4);
+    return s;
+}
+
+AnnoBeginClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref)
+{
+    AnnoBeginClipRef _742 = { ref.offset + 4u };
+    Alloc param = a;
+    AnnoBeginClipRef param_1 = _742;
+    return AnnoBeginClip_read(param, param_1);
+}
+
+void Cmd_BeginClip_write(Alloc a, CmdRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 8u;
+    write_mem(param, param_1, param_2);
+}
+
+void Cmd_EndClip_write(Alloc a, CmdRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 9u;
+    write_mem(param, param_1, param_2);
+}
+
+void Cmd_End_write(Alloc a, CmdRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 0u;
+    write_mem(param, param_1, param_2);
+}
+
+void comp_main()
+{
+    uint width_in_bins = ((_1249.Load(8) + 16u) - 1u) / 16u;
+    uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
+    uint partition_ix = 0u;
+    uint n_partitions = ((_1249.Load(0) + 256u) - 1u) / 256u;
+    uint th_ix = gl_LocalInvocationID.x;
+    uint bin_tile_x = 16u * gl_WorkGroupID.x;
+    uint bin_tile_y = 16u * gl_WorkGroupID.y;
+    uint tile_x = gl_LocalInvocationID.x % 16u;
+    uint tile_y = gl_LocalInvocationID.x / 16u;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _1314;
+    _1314.offset = _1249.Load(24);
+    Alloc param;
+    param.offset = _1314.offset;
+    uint param_1 = this_tile_ix * 1024u;
+    uint param_2 = 1024u;
+    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+    CmdRef _1323 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1323;
+    uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
+    uint clip_depth = 0u;
+    uint clip_zero_depth = 0u;
+    uint clip_one_mask = 0u;
+    uint rd_ix = 0u;
+    uint wr_ix = 0u;
+    uint part_start_ix = 0u;
+    uint ready_ix = 0u;
+    bool mem_ok = _296.Load(4) == 0u;
+    Alloc param_3;
+    Alloc param_5;
+    uint _1529;
+    uint element_ix;
+    AnnotatedRef ref;
+    Alloc param_14;
+    Alloc param_16;
+    uint tile_count;
+    Alloc param_23;
+    uint _1841;
+    Alloc param_29;
+    Tile tile_1;
+    AnnoColor fill;
+    Alloc param_35;
+    Alloc param_52;
+    CmdLinGrad cmd_lin;
+    Alloc param_69;
+    Alloc param_86;
+    while (true)
+    {
+        for (uint i = 0u; i < 8u; i++)
+        {
+            sh_bitmaps[i][th_ix] = 0u;
+        }
+        bool _1581;
+        for (;;)
+        {
+            if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
+            {
+                part_start_ix = ready_ix;
+                uint count = 0u;
+                bool _1379 = th_ix < 256u;
+                bool _1387;
+                if (_1379)
+                {
+                    _1387 = (partition_ix + th_ix) < n_partitions;
+                }
+                else
+                {
+                    _1387 = _1379;
+                }
+                if (_1387)
+                {
+                    uint in_ix = (_1249.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1404;
+                    _1404.offset = _1249.Load(20);
+                    param_3.offset = _1404.offset;
+                    uint param_4 = in_ix;
+                    count = read_mem(param_3, param_4);
+                    Alloc _1415;
+                    _1415.offset = _1249.Load(20);
+                    param_5.offset = _1415.offset;
+                    uint param_6 = in_ix + 1u;
+                    uint offset = read_mem(param_5, param_6);
+                    uint param_7 = offset;
+                    uint param_8 = count * 4u;
+                    bool param_9 = mem_ok;
+                    sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9);
+                }
+                for (uint i_1 = 0u; i_1 < 8u; i_1++)
+                {
+                    if (th_ix < 256u)
+                    {
+                        sh_part_count[th_ix] = count;
+                    }
+                    GroupMemoryBarrierWithGroupSync();
+                    if (th_ix < 256u)
+                    {
+                        if (th_ix >= (1u << i_1))
+                        {
+                            count += sh_part_count[th_ix - (1u << i_1)];
+                        }
+                    }
+                    GroupMemoryBarrierWithGroupSync();
+                }
+                if (th_ix < 256u)
+                {
+                    sh_part_count[th_ix] = part_start_ix + count;
+                }
+                GroupMemoryBarrierWithGroupSync();
+                ready_ix = sh_part_count[255];
+                partition_ix += 256u;
+            }
+            uint ix = rd_ix + th_ix;
+            if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok)
+            {
+                uint part_ix = 0u;
+                for (uint i_2 = 0u; i_2 < 8u; i_2++)
+                {
+                    uint probe = part_ix + (128u >> i_2);
+                    if (ix >= sh_part_count[probe - 1u])
+                    {
+                        part_ix = probe;
+                    }
+                }
+                if (part_ix > 0u)
+                {
+                    _1529 = sh_part_count[part_ix - 1u];
+                }
+                else
+                {
+                    _1529 = part_start_ix;
+                }
+                ix -= _1529;
+                Alloc bin_alloc = sh_part_elements[part_ix];
+                BinInstanceRef _1548 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1548;
+                BinInstanceRef param_10 = inst_ref;
+                uint param_11 = ix;
+                Alloc param_12 = bin_alloc;
+                BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
+                BinInstance inst = BinInstance_read(param_12, param_13);
+                sh_elements[th_ix] = inst.element_ix;
+            }
+            GroupMemoryBarrierWithGroupSync();
+            wr_ix = min((rd_ix + 256u), ready_ix);
+            bool _1571 = (wr_ix - rd_ix) < 256u;
+            if (_1571)
+            {
+                _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+            }
+            else
+            {
+                _1581 = _1571;
+            }
+            if (_1581)
+            {
+                continue;
+            }
+            else
+            {
+                break;
+            }
+        }
+        uint tag = 0u;
+        if ((th_ix + rd_ix) < wr_ix)
+        {
+            element_ix = sh_elements[th_ix];
+            AnnotatedRef _1602 = { _1249.Load(32) + (element_ix * 40u) };
+            ref = _1602;
+            Alloc _1605;
+            _1605.offset = _1249.Load(32);
+            param_14.offset = _1605.offset;
+            AnnotatedRef param_15 = ref;
+            tag = Annotated_tag(param_14, param_15).tag;
+        }
+        switch (tag)
+        {
+            case 1u:
+            case 3u:
+            case 2u:
+            case 4u:
+            case 5u:
+            {
+                uint path_ix = element_ix;
+                PathRef _1624 = { _1249.Load(16) + (path_ix * 12u) };
+                Alloc _1627;
+                _1627.offset = _1249.Load(16);
+                param_16.offset = _1627.offset;
+                PathRef param_17 = _1624;
+                Path path = Path_read(param_16, param_17);
+                uint stride = path.bbox.z - path.bbox.x;
+                sh_tile_stride[th_ix] = stride;
+                int dx = int(path.bbox.x) - int(bin_tile_x);
+                int dy = int(path.bbox.y) - int(bin_tile_y);
+                int x0 = clamp(dx, 0, 16);
+                int y0 = clamp(dy, 0, 16);
+                int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16);
+                int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16);
+                sh_tile_width[th_ix] = uint(x1 - x0);
+                sh_tile_x0[th_ix] = uint(x0);
+                sh_tile_y0[th_ix] = uint(y0);
+                tile_count = uint(x1 - x0) * uint(y1 - y0);
+                uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
+                sh_tile_base[th_ix] = base;
+                uint param_18 = path.tiles.offset;
+                uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_20 = mem_ok;
+                Alloc path_alloc = new_alloc(param_18, param_19, param_20);
+                uint param_21 = th_ix;
+                Alloc param_22 = path_alloc;
+                write_tile_alloc(param_21, param_22);
+                break;
+            }
+            default:
+            {
+                tile_count = 0u;
+                break;
+            }
+        }
+        sh_tile_count[th_ix] = tile_count;
+        for (uint i_3 = 0u; i_3 < 8u; i_3++)
+        {
+            GroupMemoryBarrierWithGroupSync();
+            if (th_ix >= (1u << i_3))
+            {
+                tile_count += sh_tile_count[th_ix - (1u << i_3)];
+            }
+            GroupMemoryBarrierWithGroupSync();
+            sh_tile_count[th_ix] = tile_count;
+        }
+        GroupMemoryBarrierWithGroupSync();
+        uint total_tile_count = sh_tile_count[255];
+        for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u)
+        {
+            uint el_ix = 0u;
+            for (uint i_4 = 0u; i_4 < 8u; i_4++)
+            {
+                uint probe_1 = el_ix + (128u >> i_4);
+                if (ix_1 >= sh_tile_count[probe_1 - 1u])
+                {
+                    el_ix = probe_1;
+                }
+            }
+            AnnotatedRef _1826 = { _1249.Load(32) + (sh_elements[el_ix] * 40u) };
+            AnnotatedRef ref_1 = _1826;
+            Alloc _1830;
+            _1830.offset = _1249.Load(32);
+            param_23.offset = _1830.offset;
+            AnnotatedRef param_24 = ref_1;
+            uint tag_1 = Annotated_tag(param_23, param_24).tag;
+            if (el_ix > 0u)
+            {
+                _1841 = sh_tile_count[el_ix - 1u];
+            }
+            else
+            {
+                _1841 = 0u;
+            }
+            uint seq_ix = ix_1 - _1841;
+            uint width = sh_tile_width[el_ix];
+            uint x = sh_tile_x0[el_ix] + (seq_ix % width);
+            uint y = sh_tile_y0[el_ix] + (seq_ix / width);
+            bool include_tile = false;
+            if ((tag_1 == 4u) || (tag_1 == 5u))
+            {
+                include_tile = true;
+            }
+            else
+            {
+                if (mem_ok)
+                {
+                    uint param_25 = el_ix;
+                    bool param_26 = mem_ok;
+                    TileRef _1901 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                    Alloc param_27 = read_tile_alloc(param_25, param_26);
+                    TileRef param_28 = _1901;
+                    Tile tile = Tile_read(param_27, param_28);
+                    bool _1907 = tile.tile.offset != 0u;
+                    bool _1914;
+                    if (!_1907)
+                    {
+                        _1914 = tile.backdrop != 0;
+                    }
+                    else
+                    {
+                        _1914 = _1907;
+                    }
+                    include_tile = _1914;
+                }
+            }
+            if (include_tile)
+            {
+                uint el_slice = el_ix / 32u;
+                uint el_mask = 1u << (el_ix & 31u);
+                uint _1934;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1934);
+            }
+        }
+        GroupMemoryBarrierWithGroupSync();
+        uint slice_ix = 0u;
+        uint bitmap = sh_bitmaps[0][th_ix];
+        while (mem_ok)
+        {
+            if (bitmap == 0u)
+            {
+                slice_ix++;
+                if (slice_ix == 8u)
+                {
+                    break;
+                }
+                bitmap = sh_bitmaps[slice_ix][th_ix];
+                if (bitmap == 0u)
+                {
+                    continue;
+                }
+            }
+            uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
+            uint element_ix_1 = sh_elements[element_ref_ix];
+            bitmap &= (bitmap - 1u);
+            AnnotatedRef _1988 = { _1249.Load(32) + (element_ix_1 * 40u) };
+            ref = _1988;
+            Alloc _1993;
+            _1993.offset = _1249.Load(32);
+            param_29.offset = _1993.offset;
+            AnnotatedRef param_30 = ref;
+            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30);
+            if (clip_zero_depth == 0u)
+            {
+                switch (tag_2.tag)
+                {
+                    case 1u:
+                    {
+                        uint param_31 = element_ref_ix;
+                        bool param_32 = mem_ok;
+                        TileRef _2029 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        Alloc param_33 = read_tile_alloc(param_31, param_32);
+                        TileRef param_34 = _2029;
+                        tile_1 = Tile_read(param_33, param_34);
+                        Alloc _2036;
+                        _2036.offset = _1249.Load(32);
+                        param_35.offset = _2036.offset;
+                        AnnotatedRef param_36 = ref;
+                        fill = Annotated_Color_read(param_35, param_36);
+                        Alloc param_37 = cmd_alloc;
+                        CmdRef param_38 = cmd_ref;
+                        uint param_39 = cmd_limit;
+                        bool _2048 = alloc_cmd(param_37, param_38, param_39);
+                        cmd_alloc = param_37;
+                        cmd_ref = param_38;
+                        cmd_limit = param_39;
+                        if (!_2048)
+                        {
+                            break;
+                        }
+                        Alloc param_40 = cmd_alloc;
+                        CmdRef param_41 = cmd_ref;
+                        uint param_42 = tag_2.flags;
+                        Tile param_43 = tile_1;
+                        float param_44 = fill.linewidth;
+                        write_fill(param_40, param_41, param_42, param_43, param_44);
+                        cmd_ref = param_41;
+                        CmdColor _2072 = { fill.rgba_color };
+                        Alloc param_45 = cmd_alloc;
+                        CmdRef param_46 = cmd_ref;
+                        CmdColor param_47 = _2072;
+                        Cmd_Color_write(param_45, param_46, param_47);
+                        cmd_ref.offset += 8u;
+                        break;
+                    }
+                    case 2u:
+                    {
+                        uint param_48 = element_ref_ix;
+                        bool param_49 = mem_ok;
+                        TileRef _2101 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        Alloc param_50 = read_tile_alloc(param_48, param_49);
+                        TileRef param_51 = _2101;
+                        tile_1 = Tile_read(param_50, param_51);
+                        Alloc _2108;
+                        _2108.offset = _1249.Load(32);
+                        param_52.offset = _2108.offset;
+                        AnnotatedRef param_53 = ref;
+                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53);
+                        Alloc param_54 = cmd_alloc;
+                        CmdRef param_55 = cmd_ref;
+                        uint param_56 = cmd_limit;
+                        bool _2120 = alloc_cmd(param_54, param_55, param_56);
+                        cmd_alloc = param_54;
+                        cmd_ref = param_55;
+                        cmd_limit = param_56;
+                        if (!_2120)
+                        {
+                            break;
+                        }
+                        Alloc param_57 = cmd_alloc;
+                        CmdRef param_58 = cmd_ref;
+                        uint param_59 = tag_2.flags;
+                        Tile param_60 = tile_1;
+                        float param_61 = fill.linewidth;
+                        write_fill(param_57, param_58, param_59, param_60, param_61);
+                        cmd_ref = param_58;
+                        cmd_lin.index = lin.index;
+                        cmd_lin.line_x = lin.line_x;
+                        cmd_lin.line_y = lin.line_y;
+                        cmd_lin.line_c = lin.line_c;
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        CmdLinGrad param_64 = cmd_lin;
+                        Cmd_LinGrad_write(param_62, param_63, param_64);
+                        cmd_ref.offset += 20u;
+                        break;
+                    }
+                    case 3u:
+                    {
+                        uint param_65 = element_ref_ix;
+                        bool param_66 = mem_ok;
+                        TileRef _2185 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        Alloc param_67 = read_tile_alloc(param_65, param_66);
+                        TileRef param_68 = _2185;
+                        tile_1 = Tile_read(param_67, param_68);
+                        Alloc _2192;
+                        _2192.offset = _1249.Load(32);
+                        param_69.offset = _2192.offset;
+                        AnnotatedRef param_70 = ref;
+                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70);
+                        Alloc param_71 = cmd_alloc;
+                        CmdRef param_72 = cmd_ref;
+                        uint param_73 = cmd_limit;
+                        bool _2204 = alloc_cmd(param_71, param_72, param_73);
+                        cmd_alloc = param_71;
+                        cmd_ref = param_72;
+                        cmd_limit = param_73;
+                        if (!_2204)
+                        {
+                            break;
+                        }
+                        Alloc param_74 = cmd_alloc;
+                        CmdRef param_75 = cmd_ref;
+                        uint param_76 = tag_2.flags;
+                        Tile param_77 = tile_1;
+                        float param_78 = fill_img.linewidth;
+                        write_fill(param_74, param_75, param_76, param_77, param_78);
+                        cmd_ref = param_75;
+                        CmdImage _2230 = { fill_img.index, fill_img.offset };
+                        Alloc param_79 = cmd_alloc;
+                        CmdRef param_80 = cmd_ref;
+                        CmdImage param_81 = _2230;
+                        Cmd_Image_write(param_79, param_80, param_81);
+                        cmd_ref.offset += 12u;
+                        break;
+                    }
+                    case 4u:
+                    {
+                        uint param_82 = element_ref_ix;
+                        bool param_83 = mem_ok;
+                        TileRef _2259 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        Alloc param_84 = read_tile_alloc(param_82, param_83);
+                        TileRef param_85 = _2259;
+                        tile_1 = Tile_read(param_84, param_85);
+                        bool _2265 = tile_1.tile.offset == 0u;
+                        bool _2271;
+                        if (_2265)
+                        {
+                            _2271 = tile_1.backdrop == 0;
+                        }
+                        else
+                        {
+                            _2271 = _2265;
+                        }
+                        if (_2271)
+                        {
+                            clip_zero_depth = clip_depth + 1u;
+                        }
+                        else
+                        {
+                            if ((tile_1.tile.offset == 0u) && (clip_depth < 32u))
+                            {
+                                clip_one_mask |= (1u << clip_depth);
+                            }
+                            else
+                            {
+                                Alloc _2293;
+                                _2293.offset = _1249.Load(32);
+                                param_86.offset = _2293.offset;
+                                AnnotatedRef param_87 = ref;
+                                AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87);
+                                Alloc param_88 = cmd_alloc;
+                                CmdRef param_89 = cmd_ref;
+                                uint param_90 = cmd_limit;
+                                bool _2305 = alloc_cmd(param_88, param_89, param_90);
+                                cmd_alloc = param_88;
+                                cmd_ref = param_89;
+                                cmd_limit = param_90;
+                                if (!_2305)
+                                {
+                                    break;
+                                }
+                                Alloc param_91 = cmd_alloc;
+                                CmdRef param_92 = cmd_ref;
+                                uint param_93 = tag_2.flags;
+                                Tile param_94 = tile_1;
+                                float param_95 = begin_clip.linewidth;
+                                write_fill(param_91, param_92, param_93, param_94, param_95);
+                                cmd_ref = param_92;
+                                Alloc param_96 = cmd_alloc;
+                                CmdRef param_97 = cmd_ref;
+                                Cmd_BeginClip_write(param_96, param_97);
+                                cmd_ref.offset += 4u;
+                                if (clip_depth < 32u)
+                                {
+                                    clip_one_mask &= (~(1u << clip_depth));
+                                }
+                            }
+                        }
+                        clip_depth++;
+                        break;
+                    }
+                    case 5u:
+                    {
+                        clip_depth--;
+                        bool _2351 = clip_depth >= 32u;
+                        bool _2360;
+                        if (!_2351)
+                        {
+                            _2360 = (clip_one_mask & (1u << clip_depth)) == 0u;
+                        }
+                        else
+                        {
+                            _2360 = _2351;
+                        }
+                        if (_2360)
+                        {
+                            Alloc param_98 = cmd_alloc;
+                            CmdRef param_99 = cmd_ref;
+                            uint param_100 = cmd_limit;
+                            bool _2369 = alloc_cmd(param_98, param_99, param_100);
+                            cmd_alloc = param_98;
+                            cmd_ref = param_99;
+                            cmd_limit = param_100;
+                            if (!_2369)
+                            {
+                                break;
+                            }
+                            Alloc param_101 = cmd_alloc;
+                            CmdRef param_102 = cmd_ref;
+                            Cmd_Solid_write(param_101, param_102);
+                            cmd_ref.offset += 4u;
+                            Alloc param_103 = cmd_alloc;
+                            CmdRef param_104 = cmd_ref;
+                            Cmd_EndClip_write(param_103, param_104);
+                            cmd_ref.offset += 4u;
+                        }
+                        break;
+                    }
+                }
+            }
+            else
+            {
+                switch (tag_2.tag)
+                {
+                    case 4u:
+                    {
+                        clip_depth++;
+                        break;
+                    }
+                    case 5u:
+                    {
+                        if (clip_depth == clip_zero_depth)
+                        {
+                            clip_zero_depth = 0u;
+                        }
+                        clip_depth--;
+                        break;
+                    }
+                }
+            }
+        }
+        GroupMemoryBarrierWithGroupSync();
+        rd_ix += 256u;
+        if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions))
+        {
+            break;
+        }
+    }
+    bool _2432 = (bin_tile_x + tile_x) < _1249.Load(8);
+    bool _2441;
+    if (_2432)
+    {
+        _2441 = (bin_tile_y + tile_y) < _1249.Load(12);
+    }
+    else
+    {
+        _2441 = _2432;
+    }
+    if (_2441)
+    {
+        Alloc param_105 = cmd_alloc;
+        CmdRef param_106 = cmd_ref;
+        Cmd_End_write(param_105, param_106);
+    }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
new file mode 100644
index 0000000..096f710
--- /dev/null
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -0,0 +1,1378 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+// Implementation of the GLSL findLSB() function
+template<typename T>
+inline T spvFindLSB(T x)
+{
+    return select(ctz(x), T(-1), x == T(0));
+}
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct MallocResult
+{
+    Alloc alloc;
+    bool failed;
+};
+
+struct AnnoImageRef
+{
+    uint offset;
+};
+
+struct AnnoImage
+{
+    float4 bbox;
+    float linewidth;
+    uint index;
+    int2 offset;
+};
+
+struct AnnoColorRef
+{
+    uint offset;
+};
+
+struct AnnoColor
+{
+    float4 bbox;
+    float linewidth;
+    uint rgba_color;
+};
+
+struct AnnoLinGradientRef
+{
+    uint offset;
+};
+
+struct AnnoLinGradient
+{
+    float4 bbox;
+    float linewidth;
+    uint index;
+    float line_x;
+    float line_y;
+    float line_c;
+};
+
+struct AnnoBeginClipRef
+{
+    uint offset;
+};
+
+struct AnnoBeginClip
+{
+    float4 bbox;
+    float linewidth;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct BinInstanceRef
+{
+    uint offset;
+};
+
+struct BinInstance
+{
+    uint element_ix;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct TileSegRef
+{
+    uint offset;
+};
+
+struct Tile
+{
+    TileSegRef tile;
+    int backdrop;
+};
+
+struct CmdStrokeRef
+{
+    uint offset;
+};
+
+struct CmdStroke
+{
+    uint tile_ref;
+    float half_width;
+};
+
+struct CmdFillRef
+{
+    uint offset;
+};
+
+struct CmdFill
+{
+    uint tile_ref;
+    int backdrop;
+};
+
+struct CmdColorRef
+{
+    uint offset;
+};
+
+struct CmdColor
+{
+    uint rgba_color;
+};
+
+struct CmdLinGradRef
+{
+    uint offset;
+};
+
+struct CmdLinGrad
+{
+    uint index;
+    float line_x;
+    float line_y;
+    float line_c;
+};
+
+struct CmdImageRef
+{
+    uint offset;
+};
+
+struct CmdImage
+{
+    uint index;
+    int2 offset;
+};
+
+struct CmdJumpRef
+{
+    uint offset;
+};
+
+struct CmdJump
+{
+    uint new_ref;
+};
+
+struct CmdRef
+{
+    uint offset;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
+{
+    return Alloc{ a.offset + offset };
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = v_296.memory[offset];
+    return v;
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+static inline __attribute__((always_inline))
+BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const uint& index)
+{
+    return BinInstanceRef{ ref.offset + (index * 4u) };
+}
+
+static inline __attribute__((always_inline))
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize);
+    BinInstance s;
+    s.element_ix = raw0;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1, v_296, v_296BufferSize);
+    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize);
+    Path s;
+    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
+    s.tiles = TileRef{ raw2 };
+    return s;
+}
+
+static inline __attribute__((always_inline))
+void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a)
+{
+}
+
+static inline __attribute__((always_inline))
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint param = 0u;
+    uint param_1 = uint(int((v_296BufferSize - 8) / 4) * 4);
+    bool param_2 = mem_ok;
+    return new_alloc(param, param_1, param_2);
+}
+
+static inline __attribute__((always_inline))
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize);
+    Tile s;
+    s.tile = TileSegRef{ raw0 };
+    s.backdrop = int(raw1);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize);
+    AnnoColor s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.linewidth = as_type<float>(raw4);
+    s.rgba_color = raw5;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u };
+    return AnnoColor_read(param, param_1, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+MallocResult malloc(thread const uint& size, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint _302 = atomic_fetch_add_explicit((device atomic_uint*)&v_296.mem_offset, size, memory_order_relaxed);
+    uint offset = _302;
+    MallocResult r;
+    r.failed = (offset + size) > uint(int((v_296BufferSize - 8) / 4) * 4);
+    uint param = offset;
+    uint param_1 = size;
+    bool param_2 = !r.failed;
+    r.alloc = new_alloc(param, param_1, param_2);
+    if (r.failed)
+    {
+        uint _331 = atomic_fetch_max_explicit((device atomic_uint*)&v_296.mem_error, 1u, memory_order_relaxed);
+        return r;
+    }
+    return r;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_296.memory[offset] = val;
+}
+
+static inline __attribute__((always_inline))
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.new_ref;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 10u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
+    CmdJump param_5 = s;
+    CmdJump_write(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    if (cmd_ref.offset < cmd_limit)
+    {
+        return true;
+    }
+    uint param = 1024u;
+    MallocResult _1156 = malloc(param, v_296, v_296BufferSize);
+    MallocResult new_cmd = _1156;
+    if (new_cmd.failed)
+    {
+        return false;
+    }
+    CmdJump jump = CmdJump{ new_cmd.alloc.offset };
+    Alloc param_1 = cmd_alloc;
+    CmdRef param_2 = cmd_ref;
+    CmdJump param_3 = jump;
+    Cmd_Jump_write(param_1, param_2, param_3, v_296, v_296BufferSize);
+    cmd_alloc = new_cmd.alloc;
+    cmd_ref = CmdRef{ cmd_alloc.offset };
+    cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
+    return true;
+}
+
+static inline __attribute__((always_inline))
+uint fill_mode_from_flags(thread const uint& flags)
+{
+    return flags & 1u;
+}
+
+static inline __attribute__((always_inline))
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.tile_ref;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = uint(s.backdrop);
+    write_mem(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 1u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
+    CmdFill param_5 = s;
+    CmdFill_write(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 3u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.tile_ref;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.half_width);
+    write_mem(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 2u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
+    CmdStroke param_5 = s;
+    CmdStroke_write(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint param = flags;
+    if (fill_mode_from_flags(param) == 0u)
+    {
+        if (tile.tile.offset != 0u)
+        {
+            CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop };
+            Alloc param_1 = alloc;
+            CmdRef param_2 = cmd_ref;
+            CmdFill param_3 = cmd_fill;
+            Cmd_Fill_write(param_1, param_2, param_3, v_296, v_296BufferSize);
+            cmd_ref.offset += 12u;
+        }
+        else
+        {
+            Alloc param_4 = alloc;
+            CmdRef param_5 = cmd_ref;
+            Cmd_Solid_write(param_4, param_5, v_296, v_296BufferSize);
+            cmd_ref.offset += 4u;
+        }
+    }
+    else
+    {
+        CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth };
+        Alloc param_6 = alloc;
+        CmdRef param_7 = cmd_ref;
+        CmdStroke param_8 = cmd_stroke;
+        Cmd_Stroke_write(param_6, param_7, param_8, v_296, v_296BufferSize);
+        cmd_ref.offset += 12u;
+    }
+}
+
+static inline __attribute__((always_inline))
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.rgba_color;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 5u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
+    CmdColor param_5 = s;
+    CmdColor_write(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_296, v_296BufferSize);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15, v_296, v_296BufferSize);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17, v_296, v_296BufferSize);
+    AnnoLinGradient s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.linewidth = as_type<float>(raw4);
+    s.index = raw5;
+    s.line_x = as_type<float>(raw6);
+    s.line_y = as_type<float>(raw7);
+    s.line_c = as_type<float>(raw8);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u };
+    return AnnoLinGradient_read(param, param_1, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.line_x);
+    write_mem(param_3, param_4, param_5, v_296, v_296BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.line_y);
+    write_mem(param_6, param_7, param_8, v_296, v_296BufferSize);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.line_c);
+    write_mem(param_9, param_10, param_11, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 6u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
+    CmdLinGrad param_5 = s;
+    CmdLinGrad_write(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_296, v_296BufferSize);
+    AnnoImage s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.linewidth = as_type<float>(raw4);
+    s.index = raw5;
+    s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u };
+    return AnnoImage_read(param, param_1, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
+    write_mem(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 7u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+    Alloc param_3 = a;
+    CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
+    CmdImage param_5 = s;
+    CmdImage_write(param_3, param_4, param_5, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+AnnoBeginClip AnnoBeginClip_read(thread const Alloc& a, thread const AnnoBeginClipRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize);
+    AnnoBeginClip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.linewidth = as_type<float>(raw4);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoBeginClip Annotated_BeginClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    AnnoBeginClipRef param_1 = AnnoBeginClipRef{ ref.offset + 4u };
+    return AnnoBeginClip_read(param, param_1, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 8u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 9u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 0u;
+    write_mem(param, param_1, param_2, v_296, v_296BufferSize);
+}
+
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_296 [[buffer(0)]], const device ConfigBuf& _1249 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+    threadgroup uint sh_bitmaps[8][256];
+    threadgroup Alloc sh_part_elements[256];
+    threadgroup uint sh_part_count[256];
+    threadgroup uint sh_elements[256];
+    threadgroup uint sh_tile_stride[256];
+    threadgroup uint sh_tile_width[256];
+    threadgroup uint sh_tile_x0[256];
+    threadgroup uint sh_tile_y0[256];
+    threadgroup uint sh_tile_base[256];
+    threadgroup uint sh_tile_count[256];
+    constant uint& v_296BufferSize = spvBufferSizeConstants[0];
+    uint width_in_bins = ((_1249.conf.width_in_tiles + 16u) - 1u) / 16u;
+    uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
+    uint partition_ix = 0u;
+    uint n_partitions = ((_1249.conf.n_elements + 256u) - 1u) / 256u;
+    uint th_ix = gl_LocalInvocationID.x;
+    uint bin_tile_x = 16u * gl_WorkGroupID.x;
+    uint bin_tile_y = 16u * gl_WorkGroupID.y;
+    uint tile_x = gl_LocalInvocationID.x % 16u;
+    uint tile_y = gl_LocalInvocationID.x / 16u;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    Alloc param;
+    param.offset = _1249.conf.ptcl_alloc.offset;
+    uint param_1 = this_tile_ix * 1024u;
+    uint param_2 = 1024u;
+    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+    CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
+    uint clip_depth = 0u;
+    uint clip_zero_depth = 0u;
+    uint clip_one_mask = 0u;
+    uint rd_ix = 0u;
+    uint wr_ix = 0u;
+    uint part_start_ix = 0u;
+    uint ready_ix = 0u;
+    bool mem_ok = v_296.mem_error == 0u;
+    Alloc param_3;
+    Alloc param_5;
+    uint _1529;
+    uint element_ix;
+    AnnotatedRef ref;
+    Alloc param_14;
+    Alloc param_16;
+    uint tile_count;
+    Alloc param_23;
+    uint _1841;
+    Alloc param_29;
+    Tile tile_1;
+    AnnoColor fill;
+    Alloc param_35;
+    Alloc param_52;
+    CmdLinGrad cmd_lin;
+    Alloc param_69;
+    Alloc param_86;
+    while (true)
+    {
+        for (uint i = 0u; i < 8u; i++)
+        {
+            sh_bitmaps[i][th_ix] = 0u;
+        }
+        bool _1581;
+        for (;;)
+        {
+            if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
+            {
+                part_start_ix = ready_ix;
+                uint count = 0u;
+                bool _1379 = th_ix < 256u;
+                bool _1387;
+                if (_1379)
+                {
+                    _1387 = (partition_ix + th_ix) < n_partitions;
+                }
+                else
+                {
+                    _1387 = _1379;
+                }
+                if (_1387)
+                {
+                    uint in_ix = (_1249.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_3.offset = _1249.conf.bin_alloc.offset;
+                    uint param_4 = in_ix;
+                    count = read_mem(param_3, param_4, v_296, v_296BufferSize);
+                    param_5.offset = _1249.conf.bin_alloc.offset;
+                    uint param_6 = in_ix + 1u;
+                    uint offset = read_mem(param_5, param_6, v_296, v_296BufferSize);
+                    uint param_7 = offset;
+                    uint param_8 = count * 4u;
+                    bool param_9 = mem_ok;
+                    sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9);
+                }
+                for (uint i_1 = 0u; i_1 < 8u; i_1++)
+                {
+                    if (th_ix < 256u)
+                    {
+                        sh_part_count[th_ix] = count;
+                    }
+                    threadgroup_barrier(mem_flags::mem_threadgroup);
+                    if (th_ix < 256u)
+                    {
+                        if (th_ix >= (1u << i_1))
+                        {
+                            count += sh_part_count[th_ix - (1u << i_1)];
+                        }
+                    }
+                    threadgroup_barrier(mem_flags::mem_threadgroup);
+                }
+                if (th_ix < 256u)
+                {
+                    sh_part_count[th_ix] = part_start_ix + count;
+                }
+                threadgroup_barrier(mem_flags::mem_threadgroup);
+                ready_ix = sh_part_count[255];
+                partition_ix += 256u;
+            }
+            uint ix = rd_ix + th_ix;
+            if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok)
+            {
+                uint part_ix = 0u;
+                for (uint i_2 = 0u; i_2 < 8u; i_2++)
+                {
+                    uint probe = part_ix + (128u >> i_2);
+                    if (ix >= sh_part_count[probe - 1u])
+                    {
+                        part_ix = probe;
+                    }
+                }
+                if (part_ix > 0u)
+                {
+                    _1529 = sh_part_count[part_ix - 1u];
+                }
+                else
+                {
+                    _1529 = part_start_ix;
+                }
+                ix -= _1529;
+                Alloc bin_alloc = sh_part_elements[part_ix];
+                BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
+                BinInstanceRef param_10 = inst_ref;
+                uint param_11 = ix;
+                Alloc param_12 = bin_alloc;
+                BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
+                BinInstance inst = BinInstance_read(param_12, param_13, v_296, v_296BufferSize);
+                sh_elements[th_ix] = inst.element_ix;
+            }
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+            wr_ix = min((rd_ix + 256u), ready_ix);
+            bool _1571 = (wr_ix - rd_ix) < 256u;
+            if (_1571)
+            {
+                _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+            }
+            else
+            {
+                _1581 = _1571;
+            }
+            if (_1581)
+            {
+                continue;
+            }
+            else
+            {
+                break;
+            }
+        }
+        uint tag = 0u;
+        if ((th_ix + rd_ix) < wr_ix)
+        {
+            element_ix = sh_elements[th_ix];
+            ref = AnnotatedRef{ _1249.conf.anno_alloc.offset + (element_ix * 40u) };
+            param_14.offset = _1249.conf.anno_alloc.offset;
+            AnnotatedRef param_15 = ref;
+            tag = Annotated_tag(param_14, param_15, v_296, v_296BufferSize).tag;
+        }
+        switch (tag)
+        {
+            case 1u:
+            case 3u:
+            case 2u:
+            case 4u:
+            case 5u:
+            {
+                uint path_ix = element_ix;
+                param_16.offset = _1249.conf.tile_alloc.offset;
+                PathRef param_17 = PathRef{ _1249.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_16, param_17, v_296, v_296BufferSize);
+                uint stride = path.bbox.z - path.bbox.x;
+                sh_tile_stride[th_ix] = stride;
+                int dx = int(path.bbox.x) - int(bin_tile_x);
+                int dy = int(path.bbox.y) - int(bin_tile_y);
+                int x0 = clamp(dx, 0, 16);
+                int y0 = clamp(dy, 0, 16);
+                int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16);
+                int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16);
+                sh_tile_width[th_ix] = uint(x1 - x0);
+                sh_tile_x0[th_ix] = uint(x0);
+                sh_tile_y0[th_ix] = uint(y0);
+                tile_count = uint(x1 - x0) * uint(y1 - y0);
+                uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
+                sh_tile_base[th_ix] = base;
+                uint param_18 = path.tiles.offset;
+                uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_20 = mem_ok;
+                Alloc path_alloc = new_alloc(param_18, param_19, param_20);
+                uint param_21 = th_ix;
+                Alloc param_22 = path_alloc;
+                write_tile_alloc(param_21, param_22);
+                break;
+            }
+            default:
+            {
+                tile_count = 0u;
+                break;
+            }
+        }
+        sh_tile_count[th_ix] = tile_count;
+        for (uint i_3 = 0u; i_3 < 8u; i_3++)
+        {
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+            if (th_ix >= (1u << i_3))
+            {
+                tile_count += sh_tile_count[th_ix - (1u << i_3)];
+            }
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+            sh_tile_count[th_ix] = tile_count;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        uint total_tile_count = sh_tile_count[255];
+        for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u)
+        {
+            uint el_ix = 0u;
+            for (uint i_4 = 0u; i_4 < 8u; i_4++)
+            {
+                uint probe_1 = el_ix + (128u >> i_4);
+                if (ix_1 >= sh_tile_count[probe_1 - 1u])
+                {
+                    el_ix = probe_1;
+                }
+            }
+            AnnotatedRef ref_1 = AnnotatedRef{ _1249.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) };
+            param_23.offset = _1249.conf.anno_alloc.offset;
+            AnnotatedRef param_24 = ref_1;
+            uint tag_1 = Annotated_tag(param_23, param_24, v_296, v_296BufferSize).tag;
+            if (el_ix > 0u)
+            {
+                _1841 = sh_tile_count[el_ix - 1u];
+            }
+            else
+            {
+                _1841 = 0u;
+            }
+            uint seq_ix = ix_1 - _1841;
+            uint width = sh_tile_width[el_ix];
+            uint x = sh_tile_x0[el_ix] + (seq_ix % width);
+            uint y = sh_tile_y0[el_ix] + (seq_ix / width);
+            bool include_tile = false;
+            if ((tag_1 == 4u) || (tag_1 == 5u))
+            {
+                include_tile = true;
+            }
+            else
+            {
+                if (mem_ok)
+                {
+                    uint param_25 = el_ix;
+                    bool param_26 = mem_ok;
+                    Alloc param_27 = read_tile_alloc(param_25, param_26, v_296, v_296BufferSize);
+                    TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                    Tile tile = Tile_read(param_27, param_28, v_296, v_296BufferSize);
+                    bool _1907 = tile.tile.offset != 0u;
+                    bool _1914;
+                    if (!_1907)
+                    {
+                        _1914 = tile.backdrop != 0;
+                    }
+                    else
+                    {
+                        _1914 = _1907;
+                    }
+                    include_tile = _1914;
+                }
+            }
+            if (include_tile)
+            {
+                uint el_slice = el_ix / 32u;
+                uint el_mask = 1u << (el_ix & 31u);
+                uint _1934 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+            }
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        uint slice_ix = 0u;
+        uint bitmap = sh_bitmaps[0][th_ix];
+        while (mem_ok)
+        {
+            if (bitmap == 0u)
+            {
+                slice_ix++;
+                if (slice_ix == 8u)
+                {
+                    break;
+                }
+                bitmap = sh_bitmaps[slice_ix][th_ix];
+                if (bitmap == 0u)
+                {
+                    continue;
+                }
+            }
+            uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
+            uint element_ix_1 = sh_elements[element_ref_ix];
+            bitmap &= (bitmap - 1u);
+            ref = AnnotatedRef{ _1249.conf.anno_alloc.offset + (element_ix_1 * 40u) };
+            param_29.offset = _1249.conf.anno_alloc.offset;
+            AnnotatedRef param_30 = ref;
+            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_296, v_296BufferSize);
+            if (clip_zero_depth == 0u)
+            {
+                switch (tag_2.tag)
+                {
+                    case 1u:
+                    {
+                        uint param_31 = element_ref_ix;
+                        bool param_32 = mem_ok;
+                        Alloc param_33 = read_tile_alloc(param_31, param_32, v_296, v_296BufferSize);
+                        TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        tile_1 = Tile_read(param_33, param_34, v_296, v_296BufferSize);
+                        param_35.offset = _1249.conf.anno_alloc.offset;
+                        AnnotatedRef param_36 = ref;
+                        fill = Annotated_Color_read(param_35, param_36, v_296, v_296BufferSize);
+                        Alloc param_37 = cmd_alloc;
+                        CmdRef param_38 = cmd_ref;
+                        uint param_39 = cmd_limit;
+                        bool _2048 = alloc_cmd(param_37, param_38, param_39, v_296, v_296BufferSize);
+                        cmd_alloc = param_37;
+                        cmd_ref = param_38;
+                        cmd_limit = param_39;
+                        if (!_2048)
+                        {
+                            break;
+                        }
+                        Alloc param_40 = cmd_alloc;
+                        CmdRef param_41 = cmd_ref;
+                        uint param_42 = tag_2.flags;
+                        Tile param_43 = tile_1;
+                        float param_44 = fill.linewidth;
+                        write_fill(param_40, param_41, param_42, param_43, param_44, v_296, v_296BufferSize);
+                        cmd_ref = param_41;
+                        Alloc param_45 = cmd_alloc;
+                        CmdRef param_46 = cmd_ref;
+                        CmdColor param_47 = CmdColor{ fill.rgba_color };
+                        Cmd_Color_write(param_45, param_46, param_47, v_296, v_296BufferSize);
+                        cmd_ref.offset += 8u;
+                        break;
+                    }
+                    case 2u:
+                    {
+                        uint param_48 = element_ref_ix;
+                        bool param_49 = mem_ok;
+                        Alloc param_50 = read_tile_alloc(param_48, param_49, v_296, v_296BufferSize);
+                        TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        tile_1 = Tile_read(param_50, param_51, v_296, v_296BufferSize);
+                        param_52.offset = _1249.conf.anno_alloc.offset;
+                        AnnotatedRef param_53 = ref;
+                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_296, v_296BufferSize);
+                        Alloc param_54 = cmd_alloc;
+                        CmdRef param_55 = cmd_ref;
+                        uint param_56 = cmd_limit;
+                        bool _2120 = alloc_cmd(param_54, param_55, param_56, v_296, v_296BufferSize);
+                        cmd_alloc = param_54;
+                        cmd_ref = param_55;
+                        cmd_limit = param_56;
+                        if (!_2120)
+                        {
+                            break;
+                        }
+                        Alloc param_57 = cmd_alloc;
+                        CmdRef param_58 = cmd_ref;
+                        uint param_59 = tag_2.flags;
+                        Tile param_60 = tile_1;
+                        float param_61 = fill.linewidth;
+                        write_fill(param_57, param_58, param_59, param_60, param_61, v_296, v_296BufferSize);
+                        cmd_ref = param_58;
+                        cmd_lin.index = lin.index;
+                        cmd_lin.line_x = lin.line_x;
+                        cmd_lin.line_y = lin.line_y;
+                        cmd_lin.line_c = lin.line_c;
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        CmdLinGrad param_64 = cmd_lin;
+                        Cmd_LinGrad_write(param_62, param_63, param_64, v_296, v_296BufferSize);
+                        cmd_ref.offset += 20u;
+                        break;
+                    }
+                    case 3u:
+                    {
+                        uint param_65 = element_ref_ix;
+                        bool param_66 = mem_ok;
+                        Alloc param_67 = read_tile_alloc(param_65, param_66, v_296, v_296BufferSize);
+                        TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        tile_1 = Tile_read(param_67, param_68, v_296, v_296BufferSize);
+                        param_69.offset = _1249.conf.anno_alloc.offset;
+                        AnnotatedRef param_70 = ref;
+                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_296, v_296BufferSize);
+                        Alloc param_71 = cmd_alloc;
+                        CmdRef param_72 = cmd_ref;
+                        uint param_73 = cmd_limit;
+                        bool _2204 = alloc_cmd(param_71, param_72, param_73, v_296, v_296BufferSize);
+                        cmd_alloc = param_71;
+                        cmd_ref = param_72;
+                        cmd_limit = param_73;
+                        if (!_2204)
+                        {
+                            break;
+                        }
+                        Alloc param_74 = cmd_alloc;
+                        CmdRef param_75 = cmd_ref;
+                        uint param_76 = tag_2.flags;
+                        Tile param_77 = tile_1;
+                        float param_78 = fill_img.linewidth;
+                        write_fill(param_74, param_75, param_76, param_77, param_78, v_296, v_296BufferSize);
+                        cmd_ref = param_75;
+                        Alloc param_79 = cmd_alloc;
+                        CmdRef param_80 = cmd_ref;
+                        CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset };
+                        Cmd_Image_write(param_79, param_80, param_81, v_296, v_296BufferSize);
+                        cmd_ref.offset += 12u;
+                        break;
+                    }
+                    case 4u:
+                    {
+                        uint param_82 = element_ref_ix;
+                        bool param_83 = mem_ok;
+                        Alloc param_84 = read_tile_alloc(param_82, param_83, v_296, v_296BufferSize);
+                        TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        tile_1 = Tile_read(param_84, param_85, v_296, v_296BufferSize);
+                        bool _2265 = tile_1.tile.offset == 0u;
+                        bool _2271;
+                        if (_2265)
+                        {
+                            _2271 = tile_1.backdrop == 0;
+                        }
+                        else
+                        {
+                            _2271 = _2265;
+                        }
+                        if (_2271)
+                        {
+                            clip_zero_depth = clip_depth + 1u;
+                        }
+                        else
+                        {
+                            if ((tile_1.tile.offset == 0u) && (clip_depth < 32u))
+                            {
+                                clip_one_mask |= (1u << clip_depth);
+                            }
+                            else
+                            {
+                                param_86.offset = _1249.conf.anno_alloc.offset;
+                                AnnotatedRef param_87 = ref;
+                                AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87, v_296, v_296BufferSize);
+                                Alloc param_88 = cmd_alloc;
+                                CmdRef param_89 = cmd_ref;
+                                uint param_90 = cmd_limit;
+                                bool _2305 = alloc_cmd(param_88, param_89, param_90, v_296, v_296BufferSize);
+                                cmd_alloc = param_88;
+                                cmd_ref = param_89;
+                                cmd_limit = param_90;
+                                if (!_2305)
+                                {
+                                    break;
+                                }
+                                Alloc param_91 = cmd_alloc;
+                                CmdRef param_92 = cmd_ref;
+                                uint param_93 = tag_2.flags;
+                                Tile param_94 = tile_1;
+                                float param_95 = begin_clip.linewidth;
+                                write_fill(param_91, param_92, param_93, param_94, param_95, v_296, v_296BufferSize);
+                                cmd_ref = param_92;
+                                Alloc param_96 = cmd_alloc;
+                                CmdRef param_97 = cmd_ref;
+                                Cmd_BeginClip_write(param_96, param_97, v_296, v_296BufferSize);
+                                cmd_ref.offset += 4u;
+                                if (clip_depth < 32u)
+                                {
+                                    clip_one_mask &= (~(1u << clip_depth));
+                                }
+                            }
+                        }
+                        clip_depth++;
+                        break;
+                    }
+                    case 5u:
+                    {
+                        clip_depth--;
+                        bool _2351 = clip_depth >= 32u;
+                        bool _2360;
+                        if (!_2351)
+                        {
+                            _2360 = (clip_one_mask & (1u << clip_depth)) == 0u;
+                        }
+                        else
+                        {
+                            _2360 = _2351;
+                        }
+                        if (_2360)
+                        {
+                            Alloc param_98 = cmd_alloc;
+                            CmdRef param_99 = cmd_ref;
+                            uint param_100 = cmd_limit;
+                            bool _2369 = alloc_cmd(param_98, param_99, param_100, v_296, v_296BufferSize);
+                            cmd_alloc = param_98;
+                            cmd_ref = param_99;
+                            cmd_limit = param_100;
+                            if (!_2369)
+                            {
+                                break;
+                            }
+                            Alloc param_101 = cmd_alloc;
+                            CmdRef param_102 = cmd_ref;
+                            Cmd_Solid_write(param_101, param_102, v_296, v_296BufferSize);
+                            cmd_ref.offset += 4u;
+                            Alloc param_103 = cmd_alloc;
+                            CmdRef param_104 = cmd_ref;
+                            Cmd_EndClip_write(param_103, param_104, v_296, v_296BufferSize);
+                            cmd_ref.offset += 4u;
+                        }
+                        break;
+                    }
+                }
+            }
+            else
+            {
+                switch (tag_2.tag)
+                {
+                    case 4u:
+                    {
+                        clip_depth++;
+                        break;
+                    }
+                    case 5u:
+                    {
+                        if (clip_depth == clip_zero_depth)
+                        {
+                            clip_zero_depth = 0u;
+                        }
+                        clip_depth--;
+                        break;
+                    }
+                }
+            }
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        rd_ix += 256u;
+        if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions))
+        {
+            break;
+        }
+    }
+    bool _2432 = (bin_tile_x + tile_x) < _1249.conf.width_in_tiles;
+    bool _2441;
+    if (_2432)
+    {
+        _2441 = (bin_tile_y + tile_y) < _1249.conf.height_in_tiles;
+    }
+    else
+    {
+        _2441 = _2432;
+    }
+    if (_2441)
+    {
+        Alloc param_105 = cmd_alloc;
+        CmdRef param_106 = cmd_ref;
+        Cmd_End_write(param_105, param_106, v_296, v_296BufferSize);
+    }
+}
+
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/gen/coarse.spv
similarity index 61%
rename from piet-gpu/shader/coarse.spv
rename to piet-gpu/shader/gen/coarse.spv
index 8d4f7c0..fbe025d 100644
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
index 0ef9538..0dec2cd 100644
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -158,10 +158,10 @@
 static const DrawMonoid _445 = { 1u, 1u };
 static const DrawMonoid _447 = { 0u, 1u };
 
-RWByteAddressBuffer _201 : register(u0);
-ByteAddressBuffer _225 : register(t2);
-ByteAddressBuffer _1008 : register(t3);
-ByteAddressBuffer _1042 : register(t1);
+RWByteAddressBuffer _201 : register(u0, space0);
+ByteAddressBuffer _225 : register(t2, space0);
+ByteAddressBuffer _1008 : register(t3, space0);
+ByteAddressBuffer _1042 : register(t1, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
index b28c956..216d923 100644
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -49,10 +49,10 @@
 static const DrawMonoid _92 = { 0u, 1u };
 static const DrawMonoid _94 = { 0u, 0u };
 
-ByteAddressBuffer _46 : register(t2);
-RWByteAddressBuffer _203 : register(u3);
-RWByteAddressBuffer _217 : register(u0);
-ByteAddressBuffer _223 : register(t1);
+ByteAddressBuffer _46 : register(t2, space0);
+RWByteAddressBuffer _203 : register(u3, space0);
+RWByteAddressBuffer _217 : register(u0, space0);
+ByteAddressBuffer _223 : register(t1, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl
index 7dc68b1..ec75d5c 100644
--- a/piet-gpu/shader/gen/draw_root.hlsl
+++ b/piet-gpu/shader/gen/draw_root.hlsl
@@ -8,7 +8,7 @@
 
 static const DrawMonoid _18 = { 0u, 0u };
 
-RWByteAddressBuffer _57 : register(u0);
+RWByteAddressBuffer _57 : register(u0, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
new file mode 100644
index 0000000..3b3c42e
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
new file mode 100644
index 0000000..8b0699a
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -0,0 +1,689 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct CmdStrokeRef
+{
+    uint offset;
+};
+
+struct CmdStroke
+{
+    uint tile_ref;
+    float half_width;
+};
+
+struct CmdFillRef
+{
+    uint offset;
+};
+
+struct CmdFill
+{
+    uint tile_ref;
+    int backdrop;
+};
+
+struct CmdColorRef
+{
+    uint offset;
+};
+
+struct CmdColor
+{
+    uint rgba_color;
+};
+
+struct CmdLinGradRef
+{
+    uint offset;
+};
+
+struct CmdLinGrad
+{
+    uint index;
+    float line_x;
+    float line_y;
+    float line_c;
+};
+
+struct CmdImageRef
+{
+    uint offset;
+};
+
+struct CmdImage
+{
+    uint index;
+    int2 offset;
+};
+
+struct CmdAlphaRef
+{
+    uint offset;
+};
+
+struct CmdAlpha
+{
+    float alpha;
+};
+
+struct CmdJumpRef
+{
+    uint offset;
+};
+
+struct CmdJump
+{
+    uint new_ref;
+};
+
+struct CmdRef
+{
+    uint offset;
+};
+
+struct CmdTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct TileSegRef
+{
+    uint offset;
+};
+
+struct TileSeg
+{
+    float2 origin;
+    float2 _vector;
+    float y_edge;
+    TileSegRef next;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
+
+RWByteAddressBuffer _202 : register(u0, space0);
+ByteAddressBuffer _723 : register(t1, space0);
+RWTexture2D<unorm float4> image_atlas : register(u3, space0);
+RWTexture2D<unorm float4> gradients : register(u4, space0);
+RWTexture2D<unorm float4> image : register(u2, space0);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+};
+
+uint spvPackUnorm4x8(float4 value)
+{
+    uint4 Packed = uint4(round(saturate(value) * 255.0));
+    return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24);
+}
+
+float4 spvUnpackUnorm4x8(uint value)
+{
+    uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24);
+    return float4(Packed) / 255.0;
+}
+
+Alloc slice_mem(Alloc a, uint offset, uint size)
+{
+    Alloc _215 = { a.offset + offset };
+    return _215;
+}
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = _202.Load(offset * 4 + 8);
+    return v;
+}
+
+CmdTag Cmd_tag(Alloc a, CmdRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1);
+    CmdTag _432 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _432;
+}
+
+CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    CmdStroke s;
+    s.tile_ref = raw0;
+    s.half_width = asfloat(raw1);
+    return s;
+}
+
+CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
+{
+    CmdStrokeRef _449 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdStrokeRef param_1 = _449;
+    return CmdStroke_read(param, param_1);
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+TileSeg TileSeg_read(Alloc a, TileSegRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    TileSeg s;
+    s.origin = float2(asfloat(raw0), asfloat(raw1));
+    s._vector = float2(asfloat(raw2), asfloat(raw3));
+    s.y_edge = asfloat(raw4);
+    TileSegRef _572 = { raw5 };
+    s.next = _572;
+    return s;
+}
+
+uint2 chunk_offset(uint i)
+{
+    return uint2((i % 2u) * 8u, (i / 2u) * 4u);
+}
+
+CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    CmdFill s;
+    s.tile_ref = raw0;
+    s.backdrop = int(raw1);
+    return s;
+}
+
+CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
+{
+    CmdFillRef _439 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdFillRef param_1 = _439;
+    return CmdFill_read(param, param_1);
+}
+
+CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    CmdAlpha s;
+    s.alpha = asfloat(raw0);
+    return s;
+}
+
+CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
+{
+    CmdAlphaRef _459 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdAlphaRef param_1 = _459;
+    return CmdAlpha_read(param, param_1);
+}
+
+CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    CmdColor s;
+    s.rgba_color = raw0;
+    return s;
+}
+
+CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
+{
+    CmdColorRef _469 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdColorRef param_1 = _469;
+    return CmdColor_read(param, param_1);
+}
+
+float3 fromsRGB(float3 srgb)
+{
+    bool3 cutoff = bool3(srgb.x >= 0.040449999272823333740234375f.xxx.x, srgb.y >= 0.040449999272823333740234375f.xxx.y, srgb.z >= 0.040449999272823333740234375f.xxx.z);
+    float3 below = srgb / 12.9200000762939453125f.xxx;
+    float3 above = pow((srgb + 0.054999999701976776123046875f.xxx) / 1.05499994754791259765625f.xxx, 2.400000095367431640625f.xxx);
+    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+}
+
+float4 unpacksRGB(uint srgba)
+{
+    float4 color = spvUnpackUnorm4x8(srgba).wzyx;
+    float3 param = color.xyz;
+    return float4(fromsRGB(param), color.w);
+}
+
+CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    CmdLinGrad s;
+    s.index = raw0;
+    s.line_x = asfloat(raw1);
+    s.line_y = asfloat(raw2);
+    s.line_c = asfloat(raw3);
+    return s;
+}
+
+CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
+{
+    CmdLinGradRef _479 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdLinGradRef param_1 = _479;
+    return CmdLinGrad_read(param, param_1);
+}
+
+CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    CmdImage s;
+    s.index = raw0;
+    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+    return s;
+}
+
+CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
+{
+    CmdImageRef _489 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdImageRef param_1 = _489;
+    return CmdImage_read(param, param_1);
+}
+
+void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
+{
+    float4 rgba[8];
+    for (uint i = 0u; i < 8u; i++)
+    {
+        uint param = i;
+        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
+        float4 fg_rgba = image_atlas[uv];
+        float3 param_1 = fg_rgba.xyz;
+        float3 _695 = fromsRGB(param_1);
+        fg_rgba.x = _695.x;
+        fg_rgba.y = _695.y;
+        fg_rgba.z = _695.z;
+        rgba[i] = fg_rgba;
+    }
+    spvReturnValue = rgba;
+}
+
+float3 tosRGB(float3 rgb)
+{
+    bool3 cutoff = bool3(rgb.x >= 0.003130800090730190277099609375f.xxx.x, rgb.y >= 0.003130800090730190277099609375f.xxx.y, rgb.z >= 0.003130800090730190277099609375f.xxx.z);
+    float3 below = 12.9200000762939453125f.xxx * rgb;
+    float3 above = (1.05499994754791259765625f.xxx * pow(rgb, 0.416660010814666748046875f.xxx)) - 0.054999999701976776123046875f.xxx;
+    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+}
+
+uint packsRGB(inout float4 rgba)
+{
+    float3 param = rgba.xyz;
+    rgba = float4(tosRGB(param), rgba.w);
+    return spvPackUnorm4x8(rgba.wzyx);
+}
+
+CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    CmdJump s;
+    s.new_ref = raw0;
+    return s;
+}
+
+CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
+{
+    CmdJumpRef _499 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdJumpRef param_1 = _499;
+    return CmdJump_read(param, param_1);
+}
+
+void comp_main()
+{
+    uint tile_ix = (gl_WorkGroupID.y * _723.Load(8)) + gl_WorkGroupID.x;
+    Alloc _738;
+    _738.offset = _723.Load(24);
+    Alloc param;
+    param.offset = _738.offset;
+    uint param_1 = tile_ix * 1024u;
+    uint param_2 = 1024u;
+    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+    CmdRef _747 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _747;
+    uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
+    float2 xy = float2(xy_uint);
+    float4 rgba[8];
+    for (uint i = 0u; i < 8u; i++)
+    {
+        rgba[i] = 0.0f.xxxx;
+    }
+    uint clip_depth = 0u;
+    bool mem_ok = _202.Load(4) == 0u;
+    float df[8];
+    TileSegRef tile_seg_ref;
+    float area[8];
+    uint blend_stack[128][8];
+    float blend_alpha_stack[128][8];
+    while (mem_ok)
+    {
+        Alloc param_3 = cmd_alloc;
+        CmdRef param_4 = cmd_ref;
+        uint tag = Cmd_tag(param_3, param_4).tag;
+        if (tag == 0u)
+        {
+            break;
+        }
+        switch (tag)
+        {
+            case 2u:
+            {
+                Alloc param_5 = cmd_alloc;
+                CmdRef param_6 = cmd_ref;
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6);
+                for (uint k = 0u; k < 8u; k++)
+                {
+                    df[k] = 1000000000.0f;
+                }
+                TileSegRef _842 = { stroke.tile_ref };
+                tile_seg_ref = _842;
+                do
+                {
+                    uint param_7 = tile_seg_ref.offset;
+                    uint param_8 = 24u;
+                    bool param_9 = mem_ok;
+                    Alloc param_10 = new_alloc(param_7, param_8, param_9);
+                    TileSegRef param_11 = tile_seg_ref;
+                    TileSeg seg = TileSeg_read(param_10, param_11);
+                    float2 line_vec = seg._vector;
+                    for (uint k_1 = 0u; k_1 < 8u; k_1++)
+                    {
+                        float2 dpos = (xy + 0.5f.xx) - seg.origin;
+                        uint param_12 = k_1;
+                        dpos += float2(chunk_offset(param_12));
+                        float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f);
+                        df[k_1] = min(df[k_1], length((line_vec * t) - dpos));
+                    }
+                    tile_seg_ref = seg.next;
+                } while (tile_seg_ref.offset != 0u);
+                for (uint k_2 = 0u; k_2 < 8u; k_2++)
+                {
+                    area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f);
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 1u:
+            {
+                Alloc param_13 = cmd_alloc;
+                CmdRef param_14 = cmd_ref;
+                CmdFill fill = Cmd_Fill_read(param_13, param_14);
+                for (uint k_3 = 0u; k_3 < 8u; k_3++)
+                {
+                    area[k_3] = float(fill.backdrop);
+                }
+                TileSegRef _964 = { fill.tile_ref };
+                tile_seg_ref = _964;
+                do
+                {
+                    uint param_15 = tile_seg_ref.offset;
+                    uint param_16 = 24u;
+                    bool param_17 = mem_ok;
+                    Alloc param_18 = new_alloc(param_15, param_16, param_17);
+                    TileSegRef param_19 = tile_seg_ref;
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19);
+                    for (uint k_4 = 0u; k_4 < 8u; k_4++)
+                    {
+                        uint param_20 = k_4;
+                        float2 my_xy = xy + float2(chunk_offset(param_20));
+                        float2 start = seg_1.origin - my_xy;
+                        float2 end = start + seg_1._vector;
+                        float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx);
+                        if (window.x != window.y)
+                        {
+                            float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx;
+                            float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y));
+                            float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f;
+                            float xmax = max(xs.x, xs.y);
+                            float b = min(xmax, 1.0f);
+                            float c = max(b, 0.0f);
+                            float d = max(xmin, 0.0f);
+                            float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin);
+                            area[k_4] += (a * (window.x - window.y));
+                        }
+                        area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f));
+                    }
+                    tile_seg_ref = seg_1.next;
+                } while (tile_seg_ref.offset != 0u);
+                for (uint k_5 = 0u; k_5 < 8u; k_5++)
+                {
+                    area[k_5] = min(abs(area[k_5]), 1.0f);
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 3u:
+            {
+                for (uint k_6 = 0u; k_6 < 8u; k_6++)
+                {
+                    area[k_6] = 1.0f;
+                }
+                cmd_ref.offset += 4u;
+                break;
+            }
+            case 4u:
+            {
+                Alloc param_21 = cmd_alloc;
+                CmdRef param_22 = cmd_ref;
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22);
+                for (uint k_7 = 0u; k_7 < 8u; k_7++)
+                {
+                    area[k_7] = alpha.alpha;
+                }
+                cmd_ref.offset += 8u;
+                break;
+            }
+            case 5u:
+            {
+                Alloc param_23 = cmd_alloc;
+                CmdRef param_24 = cmd_ref;
+                CmdColor color = Cmd_Color_read(param_23, param_24);
+                uint param_25 = color.rgba_color;
+                float4 fg = unpacksRGB(param_25);
+                for (uint k_8 = 0u; k_8 < 8u; k_8++)
+                {
+                    float4 fg_k = fg * area[k_8];
+                    rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k;
+                }
+                cmd_ref.offset += 8u;
+                break;
+            }
+            case 6u:
+            {
+                Alloc param_26 = cmd_alloc;
+                CmdRef param_27 = cmd_ref;
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27);
+                float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
+                for (uint k_9 = 0u; k_9 < 8u; k_9++)
+                {
+                    uint param_28 = k_9;
+                    float2 chunk_xy = float2(chunk_offset(param_28));
+                    float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y);
+                    int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
+                    float4 fg_rgba = gradients[int2(x, int(lin.index))];
+                    float3 param_29 = fg_rgba.xyz;
+                    float3 _1298 = fromsRGB(param_29);
+                    fg_rgba.x = _1298.x;
+                    fg_rgba.y = _1298.y;
+                    fg_rgba.z = _1298.z;
+                    rgba[k_9] = fg_rgba;
+                }
+                cmd_ref.offset += 20u;
+                break;
+            }
+            case 7u:
+            {
+                Alloc param_30 = cmd_alloc;
+                CmdRef param_31 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_30, param_31);
+                uint2 param_32 = xy_uint;
+                CmdImage param_33 = fill_img;
+                float4 _1327[8];
+                fillImage(_1327, param_32, param_33);
+                float4 img[8] = _1327;
+                for (uint k_10 = 0u; k_10 < 8u; k_10++)
+                {
+                    float4 fg_k_1 = img[k_10] * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 8u:
+            {
+                for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    uint d_2 = min(clip_depth, 127u);
+                    float4 param_34 = float4(rgba[k_11]);
+                    uint _1390 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _1390;
+                    blend_alpha_stack[d_2][k_11] = clamp(abs(area[k_11]), 0.0f, 1.0f);
+                    rgba[k_11] = 0.0f.xxxx;
+                }
+                clip_depth++;
+                cmd_ref.offset += 4u;
+                break;
+            }
+            case 9u:
+            {
+                clip_depth--;
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                {
+                    uint d_3 = min(clip_depth, 127u);
+                    uint param_35 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_35);
+                    float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12];
+                    rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1;
+                }
+                cmd_ref.offset += 4u;
+                break;
+            }
+            case 10u:
+            {
+                Alloc param_36 = cmd_alloc;
+                CmdRef param_37 = cmd_ref;
+                CmdRef _1469 = { Cmd_Jump_read(param_36, param_37).new_ref };
+                cmd_ref = _1469;
+                cmd_alloc.offset = cmd_ref.offset;
+                break;
+            }
+        }
+    }
+    for (uint i_1 = 0u; i_1 < 8u; i_1++)
+    {
+        uint param_38 = i_1;
+        float3 param_39 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_38))] = float4(tosRGB(param_39), rgba[i_1].w);
+    }
+}
+
+[numthreads(8, 4, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
new file mode 100644
index 0000000..9318cc8
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -0,0 +1,728 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct CmdStrokeRef
+{
+    uint offset;
+};
+
+struct CmdStroke
+{
+    uint tile_ref;
+    float half_width;
+};
+
+struct CmdFillRef
+{
+    uint offset;
+};
+
+struct CmdFill
+{
+    uint tile_ref;
+    int backdrop;
+};
+
+struct CmdColorRef
+{
+    uint offset;
+};
+
+struct CmdColor
+{
+    uint rgba_color;
+};
+
+struct CmdLinGradRef
+{
+    uint offset;
+};
+
+struct CmdLinGrad
+{
+    uint index;
+    float line_x;
+    float line_y;
+    float line_c;
+};
+
+struct CmdImageRef
+{
+    uint offset;
+};
+
+struct CmdImage
+{
+    uint index;
+    int2 offset;
+};
+
+struct CmdAlphaRef
+{
+    uint offset;
+};
+
+struct CmdAlpha
+{
+    float alpha;
+};
+
+struct CmdJumpRef
+{
+    uint offset;
+};
+
+struct CmdJump
+{
+    uint new_ref;
+};
+
+struct CmdRef
+{
+    uint offset;
+};
+
+struct CmdTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct TileSegRef
+{
+    uint offset;
+};
+
+struct TileSeg
+{
+    float2 origin;
+    float2 vector;
+    float y_edge;
+    TileSegRef next;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u);
+
+static inline __attribute__((always_inline))
+Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
+{
+    return Alloc{ a.offset + offset };
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = v_202.memory[offset];
+    return v;
+}
+
+static inline __attribute__((always_inline))
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1, v_202);
+    return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_202);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_202);
+    CmdStroke s;
+    s.tile_ref = raw0;
+    s.half_width = as_type<float>(raw1);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+{
+    Alloc param = a;
+    CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
+    return CmdStroke_read(param, param_1, v_202);
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+static inline __attribute__((always_inline))
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_202);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_202);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_202);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_202);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_202);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_202);
+    TileSeg s;
+    s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
+    s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
+    s.y_edge = as_type<float>(raw4);
+    s.next = TileSegRef{ raw5 };
+    return s;
+}
+
+static inline __attribute__((always_inline))
+uint2 chunk_offset(thread const uint& i)
+{
+    return uint2((i % 2u) * 8u, (i / 2u) * 4u);
+}
+
+static inline __attribute__((always_inline))
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_202);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_202);
+    CmdFill s;
+    s.tile_ref = raw0;
+    s.backdrop = int(raw1);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+{
+    Alloc param = a;
+    CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
+    return CmdFill_read(param, param_1, v_202);
+}
+
+static inline __attribute__((always_inline))
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_202);
+    CmdAlpha s;
+    s.alpha = as_type<float>(raw0);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+{
+    Alloc param = a;
+    CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
+    return CmdAlpha_read(param, param_1, v_202);
+}
+
+static inline __attribute__((always_inline))
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_202);
+    CmdColor s;
+    s.rgba_color = raw0;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+{
+    Alloc param = a;
+    CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
+    return CmdColor_read(param, param_1, v_202);
+}
+
+static inline __attribute__((always_inline))
+float3 fromsRGB(thread const float3& srgb)
+{
+    bool3 cutoff = srgb >= float3(0.040449999272823333740234375);
+    float3 below = srgb / float3(12.9200000762939453125);
+    float3 above = pow((srgb + float3(0.054999999701976776123046875)) / float3(1.05499994754791259765625), float3(2.400000095367431640625));
+    return select(below, above, cutoff);
+}
+
+static inline __attribute__((always_inline))
+float4 unpacksRGB(thread const uint& srgba)
+{
+    float4 color = unpack_unorm4x8_to_float(srgba).wzyx;
+    float3 param = color.xyz;
+    return float4(fromsRGB(param), color.w);
+}
+
+static inline __attribute__((always_inline))
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_202);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_202);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_202);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_202);
+    CmdLinGrad s;
+    s.index = raw0;
+    s.line_x = as_type<float>(raw1);
+    s.line_y = as_type<float>(raw2);
+    s.line_c = as_type<float>(raw3);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+{
+    Alloc param = a;
+    CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
+    return CmdLinGrad_read(param, param_1, v_202);
+}
+
+static inline __attribute__((always_inline))
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_202);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_202);
+    CmdImage s;
+    s.index = raw0;
+    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+{
+    Alloc param = a;
+    CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
+    return CmdImage_read(param, param_1, v_202);
+}
+
+static inline __attribute__((always_inline))
+spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, thread texture2d<float> image_atlas)
+{
+    spvUnsafeArray<float4, 8> rgba;
+    for (uint i = 0u; i < 8u; i++)
+    {
+        uint param = i;
+        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
+        float4 fg_rgba = image_atlas.read(uint2(uv));
+        float3 param_1 = fg_rgba.xyz;
+        float3 _695 = fromsRGB(param_1);
+        fg_rgba.x = _695.x;
+        fg_rgba.y = _695.y;
+        fg_rgba.z = _695.z;
+        rgba[i] = fg_rgba;
+    }
+    return rgba;
+}
+
+static inline __attribute__((always_inline))
+float3 tosRGB(thread const float3& rgb)
+{
+    bool3 cutoff = rgb >= float3(0.003130800090730190277099609375);
+    float3 below = float3(12.9200000762939453125) * rgb;
+    float3 above = (float3(1.05499994754791259765625) * pow(rgb, float3(0.416660010814666748046875))) - float3(0.054999999701976776123046875);
+    return select(below, above, cutoff);
+}
+
+static inline __attribute__((always_inline))
+uint packsRGB(thread float4& rgba)
+{
+    float3 param = rgba.xyz;
+    rgba = float4(tosRGB(param), rgba.w);
+    return pack_float_to_unorm4x8(rgba.wzyx);
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_202);
+    CmdJump s;
+    s.new_ref = raw0;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+{
+    Alloc param = a;
+    CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
+    return CmdJump_read(param, param_1, v_202);
+}
+
+kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+    uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x;
+    Alloc param;
+    param.offset = _723.conf.ptcl_alloc.offset;
+    uint param_1 = tile_ix * 1024u;
+    uint param_2 = 1024u;
+    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+    CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
+    float2 xy = float2(xy_uint);
+    spvUnsafeArray<float4, 8> rgba;
+    for (uint i = 0u; i < 8u; i++)
+    {
+        rgba[i] = float4(0.0);
+    }
+    uint clip_depth = 0u;
+    bool mem_ok = v_202.mem_error == 0u;
+    spvUnsafeArray<float, 8> df;
+    TileSegRef tile_seg_ref;
+    spvUnsafeArray<float, 8> area;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<float, 8>, 128> blend_alpha_stack;
+    while (mem_ok)
+    {
+        Alloc param_3 = cmd_alloc;
+        CmdRef param_4 = cmd_ref;
+        uint tag = Cmd_tag(param_3, param_4, v_202).tag;
+        if (tag == 0u)
+        {
+            break;
+        }
+        switch (tag)
+        {
+            case 2u:
+            {
+                Alloc param_5 = cmd_alloc;
+                CmdRef param_6 = cmd_ref;
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202);
+                for (uint k = 0u; k < 8u; k++)
+                {
+                    df[k] = 1000000000.0;
+                }
+                tile_seg_ref = TileSegRef{ stroke.tile_ref };
+                do
+                {
+                    uint param_7 = tile_seg_ref.offset;
+                    uint param_8 = 24u;
+                    bool param_9 = mem_ok;
+                    Alloc param_10 = new_alloc(param_7, param_8, param_9);
+                    TileSegRef param_11 = tile_seg_ref;
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_202);
+                    float2 line_vec = seg.vector;
+                    for (uint k_1 = 0u; k_1 < 8u; k_1++)
+                    {
+                        float2 dpos = (xy + float2(0.5)) - seg.origin;
+                        uint param_12 = k_1;
+                        dpos += float2(chunk_offset(param_12));
+                        float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
+                        df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos));
+                    }
+                    tile_seg_ref = seg.next;
+                } while (tile_seg_ref.offset != 0u);
+                for (uint k_2 = 0u; k_2 < 8u; k_2++)
+                {
+                    area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0);
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 1u:
+            {
+                Alloc param_13 = cmd_alloc;
+                CmdRef param_14 = cmd_ref;
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202);
+                for (uint k_3 = 0u; k_3 < 8u; k_3++)
+                {
+                    area[k_3] = float(fill.backdrop);
+                }
+                tile_seg_ref = TileSegRef{ fill.tile_ref };
+                do
+                {
+                    uint param_15 = tile_seg_ref.offset;
+                    uint param_16 = 24u;
+                    bool param_17 = mem_ok;
+                    Alloc param_18 = new_alloc(param_15, param_16, param_17);
+                    TileSegRef param_19 = tile_seg_ref;
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202);
+                    for (uint k_4 = 0u; k_4 < 8u; k_4++)
+                    {
+                        uint param_20 = k_4;
+                        float2 my_xy = xy + float2(chunk_offset(param_20));
+                        float2 start = seg_1.origin - my_xy;
+                        float2 end = start + seg_1.vector;
+                        float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0));
+                        if ((isunordered(window.x, window.y) || window.x != window.y))
+                        {
+                            float2 t_1 = (window - float2(start.y)) / float2(seg_1.vector.y);
+                            float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y));
+                            float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07;
+                            float xmax = fast::max(xs.x, xs.y);
+                            float b = fast::min(xmax, 1.0);
+                            float c = fast::max(b, 0.0);
+                            float d = fast::max(xmin, 0.0);
+                            float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin);
+                            area[k_4] += (a * (window.x - window.y));
+                        }
+                        area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0));
+                    }
+                    tile_seg_ref = seg_1.next;
+                } while (tile_seg_ref.offset != 0u);
+                for (uint k_5 = 0u; k_5 < 8u; k_5++)
+                {
+                    area[k_5] = fast::min(abs(area[k_5]), 1.0);
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 3u:
+            {
+                for (uint k_6 = 0u; k_6 < 8u; k_6++)
+                {
+                    area[k_6] = 1.0;
+                }
+                cmd_ref.offset += 4u;
+                break;
+            }
+            case 4u:
+            {
+                Alloc param_21 = cmd_alloc;
+                CmdRef param_22 = cmd_ref;
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202);
+                for (uint k_7 = 0u; k_7 < 8u; k_7++)
+                {
+                    area[k_7] = alpha.alpha;
+                }
+                cmd_ref.offset += 8u;
+                break;
+            }
+            case 5u:
+            {
+                Alloc param_23 = cmd_alloc;
+                CmdRef param_24 = cmd_ref;
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_202);
+                uint param_25 = color.rgba_color;
+                float4 fg = unpacksRGB(param_25);
+                for (uint k_8 = 0u; k_8 < 8u; k_8++)
+                {
+                    float4 fg_k = fg * area[k_8];
+                    rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k;
+                }
+                cmd_ref.offset += 8u;
+                break;
+            }
+            case 6u:
+            {
+                Alloc param_26 = cmd_alloc;
+                CmdRef param_27 = cmd_ref;
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202);
+                float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
+                for (uint k_9 = 0u; k_9 < 8u; k_9++)
+                {
+                    uint param_28 = k_9;
+                    float2 chunk_xy = float2(chunk_offset(param_28));
+                    float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y);
+                    int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
+                    float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
+                    float3 param_29 = fg_rgba.xyz;
+                    float3 _1298 = fromsRGB(param_29);
+                    fg_rgba.x = _1298.x;
+                    fg_rgba.y = _1298.y;
+                    fg_rgba.z = _1298.z;
+                    rgba[k_9] = fg_rgba;
+                }
+                cmd_ref.offset += 20u;
+                break;
+            }
+            case 7u:
+            {
+                Alloc param_30 = cmd_alloc;
+                CmdRef param_31 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202);
+                uint2 param_32 = xy_uint;
+                CmdImage param_33 = fill_img;
+                spvUnsafeArray<float4, 8> img;
+                img = fillImage(param_32, param_33, image_atlas);
+                for (uint k_10 = 0u; k_10 < 8u; k_10++)
+                {
+                    float4 fg_k_1 = img[k_10] * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 8u:
+            {
+                for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    uint d_2 = min(clip_depth, 127u);
+                    float4 param_34 = float4(rgba[k_11]);
+                    uint _1390 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _1390;
+                    blend_alpha_stack[d_2][k_11] = fast::clamp(abs(area[k_11]), 0.0, 1.0);
+                    rgba[k_11] = float4(0.0);
+                }
+                clip_depth++;
+                cmd_ref.offset += 4u;
+                break;
+            }
+            case 9u:
+            {
+                clip_depth--;
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                {
+                    uint d_3 = min(clip_depth, 127u);
+                    uint param_35 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_35);
+                    float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12];
+                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
+                }
+                cmd_ref.offset += 4u;
+                break;
+            }
+            case 10u:
+            {
+                Alloc param_36 = cmd_alloc;
+                CmdRef param_37 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref };
+                cmd_alloc.offset = cmd_ref.offset;
+                break;
+            }
+        }
+    }
+    for (uint i_1 = 0u; i_1 < 8u; i_1++)
+    {
+        uint param_38 = i_1;
+        float3 param_39 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_39), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38))));
+    }
+}
+
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
similarity index 100%
rename from piet-gpu/shader/kernel4.spv
rename to piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil
new file mode 100644
index 0000000..9fd593c
--- /dev/null
+++ b/piet-gpu/shader/gen/path_coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl
new file mode 100644
index 0000000..6025bde
--- /dev/null
+++ b/piet-gpu/shader/gen/path_coarse.hlsl
@@ -0,0 +1,664 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct MallocResult
+{
+    Alloc alloc;
+    bool failed;
+};
+
+struct PathCubicRef
+{
+    uint offset;
+};
+
+struct PathCubic
+{
+    float2 p0;
+    float2 p1;
+    float2 p2;
+    float2 p3;
+    uint path_ix;
+    uint trans_ix;
+    float2 stroke;
+};
+
+struct PathSegRef
+{
+    uint offset;
+};
+
+struct PathSegTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct TileSegRef
+{
+    uint offset;
+};
+
+struct TileSeg
+{
+    float2 origin;
+    float2 _vector;
+    float y_edge;
+    TileSegRef next;
+};
+
+struct SubdivResult
+{
+    float val;
+    float a0;
+    float a2;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u);
+
+static const PathSegTag _721 = { 0u, 0u };
+
+RWByteAddressBuffer _136 : register(u0, space0);
+ByteAddressBuffer _710 : register(t1, space0);
+
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = _136.Load(offset * 4 + 8);
+    return v;
+}
+
+PathSegTag PathSeg_tag(Alloc a, PathSegRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1);
+    PathSegTag _367 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _367;
+}
+
+PathCubic PathCubic_read(Alloc a, PathCubicRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21);
+    Alloc param_22 = a;
+    uint param_23 = ix + 11u;
+    uint raw11 = read_mem(param_22, param_23);
+    PathCubic s;
+    s.p0 = float2(asfloat(raw0), asfloat(raw1));
+    s.p1 = float2(asfloat(raw2), asfloat(raw3));
+    s.p2 = float2(asfloat(raw4), asfloat(raw5));
+    s.p3 = float2(asfloat(raw6), asfloat(raw7));
+    s.path_ix = raw8;
+    s.trans_ix = raw9;
+    s.stroke = float2(asfloat(raw10), asfloat(raw11));
+    return s;
+}
+
+PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref)
+{
+    PathCubicRef _373 = { ref.offset + 4u };
+    Alloc param = a;
+    PathCubicRef param_1 = _373;
+    return PathCubic_read(param, param_1);
+}
+
+float2 eval_cubic(float2 p0, float2 p1, float2 p2, float2 p3, float t)
+{
+    float mt = 1.0f - t;
+    return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0f)) + (((p2 * (mt * 3.0f)) + (p3 * t)) * t)) * t);
+}
+
+float approx_parabola_integral(float x)
+{
+    return x * rsqrt(sqrt(0.3300000131130218505859375f + (0.201511204242706298828125f + ((0.25f * x) * x))));
+}
+
+SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol)
+{
+    float2 d01 = p1 - p0;
+    float2 d12 = p2 - p1;
+    float2 dd = d01 - d12;
+    float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x);
+    float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross;
+    float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross;
+    float scale = abs(_cross / (length(dd) * (x2 - x0)));
+    float param = x0;
+    float a0 = approx_parabola_integral(param);
+    float param_1 = x2;
+    float a2 = approx_parabola_integral(param_1);
+    float val = 0.0f;
+    if (scale < 1000000000.0f)
+    {
+        float da = abs(a2 - a0);
+        float sqrt_scale = sqrt(scale);
+        if (sign(x0) == sign(x2))
+        {
+            val = da * sqrt_scale;
+        }
+        else
+        {
+            float xmin = sqrt_tol / sqrt_scale;
+            float param_2 = xmin;
+            val = (sqrt_tol * da) / approx_parabola_integral(param_2);
+        }
+    }
+    SubdivResult _695 = { val, a0, a2 };
+    return _695;
+}
+
+uint fill_mode_from_flags(uint flags)
+{
+    return flags & 1u;
+}
+
+Path Path_read(Alloc a, PathRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Path s;
+    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
+    TileRef _427 = { raw2 };
+    s.tiles = _427;
+    return s;
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+float approx_parabola_inv_integral(float x)
+{
+    return x * sqrt(0.61000001430511474609375f + (0.1520999968051910400390625f + ((0.25f * x) * x)));
+}
+
+float2 eval_quad(float2 p0, float2 p1, float2 p2, float t)
+{
+    float mt = 1.0f - t;
+    return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t);
+}
+
+MallocResult malloc(uint size)
+{
+    uint _142;
+    _136.InterlockedAdd(0, size, _142);
+    uint offset = _142;
+    uint _149;
+    _136.GetDimensions(_149);
+    _149 = (_149 - 8) / 4;
+    MallocResult r;
+    r.failed = (offset + size) > uint(int(_149) * 4);
+    uint param = offset;
+    uint param_1 = size;
+    bool param_2 = !r.failed;
+    r.alloc = new_alloc(param, param_1, param_2);
+    if (r.failed)
+    {
+        uint _171;
+        _136.InterlockedMax(4, 1u, _171);
+        return r;
+    }
+    return r;
+}
+
+TileRef Tile_index(TileRef ref, uint index)
+{
+    TileRef _385 = { ref.offset + (index * 8u) };
+    return _385;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _136.Store(offset * 4 + 8, val);
+}
+
+void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = asuint(s.origin.x);
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.origin.y);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s._vector.x);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s._vector.y);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.y_edge);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.next.offset;
+    write_mem(param_15, param_16, param_17);
+}
+
+void comp_main()
+{
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathSegRef _718 = { _710.Load(28) + (element_ix * 52u) };
+    PathSegRef ref = _718;
+    PathSegTag tag = _721;
+    if (element_ix < _710.Load(4))
+    {
+        Alloc _731;
+        _731.offset = _710.Load(28);
+        Alloc param;
+        param.offset = _731.offset;
+        PathSegRef param_1 = ref;
+        tag = PathSeg_tag(param, param_1);
+    }
+    bool mem_ok = _136.Load(4) == 0u;
+    switch (tag.tag)
+    {
+        case 1u:
+        {
+            Alloc _748;
+            _748.offset = _710.Load(28);
+            Alloc param_2;
+            param_2.offset = _748.offset;
+            PathSegRef param_3 = ref;
+            PathCubic cubic = PathSeg_Cubic_read(param_2, param_3);
+            float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3;
+            float err = (err_v.x * err_v.x) + (err_v.y * err_v.y);
+            uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u);
+            n_quads = min(n_quads, 16u);
+            float val = 0.0f;
+            float2 qp0 = cubic.p0;
+            float _step = 1.0f / float(n_quads);
+            SubdivResult keep_params[16];
+            for (uint i = 0u; i < n_quads; i++)
+            {
+                float t = float(i + 1u) * _step;
+                float2 param_4 = cubic.p0;
+                float2 param_5 = cubic.p1;
+                float2 param_6 = cubic.p2;
+                float2 param_7 = cubic.p3;
+                float param_8 = t;
+                float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8);
+                float2 param_9 = cubic.p0;
+                float2 param_10 = cubic.p1;
+                float2 param_11 = cubic.p2;
+                float2 param_12 = cubic.p3;
+                float param_13 = t - (0.5f * _step);
+                float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13);
+                qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f);
+                float2 param_14 = qp0;
+                float2 param_15 = qp1;
+                float2 param_16 = qp2;
+                float param_17 = 0.4743416607379913330078125f;
+                SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17);
+                keep_params[i] = params;
+                val += params.val;
+                qp0 = qp2;
+            }
+            uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u);
+            uint param_18 = tag.flags;
+            bool is_stroke = fill_mode_from_flags(param_18) == 1u;
+            uint path_ix = cubic.path_ix;
+            PathRef _904 = { _710.Load(16) + (path_ix * 12u) };
+            Alloc _907;
+            _907.offset = _710.Load(16);
+            Alloc param_19;
+            param_19.offset = _907.offset;
+            PathRef param_20 = _904;
+            Path path = Path_read(param_19, param_20);
+            uint param_21 = path.tiles.offset;
+            uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_23 = mem_ok;
+            Alloc path_alloc = new_alloc(param_21, param_22, param_23);
+            int4 bbox = int4(path.bbox);
+            float2 p0 = cubic.p0;
+            qp0 = cubic.p0;
+            float v_step = val / float(n);
+            int n_out = 1;
+            float val_sum = 0.0f;
+            float2 p1;
+            float _1147;
+            TileSeg tile_seg;
+            for (uint i_1 = 0u; i_1 < n_quads; i_1++)
+            {
+                float t_1 = float(i_1 + 1u) * _step;
+                float2 param_24 = cubic.p0;
+                float2 param_25 = cubic.p1;
+                float2 param_26 = cubic.p2;
+                float2 param_27 = cubic.p3;
+                float param_28 = t_1;
+                float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28);
+                float2 param_29 = cubic.p0;
+                float2 param_30 = cubic.p1;
+                float2 param_31 = cubic.p2;
+                float2 param_32 = cubic.p3;
+                float param_33 = t_1 - (0.5f * _step);
+                float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33);
+                qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f);
+                SubdivResult params_1 = keep_params[i_1];
+                float param_34 = params_1.a0;
+                float u0 = approx_parabola_inv_integral(param_34);
+                float param_35 = params_1.a2;
+                float u2 = approx_parabola_inv_integral(param_35);
+                float uscale = 1.0f / (u2 - u0);
+                float target = float(n_out) * v_step;
+                for (;;)
+                {
+                    bool _1040 = uint(n_out) == n;
+                    bool _1050;
+                    if (!_1040)
+                    {
+                        _1050 = target < (val_sum + params_1.val);
+                    }
+                    else
+                    {
+                        _1050 = _1040;
+                    }
+                    if (_1050)
+                    {
+                        if (uint(n_out) == n)
+                        {
+                            p1 = cubic.p3;
+                        }
+                        else
+                        {
+                            float u = (target - val_sum) / params_1.val;
+                            float a = lerp(params_1.a0, params_1.a2, u);
+                            float param_36 = a;
+                            float au = approx_parabola_inv_integral(param_36);
+                            float t_2 = (au - u0) * uscale;
+                            float2 param_37 = qp0;
+                            float2 param_38 = qp1_1;
+                            float2 param_39 = qp2_1;
+                            float param_40 = t_2;
+                            p1 = eval_quad(param_37, param_38, param_39, param_40);
+                        }
+                        float xmin = min(p0.x, p1.x) - cubic.stroke.x;
+                        float xmax = max(p0.x, p1.x) + cubic.stroke.x;
+                        float ymin = min(p0.y, p1.y) - cubic.stroke.y;
+                        float ymax = max(p0.y, p1.y) + cubic.stroke.y;
+                        float dx = p1.x - p0.x;
+                        float dy = p1.y - p0.y;
+                        if (abs(dy) < 9.999999717180685365747194737196e-10f)
+                        {
+                            _1147 = 1000000000.0f;
+                        }
+                        else
+                        {
+                            _1147 = dx / dy;
+                        }
+                        float invslope = _1147;
+                        float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f;
+                        float b = invslope;
+                        float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f;
+                        int x0 = int(floor(xmin * 0.0625f));
+                        int x1 = int(floor(xmax * 0.0625f) + 1.0f);
+                        int y0 = int(floor(ymin * 0.0625f));
+                        int y1 = int(floor(ymax * 0.0625f) + 1.0f);
+                        x0 = clamp(x0, bbox.x, bbox.z);
+                        y0 = clamp(y0, bbox.y, bbox.w);
+                        x1 = clamp(x1, bbox.x, bbox.z);
+                        y1 = clamp(y1, bbox.y, bbox.w);
+                        float xc = a_1 + (b * float(y0));
+                        int stride = bbox.z - bbox.x;
+                        int base = ((y0 - bbox.y) * stride) - bbox.x;
+                        uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
+                        uint param_41 = n_tile_alloc * 24u;
+                        MallocResult _1263 = malloc(param_41);
+                        MallocResult tile_alloc = _1263;
+                        if (tile_alloc.failed || (!mem_ok))
+                        {
+                            return;
+                        }
+                        uint tile_offset = tile_alloc.alloc.offset;
+                        int xray = int(floor(p0.x * 0.0625f));
+                        int last_xray = int(floor(p1.x * 0.0625f));
+                        if (p0.y > p1.y)
+                        {
+                            int tmp = xray;
+                            xray = last_xray;
+                            last_xray = tmp;
+                        }
+                        for (int y = y0; y < y1; y++)
+                        {
+                            float tile_y0 = float(y * 16);
+                            int xbackdrop = max((xray + 1), bbox.x);
+                            bool _1319 = !is_stroke;
+                            bool _1329;
+                            if (_1319)
+                            {
+                                _1329 = min(p0.y, p1.y) < tile_y0;
+                            }
+                            else
+                            {
+                                _1329 = _1319;
+                            }
+                            bool _1336;
+                            if (_1329)
+                            {
+                                _1336 = xbackdrop < bbox.z;
+                            }
+                            else
+                            {
+                                _1336 = _1329;
+                            }
+                            if (_1336)
+                            {
+                                int backdrop = (p1.y < p0.y) ? 1 : (-1);
+                                TileRef param_42 = path.tiles;
+                                uint param_43 = uint(base + xbackdrop);
+                                TileRef tile_ref = Tile_index(param_42, param_43);
+                                uint tile_el = tile_ref.offset >> uint(2);
+                                Alloc param_44 = path_alloc;
+                                uint param_45 = tile_el + 1u;
+                                if (touch_mem(param_44, param_45))
+                                {
+                                    uint _1374;
+                                    _136.InterlockedAdd((tile_el + 1u) * 4 + 8, uint(backdrop), _1374);
+                                }
+                            }
+                            int next_xray = last_xray;
+                            if (y < (y1 - 1))
+                            {
+                                float tile_y1 = float((y + 1) * 16);
+                                float x_edge = lerp(p0.x, p1.x, (tile_y1 - p0.y) / dy);
+                                next_xray = int(floor(x_edge * 0.0625f));
+                            }
+                            int min_xray = min(xray, next_xray);
+                            int max_xray = max(xray, next_xray);
+                            int xx0 = min(int(floor(xc - c)), min_xray);
+                            int xx1 = max(int(ceil(xc + c)), (max_xray + 1));
+                            xx0 = clamp(xx0, x0, x1);
+                            xx1 = clamp(xx1, x0, x1);
+                            for (int x = xx0; x < xx1; x++)
+                            {
+                                float tile_x0 = float(x * 16);
+                                TileRef _1454 = { path.tiles.offset };
+                                TileRef param_46 = _1454;
+                                uint param_47 = uint(base + x);
+                                TileRef tile_ref_1 = Tile_index(param_46, param_47);
+                                uint tile_el_1 = tile_ref_1.offset >> uint(2);
+                                uint old = 0u;
+                                Alloc param_48 = path_alloc;
+                                uint param_49 = tile_el_1;
+                                if (touch_mem(param_48, param_49))
+                                {
+                                    uint _1477;
+                                    _136.InterlockedExchange(tile_el_1 * 4 + 8, tile_offset, _1477);
+                                    old = _1477;
+                                }
+                                tile_seg.origin = p0;
+                                tile_seg._vector = p1 - p0;
+                                float y_edge = 0.0f;
+                                if (!is_stroke)
+                                {
+                                    y_edge = lerp(p0.y, p1.y, (tile_x0 - p0.x) / dx);
+                                    if (min(p0.x, p1.x) < tile_x0)
+                                    {
+                                        float2 p = float2(tile_x0, y_edge);
+                                        if (p0.x > p1.x)
+                                        {
+                                            tile_seg._vector = p - p0;
+                                        }
+                                        else
+                                        {
+                                            tile_seg.origin = p;
+                                            tile_seg._vector = p1 - p;
+                                        }
+                                        if (tile_seg._vector.x == 0.0f)
+                                        {
+                                            tile_seg._vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10f;
+                                        }
+                                    }
+                                    if ((x <= min_xray) || (max_xray < x))
+                                    {
+                                        y_edge = 1000000000.0f;
+                                    }
+                                }
+                                tile_seg.y_edge = y_edge;
+                                tile_seg.next.offset = old;
+                                TileSegRef _1559 = { tile_offset };
+                                Alloc param_50 = tile_alloc.alloc;
+                                TileSegRef param_51 = _1559;
+                                TileSeg param_52 = tile_seg;
+                                TileSeg_write(param_50, param_51, param_52);
+                                tile_offset += 24u;
+                            }
+                            xc += b;
+                            base += stride;
+                            xray = next_xray;
+                        }
+                        n_out++;
+                        target += v_step;
+                        p0 = p1;
+                        continue;
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }
+                val_sum += params_1.val;
+                qp0 = qp2_1;
+            }
+            break;
+        }
+    }
+}
+
+[numthreads(32, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl
new file mode 100644
index 0000000..d263f31
--- /dev/null
+++ b/piet-gpu/shader/gen/path_coarse.msl
@@ -0,0 +1,708 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct MallocResult
+{
+    Alloc alloc;
+    bool failed;
+};
+
+struct PathCubicRef
+{
+    uint offset;
+};
+
+struct PathCubic
+{
+    float2 p0;
+    float2 p1;
+    float2 p2;
+    float2 p3;
+    uint path_ix;
+    uint trans_ix;
+    float2 stroke;
+};
+
+struct PathSegRef
+{
+    uint offset;
+};
+
+struct PathSegTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct TileSegRef
+{
+    uint offset;
+};
+
+struct TileSeg
+{
+    float2 origin;
+    float2 vector;
+    float y_edge;
+    TileSegRef next;
+};
+
+struct SubdivResult
+{
+    float val;
+    float a0;
+    float a2;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = v_136.memory[offset];
+    return v;
+}
+
+static inline __attribute__((always_inline))
+PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1, v_136, v_136BufferSize);
+    return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_136, v_136BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_136, v_136BufferSize);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_136, v_136BufferSize);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_136, v_136BufferSize);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15, v_136, v_136BufferSize);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17, v_136, v_136BufferSize);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19, v_136, v_136BufferSize);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21, v_136, v_136BufferSize);
+    Alloc param_22 = a;
+    uint param_23 = ix + 11u;
+    uint raw11 = read_mem(param_22, param_23, v_136, v_136BufferSize);
+    PathCubic s;
+    s.p0 = float2(as_type<float>(raw0), as_type<float>(raw1));
+    s.p1 = float2(as_type<float>(raw2), as_type<float>(raw3));
+    s.p2 = float2(as_type<float>(raw4), as_type<float>(raw5));
+    s.p3 = float2(as_type<float>(raw6), as_type<float>(raw7));
+    s.path_ix = raw8;
+    s.trans_ix = raw9;
+    s.stroke = float2(as_type<float>(raw10), as_type<float>(raw11));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
+{
+    Alloc param = a;
+    PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u };
+    return PathCubic_read(param, param_1, v_136, v_136BufferSize);
+}
+
+static inline __attribute__((always_inline))
+float2 eval_cubic(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float2& p3, thread const float& t)
+{
+    float mt = 1.0 - t;
+    return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0)) + (((p2 * (mt * 3.0)) + (p3 * t)) * t)) * t);
+}
+
+static inline __attribute__((always_inline))
+float approx_parabola_integral(thread const float& x)
+{
+    return x * rsqrt(sqrt(0.3300000131130218505859375 + (0.201511204242706298828125 + ((0.25 * x) * x))));
+}
+
+static inline __attribute__((always_inline))
+SubdivResult estimate_subdiv(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& sqrt_tol)
+{
+    float2 d01 = p1 - p0;
+    float2 d12 = p2 - p1;
+    float2 dd = d01 - d12;
+    float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x);
+    float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross;
+    float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross;
+    float scale = abs(_cross / (length(dd) * (x2 - x0)));
+    float param = x0;
+    float a0 = approx_parabola_integral(param);
+    float param_1 = x2;
+    float a2 = approx_parabola_integral(param_1);
+    float val = 0.0;
+    if (scale < 1000000000.0)
+    {
+        float da = abs(a2 - a0);
+        float sqrt_scale = sqrt(scale);
+        if (sign(x0) == sign(x2))
+        {
+            val = da * sqrt_scale;
+        }
+        else
+        {
+            float xmin = sqrt_tol / sqrt_scale;
+            float param_2 = xmin;
+            val = (sqrt_tol * da) / approx_parabola_integral(param_2);
+        }
+    }
+    return SubdivResult{ val, a0, a2 };
+}
+
+static inline __attribute__((always_inline))
+uint fill_mode_from_flags(thread const uint& flags)
+{
+    return flags & 1u;
+}
+
+static inline __attribute__((always_inline))
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize);
+    Path s;
+    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
+    s.tiles = TileRef{ raw2 };
+    return s;
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+static inline __attribute__((always_inline))
+float approx_parabola_inv_integral(thread const float& x)
+{
+    return x * sqrt(0.61000001430511474609375 + (0.1520999968051910400390625 + ((0.25 * x) * x)));
+}
+
+static inline __attribute__((always_inline))
+float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& t)
+{
+    float mt = 1.0 - t;
+    return (p0 * (mt * mt)) + (((p1 * (mt * 2.0)) + (p2 * t)) * t);
+}
+
+static inline __attribute__((always_inline))
+MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize)
+{
+    uint _142 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed);
+    uint offset = _142;
+    MallocResult r;
+    r.failed = (offset + size) > uint(int((v_136BufferSize - 8) / 4) * 4);
+    uint param = offset;
+    uint param_1 = size;
+    bool param_2 = !r.failed;
+    r.alloc = new_alloc(param, param_1, param_2);
+    if (r.failed)
+    {
+        uint _171 = atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed);
+        return r;
+    }
+    return r;
+}
+
+static inline __attribute__((always_inline))
+TileRef Tile_index(thread const TileRef& ref, thread const uint& index)
+{
+    return TileRef{ ref.offset + (index * 8u) };
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_136.memory[offset] = val;
+}
+
+static inline __attribute__((always_inline))
+void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = as_type<uint>(s.origin.x);
+    write_mem(param, param_1, param_2, v_136, v_136BufferSize);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.origin.y);
+    write_mem(param_3, param_4, param_5, v_136, v_136BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.vector.x);
+    write_mem(param_6, param_7, param_8, v_136, v_136BufferSize);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.vector.y);
+    write_mem(param_9, param_10, param_11, v_136, v_136BufferSize);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.y_edge);
+    write_mem(param_12, param_13, param_14, v_136, v_136BufferSize);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.next.offset;
+    write_mem(param_15, param_16, param_17, v_136, v_136BufferSize);
+}
+
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    constant uint& v_136BufferSize = spvBufferSizeConstants[0];
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathSegRef ref = PathSegRef{ _710.conf.pathseg_alloc.offset + (element_ix * 52u) };
+    PathSegTag tag = PathSegTag{ 0u, 0u };
+    if (element_ix < _710.conf.n_pathseg)
+    {
+        Alloc param;
+        param.offset = _710.conf.pathseg_alloc.offset;
+        PathSegRef param_1 = ref;
+        tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize);
+    }
+    bool mem_ok = v_136.mem_error == 0u;
+    switch (tag.tag)
+    {
+        case 1u:
+        {
+            Alloc param_2;
+            param_2.offset = _710.conf.pathseg_alloc.offset;
+            PathSegRef param_3 = ref;
+            PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize);
+            float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3;
+            float err = (err_v.x * err_v.x) + (err_v.y * err_v.y);
+            uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u);
+            n_quads = min(n_quads, 16u);
+            float val = 0.0;
+            float2 qp0 = cubic.p0;
+            float _step = 1.0 / float(n_quads);
+            spvUnsafeArray<SubdivResult, 16> keep_params;
+            for (uint i = 0u; i < n_quads; i++)
+            {
+                float t = float(i + 1u) * _step;
+                float2 param_4 = cubic.p0;
+                float2 param_5 = cubic.p1;
+                float2 param_6 = cubic.p2;
+                float2 param_7 = cubic.p3;
+                float param_8 = t;
+                float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8);
+                float2 param_9 = cubic.p0;
+                float2 param_10 = cubic.p1;
+                float2 param_11 = cubic.p2;
+                float2 param_12 = cubic.p3;
+                float param_13 = t - (0.5 * _step);
+                float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13);
+                qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5);
+                float2 param_14 = qp0;
+                float2 param_15 = qp1;
+                float2 param_16 = qp2;
+                float param_17 = 0.4743416607379913330078125;
+                SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17);
+                keep_params[i] = params;
+                val += params.val;
+                qp0 = qp2;
+            }
+            uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u);
+            uint param_18 = tag.flags;
+            bool is_stroke = fill_mode_from_flags(param_18) == 1u;
+            uint path_ix = cubic.path_ix;
+            Alloc param_19;
+            param_19.offset = _710.conf.tile_alloc.offset;
+            PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) };
+            Path path = Path_read(param_19, param_20, v_136, v_136BufferSize);
+            uint param_21 = path.tiles.offset;
+            uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_23 = mem_ok;
+            Alloc path_alloc = new_alloc(param_21, param_22, param_23);
+            int4 bbox = int4(path.bbox);
+            float2 p0 = cubic.p0;
+            qp0 = cubic.p0;
+            float v_step = val / float(n);
+            int n_out = 1;
+            float val_sum = 0.0;
+            float2 p1;
+            float _1147;
+            TileSeg tile_seg;
+            for (uint i_1 = 0u; i_1 < n_quads; i_1++)
+            {
+                float t_1 = float(i_1 + 1u) * _step;
+                float2 param_24 = cubic.p0;
+                float2 param_25 = cubic.p1;
+                float2 param_26 = cubic.p2;
+                float2 param_27 = cubic.p3;
+                float param_28 = t_1;
+                float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28);
+                float2 param_29 = cubic.p0;
+                float2 param_30 = cubic.p1;
+                float2 param_31 = cubic.p2;
+                float2 param_32 = cubic.p3;
+                float param_33 = t_1 - (0.5 * _step);
+                float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33);
+                qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5);
+                SubdivResult params_1 = keep_params[i_1];
+                float param_34 = params_1.a0;
+                float u0 = approx_parabola_inv_integral(param_34);
+                float param_35 = params_1.a2;
+                float u2 = approx_parabola_inv_integral(param_35);
+                float uscale = 1.0 / (u2 - u0);
+                float target = float(n_out) * v_step;
+                for (;;)
+                {
+                    bool _1040 = uint(n_out) == n;
+                    bool _1050;
+                    if (!_1040)
+                    {
+                        _1050 = target < (val_sum + params_1.val);
+                    }
+                    else
+                    {
+                        _1050 = _1040;
+                    }
+                    if (_1050)
+                    {
+                        if (uint(n_out) == n)
+                        {
+                            p1 = cubic.p3;
+                        }
+                        else
+                        {
+                            float u = (target - val_sum) / params_1.val;
+                            float a = mix(params_1.a0, params_1.a2, u);
+                            float param_36 = a;
+                            float au = approx_parabola_inv_integral(param_36);
+                            float t_2 = (au - u0) * uscale;
+                            float2 param_37 = qp0;
+                            float2 param_38 = qp1_1;
+                            float2 param_39 = qp2_1;
+                            float param_40 = t_2;
+                            p1 = eval_quad(param_37, param_38, param_39, param_40);
+                        }
+                        float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x;
+                        float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x;
+                        float ymin = fast::min(p0.y, p1.y) - cubic.stroke.y;
+                        float ymax = fast::max(p0.y, p1.y) + cubic.stroke.y;
+                        float dx = p1.x - p0.x;
+                        float dy = p1.y - p0.y;
+                        if (abs(dy) < 9.999999717180685365747194737196e-10)
+                        {
+                            _1147 = 1000000000.0;
+                        }
+                        else
+                        {
+                            _1147 = dx / dy;
+                        }
+                        float invslope = _1147;
+                        float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625;
+                        float b = invslope;
+                        float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625;
+                        int x0 = int(floor(xmin * 0.0625));
+                        int x1 = int(floor(xmax * 0.0625) + 1.0);
+                        int y0 = int(floor(ymin * 0.0625));
+                        int y1 = int(floor(ymax * 0.0625) + 1.0);
+                        x0 = clamp(x0, bbox.x, bbox.z);
+                        y0 = clamp(y0, bbox.y, bbox.w);
+                        x1 = clamp(x1, bbox.x, bbox.z);
+                        y1 = clamp(y1, bbox.y, bbox.w);
+                        float xc = a_1 + (b * float(y0));
+                        int stride = bbox.z - bbox.x;
+                        int base = ((y0 - bbox.y) * stride) - bbox.x;
+                        uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
+                        uint param_41 = n_tile_alloc * 24u;
+                        MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize);
+                        MallocResult tile_alloc = _1263;
+                        if (tile_alloc.failed || (!mem_ok))
+                        {
+                            return;
+                        }
+                        uint tile_offset = tile_alloc.alloc.offset;
+                        int xray = int(floor(p0.x * 0.0625));
+                        int last_xray = int(floor(p1.x * 0.0625));
+                        if (p0.y > p1.y)
+                        {
+                            int tmp = xray;
+                            xray = last_xray;
+                            last_xray = tmp;
+                        }
+                        for (int y = y0; y < y1; y++)
+                        {
+                            float tile_y0 = float(y * 16);
+                            int xbackdrop = max((xray + 1), bbox.x);
+                            bool _1319 = !is_stroke;
+                            bool _1329;
+                            if (_1319)
+                            {
+                                _1329 = fast::min(p0.y, p1.y) < tile_y0;
+                            }
+                            else
+                            {
+                                _1329 = _1319;
+                            }
+                            bool _1336;
+                            if (_1329)
+                            {
+                                _1336 = xbackdrop < bbox.z;
+                            }
+                            else
+                            {
+                                _1336 = _1329;
+                            }
+                            if (_1336)
+                            {
+                                int backdrop = (p1.y < p0.y) ? 1 : (-1);
+                                TileRef param_42 = path.tiles;
+                                uint param_43 = uint(base + xbackdrop);
+                                TileRef tile_ref = Tile_index(param_42, param_43);
+                                uint tile_el = tile_ref.offset >> uint(2);
+                                Alloc param_44 = path_alloc;
+                                uint param_45 = tile_el + 1u;
+                                if (touch_mem(param_44, param_45))
+                                {
+                                    uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed);
+                                }
+                            }
+                            int next_xray = last_xray;
+                            if (y < (y1 - 1))
+                            {
+                                float tile_y1 = float((y + 1) * 16);
+                                float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
+                                next_xray = int(floor(x_edge * 0.0625));
+                            }
+                            int min_xray = min(xray, next_xray);
+                            int max_xray = max(xray, next_xray);
+                            int xx0 = min(int(floor(xc - c)), min_xray);
+                            int xx1 = max(int(ceil(xc + c)), (max_xray + 1));
+                            xx0 = clamp(xx0, x0, x1);
+                            xx1 = clamp(xx1, x0, x1);
+                            for (int x = xx0; x < xx1; x++)
+                            {
+                                float tile_x0 = float(x * 16);
+                                TileRef param_46 = TileRef{ path.tiles.offset };
+                                uint param_47 = uint(base + x);
+                                TileRef tile_ref_1 = Tile_index(param_46, param_47);
+                                uint tile_el_1 = tile_ref_1.offset >> uint(2);
+                                uint old = 0u;
+                                Alloc param_48 = path_alloc;
+                                uint param_49 = tile_el_1;
+                                if (touch_mem(param_48, param_49))
+                                {
+                                    uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed);
+                                    old = _1477;
+                                }
+                                tile_seg.origin = p0;
+                                tile_seg.vector = p1 - p0;
+                                float y_edge = 0.0;
+                                if (!is_stroke)
+                                {
+                                    y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
+                                    if (fast::min(p0.x, p1.x) < tile_x0)
+                                    {
+                                        float2 p = float2(tile_x0, y_edge);
+                                        if (p0.x > p1.x)
+                                        {
+                                            tile_seg.vector = p - p0;
+                                        }
+                                        else
+                                        {
+                                            tile_seg.origin = p;
+                                            tile_seg.vector = p1 - p;
+                                        }
+                                        if (tile_seg.vector.x == 0.0)
+                                        {
+                                            tile_seg.vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10;
+                                        }
+                                    }
+                                    if ((x <= min_xray) || (max_xray < x))
+                                    {
+                                        y_edge = 1000000000.0;
+                                    }
+                                }
+                                tile_seg.y_edge = y_edge;
+                                tile_seg.next.offset = old;
+                                Alloc param_50 = tile_alloc.alloc;
+                                TileSegRef param_51 = TileSegRef{ tile_offset };
+                                TileSeg param_52 = tile_seg;
+                                TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize);
+                                tile_offset += 24u;
+                            }
+                            xc += b;
+                            base += stride;
+                            xray = next_xray;
+                        }
+                        n_out++;
+                        target += v_step;
+                        p0 = p1;
+                        continue;
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }
+                val_sum += params_1.val;
+                qp0 = qp2_1;
+            }
+            break;
+        }
+    }
+}
+
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv
similarity index 100%
rename from piet-gpu/shader/path_coarse.spv
rename to piet-gpu/shader/gen/path_coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl
index c7f7df0..a9cee25 100644
--- a/piet-gpu/shader/gen/pathseg.hlsl
+++ b/piet-gpu/shader/gen/pathseg.hlsl
@@ -77,10 +77,10 @@
 static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u };
 static const Monoid _567 = { 0.0f.xxxx, 0u };
 
-RWByteAddressBuffer _111 : register(u0);
-ByteAddressBuffer _574 : register(t2);
-ByteAddressBuffer _639 : register(t1);
-ByteAddressBuffer _709 : register(t3);
+RWByteAddressBuffer _111 : register(u0, space0);
+ByteAddressBuffer _574 : register(t2, space0);
+ByteAddressBuffer _639 : register(t1, space0);
+ByteAddressBuffer _709 : register(t3, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl
index dd7c611..291243e 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.hlsl
+++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl
@@ -36,10 +36,10 @@
 
 static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u);
 
-ByteAddressBuffer _139 : register(t1);
-ByteAddressBuffer _150 : register(t2);
-RWByteAddressBuffer _238 : register(u3);
-RWByteAddressBuffer _258 : register(u0);
+ByteAddressBuffer _139 : register(t1, space0);
+ByteAddressBuffer _150 : register(t2, space0);
+RWByteAddressBuffer _238 : register(u3, space0);
+RWByteAddressBuffer _258 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl
index 388f99d..f1ec389 100644
--- a/piet-gpu/shader/gen/pathtag_root.hlsl
+++ b/piet-gpu/shader/gen/pathtag_root.hlsl
@@ -11,7 +11,7 @@
 
 static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u };
 
-RWByteAddressBuffer _78 : register(u0);
+RWByteAddressBuffer _78 : register(u0, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
new file mode 100644
index 0000000..d69db16
--- /dev/null
+++ b/piet-gpu/shader/gen/tile_alloc.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl
new file mode 100644
index 0000000..010e714
--- /dev/null
+++ b/piet-gpu/shader/gen/tile_alloc.hlsl
@@ -0,0 +1,335 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct MallocResult
+{
+    Alloc alloc;
+    bool failed;
+};
+
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
+
+RWByteAddressBuffer _92 : register(u0, space0);
+ByteAddressBuffer _305 : register(t1, space0);
+
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared uint sh_tile_count[256];
+groupshared MallocResult sh_tile_alloc;
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = _92.Load(offset * 4 + 8);
+    return v;
+}
+
+AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1);
+    AnnotatedTag _236 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _236;
+}
+
+AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    AnnoEndClip s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    return s;
+}
+
+AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
+{
+    AnnoEndClipRef _243 = { ref.offset + 4u };
+    Alloc param = a;
+    AnnoEndClipRef param_1 = _243;
+    return AnnoEndClip_read(param, param_1);
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+MallocResult malloc(uint size)
+{
+    uint _98;
+    _92.InterlockedAdd(0, size, _98);
+    uint offset = _98;
+    uint _105;
+    _92.GetDimensions(_105);
+    _105 = (_105 - 8) / 4;
+    MallocResult r;
+    r.failed = (offset + size) > uint(int(_105) * 4);
+    uint param = offset;
+    uint param_1 = size;
+    bool param_2 = !r.failed;
+    r.alloc = new_alloc(param, param_1, param_2);
+    if (r.failed)
+    {
+        uint _127;
+        _92.InterlockedMax(4, 1u, _127);
+        return r;
+    }
+    return r;
+}
+
+Alloc slice_mem(Alloc a, uint offset, uint size)
+{
+    Alloc _169 = { a.offset + offset };
+    return _169;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _92.Store(offset * 4 + 8, val);
+}
+
+void Path_write(Alloc a, PathRef ref, Path s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.bbox.x | (s.bbox.y << uint(16));
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = s.bbox.z | (s.bbox.w << uint(16));
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = s.tiles.offset;
+    write_mem(param_6, param_7, param_8);
+}
+
+void comp_main()
+{
+    uint th_ix = gl_LocalInvocationID.x;
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathRef _312 = { _305.Load(16) + (element_ix * 12u) };
+    PathRef path_ref = _312;
+    AnnotatedRef _321 = { _305.Load(32) + (element_ix * 40u) };
+    AnnotatedRef ref = _321;
+    uint tag = 0u;
+    if (element_ix < _305.Load(0))
+    {
+        Alloc _332;
+        _332.offset = _305.Load(32);
+        Alloc param;
+        param.offset = _332.offset;
+        AnnotatedRef param_1 = ref;
+        tag = Annotated_tag(param, param_1).tag;
+    }
+    int x0 = 0;
+    int y0 = 0;
+    int x1 = 0;
+    int y1 = 0;
+    switch (tag)
+    {
+        case 1u:
+        case 2u:
+        case 3u:
+        case 4u:
+        case 5u:
+        {
+            Alloc _350;
+            _350.offset = _305.Load(32);
+            Alloc param_2;
+            param_2.offset = _350.offset;
+            AnnotatedRef param_3 = ref;
+            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3);
+            x0 = int(floor(clip.bbox.x * 0.0625f));
+            y0 = int(floor(clip.bbox.y * 0.0625f));
+            x1 = int(ceil(clip.bbox.z * 0.0625f));
+            y1 = int(ceil(clip.bbox.w * 0.0625f));
+            break;
+        }
+    }
+    x0 = clamp(x0, 0, int(_305.Load(8)));
+    y0 = clamp(y0, 0, int(_305.Load(12)));
+    x1 = clamp(x1, 0, int(_305.Load(8)));
+    y1 = clamp(y1, 0, int(_305.Load(12)));
+    Path path;
+    path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
+    uint tile_count = uint((x1 - x0) * (y1 - y0));
+    if (tag == 5u)
+    {
+        tile_count = 0u;
+    }
+    sh_tile_count[th_ix] = tile_count;
+    uint total_tile_count = tile_count;
+    for (uint i = 0u; i < 8u; i++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if (th_ix >= uint(1 << int(i)))
+        {
+            total_tile_count += sh_tile_count[th_ix - (1u << i)];
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_tile_count[th_ix] = total_tile_count;
+    }
+    if (th_ix == 255u)
+    {
+        uint param_4 = total_tile_count * 8u;
+        MallocResult _477 = malloc(param_4);
+        sh_tile_alloc = _477;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    MallocResult alloc_start = sh_tile_alloc;
+    bool _488;
+    if (!alloc_start.failed)
+    {
+        _488 = _92.Load(4) != 0u;
+    }
+    else
+    {
+        _488 = alloc_start.failed;
+    }
+    if (_488)
+    {
+        return;
+    }
+    if (element_ix < _305.Load(0))
+    {
+        uint _501;
+        if (th_ix > 0u)
+        {
+            _501 = sh_tile_count[th_ix - 1u];
+        }
+        else
+        {
+            _501 = 0u;
+        }
+        uint tile_subix = _501;
+        Alloc param_5 = alloc_start.alloc;
+        uint param_6 = 8u * tile_subix;
+        uint param_7 = 8u * tile_count;
+        Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
+        TileRef _523 = { tiles_alloc.offset };
+        path.tiles = _523;
+        Alloc _528;
+        _528.offset = _305.Load(16);
+        Alloc param_8;
+        param_8.offset = _528.offset;
+        PathRef param_9 = path_ref;
+        Path param_10 = path;
+        Path_write(param_8, param_9, param_10);
+    }
+    uint total_count = sh_tile_count[255] * 2u;
+    uint start_ix = alloc_start.alloc.offset >> uint(2);
+    for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
+    {
+        Alloc param_11 = alloc_start.alloc;
+        uint param_12 = start_ix + i_1;
+        uint param_13 = 0u;
+        write_mem(param_11, param_12, param_13);
+    }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl
new file mode 100644
index 0000000..3906536
--- /dev/null
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@@ -0,0 +1,336 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct MallocResult
+{
+    Alloc alloc;
+    bool failed;
+};
+
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+};
+
+struct AnnotatedRef
+{
+    uint offset;
+};
+
+struct AnnotatedTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct PathRef
+{
+    uint offset;
+};
+
+struct TileRef
+{
+    uint offset;
+};
+
+struct Path
+{
+    uint4 bbox;
+    TileRef tiles;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
+    uint n_trans;
+    uint n_path;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_92, constant uint& v_92BufferSize)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return 0u;
+    }
+    uint v = v_92.memory[offset];
+    return v;
+}
+
+static inline __attribute__((always_inline))
+AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint tag_and_flags = read_mem(param, param_1, v_92, v_92BufferSize);
+    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_92, v_92BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_92, v_92BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_92, v_92BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize);
+    AnnoEndClip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
+{
+    Alloc param = a;
+    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
+    return AnnoEndClip_read(param, param_1, v_92, v_92BufferSize);
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    Alloc a;
+    a.offset = offset;
+    return a;
+}
+
+static inline __attribute__((always_inline))
+MallocResult malloc(thread const uint& size, device Memory& v_92, constant uint& v_92BufferSize)
+{
+    uint _98 = atomic_fetch_add_explicit((device atomic_uint*)&v_92.mem_offset, size, memory_order_relaxed);
+    uint offset = _98;
+    MallocResult r;
+    r.failed = (offset + size) > uint(int((v_92BufferSize - 8) / 4) * 4);
+    uint param = offset;
+    uint param_1 = size;
+    bool param_2 = !r.failed;
+    r.alloc = new_alloc(param, param_1, param_2);
+    if (r.failed)
+    {
+        uint _127 = atomic_fetch_max_explicit((device atomic_uint*)&v_92.mem_error, 1u, memory_order_relaxed);
+        return r;
+    }
+    return r;
+}
+
+static inline __attribute__((always_inline))
+Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
+{
+    return Alloc{ a.offset + offset };
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_92, constant uint& v_92BufferSize)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_92.memory[offset] = val;
+}
+
+static inline __attribute__((always_inline))
+void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_92, constant uint& v_92BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.bbox.x | (s.bbox.y << uint(16));
+    write_mem(param, param_1, param_2, v_92, v_92BufferSize);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = s.bbox.z | (s.bbox.w << uint(16));
+    write_mem(param_3, param_4, param_5, v_92, v_92BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = s.tiles.offset;
+    write_mem(param_6, param_7, param_8, v_92, v_92BufferSize);
+}
+
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _305 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    threadgroup uint sh_tile_count[256];
+    threadgroup MallocResult sh_tile_alloc;
+    constant uint& v_92BufferSize = spvBufferSizeConstants[0];
+    uint th_ix = gl_LocalInvocationID.x;
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathRef path_ref = PathRef{ _305.conf.tile_alloc.offset + (element_ix * 12u) };
+    AnnotatedRef ref = AnnotatedRef{ _305.conf.anno_alloc.offset + (element_ix * 40u) };
+    uint tag = 0u;
+    if (element_ix < _305.conf.n_elements)
+    {
+        Alloc param;
+        param.offset = _305.conf.anno_alloc.offset;
+        AnnotatedRef param_1 = ref;
+        tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag;
+    }
+    int x0 = 0;
+    int y0 = 0;
+    int x1 = 0;
+    int y1 = 0;
+    switch (tag)
+    {
+        case 1u:
+        case 2u:
+        case 3u:
+        case 4u:
+        case 5u:
+        {
+            Alloc param_2;
+            param_2.offset = _305.conf.anno_alloc.offset;
+            AnnotatedRef param_3 = ref;
+            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize);
+            x0 = int(floor(clip.bbox.x * 0.0625));
+            y0 = int(floor(clip.bbox.y * 0.0625));
+            x1 = int(ceil(clip.bbox.z * 0.0625));
+            y1 = int(ceil(clip.bbox.w * 0.0625));
+            break;
+        }
+    }
+    x0 = clamp(x0, 0, int(_305.conf.width_in_tiles));
+    y0 = clamp(y0, 0, int(_305.conf.height_in_tiles));
+    x1 = clamp(x1, 0, int(_305.conf.width_in_tiles));
+    y1 = clamp(y1, 0, int(_305.conf.height_in_tiles));
+    Path path;
+    path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
+    uint tile_count = uint((x1 - x0) * (y1 - y0));
+    if (tag == 5u)
+    {
+        tile_count = 0u;
+    }
+    sh_tile_count[th_ix] = tile_count;
+    uint total_tile_count = tile_count;
+    for (uint i = 0u; i < 8u; i++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (th_ix >= uint(1 << int(i)))
+        {
+            total_tile_count += sh_tile_count[th_ix - (1u << i)];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_tile_count[th_ix] = total_tile_count;
+    }
+    if (th_ix == 255u)
+    {
+        uint param_4 = total_tile_count * 8u;
+        MallocResult _477 = malloc(param_4, v_92, v_92BufferSize);
+        sh_tile_alloc = _477;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    MallocResult alloc_start = sh_tile_alloc;
+    bool _488;
+    if (!alloc_start.failed)
+    {
+        _488 = v_92.mem_error != 0u;
+    }
+    else
+    {
+        _488 = alloc_start.failed;
+    }
+    if (_488)
+    {
+        return;
+    }
+    if (element_ix < _305.conf.n_elements)
+    {
+        uint _501;
+        if (th_ix > 0u)
+        {
+            _501 = sh_tile_count[th_ix - 1u];
+        }
+        else
+        {
+            _501 = 0u;
+        }
+        uint tile_subix = _501;
+        Alloc param_5 = alloc_start.alloc;
+        uint param_6 = 8u * tile_subix;
+        uint param_7 = 8u * tile_count;
+        Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
+        path.tiles = TileRef{ tiles_alloc.offset };
+        Alloc param_8;
+        param_8.offset = _305.conf.tile_alloc.offset;
+        PathRef param_9 = path_ref;
+        Path param_10 = path;
+        Path_write(param_8, param_9, param_10, v_92, v_92BufferSize);
+    }
+    uint total_count = sh_tile_count[255] * 2u;
+    uint start_ix = alloc_start.alloc.offset >> uint(2);
+    for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
+    {
+        Alloc param_11 = alloc_start.alloc;
+        uint param_12 = start_ix + i_1;
+        uint param_13 = 0u;
+        write_mem(param_11, param_12, param_13, v_92, v_92BufferSize);
+    }
+}
+
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv
similarity index 79%
rename from piet-gpu/shader/tile_alloc.spv
rename to piet-gpu/shader/gen/tile_alloc.spv
index b443b03..d4a6e31 100644
--- a/piet-gpu/shader/tile_alloc.spv
+++ b/piet-gpu/shader/gen/tile_alloc.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl
index 6fa9267..7744e0f 100644
--- a/piet-gpu/shader/gen/transform_leaf.hlsl
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@@ -51,10 +51,10 @@
 
 static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
 
-RWByteAddressBuffer _71 : register(u0);
-ByteAddressBuffer _96 : register(t2);
-ByteAddressBuffer _278 : register(t1);
-ByteAddressBuffer _377 : register(t3);
+RWByteAddressBuffer _71 : register(u0, space0);
+ByteAddressBuffer _96 : register(t2, space0);
+ByteAddressBuffer _278 : register(t1, space0);
+ByteAddressBuffer _377 : register(t3, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl
index 60addf3..5ada811 100644
--- a/piet-gpu/shader/gen/transform_reduce.hlsl
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@@ -38,10 +38,10 @@
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-ByteAddressBuffer _49 : register(t2);
-ByteAddressBuffer _161 : register(t1);
-RWByteAddressBuffer _251 : register(u3);
-RWByteAddressBuffer _267 : register(u0);
+ByteAddressBuffer _49 : register(t2, space0);
+ByteAddressBuffer _161 : register(t1, space0);
+RWByteAddressBuffer _251 : register(u3, space0);
+RWByteAddressBuffer _267 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl
index 42bbd38..35961b1 100644
--- a/piet-gpu/shader/gen/transform_root.hlsl
+++ b/piet-gpu/shader/gen/transform_root.hlsl
@@ -8,7 +8,7 @@
 
 static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
 
-RWByteAddressBuffer _89 : register(u0);
+RWByteAddressBuffer _89 : register(u0, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
diff --git a/piet-gpu/shader/kernel4_idx.spv b/piet-gpu/shader/kernel4_idx.spv
deleted file mode 100644
index 953eae1..0000000
--- a/piet-gpu/shader/kernel4_idx.spv
+++ /dev/null
Binary files differ
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
index 6340683..3761e9e 100644
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -73,7 +73,7 @@
     for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
         barrier();
         if (th_ix >= (1 << i)) {
-            total_tile_count += sh_tile_count[th_ix - (1 << i)];
+            total_tile_count += sh_tile_count[th_ix - (1u << i)];
         }
         barrier();
         sh_tile_count[th_ix] = total_tile_count;
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 25627f6..e1bde6a 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -15,7 +15,7 @@
 
 use piet_gpu_hal::{
     BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline,
-    QueryPool, Session, ShaderCode,
+    QueryPool, Session, ShaderCode, include_shader,
 };
 
 use pico_svg::PicoSvg;
@@ -161,23 +161,23 @@
             })
             .collect();
 
-        let tile_alloc_code = ShaderCode::Spv(include_bytes!("../shader/tile_alloc.spv"));
+        let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc");
         let tile_pipeline = session
             .create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::Buffer])?;
         let tile_ds = session
             .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?;
 
-        let path_alloc_code = ShaderCode::Spv(include_bytes!("../shader/path_coarse.spv"));
+        let path_alloc_code = include_shader!(session, "../shader/gen/path_coarse");
         let path_pipeline = session
             .create_compute_pipeline(path_alloc_code, &[BindType::Buffer, BindType::Buffer])?;
         let path_ds = session
             .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
-            ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv"))
+            include_shader!(session, "../shader/gen/backdrop_lg")
         } else {
             println!("using small workgroup backdrop kernel");
-            ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
+            include_shader!(session, "../shader/gen/backdrop")
         };
         let backdrop_pipeline = session
             .create_compute_pipeline(backdrop_code, &[BindType::Buffer, BindType::Buffer])?;
@@ -185,13 +185,13 @@
             .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         // TODO: constants
-        let bin_code = ShaderCode::Spv(include_bytes!("../shader/binning.spv"));
+        let bin_code = include_shader!(session, "../shader/gen/binning");
         let bin_pipeline =
             session.create_compute_pipeline(bin_code, &[BindType::Buffer, BindType::Buffer])?;
         let bin_ds =
             session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?;
 
-        let coarse_code = ShaderCode::Spv(include_bytes!("../shader/coarse.spv"));
+        let coarse_code = include_shader!(session, "../shader/gen/coarse");
         let coarse_pipeline =
             session.create_compute_pipeline(coarse_code, &[BindType::Buffer, BindType::Buffer])?;
         let coarse_ds = session
@@ -210,7 +210,7 @@
             .collect();
         let gradients = Self::make_gradient_image(&session);
 
-        let k4_code = ShaderCode::Spv(include_bytes!("../shader/kernel4.spv"));
+        let k4_code = include_shader!(session, "../shader/gen/kernel4");
         let k4_pipeline = session.create_compute_pipeline(
             k4_code,
             &[