Merge branch 'master' into blend_mem

This does the merge and also rebuilds the generated shaders.
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 1b3f252..be891c1 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -151,6 +151,11 @@
     uint part_start_ix = 0;
     uint ready_ix = 0;
 
+    cmd_ref.offset += 4;
+    // Accounting for allocation of blend memory
+    uint render_blend_depth = 0;
+    uint max_blend_depth = 0;
+
     uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
     uint drawtag_start = conf.drawtag_offset >> 2;
     uint drawdata_start = conf.drawdata_offset >> 2;
@@ -414,6 +419,8 @@
                         }
                         Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                         cmd_ref.offset += 4;
+                        render_blend_depth++;
+                        max_blend_depth = max(max_blend_depth, render_blend_depth);
                     }
                     clip_depth++;
                     break;
@@ -426,6 +433,7 @@
                     uint blend = scene[dd];
                     Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
                     cmd_ref.offset += 4 + CmdEndClip_size;
+                    render_blend_depth--;
                     break;
                 }
             } else {
@@ -451,5 +459,8 @@
     }
     if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
         Cmd_End_write(cmd_alloc, cmd_ref);
+        if (max_blend_depth > BLEND_STACK_SPLIT) {
+            // TODO: allocate blend memory and write result
+        }
     }
 }
diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
index df2be88..0fb9622 100644
--- a/piet-gpu/shader/gen/backdrop.dxil
+++ b/piet-gpu/shader/gen/backdrop.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
index 81f9b65..e24a6d3 100644
--- a/piet-gpu/shader/gen/backdrop_lg.dxil
+++ b/piet-gpu/shader/gen/backdrop_lg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
index 6b3efaf..6655b7f 100644
--- a/piet-gpu/shader/gen/bbox_clear.dxil
+++ b/piet-gpu/shader/gen/bbox_clear.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil
index b681a65..29a158e 100644
--- a/piet-gpu/shader/gen/clip_leaf.dxil
+++ b/piet-gpu/shader/gen/clip_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil
index 0ccaac9..0dff71b 100644
--- a/piet-gpu/shader/gen/clip_reduce.dxil
+++ b/piet-gpu/shader/gen/clip_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index c91fcdf..3370f59 100644
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index 0331e33..3a5c1a5 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -177,7 +177,7 @@
 
 RWByteAddressBuffer _260 : register(u0, space0);
 ByteAddressBuffer _1005 : register(t1, space0);
-ByteAddressBuffer _1372 : register(t2, space0);
+ByteAddressBuffer _1378 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -681,6 +681,9 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
     uint drawmonoid_start = _1005.Load(44) >> uint(2);
     uint drawtag_start = _1005.Load(100) >> uint(2);
     uint drawdata_start = _1005.Load(104) >> uint(2);
@@ -688,11 +691,11 @@
     bool mem_ok = _260.Load(4) == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1304;
+    uint _1310;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1605;
+    uint _1611;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -702,34 +705,34 @@
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1356;
+        bool _1362;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1160 = th_ix < 256u;
+                bool _1168;
+                if (_1160)
                 {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1168 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1162 = _1154;
+                    _1168 = _1160;
                 }
-                if (_1162)
+                if (_1168)
                 {
                     uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1179;
-                    _1179.offset = _1005.Load(20);
-                    param_3.offset = _1179.offset;
+                    Alloc _1185;
+                    _1185.offset = _1005.Load(20);
+                    param_3.offset = _1185.offset;
                     uint param_4 = in_ix;
                     count = read_mem(param_3, param_4);
-                    Alloc _1190;
-                    _1190.offset = _1005.Load(20);
-                    param_5.offset = _1190.offset;
+                    Alloc _1196;
+                    _1196.offset = _1005.Load(20);
+                    param_5.offset = _1196.offset;
                     uint param_6 = in_ix + 1u;
                     uint offset = read_mem(param_5, param_6);
                     uint param_7 = offset;
@@ -775,16 +778,16 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1310 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1304 = part_start_ix;
+                    _1310 = part_start_ix;
                 }
-                ix -= _1304;
+                ix -= _1310;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1323 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1323;
+                BinInstanceRef _1329 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1329;
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
@@ -794,16 +797,16 @@
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1352 = (wr_ix - rd_ix) < 256u;
+            if (_1352)
             {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1356 = _1346;
+                _1362 = _1352;
             }
-            if (_1356)
+            if (_1362)
             {
                 continue;
             }
@@ -816,7 +819,7 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1372.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1378.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
@@ -829,11 +832,11 @@
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
                 uint path_ix = _260.Load(drawmonoid_base * 4 + 8);
-                PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) };
-                Alloc _1400;
-                _1400.offset = _1005.Load(16);
-                param_14.offset = _1400.offset;
-                PathRef param_15 = _1397;
+                PathRef _1403 = { _1005.Load(16) + (path_ix * 12u) };
+                Alloc _1406;
+                _1406.offset = _1005.Load(16);
+                param_14.offset = _1406.offset;
+                PathRef param_15 = _1403;
                 Path path = Path_read(param_14, param_15);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
@@ -889,16 +892,16 @@
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1378.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1611 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1605 = 0u;
+                _1611 = 0u;
             }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1611;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -907,9 +910,9 @@
             {
                 uint param_21 = el_ix;
                 bool param_22 = mem_ok;
-                TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                TileRef _1663 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                 Alloc param_23 = read_tile_alloc(param_21, param_22);
-                TileRef param_24 = _1657;
+                TileRef param_24 = _1663;
                 Tile tile = Tile_read(param_23, param_24);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
@@ -918,27 +921,27 @@
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                     uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.Load(dd * 4 + 0);
+                    uint blend = _1378.Load(dd * 4 + 0);
                     is_blend = blend != 32771u;
                 }
-                bool _1693 = tile.tile.offset != 0u;
-                bool _1702;
-                if (!_1693)
+                bool _1699 = tile.tile.offset != 0u;
+                bool _1708;
+                if (!_1699)
                 {
-                    _1702 = (tile.backdrop == 0) == is_clip;
+                    _1708 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1702 = _1693;
+                    _1708 = _1699;
                 }
-                include_tile = _1702 || is_blend;
+                include_tile = _1708 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1724;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1724);
+                uint _1730;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1730);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -962,14 +965,14 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1378.Load((drawtag_start + element_ix_2) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                TileRef _1801 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1807 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1801;
+                TileRef param_28 = _1807;
                 Tile tile_1 = Tile_read(param_27, param_28);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
                 uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@@ -984,11 +987,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1849 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1855 = alloc_cmd(param_29, param_30, param_31);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1849)
+                        if (!_1855)
                         {
                             break;
                         }
@@ -998,11 +1001,11 @@
                         float param_35 = linewidth;
                         write_fill(param_32, param_33, param_34, param_35);
                         cmd_ref = param_33;
-                        uint rgba = _1372.Load(dd_1 * 4 + 0);
-                        CmdColor _1872 = { rgba };
+                        uint rgba = _1378.Load(dd_1 * 4 + 0);
+                        CmdColor _1878 = { rgba };
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1872;
+                        CmdColor param_38 = _1878;
                         Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
@@ -1012,11 +1015,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1890 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1896 = alloc_cmd(param_39, param_40, param_41);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1890)
+                        if (!_1896)
                         {
                             break;
                         }
@@ -1027,7 +1030,7 @@
                         float param_45 = linewidth;
                         write_fill(param_42, param_43, param_44, param_45);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_lin.index = _1378.Load(dd_1 * 4 + 0);
                         cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8));
                         cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8));
                         cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8));
@@ -1043,11 +1046,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1954 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1960 = alloc_cmd(param_49, param_50, param_51);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1954)
+                        if (!_1960)
                         {
                             break;
                         }
@@ -1058,7 +1061,7 @@
                         float param_55 = linewidth;
                         write_fill(param_52, param_53, param_54, param_55);
                         cmd_ref = param_53;
-                        cmd_rad.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_rad.index = _1378.Load(dd_1 * 4 + 0);
                         cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8)));
                         cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8)));
                         cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8)));
@@ -1077,11 +1080,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2060 = alloc_cmd(param_59, param_60, param_61);
+                        bool _2066 = alloc_cmd(param_59, param_60, param_61);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2060)
+                        if (!_2066)
                         {
                             break;
                         }
@@ -1091,30 +1094,30 @@
                         float param_65 = linewidth;
                         write_fill(param_62, param_63, param_64, param_65);
                         cmd_ref = param_63;
-                        uint index = _1372.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
+                        uint index = _1378.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1378.Load((dd_1 + 1u) * 4 + 0);
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2099 = { index, offset_1 };
+                        CmdImage _2105 = { index, offset_1 };
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = _2099;
+                        CmdImage param_68 = _2105;
                         Cmd_Image_write(param_66, param_67, param_68);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2113 = tile_1.tile.offset == 0u;
-                        bool _2119;
-                        if (_2113)
+                        bool _2119 = tile_1.tile.offset == 0u;
+                        bool _2125;
+                        if (_2119)
                         {
-                            _2119 = tile_1.backdrop == 0;
+                            _2125 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2119 = _2113;
+                            _2125 = _2119;
                         }
-                        if (_2119)
+                        if (_2125)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1123,11 +1126,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2131 = alloc_cmd(param_69, param_70, param_71);
+                            bool _2137 = alloc_cmd(param_69, param_70, param_71);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2131)
+                            if (!_2137)
                             {
                                 break;
                             }
@@ -1135,6 +1138,8 @@
                             CmdRef param_73 = cmd_ref;
                             Cmd_BeginClip_write(param_72, param_73);
                             cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                         }
                         clip_depth++;
                         break;
@@ -1145,11 +1150,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2159 = alloc_cmd(param_74, param_75, param_76);
+                        bool _2170 = alloc_cmd(param_74, param_75, param_76);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2159)
+                        if (!_2170)
                         {
                             break;
                         }
@@ -1159,13 +1164,14 @@
                         float param_80 = -1.0f;
                         write_fill(param_77, param_78, param_79, param_80);
                         cmd_ref = param_78;
-                        uint blend_1 = _1372.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2182 = { blend_1 };
+                        uint blend_1 = _1378.Load(dd_1 * 4 + 0);
+                        CmdEndClip _2193 = { blend_1 };
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = _2182;
+                        CmdEndClip param_83 = _2193;
                         Cmd_EndClip_write(param_81, param_82, param_83);
                         cmd_ref.offset += 8u;
+                        render_blend_depth--;
                         break;
                     }
                 }
@@ -1198,21 +1204,24 @@
             break;
         }
     }
-    bool _2229 = (bin_tile_x + tile_x) < _1005.Load(8);
-    bool _2238;
-    if (_2229)
+    bool _2242 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2251;
+    if (_2242)
     {
-        _2238 = (bin_tile_y + tile_y) < _1005.Load(12);
+        _2251 = (bin_tile_y + tile_y) < _1005.Load(12);
     }
     else
     {
-        _2238 = _2229;
+        _2251 = _2242;
     }
-    if (_2238)
+    if (_2251)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
         Cmd_End_write(param_84, param_85);
+        if (max_blend_depth > 4u)
+        {
+        }
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 854d243..94b8738 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -677,7 +677,7 @@
     write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1378 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -713,6 +713,9 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
     uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2);
     uint drawtag_start = _1005.conf.drawtag_offset >> uint(2);
     uint drawdata_start = _1005.conf.drawdata_offset >> uint(2);
@@ -720,11 +723,11 @@
     bool mem_ok = v_260.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1304;
+    uint _1310;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1605;
+    uint _1611;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -734,24 +737,24 @@
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1356;
+        bool _1362;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1160 = th_ix < 256u;
+                bool _1168;
+                if (_1160)
                 {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1168 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1162 = _1154;
+                    _1168 = _1160;
                 }
-                if (_1162)
+                if (_1168)
                 {
                     uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
                     param_3.offset = _1005.conf.bin_alloc.offset;
@@ -803,13 +806,13 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1310 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1304 = part_start_ix;
+                    _1310 = part_start_ix;
                 }
-                ix -= _1304;
+                ix -= _1310;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
@@ -821,16 +824,16 @@
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1352 = (wr_ix - rd_ix) < 256u;
+            if (_1352)
             {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1356 = _1346;
+                _1362 = _1352;
             }
-            if (_1356)
+            if (_1362)
             {
                 continue;
             }
@@ -843,7 +846,7 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1372.scene[drawtag_start + element_ix];
+            tag = _1378.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
@@ -913,16 +916,16 @@
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.scene[drawtag_start + element_ix_1];
+            uint tag_1 = _1378.scene[drawtag_start + element_ix_1];
             if (el_ix > 0u)
             {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1611 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1605 = 0u;
+                _1611 = 0u;
             }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1611;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -941,26 +944,26 @@
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                     uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.scene[dd];
+                    uint blend = _1378.scene[dd];
                     is_blend = blend != 32771u;
                 }
-                bool _1693 = tile.tile.offset != 0u;
-                bool _1702;
-                if (!_1693)
+                bool _1699 = tile.tile.offset != 0u;
+                bool _1708;
+                if (!_1699)
                 {
-                    _1702 = (tile.backdrop == 0) == is_clip;
+                    _1708 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1702 = _1693;
+                    _1708 = _1699;
                 }
-                include_tile = _1702 || is_blend;
+                include_tile = _1708 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1724 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1730 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -984,7 +987,7 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.scene[drawtag_start + element_ix_2];
+            uint drawtag = _1378.scene[drawtag_start + element_ix_2];
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
@@ -1005,11 +1008,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1849 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
+                        bool _1855 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1849)
+                        if (!_1855)
                         {
                             break;
                         }
@@ -1019,7 +1022,7 @@
                         float param_35 = linewidth;
                         write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize);
                         cmd_ref = param_33;
-                        uint rgba = _1372.scene[dd_1];
+                        uint rgba = _1378.scene[dd_1];
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
                         CmdColor param_38 = CmdColor{ rgba };
@@ -1032,11 +1035,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1890 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
+                        bool _1896 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1890)
+                        if (!_1896)
                         {
                             break;
                         }
@@ -1047,7 +1050,7 @@
                         float param_45 = linewidth;
                         write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1372.scene[dd_1];
+                        cmd_lin.index = _1378.scene[dd_1];
                         cmd_lin.line_x = as_type<float>(v_260.memory[di + 1u]);
                         cmd_lin.line_y = as_type<float>(v_260.memory[di + 2u]);
                         cmd_lin.line_c = as_type<float>(v_260.memory[di + 3u]);
@@ -1063,11 +1066,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1954 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
+                        bool _1960 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1954)
+                        if (!_1960)
                         {
                             break;
                         }
@@ -1078,7 +1081,7 @@
                         float param_55 = linewidth;
                         write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize);
                         cmd_ref = param_53;
-                        cmd_rad.index = _1372.scene[dd_1];
+                        cmd_rad.index = _1378.scene[dd_1];
                         cmd_rad.mat = as_type<float4>(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u]));
                         cmd_rad.xlat = as_type<float2>(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u]));
                         cmd_rad.c1 = as_type<float2>(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u]));
@@ -1097,11 +1100,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2060 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        bool _2066 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2060)
+                        if (!_2066)
                         {
                             break;
                         }
@@ -1111,8 +1114,8 @@
                         float param_65 = linewidth;
                         write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize);
                         cmd_ref = param_63;
-                        uint index = _1372.scene[dd_1];
-                        uint raw1 = _1372.scene[dd_1 + 1u];
+                        uint index = _1378.scene[dd_1];
+                        uint raw1 = _1378.scene[dd_1 + 1u];
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
@@ -1123,17 +1126,17 @@
                     }
                     case 5u:
                     {
-                        bool _2113 = tile_1.tile.offset == 0u;
-                        bool _2119;
-                        if (_2113)
+                        bool _2119 = tile_1.tile.offset == 0u;
+                        bool _2125;
+                        if (_2119)
                         {
-                            _2119 = tile_1.backdrop == 0;
+                            _2125 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2119 = _2113;
+                            _2125 = _2119;
                         }
-                        if (_2119)
+                        if (_2125)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1142,11 +1145,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2131 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            bool _2137 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2131)
+                            if (!_2137)
                             {
                                 break;
                             }
@@ -1154,6 +1157,8 @@
                             CmdRef param_73 = cmd_ref;
                             Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize);
                             cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                         }
                         clip_depth++;
                         break;
@@ -1164,11 +1169,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2159 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        bool _2170 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2159)
+                        if (!_2170)
                         {
                             break;
                         }
@@ -1178,12 +1183,13 @@
                         float param_80 = -1.0;
                         write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize);
                         cmd_ref = param_78;
-                        uint blend_1 = _1372.scene[dd_1];
+                        uint blend_1 = _1378.scene[dd_1];
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
                         CmdEndClip param_83 = CmdEndClip{ blend_1 };
                         Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize);
                         cmd_ref.offset += 8u;
+                        render_blend_depth--;
                         break;
                     }
                 }
@@ -1216,21 +1222,24 @@
             break;
         }
     }
-    bool _2229 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
-    bool _2238;
-    if (_2229)
+    bool _2242 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2251;
+    if (_2242)
     {
-        _2238 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
+        _2251 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
     }
     else
     {
-        _2238 = _2229;
+        _2251 = _2242;
     }
-    if (_2238)
+    if (_2251)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
         Cmd_End_write(param_84, param_85, v_260, v_260BufferSize);
+        if (max_blend_depth > 4u)
+        {
+        }
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index 56a87e5..bcac844 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index 7399fe4..c4446c5 100644
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 4839db2..408dcfc 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -999,6 +999,8 @@
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef _1705 = { cmd_alloc.offset };
     CmdRef cmd_ref = _1705;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1011,7 +1013,9 @@
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1032,8 +1036,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1800 = { stroke.tile_ref };
-                tile_seg_ref = _1800;
+                TileSegRef _1810 = { stroke.tile_ref };
+                tile_seg_ref = _1810;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1069,8 +1073,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1920 = { fill.tile_ref };
-                tile_seg_ref = _1920;
+                TileSegRef _1930 = { fill.tile_ref };
+                tile_seg_ref = _1930;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1159,10 +1163,10 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2254 = fromsRGB(param_29);
-                    fg_rgba.x = _2254.x;
-                    fg_rgba.y = _2254.y;
-                    fg_rgba.z = _2254.z;
+                    float3 _2264 = fromsRGB(param_29);
+                    fg_rgba.x = _2264.x;
+                    fg_rgba.y = _2264.y;
+                    fg_rgba.z = _2264.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1185,10 +1189,10 @@
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2364 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2364.x;
-                    fg_rgba_1.y = _2364.y;
-                    fg_rgba_1.z = _2364.z;
+                    float3 _2374 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2374.x;
+                    fg_rgba_1.y = _2374.y;
+                    fg_rgba_1.z = _2374.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1202,9 +1206,9 @@
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2407[8];
-                fillImage(_2407, param_36, param_37);
-                float4 img[8] = _2407;
+                float4 _2417[8];
+                fillImage(_2417, param_36, param_37);
+                float4 img[8] = _2417;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1215,13 +1219,26 @@
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2470 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2470;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2479 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2479;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2522 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2522);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1229,32 +1246,41 @@
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2548;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2621;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1262,9 +1288,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w);
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 4caeaf0..c12e307 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -1056,6 +1056,8 @@
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     spvUnsafeArray<float4, 8> rgba;
@@ -1068,7 +1070,9 @@
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1214,10 +1218,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2254 = fromsRGB(param_29);
-                    fg_rgba.x = _2254.x;
-                    fg_rgba.y = _2254.y;
-                    fg_rgba.z = _2254.z;
+                    float3 _2264 = fromsRGB(param_29);
+                    fg_rgba.x = _2264.x;
+                    fg_rgba.y = _2264.y;
+                    fg_rgba.z = _2264.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1240,10 +1244,10 @@
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2364 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2364.x;
-                    fg_rgba_1.y = _2364.y;
-                    fg_rgba_1.z = _2364.z;
+                    float3 _2374 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2374.x;
+                    fg_rgba_1.y = _2374.y;
+                    fg_rgba_1.z = _2374.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1269,13 +1273,26 @@
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2470 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2470;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2479 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2479;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2522 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2522;
+                        rgba[k_13] = float4(0.0);
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1283,31 +1300,40 @@
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1315,9 +1341,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index f0e2963..28d4cae 100644
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index 7b7c19f..c91d35d 100644
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index 5d9b88d..58d3f5a 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -999,6 +999,8 @@
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef _1705 = { cmd_alloc.offset };
     CmdRef cmd_ref = _1705;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1011,7 +1013,9 @@
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1032,8 +1036,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1800 = { stroke.tile_ref };
-                tile_seg_ref = _1800;
+                TileSegRef _1810 = { stroke.tile_ref };
+                tile_seg_ref = _1810;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1069,8 +1073,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1920 = { fill.tile_ref };
-                tile_seg_ref = _1920;
+                TileSegRef _1930 = { fill.tile_ref };
+                tile_seg_ref = _1930;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1159,10 +1163,10 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2254 = fromsRGB(param_29);
-                    fg_rgba.x = _2254.x;
-                    fg_rgba.y = _2254.y;
-                    fg_rgba.z = _2254.z;
+                    float3 _2264 = fromsRGB(param_29);
+                    fg_rgba.x = _2264.x;
+                    fg_rgba.y = _2264.y;
+                    fg_rgba.z = _2264.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1185,10 +1189,10 @@
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2364 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2364.x;
-                    fg_rgba_1.y = _2364.y;
-                    fg_rgba_1.z = _2364.z;
+                    float3 _2374 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2374.x;
+                    fg_rgba_1.y = _2374.y;
+                    fg_rgba_1.z = _2374.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1202,9 +1206,9 @@
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2407[8];
-                fillImage(_2407, param_36, param_37);
-                float4 img[8] = _2407;
+                float4 _2417[8];
+                fillImage(_2417, param_36, param_37);
+                float4 img[8] = _2417;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1215,13 +1219,26 @@
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2470 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2470;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2479 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2479;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2522 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2522);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1229,32 +1246,41 @@
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2548;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2621;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1262,8 +1288,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x;
+        uint param_48 = i_1;
+        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 8c608c3..04f3d69 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -1056,6 +1056,8 @@
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     spvUnsafeArray<float4, 8> rgba;
@@ -1068,7 +1070,9 @@
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1214,10 +1218,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2254 = fromsRGB(param_29);
-                    fg_rgba.x = _2254.x;
-                    fg_rgba.y = _2254.y;
-                    fg_rgba.z = _2254.z;
+                    float3 _2264 = fromsRGB(param_29);
+                    fg_rgba.x = _2264.x;
+                    fg_rgba.y = _2264.y;
+                    fg_rgba.z = _2264.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1240,10 +1244,10 @@
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2364 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2364.x;
-                    fg_rgba_1.y = _2364.y;
-                    fg_rgba_1.z = _2364.z;
+                    float3 _2374 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2374.x;
+                    fg_rgba_1.y = _2374.y;
+                    fg_rgba_1.z = _2374.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1269,13 +1273,26 @@
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2470 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2470;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2479 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2479;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2522 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2522;
+                        rgba[k_13] = float4(0.0);
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1283,31 +1300,40 @@
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1315,8 +1341,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 6ff1791..93ce73e 100644
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil
index b6c9398..9fd593c 100644
--- a/piet-gpu/shader/gen/path_coarse.dxil
+++ b/piet-gpu/shader/gen/path_coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 7ce4684..6130712 100644
--- a/piet-gpu/shader/gen/pathseg.dxil
+++ b/piet-gpu/shader/gen/pathseg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index ff544b8..4c2bd23 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.dxil
+++ b/piet-gpu/shader/gen/pathtag_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil
index 48584bd..77f12e6 100644
--- a/piet-gpu/shader/gen/pathtag_root.dxil
+++ b/piet-gpu/shader/gen/pathtag_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index 0c1e376..f9f31e6 100644
--- a/piet-gpu/shader/gen/transform_leaf.dxil
+++ b/piet-gpu/shader/gen/transform_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index fc3a311..978dd98 100644
--- a/piet-gpu/shader/gen/transform_reduce.dxil
+++ b/piet-gpu/shader/gen/transform_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil
index a33ff7f..5b4f059 100644
--- a/piet-gpu/shader/gen/transform_root.dxil
+++ b/piet-gpu/shader/gen/transform_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 99fd22e..2058acc 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -100,11 +100,14 @@
     Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
 
+    uint blend_offset = memory[cmd_ref.offset >> 2];
+    cmd_ref.offset += 4;
+
     uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x,
                           gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
     vec2 xy = vec2(xy_uint);
     mediump vec4 rgba[CHUNK];
-    uint blend_stack[MAX_BLEND_STACK][CHUNK];
+    uint blend_stack[BLEND_STACK_SPLIT][CHUNK];
     for (uint i = 0; i < CHUNK; i++) {
         rgba[i] = vec4(0.0);
     }
@@ -236,24 +239,38 @@
             cmd_ref.offset += 4 + CmdImage_size;
             break;
         case Cmd_BeginClip:
-            for (uint k = 0; k < CHUNK; k++) {
-                // We reject any inputs that might overflow in render_ctx.rs.
-                // The following is a sanity check so we don't corrupt memory should there be malformed inputs.
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                blend_stack[d][k] = packsRGB(vec4(rgba[k]));
-                rgba[k] = vec4(0.0);
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
+            } else {
+                uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+                for (uint k = 0; k < CHUNK; k++) {
+                    memory[base_ix + k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
             }
             clip_depth++;
             cmd_ref.offset += 4;
             break;
         case Cmd_EndClip:
             CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
-            uint blend_mode = uint(end_clip.blend >> 8);
-            uint comp_mode = uint(end_clip.blend & 0xFF);
             clip_depth--;
+            uint base_ix;
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+            }
             for (uint k = 0; k < CHUNK; k++) {
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
+                uint bg_rgba;
+                if (clip_depth < BLEND_STACK_SPLIT) {
+                    bg_rgba = blend_stack[clip_depth][k];
+                } else {
+                    bg_rgba = memory[base_ix + k];
+                }
+                mediump vec4 bg = unpacksRGB(bg_rgba);
                 mediump vec4 fg = rgba[k] * area[k];
                 rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
             }
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index ec17188..21206e5 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -27,6 +27,10 @@
 
 #define GRADIENT_WIDTH 512
 
+// We allocate this many blend stack entries in registers, and spill
+// to memory for the overflow.
+#define BLEND_STACK_SPLIT 4
+
 #ifdef ERR_MALLOC_FAILED
 struct Config {
     uint n_elements; // paths
@@ -91,7 +95,7 @@
 #define MODE_STROKE 1
 
 // Size of kernel4 clip state, in words.
-#define CLIP_STATE_SIZE 2
+#define CLIP_STATE_SIZE 1
 
 // fill_mode_from_flags extracts the fill mode from tag flags.
 uint fill_mode_from_flags(uint flags) {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index c75f41f..1ebb5cf 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -34,8 +34,6 @@
 
 const PTCL_INITIAL_ALLOC: usize = 1024;
 
-const MAX_BLEND_STACK: usize = 128;
-
 #[allow(unused)]
 fn dump_scene(buf: &[u8]) {
     for i in 0..(buf.len() / 4) {
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 14f2561..5d4ffd3 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -5,7 +5,6 @@
 
 use crate::encoder::GlyphEncoder;
 use crate::stages::{Config, Transform};
-use crate::MAX_BLEND_STACK;
 use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
 use piet::{
     Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
@@ -233,9 +232,6 @@
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
         self.new_encoder.begin_clip(None);
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
         self.clip_stack.push(ClipElement { blend: None });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
@@ -337,9 +333,6 @@
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
         self.new_encoder.begin_clip(Some(blend));
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
         self.clip_stack.push(ClipElement { blend: Some(blend) });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;