First cut at split blend stack

Split the blend stack into register and memory segments. Do blending in registers up to that size, then spill to memory if needed.

This version may regress performance on Pixel 4, as it uses common memory for the blend stack, rather than keeping that memory read-only in fine rasterization, and using a separate buffer for blend stack. This needs investigation. It's possible we'll want to have single common memory as a config option, as it pools allocations and decreases the probability of failure.

Also a flaw in this version: there is no checking of memory overflow.

For understanding code history: this commit largely reverts #77, but there were some intervening changes to blending, and this commit also implements the split so some of the stack is in registers.

Closes #156
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 3abb2e0..c93d002 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -151,6 +151,11 @@
     uint part_start_ix = 0;
     uint ready_ix = 0;
 
+    cmd_ref.offset += 4;
+    // Accounting for allocation of blend memory
+    uint render_blend_depth = 0;
+    uint max_blend_depth = 0;
+
     uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
     uint drawtag_start = conf.drawtag_offset >> 2;
     uint drawdata_start = conf.drawdata_offset >> 2;
@@ -414,6 +419,8 @@
                         }
                         Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                         cmd_ref.offset += 4;
+                        render_blend_depth++;
+                        max_blend_depth = max(max_blend_depth, render_blend_depth);
                     }
                     clip_depth++;
                     break;
@@ -426,6 +433,7 @@
                     uint blend = scene[dd];
                     Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
                     cmd_ref.offset += 4 + CmdEndClip_size;
+                    render_blend_depth--;
                     break;
                 }
             } else {
@@ -451,5 +459,8 @@
     }
     if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
         Cmd_End_write(cmd_alloc, cmd_ref);
+        if (max_blend_depth > BLEND_STACK_SPLIT) {
+            // TODO: allocate blend memory and write result
+        }
     }
 }
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index 910925d..cbebec0 100644
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index 04529bb..0519a63 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -177,7 +177,7 @@
 
 RWByteAddressBuffer _260 : register(u0, space0);
 ByteAddressBuffer _1005 : register(t1, space0);
-ByteAddressBuffer _1372 : register(t2, space0);
+ByteAddressBuffer _1378 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -681,6 +681,9 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
     uint drawmonoid_start = _1005.Load(44) >> uint(2);
     uint drawtag_start = _1005.Load(100) >> uint(2);
     uint drawdata_start = _1005.Load(104) >> uint(2);
@@ -688,11 +691,11 @@
     bool mem_ok = _260.Load(4) == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1304;
+    uint _1310;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1605;
+    uint _1611;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -702,34 +705,34 @@
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1356;
+        bool _1362;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1160 = th_ix < 256u;
+                bool _1168;
+                if (_1160)
                 {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1168 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1162 = _1154;
+                    _1168 = _1160;
                 }
-                if (_1162)
+                if (_1168)
                 {
                     uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1179;
-                    _1179.offset = _1005.Load(20);
-                    param_3.offset = _1179.offset;
+                    Alloc _1185;
+                    _1185.offset = _1005.Load(20);
+                    param_3.offset = _1185.offset;
                     uint param_4 = in_ix;
                     count = read_mem(param_3, param_4);
-                    Alloc _1190;
-                    _1190.offset = _1005.Load(20);
-                    param_5.offset = _1190.offset;
+                    Alloc _1196;
+                    _1196.offset = _1005.Load(20);
+                    param_5.offset = _1196.offset;
                     uint param_6 = in_ix + 1u;
                     uint offset = read_mem(param_5, param_6);
                     uint param_7 = offset;
@@ -775,16 +778,16 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1310 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1304 = part_start_ix;
+                    _1310 = part_start_ix;
                 }
-                ix -= _1304;
+                ix -= _1310;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1323 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1323;
+                BinInstanceRef _1329 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1329;
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
@@ -794,16 +797,16 @@
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1352 = (wr_ix - rd_ix) < 256u;
+            if (_1352)
             {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1356 = _1346;
+                _1362 = _1352;
             }
-            if (_1356)
+            if (_1362)
             {
                 continue;
             }
@@ -816,7 +819,7 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1372.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1378.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
@@ -829,11 +832,11 @@
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
                 uint path_ix = _260.Load(drawmonoid_base * 4 + 8);
-                PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) };
-                Alloc _1400;
-                _1400.offset = _1005.Load(16);
-                param_14.offset = _1400.offset;
-                PathRef param_15 = _1397;
+                PathRef _1403 = { _1005.Load(16) + (path_ix * 12u) };
+                Alloc _1406;
+                _1406.offset = _1005.Load(16);
+                param_14.offset = _1406.offset;
+                PathRef param_15 = _1403;
                 Path path = Path_read(param_14, param_15);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
@@ -889,16 +892,16 @@
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1378.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1611 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1605 = 0u;
+                _1611 = 0u;
             }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1611;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -907,9 +910,9 @@
             {
                 uint param_21 = el_ix;
                 bool param_22 = mem_ok;
-                TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                TileRef _1663 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                 Alloc param_23 = read_tile_alloc(param_21, param_22);
-                TileRef param_24 = _1657;
+                TileRef param_24 = _1663;
                 Tile tile = Tile_read(param_23, param_24);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
@@ -918,27 +921,27 @@
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                     uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.Load(dd * 4 + 0);
+                    uint blend = _1378.Load(dd * 4 + 0);
                     is_blend = blend != 3u;
                 }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1698 = tile.tile.offset != 0u;
+                bool _1707;
+                if (!_1698)
                 {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1707 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1701 = _1692;
+                    _1707 = _1698;
                 }
-                include_tile = _1701 || is_blend;
+                include_tile = _1707 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1723;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
+                uint _1729;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1729);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -962,14 +965,14 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1378.Load((drawtag_start + element_ix_2) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1806 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1800;
+                TileRef param_28 = _1806;
                 Tile tile_1 = Tile_read(param_27, param_28);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
                 uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@@ -984,11 +987,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1854 = alloc_cmd(param_29, param_30, param_31);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1854)
                         {
                             break;
                         }
@@ -998,11 +1001,11 @@
                         float param_35 = linewidth;
                         write_fill(param_32, param_33, param_34, param_35);
                         cmd_ref = param_33;
-                        uint rgba = _1372.Load(dd_1 * 4 + 0);
-                        CmdColor _1871 = { rgba };
+                        uint rgba = _1378.Load(dd_1 * 4 + 0);
+                        CmdColor _1877 = { rgba };
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1871;
+                        CmdColor param_38 = _1877;
                         Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
@@ -1012,11 +1015,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1895 = alloc_cmd(param_39, param_40, param_41);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1895)
                         {
                             break;
                         }
@@ -1027,7 +1030,7 @@
                         float param_45 = linewidth;
                         write_fill(param_42, param_43, param_44, param_45);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_lin.index = _1378.Load(dd_1 * 4 + 0);
                         cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8));
                         cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8));
                         cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8));
@@ -1043,11 +1046,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1959 = alloc_cmd(param_49, param_50, param_51);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1959)
                         {
                             break;
                         }
@@ -1058,7 +1061,7 @@
                         float param_55 = linewidth;
                         write_fill(param_52, param_53, param_54, param_55);
                         cmd_ref = param_53;
-                        cmd_rad.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_rad.index = _1378.Load(dd_1 * 4 + 0);
                         cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8)));
                         cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8)));
                         cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8)));
@@ -1077,11 +1080,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61);
+                        bool _2065 = alloc_cmd(param_59, param_60, param_61);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2065)
                         {
                             break;
                         }
@@ -1091,30 +1094,30 @@
                         float param_65 = linewidth;
                         write_fill(param_62, param_63, param_64, param_65);
                         cmd_ref = param_63;
-                        uint index = _1372.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
+                        uint index = _1378.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1378.Load((dd_1 + 1u) * 4 + 0);
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2098 = { index, offset_1 };
+                        CmdImage _2104 = { index, offset_1 };
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = _2098;
+                        CmdImage param_68 = _2104;
                         Cmd_Image_write(param_66, param_67, param_68);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2118 = tile_1.tile.offset == 0u;
+                        bool _2124;
+                        if (_2118)
                         {
-                            _2118 = tile_1.backdrop == 0;
+                            _2124 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2118 = _2112;
+                            _2124 = _2118;
                         }
-                        if (_2118)
+                        if (_2124)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1123,11 +1126,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71);
+                            bool _2136 = alloc_cmd(param_69, param_70, param_71);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2136)
                             {
                                 break;
                             }
@@ -1135,6 +1138,8 @@
                             CmdRef param_73 = cmd_ref;
                             Cmd_BeginClip_write(param_72, param_73);
                             cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                         }
                         clip_depth++;
                         break;
@@ -1145,11 +1150,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76);
+                        bool _2169 = alloc_cmd(param_74, param_75, param_76);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2169)
                         {
                             break;
                         }
@@ -1159,13 +1164,14 @@
                         float param_80 = -1.0f;
                         write_fill(param_77, param_78, param_79, param_80);
                         cmd_ref = param_78;
-                        uint blend_1 = _1372.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2181 = { blend_1 };
+                        uint blend_1 = _1378.Load(dd_1 * 4 + 0);
+                        CmdEndClip _2192 = { blend_1 };
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = _2181;
+                        CmdEndClip param_83 = _2192;
                         Cmd_EndClip_write(param_81, param_82, param_83);
                         cmd_ref.offset += 8u;
+                        render_blend_depth--;
                         break;
                     }
                 }
@@ -1198,21 +1204,24 @@
             break;
         }
     }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
-    bool _2237;
-    if (_2228)
+    bool _2241 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2250;
+    if (_2241)
     {
-        _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
+        _2250 = (bin_tile_y + tile_y) < _1005.Load(12);
     }
     else
     {
-        _2237 = _2228;
+        _2250 = _2241;
     }
-    if (_2237)
+    if (_2250)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
         Cmd_End_write(param_84, param_85);
+        if (max_blend_depth > 4u)
+        {
+        }
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 55812d4..578fa37 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -677,7 +677,7 @@
     write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1378 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -713,6 +713,9 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
     uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2);
     uint drawtag_start = _1005.conf.drawtag_offset >> uint(2);
     uint drawdata_start = _1005.conf.drawdata_offset >> uint(2);
@@ -720,11 +723,11 @@
     bool mem_ok = v_260.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1304;
+    uint _1310;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1605;
+    uint _1611;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -734,24 +737,24 @@
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1356;
+        bool _1362;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1160 = th_ix < 256u;
+                bool _1168;
+                if (_1160)
                 {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1168 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1162 = _1154;
+                    _1168 = _1160;
                 }
-                if (_1162)
+                if (_1168)
                 {
                     uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
                     param_3.offset = _1005.conf.bin_alloc.offset;
@@ -803,13 +806,13 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1310 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1304 = part_start_ix;
+                    _1310 = part_start_ix;
                 }
-                ix -= _1304;
+                ix -= _1310;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
@@ -821,16 +824,16 @@
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1352 = (wr_ix - rd_ix) < 256u;
+            if (_1352)
             {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1356 = _1346;
+                _1362 = _1352;
             }
-            if (_1356)
+            if (_1362)
             {
                 continue;
             }
@@ -843,7 +846,7 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1372.scene[drawtag_start + element_ix];
+            tag = _1378.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
@@ -913,16 +916,16 @@
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.scene[drawtag_start + element_ix_1];
+            uint tag_1 = _1378.scene[drawtag_start + element_ix_1];
             if (el_ix > 0u)
             {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1611 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1605 = 0u;
+                _1611 = 0u;
             }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1611;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -941,26 +944,26 @@
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                     uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.scene[dd];
+                    uint blend = _1378.scene[dd];
                     is_blend = blend != 3u;
                 }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1698 = tile.tile.offset != 0u;
+                bool _1707;
+                if (!_1698)
                 {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1707 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1701 = _1692;
+                    _1707 = _1698;
                 }
-                include_tile = _1701 || is_blend;
+                include_tile = _1707 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1729 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -984,7 +987,7 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.scene[drawtag_start + element_ix_2];
+            uint drawtag = _1378.scene[drawtag_start + element_ix_2];
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
@@ -1005,11 +1008,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
+                        bool _1854 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1854)
                         {
                             break;
                         }
@@ -1019,7 +1022,7 @@
                         float param_35 = linewidth;
                         write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize);
                         cmd_ref = param_33;
-                        uint rgba = _1372.scene[dd_1];
+                        uint rgba = _1378.scene[dd_1];
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
                         CmdColor param_38 = CmdColor{ rgba };
@@ -1032,11 +1035,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
+                        bool _1895 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1895)
                         {
                             break;
                         }
@@ -1047,7 +1050,7 @@
                         float param_45 = linewidth;
                         write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1372.scene[dd_1];
+                        cmd_lin.index = _1378.scene[dd_1];
                         cmd_lin.line_x = as_type<float>(v_260.memory[di + 1u]);
                         cmd_lin.line_y = as_type<float>(v_260.memory[di + 2u]);
                         cmd_lin.line_c = as_type<float>(v_260.memory[di + 3u]);
@@ -1063,11 +1066,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
+                        bool _1959 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1959)
                         {
                             break;
                         }
@@ -1078,7 +1081,7 @@
                         float param_55 = linewidth;
                         write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize);
                         cmd_ref = param_53;
-                        cmd_rad.index = _1372.scene[dd_1];
+                        cmd_rad.index = _1378.scene[dd_1];
                         cmd_rad.mat = as_type<float4>(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u]));
                         cmd_rad.xlat = as_type<float2>(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u]));
                         cmd_rad.c1 = as_type<float2>(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u]));
@@ -1097,11 +1100,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        bool _2065 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2065)
                         {
                             break;
                         }
@@ -1111,8 +1114,8 @@
                         float param_65 = linewidth;
                         write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize);
                         cmd_ref = param_63;
-                        uint index = _1372.scene[dd_1];
-                        uint raw1 = _1372.scene[dd_1 + 1u];
+                        uint index = _1378.scene[dd_1];
+                        uint raw1 = _1378.scene[dd_1 + 1u];
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
@@ -1123,17 +1126,17 @@
                     }
                     case 5u:
                     {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2118 = tile_1.tile.offset == 0u;
+                        bool _2124;
+                        if (_2118)
                         {
-                            _2118 = tile_1.backdrop == 0;
+                            _2124 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2118 = _2112;
+                            _2124 = _2118;
                         }
-                        if (_2118)
+                        if (_2124)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1142,11 +1145,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            bool _2136 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2136)
                             {
                                 break;
                             }
@@ -1154,6 +1157,8 @@
                             CmdRef param_73 = cmd_ref;
                             Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize);
                             cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                         }
                         clip_depth++;
                         break;
@@ -1164,11 +1169,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        bool _2169 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2169)
                         {
                             break;
                         }
@@ -1178,12 +1183,13 @@
                         float param_80 = -1.0;
                         write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize);
                         cmd_ref = param_78;
-                        uint blend_1 = _1372.scene[dd_1];
+                        uint blend_1 = _1378.scene[dd_1];
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
                         CmdEndClip param_83 = CmdEndClip{ blend_1 };
                         Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize);
                         cmd_ref.offset += 8u;
+                        render_blend_depth--;
                         break;
                     }
                 }
@@ -1216,21 +1222,24 @@
             break;
         }
     }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
-    bool _2237;
-    if (_2228)
+    bool _2241 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2250;
+    if (_2241)
     {
-        _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
+        _2250 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
     }
     else
     {
-        _2237 = _2228;
+        _2250 = _2241;
     }
-    if (_2237)
+    if (_2250)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
         Cmd_End_write(param_84, param_85, v_260, v_260BufferSize);
+        if (max_blend_depth > 4u)
+        {
+        }
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index 6d33ee7..718acca 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index da6c563..5617c51 100644
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 5d6f839..30779b7 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -162,7 +162,7 @@
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
 RWByteAddressBuffer _297 : register(u0, space0);
-ByteAddressBuffer _1749 : register(t1, space0);
+ByteAddressBuffer _1725 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@@ -477,10 +477,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -919,12 +919,6 @@
         }
         case 13u:
         {
-            float rev_as = 1.0f - as;
-            float rev_ab = 1.0f - ab;
-            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
-        {
             return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
         default:
@@ -992,16 +986,18 @@
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1764;
-    _1764.offset = _1749.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1725.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1740;
+    _1740.offset = _1725.Load(24);
     Alloc param;
-    param.offset = _1764.offset;
+    param.offset = _1740.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1773 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1773;
+    CmdRef _1749 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1749;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1014,7 +1010,9 @@
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1035,8 +1033,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1867 = { stroke.tile_ref };
-                tile_seg_ref = _1867;
+                TileSegRef _1854 = { stroke.tile_ref };
+                tile_seg_ref = _1854;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1072,8 +1070,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1987 = { fill.tile_ref };
-                tile_seg_ref = _1987;
+                TileSegRef _1974 = { fill.tile_ref };
+                tile_seg_ref = _1974;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1162,10 +1160,10 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1188,10 +1186,10 @@
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1205,9 +1203,9 @@
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2474[8];
-                fillImage(_2474, param_36, param_37);
-                float4 img[8] = _2474;
+                float4 _2461[8];
+                fillImage(_2461, param_36, param_37);
+                float4 img[8] = _2461;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1218,13 +1216,26 @@
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2566);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1232,32 +1243,41 @@
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2615;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2665 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2665;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1265,9 +1285,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w);
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 796043b..6325914 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -528,10 +528,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -986,12 +986,6 @@
         }
         case 13u:
         {
-            float rev_as = 1.0 - as;
-            float rev_ab = 1.0 - ab;
-            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
-        {
             return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
         default:
@@ -1059,15 +1053,17 @@
     return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1725 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1725.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1749.conf.ptcl_alloc.offset;
+    param.offset = _1725.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     spvUnsafeArray<float4, 8> rgba;
@@ -1080,7 +1076,9 @@
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1226,10 +1224,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1252,10 +1250,10 @@
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1281,13 +1279,26 @@
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2566;
+                        rgba[k_13] = float4(0.0);
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1295,31 +1306,40 @@
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1327,9 +1347,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index b145245..978e0a2 100644
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index abe1d22..37fe62c 100644
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index f402268..5bd7b3b 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -162,7 +162,7 @@
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
 RWByteAddressBuffer _297 : register(u0, space0);
-ByteAddressBuffer _1749 : register(t1, space0);
+ByteAddressBuffer _1725 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@@ -477,10 +477,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -919,12 +919,6 @@
         }
         case 13u:
         {
-            float rev_as = 1.0f - as;
-            float rev_ab = 1.0f - ab;
-            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
-        {
             return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
         default:
@@ -992,16 +986,18 @@
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1764;
-    _1764.offset = _1749.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1725.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1740;
+    _1740.offset = _1725.Load(24);
     Alloc param;
-    param.offset = _1764.offset;
+    param.offset = _1740.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1773 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1773;
+    CmdRef _1749 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1749;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1014,7 +1010,9 @@
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1035,8 +1033,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1867 = { stroke.tile_ref };
-                tile_seg_ref = _1867;
+                TileSegRef _1854 = { stroke.tile_ref };
+                tile_seg_ref = _1854;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1072,8 +1070,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1987 = { fill.tile_ref };
-                tile_seg_ref = _1987;
+                TileSegRef _1974 = { fill.tile_ref };
+                tile_seg_ref = _1974;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1162,10 +1160,10 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1188,10 +1186,10 @@
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1205,9 +1203,9 @@
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2474[8];
-                fillImage(_2474, param_36, param_37);
-                float4 img[8] = _2474;
+                float4 _2461[8];
+                fillImage(_2461, param_36, param_37);
+                float4 img[8] = _2461;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1218,13 +1216,26 @@
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2566);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1232,32 +1243,41 @@
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2615;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2665 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2665;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1265,8 +1285,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x;
+        uint param_48 = i_1;
+        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 9647001..2b550b8 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -528,10 +528,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -986,12 +986,6 @@
         }
         case 13u:
         {
-            float rev_as = 1.0 - as;
-            float rev_ab = 1.0 - ab;
-            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
-        {
             return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
         default:
@@ -1059,15 +1053,17 @@
     return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1725 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1725.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1749.conf.ptcl_alloc.offset;
+    param.offset = _1725.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     spvUnsafeArray<float4, 8> rgba;
@@ -1080,7 +1076,9 @@
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1226,10 +1224,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1252,10 +1250,10 @@
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1281,13 +1279,26 @@
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2566;
+                        rgba[k_13] = float4(0.0);
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1295,31 +1306,40 @@
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1327,8 +1347,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 2dd46c0..bacd9a8 100644
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index a0710d2..c9b5dd3 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -87,11 +87,14 @@
     Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
 
+    uint blend_offset = memory[cmd_ref.offset >> 2];
+    cmd_ref.offset += 4;
+
     uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x,
                           gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
     vec2 xy = vec2(xy_uint);
     mediump vec4 rgba[CHUNK];
-    uint blend_stack[MAX_BLEND_STACK][CHUNK];
+    uint blend_stack[BLEND_STACK_SPLIT][CHUNK];
     for (uint i = 0; i < CHUNK; i++) {
         rgba[i] = vec4(0.0);
     }
@@ -223,24 +226,38 @@
             cmd_ref.offset += 4 + CmdImage_size;
             break;
         case Cmd_BeginClip:
-            for (uint k = 0; k < CHUNK; k++) {
-                // We reject any inputs that might overflow in render_ctx.rs.
-                // The following is a sanity check so we don't corrupt memory should there be malformed inputs.
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                blend_stack[d][k] = packsRGB(vec4(rgba[k]));
-                rgba[k] = vec4(0.0);
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
+            } else {
+                uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+                for (uint k = 0; k < CHUNK; k++) {
+                    memory[base_ix + k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
             }
             clip_depth++;
             cmd_ref.offset += 4;
             break;
         case Cmd_EndClip:
             CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
-            uint blend_mode = uint(end_clip.blend >> 8);
-            uint comp_mode = uint(end_clip.blend & 0xFF);
             clip_depth--;
+            uint base_ix;
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+            }
             for (uint k = 0; k < CHUNK; k++) {
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
+                uint bg_rgba;
+                if (clip_depth < BLEND_STACK_SPLIT) {
+                    bg_rgba = blend_stack[clip_depth][k];
+                } else {
+                    bg_rgba = memory[base_ix + k];
+                }
+                mediump vec4 bg = unpacksRGB(bg_rgba);
                 mediump vec4 fg = rgba[k] * area[k];
                 rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
             }
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index ec17188..21206e5 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -27,6 +27,10 @@
 
 #define GRADIENT_WIDTH 512
 
+// We allocate this many blend stack entries in registers, and spill
+// to memory for the overflow.
+#define BLEND_STACK_SPLIT 4
+
 #ifdef ERR_MALLOC_FAILED
 struct Config {
     uint n_elements; // paths
@@ -91,7 +95,7 @@
 #define MODE_STROKE 1
 
 // Size of kernel4 clip state, in words.
-#define CLIP_STATE_SIZE 2
+#define CLIP_STATE_SIZE 1
 
 // fill_mode_from_flags extracts the fill mode from tag flags.
 uint fill_mode_from_flags(uint flags) {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index d32a9c5..8915de4 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -34,8 +34,6 @@
 
 const PTCL_INITIAL_ALLOC: usize = 1024;
 
-const MAX_BLEND_STACK: usize = 128;
-
 #[allow(unused)]
 fn dump_scene(buf: &[u8]) {
     for i in 0..(buf.len() / 4) {
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index dca03eb..f78e8ab 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -2,7 +2,6 @@
 
 use crate::encoder::GlyphEncoder;
 use crate::stages::{Config, Transform};
-use crate::MAX_BLEND_STACK;
 use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
 use piet::{
     Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
@@ -230,9 +229,6 @@
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
         self.new_encoder.begin_clip(None);
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
         self.clip_stack.push(ClipElement { blend: None });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
@@ -334,9 +330,6 @@
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
         self.new_encoder.begin_clip(Some(blend));
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
         self.clip_stack.push(ClipElement { blend: Some(blend) });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;