Radial gradients

This patch adds radial gradients, including both the piet API and some
new methods specifically to support COLRv1, including the ability to
transform the gradient separately from the path.
diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
index e8c29c3..14831ca 100644
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@@ -24,6 +24,14 @@
             line_y: f32,
             line_c: f32,
         }
+        struct CmdRadGrad {
+            index: u32,
+            mat: [f32; 4],
+            xlat: [f32; 2],
+            c1: [f32; 2],
+            ra: f32,
+            roff: f32,
+        }
         struct CmdImage {
             index: u32,
             offset: [i16; 2],
@@ -31,6 +39,9 @@
         struct CmdAlpha {
             alpha: f32,
         }
+        struct CmdEndClip {
+            blend: u32,
+        }
         struct CmdJump {
             new_ref: u32,
         }
@@ -42,9 +53,10 @@
             Alpha(CmdAlpha),
             Color(CmdColor),
             LinGrad(CmdLinGrad),
+            RadGrad(CmdRadGrad),
             Image(CmdImage),
             BeginClip,
-            EndClip,
+            EndClip(CmdEndClip),
             Jump(CmdJump),
         }
     }
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 454371c..3abb2e0 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -229,6 +229,7 @@
         case Drawtag_FillColor:
         case Drawtag_FillImage:
         case Drawtag_FillLinGradient:
+        case Drawtag_FillRadGradient:
         case Drawtag_BeginClip:
         case Drawtag_EndClip:
             uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
@@ -373,6 +374,25 @@
                     Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
                     cmd_ref.offset += 4 + CmdLinGrad_size;
                     break;
+                case Drawtag_FillRadGradient:
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                        break;
+                    }
+                    linewidth = uintBitsToFloat(memory[di]);
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
+                    CmdRadGrad cmd_rad;
+                    cmd_rad.index = scene[dd];
+                    // Given that this is basically a memcpy, we might consider
+                    // letting the fine raster read the info itself.
+                    cmd_rad.mat = uintBitsToFloat(uvec4(memory[di + 1], memory[di + 2],
+                        memory[di + 3], memory[di + 4]));
+                    cmd_rad.xlat = uintBitsToFloat(uvec2(memory[di + 5], memory[di + 6]));
+                    cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
+                    cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
+                    cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
+                    Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
+                    cmd_ref.offset += 4 + CmdRadGrad_size;
+                    break;
                 case Drawtag_FillImage:
                     linewidth = uintBitsToFloat(memory[di]);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
index 1cee0ef..ef369c9 100644
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -94,8 +94,8 @@
         // pipeline. However, going forward we'll get rid of that, and have
         // later stages read scene + bbox etc.
         tag_word = scene[drawtag_base + ix + i];
-        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillImage ||
-            tag_word == Drawtag_BeginClip) {
+        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient ||
+            tag_word == Drawtag_FillImage || tag_word == Drawtag_BeginClip) {
             uint bbox_offset = (conf.path_bbox_alloc.offset >> 2) + 6 * m.path_ix;
             float bbox_l = float(memory[bbox_offset]) - 32768.0;
             float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
@@ -106,11 +106,11 @@
             uint fill_mode = uint(linewidth >= 0.0);
             vec4 mat;
             vec2 translate;
-            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient) {
+            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient) {
                 uint trans_ix = memory[bbox_offset + 5];
                 uint t = (conf.trans_alloc.offset >> 2) + 6 * trans_ix;
                 mat = uintBitsToFloat(uvec4(memory[t], memory[t + 1], memory[t + 2], memory[t + 3]));
-                if (tag_word == Drawtag_FillLinGradient) {
+                if (tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient) {
                     translate = uintBitsToFloat(uvec2(memory[t + 4], memory[t + 5]));
                 }
             }
@@ -125,7 +125,6 @@
                 break;
             case Drawtag_FillLinGradient:
                 memory[di] = floatBitsToUint(linewidth);
-                uint index = scene[dd];
                 vec2 p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
                 vec2 p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
                 p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
@@ -139,6 +138,33 @@
                 memory[di + 2] = floatBitsToUint(line_y);
                 memory[di + 3] = floatBitsToUint(line_c);
                 break;
+            case Drawtag_FillRadGradient:
+                p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
+                p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
+                float r0 = uintBitsToFloat(scene[dd + 5]);
+                float r1 = uintBitsToFloat(scene[dd + 6]);
+                float inv_det = 1.0 / (mat.x * mat.w - mat.y * mat.z);
+                vec4 inv_mat = inv_det * vec4(mat.w, -mat.y, -mat.z, mat.x);
+                vec2 inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
+                inv_tr += p0;
+                vec2 center1 = p1 - p0;
+                float rr = r1 / (r1 - r0);
+                float rainv = rr / (r1 * r1 - dot(center1, center1));
+                vec2 c1 = center1 * rainv;
+                float ra = rr * rainv;
+                float roff = rr - 1.0;
+                memory[di] = floatBitsToUint(linewidth);
+                memory[di + 1] = floatBitsToUint(inv_mat.x);
+                memory[di + 2] = floatBitsToUint(inv_mat.y);
+                memory[di + 3] = floatBitsToUint(inv_mat.z);
+                memory[di + 4] = floatBitsToUint(inv_mat.w);
+                memory[di + 5] = floatBitsToUint(inv_tr.x);
+                memory[di + 6] = floatBitsToUint(inv_tr.y);
+                memory[di + 7] = floatBitsToUint(c1.x);
+                memory[di + 8] = floatBitsToUint(c1.y);
+                memory[di + 9] = floatBitsToUint(ra);
+                memory[di + 10] = floatBitsToUint(roff);
+                break;
             case Drawtag_BeginClip:
                 break;
             }
diff --git a/piet-gpu/shader/drawtag.h b/piet-gpu/shader/drawtag.h
index 7f73546..1e35318 100644
--- a/piet-gpu/shader/drawtag.h
+++ b/piet-gpu/shader/drawtag.h
@@ -4,11 +4,12 @@
 
 // Design of draw tag: & 0x1c gives scene size in bytes
 // & 1 gives clip
-// (tag >> 4) & 0x1c is info size in bytes
+// (tag >> 4) & 0x3c is info size in bytes
 
 #define Drawtag_Nop 0
 #define Drawtag_FillColor 0x44
 #define Drawtag_FillLinGradient 0x114
+#define Drawtag_FillRadGradient 0x2dc
 #define Drawtag_FillImage 0x48
 #define Drawtag_BeginClip 0x05
 #define Drawtag_EndClip 0x25
@@ -36,5 +37,5 @@
 DrawMonoid map_tag(uint tag_word) {
     // TODO: at some point, EndClip should not generate a path
     uint has_path = uint(tag_word != Drawtag_Nop);
-    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x1c);
+    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x3c);
 }
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index 12e88dd..fdab444 100644
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index a702df5..04529bb 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -91,6 +91,21 @@
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -160,9 +175,9 @@
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _242 : register(u0, space0);
-ByteAddressBuffer _854 : register(t1, space0);
-ByteAddressBuffer _1222 : register(t2, space0);
+RWByteAddressBuffer _260 : register(u0, space0);
+ByteAddressBuffer _1005 : register(t1, space0);
+ByteAddressBuffer _1372 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -185,8 +200,8 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _319 = { a.offset + offset };
-    return _319;
+    Alloc _337 = { a.offset + offset };
+    return _337;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -202,7 +217,7 @@
     {
         return 0u;
     }
-    uint v = _242.Load(offset * 4 + 8);
+    uint v = _260.Load(offset * 4 + 8);
     return v;
 }
 
@@ -215,8 +230,8 @@
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _328 = { ref.offset + (index * 4u) };
-    return _328;
+    BinInstanceRef _346 = { ref.offset + (index * 4u) };
+    return _346;
 }
 
 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@@ -244,8 +259,8 @@
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _391 = { raw2 };
-    s.tiles = _391;
+    TileRef _409 = { raw2 };
+    s.tiles = _409;
     return s;
 }
 
@@ -255,11 +270,11 @@
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok)
 {
-    uint _741;
-    _242.GetDimensions(_741);
-    _741 = (_741 - 8) / 4;
+    uint _892;
+    _260.GetDimensions(_892);
+    _892 = (_892 - 8) / 4;
     uint param = 0u;
-    uint param_1 = uint(int(_741) * 4);
+    uint param_1 = uint(int(_892) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
@@ -273,31 +288,31 @@
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
     uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _416 = { raw0 };
+    TileSegRef _434 = { raw0 };
     Tile s;
-    s.tile = _416;
+    s.tile = _434;
     s.backdrop = int(raw1);
     return s;
 }
 
 MallocResult malloc(uint size)
 {
-    uint _248;
-    _242.InterlockedAdd(0, size, _248);
-    uint offset = _248;
-    uint _255;
-    _242.GetDimensions(_255);
-    _255 = (_255 - 8) / 4;
+    uint _266;
+    _260.InterlockedAdd(0, size, _266);
+    uint offset = _266;
+    uint _273;
+    _260.GetDimensions(_273);
+    _273 = (_273 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_255) * 4);
+    r.failed = (offset + size) > uint(int(_273) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _277;
-        _242.InterlockedMax(4, 1u, _277);
+        uint _295;
+        _260.InterlockedMax(4, 1u, _295);
         return r;
     }
     return r;
@@ -311,7 +326,7 @@
     {
         return;
     }
-    _242.Store(offset * 4 + 8, val);
+    _260.Store(offset * 4 + 8, val);
 }
 
 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@@ -327,11 +342,11 @@
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 10u;
+    uint param_2 = 11u;
     write_mem(param, param_1, param_2);
-    CmdJumpRef _734 = { ref.offset + 4u };
+    CmdJumpRef _885 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdJumpRef param_4 = _734;
+    CmdJumpRef param_4 = _885;
     CmdJump param_5 = s;
     CmdJump_write(param_3, param_4, param_5);
 }
@@ -343,22 +358,22 @@
         return true;
     }
     uint param = 1024u;
-    MallocResult _762 = malloc(param);
-    MallocResult new_cmd = _762;
+    MallocResult _913 = malloc(param);
+    MallocResult new_cmd = _913;
     if (new_cmd.failed)
     {
         return false;
     }
-    CmdJump _772 = { new_cmd.alloc.offset };
-    CmdJump jump = _772;
+    CmdJump _923 = { new_cmd.alloc.offset };
+    CmdJump jump = _923;
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
     Cmd_Jump_write(param_1, param_2, param_3);
     cmd_alloc = new_cmd.alloc;
-    CmdRef _784 = { cmd_alloc.offset };
-    cmd_ref = _784;
-    cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
+    CmdRef _935 = { cmd_alloc.offset };
+    cmd_ref = _935;
+    cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
     return true;
 }
 
@@ -381,9 +396,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
     write_mem(param, param_1, param_2);
-    CmdFillRef _604 = { ref.offset + 4u };
+    CmdFillRef _742 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdFillRef param_4 = _604;
+    CmdFillRef param_4 = _742;
     CmdFill param_5 = s;
     CmdFill_write(param_3, param_4, param_5);
 }
@@ -415,9 +430,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
     write_mem(param, param_1, param_2);
-    CmdStrokeRef _622 = { ref.offset + 4u };
+    CmdStrokeRef _760 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdStrokeRef param_4 = _622;
+    CmdStrokeRef param_4 = _760;
     CmdStroke param_5 = s;
     CmdStroke_write(param_3, param_4, param_5);
 }
@@ -428,8 +443,8 @@
     {
         if (tile.tile.offset != 0u)
         {
-            CmdFill _807 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _807;
+            CmdFill _958 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _958;
             Alloc param = alloc;
             CmdRef param_1 = cmd_ref;
             CmdFill param_2 = cmd_fill;
@@ -446,8 +461,8 @@
     }
     else
     {
-        CmdStroke _837 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _837;
+        CmdStroke _988 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _988;
         Alloc param_5 = alloc;
         CmdRef param_6 = cmd_ref;
         CmdStroke param_7 = cmd_stroke;
@@ -471,9 +486,9 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
     write_mem(param, param_1, param_2);
-    CmdColorRef _649 = { ref.offset + 4u };
+    CmdColorRef _786 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdColorRef param_4 = _649;
+    CmdColorRef param_4 = _786;
     CmdColor param_5 = s;
     CmdColor_write(param_3, param_4, param_5);
 }
@@ -505,13 +520,75 @@
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
     write_mem(param, param_1, param_2);
-    CmdLinGradRef _668 = { ref.offset + 4u };
+    CmdLinGradRef _804 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdLinGradRef param_4 = _668;
+    CmdLinGradRef param_4 = _804;
     CmdLinGrad param_5 = s;
     CmdLinGrad_write(param_3, param_4, param_5);
 }
 
+void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.mat.x);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.mat.y);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.mat.z);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.mat.w);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = asuint(s.xlat.x);
+    write_mem(param_15, param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = asuint(s.xlat.y);
+    write_mem(param_18, param_19, param_20);
+    Alloc param_21 = a;
+    uint param_22 = ix + 7u;
+    uint param_23 = asuint(s.c1.x);
+    write_mem(param_21, param_22, param_23);
+    Alloc param_24 = a;
+    uint param_25 = ix + 8u;
+    uint param_26 = asuint(s.c1.y);
+    write_mem(param_24, param_25, param_26);
+    Alloc param_27 = a;
+    uint param_28 = ix + 9u;
+    uint param_29 = asuint(s.ra);
+    write_mem(param_27, param_28, param_29);
+    Alloc param_30 = a;
+    uint param_31 = ix + 10u;
+    uint param_32 = asuint(s.roff);
+    write_mem(param_30, param_31, param_32);
+}
+
+void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 7u;
+    write_mem(param, param_1, param_2);
+    CmdRadGradRef _822 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdRadGradRef param_4 = _822;
+    CmdRadGrad param_5 = s;
+    CmdRadGrad_write(param_3, param_4, param_5);
+}
+
 void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s)
 {
     uint ix = ref.offset >> uint(2);
@@ -529,11 +606,11 @@
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 7u;
+    uint param_2 = 8u;
     write_mem(param, param_1, param_2);
-    CmdImageRef _687 = { ref.offset + 4u };
+    CmdImageRef _840 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdImageRef param_4 = _687;
+    CmdImageRef param_4 = _840;
     CmdImage param_5 = s;
     CmdImage_write(param_3, param_4, param_5);
 }
@@ -542,7 +619,7 @@
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 8u;
+    uint param_2 = 9u;
     write_mem(param, param_1, param_2);
 }
 
@@ -559,11 +636,11 @@
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 9u;
+    uint param_2 = 10u;
     write_mem(param, param_1, param_2);
-    CmdEndClipRef _715 = { ref.offset + 4u };
+    CmdEndClipRef _866 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdEndClipRef param_4 = _715;
+    CmdEndClipRef param_4 = _866;
     CmdEndClip param_5 = s;
     CmdEndClip_write(param_3, param_4, param_5);
 }
@@ -578,80 +655,81 @@
 
 void comp_main()
 {
-    uint width_in_bins = ((_854.Load(8) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_1005.Load(8) + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_854.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1005.Load(0) + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _919;
-    _919.offset = _854.Load(24);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _1070;
+    _1070.offset = _1005.Load(24);
     Alloc param;
-    param.offset = _919.offset;
+    param.offset = _1070.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _928 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _928;
-    uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
+    CmdRef _1079 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1079;
+    uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
     uint rd_ix = 0u;
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    uint drawmonoid_start = _854.Load(44) >> uint(2);
-    uint drawtag_start = _854.Load(100) >> uint(2);
-    uint drawdata_start = _854.Load(104) >> uint(2);
-    uint drawinfo_start = _854.Load(68) >> uint(2);
-    bool mem_ok = _242.Load(4) == 0u;
+    uint drawmonoid_start = _1005.Load(44) >> uint(2);
+    uint drawtag_start = _1005.Load(100) >> uint(2);
+    uint drawdata_start = _1005.Load(104) >> uint(2);
+    uint drawinfo_start = _1005.Load(68) >> uint(2);
+    bool mem_ok = _260.Load(4) == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1154;
+    uint _1304;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1455;
+    uint _1605;
     float linewidth;
     CmdLinGrad cmd_lin;
+    CmdRadGrad cmd_rad;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1206;
+        bool _1356;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1003 = th_ix < 256u;
-                bool _1011;
-                if (_1003)
+                bool _1154 = th_ix < 256u;
+                bool _1162;
+                if (_1154)
                 {
-                    _1011 = (partition_ix + th_ix) < n_partitions;
+                    _1162 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1011 = _1003;
+                    _1162 = _1154;
                 }
-                if (_1011)
+                if (_1162)
                 {
-                    uint in_ix = (_854.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1029;
-                    _1029.offset = _854.Load(20);
-                    param_3.offset = _1029.offset;
+                    uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1179;
+                    _1179.offset = _1005.Load(20);
+                    param_3.offset = _1179.offset;
                     uint param_4 = in_ix;
                     count = read_mem(param_3, param_4);
-                    Alloc _1040;
-                    _1040.offset = _854.Load(20);
-                    param_5.offset = _1040.offset;
+                    Alloc _1190;
+                    _1190.offset = _1005.Load(20);
+                    param_5.offset = _1190.offset;
                     uint param_6 = in_ix + 1u;
                     uint offset = read_mem(param_5, param_6);
                     uint param_7 = offset;
@@ -697,16 +775,16 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1154 = sh_part_count[part_ix - 1u];
+                    _1304 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1154 = part_start_ix;
+                    _1304 = part_start_ix;
                 }
-                ix -= _1154;
+                ix -= _1304;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1173 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1173;
+                BinInstanceRef _1323 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1323;
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
@@ -716,16 +794,16 @@
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1196 = (wr_ix - rd_ix) < 256u;
-            if (_1196)
+            bool _1346 = (wr_ix - rd_ix) < 256u;
+            if (_1346)
             {
-                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1206 = _1196;
+                _1356 = _1346;
             }
-            if (_1206)
+            if (_1356)
             {
                 continue;
             }
@@ -738,23 +816,24 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1222.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1372.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
             case 68u:
             case 72u:
             case 276u:
+            case 732u:
             case 5u:
             case 37u:
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = _242.Load(drawmonoid_base * 4 + 8);
-                PathRef _1247 = { _854.Load(16) + (path_ix * 12u) };
-                Alloc _1250;
-                _1250.offset = _854.Load(16);
-                param_14.offset = _1250.offset;
-                PathRef param_15 = _1247;
+                uint path_ix = _260.Load(drawmonoid_base * 4 + 8);
+                PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) };
+                Alloc _1400;
+                _1400.offset = _1005.Load(16);
+                param_14.offset = _1400.offset;
+                PathRef param_15 = _1397;
                 Path path = Path_read(param_14, param_15);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
@@ -810,16 +889,16 @@
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1222.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1455 = sh_tile_count[el_ix - 1u];
+                _1605 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1455 = 0u;
+                _1605 = 0u;
             }
-            uint seq_ix = ix_1 - _1455;
+            uint seq_ix = ix_1 - _1605;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -828,38 +907,38 @@
             {
                 uint param_21 = el_ix;
                 bool param_22 = mem_ok;
-                TileRef _1507 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                 Alloc param_23 = read_tile_alloc(param_21, param_22);
-                TileRef param_24 = _1507;
+                TileRef param_24 = _1657;
                 Tile tile = Tile_read(param_23, param_24);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
                 if (is_clip)
                 {
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = _242.Load((drawmonoid_base_1 + 2u) * 4 + 8);
+                    uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1222.Load(dd * 4 + 0);
+                    uint blend = _1372.Load(dd * 4 + 0);
                     is_blend = blend != 3u;
                 }
-                bool _1542 = tile.tile.offset != 0u;
-                bool _1551;
-                if (!_1542)
+                bool _1692 = tile.tile.offset != 0u;
+                bool _1701;
+                if (!_1692)
                 {
-                    _1551 = (tile.backdrop == 0) == is_clip;
+                    _1701 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1551 = _1542;
+                    _1701 = _1692;
                 }
-                include_tile = _1551 || is_blend;
+                include_tile = _1701 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1573;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1573);
+                uint _1723;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -883,33 +962,33 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1222.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                TileRef _1650 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1650;
+                TileRef param_28 = _1800;
                 Tile tile_1 = Tile_read(param_27, param_28);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = _242.Load((drawmonoid_base_2 + 2u) * 4 + 8);
-                uint info_offset = _242.Load((drawmonoid_base_2 + 3u) * 4 + 8);
+                uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
+                uint info_offset = _260.Load((drawmonoid_base_2 + 3u) * 4 + 8);
                 uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                 uint di = drawinfo_start + (info_offset >> uint(2));
                 switch (drawtag)
                 {
                     case 68u:
                     {
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1697 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1848 = alloc_cmd(param_29, param_30, param_31);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1697)
+                        if (!_1848)
                         {
                             break;
                         }
@@ -919,11 +998,11 @@
                         float param_35 = linewidth;
                         write_fill(param_32, param_33, param_34, param_35);
                         cmd_ref = param_33;
-                        uint rgba = _1222.Load(dd_1 * 4 + 0);
-                        CmdColor _1720 = { rgba };
+                        uint rgba = _1372.Load(dd_1 * 4 + 0);
+                        CmdColor _1871 = { rgba };
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1720;
+                        CmdColor param_38 = _1871;
                         Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
@@ -933,25 +1012,25 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1738 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1889 = alloc_cmd(param_39, param_40, param_41);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1738)
+                        if (!_1889)
                         {
                             break;
                         }
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                         Alloc param_42 = cmd_alloc;
                         CmdRef param_43 = cmd_ref;
                         Tile param_44 = tile_1;
                         float param_45 = linewidth;
                         write_fill(param_42, param_43, param_44, param_45);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1222.Load(dd_1 * 4 + 0);
-                        cmd_lin.line_x = asfloat(_242.Load((di + 1u) * 4 + 8));
-                        cmd_lin.line_y = asfloat(_242.Load((di + 2u) * 4 + 8));
-                        cmd_lin.line_c = asfloat(_242.Load((di + 3u) * 4 + 8));
+                        cmd_lin.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8));
+                        cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8));
+                        cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8));
                         Alloc param_46 = cmd_alloc;
                         CmdRef param_47 = cmd_ref;
                         CmdLinGrad param_48 = cmd_lin;
@@ -959,69 +1038,102 @@
                         cmd_ref.offset += 20u;
                         break;
                     }
-                    case 72u:
+                    case 732u:
                     {
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1806 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1953 = alloc_cmd(param_49, param_50, param_51);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1806)
+                        if (!_1953)
                         {
                             break;
                         }
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                         Alloc param_52 = cmd_alloc;
                         CmdRef param_53 = cmd_ref;
                         Tile param_54 = tile_1;
                         float param_55 = linewidth;
                         write_fill(param_52, param_53, param_54, param_55);
                         cmd_ref = param_53;
-                        uint index = _1222.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1222.Load((dd_1 + 1u) * 4 + 0);
-                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _1845 = { index, offset_1 };
+                        cmd_rad.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8)));
+                        cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8)));
+                        cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8)));
+                        cmd_rad.ra = asfloat(_260.Load((di + 9u) * 4 + 8));
+                        cmd_rad.roff = asfloat(_260.Load((di + 10u) * 4 + 8));
                         Alloc param_56 = cmd_alloc;
                         CmdRef param_57 = cmd_ref;
-                        CmdImage param_58 = _1845;
-                        Cmd_Image_write(param_56, param_57, param_58);
+                        CmdRadGrad param_58 = cmd_rad;
+                        Cmd_RadGrad_write(param_56, param_57, param_58);
+                        cmd_ref.offset += 48u;
+                        break;
+                    }
+                    case 72u:
+                    {
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
+                        Alloc param_59 = cmd_alloc;
+                        CmdRef param_60 = cmd_ref;
+                        uint param_61 = cmd_limit;
+                        bool _2059 = alloc_cmd(param_59, param_60, param_61);
+                        cmd_alloc = param_59;
+                        cmd_ref = param_60;
+                        cmd_limit = param_61;
+                        if (!_2059)
+                        {
+                            break;
+                        }
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        Tile param_64 = tile_1;
+                        float param_65 = linewidth;
+                        write_fill(param_62, param_63, param_64, param_65);
+                        cmd_ref = param_63;
+                        uint index = _1372.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        CmdImage _2098 = { index, offset_1 };
+                        Alloc param_66 = cmd_alloc;
+                        CmdRef param_67 = cmd_ref;
+                        CmdImage param_68 = _2098;
+                        Cmd_Image_write(param_66, param_67, param_68);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _1859 = tile_1.tile.offset == 0u;
-                        bool _1865;
-                        if (_1859)
+                        bool _2112 = tile_1.tile.offset == 0u;
+                        bool _2118;
+                        if (_2112)
                         {
-                            _1865 = tile_1.backdrop == 0;
+                            _2118 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _1865 = _1859;
+                            _2118 = _2112;
                         }
-                        if (_1865)
+                        if (_2118)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_59 = cmd_alloc;
-                            CmdRef param_60 = cmd_ref;
-                            uint param_61 = cmd_limit;
-                            bool _1877 = alloc_cmd(param_59, param_60, param_61);
-                            cmd_alloc = param_59;
-                            cmd_ref = param_60;
-                            cmd_limit = param_61;
-                            if (!_1877)
+                            Alloc param_69 = cmd_alloc;
+                            CmdRef param_70 = cmd_ref;
+                            uint param_71 = cmd_limit;
+                            bool _2130 = alloc_cmd(param_69, param_70, param_71);
+                            cmd_alloc = param_69;
+                            cmd_ref = param_70;
+                            cmd_limit = param_71;
+                            if (!_2130)
                             {
                                 break;
                             }
-                            Alloc param_62 = cmd_alloc;
-                            CmdRef param_63 = cmd_ref;
-                            Cmd_BeginClip_write(param_62, param_63);
+                            Alloc param_72 = cmd_alloc;
+                            CmdRef param_73 = cmd_ref;
+                            Cmd_BeginClip_write(param_72, param_73);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
@@ -1030,29 +1142,29 @@
                     case 37u:
                     {
                         clip_depth--;
-                        Alloc param_64 = cmd_alloc;
-                        CmdRef param_65 = cmd_ref;
-                        uint param_66 = cmd_limit;
-                        bool _1905 = alloc_cmd(param_64, param_65, param_66);
-                        cmd_alloc = param_64;
-                        cmd_ref = param_65;
-                        cmd_limit = param_66;
-                        if (!_1905)
+                        Alloc param_74 = cmd_alloc;
+                        CmdRef param_75 = cmd_ref;
+                        uint param_76 = cmd_limit;
+                        bool _2158 = alloc_cmd(param_74, param_75, param_76);
+                        cmd_alloc = param_74;
+                        cmd_ref = param_75;
+                        cmd_limit = param_76;
+                        if (!_2158)
                         {
                             break;
                         }
-                        Alloc param_67 = cmd_alloc;
-                        CmdRef param_68 = cmd_ref;
-                        Tile param_69 = tile_1;
-                        float param_70 = -1.0f;
-                        write_fill(param_67, param_68, param_69, param_70);
-                        cmd_ref = param_68;
-                        uint blend_1 = _1222.Load(dd_1 * 4 + 0);
-                        CmdEndClip _1928 = { blend_1 };
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        CmdEndClip param_73 = _1928;
-                        Cmd_EndClip_write(param_71, param_72, param_73);
+                        Alloc param_77 = cmd_alloc;
+                        CmdRef param_78 = cmd_ref;
+                        Tile param_79 = tile_1;
+                        float param_80 = -1.0f;
+                        write_fill(param_77, param_78, param_79, param_80);
+                        cmd_ref = param_78;
+                        uint blend_1 = _1372.Load(dd_1 * 4 + 0);
+                        CmdEndClip _2181 = { blend_1 };
+                        Alloc param_81 = cmd_alloc;
+                        CmdRef param_82 = cmd_ref;
+                        CmdEndClip param_83 = _2181;
+                        Cmd_EndClip_write(param_81, param_82, param_83);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1086,21 +1198,21 @@
             break;
         }
     }
-    bool _1975 = (bin_tile_x + tile_x) < _854.Load(8);
-    bool _1984;
-    if (_1975)
+    bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2237;
+    if (_2228)
     {
-        _1984 = (bin_tile_y + tile_y) < _854.Load(12);
+        _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
     }
     else
     {
-        _1984 = _1975;
+        _2237 = _2228;
     }
-    if (_1984)
+    if (_2237)
     {
-        Alloc param_74 = cmd_alloc;
-        CmdRef param_75 = cmd_ref;
-        Cmd_End_write(param_74, param_75);
+        Alloc param_84 = cmd_alloc;
+        CmdRef param_85 = cmd_ref;
+        Cmd_End_write(param_84, param_85);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 4226352..55812d4 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -107,6 +107,21 @@
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -211,7 +226,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_242, constant uint& v_242BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -219,7 +234,7 @@
     {
         return 0u;
     }
-    uint v = v_242.memory[offset];
+    uint v = v_260.memory[offset];
     return v;
 }
 
@@ -238,30 +253,30 @@
 }
 
 static inline __attribute__((always_inline))
-BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
+    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
+    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_242, v_242BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_260, v_260BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_242, v_242BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_260, v_260BufferSize);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -274,24 +289,24 @@
 }
 
 static inline __attribute__((always_inline))
-Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_242, constant uint& v_242BufferSize)
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint param = 0u;
-    uint param_1 = uint(int((v_242BufferSize - 8) / 4) * 4);
+    uint param_1 = uint(int((v_260BufferSize - 8) / 4) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
 
 static inline __attribute__((always_inline))
-Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
+    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_242, v_242BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_260, v_260BufferSize);
     Tile s;
     s.tile = TileSegRef{ raw0 };
     s.backdrop = int(raw1);
@@ -299,26 +314,26 @@
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_242, constant uint& v_242BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_260, constant uint& v_260BufferSize)
 {
-    uint _248 = atomic_fetch_add_explicit((device atomic_uint*)&v_242.mem_offset, size, memory_order_relaxed);
-    uint offset = _248;
+    uint _266 = atomic_fetch_add_explicit((device atomic_uint*)&v_260.mem_offset, size, memory_order_relaxed);
+    uint offset = _266;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_242BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_260BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _277 = atomic_fetch_max_explicit((device atomic_uint*)&v_242.mem_error, 1u, memory_order_relaxed);
+        uint _295 = atomic_fetch_max_explicit((device atomic_uint*)&v_260.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_242, constant uint& v_242BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -326,42 +341,42 @@
     {
         return;
     }
-    v_242.memory[offset] = val;
+    v_260.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.new_ref;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 10u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    uint param_2 = 11u;
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
     CmdJump param_5 = s;
-    CmdJump_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdJump_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_242, constant uint& v_242BufferSize)
+bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_260, constant uint& v_260BufferSize)
 {
     if (cmd_ref.offset < cmd_limit)
     {
         return true;
     }
     uint param = 1024u;
-    MallocResult _762 = malloc(param, v_242, v_242BufferSize);
-    MallocResult new_cmd = _762;
+    MallocResult _913 = malloc(param, v_260, v_260BufferSize);
+    MallocResult new_cmd = _913;
     if (new_cmd.failed)
     {
         return false;
@@ -370,78 +385,78 @@
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3, v_242, v_242BufferSize);
+    Cmd_Jump_write(param_1, param_2, param_3, v_260, v_260BufferSize);
     cmd_alloc = new_cmd.alloc;
     cmd_ref = CmdRef{ cmd_alloc.offset };
-    cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
+    cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
     return true;
 }
 
 static inline __attribute__((always_inline))
-void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = uint(s.backdrop);
-    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
     CmdFill param_5 = s;
-    CmdFill_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdFill_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 3u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.half_width);
-    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
     CmdStroke param_5 = s;
-    CmdStroke_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdStroke_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_242, constant uint& v_242BufferSize)
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_260, constant uint& v_260BufferSize)
 {
     if (linewidth < 0.0)
     {
@@ -451,14 +466,14 @@
             Alloc param = alloc;
             CmdRef param_1 = cmd_ref;
             CmdFill param_2 = cmd_fill;
-            Cmd_Fill_write(param, param_1, param_2, v_242, v_242BufferSize);
+            Cmd_Fill_write(param, param_1, param_2, v_260, v_260BufferSize);
             cmd_ref.offset += 12u;
         }
         else
         {
             Alloc param_3 = alloc;
             CmdRef param_4 = cmd_ref;
-            Cmd_Solid_write(param_3, param_4, v_242, v_242BufferSize);
+            Cmd_Solid_write(param_3, param_4, v_260, v_260BufferSize);
             cmd_ref.offset += 4u;
         }
     }
@@ -468,138 +483,201 @@
         Alloc param_5 = alloc;
         CmdRef param_6 = cmd_ref;
         CmdStroke param_7 = cmd_stroke;
-        Cmd_Stroke_write(param_5, param_6, param_7, v_242, v_242BufferSize);
+        Cmd_Stroke_write(param_5, param_6, param_7, v_260, v_260BufferSize);
         cmd_ref.offset += 12u;
     }
 }
 
 static inline __attribute__((always_inline))
-void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.rgba_color;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
     CmdColor param_5 = s;
-    CmdColor_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdColor_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.line_x);
-    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.line_y);
-    write_mem(param_6, param_7, param_8, v_242, v_242BufferSize);
+    write_mem(param_6, param_7, param_8, v_260, v_260BufferSize);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.line_c);
-    write_mem(param_9, param_10, param_11, v_242, v_242BufferSize);
+    write_mem(param_9, param_10, param_11, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
     CmdLinGrad param_5 = s;
-    CmdLinGrad_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdLinGrad_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
-    uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
+    uint param_5 = as_type<uint>(s.mat.x);
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.mat.y);
+    write_mem(param_6, param_7, param_8, v_260, v_260BufferSize);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.mat.z);
+    write_mem(param_9, param_10, param_11, v_260, v_260BufferSize);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.mat.w);
+    write_mem(param_12, param_13, param_14, v_260, v_260BufferSize);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = as_type<uint>(s.xlat.x);
+    write_mem(param_15, param_16, param_17, v_260, v_260BufferSize);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = as_type<uint>(s.xlat.y);
+    write_mem(param_18, param_19, param_20, v_260, v_260BufferSize);
+    Alloc param_21 = a;
+    uint param_22 = ix + 7u;
+    uint param_23 = as_type<uint>(s.c1.x);
+    write_mem(param_21, param_22, param_23, v_260, v_260BufferSize);
+    Alloc param_24 = a;
+    uint param_25 = ix + 8u;
+    uint param_26 = as_type<uint>(s.c1.y);
+    write_mem(param_24, param_25, param_26, v_260, v_260BufferSize);
+    Alloc param_27 = a;
+    uint param_28 = ix + 9u;
+    uint param_29 = as_type<uint>(s.ra);
+    write_mem(param_27, param_28, param_29, v_260, v_260BufferSize);
+    Alloc param_30 = a;
+    uint param_31 = ix + 10u;
+    uint param_32 = as_type<uint>(s.roff);
+    write_mem(param_30, param_31, param_32, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
-    CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
-    CmdImage param_5 = s;
-    CmdImage_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u };
+    CmdRadGrad param_5 = s;
+    CmdRadGrad_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    Alloc param_3 = a;
+    CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
+    CmdImage param_5 = s;
+    CmdImage_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 9u;
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.blend;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 9u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    uint param_2 = 10u;
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u };
     CmdEndClip param_5 = s;
-    CmdEndClip_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdEndClip_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 0u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_242 [[buffer(0)]], const device ConfigBuf& _854 [[buffer(1)]], const device SceneBuf& _1222 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -611,76 +689,77 @@
     threadgroup uint sh_tile_y0[256];
     threadgroup uint sh_tile_base[256];
     threadgroup uint sh_tile_count[256];
-    constant uint& v_242BufferSize = spvBufferSizeConstants[0];
-    uint width_in_bins = ((_854.conf.width_in_tiles + 16u) - 1u) / 16u;
+    constant uint& v_260BufferSize = spvBufferSizeConstants[0];
+    uint width_in_bins = ((_1005.conf.width_in_tiles + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_854.conf.n_elements + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1005.conf.n_elements + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.conf.width_in_tiles) + bin_tile_x) + tile_x;
     Alloc param;
-    param.offset = _854.conf.ptcl_alloc.offset;
+    param.offset = _1005.conf.ptcl_alloc.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
-    uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
+    uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
     uint rd_ix = 0u;
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    uint drawmonoid_start = _854.conf.drawmonoid_alloc.offset >> uint(2);
-    uint drawtag_start = _854.conf.drawtag_offset >> uint(2);
-    uint drawdata_start = _854.conf.drawdata_offset >> uint(2);
-    uint drawinfo_start = _854.conf.drawinfo_alloc.offset >> uint(2);
-    bool mem_ok = v_242.mem_error == 0u;
+    uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2);
+    uint drawtag_start = _1005.conf.drawtag_offset >> uint(2);
+    uint drawdata_start = _1005.conf.drawdata_offset >> uint(2);
+    uint drawinfo_start = _1005.conf.drawinfo_alloc.offset >> uint(2);
+    bool mem_ok = v_260.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1154;
+    uint _1304;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1455;
+    uint _1605;
     float linewidth;
     CmdLinGrad cmd_lin;
+    CmdRadGrad cmd_rad;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1206;
+        bool _1356;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1003 = th_ix < 256u;
-                bool _1011;
-                if (_1003)
+                bool _1154 = th_ix < 256u;
+                bool _1162;
+                if (_1154)
                 {
-                    _1011 = (partition_ix + th_ix) < n_partitions;
+                    _1162 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1011 = _1003;
+                    _1162 = _1154;
                 }
-                if (_1011)
+                if (_1162)
                 {
-                    uint in_ix = (_854.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    param_3.offset = _854.conf.bin_alloc.offset;
+                    uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_3.offset = _1005.conf.bin_alloc.offset;
                     uint param_4 = in_ix;
-                    count = read_mem(param_3, param_4, v_242, v_242BufferSize);
-                    param_5.offset = _854.conf.bin_alloc.offset;
+                    count = read_mem(param_3, param_4, v_260, v_260BufferSize);
+                    param_5.offset = _1005.conf.bin_alloc.offset;
                     uint param_6 = in_ix + 1u;
-                    uint offset = read_mem(param_5, param_6, v_242, v_242BufferSize);
+                    uint offset = read_mem(param_5, param_6, v_260, v_260BufferSize);
                     uint param_7 = offset;
                     uint param_8 = count * 4u;
                     bool param_9 = mem_ok;
@@ -724,34 +803,34 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1154 = sh_part_count[part_ix - 1u];
+                    _1304 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1154 = part_start_ix;
+                    _1304 = part_start_ix;
                 }
-                ix -= _1154;
+                ix -= _1304;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
                 BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
-                BinInstance inst = BinInstance_read(param_12, param_13, v_242, v_242BufferSize);
+                BinInstance inst = BinInstance_read(param_12, param_13, v_260, v_260BufferSize);
                 sh_elements[th_ix] = inst.element_ix;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1196 = (wr_ix - rd_ix) < 256u;
-            if (_1196)
+            bool _1346 = (wr_ix - rd_ix) < 256u;
+            if (_1346)
             {
-                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1206 = _1196;
+                _1356 = _1346;
             }
-            if (_1206)
+            if (_1356)
             {
                 continue;
             }
@@ -764,21 +843,22 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1222.scene[drawtag_start + element_ix];
+            tag = _1372.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
             case 68u:
             case 72u:
             case 276u:
+            case 732u:
             case 5u:
             case 37u:
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = v_242.memory[drawmonoid_base];
-                param_14.offset = _854.conf.tile_alloc.offset;
-                PathRef param_15 = PathRef{ _854.conf.tile_alloc.offset + (path_ix * 12u) };
-                Path path = Path_read(param_14, param_15, v_242, v_242BufferSize);
+                uint path_ix = v_260.memory[drawmonoid_base];
+                param_14.offset = _1005.conf.tile_alloc.offset;
+                PathRef param_15 = PathRef{ _1005.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_14, param_15, v_260, v_260BufferSize);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -833,16 +913,16 @@
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1222.scene[drawtag_start + element_ix_1];
+            uint tag_1 = _1372.scene[drawtag_start + element_ix_1];
             if (el_ix > 0u)
             {
-                _1455 = sh_tile_count[el_ix - 1u];
+                _1605 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1455 = 0u;
+                _1605 = 0u;
             }
-            uint seq_ix = ix_1 - _1455;
+            uint seq_ix = ix_1 - _1605;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -851,36 +931,36 @@
             {
                 uint param_21 = el_ix;
                 bool param_22 = mem_ok;
-                Alloc param_23 = read_tile_alloc(param_21, param_22, v_242, v_242BufferSize);
+                Alloc param_23 = read_tile_alloc(param_21, param_22, v_260, v_260BufferSize);
                 TileRef param_24 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Tile tile = Tile_read(param_23, param_24, v_242, v_242BufferSize);
+                Tile tile = Tile_read(param_23, param_24, v_260, v_260BufferSize);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
                 if (is_clip)
                 {
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = v_242.memory[drawmonoid_base_1 + 2u];
+                    uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1222.scene[dd];
+                    uint blend = _1372.scene[dd];
                     is_blend = blend != 3u;
                 }
-                bool _1542 = tile.tile.offset != 0u;
-                bool _1551;
-                if (!_1542)
+                bool _1692 = tile.tile.offset != 0u;
+                bool _1701;
+                if (!_1692)
                 {
-                    _1551 = (tile.backdrop == 0) == is_clip;
+                    _1701 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1551 = _1542;
+                    _1701 = _1692;
                 }
-                include_tile = _1551 || is_blend;
+                include_tile = _1701 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1573 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -904,32 +984,32 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1222.scene[drawtag_start + element_ix_2];
+            uint drawtag = _1372.scene[drawtag_start + element_ix_2];
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                Alloc param_27 = read_tile_alloc(param_25, param_26, v_242, v_242BufferSize);
+                Alloc param_27 = read_tile_alloc(param_25, param_26, v_260, v_260BufferSize);
                 TileRef param_28 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                Tile tile_1 = Tile_read(param_27, param_28, v_242, v_242BufferSize);
+                Tile tile_1 = Tile_read(param_27, param_28, v_260, v_260BufferSize);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = v_242.memory[drawmonoid_base_2 + 2u];
-                uint info_offset = v_242.memory[drawmonoid_base_2 + 3u];
+                uint scene_offset_1 = v_260.memory[drawmonoid_base_2 + 2u];
+                uint info_offset = v_260.memory[drawmonoid_base_2 + 3u];
                 uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                 uint di = drawinfo_start + (info_offset >> uint(2));
                 switch (drawtag)
                 {
                     case 68u:
                     {
-                        linewidth = as_type<float>(v_242.memory[di]);
+                        linewidth = as_type<float>(v_260.memory[di]);
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1697 = alloc_cmd(param_29, param_30, param_31, v_242, v_242BufferSize);
+                        bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1697)
+                        if (!_1848)
                         {
                             break;
                         }
@@ -937,13 +1017,13 @@
                         CmdRef param_33 = cmd_ref;
                         Tile param_34 = tile_1;
                         float param_35 = linewidth;
-                        write_fill(param_32, param_33, param_34, param_35, v_242, v_242BufferSize);
+                        write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize);
                         cmd_ref = param_33;
-                        uint rgba = _1222.scene[dd_1];
+                        uint rgba = _1372.scene[dd_1];
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
                         CmdColor param_38 = CmdColor{ rgba };
-                        Cmd_Color_write(param_36, param_37, param_38, v_242, v_242BufferSize);
+                        Cmd_Color_write(param_36, param_37, param_38, v_260, v_260BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -952,94 +1032,127 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1738 = alloc_cmd(param_39, param_40, param_41, v_242, v_242BufferSize);
+                        bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1738)
+                        if (!_1889)
                         {
                             break;
                         }
-                        linewidth = as_type<float>(v_242.memory[di]);
+                        linewidth = as_type<float>(v_260.memory[di]);
                         Alloc param_42 = cmd_alloc;
                         CmdRef param_43 = cmd_ref;
                         Tile param_44 = tile_1;
                         float param_45 = linewidth;
-                        write_fill(param_42, param_43, param_44, param_45, v_242, v_242BufferSize);
+                        write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1222.scene[dd_1];
-                        cmd_lin.line_x = as_type<float>(v_242.memory[di + 1u]);
-                        cmd_lin.line_y = as_type<float>(v_242.memory[di + 2u]);
-                        cmd_lin.line_c = as_type<float>(v_242.memory[di + 3u]);
+                        cmd_lin.index = _1372.scene[dd_1];
+                        cmd_lin.line_x = as_type<float>(v_260.memory[di + 1u]);
+                        cmd_lin.line_y = as_type<float>(v_260.memory[di + 2u]);
+                        cmd_lin.line_c = as_type<float>(v_260.memory[di + 3u]);
                         Alloc param_46 = cmd_alloc;
                         CmdRef param_47 = cmd_ref;
                         CmdLinGrad param_48 = cmd_lin;
-                        Cmd_LinGrad_write(param_46, param_47, param_48, v_242, v_242BufferSize);
+                        Cmd_LinGrad_write(param_46, param_47, param_48, v_260, v_260BufferSize);
                         cmd_ref.offset += 20u;
                         break;
                     }
-                    case 72u:
+                    case 732u:
                     {
-                        linewidth = as_type<float>(v_242.memory[di]);
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1806 = alloc_cmd(param_49, param_50, param_51, v_242, v_242BufferSize);
+                        bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1806)
+                        if (!_1953)
                         {
                             break;
                         }
+                        linewidth = as_type<float>(v_260.memory[di]);
                         Alloc param_52 = cmd_alloc;
                         CmdRef param_53 = cmd_ref;
                         Tile param_54 = tile_1;
                         float param_55 = linewidth;
-                        write_fill(param_52, param_53, param_54, param_55, v_242, v_242BufferSize);
+                        write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize);
                         cmd_ref = param_53;
-                        uint index = _1222.scene[dd_1];
-                        uint raw1 = _1222.scene[dd_1 + 1u];
-                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        cmd_rad.index = _1372.scene[dd_1];
+                        cmd_rad.mat = as_type<float4>(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u]));
+                        cmd_rad.xlat = as_type<float2>(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u]));
+                        cmd_rad.c1 = as_type<float2>(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u]));
+                        cmd_rad.ra = as_type<float>(v_260.memory[di + 9u]);
+                        cmd_rad.roff = as_type<float>(v_260.memory[di + 10u]);
                         Alloc param_56 = cmd_alloc;
                         CmdRef param_57 = cmd_ref;
-                        CmdImage param_58 = CmdImage{ index, offset_1 };
-                        Cmd_Image_write(param_56, param_57, param_58, v_242, v_242BufferSize);
+                        CmdRadGrad param_58 = cmd_rad;
+                        Cmd_RadGrad_write(param_56, param_57, param_58, v_260, v_260BufferSize);
+                        cmd_ref.offset += 48u;
+                        break;
+                    }
+                    case 72u:
+                    {
+                        linewidth = as_type<float>(v_260.memory[di]);
+                        Alloc param_59 = cmd_alloc;
+                        CmdRef param_60 = cmd_ref;
+                        uint param_61 = cmd_limit;
+                        bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        cmd_alloc = param_59;
+                        cmd_ref = param_60;
+                        cmd_limit = param_61;
+                        if (!_2059)
+                        {
+                            break;
+                        }
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        Tile param_64 = tile_1;
+                        float param_65 = linewidth;
+                        write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize);
+                        cmd_ref = param_63;
+                        uint index = _1372.scene[dd_1];
+                        uint raw1 = _1372.scene[dd_1 + 1u];
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        Alloc param_66 = cmd_alloc;
+                        CmdRef param_67 = cmd_ref;
+                        CmdImage param_68 = CmdImage{ index, offset_1 };
+                        Cmd_Image_write(param_66, param_67, param_68, v_260, v_260BufferSize);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _1859 = tile_1.tile.offset == 0u;
-                        bool _1865;
-                        if (_1859)
+                        bool _2112 = tile_1.tile.offset == 0u;
+                        bool _2118;
+                        if (_2112)
                         {
-                            _1865 = tile_1.backdrop == 0;
+                            _2118 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _1865 = _1859;
+                            _2118 = _2112;
                         }
-                        if (_1865)
+                        if (_2118)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_59 = cmd_alloc;
-                            CmdRef param_60 = cmd_ref;
-                            uint param_61 = cmd_limit;
-                            bool _1877 = alloc_cmd(param_59, param_60, param_61, v_242, v_242BufferSize);
-                            cmd_alloc = param_59;
-                            cmd_ref = param_60;
-                            cmd_limit = param_61;
-                            if (!_1877)
+                            Alloc param_69 = cmd_alloc;
+                            CmdRef param_70 = cmd_ref;
+                            uint param_71 = cmd_limit;
+                            bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            cmd_alloc = param_69;
+                            cmd_ref = param_70;
+                            cmd_limit = param_71;
+                            if (!_2130)
                             {
                                 break;
                             }
-                            Alloc param_62 = cmd_alloc;
-                            CmdRef param_63 = cmd_ref;
-                            Cmd_BeginClip_write(param_62, param_63, v_242, v_242BufferSize);
+                            Alloc param_72 = cmd_alloc;
+                            CmdRef param_73 = cmd_ref;
+                            Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
@@ -1048,28 +1161,28 @@
                     case 37u:
                     {
                         clip_depth--;
-                        Alloc param_64 = cmd_alloc;
-                        CmdRef param_65 = cmd_ref;
-                        uint param_66 = cmd_limit;
-                        bool _1905 = alloc_cmd(param_64, param_65, param_66, v_242, v_242BufferSize);
-                        cmd_alloc = param_64;
-                        cmd_ref = param_65;
-                        cmd_limit = param_66;
-                        if (!_1905)
+                        Alloc param_74 = cmd_alloc;
+                        CmdRef param_75 = cmd_ref;
+                        uint param_76 = cmd_limit;
+                        bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        cmd_alloc = param_74;
+                        cmd_ref = param_75;
+                        cmd_limit = param_76;
+                        if (!_2158)
                         {
                             break;
                         }
-                        Alloc param_67 = cmd_alloc;
-                        CmdRef param_68 = cmd_ref;
-                        Tile param_69 = tile_1;
-                        float param_70 = -1.0;
-                        write_fill(param_67, param_68, param_69, param_70, v_242, v_242BufferSize);
-                        cmd_ref = param_68;
-                        uint blend_1 = _1222.scene[dd_1];
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        CmdEndClip param_73 = CmdEndClip{ blend_1 };
-                        Cmd_EndClip_write(param_71, param_72, param_73, v_242, v_242BufferSize);
+                        Alloc param_77 = cmd_alloc;
+                        CmdRef param_78 = cmd_ref;
+                        Tile param_79 = tile_1;
+                        float param_80 = -1.0;
+                        write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize);
+                        cmd_ref = param_78;
+                        uint blend_1 = _1372.scene[dd_1];
+                        Alloc param_81 = cmd_alloc;
+                        CmdRef param_82 = cmd_ref;
+                        CmdEndClip param_83 = CmdEndClip{ blend_1 };
+                        Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1103,21 +1216,21 @@
             break;
         }
     }
-    bool _1975 = (bin_tile_x + tile_x) < _854.conf.width_in_tiles;
-    bool _1984;
-    if (_1975)
+    bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2237;
+    if (_2228)
     {
-        _1984 = (bin_tile_y + tile_y) < _854.conf.height_in_tiles;
+        _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
     }
     else
     {
-        _1984 = _1975;
+        _2237 = _2228;
     }
-    if (_1984)
+    if (_2237)
     {
-        Alloc param_74 = cmd_alloc;
-        CmdRef param_75 = cmd_ref;
-        Cmd_End_write(param_74, param_75, v_242, v_242BufferSize);
+        Alloc param_84 = cmd_alloc;
+        CmdRef param_85 = cmd_ref;
+        Cmd_End_write(param_84, param_85, v_260, v_260BufferSize);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index b85fd8c..6d33ee7 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index 77396c1..200f169 100644
--- a/piet-gpu/shader/gen/draw_leaf.dxil
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
index f812f52..734d21e 100644
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -46,10 +46,10 @@
 
 static const DrawMonoid _23 = { 0u, 0u, 0u, 0u };
 
-ByteAddressBuffer _92 : register(t1, space0);
-ByteAddressBuffer _102 : register(t2, space0);
-ByteAddressBuffer _202 : register(t3, space0);
-RWByteAddressBuffer _284 : register(u0, space0);
+ByteAddressBuffer _93 : register(t1, space0);
+ByteAddressBuffer _103 : register(t2, space0);
+ByteAddressBuffer _203 : register(t3, space0);
+RWByteAddressBuffer _285 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -66,8 +66,8 @@
 DrawMonoid map_tag(uint tag_word)
 {
     uint has_path = uint(tag_word != 0u);
-    DrawMonoid _75 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
-    return _75;
+    DrawMonoid _76 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
+    return _76;
 }
 
 DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
@@ -88,15 +88,15 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _92.Load(100) >> uint(2);
-    uint tag_word = _102.Load((drawtag_base + ix) * 4 + 0);
+    uint drawtag_base = _93.Load(100) >> uint(2);
+    uint tag_word = _103.Load((drawtag_base + ix) * 4 + 0);
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
     DrawMonoid local[8];
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
     {
-        tag_word = _102.Load(((drawtag_base + ix) + i) * 4 + 0);
+        tag_word = _103.Load(((drawtag_base + ix) + i) * 4 + 0);
         uint param_1 = tag_word;
         DrawMonoid param_2 = agg;
         DrawMonoid param_3 = map_tag(param_1);
@@ -121,15 +121,15 @@
     DrawMonoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        DrawMonoid _208;
-        _208.path_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
-        _208.clip_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
-        _208.scene_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
-        _208.info_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
-        row.path_ix = _208.path_ix;
-        row.clip_ix = _208.clip_ix;
-        row.scene_offset = _208.scene_offset;
-        row.info_offset = _208.info_offset;
+        DrawMonoid _209;
+        _209.path_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
+        _209.clip_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
+        _209.scene_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
+        _209.info_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
+        row.path_ix = _209.path_ix;
+        row.clip_ix = _209.clip_ix;
+        row.scene_offset = _209.scene_offset;
+        row.info_offset = _209.info_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -137,13 +137,15 @@
         DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
         row = combine_draw_monoid(param_6, param_7);
     }
-    uint drawdata_base = _92.Load(104) >> uint(2);
-    uint drawinfo_base = _92.Load(68) >> uint(2);
+    uint drawdata_base = _93.Load(104) >> uint(2);
+    uint drawinfo_base = _93.Load(68) >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_92.Load(44) >> uint(2)) + (out_ix * 4u);
-    uint clip_out_base = _92.Load(48) >> uint(2);
+    uint out_base = (_93.Load(44) >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _93.Load(48) >> uint(2);
     float4 mat;
     float2 translate;
+    float2 p0;
+    float2 p1;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
@@ -153,31 +155,31 @@
             DrawMonoid param_9 = local[i_2 - 1u];
             m = combine_draw_monoid(param_8, param_9);
         }
-        _284.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
-        _284.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
-        _284.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
-        _284.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
+        _285.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
+        _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
+        _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
+        _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
         uint dd = drawdata_base + (m.scene_offset >> uint(2));
         uint di = drawinfo_base + (m.info_offset >> uint(2));
-        tag_word = _102.Load(((drawtag_base + ix) + i_2) * 4 + 0);
-        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
+        tag_word = _103.Load(((drawtag_base + ix) + i_2) * 4 + 0);
+        if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_92.Load(40) >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_284.Load(bbox_offset * 4 + 8)) - 32768.0f;
-            float bbox_t = float(_284.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
-            float bbox_r = float(_284.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
-            float bbox_b = float(_284.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            uint bbox_offset = (_93.Load(40) >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_285.Load(bbox_offset * 4 + 8)) - 32768.0f;
+            float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
+            float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
+            float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = asfloat(_284.Load((bbox_offset + 4u) * 4 + 8));
+            float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 8));
             uint fill_mode = uint(linewidth >= 0.0f);
-            if ((linewidth >= 0.0f) || (tag_word == 276u))
+            if (((linewidth >= 0.0f) || (tag_word == 276u)) || (tag_word == 732u))
             {
-                uint trans_ix = _284.Load((bbox_offset + 5u) * 4 + 8);
-                uint t = (_92.Load(36) >> uint(2)) + (6u * trans_ix);
-                mat = asfloat(uint4(_284.Load(t * 4 + 8), _284.Load((t + 1u) * 4 + 8), _284.Load((t + 2u) * 4 + 8), _284.Load((t + 3u) * 4 + 8)));
-                if (tag_word == 276u)
+                uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 8);
+                uint t = (_93.Load(36) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_285.Load(t * 4 + 8), _285.Load((t + 1u) * 4 + 8), _285.Load((t + 2u) * 4 + 8), _285.Load((t + 3u) * 4 + 8)));
+                if ((tag_word == 276u) || (tag_word == 732u))
                 {
-                    translate = asfloat(uint2(_284.Load((t + 4u) * 4 + 8), _284.Load((t + 5u) * 4 + 8)));
+                    translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 8), _285.Load((t + 5u) * 4 + 8)));
                 }
             }
             if (linewidth >= 0.0f)
@@ -189,15 +191,14 @@
                 case 68u:
                 case 72u:
                 {
-                    _284.Store(di * 4 + 8, asuint(linewidth));
+                    _285.Store(di * 4 + 8, asuint(linewidth));
                     break;
                 }
                 case 276u:
                 {
-                    _284.Store(di * 4 + 8, asuint(linewidth));
-                    uint index = _102.Load(dd * 4 + 0);
-                    float2 p0 = asfloat(uint2(_102.Load((dd + 1u) * 4 + 0), _102.Load((dd + 2u) * 4 + 0)));
-                    float2 p1 = asfloat(uint2(_102.Load((dd + 3u) * 4 + 0), _102.Load((dd + 4u) * 4 + 0)));
+                    _285.Store(di * 4 + 8, asuint(linewidth));
+                    p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
+                    p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
                     p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
                     p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                     float2 dxy = p1 - p0;
@@ -205,9 +206,38 @@
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
                     float line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    _284.Store((di + 1u) * 4 + 8, asuint(line_x));
-                    _284.Store((di + 2u) * 4 + 8, asuint(line_y));
-                    _284.Store((di + 3u) * 4 + 8, asuint(line_c));
+                    _285.Store((di + 1u) * 4 + 8, asuint(line_x));
+                    _285.Store((di + 2u) * 4 + 8, asuint(line_y));
+                    _285.Store((di + 3u) * 4 + 8, asuint(line_c));
+                    break;
+                }
+                case 732u:
+                {
+                    p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
+                    p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
+                    float r0 = asfloat(_103.Load((dd + 5u) * 4 + 0));
+                    float r1 = asfloat(_103.Load((dd + 6u) * 4 + 0));
+                    float inv_det = 1.0f / ((mat.x * mat.w) - (mat.y * mat.z));
+                    float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det;
+                    float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y);
+                    inv_tr += p0;
+                    float2 center1 = p1 - p0;
+                    float rr = r1 / (r1 - r0);
+                    float rainv = rr / ((r1 * r1) - dot(center1, center1));
+                    float2 c1 = center1 * rainv;
+                    float ra = rr * rainv;
+                    float roff = rr - 1.0f;
+                    _285.Store(di * 4 + 8, asuint(linewidth));
+                    _285.Store((di + 1u) * 4 + 8, asuint(inv_mat.x));
+                    _285.Store((di + 2u) * 4 + 8, asuint(inv_mat.y));
+                    _285.Store((di + 3u) * 4 + 8, asuint(inv_mat.z));
+                    _285.Store((di + 4u) * 4 + 8, asuint(inv_mat.w));
+                    _285.Store((di + 5u) * 4 + 8, asuint(inv_tr.x));
+                    _285.Store((di + 6u) * 4 + 8, asuint(inv_tr.y));
+                    _285.Store((di + 7u) * 4 + 8, asuint(c1.x));
+                    _285.Store((di + 8u) * 4 + 8, asuint(c1.y));
+                    _285.Store((di + 9u) * 4 + 8, asuint(ra));
+                    _285.Store((di + 10u) * 4 + 8, asuint(roff));
                     break;
                 }
                 case 5u:
@@ -223,7 +253,7 @@
             {
                 path_ix = m.path_ix;
             }
-            _284.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
+            _285.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
         }
     }
 }
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
index a8516ae..c11e21b 100644
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -124,7 +124,7 @@
 DrawMonoid map_tag(thread const uint& tag_word)
 {
     uint has_path = uint(tag_word != 0u);
-    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
 }
 
 static inline __attribute__((always_inline))
@@ -144,19 +144,19 @@
     return DrawMonoid{ 0u, 0u, 0u, 0u };
 }
 
-kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92 [[buffer(1)]], const device SceneBuf& _102 [[buffer(2)]], const device ParentBuf& _202 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& _285 [[buffer(0)]], const device ConfigBuf& _93 [[buffer(1)]], const device SceneBuf& _103 [[buffer(2)]], const device ParentBuf& _203 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _92.conf.drawtag_offset >> uint(2);
-    uint tag_word = _102.scene[drawtag_base + ix];
+    uint drawtag_base = _93.conf.drawtag_offset >> uint(2);
+    uint tag_word = _103.scene[drawtag_base + ix];
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
     spvUnsafeArray<DrawMonoid, 8> local;
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
     {
-        tag_word = _102.scene[(drawtag_base + ix) + i];
+        tag_word = _103.scene[(drawtag_base + ix) + i];
         uint param_1 = tag_word;
         DrawMonoid param_2 = agg;
         DrawMonoid param_3 = map_tag(param_1);
@@ -181,11 +181,11 @@
     DrawMonoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _205 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _202.parent[_205].path_ix;
-        row.clip_ix = _202.parent[_205].clip_ix;
-        row.scene_offset = _202.parent[_205].scene_offset;
-        row.info_offset = _202.parent[_205].info_offset;
+        uint _206 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _203.parent[_206].path_ix;
+        row.clip_ix = _203.parent[_206].clip_ix;
+        row.scene_offset = _203.parent[_206].scene_offset;
+        row.info_offset = _203.parent[_206].info_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -193,13 +193,15 @@
         DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
         row = combine_draw_monoid(param_6, param_7);
     }
-    uint drawdata_base = _92.conf.drawdata_offset >> uint(2);
-    uint drawinfo_base = _92.conf.drawinfo_alloc.offset >> uint(2);
+    uint drawdata_base = _93.conf.drawdata_offset >> uint(2);
+    uint drawinfo_base = _93.conf.drawinfo_alloc.offset >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_92.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
-    uint clip_out_base = _92.conf.clip_alloc.offset >> uint(2);
+    uint out_base = (_93.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _93.conf.clip_alloc.offset >> uint(2);
     float4 mat;
     float2 translate;
+    float2 p0;
+    float2 p1;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
@@ -209,31 +211,31 @@
             DrawMonoid param_9 = local[i_2 - 1u];
             m = combine_draw_monoid(param_8, param_9);
         }
-        _284.memory[out_base + (i_2 * 4u)] = m.path_ix;
-        _284.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
-        _284.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
-        _284.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
+        _285.memory[out_base + (i_2 * 4u)] = m.path_ix;
+        _285.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
+        _285.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
+        _285.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
         uint dd = drawdata_base + (m.scene_offset >> uint(2));
         uint di = drawinfo_base + (m.info_offset >> uint(2));
-        tag_word = _102.scene[(drawtag_base + ix) + i_2];
-        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
+        tag_word = _103.scene[(drawtag_base + ix) + i_2];
+        if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_92.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_284.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(_284.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(_284.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(_284.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_93.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_285.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(_285.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(_285.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(_285.memory[bbox_offset + 3u]) - 32768.0;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(_284.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(_285.memory[bbox_offset + 4u]);
             uint fill_mode = uint(linewidth >= 0.0);
-            if ((linewidth >= 0.0) || (tag_word == 276u))
+            if (((linewidth >= 0.0) || (tag_word == 276u)) || (tag_word == 732u))
             {
-                uint trans_ix = _284.memory[bbox_offset + 5u];
-                uint t = (_92.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(_284.memory[t], _284.memory[t + 1u], _284.memory[t + 2u], _284.memory[t + 3u]));
-                if (tag_word == 276u)
+                uint trans_ix = _285.memory[bbox_offset + 5u];
+                uint t = (_93.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(_285.memory[t], _285.memory[t + 1u], _285.memory[t + 2u], _285.memory[t + 3u]));
+                if ((tag_word == 276u) || (tag_word == 732u))
                 {
-                    translate = as_type<float2>(uint2(_284.memory[t + 4u], _284.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(_285.memory[t + 4u], _285.memory[t + 5u]));
                 }
             }
             if (linewidth >= 0.0)
@@ -245,15 +247,14 @@
                 case 68u:
                 case 72u:
                 {
-                    _284.memory[di] = as_type<uint>(linewidth);
+                    _285.memory[di] = as_type<uint>(linewidth);
                     break;
                 }
                 case 276u:
                 {
-                    _284.memory[di] = as_type<uint>(linewidth);
-                    uint index = _102.scene[dd];
-                    float2 p0 = as_type<float2>(uint2(_102.scene[dd + 1u], _102.scene[dd + 2u]));
-                    float2 p1 = as_type<float2>(uint2(_102.scene[dd + 3u], _102.scene[dd + 4u]));
+                    _285.memory[di] = as_type<uint>(linewidth);
+                    p0 = as_type<float2>(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u]));
+                    p1 = as_type<float2>(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u]));
                     p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
                     p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                     float2 dxy = p1 - p0;
@@ -261,9 +262,38 @@
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
                     float line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    _284.memory[di + 1u] = as_type<uint>(line_x);
-                    _284.memory[di + 2u] = as_type<uint>(line_y);
-                    _284.memory[di + 3u] = as_type<uint>(line_c);
+                    _285.memory[di + 1u] = as_type<uint>(line_x);
+                    _285.memory[di + 2u] = as_type<uint>(line_y);
+                    _285.memory[di + 3u] = as_type<uint>(line_c);
+                    break;
+                }
+                case 732u:
+                {
+                    p0 = as_type<float2>(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u]));
+                    p1 = as_type<float2>(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u]));
+                    float r0 = as_type<float>(_103.scene[dd + 5u]);
+                    float r1 = as_type<float>(_103.scene[dd + 6u]);
+                    float inv_det = 1.0 / ((mat.x * mat.w) - (mat.y * mat.z));
+                    float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det;
+                    float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y);
+                    inv_tr += p0;
+                    float2 center1 = p1 - p0;
+                    float rr = r1 / (r1 - r0);
+                    float rainv = rr / ((r1 * r1) - dot(center1, center1));
+                    float2 c1 = center1 * rainv;
+                    float ra = rr * rainv;
+                    float roff = rr - 1.0;
+                    _285.memory[di] = as_type<uint>(linewidth);
+                    _285.memory[di + 1u] = as_type<uint>(inv_mat.x);
+                    _285.memory[di + 2u] = as_type<uint>(inv_mat.y);
+                    _285.memory[di + 3u] = as_type<uint>(inv_mat.z);
+                    _285.memory[di + 4u] = as_type<uint>(inv_mat.w);
+                    _285.memory[di + 5u] = as_type<uint>(inv_tr.x);
+                    _285.memory[di + 6u] = as_type<uint>(inv_tr.y);
+                    _285.memory[di + 7u] = as_type<uint>(c1.x);
+                    _285.memory[di + 8u] = as_type<uint>(c1.y);
+                    _285.memory[di + 9u] = as_type<uint>(ra);
+                    _285.memory[di + 10u] = as_type<uint>(roff);
                     break;
                 }
                 case 5u:
@@ -279,7 +309,7 @@
             {
                 path_ix = m.path_ix;
             }
-            _284.memory[clip_out_base + m.clip_ix] = path_ix;
+            _285.memory[clip_out_base + m.clip_ix] = path_ix;
         }
     }
 }
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
index d18b287..58dde43 100644
--- a/piet-gpu/shader/gen/draw_leaf.spv
+++ b/piet-gpu/shader/gen/draw_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
index 4df0ec5..be69aad 100644
--- a/piet-gpu/shader/gen/draw_reduce.dxil
+++ b/piet-gpu/shader/gen/draw_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
index 7220b7e..8311155 100644
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -44,10 +44,10 @@
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-ByteAddressBuffer _86 : register(t1, space0);
-ByteAddressBuffer _96 : register(t2, space0);
-RWByteAddressBuffer _187 : register(u3, space0);
-RWByteAddressBuffer _205 : register(u0, space0);
+ByteAddressBuffer _87 : register(t1, space0);
+ByteAddressBuffer _97 : register(t2, space0);
+RWByteAddressBuffer _188 : register(u3, space0);
+RWByteAddressBuffer _206 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -64,8 +64,8 @@
 DrawMonoid map_tag(uint tag_word)
 {
     uint has_path = uint(tag_word != 0u);
-    DrawMonoid _69 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
-    return _69;
+    DrawMonoid _70 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
+    return _70;
 }
 
 DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
@@ -81,13 +81,13 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _86.Load(100) >> uint(2);
-    uint tag_word = _96.Load((drawtag_base + ix) * 4 + 0);
+    uint drawtag_base = _87.Load(100) >> uint(2);
+    uint tag_word = _97.Load((drawtag_base + ix) * 4 + 0);
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
     for (uint i = 1u; i < 8u; i++)
     {
-        uint tag_word_1 = _96.Load(((drawtag_base + ix) + i) * 4 + 0);
+        uint tag_word_1 = _97.Load(((drawtag_base + ix) + i) * 4 + 0);
         uint param_1 = tag_word_1;
         DrawMonoid param_2 = agg;
         DrawMonoid param_3 = map_tag(param_1);
@@ -109,10 +109,10 @@
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _187.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix);
-        _187.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
-        _187.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
-        _187.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
+        _188.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix);
+        _188.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
+        _188.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
+        _188.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl
index 8e409a8..759267c 100644
--- a/piet-gpu/shader/gen/draw_reduce.msl
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@@ -85,7 +85,7 @@
 DrawMonoid map_tag(thread const uint& tag_word)
 {
     uint has_path = uint(tag_word != 0u);
-    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
 }
 
 static inline __attribute__((always_inline))
@@ -99,17 +99,17 @@
     return c;
 }
 
-kernel void main0(const device ConfigBuf& _86 [[buffer(1)]], const device SceneBuf& _96 [[buffer(2)]], device OutBuf& _187 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device ConfigBuf& _87 [[buffer(1)]], const device SceneBuf& _97 [[buffer(2)]], device OutBuf& _188 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _86.conf.drawtag_offset >> uint(2);
-    uint tag_word = _96.scene[drawtag_base + ix];
+    uint drawtag_base = _87.conf.drawtag_offset >> uint(2);
+    uint tag_word = _97.scene[drawtag_base + ix];
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
     for (uint i = 1u; i < 8u; i++)
     {
-        uint tag_word_1 = _96.scene[(drawtag_base + ix) + i];
+        uint tag_word_1 = _97.scene[(drawtag_base + ix) + i];
         uint param_1 = tag_word_1;
         DrawMonoid param_2 = agg;
         DrawMonoid param_3 = map_tag(param_1);
@@ -131,10 +131,10 @@
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _187.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
-        _187.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
-        _187.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
-        _187.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
+        _188.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+        _188.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+        _188.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
+        _188.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv
index 4daf43a..d6c6fb7 100644
--- a/piet-gpu/shader/gen/draw_reduce.spv
+++ b/piet-gpu/shader/gen/draw_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index c0c27c9..e6eccc1 100644
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index f17b240..92fe05b 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -48,6 +48,21 @@
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -146,8 +161,8 @@
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _278 : register(u0, space0);
-ByteAddressBuffer _1521 : register(t1, space0);
+RWByteAddressBuffer _291 : register(u0, space0);
+ByteAddressBuffer _1666 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@@ -174,8 +189,8 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _291 = { a.offset + offset };
-    return _291;
+    Alloc _304 = { a.offset + offset };
+    return _304;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -191,7 +206,7 @@
     {
         return 0u;
     }
-    uint v = _278.Load(offset * 4 + 8);
+    uint v = _291.Load(offset * 4 + 8);
     return v;
 }
 
@@ -200,8 +215,8 @@
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _525 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _525;
+    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _663;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -221,9 +236,9 @@
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _542 = { ref.offset + 4u };
+    CmdStrokeRef _679 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _542;
+    CmdStrokeRef param_1 = _679;
     return CmdStroke_read(param, param_1);
 }
 
@@ -259,8 +274,8 @@
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _675 = { raw5 };
-    s.next = _675;
+    TileSegRef _820 = { raw5 };
+    s.next = _820;
     return s;
 }
 
@@ -286,9 +301,9 @@
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _532 = { ref.offset + 4u };
+    CmdFillRef _669 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _532;
+    CmdFillRef param_1 = _669;
     return CmdFill_read(param, param_1);
 }
 
@@ -305,9 +320,9 @@
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _552 = { ref.offset + 4u };
+    CmdAlphaRef _689 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _552;
+    CmdAlphaRef param_1 = _689;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -324,9 +339,9 @@
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _562 = { ref.offset + 4u };
+    CmdColorRef _699 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _562;
+    CmdColorRef param_1 = _699;
     return CmdColor_read(param, param_1);
 }
 
@@ -370,12 +385,66 @@
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _572 = { ref.offset + 4u };
+    CmdLinGradRef _709 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _572;
+    CmdLinGradRef param_1 = _709;
     return CmdLinGrad_read(param, param_1);
 }
 
+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4));
+    s.xlat = float2(asfloat(raw5), asfloat(raw6));
+    s.c1 = float2(asfloat(raw7), asfloat(raw8));
+    s.ra = asfloat(raw9);
+    s.roff = asfloat(raw10);
+    return s;
+}
+
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
+{
+    CmdRadGradRef _719 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdRadGradRef param_1 = _719;
+    return CmdRadGrad_read(param, param_1);
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -393,9 +462,9 @@
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _582 = { ref.offset + 4u };
+    CmdImageRef _729 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _582;
+    CmdImageRef param_1 = _729;
     return CmdImage_read(param, param_1);
 }
 
@@ -408,10 +477,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -445,9 +514,9 @@
 
 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _592 = { ref.offset + 4u };
+    CmdEndClipRef _739 = { ref.offset + 4u };
     Alloc param = a;
-    CmdEndClipRef param_1 = _592;
+    CmdEndClipRef param_1 = _739;
     return CmdEndClip_read(param, param_1);
 }
 
@@ -637,8 +706,8 @@
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -726,9 +795,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -738,9 +807,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -877,24 +946,24 @@
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _602 = { ref.offset + 4u };
+    CmdJumpRef _749 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _602;
+    CmdJumpRef param_1 = _749;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1536;
-    _1536.offset = _1521.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1681;
+    _1681.offset = _1666.Load(24);
     Alloc param;
-    param.offset = _1536.offset;
+    param.offset = _1681.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1545 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1545;
+    CmdRef _1690 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1690;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -903,7 +972,7 @@
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _278.Load(4) == 0u;
+    bool mem_ok = _291.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -928,8 +997,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1638 = { stroke.tile_ref };
-                tile_seg_ref = _1638;
+                TileSegRef _1784 = { stroke.tile_ref };
+                tile_seg_ref = _1784;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -965,8 +1034,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1758 = { fill.tile_ref };
-                tile_seg_ref = _1758;
+                TileSegRef _1904 = { fill.tile_ref };
+                tile_seg_ref = _1904;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1055,11 +1124,12 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
                 cmd_ref.offset += 20u;
                 break;
@@ -1068,74 +1138,100 @@
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                float4 _2121[8];
-                fillImage(_2121, param_32, param_33);
-                float4 img[8] = _2121;
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
+                    float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                 break;
             }
             case 8u:
             {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                float4 _2391[8];
+                fillImage(_2391, param_36, param_37);
+                float4 img[8] = _2391;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                {
                     uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = 0.0f.xxxx;
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
                 break;
             }
-            case 9u:
+            case 10u:
             {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = lerp(_2251.xyz, blend, float((_2255 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
-            case 10u:
+            case 11u:
             {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                CmdRef _2299 = { Cmd_Jump_read(param_46, param_47).new_ref };
-                cmd_ref = _2299;
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
+                cmd_ref = _2569;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1143,9 +1239,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_48 = i_1;
-        float3 param_49 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
+        uint param_52 = i_1;
+        float3 param_53 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_52))] = float4(tosRGB(param_53), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index c1f41af..6489563 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -94,6 +94,21 @@
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -222,7 +237,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -230,29 +245,29 @@
     {
         return 0u;
     }
-    uint v = v_278.memory[offset];
+    uint v = v_291.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_278);
+    uint tag_and_flags = read_mem(param, param_1, v_291);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -260,11 +275,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_278);
+    return CmdStroke_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -276,27 +291,27 @@
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_278);
+    uint raw4 = read_mem(param_8, param_9, v_291);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_278);
+    uint raw5 = read_mem(param_10, param_11, v_291);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -312,15 +327,15 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -328,51 +343,51 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_278);
+    return CmdFill_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_278);
+    return CmdAlpha_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_278);
+    return CmdColor_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -393,21 +408,21 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -417,23 +432,78 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_278);
+    return CmdLinGrad_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_291);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_291);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_291);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_291);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_291);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15, v_291);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17, v_291);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19, v_291);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21, v_291);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
+    s.xlat = float2(as_type<float>(raw5), as_type<float>(raw6));
+    s.c1 = float2(as_type<float>(raw7), as_type<float>(raw8));
+    s.ra = as_type<float>(raw9);
+    s.roff = as_type<float>(raw10);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+{
+    Alloc param = a;
+    CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
+    return CmdRadGrad_read(param, param_1, v_291);
+}
+
+static inline __attribute__((always_inline))
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_291);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -441,11 +511,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_278);
+    return CmdImage_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -458,10 +528,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -485,23 +555,23 @@
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdEndClip s;
     s.blend = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_278);
+    return CmdEndClip_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -701,8 +771,8 @@
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }
 
 static inline __attribute__((always_inline))
@@ -791,9 +861,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -803,9 +873,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -931,30 +1001,30 @@
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_278);
+    return CmdJump_read(param, param_1, v_291);
 }
 
-kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1521.conf.ptcl_alloc.offset;
+    param.offset = _1666.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -967,7 +1037,7 @@
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_278.mem_error == 0u;
+    bool mem_ok = v_291.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -976,7 +1046,7 @@
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
         if (tag == 0u)
         {
             break;
@@ -987,7 +1057,7 @@
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -1000,7 +1070,7 @@
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -1023,7 +1093,7 @@
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -1036,7 +1106,7 @@
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -1080,7 +1150,7 @@
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -1092,7 +1162,7 @@
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -1107,7 +1177,7 @@
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -1117,11 +1187,12 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
                 cmd_ref.offset += 20u;
                 break;
@@ -1130,72 +1201,98 @@
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                spvUnsafeArray<float4, 8> img;
-                img = fillImage(param_32, param_33, image_atlas);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
+                    float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                 break;
             }
             case 8u:
             {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                spvUnsafeArray<float4, 8> img;
+                img = fillImage(param_36, param_37, image_atlas);
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                {
                     uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = float4(0.0);
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
                 break;
             }
-            case 9u:
+            case 10u:
             {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
-            case 10u:
+            case 11u:
             {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1203,9 +1300,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_48 = i_1;
-        float3 param_49 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
+        uint param_52 = i_1;
+        float3 param_53 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_53), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index 91272da..7061263 100644
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index 18c4b7e..046045f 100644
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index de95771..019a73c 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -48,6 +48,21 @@
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -146,8 +161,8 @@
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _278 : register(u0, space0);
-ByteAddressBuffer _1521 : register(t1, space0);
+RWByteAddressBuffer _291 : register(u0, space0);
+ByteAddressBuffer _1666 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@@ -174,8 +189,8 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _291 = { a.offset + offset };
-    return _291;
+    Alloc _304 = { a.offset + offset };
+    return _304;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -191,7 +206,7 @@
     {
         return 0u;
     }
-    uint v = _278.Load(offset * 4 + 8);
+    uint v = _291.Load(offset * 4 + 8);
     return v;
 }
 
@@ -200,8 +215,8 @@
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _525 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _525;
+    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _663;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -221,9 +236,9 @@
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _542 = { ref.offset + 4u };
+    CmdStrokeRef _679 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _542;
+    CmdStrokeRef param_1 = _679;
     return CmdStroke_read(param, param_1);
 }
 
@@ -259,8 +274,8 @@
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _675 = { raw5 };
-    s.next = _675;
+    TileSegRef _820 = { raw5 };
+    s.next = _820;
     return s;
 }
 
@@ -286,9 +301,9 @@
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _532 = { ref.offset + 4u };
+    CmdFillRef _669 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _532;
+    CmdFillRef param_1 = _669;
     return CmdFill_read(param, param_1);
 }
 
@@ -305,9 +320,9 @@
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _552 = { ref.offset + 4u };
+    CmdAlphaRef _689 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _552;
+    CmdAlphaRef param_1 = _689;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -324,9 +339,9 @@
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _562 = { ref.offset + 4u };
+    CmdColorRef _699 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _562;
+    CmdColorRef param_1 = _699;
     return CmdColor_read(param, param_1);
 }
 
@@ -370,12 +385,66 @@
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _572 = { ref.offset + 4u };
+    CmdLinGradRef _709 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _572;
+    CmdLinGradRef param_1 = _709;
     return CmdLinGrad_read(param, param_1);
 }
 
+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4));
+    s.xlat = float2(asfloat(raw5), asfloat(raw6));
+    s.c1 = float2(asfloat(raw7), asfloat(raw8));
+    s.ra = asfloat(raw9);
+    s.roff = asfloat(raw10);
+    return s;
+}
+
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
+{
+    CmdRadGradRef _719 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdRadGradRef param_1 = _719;
+    return CmdRadGrad_read(param, param_1);
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -393,9 +462,9 @@
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _582 = { ref.offset + 4u };
+    CmdImageRef _729 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _582;
+    CmdImageRef param_1 = _729;
     return CmdImage_read(param, param_1);
 }
 
@@ -408,10 +477,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -445,9 +514,9 @@
 
 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _592 = { ref.offset + 4u };
+    CmdEndClipRef _739 = { ref.offset + 4u };
     Alloc param = a;
-    CmdEndClipRef param_1 = _592;
+    CmdEndClipRef param_1 = _739;
     return CmdEndClip_read(param, param_1);
 }
 
@@ -637,8 +706,8 @@
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -726,9 +795,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -738,9 +807,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -877,24 +946,24 @@
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _602 = { ref.offset + 4u };
+    CmdJumpRef _749 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _602;
+    CmdJumpRef param_1 = _749;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1536;
-    _1536.offset = _1521.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1681;
+    _1681.offset = _1666.Load(24);
     Alloc param;
-    param.offset = _1536.offset;
+    param.offset = _1681.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1545 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1545;
+    CmdRef _1690 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1690;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -903,7 +972,7 @@
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _278.Load(4) == 0u;
+    bool mem_ok = _291.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -928,8 +997,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1638 = { stroke.tile_ref };
-                tile_seg_ref = _1638;
+                TileSegRef _1784 = { stroke.tile_ref };
+                tile_seg_ref = _1784;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -965,8 +1034,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1758 = { fill.tile_ref };
-                tile_seg_ref = _1758;
+                TileSegRef _1904 = { fill.tile_ref };
+                tile_seg_ref = _1904;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1055,11 +1124,12 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
                 cmd_ref.offset += 20u;
                 break;
@@ -1068,74 +1138,100 @@
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                float4 _2121[8];
-                fillImage(_2121, param_32, param_33);
-                float4 img[8] = _2121;
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
+                    float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                 break;
             }
             case 8u:
             {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                float4 _2391[8];
+                fillImage(_2391, param_36, param_37);
+                float4 img[8] = _2391;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                {
                     uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = 0.0f.xxxx;
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
                 break;
             }
-            case 9u:
+            case 10u:
             {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = lerp(_2251.xyz, blend, float((_2255 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
-            case 10u:
+            case 11u:
             {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                CmdRef _2299 = { Cmd_Jump_read(param_46, param_47).new_ref };
-                cmd_ref = _2299;
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
+                cmd_ref = _2569;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1143,8 +1239,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_48 = i_1;
-        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
+        uint param_52 = i_1;
+        image[int2(xy_uint + chunk_offset(param_52))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 5128e99..6402c6f 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -94,6 +94,21 @@
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -222,7 +237,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -230,29 +245,29 @@
     {
         return 0u;
     }
-    uint v = v_278.memory[offset];
+    uint v = v_291.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_278);
+    uint tag_and_flags = read_mem(param, param_1, v_291);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -260,11 +275,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_278);
+    return CmdStroke_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -276,27 +291,27 @@
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_278);
+    uint raw4 = read_mem(param_8, param_9, v_291);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_278);
+    uint raw5 = read_mem(param_10, param_11, v_291);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -312,15 +327,15 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -328,51 +343,51 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_278);
+    return CmdFill_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_278);
+    return CmdAlpha_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_278);
+    return CmdColor_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -393,21 +408,21 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -417,23 +432,78 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_278);
+    return CmdLinGrad_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_291);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_291);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_291);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_291);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_291);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15, v_291);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17, v_291);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19, v_291);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21, v_291);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
+    s.xlat = float2(as_type<float>(raw5), as_type<float>(raw6));
+    s.c1 = float2(as_type<float>(raw7), as_type<float>(raw8));
+    s.ra = as_type<float>(raw9);
+    s.roff = as_type<float>(raw10);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+{
+    Alloc param = a;
+    CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
+    return CmdRadGrad_read(param, param_1, v_291);
+}
+
+static inline __attribute__((always_inline))
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_291);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -441,11 +511,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_278);
+    return CmdImage_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -458,10 +528,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -485,23 +555,23 @@
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdEndClip s;
     s.blend = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_278);
+    return CmdEndClip_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -701,8 +771,8 @@
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }
 
 static inline __attribute__((always_inline))
@@ -791,9 +861,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -803,9 +873,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -931,30 +1001,30 @@
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_278);
+    return CmdJump_read(param, param_1, v_291);
 }
 
-kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1521.conf.ptcl_alloc.offset;
+    param.offset = _1666.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -967,7 +1037,7 @@
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_278.mem_error == 0u;
+    bool mem_ok = v_291.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -976,7 +1046,7 @@
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
         if (tag == 0u)
         {
             break;
@@ -987,7 +1057,7 @@
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -1000,7 +1070,7 @@
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -1023,7 +1093,7 @@
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -1036,7 +1106,7 @@
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -1080,7 +1150,7 @@
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -1092,7 +1162,7 @@
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -1107,7 +1177,7 @@
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -1117,11 +1187,12 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
                 cmd_ref.offset += 20u;
                 break;
@@ -1130,72 +1201,98 @@
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                spvUnsafeArray<float4, 8> img;
-                img = fillImage(param_32, param_33, image_atlas);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
+                    float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                 break;
             }
             case 8u:
             {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                spvUnsafeArray<float4, 8> img;
+                img = fillImage(param_36, param_37, image_atlas);
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                {
                     uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = float4(0.0);
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
                 break;
             }
-            case 9u:
+            case 10u:
             {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
-            case 10u:
+            case 11u:
             {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1203,8 +1300,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_48 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
+        uint param_52 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 791b76c..4633401 100644
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index a97715a..c49e2fa 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -192,10 +192,27 @@
                 int x = int(round(clamp(my_d, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
                 mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(lin.index)));
                 fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
-                rgba[k] = fg_rgba;
+                mediump vec4 fg_k = fg_rgba * area[k];
+                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
             }
             cmd_ref.offset += 4 + CmdLinGrad_size;
             break;
+        case Cmd_RadGrad:
+            CmdRadGrad rad = Cmd_RadGrad_read(cmd_alloc, cmd_ref);
+            for (uint k = 0; k < CHUNK; k++) {
+                vec2 my_xy = xy + vec2(chunk_offset(k));
+                my_xy = rad.mat.xz * my_xy.x + rad.mat.yw * my_xy.y - rad.xlat;
+                float ba = dot(my_xy, rad.c1);
+                float ca = rad.ra * dot(my_xy, my_xy);
+                float t = sqrt(ba * ba  + ca) - ba - rad.roff;
+                int x = int(round(clamp(t, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
+                mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(rad.index)));
+                fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
+                mediump vec4 fg_k = fg_rgba * area[k];
+                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
+            }
+            cmd_ref.offset += 4 + CmdRadGrad_size;
+            break;
         case Cmd_Image:
             CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref);
             mediump vec4 img[CHUNK] = fillImage(xy_uint, fill_img);
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index 9b9b341..54dcc9e 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -18,6 +18,10 @@
     uint offset;
 };
 
+struct CmdRadGradRef {
+    uint offset;
+};
+
 struct CmdImageRef {
     uint offset;
 };
@@ -83,6 +87,21 @@
     return CmdLinGradRef(ref.offset + index * CmdLinGrad_size);
 }
 
+struct CmdRadGrad {
+    uint index;
+    vec4 mat;
+    vec2 xlat;
+    vec2 c1;
+    float ra;
+    float roff;
+};
+
+#define CmdRadGrad_size 44
+
+CmdRadGradRef CmdRadGrad_index(CmdRadGradRef ref, uint index) {
+    return CmdRadGradRef(ref.offset + index * CmdRadGrad_size);
+}
+
 struct CmdImage {
     uint index;
     ivec2 offset;
@@ -131,11 +150,12 @@
 #define Cmd_Alpha 4
 #define Cmd_Color 5
 #define Cmd_LinGrad 6
-#define Cmd_Image 7
-#define Cmd_BeginClip 8
-#define Cmd_EndClip 9
-#define Cmd_Jump 10
-#define Cmd_size 20
+#define Cmd_RadGrad 7
+#define Cmd_Image 8
+#define Cmd_BeginClip 9
+#define Cmd_EndClip 10
+#define Cmd_Jump 11
+#define Cmd_size 48
 
 CmdRef Cmd_index(CmdRef ref, uint index) {
     return CmdRef(ref.offset + index * Cmd_size);
@@ -213,6 +233,44 @@
     write_mem(a, ix + 3, floatBitsToUint(s.line_c));
 }
 
+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
+    uint raw6 = read_mem(a, ix + 6);
+    uint raw7 = read_mem(a, ix + 7);
+    uint raw8 = read_mem(a, ix + 8);
+    uint raw9 = read_mem(a, ix + 9);
+    uint raw10 = read_mem(a, ix + 10);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
+    s.xlat = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
+    s.c1 = vec2(uintBitsToFloat(raw7), uintBitsToFloat(raw8));
+    s.ra = uintBitsToFloat(raw9);
+    s.roff = uintBitsToFloat(raw10);
+    return s;
+}
+
+void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s) {
+    uint ix = ref.offset >> 2;
+    write_mem(a, ix + 0, s.index);
+    write_mem(a, ix + 1, floatBitsToUint(s.mat.x));
+    write_mem(a, ix + 2, floatBitsToUint(s.mat.y));
+    write_mem(a, ix + 3, floatBitsToUint(s.mat.z));
+    write_mem(a, ix + 4, floatBitsToUint(s.mat.w));
+    write_mem(a, ix + 5, floatBitsToUint(s.xlat.x));
+    write_mem(a, ix + 6, floatBitsToUint(s.xlat.y));
+    write_mem(a, ix + 7, floatBitsToUint(s.c1.x));
+    write_mem(a, ix + 8, floatBitsToUint(s.c1.y));
+    write_mem(a, ix + 9, floatBitsToUint(s.ra));
+    write_mem(a, ix + 10, floatBitsToUint(s.roff));
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = read_mem(a, ix + 0);
@@ -293,6 +351,10 @@
     return CmdLinGrad_read(a, CmdLinGradRef(ref.offset + 4));
 }
 
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref) {
+    return CmdRadGrad_read(a, CmdRadGradRef(ref.offset + 4));
+}
+
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref) {
     return CmdImage_read(a, CmdImageRef(ref.offset + 4));
 }
@@ -338,6 +400,11 @@
     CmdLinGrad_write(a, CmdLinGradRef(ref.offset + 4), s);
 }
 
+void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) {
+    write_mem(a, ref.offset >> 2, Cmd_RadGrad);
+    CmdRadGrad_write(a, CmdRadGradRef(ref.offset + 4), s);
+}
+
 void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) {
     write_mem(a, ref.offset >> 2, Cmd_Image);
     CmdImage_write(a, CmdImageRef(ref.offset + 4), s);
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index 62c59c4..2f4b85e 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -62,6 +62,7 @@
 // Tags for draw objects. See shader/drawtag.h for the authoritative source.
 const DRAWTAG_FILLCOLOR: u32 = 0x44;
 const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
+const DRAWTAG_FILLRADGRADIENT: u32 = 0x2dc;
 const DRAWTAG_BEGINCLIP: u32 = 0x05;
 const DRAWTAG_ENDCLIP: u32 = 0x25;
 
@@ -79,6 +80,16 @@
     p1: [f32; 2],
 }
 
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillRadGradient {
+    index: u32,
+    p0: [f32; 2],
+    p1: [f32; 2],
+    r0: f32,
+    r1: f32,
+}
+
 #[allow(unused)]
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
@@ -123,6 +134,13 @@
         self.transform_stream.push(transform);
     }
 
+    // Swap the last two tags in the tag stream; used for transformed
+    // gradients.
+    pub fn swap_last_tags(&mut self) {
+        let len = self.tag_stream.len();
+        self.tag_stream.swap(len - 1, len - 2);
+    }
+
     // -1.0 means "fill"
     pub fn linewidth(&mut self, linewidth: f32) {
         self.tag_stream.push(0x40);
@@ -147,6 +165,16 @@
         self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
+
+    /// Encode a fill radial gradient draw object.
+    ///
+    /// This should be encoded after a path.
+    pub fn fill_rad_gradient(&mut self, index: u32, p0: [f32; 2], p1: [f32; 2], r0: f32, r1: f32) {
+        self.drawtag_stream.push(DRAWTAG_FILLRADGRADIENT);
+        let element = FillRadGradient { index, p0, p1, r0, r1 };
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
+    }
+    
     /// Start a clip.
     pub fn begin_clip(&mut self, blend: Option<Blend>) {
         self.drawtag_stream.push(DRAWTAG_BEGINCLIP);
@@ -220,7 +248,7 @@
         alloc += n_drawobj * DRAW_BBOX_SIZE;
         let drawinfo_alloc = alloc;
         // TODO: not optimized; it can be accumulated during encoding or summed from drawtags
-        const MAX_DRAWINFO_SIZE: usize = 16;
+        const MAX_DRAWINFO_SIZE: usize = 44;
         alloc += n_drawobj * MAX_DRAWINFO_SIZE;
 
         let config = Config {
diff --git a/piet-gpu/src/gradient.rs b/piet-gpu/src/gradient.rs
index 20982e9..e655908 100644
--- a/piet-gpu/src/gradient.rs
+++ b/piet-gpu/src/gradient.rs
@@ -18,15 +18,29 @@
 
 use std::collections::hash_map::{Entry, HashMap};
 
-use piet::{Color, FixedLinearGradient, GradientStop};
+use piet::kurbo::Point;
+use piet::{Color, FixedLinearGradient, GradientStop, FixedRadialGradient};
+
+/// Radial gradient compatible with COLRv1 spec
+#[derive(Debug, Clone)]
+pub struct Colrv1RadialGradient {
+    /// The center of the iner circle.
+    pub center0: Point,
+    /// The offset of the origin relative to the center.
+    pub center1: Point,
+    /// The radius of the inner circle.
+    pub radius0: f64,
+    /// The radius of the outer circle.
+    pub radius1: f64,
+    /// The stops.
+    pub stops: Vec<GradientStop>,
+}
 
 #[derive(Clone)]
 pub struct BakedGradient {
     ramp: Vec<u32>,
 }
 
-/// This is basically the same type as scene::FillLinGradient, so could
-/// potentially use that directly.
 #[derive(Clone)]
 pub struct LinearGradient {
     pub(crate) start: [f32; 2],
@@ -34,6 +48,15 @@
     pub(crate) ramp_id: u32,
 }
 
+#[derive(Clone)]
+pub struct RadialGradient {
+    pub(crate) start: [f32; 2],
+    pub(crate) end: [f32; 2],
+    pub(crate) r0: f32,
+    pub(crate) r1: f32,
+    pub(crate) ramp_id: u32,
+}
+
 #[derive(Default)]
 pub struct RampCache {
     ramps: Vec<GradientRamp>,
@@ -154,6 +177,28 @@
         }
     }
 
+    pub fn add_radial_gradient(&mut self, rad: &FixedRadialGradient) -> RadialGradient {
+        let ramp_id = self.add_ramp(&rad.stops);
+        RadialGradient {
+            ramp_id: ramp_id as u32,
+            start: crate::render_ctx::to_f32_2(rad.center + rad.origin_offset),
+            end: crate::render_ctx::to_f32_2(rad.center),
+            r0: 0.0,
+            r1: rad.radius as f32,
+        }
+    }
+
+    pub fn add_radial_gradient_colrv1(&mut self, rad: &Colrv1RadialGradient) -> RadialGradient {
+        let ramp_id = self.add_ramp(&rad.stops);
+        RadialGradient {
+            ramp_id: ramp_id as u32,
+            start: crate::render_ctx::to_f32_2(rad.center0),
+            end: crate::render_ctx::to_f32_2(rad.center1),
+            r0: rad.radius0 as f32,
+            r1: rad.radius1 as f32,
+        }
+    }
+
     /// Dump the contents of a gradient. This is for debugging.
     #[allow(unused)]
     pub(crate) fn dump_gradient(&self, lin: &LinearGradient) {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index e12f824..475d723 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -12,6 +12,7 @@
 
 pub use blend::{Blend, BlendMode, CompositionMode};
 pub use render_ctx::PietGpuRenderContext;
+pub use gradient::Colrv1RadialGradient;
 
 use piet::kurbo::Vec2;
 use piet::{ImageFormat, RenderContext};
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 024dd2b..dca03eb 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -13,7 +13,7 @@
 use piet_gpu_types::encoder::{Encode, Encoder};
 use piet_gpu_types::scene::Element;
 
-use crate::gradient::{LinearGradient, RampCache};
+use crate::gradient::{LinearGradient, RadialGradient, RampCache, Colrv1RadialGradient};
 use crate::text::Font;
 pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
 use crate::Blend;
@@ -50,6 +50,7 @@
 pub enum PietGpuBrush {
     Solid(u32),
     LinGradient(LinearGradient),
+    RadGradient(RadialGradient),
 }
 
 #[derive(Default)]
@@ -187,6 +188,10 @@
                 let lin = self.ramp_cache.add_linear_gradient(&lin);
                 Ok(PietGpuBrush::LinGradient(lin))
             }
+            FixedGradient::Radial(rad) => {
+                let rad = self.ramp_cache.add_radial_gradient(&rad);
+                Ok(PietGpuBrush::RadGradient(rad))
+            }
             _ => todo!("don't do radial gradients yet"),
         }
     }
@@ -338,6 +343,20 @@
         }
     }
 
+    pub fn radial_gradient_colrv1(&mut self, rad: &Colrv1RadialGradient) -> PietGpuBrush {
+        PietGpuBrush::RadGradient(self.ramp_cache.add_radial_gradient_colrv1(rad))
+    }
+
+    pub fn fill_transform(&mut self, shape: impl Shape, brush: &PietGpuBrush, transform: Affine) {
+        let path = shape.path_elements(TOLERANCE);
+        self.encode_linewidth(-1.0);
+        self.encode_path(path, true);
+        self.encode_transform(Transform::from_kurbo(transform));
+        self.new_encoder.swap_last_tags();
+        self.encode_brush(&brush);
+        self.encode_transform(Transform::from_kurbo(transform.inverse()));
+    }
+
     fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
         if is_fill {
             self.encode_path_inner(
@@ -420,6 +439,10 @@
                 self.new_encoder
                     .fill_lin_gradient(lin.ramp_id, lin.start, lin.end);
             }
+            PietGpuBrush::RadGradient(rad) => {
+                self.new_encoder
+                    .fill_rad_gradient(rad.ramp_id, rad.start, rad.end, rad.r0, rad.r1);
+            }
         }
     }
 }
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index ee5839d..cf5a50d 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -2,10 +2,10 @@
 
 use rand::{Rng, RngCore};
 
-use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext};
+use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext, Colrv1RadialGradient};
 use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
-    Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
+    Color, FixedGradient, FixedRadialGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
 };
 
 use crate::{PicoSvg, RenderContext, Vec2};
@@ -32,7 +32,7 @@
     println!("flattening and encoding time: {:?}", start.elapsed());
 }
 
-pub fn render_scene(rc: &mut impl RenderContext) {
+pub fn render_scene(rc: &mut PietGpuRenderContext) {
     const WIDTH: usize = 2048;
     const HEIGHT: usize = 1536;
     let mut rng = rand::thread_rng();
@@ -142,7 +142,7 @@
 }
 
 #[allow(unused)]
-fn render_gradient_test(rc: &mut impl RenderContext) {
+fn render_gradient_test(rc: &mut PietGpuRenderContext) {
     let stops = vec![
         GradientStop {
             color: Color::rgb8(0, 255, 0),
@@ -153,14 +153,18 @@
             pos: 1.0,
         },
     ];
-    let lin = FixedLinearGradient {
-        start: Point::new(0.0, 100.0),
-        end: Point::new(0.0, 300.0),
+    let rad = Colrv1RadialGradient {
+        center0: Point::new(200.0, 200.0),
+        center1: Point::new(250.0, 200.0),
+        radius0: 50.0,
+        radius1: 100.0,
         stops,
     };
-    let brush = FixedGradient::Linear(lin);
+    let brush = rc.radial_gradient_colrv1(&rad);
+    //let brush = FixedGradient::Radial(rad);
     //let brush = Color::rgb8(0, 128, 0);
-    rc.fill(Rect::new(100.0, 100.0, 300.0, 300.0), &brush);
+    let transform = Affine::new([1.0, 0.0, 0.0, 0.5, 0.0, 100.0]);
+    rc.fill_transform(Rect::new(100.0, 100.0, 300.0, 300.0), &brush, transform);
 }
 
 fn diamond(origin: Point) -> impl Shape {