Initial implementation of blend modes

* Add blend and composition mode enums to API
* Mirror these in the shaders
* Add new public blend function to PietGpuRenderContext that mirrors clip
* Plumb the modes through the pipeline from scene to kernel4
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index f12a1cf..3ca0742 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -57,16 +57,27 @@
         let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
 
         let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
+        let mut mode = 0usize;
 
         event_loop.run(move |event, _, control_flow| {
             *control_flow = ControlFlow::Poll; // `ControlFlow::Wait` if only re-render on event
 
             match event {
                 Event::WindowEvent { event, window_id } if window_id == window.id() => {
+                    use winit::event::{ElementState, VirtualKeyCode};
                     match event {
                         WindowEvent::CloseRequested => {
                             *control_flow = ControlFlow::Exit;
                         }
+                        WindowEvent::KeyboardInput { input, .. } => {
+                            if input.state == ElementState::Pressed {
+                                match input.virtual_keycode {
+                                    Some(VirtualKeyCode::Left) => mode = mode.wrapping_sub(1),
+                                    Some(VirtualKeyCode::Right) => mode = mode.wrapping_add(1),
+                                    _ => {}
+                                }
+                            }
+                        }
                         _ => (),
                     }
                 }
@@ -105,7 +116,41 @@
                         }
                         test_scenes::render_svg(&mut ctx, input, scale);
                     } else {
-                        test_scenes::render_anim_frame(&mut ctx, current_frame);
+                        use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
+                        let blends = [
+                            Blend::new(Normal, SrcOver),
+                            Blend::new(Multiply, SrcOver),
+                            Blend::new(Screen, SrcOver),
+                            Blend::new(Overlay, SrcOver),
+                            Blend::new(Darken, SrcOver),
+                            Blend::new(Lighten, SrcOver),
+                            Blend::new(ColorDodge, SrcOver),
+                            Blend::new(ColorBurn, SrcOver),
+                            Blend::new(HardLight, SrcOver),
+                            Blend::new(SoftLight, SrcOver),
+                            Blend::new(Difference, SrcOver),
+                            Blend::new(Exclusion, SrcOver),
+                            Blend::new(Hue, SrcOver),
+                            Blend::new(Saturation, SrcOver),
+                            Blend::new(Color, SrcOver),
+                            Blend::new(Luminosity, SrcOver),
+                            Blend::new(Normal, Clear),
+                            Blend::new(Normal, Copy),
+                            Blend::new(Normal, Dest),
+                            Blend::new(Normal, SrcOver),
+                            Blend::new(Normal, DestOver),
+                            Blend::new(Normal, SrcIn),
+                            Blend::new(Normal, DestIn),
+                            Blend::new(Normal, SrcOut),
+                            Blend::new(Normal, DestOut),
+                            Blend::new(Normal, SrcAtop),
+                            Blend::new(Normal, DestAtop),
+                            Blend::new(Normal, Xor),
+                            Blend::new(Normal, Plus),
+                        ];
+                        let blend = blends[mode % blends.len()];
+                        test_scenes::render_blend_test(&mut ctx, current_frame, blend);
+                        info_string = format!("{:?}", blend);
                     }
                     render_info_string(&mut ctx, &info_string);
                     if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h
index b833574..5a35088 100644
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@@ -69,9 +69,10 @@
 struct AnnoBeginClip {
     vec4 bbox;
     float linewidth;
+    uint blend;
 };
 
-#define AnnoBeginClip_size 20
+#define AnnoBeginClip_size 24
 
 AnnoBeginClipRef AnnoBeginClip_index(AnnoBeginClipRef ref, uint index) {
     return AnnoBeginClipRef(ref.offset + index * AnnoBeginClip_size);
@@ -79,9 +80,10 @@
 
 struct AnnoEndClip {
     vec4 bbox;
+    uint blend;
 };
 
-#define AnnoEndClip_size 16
+#define AnnoEndClip_size 20
 
 AnnoEndClipRef AnnoEndClip_index(AnnoEndClipRef ref, uint index) {
     return AnnoEndClipRef(ref.offset + index * AnnoEndClip_size);
@@ -198,9 +200,11 @@
     uint raw2 = read_mem(a, ix + 2);
     uint raw3 = read_mem(a, ix + 3);
     uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
     AnnoBeginClip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.linewidth = uintBitsToFloat(raw4);
+    s.blend = raw5;
     return s;
 }
 
@@ -211,6 +215,7 @@
     write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
     write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
     write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
+    write_mem(a, ix + 5, s.blend);
 }
 
 AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) {
@@ -219,8 +224,10 @@
     uint raw1 = read_mem(a, ix + 1);
     uint raw2 = read_mem(a, ix + 2);
     uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
     AnnoEndClip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.blend = raw4;
     return s;
 }
 
@@ -230,6 +237,7 @@
     write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
     write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
     write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
+    write_mem(a, ix + 4, s.blend);
 }
 
 AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) {
@@ -281,8 +289,8 @@
     AnnoBeginClip_write(a, AnnoBeginClipRef(ref.offset + 4), s);
 }
 
-void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s) {
-    write_mem(a, ref.offset >> 2, Annotated_EndClip);
+void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoEndClip s) {
+    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_EndClip);
     AnnoEndClip_write(a, AnnoEndClipRef(ref.offset + 4), s);
 }
 
diff --git a/piet-gpu/shader/blend.h b/piet-gpu/shader/blend.h
new file mode 100644
index 0000000..1ac4bd6
--- /dev/null
+++ b/piet-gpu/shader/blend.h
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Mode definitions and functions for blending and composition.
+
+#define Blend_Normal 0
+#define Blend_Multiply 1
+#define Blend_Screen 2
+#define Blend_Overlay 3
+#define Blend_Darken 4
+#define Blend_Lighten 5
+#define Blend_ColorDodge 6
+#define Blend_ColorBurn 7
+#define Blend_HardLight 8
+#define Blend_SoftLight 9
+#define Blend_Difference 10
+#define Blend_Exclusion 11
+#define Blend_Hue 12
+#define Blend_Saturation 13
+#define Blend_Color 14
+#define Blend_Luminosity 15
+
+vec3 screen(vec3 cb, vec3 cs) {
+	return cb + cs - (cb * cs);
+}
+
+float color_dodge(float cb, float cs) {
+    if (cb == 0.0)
+        return 0.0;
+    else if (cs == 1.0)
+        return 1.0;
+    else
+        return min(1.0, cb / (1.0 - cs));
+}
+
+float color_burn(float cb, float cs) {
+    if (cb == 1.0)
+        return 1.0;
+    else if (cs == 0.0)
+        return 0.0;
+    else
+        return 1.0 - min(1.0, (1.0 - cb) / cs);
+}
+
+vec3 hard_light(vec3 cb, vec3 cs) {
+	return mix(
+		screen(cb, 2.0 * cs - 1.0),
+		cb * 2.0 * cs, 
+		vec3(lessThanEqual(cs, vec3(0.5)))
+	);
+}
+
+vec3 soft_light(vec3 cb, vec3 cs) {
+	vec3 d = mix(
+		sqrt(cb),
+		((16.0 * cb - vec3(12.0)) * cb + vec3(4.0)) * cb,
+		vec3(lessThanEqual(cb, vec3(0.25)))
+	);
+	return mix(
+		cb + (2.0 * cs - vec3(1.0)) * (d - cb),
+		cb - (vec3(1.0) - 2.0 * cs) * cb * (vec3(1.0) - cb),
+		vec3(lessThanEqual(cs, vec3(0.5)))
+	);
+}
+
+float sat(vec3 c) {
+    return max(c.r, max(c.g, c.b)) - min(c.r, min(c.g, c.b));
+}
+
+float lum(vec3 c) {
+    vec3 f = vec3(0.3, 0.59, 0.11);
+    return dot(c, f);
+}
+
+vec3 clip_color(vec3 c) {
+    float L = lum(c);
+    float n = min(c.r, min(c.g, c.b));
+    float x = max(c.r, max(c.g, c.b));
+    if (n < 0.0)
+        c = L + (((c - L) * L) / (L - n));
+    if (x > 1.0)
+        c = L + (((c - L) * (1.0 - L)) / (x - L));
+    return c;
+}
+
+vec3 set_lum(vec3 c, float l) {
+    return clip_color(c + (l - lum(c)));
+}
+
+void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) {
+    if (cmax > cmin) {
+        cmid = (((cmid - cmin) * s) / (cmax - cmin));
+        cmax = s;
+    } else {
+        cmid = 0.0;
+        cmax = 0.0;
+    }
+    cmin = 0.0;
+}
+
+vec3 set_sat(vec3 c, float s) {
+    if (c.r <= c.g) {
+        if (c.g <= c.b) {
+            set_sat_inner(c.r, c.g, c.b, s);
+        } else {
+            if (c.r <= c.b) {
+                set_sat_inner(c.r, c.b, c.g, s);
+            } else {
+                set_sat_inner(c.b, c.r, c.g, s);
+            }
+        }
+    } else {
+        if (c.r <= c.b) {
+            set_sat_inner(c.g, c.r, c.b, s);
+        } else {
+            if (c.g <= c.b) {
+                set_sat_inner(c.g, c.b, c.r, s);
+            } else {
+                set_sat_inner(c.b, c.g, c.r, s);
+            }
+        }
+    }
+    return c;
+}
+
+vec3 mix_blend(vec3 cb, vec3 cs, uint mode) {
+	vec3 b = vec3(0.0);
+	switch (mode) {
+	case Blend_Multiply:
+		b = cb * cs;
+		break;
+	case Blend_Screen:
+		b = screen(cb, cs);
+		break;
+	case Blend_Overlay:
+		b = hard_light(cs, cb);
+		break;
+	case Blend_Darken:
+		b = min(cb, cs);
+		break;
+	case Blend_Lighten:
+		b = max(cb, cs);
+		break;
+	case Blend_ColorDodge:
+		b = vec3(color_dodge(cb.x, cs.x), color_dodge(cb.y, cs.y), color_dodge(cb.z, cs.z));
+		break;
+	case Blend_ColorBurn:
+		b = vec3(color_burn(cb.x, cs.x), color_burn(cb.y, cs.y), color_burn(cb.z, cs.z));
+		break;
+	case Blend_HardLight:
+		b = hard_light(cb, cs);
+		break;
+	case Blend_SoftLight:
+		b = soft_light(cb, cs);
+		break;
+	case Blend_Difference:
+		b = abs(cb - cs);
+		break;
+	case Blend_Exclusion:
+		b = cb + cs - 2 * cb * cs;
+		break;
+	case Blend_Hue:
+		b = set_lum(set_sat(cs, sat(cb)), lum(cb));
+		break;
+	case Blend_Saturation:
+		b = set_lum(set_sat(cb, sat(cs)), lum(cb));
+		break;
+	case Blend_Color:
+		b = set_lum(cs, lum(cb));
+		break;
+	case Blend_Luminosity:
+		b = set_lum(cb, lum(cs));
+		break;
+	default:
+		b = cs;
+		break;
+	}
+	return b;
+}
+
+#define Comp_Clear 0
+#define Comp_Copy 1
+#define Comp_Dest 2
+#define Comp_SrcOver 3
+#define Comp_DestOver 4
+#define Comp_SrcIn 5
+#define Comp_DestIn 6
+#define Comp_SrcOut 7
+#define Comp_DestOut 8
+#define Comp_SrcAtop 9
+#define Comp_DestAtop 10
+#define Comp_Xor 11
+#define Comp_Plus 12
+#define Comp_PlusDarker 13
+#define Comp_PlusLighter 14
+
+vec4 mix_compose(vec3 cb, vec3 cs, float ab, float as, uint mode) {
+	float fa = 0.0;
+	float fb = 0.0;
+	switch (mode) {
+	case Comp_Copy:
+		fa = 1.0;
+		fb = 0.0;
+		break;
+	case Comp_Dest:
+		fa = 0.0;
+		fb = 1.0;
+		break;
+	case Comp_SrcOver:
+		fa = 1.0;
+		fb = 1.0 - as;
+		break;
+	case Comp_DestOver:
+		fa = 1.0 - ab;
+		fb = 1.0;
+		break;
+	case Comp_SrcIn:
+		fa = ab;
+		fb = 0.0;
+		break;
+	case Comp_DestIn:
+		fa = 0.0;
+		fb = as;
+		break;
+	case Comp_SrcOut:
+		fa = 1.0 - ab;
+		fb = 0.0;
+		break;
+	case Comp_DestOut:
+		fa = 0.0;
+		fb = 1.0 - as;
+		break;
+	case Comp_SrcAtop:
+		fa = ab;
+		fb = 1.0 - as;
+		break;
+	case Comp_DestAtop:
+		fa = 1.0 - ab;
+		fb = as;
+		break;
+	case Comp_Xor:
+		fa = 1.0 - ab;
+		fb = 1.0 - as;
+		break;
+	case Comp_Plus:
+		fa = 1.0;
+		fb = 1.0;
+		break;
+	case Comp_PlusDarker:
+		return vec4(max(vec4(0.0), 1.0 - as * vec4(cs, as) + 1.0 - ab * vec4(cb, ab)).xyz, 
+			    max(0.0, 1.0 - as + 1.0 - ab));
+	case Comp_PlusLighter:
+		return vec4(min(vec4(1.0), as * vec4(cs, as) + ab * vec4(cb, ab)).xyz,
+			    min(1.0, as + ab));
+	default:
+		break;
+	}
+	return as * fa * vec4(cs, as) + ab * fb * vec4(cb, ab);
+}
+
+#define BlendComp_default (Blend_Normal << 8 | Comp_SrcOver)
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 8b9998f..ac4f3d7 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -53,7 +53,7 @@
 build gen/coarse.dxil: dxil gen/coarse.hlsl
 build gen/coarse.msl: msl gen/coarse.spv
 
-build gen/kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
+build gen/kernel4.spv: glsl kernel4.comp | blend.h ptcl.h setup.h
 build gen/kernel4.hlsl: hlsl gen/kernel4.spv
 build gen/kernel4.dxil: dxil gen/kernel4.hlsl
 build gen/kernel4.msl: msl gen/kernel4.spv
@@ -114,7 +114,7 @@
 build gen/draw_root.dxil: dxil gen/draw_root.hlsl
 build gen/draw_root.msl: msl gen/draw_root.spv
 
-build gen/draw_leaf.spv: glsl draw_leaf.comp | scene.h drawtag.h annotated.h setup.h mem.h
+build gen/draw_leaf.spv: glsl draw_leaf.comp | blend.h scene.h drawtag.h annotated.h setup.h mem.h
 build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
 build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
 build gen/draw_leaf.msl: msl gen/draw_leaf.spv
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 98ab270..df306e0 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -273,7 +273,8 @@
                 }
             }
             AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
-            uint tag = Annotated_tag(conf.anno_alloc, ref).tag;
+            AnnotatedTag anno_tag = Annotated_tag(conf.anno_alloc, ref);
+            uint tag = anno_tag.tag;
             uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + seq_ix % width;
@@ -287,7 +288,10 @@
                 // For draws, include the tile if it is solid.
                 // For clips, include the tile if it is empty - this way, logic
                 // below will suppress the drawing of inner elements.
-                include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip;
+                // For blends, include the tile if
+                // (blend_mode, composition_mode) != (Normal, SrcOver)
+                include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
+                    || (is_clip && (anno_tag.flags & 0x2) != 0);
             }
             if (include_tile) {
                 uint el_slice = el_ix / 32;
@@ -387,13 +391,14 @@
                     tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
                                      TileRef(sh_tile_base[element_ref_ix] +
                                              (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    AnnoEndClip end_clip = Annotated_EndClip_read(conf.anno_alloc, ref);
                     clip_depth--;
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
                     write_fill(cmd_alloc, cmd_ref, MODE_NONZERO, tile, 0.0);
-                    Cmd_EndClip_write(cmd_alloc, cmd_ref);
-                    cmd_ref.offset += 4;
+                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(end_clip.blend));
+                    cmd_ref.offset += 4 + CmdEndClip_size;
                     break;
                 }
             } else {
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
index f236b7f..74fc2f8 100644
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -28,6 +28,7 @@
 #include "tile.h"
 #include "drawtag.h"
 #include "annotated.h"
+#include "blend.h"
 
 #define Monoid DrawMonoid
 
@@ -149,17 +150,23 @@
                 Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
                 break;
             case Element_BeginClip:
+                Clip begin_clip = Element_BeginClip_read(this_ref);
                 AnnoBeginClip anno_begin_clip;
                 anno_begin_clip.bbox = bbox;
                 anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
-                Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip);
+                anno_begin_clip.blend = begin_clip.blend;
+                uint flags = uint(begin_clip.blend != BlendComp_default) << 1;
+                Annotated_BeginClip_write(conf.anno_alloc, out_ref, flags, anno_begin_clip);
                 break;
             }
         } else if (tag_word == Element_EndClip) {
+            Clip end_clip = Element_BeginClip_read(this_ref);
             AnnoEndClip anno_end_clip;
             // The actual bbox will be reconstructed from clip stream output.
             anno_end_clip.bbox = vec4(-1e9, -1e9, 1e9, 1e9);
-            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
+            anno_end_clip.blend = end_clip.blend;
+            uint flags = uint(end_clip.blend != BlendComp_default) << 1;
+            Annotated_EndClip_write(conf.anno_alloc, out_ref, flags, anno_end_clip);
         }
         // Generate clip stream.
         if (tag_word == Element_BeginClip || tag_word == Element_EndClip) {
diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl
index 0e3b6c8..3bf96da 100644
--- a/piet-gpu/shader/gen/binning.msl
+++ b/piet-gpu/shader/gen/binning.msl
@@ -220,7 +220,7 @@
 kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_94 [[buffer(0)]], const device ConfigBuf& v_202 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint bitmaps[8][256];
-    threadgroup short sh_alloc_failed;
+    threadgroup bool sh_alloc_failed;
     threadgroup uint count[8][256];
     threadgroup Alloc sh_chunk_alloc[256];
     constant uint& v_94BufferSize = spvBufferSizeConstants[0];
@@ -232,7 +232,7 @@
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        sh_alloc_failed = short(false);
+        sh_alloc_failed = false;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
@@ -331,7 +331,7 @@
         sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
         if (chunk.failed)
         {
-            sh_alloc_failed = short(true);
+            sh_alloc_failed = true;
         }
     }
     uint out_ix = (v_202.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
@@ -347,13 +347,13 @@
     write_mem(param_16, param_17, param_18, v_94, v_94BufferSize);
     threadgroup_barrier(mem_flags::mem_threadgroup);
     bool _687;
-    if (!bool(sh_alloc_failed))
+    if (!sh_alloc_failed)
     {
         _687 = v_94.mem_error != 0u;
     }
     else
     {
-        _687 = bool(sh_alloc_failed);
+        _687 = sh_alloc_failed;
     }
     if (_687)
     {
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index e5a0f0d..21bd30c 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -7,13 +7,6 @@
 
 using namespace metal;
 
-// Implementation of the GLSL findLSB() function
-template<typename T>
-inline T spvFindLSB(T x)
-{
-    return select(ctz(x), T(-1), x == T(0));
-}
-
 struct Alloc
 {
     uint offset;
@@ -65,6 +58,17 @@
     float line_c;
 };
 
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+    uint blend;
+};
+
 struct AnnotatedRef
 {
     uint offset;
@@ -169,6 +173,16 @@
     int2 offset;
 };
 
+struct CmdEndClipRef
+{
+    uint offset;
+};
+
+struct CmdEndClip
+{
+    uint blend;
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -230,6 +244,13 @@
 
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
+// Implementation of the GLSL findLSB() function
+template<typename T>
+inline T spvFindLSB(T x)
+{
+    return select(ctz(x), T(-1), x == T(0));
+}
+
 static inline __attribute__((always_inline))
 Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
 {
@@ -243,7 +264,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_283, constant uint& v_283BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -251,7 +272,7 @@
     {
         return 0u;
     }
-    uint v = v_283.memory[offset];
+    uint v = v_308.memory[offset];
     return v;
 }
 
@@ -270,39 +291,39 @@
 }
 
 static inline __attribute__((always_inline))
-BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint tag_and_flags = read_mem(param, param_1, v_308, v_308BufferSize);
     return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -315,24 +336,24 @@
 }
 
 static inline __attribute__((always_inline))
-Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_283, constant uint& v_283BufferSize)
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint param = 0u;
-    uint param_1 = uint(int((v_283BufferSize - 8) / 4) * 4);
+    uint param_1 = uint(int((v_308BufferSize - 8) / 4) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
 
 static inline __attribute__((always_inline))
-Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Tile s;
     s.tile = TileSegRef{ raw0 };
     s.backdrop = int(raw1);
@@ -340,27 +361,27 @@
 }
 
 static inline __attribute__((always_inline))
-AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
+    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
+    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
+    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
     AnnoColor s;
     s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
     s.linewidth = as_type<float>(raw4);
@@ -369,34 +390,34 @@
 }
 
 static inline __attribute__((always_inline))
-AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u };
-    return AnnoColor_read(param, param_1, v_283, v_283BufferSize);
+    return AnnoColor_read(param, param_1, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_283, constant uint& v_283BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_308, constant uint& v_308BufferSize)
 {
-    uint _289 = atomic_fetch_add_explicit((device atomic_uint*)&v_283.mem_offset, size, memory_order_relaxed);
-    uint offset = _289;
+    uint _314 = atomic_fetch_add_explicit((device atomic_uint*)&v_308.mem_offset, size, memory_order_relaxed);
+    uint offset = _314;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_283BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_308BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _318 = atomic_fetch_max_explicit((device atomic_uint*)&v_283.mem_error, 1u, memory_order_relaxed);
+        uint _343 = atomic_fetch_max_explicit((device atomic_uint*)&v_308.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_283, constant uint& v_283BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -404,42 +425,42 @@
     {
         return;
     }
-    v_283.memory[offset] = val;
+    v_308.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.new_ref;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
     CmdJump param_5 = s;
-    CmdJump_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdJump_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_283, constant uint& v_283BufferSize)
+bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_308, constant uint& v_308BufferSize)
 {
     if (cmd_ref.offset < cmd_limit)
     {
         return true;
     }
     uint param = 1024u;
-    MallocResult _1076 = malloc(param, v_283, v_283BufferSize);
-    MallocResult new_cmd = _1076;
+    MallocResult _1190 = malloc(param, v_308, v_308BufferSize);
+    MallocResult new_cmd = _1190;
     if (new_cmd.failed)
     {
         return false;
@@ -448,7 +469,7 @@
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3, v_283, v_283BufferSize);
+    Cmd_Jump_write(param_1, param_2, param_3, v_308, v_308BufferSize);
     cmd_alloc = new_cmd.alloc;
     cmd_ref = CmdRef{ cmd_alloc.offset };
     cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
@@ -462,70 +483,70 @@
 }
 
 static inline __attribute__((always_inline))
-void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = uint(s.backdrop);
-    write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
+    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
     CmdFill param_5 = s;
-    CmdFill_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdFill_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 3u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.half_width);
-    write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
+    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
     CmdStroke param_5 = s;
-    CmdStroke_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdStroke_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_283, constant uint& v_283BufferSize)
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint param = flags;
     if (fill_mode_from_flags(param) == 0u)
@@ -536,14 +557,14 @@
             Alloc param_1 = alloc;
             CmdRef param_2 = cmd_ref;
             CmdFill param_3 = cmd_fill;
-            Cmd_Fill_write(param_1, param_2, param_3, v_283, v_283BufferSize);
+            Cmd_Fill_write(param_1, param_2, param_3, v_308, v_308BufferSize);
             cmd_ref.offset += 12u;
         }
         else
         {
             Alloc param_4 = alloc;
             CmdRef param_5 = cmd_ref;
-            Cmd_Solid_write(param_4, param_5, v_283, v_283BufferSize);
+            Cmd_Solid_write(param_4, param_5, v_308, v_308BufferSize);
             cmd_ref.offset += 4u;
         }
     }
@@ -553,65 +574,65 @@
         Alloc param_6 = alloc;
         CmdRef param_7 = cmd_ref;
         CmdStroke param_8 = cmd_stroke;
-        Cmd_Stroke_write(param_6, param_7, param_8, v_283, v_283BufferSize);
+        Cmd_Stroke_write(param_6, param_7, param_8, v_308, v_308BufferSize);
         cmd_ref.offset += 12u;
     }
 }
 
 static inline __attribute__((always_inline))
-void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.rgba_color;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
     CmdColor param_5 = s;
-    CmdColor_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdColor_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
+    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
+    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
+    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_283, v_283BufferSize);
+    uint raw6 = read_mem(param_12, param_13, v_308, v_308BufferSize);
     Alloc param_14 = a;
     uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_283, v_283BufferSize);
+    uint raw7 = read_mem(param_14, param_15, v_308, v_308BufferSize);
     Alloc param_16 = a;
     uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_283, v_283BufferSize);
+    uint raw8 = read_mem(param_16, param_17, v_308, v_308BufferSize);
     AnnoLinGradient s;
     s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
     s.linewidth = as_type<float>(raw4);
@@ -623,73 +644,73 @@
 }
 
 static inline __attribute__((always_inline))
-AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u };
-    return AnnoLinGradient_read(param, param_1, v_283, v_283BufferSize);
+    return AnnoLinGradient_read(param, param_1, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.line_x);
-    write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
+    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.line_y);
-    write_mem(param_6, param_7, param_8, v_283, v_283BufferSize);
+    write_mem(param_6, param_7, param_8, v_308, v_308BufferSize);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.line_c);
-    write_mem(param_9, param_10, param_11, v_283, v_283BufferSize);
+    write_mem(param_9, param_10, param_11, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
     CmdLinGrad param_5 = s;
-    CmdLinGrad_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdLinGrad_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
+    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
+    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
+    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_283, v_283BufferSize);
+    uint raw6 = read_mem(param_12, param_13, v_308, v_308BufferSize);
     AnnoImage s;
     s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
     s.linewidth = as_type<float>(raw4);
@@ -699,68 +720,115 @@
 }
 
 static inline __attribute__((always_inline))
-AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u };
-    return AnnoImage_read(param, param_1, v_283, v_283BufferSize);
+    return AnnoImage_read(param, param_1, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
+    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
     CmdImage param_5 = s;
-    CmdImage_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdImage_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
+    AnnoEndClip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.blend = raw4;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+{
+    Alloc param = a;
+    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
+    return AnnoEndClip_read(param, param_1, v_308, v_308BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_308, constant uint& v_308BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.blend;
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    Alloc param_3 = a;
+    CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u };
+    CmdEndClip param_5 = s;
+    CmdEndClip_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 0u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_283 [[buffer(0)]], const device ConfigBuf& _1169 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_308 [[buffer(0)]], const device ConfigBuf& _1283 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -772,19 +840,19 @@
     threadgroup uint sh_tile_y0[256];
     threadgroup uint sh_tile_base[256];
     threadgroup uint sh_tile_count[256];
-    constant uint& v_283BufferSize = spvBufferSizeConstants[0];
-    uint width_in_bins = ((_1169.conf.width_in_tiles + 16u) - 1u) / 16u;
+    constant uint& v_308BufferSize = spvBufferSizeConstants[0];
+    uint width_in_bins = ((_1283.conf.width_in_tiles + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1169.conf.n_elements + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1283.conf.n_elements + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1169.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1283.conf.width_in_tiles) + bin_tile_x) + tile_x;
     Alloc param;
-    param.offset = _1169.conf.ptcl_alloc.offset;
+    param.offset = _1283.conf.ptcl_alloc.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -796,17 +864,17 @@
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    bool mem_ok = v_283.mem_error == 0u;
+    bool mem_ok = v_308.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1448;
+    uint _1562;
     uint element_ix;
     AnnotatedRef ref;
     Alloc param_14;
     Alloc param_16;
     uint tile_count;
     Alloc param_23;
-    uint _1770;
+    uint _1887;
     Alloc param_29;
     Tile tile_1;
     AnnoColor fill;
@@ -814,38 +882,39 @@
     Alloc param_52;
     CmdLinGrad cmd_lin;
     Alloc param_69;
+    Alloc param_95;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1500;
+        bool _1614;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1298 = th_ix < 256u;
-                bool _1306;
-                if (_1298)
+                bool _1412 = th_ix < 256u;
+                bool _1420;
+                if (_1412)
                 {
-                    _1306 = (partition_ix + th_ix) < n_partitions;
+                    _1420 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1306 = _1298;
+                    _1420 = _1412;
                 }
-                if (_1306)
+                if (_1420)
                 {
-                    uint in_ix = (_1169.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    param_3.offset = _1169.conf.bin_alloc.offset;
+                    uint in_ix = (_1283.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_3.offset = _1283.conf.bin_alloc.offset;
                     uint param_4 = in_ix;
-                    count = read_mem(param_3, param_4, v_283, v_283BufferSize);
-                    param_5.offset = _1169.conf.bin_alloc.offset;
+                    count = read_mem(param_3, param_4, v_308, v_308BufferSize);
+                    param_5.offset = _1283.conf.bin_alloc.offset;
                     uint param_6 = in_ix + 1u;
-                    uint offset = read_mem(param_5, param_6, v_283, v_283BufferSize);
+                    uint offset = read_mem(param_5, param_6, v_308, v_308BufferSize);
                     uint param_7 = offset;
                     uint param_8 = count * 4u;
                     bool param_9 = mem_ok;
@@ -889,34 +958,34 @@
                 }
                 if (part_ix > 0u)
                 {
-                    _1448 = sh_part_count[part_ix - 1u];
+                    _1562 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1448 = part_start_ix;
+                    _1562 = part_start_ix;
                 }
-                ix -= _1448;
+                ix -= _1562;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
                 BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
-                BinInstance inst = BinInstance_read(param_12, param_13, v_283, v_283BufferSize);
+                BinInstance inst = BinInstance_read(param_12, param_13, v_308, v_308BufferSize);
                 sh_elements[th_ix] = inst.element_ix;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1490 = (wr_ix - rd_ix) < 256u;
-            if (_1490)
+            bool _1604 = (wr_ix - rd_ix) < 256u;
+            if (_1604)
             {
-                _1500 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1614 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1500 = _1490;
+                _1614 = _1604;
             }
-            if (_1500)
+            if (_1614)
             {
                 continue;
             }
@@ -929,10 +998,10 @@
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef{ _1169.conf.anno_alloc.offset + (element_ix * 40u) };
-            param_14.offset = _1169.conf.anno_alloc.offset;
+            ref = AnnotatedRef{ _1283.conf.anno_alloc.offset + (element_ix * 40u) };
+            param_14.offset = _1283.conf.anno_alloc.offset;
             AnnotatedRef param_15 = ref;
-            tag = Annotated_tag(param_14, param_15, v_283, v_283BufferSize).tag;
+            tag = Annotated_tag(param_14, param_15, v_308, v_308BufferSize).tag;
         }
         switch (tag)
         {
@@ -942,11 +1011,11 @@
             case 4u:
             case 5u:
             {
-                uint drawmonoid_base = (_1169.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
-                uint path_ix = v_283.memory[drawmonoid_base];
-                param_16.offset = _1169.conf.tile_alloc.offset;
-                PathRef param_17 = PathRef{ _1169.conf.tile_alloc.offset + (path_ix * 12u) };
-                Path path = Path_read(param_16, param_17, v_283, v_283BufferSize);
+                uint drawmonoid_base = (_1283.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
+                uint path_ix = v_308.memory[drawmonoid_base];
+                param_16.offset = _1283.conf.tile_alloc.offset;
+                PathRef param_17 = PathRef{ _1283.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_16, param_17, v_308, v_308BufferSize);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -1000,19 +1069,20 @@
                     el_ix = probe_1;
                 }
             }
-            AnnotatedRef ref_1 = AnnotatedRef{ _1169.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) };
-            param_23.offset = _1169.conf.anno_alloc.offset;
+            AnnotatedRef ref_1 = AnnotatedRef{ _1283.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) };
+            param_23.offset = _1283.conf.anno_alloc.offset;
             AnnotatedRef param_24 = ref_1;
-            uint tag_1 = Annotated_tag(param_23, param_24, v_283, v_283BufferSize).tag;
+            AnnotatedTag anno_tag = Annotated_tag(param_23, param_24, v_308, v_308BufferSize);
+            uint tag_1 = anno_tag.tag;
             if (el_ix > 0u)
             {
-                _1770 = sh_tile_count[el_ix - 1u];
+                _1887 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1770 = 0u;
+                _1887 = 0u;
             }
-            uint seq_ix = ix_1 - _1770;
+            uint seq_ix = ix_1 - _1887;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -1021,27 +1091,45 @@
             {
                 uint param_25 = el_ix;
                 bool param_26 = mem_ok;
-                Alloc param_27 = read_tile_alloc(param_25, param_26, v_283, v_283BufferSize);
+                Alloc param_27 = read_tile_alloc(param_25, param_26, v_308, v_308BufferSize);
                 TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Tile tile = Tile_read(param_27, param_28, v_283, v_283BufferSize);
+                Tile tile = Tile_read(param_27, param_28, v_308, v_308BufferSize);
                 bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
-                bool _1834 = tile.tile.offset != 0u;
-                bool _1843;
-                if (!_1834)
+                bool _1951 = tile.tile.offset != 0u;
+                bool _1960;
+                if (!_1951)
                 {
-                    _1843 = (tile.backdrop == 0) == is_clip;
+                    _1960 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1843 = _1834;
+                    _1960 = _1951;
                 }
-                include_tile = _1843;
+                bool _1972;
+                if (!_1960)
+                {
+                    bool _1971;
+                    if (is_clip)
+                    {
+                        _1971 = (anno_tag.flags & 2u) != 0u;
+                    }
+                    else
+                    {
+                        _1971 = is_clip;
+                    }
+                    _1972 = _1971;
+                }
+                else
+                {
+                    _1972 = _1960;
+                }
+                include_tile = _1972;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1863 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1992 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -1065,10 +1153,10 @@
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_1 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            ref = AnnotatedRef{ _1169.conf.anno_alloc.offset + (element_ix_1 * 40u) };
-            param_29.offset = _1169.conf.anno_alloc.offset;
+            ref = AnnotatedRef{ _1283.conf.anno_alloc.offset + (element_ix_1 * 40u) };
+            param_29.offset = _1283.conf.anno_alloc.offset;
             AnnotatedRef param_30 = ref;
-            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_283, v_283BufferSize);
+            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_308, v_308BufferSize);
             if (clip_zero_depth == 0u)
             {
                 switch (tag_2.tag)
@@ -1077,20 +1165,20 @@
                     {
                         uint param_31 = element_ref_ix;
                         bool param_32 = mem_ok;
-                        Alloc param_33 = read_tile_alloc(param_31, param_32, v_283, v_283BufferSize);
+                        Alloc param_33 = read_tile_alloc(param_31, param_32, v_308, v_308BufferSize);
                         TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_33, param_34, v_283, v_283BufferSize);
-                        param_35.offset = _1169.conf.anno_alloc.offset;
+                        tile_1 = Tile_read(param_33, param_34, v_308, v_308BufferSize);
+                        param_35.offset = _1283.conf.anno_alloc.offset;
                         AnnotatedRef param_36 = ref;
-                        fill = Annotated_Color_read(param_35, param_36, v_283, v_283BufferSize);
+                        fill = Annotated_Color_read(param_35, param_36, v_308, v_308BufferSize);
                         Alloc param_37 = cmd_alloc;
                         CmdRef param_38 = cmd_ref;
                         uint param_39 = cmd_limit;
-                        bool _1977 = alloc_cmd(param_37, param_38, param_39, v_283, v_283BufferSize);
+                        bool _2105 = alloc_cmd(param_37, param_38, param_39, v_308, v_308BufferSize);
                         cmd_alloc = param_37;
                         cmd_ref = param_38;
                         cmd_limit = param_39;
-                        if (!_1977)
+                        if (!_2105)
                         {
                             break;
                         }
@@ -1099,12 +1187,12 @@
                         uint param_42 = tag_2.flags;
                         Tile param_43 = tile_1;
                         float param_44 = fill.linewidth;
-                        write_fill(param_40, param_41, param_42, param_43, param_44, v_283, v_283BufferSize);
+                        write_fill(param_40, param_41, param_42, param_43, param_44, v_308, v_308BufferSize);
                         cmd_ref = param_41;
                         Alloc param_45 = cmd_alloc;
                         CmdRef param_46 = cmd_ref;
                         CmdColor param_47 = CmdColor{ fill.rgba_color };
-                        Cmd_Color_write(param_45, param_46, param_47, v_283, v_283BufferSize);
+                        Cmd_Color_write(param_45, param_46, param_47, v_308, v_308BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1112,20 +1200,20 @@
                     {
                         uint param_48 = element_ref_ix;
                         bool param_49 = mem_ok;
-                        Alloc param_50 = read_tile_alloc(param_48, param_49, v_283, v_283BufferSize);
+                        Alloc param_50 = read_tile_alloc(param_48, param_49, v_308, v_308BufferSize);
                         TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_50, param_51, v_283, v_283BufferSize);
-                        param_52.offset = _1169.conf.anno_alloc.offset;
+                        tile_1 = Tile_read(param_50, param_51, v_308, v_308BufferSize);
+                        param_52.offset = _1283.conf.anno_alloc.offset;
                         AnnotatedRef param_53 = ref;
-                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_283, v_283BufferSize);
+                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_308, v_308BufferSize);
                         Alloc param_54 = cmd_alloc;
                         CmdRef param_55 = cmd_ref;
                         uint param_56 = cmd_limit;
-                        bool _2049 = alloc_cmd(param_54, param_55, param_56, v_283, v_283BufferSize);
+                        bool _2177 = alloc_cmd(param_54, param_55, param_56, v_308, v_308BufferSize);
                         cmd_alloc = param_54;
                         cmd_ref = param_55;
                         cmd_limit = param_56;
-                        if (!_2049)
+                        if (!_2177)
                         {
                             break;
                         }
@@ -1134,7 +1222,7 @@
                         uint param_59 = tag_2.flags;
                         Tile param_60 = tile_1;
                         float param_61 = fill.linewidth;
-                        write_fill(param_57, param_58, param_59, param_60, param_61, v_283, v_283BufferSize);
+                        write_fill(param_57, param_58, param_59, param_60, param_61, v_308, v_308BufferSize);
                         cmd_ref = param_58;
                         cmd_lin.index = lin.index;
                         cmd_lin.line_x = lin.line_x;
@@ -1143,7 +1231,7 @@
                         Alloc param_62 = cmd_alloc;
                         CmdRef param_63 = cmd_ref;
                         CmdLinGrad param_64 = cmd_lin;
-                        Cmd_LinGrad_write(param_62, param_63, param_64, v_283, v_283BufferSize);
+                        Cmd_LinGrad_write(param_62, param_63, param_64, v_308, v_308BufferSize);
                         cmd_ref.offset += 20u;
                         break;
                     }
@@ -1151,20 +1239,20 @@
                     {
                         uint param_65 = element_ref_ix;
                         bool param_66 = mem_ok;
-                        Alloc param_67 = read_tile_alloc(param_65, param_66, v_283, v_283BufferSize);
+                        Alloc param_67 = read_tile_alloc(param_65, param_66, v_308, v_308BufferSize);
                         TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_67, param_68, v_283, v_283BufferSize);
-                        param_69.offset = _1169.conf.anno_alloc.offset;
+                        tile_1 = Tile_read(param_67, param_68, v_308, v_308BufferSize);
+                        param_69.offset = _1283.conf.anno_alloc.offset;
                         AnnotatedRef param_70 = ref;
-                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_283, v_283BufferSize);
+                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_308, v_308BufferSize);
                         Alloc param_71 = cmd_alloc;
                         CmdRef param_72 = cmd_ref;
                         uint param_73 = cmd_limit;
-                        bool _2133 = alloc_cmd(param_71, param_72, param_73, v_283, v_283BufferSize);
+                        bool _2261 = alloc_cmd(param_71, param_72, param_73, v_308, v_308BufferSize);
                         cmd_alloc = param_71;
                         cmd_ref = param_72;
                         cmd_limit = param_73;
-                        if (!_2133)
+                        if (!_2261)
                         {
                             break;
                         }
@@ -1173,12 +1261,12 @@
                         uint param_76 = tag_2.flags;
                         Tile param_77 = tile_1;
                         float param_78 = fill_img.linewidth;
-                        write_fill(param_74, param_75, param_76, param_77, param_78, v_283, v_283BufferSize);
+                        write_fill(param_74, param_75, param_76, param_77, param_78, v_308, v_308BufferSize);
                         cmd_ref = param_75;
                         Alloc param_79 = cmd_alloc;
                         CmdRef param_80 = cmd_ref;
                         CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset };
-                        Cmd_Image_write(param_79, param_80, param_81, v_283, v_283BufferSize);
+                        Cmd_Image_write(param_79, param_80, param_81, v_308, v_308BufferSize);
                         cmd_ref.offset += 12u;
                         break;
                     }
@@ -1186,20 +1274,20 @@
                     {
                         uint param_82 = element_ref_ix;
                         bool param_83 = mem_ok;
-                        Alloc param_84 = read_tile_alloc(param_82, param_83, v_283, v_283BufferSize);
+                        Alloc param_84 = read_tile_alloc(param_82, param_83, v_308, v_308BufferSize);
                         TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_84, param_85, v_283, v_283BufferSize);
-                        bool _2194 = tile_1.tile.offset == 0u;
-                        bool _2200;
-                        if (_2194)
+                        tile_1 = Tile_read(param_84, param_85, v_308, v_308BufferSize);
+                        bool _2322 = tile_1.tile.offset == 0u;
+                        bool _2328;
+                        if (_2322)
                         {
-                            _2200 = tile_1.backdrop == 0;
+                            _2328 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2200 = _2194;
+                            _2328 = _2322;
                         }
-                        if (_2200)
+                        if (_2328)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1208,17 +1296,17 @@
                             Alloc param_86 = cmd_alloc;
                             CmdRef param_87 = cmd_ref;
                             uint param_88 = cmd_limit;
-                            bool _2212 = alloc_cmd(param_86, param_87, param_88, v_283, v_283BufferSize);
+                            bool _2340 = alloc_cmd(param_86, param_87, param_88, v_308, v_308BufferSize);
                             cmd_alloc = param_86;
                             cmd_ref = param_87;
                             cmd_limit = param_88;
-                            if (!_2212)
+                            if (!_2340)
                             {
                                 break;
                             }
                             Alloc param_89 = cmd_alloc;
                             CmdRef param_90 = cmd_ref;
-                            Cmd_BeginClip_write(param_89, param_90, v_283, v_283BufferSize);
+                            Cmd_BeginClip_write(param_89, param_90, v_308, v_308BufferSize);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
@@ -1228,32 +1316,36 @@
                     {
                         uint param_91 = element_ref_ix;
                         bool param_92 = mem_ok;
-                        Alloc param_93 = read_tile_alloc(param_91, param_92, v_283, v_283BufferSize);
+                        Alloc param_93 = read_tile_alloc(param_91, param_92, v_308, v_308BufferSize);
                         TileRef param_94 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_93, param_94, v_283, v_283BufferSize);
+                        tile_1 = Tile_read(param_93, param_94, v_308, v_308BufferSize);
+                        param_95.offset = _1283.conf.anno_alloc.offset;
+                        AnnotatedRef param_96 = ref;
+                        AnnoEndClip end_clip = Annotated_EndClip_read(param_95, param_96, v_308, v_308BufferSize);
                         clip_depth--;
-                        Alloc param_95 = cmd_alloc;
-                        CmdRef param_96 = cmd_ref;
-                        uint param_97 = cmd_limit;
-                        bool _2261 = alloc_cmd(param_95, param_96, param_97, v_283, v_283BufferSize);
-                        cmd_alloc = param_95;
-                        cmd_ref = param_96;
-                        cmd_limit = param_97;
-                        if (!_2261)
+                        Alloc param_97 = cmd_alloc;
+                        CmdRef param_98 = cmd_ref;
+                        uint param_99 = cmd_limit;
+                        bool _2398 = alloc_cmd(param_97, param_98, param_99, v_308, v_308BufferSize);
+                        cmd_alloc = param_97;
+                        cmd_ref = param_98;
+                        cmd_limit = param_99;
+                        if (!_2398)
                         {
                             break;
                         }
-                        Alloc param_98 = cmd_alloc;
-                        CmdRef param_99 = cmd_ref;
-                        uint param_100 = 0u;
-                        Tile param_101 = tile_1;
-                        float param_102 = 0.0;
-                        write_fill(param_98, param_99, param_100, param_101, param_102, v_283, v_283BufferSize);
-                        cmd_ref = param_99;
-                        Alloc param_103 = cmd_alloc;
-                        CmdRef param_104 = cmd_ref;
-                        Cmd_EndClip_write(param_103, param_104, v_283, v_283BufferSize);
-                        cmd_ref.offset += 4u;
+                        Alloc param_100 = cmd_alloc;
+                        CmdRef param_101 = cmd_ref;
+                        uint param_102 = 0u;
+                        Tile param_103 = tile_1;
+                        float param_104 = 0.0;
+                        write_fill(param_100, param_101, param_102, param_103, param_104, v_308, v_308BufferSize);
+                        cmd_ref = param_101;
+                        Alloc param_105 = cmd_alloc;
+                        CmdRef param_106 = cmd_ref;
+                        CmdEndClip param_107 = CmdEndClip{ end_clip.blend };
+                        Cmd_EndClip_write(param_105, param_106, param_107, v_308, v_308BufferSize);
+                        cmd_ref.offset += 8u;
                         break;
                     }
                 }
@@ -1286,21 +1378,21 @@
             break;
         }
     }
-    bool _2326 = (bin_tile_x + tile_x) < _1169.conf.width_in_tiles;
-    bool _2335;
-    if (_2326)
+    bool _2467 = (bin_tile_x + tile_x) < _1283.conf.width_in_tiles;
+    bool _2476;
+    if (_2467)
     {
-        _2335 = (bin_tile_y + tile_y) < _1169.conf.height_in_tiles;
+        _2476 = (bin_tile_y + tile_y) < _1283.conf.height_in_tiles;
     }
     else
     {
-        _2335 = _2326;
+        _2476 = _2467;
     }
-    if (_2335)
+    if (_2476)
     {
-        Alloc param_105 = cmd_alloc;
-        CmdRef param_106 = cmd_ref;
-        Cmd_End_write(param_105, param_106, v_283, v_283BufferSize);
+        Alloc param_108 = cmd_alloc;
+        CmdRef param_109 = cmd_ref;
+        Cmd_End_write(param_108, param_109, v_308, v_308BufferSize);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index b30e2d8..1fef2d7 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
index 8de5379..5b9ecc6 100644
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -87,6 +87,17 @@
     int2 offset;
 };
 
+struct ClipRef
+{
+    uint offset;
+};
+
+struct Clip
+{
+    float4 bbox;
+    uint blend;
+};
+
 struct ElementTag
 {
     uint tag;
@@ -148,6 +159,7 @@
 {
     float4 bbox;
     float linewidth;
+    uint blend;
 };
 
 struct AnnoEndClipRef
@@ -158,6 +170,7 @@
 struct AnnoEndClip
 {
     float4 bbox;
+    uint blend;
 };
 
 struct AnnotatedRef
@@ -228,9 +241,9 @@
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_211)
+ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
-    uint tag_and_flags = v_211.scene[ref.offset >> uint(2)];
+    uint tag_and_flags = v_223.scene[ref.offset >> uint(2)];
     return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
@@ -279,20 +292,20 @@
 }
 
 static inline __attribute__((always_inline))
-FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_211)
+FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_223)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
+    uint raw0 = v_223.scene[ix + 0u];
     FillColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
     FillColorRef param = FillColorRef{ ref.offset + 4u };
-    return FillColor_read(param, v_211);
+    return FillColor_read(param, v_223);
 }
 
 static inline __attribute__((always_inline))
@@ -302,7 +315,7 @@
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_187)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_199)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -310,61 +323,61 @@
     {
         return;
     }
-    v_187.memory[offset] = val;
+    v_199.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_187)
+void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = s.rgba_color;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_187)
+void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 1u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoColorRef param_4 = AnnoColorRef{ ref.offset + 4u };
     AnnoColor param_5 = s;
-    AnnoColor_write(param_3, param_4, param_5, v_187);
+    AnnoColor_write(param_3, param_4, param_5, v_199);
 }
 
 static inline __attribute__((always_inline))
-FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_211)
+FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_223)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
-    uint raw1 = v_211.scene[ix + 1u];
-    uint raw2 = v_211.scene[ix + 2u];
-    uint raw3 = v_211.scene[ix + 3u];
-    uint raw4 = v_211.scene[ix + 4u];
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
+    uint raw2 = v_223.scene[ix + 2u];
+    uint raw3 = v_223.scene[ix + 3u];
+    uint raw4 = v_223.scene[ix + 4u];
     FillLinGradient s;
     s.index = raw0;
     s.p0 = float2(as_type<float>(raw1), as_type<float>(raw2));
@@ -373,73 +386,73 @@
 }
 
 static inline __attribute__((always_inline))
-FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
     FillLinGradientRef param = FillLinGradientRef{ ref.offset + 4u };
-    return FillLinGradient_read(param, v_211);
+    return FillLinGradient_read(param, v_223);
 }
 
 static inline __attribute__((always_inline))
-void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_187)
+void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
     Alloc param_18 = a;
     uint param_19 = ix + 6u;
     uint param_20 = as_type<uint>(s.line_x);
-    write_mem(param_18, param_19, param_20, v_187);
+    write_mem(param_18, param_19, param_20, v_199);
     Alloc param_21 = a;
     uint param_22 = ix + 7u;
     uint param_23 = as_type<uint>(s.line_y);
-    write_mem(param_21, param_22, param_23, v_187);
+    write_mem(param_21, param_22, param_23, v_199);
     Alloc param_24 = a;
     uint param_25 = ix + 8u;
     uint param_26 = as_type<uint>(s.line_c);
-    write_mem(param_24, param_25, param_26, v_187);
+    write_mem(param_24, param_25, param_26, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_187)
+void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 2u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoLinGradientRef param_4 = AnnoLinGradientRef{ ref.offset + 4u };
     AnnoLinGradient param_5 = s;
-    AnnoLinGradient_write(param_3, param_4, param_5, v_187);
+    AnnoLinGradient_write(param_3, param_4, param_5, v_199);
 }
 
 static inline __attribute__((always_inline))
-FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_211)
+FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_223)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
-    uint raw1 = v_211.scene[ix + 1u];
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
     FillImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -447,140 +460,169 @@
 }
 
 static inline __attribute__((always_inline))
-FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
     FillImageRef param = FillImageRef{ ref.offset + 4u };
-    return FillImage_read(param, v_211);
+    return FillImage_read(param, v_223);
 }
 
 static inline __attribute__((always_inline))
-void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_187)
+void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
     Alloc param_18 = a;
     uint param_19 = ix + 6u;
     uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_18, param_19, param_20, v_187);
+    write_mem(param_18, param_19, param_20, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_187)
+void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 3u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoImageRef param_4 = AnnoImageRef{ ref.offset + 4u };
     AnnoImage param_5 = s;
-    AnnoImage_write(param_3, param_4, param_5, v_187);
+    AnnoImage_write(param_3, param_4, param_5, v_199);
 }
 
 static inline __attribute__((always_inline))
-void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_187)
+Clip Clip_read(thread const ClipRef& ref, const device SceneBuf& v_223)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
+    uint raw2 = v_223.scene[ix + 2u];
+    uint raw3 = v_223.scene[ix + 3u];
+    Clip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.blend = v_223.scene[ix + 4u];
+    return s;
+}
+
+static inline __attribute__((always_inline))
+Clip Element_BeginClip_read(thread const ElementRef& ref, const device SceneBuf& v_223)
+{
+    ClipRef param = ClipRef{ ref.offset + 4u };
+    return Clip_read(param, v_223);
+}
+
+static inline __attribute__((always_inline))
+void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.blend;
+    write_mem(param_15, param_16, param_17, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_187)
+void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 4u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoBeginClipRef param_4 = AnnoBeginClipRef{ ref.offset + 4u };
     AnnoBeginClip param_5 = s;
-    AnnoBeginClip_write(param_3, param_4, param_5, v_187);
+    AnnoBeginClip_write(param_3, param_4, param_5, v_199);
 }
 
 static inline __attribute__((always_inline))
-void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_187)
+void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = s.blend;
+    write_mem(param_12, param_13, param_14, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const AnnoEndClip& s, device Memory& v_187)
+void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoEndClip& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_187);
+    uint param_2 = (flags << uint(16)) | 5u;
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoEndClipRef param_4 = AnnoEndClipRef{ ref.offset + 4u };
     AnnoEndClip param_5 = s;
-    AnnoEndClip_write(param_3, param_4, param_5, v_187);
+    AnnoEndClip_write(param_3, param_4, param_5, v_199);
 }
 
-kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _968 [[buffer(1)]], const device SceneBuf& v_211 [[buffer(2)]], const device ParentBuf& _934 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& v_199 [[buffer(0)]], const device ConfigBuf& _1054 [[buffer(1)]], const device SceneBuf& v_223 [[buffer(2)]], const device ParentBuf& _1020 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
     ElementRef ref = ElementRef{ ix * 36u };
     ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_211).tag;
+    uint tag_word = Element_tag(param, v_223).tag;
     uint param_1 = tag_word;
     DrawMonoid agg = map_tag(param_1);
     spvUnsafeArray<DrawMonoid, 8> local;
@@ -590,7 +632,7 @@
         ElementRef param_2 = ref;
         uint param_3 = i;
         ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_211).tag;
+        tag_word = Element_tag(param_4, v_223).tag;
         uint param_5 = tag_word;
         DrawMonoid param_6 = agg;
         DrawMonoid param_7 = map_tag(param_5);
@@ -615,9 +657,9 @@
     DrawMonoid row = tag_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _937 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _934.parent[_937].path_ix;
-        row.clip_ix = _934.parent[_937].clip_ix;
+        uint _1023 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _1020.parent[_1023].path_ix;
+        row.clip_ix = _1020.parent[_1023].clip_ix;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -626,9 +668,9 @@
         row = combine_tag_monoid(param_10, param_11);
     }
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_968.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
-    uint clip_out_base = _968.conf.clip_alloc.offset >> uint(2);
-    AnnotatedRef out_ref = AnnotatedRef{ _968.conf.anno_alloc.offset + (out_ix * 40u) };
+    uint out_base = (_1054.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
+    uint clip_out_base = _1054.conf.clip_alloc.offset >> uint(2);
+    AnnotatedRef out_ref = AnnotatedRef{ _1054.conf.anno_alloc.offset + (out_ix * 40u) };
     float4 mat;
     float2 translate;
     AnnoColor anno_fill;
@@ -638,9 +680,9 @@
     AnnoImage anno_img;
     Alloc param_28;
     AnnoBeginClip anno_begin_clip;
-    Alloc param_32;
+    Alloc param_33;
     AnnoEndClip anno_end_clip;
-    Alloc param_36;
+    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
@@ -650,31 +692,31 @@
             DrawMonoid param_13 = local[i_2 - 1u];
             m = combine_tag_monoid(param_12, param_13);
         }
-        v_187.memory[out_base + (i_2 * 2u)] = m.path_ix;
-        v_187.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
+        v_199.memory[out_base + (i_2 * 2u)] = m.path_ix;
+        v_199.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
         ElementRef param_14 = ref;
         uint param_15 = i_2;
         ElementRef this_ref = Element_index(param_14, param_15);
         ElementRef param_16 = this_ref;
-        tag_word = Element_tag(param_16, v_211).tag;
+        tag_word = Element_tag(param_16, v_223).tag;
         if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
         {
-            uint bbox_offset = (_968.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(v_187.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(v_187.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(v_187.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(v_187.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_1054.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(v_199.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(v_199.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(v_199.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(v_199.memory[bbox_offset + 3u]) - 32768.0;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(v_187.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(v_199.memory[bbox_offset + 4u]);
             uint fill_mode = uint(linewidth >= 0.0);
             if ((linewidth >= 0.0) || (tag_word == 5u))
             {
-                uint trans_ix = v_187.memory[bbox_offset + 5u];
-                uint t = (_968.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(v_187.memory[t], v_187.memory[t + 1u], v_187.memory[t + 2u], v_187.memory[t + 3u]));
+                uint trans_ix = v_199.memory[bbox_offset + 5u];
+                uint t = (_1054.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(v_199.memory[t], v_199.memory[t + 1u], v_199.memory[t + 2u], v_199.memory[t + 3u]));
                 if (tag_word == 5u)
                 {
-                    translate = as_type<float2>(uint2(v_187.memory[t + 4u], v_187.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(v_199.memory[t + 4u], v_199.memory[t + 5u]));
                 }
             }
             if (linewidth >= 0.0)
@@ -687,21 +729,21 @@
                 case 4u:
                 {
                     ElementRef param_17 = this_ref;
-                    FillColor fill = Element_FillColor_read(param_17, v_211);
+                    FillColor fill = Element_FillColor_read(param_17, v_223);
                     anno_fill.bbox = bbox;
                     anno_fill.linewidth = linewidth;
                     anno_fill.rgba_color = fill.rgba_color;
-                    param_18.offset = _968.conf.anno_alloc.offset;
+                    param_18.offset = _1054.conf.anno_alloc.offset;
                     AnnotatedRef param_19 = out_ref;
                     uint param_20 = fill_mode;
                     AnnoColor param_21 = anno_fill;
-                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_187);
+                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_199);
                     break;
                 }
                 case 5u:
                 {
                     ElementRef param_22 = this_ref;
-                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_211);
+                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_223);
                     anno_lin.bbox = bbox;
                     anno_lin.linewidth = linewidth;
                     anno_lin.index = lin.index;
@@ -714,37 +756,41 @@
                     anno_lin.line_x = line_x;
                     anno_lin.line_y = line_y;
                     anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    param_23.offset = _968.conf.anno_alloc.offset;
+                    param_23.offset = _1054.conf.anno_alloc.offset;
                     AnnotatedRef param_24 = out_ref;
                     uint param_25 = fill_mode;
                     AnnoLinGradient param_26 = anno_lin;
-                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_187);
+                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_199);
                     break;
                 }
                 case 6u:
                 {
                     ElementRef param_27 = this_ref;
-                    FillImage fill_img = Element_FillImage_read(param_27, v_211);
+                    FillImage fill_img = Element_FillImage_read(param_27, v_223);
                     anno_img.bbox = bbox;
                     anno_img.linewidth = linewidth;
                     anno_img.index = fill_img.index;
                     anno_img.offset = fill_img.offset;
-                    param_28.offset = _968.conf.anno_alloc.offset;
+                    param_28.offset = _1054.conf.anno_alloc.offset;
                     AnnotatedRef param_29 = out_ref;
                     uint param_30 = fill_mode;
                     AnnoImage param_31 = anno_img;
-                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_187);
+                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_199);
                     break;
                 }
                 case 9u:
                 {
+                    ElementRef param_32 = this_ref;
+                    Clip begin_clip = Element_BeginClip_read(param_32, v_223);
                     anno_begin_clip.bbox = bbox;
                     anno_begin_clip.linewidth = 0.0;
-                    param_32.offset = _968.conf.anno_alloc.offset;
-                    AnnotatedRef param_33 = out_ref;
-                    uint param_34 = 0u;
-                    AnnoBeginClip param_35 = anno_begin_clip;
-                    Annotated_BeginClip_write(param_32, param_33, param_34, param_35, v_187);
+                    anno_begin_clip.blend = begin_clip.blend;
+                    uint flags = uint(begin_clip.blend != 3u) << uint(1);
+                    param_33.offset = _1054.conf.anno_alloc.offset;
+                    AnnotatedRef param_34 = out_ref;
+                    uint param_35 = flags;
+                    AnnoBeginClip param_36 = anno_begin_clip;
+                    Annotated_BeginClip_write(param_33, param_34, param_35, param_36, v_199);
                     break;
                 }
             }
@@ -753,11 +799,16 @@
         {
             if (tag_word == 10u)
             {
+                ElementRef param_37 = this_ref;
+                Clip end_clip = Element_BeginClip_read(param_37, v_223);
                 anno_end_clip.bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
-                param_36.offset = _968.conf.anno_alloc.offset;
-                AnnotatedRef param_37 = out_ref;
-                AnnoEndClip param_38 = anno_end_clip;
-                Annotated_EndClip_write(param_36, param_37, param_38, v_187);
+                anno_end_clip.blend = end_clip.blend;
+                uint flags_1 = uint(end_clip.blend != 3u) << uint(1);
+                param_38.offset = _1054.conf.anno_alloc.offset;
+                AnnotatedRef param_39 = out_ref;
+                uint param_40 = flags_1;
+                AnnoEndClip param_41 = anno_end_clip;
+                Annotated_EndClip_write(param_38, param_39, param_40, param_41, v_199);
             }
         }
         if ((tag_word == 9u) || (tag_word == 10u))
@@ -767,7 +818,7 @@
             {
                 path_ix = m.path_ix;
             }
-            v_187.memory[clip_out_base + m.clip_ix] = path_ix;
+            v_199.memory[clip_out_base + m.clip_ix] = path_ix;
         }
         out_ref.offset += 40u;
     }
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
index d5e9136..bdbdb0c 100644
--- a/piet-gpu/shader/gen/draw_leaf.spv
+++ b/piet-gpu/shader/gen/draw_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 3dc7517..b58218c 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -115,6 +115,16 @@
     float alpha;
 };
 
+struct CmdEndClipRef
+{
+    uint offset;
+};
+
+struct CmdEndClip
+{
+    uint blend;
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -208,7 +218,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -216,29 +226,29 @@
     {
         return 0u;
     }
-    uint v = v_202.memory[offset];
+    uint v = v_278.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_202);
+    uint tag_and_flags = read_mem(param, param_1, v_278);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -246,11 +256,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_202);
+    return CmdStroke_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -262,27 +272,27 @@
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_202);
+    uint raw4 = read_mem(param_8, param_9, v_278);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_202);
+    uint raw5 = read_mem(param_10, param_11, v_278);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -298,15 +308,15 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -314,51 +324,51 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_202);
+    return CmdFill_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_202);
+    return CmdAlpha_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_202);
+    return CmdColor_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -379,21 +389,21 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -403,23 +413,23 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_202);
+    return CmdLinGrad_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -427,11 +437,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_202);
+    return CmdImage_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -444,10 +454,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _695 = fromsRGB(param_1);
-        fg_rgba.x = _695.x;
-        fg_rgba.y = _695.y;
-        fg_rgba.z = _695.z;
+        float3 _1493 = fromsRGB(param_1);
+        fg_rgba.x = _1493.x;
+        fg_rgba.y = _1493.y;
+        fg_rgba.z = _1493.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -471,30 +481,476 @@
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
+    CmdEndClip s;
+    s.blend = raw0;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+{
+    Alloc param = a;
+    CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
+    return CmdEndClip_read(param, param_1, v_278);
+}
+
+static inline __attribute__((always_inline))
+float3 screen(thread const float3& cb, thread const float3& cs)
+{
+    return (cb + cs) - (cb * cs);
+}
+
+static inline __attribute__((always_inline))
+float3 hard_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 param = cb;
+    float3 param_1 = (cs * 2.0) - float3(1.0);
+    return mix(screen(param, param_1), (cb * 2.0) * cs, select(float3(0.0), float3(1.0), cs <= float3(0.5)));
+}
+
+static inline __attribute__((always_inline))
+float color_dodge(thread const float& cb, thread const float& cs)
+{
+    if (cb == 0.0)
+    {
+        return 0.0;
+    }
+    else
+    {
+        if (cs == 1.0)
+        {
+            return 1.0;
+        }
+        else
+        {
+            return fast::min(1.0, cb / (1.0 - cs));
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+float color_burn(thread const float& cb, thread const float& cs)
+{
+    if (cb == 1.0)
+    {
+        return 1.0;
+    }
+    else
+    {
+        if (cs == 0.0)
+        {
+            return 0.0;
+        }
+        else
+        {
+            return 1.0 - fast::min(1.0, (1.0 - cb) / cs);
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+float3 soft_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, select(float3(0.0), float3(1.0), cb <= float3(0.25)));
+    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), select(float3(0.0), float3(1.0), cs <= float3(0.5)));
+}
+
+static inline __attribute__((always_inline))
+float sat(thread const float3& c)
+{
+    return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z));
+}
+
+static inline __attribute__((always_inline))
+void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s)
+{
+    if (cmax > cmin)
+    {
+        cmid = ((cmid - cmin) * s) / (cmax - cmin);
+        cmax = s;
+    }
+    else
+    {
+        cmid = 0.0;
+        cmax = 0.0;
+    }
+    cmin = 0.0;
+}
+
+static inline __attribute__((always_inline))
+float3 set_sat(thread float3& c, thread const float& s)
+{
+    if (c.x <= c.y)
+    {
+        if (c.y <= c.z)
+        {
+            float param = c.x;
+            float param_1 = c.y;
+            float param_2 = c.z;
+            float param_3 = s;
+            set_sat_inner(param, param_1, param_2, param_3);
+            c.x = param;
+            c.y = param_1;
+            c.z = param_2;
+        }
+        else
+        {
+            if (c.x <= c.z)
+            {
+                float param_4 = c.x;
+                float param_5 = c.z;
+                float param_6 = c.y;
+                float param_7 = s;
+                set_sat_inner(param_4, param_5, param_6, param_7);
+                c.x = param_4;
+                c.z = param_5;
+                c.y = param_6;
+            }
+            else
+            {
+                float param_8 = c.z;
+                float param_9 = c.x;
+                float param_10 = c.y;
+                float param_11 = s;
+                set_sat_inner(param_8, param_9, param_10, param_11);
+                c.z = param_8;
+                c.x = param_9;
+                c.y = param_10;
+            }
+        }
+    }
+    else
+    {
+        if (c.x <= c.z)
+        {
+            float param_12 = c.y;
+            float param_13 = c.x;
+            float param_14 = c.z;
+            float param_15 = s;
+            set_sat_inner(param_12, param_13, param_14, param_15);
+            c.y = param_12;
+            c.x = param_13;
+            c.z = param_14;
+        }
+        else
+        {
+            if (c.y <= c.z)
+            {
+                float param_16 = c.y;
+                float param_17 = c.z;
+                float param_18 = c.x;
+                float param_19 = s;
+                set_sat_inner(param_16, param_17, param_18, param_19);
+                c.y = param_16;
+                c.z = param_17;
+                c.x = param_18;
+            }
+            else
+            {
+                float param_20 = c.z;
+                float param_21 = c.y;
+                float param_22 = c.x;
+                float param_23 = s;
+                set_sat_inner(param_20, param_21, param_22, param_23);
+                c.z = param_20;
+                c.y = param_21;
+                c.x = param_22;
+            }
+        }
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline))
+float lum(thread const float3& c)
+{
+    float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375);
+    return dot(c, f);
+}
+
+static inline __attribute__((always_inline))
+float3 clip_color(thread float3& c)
+{
+    float3 param = c;
+    float L = lum(param);
+    float n = fast::min(c.x, fast::min(c.y, c.z));
+    float x = fast::max(c.x, fast::max(c.y, c.z));
+    if (n < 0.0)
+    {
+        c = float3(L) + (((c - float3(L)) * L) / float3(L - n));
+    }
+    if (x > 1.0)
+    {
+        c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L));
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline))
+float3 set_lum(thread const float3& c, thread const float& l)
+{
+    float3 param = c;
+    float3 param_1 = c + float3(l - lum(param));
+    float3 _901 = clip_color(param_1);
+    return _901;
+}
+
+static inline __attribute__((always_inline))
+float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode)
+{
+    float3 b = float3(0.0);
+    switch (mode)
+    {
+        case 1u:
+        {
+            b = cb * cs;
+            break;
+        }
+        case 2u:
+        {
+            float3 param = cb;
+            float3 param_1 = cs;
+            b = screen(param, param_1);
+            break;
+        }
+        case 3u:
+        {
+            float3 param_2 = cs;
+            float3 param_3 = cb;
+            b = hard_light(param_2, param_3);
+            break;
+        }
+        case 4u:
+        {
+            b = fast::min(cb, cs);
+            break;
+        }
+        case 5u:
+        {
+            b = fast::max(cb, cs);
+            break;
+        }
+        case 6u:
+        {
+            float param_4 = cb.x;
+            float param_5 = cs.x;
+            float param_6 = cb.y;
+            float param_7 = cs.y;
+            float param_8 = cb.z;
+            float param_9 = cs.z;
+            b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+            break;
+        }
+        case 7u:
+        {
+            float param_10 = cb.x;
+            float param_11 = cs.x;
+            float param_12 = cb.y;
+            float param_13 = cs.y;
+            float param_14 = cb.z;
+            float param_15 = cs.z;
+            b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+            break;
+        }
+        case 8u:
+        {
+            float3 param_16 = cb;
+            float3 param_17 = cs;
+            b = hard_light(param_16, param_17);
+            break;
+        }
+        case 9u:
+        {
+            float3 param_18 = cb;
+            float3 param_19 = cs;
+            b = soft_light(param_18, param_19);
+            break;
+        }
+        case 10u:
+        {
+            b = abs(cb - cs);
+            break;
+        }
+        case 11u:
+        {
+            b = (cb + cs) - ((cb * 2.0) * cs);
+            break;
+        }
+        case 12u:
+        {
+            float3 param_20 = cb;
+            float3 param_21 = cs;
+            float param_22 = sat(param_20);
+            float3 _1192 = set_sat(param_21, param_22);
+            float3 param_23 = cb;
+            float3 param_24 = _1192;
+            float param_25 = lum(param_23);
+            b = set_lum(param_24, param_25);
+            break;
+        }
+        case 13u:
+        {
+            float3 param_26 = cs;
+            float3 param_27 = cb;
+            float param_28 = sat(param_26);
+            float3 _1206 = set_sat(param_27, param_28);
+            float3 param_29 = cb;
+            float3 param_30 = _1206;
+            float param_31 = lum(param_29);
+            b = set_lum(param_30, param_31);
+            break;
+        }
+        case 14u:
+        {
+            float3 param_32 = cb;
+            float3 param_33 = cs;
+            float param_34 = lum(param_32);
+            b = set_lum(param_33, param_34);
+            break;
+        }
+        case 15u:
+        {
+            float3 param_35 = cs;
+            float3 param_36 = cb;
+            float param_37 = lum(param_35);
+            b = set_lum(param_36, param_37);
+            break;
+        }
+        default:
+        {
+            b = cs;
+            break;
+        }
+    }
+    return b;
+}
+
+static inline __attribute__((always_inline))
+float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode)
+{
+    float fa = 0.0;
+    float fb = 0.0;
+    switch (mode)
+    {
+        case 1u:
+        {
+            fa = 1.0;
+            fb = 0.0;
+            break;
+        }
+        case 2u:
+        {
+            fa = 0.0;
+            fb = 1.0;
+            break;
+        }
+        case 3u:
+        {
+            fa = 1.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 4u:
+        {
+            fa = 1.0 - ab;
+            fb = 1.0;
+            break;
+        }
+        case 5u:
+        {
+            fa = ab;
+            fb = 0.0;
+            break;
+        }
+        case 6u:
+        {
+            fa = 0.0;
+            fb = as;
+            break;
+        }
+        case 7u:
+        {
+            fa = 1.0 - ab;
+            fb = 0.0;
+            break;
+        }
+        case 8u:
+        {
+            fa = 0.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 9u:
+        {
+            fa = ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 10u:
+        {
+            fa = 1.0 - ab;
+            fb = as;
+            break;
+        }
+        case 11u:
+        {
+            fa = 1.0 - ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 12u:
+        {
+            fa = 1.0;
+            fb = 1.0;
+            break;
+        }
+        case 13u:
+        {
+            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
+        }
+        case 14u:
+        {
+            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+        }
+        default:
+        {
+            break;
+        }
+    }
+    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_202);
+    return CmdJump_read(param, param_1, v_278);
 }
 
-kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _723.conf.ptcl_alloc.offset;
+    param.offset = _1521.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -507,7 +963,7 @@
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_202.mem_error == 0u;
+    bool mem_ok = v_278.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -516,7 +972,7 @@
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_202).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
         if (tag == 0u)
         {
             break;
@@ -527,7 +983,7 @@
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -540,7 +996,7 @@
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_202);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -563,7 +1019,7 @@
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -576,7 +1032,7 @@
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -620,7 +1076,7 @@
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -632,7 +1088,7 @@
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_202);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -647,7 +1103,7 @@
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -657,10 +1113,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _1298 = fromsRGB(param_29);
-                    fg_rgba.x = _1298.x;
-                    fg_rgba.y = _1298.y;
-                    fg_rgba.z = _1298.z;
+                    float3 _2092 = fromsRGB(param_29);
+                    fg_rgba.x = _2092.x;
+                    fg_rgba.y = _2092.y;
+                    fg_rgba.z = _2092.z;
                     rgba[k_9] = fg_rgba;
                 }
                 cmd_ref.offset += 20u;
@@ -670,7 +1126,7 @@
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202);
+                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
                 uint2 param_32 = xy_uint;
                 CmdImage param_33 = fill_img;
                 spvUnsafeArray<float4, 8> img;
@@ -689,8 +1145,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_34 = float4(rgba[k_11]);
-                    uint _1390 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _1390;
+                    uint _2184 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _2184;
                     rgba[k_11] = float4(0.0);
                 }
                 clip_depth++;
@@ -699,23 +1155,43 @@
             }
             case 9u:
             {
+                Alloc param_35 = cmd_alloc;
+                CmdRef param_36 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                uint blend_mode = end_clip.blend >> uint(8);
+                uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
                 for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_35 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_35);
+                    uint param_37 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_37);
                     float4 fg_1 = rgba[k_12] * area[k_12];
-                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
+                    float3 param_38 = bg.xyz;
+                    float3 param_39 = fg_1.xyz;
+                    uint param_40 = blend_mode;
+                    float3 blend = mix_blend(param_38, param_39, param_40);
+                    float4 _2251 = fg_1;
+                    float _2255 = fg_1.w;
+                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
+                    fg_1.x = _2262.x;
+                    fg_1.y = _2262.y;
+                    fg_1.z = _2262.z;
+                    float3 param_41 = bg.xyz;
+                    float3 param_42 = fg_1.xyz;
+                    float param_43 = bg.w;
+                    float param_44 = fg_1.w;
+                    uint param_45 = comp_mode;
+                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
                 }
-                cmd_ref.offset += 4u;
+                cmd_ref.offset += 8u;
                 break;
             }
             case 10u:
             {
-                Alloc param_36 = cmd_alloc;
-                CmdRef param_37 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -723,9 +1199,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_38 = i_1;
-        float3 param_39 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_39), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38))));
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index 31f11c9..4d205ce 100644
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 15351a0..a318ed7 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -115,6 +115,16 @@
     float alpha;
 };
 
+struct CmdEndClipRef
+{
+    uint offset;
+};
+
+struct CmdEndClip
+{
+    uint blend;
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -208,7 +218,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -216,29 +226,29 @@
     {
         return 0u;
     }
-    uint v = v_202.memory[offset];
+    uint v = v_278.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_202);
+    uint tag_and_flags = read_mem(param, param_1, v_278);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -246,11 +256,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_202);
+    return CmdStroke_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -262,27 +272,27 @@
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_202);
+    uint raw4 = read_mem(param_8, param_9, v_278);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_202);
+    uint raw5 = read_mem(param_10, param_11, v_278);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -298,15 +308,15 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -314,51 +324,51 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_202);
+    return CmdFill_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_202);
+    return CmdAlpha_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_202);
+    return CmdColor_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -379,21 +389,21 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -403,23 +413,23 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_202);
+    return CmdLinGrad_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -427,11 +437,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_202);
+    return CmdImage_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -444,10 +454,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _695 = fromsRGB(param_1);
-        fg_rgba.x = _695.x;
-        fg_rgba.y = _695.y;
-        fg_rgba.z = _695.z;
+        float3 _1495 = fromsRGB(param_1);
+        fg_rgba.x = _1495.x;
+        fg_rgba.y = _1495.y;
+        fg_rgba.z = _1495.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -471,30 +481,477 @@
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
+    CmdEndClip s;
+    s.blend = raw0;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+{
+    Alloc param = a;
+    CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
+    return CmdEndClip_read(param, param_1, v_278);
+}
+
+static inline __attribute__((always_inline))
+float3 screen(thread const float3& cb, thread const float3& cs)
+{
+    return (cb + cs) - (cb * cs);
+}
+
+static inline __attribute__((always_inline))
+float3 hard_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 param = cb;
+    float3 param_1 = (cs * 2.0) - float3(1.0);
+    return mix(screen(param, param_1), (cb * 2.0) * cs, select(float3(0.0), float3(1.0), cs <= float3(0.5)));
+}
+
+static inline __attribute__((always_inline))
+float color_dodge(thread const float& cb, thread const float& cs)
+{
+    if (cb == 0.0)
+    {
+        return 0.0;
+    }
+    else
+    {
+        if (cs == 1.0)
+        {
+            return 1.0;
+        }
+        else
+        {
+            return fast::min(1.0, cb / (1.0 - cs));
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+float color_burn(thread const float& cb, thread const float& cs)
+{
+    if (cb == 1.0)
+    {
+        return 1.0;
+    }
+    else
+    {
+        if (cs == 0.0)
+        {
+            return 0.0;
+        }
+        else
+        {
+            return 1.0 - fast::min(1.0, (1.0 - cb) / cs);
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+float3 soft_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, select(float3(0.0), float3(1.0), cb <= float3(0.25)));
+    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), select(float3(0.0), float3(1.0), cs <= float3(0.5)));
+}
+
+static inline __attribute__((always_inline))
+float sat(thread const float3& c)
+{
+    return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z));
+}
+
+static inline __attribute__((always_inline))
+void SetSatInner(thread float& Cmin, thread float& Cmid, thread float& Cmax, thread const float& s)
+{
+    if (Cmax > Cmin)
+    {
+        Cmid = ((Cmid - Cmin) * s) / (Cmax - Cmin);
+        Cmax = s;
+    }
+    else
+    {
+        Cmid = 0.0;
+        Cmax = 0.0;
+    }
+    Cmin = 0.0;
+}
+
+static inline __attribute__((always_inline))
+float3 set_sat(thread float3& C, thread const float& s)
+{
+    if (C.x <= C.y)
+    {
+        if (C.y <= C.z)
+        {
+            float param = C.x;
+            float param_1 = C.y;
+            float param_2 = C.z;
+            float param_3 = s;
+            SetSatInner(param, param_1, param_2, param_3);
+            C.x = param;
+            C.y = param_1;
+            C.z = param_2;
+        }
+        else
+        {
+            if (C.x <= C.z)
+            {
+                float param_4 = C.x;
+                float param_5 = C.z;
+                float param_6 = C.y;
+                float param_7 = s;
+                SetSatInner(param_4, param_5, param_6, param_7);
+                C.x = param_4;
+                C.z = param_5;
+                C.y = param_6;
+            }
+            else
+            {
+                float param_8 = C.z;
+                float param_9 = C.x;
+                float param_10 = C.y;
+                float param_11 = s;
+                SetSatInner(param_8, param_9, param_10, param_11);
+                C.z = param_8;
+                C.x = param_9;
+                C.y = param_10;
+            }
+        }
+    }
+    else
+    {
+        if (C.x <= C.z)
+        {
+            float param_12 = C.y;
+            float param_13 = C.x;
+            float param_14 = C.z;
+            float param_15 = s;
+            SetSatInner(param_12, param_13, param_14, param_15);
+            C.y = param_12;
+            C.x = param_13;
+            C.z = param_14;
+        }
+        else
+        {
+            if (C.y <= C.z)
+            {
+                float param_16 = C.y;
+                float param_17 = C.z;
+                float param_18 = C.x;
+                float param_19 = s;
+                SetSatInner(param_16, param_17, param_18, param_19);
+                C.y = param_16;
+                C.z = param_17;
+                C.x = param_18;
+            }
+            else
+            {
+                float param_20 = C.z;
+                float param_21 = C.y;
+                float param_22 = C.x;
+                float param_23 = s;
+                SetSatInner(param_20, param_21, param_22, param_23);
+                C.z = param_20;
+                C.y = param_21;
+                C.x = param_22;
+            }
+        }
+    }
+    return C;
+}
+
+static inline __attribute__((always_inline))
+float lum(thread const float3& c)
+{
+    float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375);
+    return dot(c, f);
+}
+
+static inline __attribute__((always_inline))
+float3 clip_color(thread float3& c)
+{
+    float3 param = c;
+    float L = lum(param);
+    float n = fast::min(c.x, fast::min(c.y, c.z));
+    float x = fast::max(c.x, fast::max(c.y, c.z));
+    if (n < 0.0)
+    {
+        c = float3(L) + (((c - float3(L)) * L) / float3(L - n));
+    }
+    if (x > 1.0)
+    {
+        c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L));
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline))
+float3 set_lum(thread const float3& c, thread const float& l)
+{
+    float3 param = c;
+    float d = l - lum(param);
+    float3 param_1 = c + float3(d);
+    float3 _903 = clip_color(param_1);
+    return _903;
+}
+
+static inline __attribute__((always_inline))
+float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode)
+{
+    float3 b = float3(0.0);
+    switch (mode)
+    {
+        case 1u:
+        {
+            b = cb * cs;
+            break;
+        }
+        case 2u:
+        {
+            float3 param = cb;
+            float3 param_1 = cs;
+            b = screen(param, param_1);
+            break;
+        }
+        case 3u:
+        {
+            float3 param_2 = cs;
+            float3 param_3 = cb;
+            b = hard_light(param_2, param_3);
+            break;
+        }
+        case 4u:
+        {
+            b = fast::min(cb, cs);
+            break;
+        }
+        case 5u:
+        {
+            b = fast::max(cb, cs);
+            break;
+        }
+        case 6u:
+        {
+            float param_4 = cb.x;
+            float param_5 = cs.x;
+            float param_6 = cb.y;
+            float param_7 = cs.y;
+            float param_8 = cb.z;
+            float param_9 = cs.z;
+            b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+            break;
+        }
+        case 7u:
+        {
+            float param_10 = cb.x;
+            float param_11 = cs.x;
+            float param_12 = cb.y;
+            float param_13 = cs.y;
+            float param_14 = cb.z;
+            float param_15 = cs.z;
+            b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+            break;
+        }
+        case 8u:
+        {
+            float3 param_16 = cb;
+            float3 param_17 = cs;
+            b = hard_light(param_16, param_17);
+            break;
+        }
+        case 9u:
+        {
+            float3 param_18 = cb;
+            float3 param_19 = cs;
+            b = soft_light(param_18, param_19);
+            break;
+        }
+        case 10u:
+        {
+            b = abs(cb - cs);
+            break;
+        }
+        case 11u:
+        {
+            b = (cb + cs) - ((cb * 2.0) * cs);
+            break;
+        }
+        case 12u:
+        {
+            float3 param_20 = cb;
+            float3 param_21 = cs;
+            float param_22 = sat(param_20);
+            float3 _1194 = set_sat(param_21, param_22);
+            float3 param_23 = cb;
+            float3 param_24 = _1194;
+            float param_25 = lum(param_23);
+            b = set_lum(param_24, param_25);
+            break;
+        }
+        case 13u:
+        {
+            float3 param_26 = cs;
+            float3 param_27 = cb;
+            float param_28 = sat(param_26);
+            float3 _1208 = set_sat(param_27, param_28);
+            float3 param_29 = cb;
+            float3 param_30 = _1208;
+            float param_31 = lum(param_29);
+            b = set_lum(param_30, param_31);
+            break;
+        }
+        case 14u:
+        {
+            float3 param_32 = cb;
+            float3 param_33 = cs;
+            float param_34 = lum(param_32);
+            b = set_lum(param_33, param_34);
+            break;
+        }
+        case 15u:
+        {
+            float3 param_35 = cs;
+            float3 param_36 = cb;
+            float param_37 = lum(param_35);
+            b = set_lum(param_36, param_37);
+            break;
+        }
+        default:
+        {
+            b = cs;
+            break;
+        }
+    }
+    return b;
+}
+
+static inline __attribute__((always_inline))
+float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode)
+{
+    float fa = 0.0;
+    float fb = 0.0;
+    switch (mode)
+    {
+        case 1u:
+        {
+            fa = 1.0;
+            fb = 0.0;
+            break;
+        }
+        case 2u:
+        {
+            fa = 0.0;
+            fb = 1.0;
+            break;
+        }
+        case 3u:
+        {
+            fa = 1.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 4u:
+        {
+            fa = 1.0 - ab;
+            fb = 1.0;
+            break;
+        }
+        case 5u:
+        {
+            fa = ab;
+            fb = 0.0;
+            break;
+        }
+        case 6u:
+        {
+            fa = 0.0;
+            fb = as;
+            break;
+        }
+        case 7u:
+        {
+            fa = 1.0 - ab;
+            fb = 0.0;
+            break;
+        }
+        case 8u:
+        {
+            fa = 0.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 9u:
+        {
+            fa = ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 10u:
+        {
+            fa = 1.0 - ab;
+            fb = as;
+            break;
+        }
+        case 11u:
+        {
+            fa = 1.0 - ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 12u:
+        {
+            fa = 1.0;
+            fb = 1.0;
+            break;
+        }
+        case 13u:
+        {
+            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
+        }
+        case 14u:
+        {
+            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+        }
+        default:
+        {
+            break;
+        }
+    }
+    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_202);
+    return CmdJump_read(param, param_1, v_278);
 }
 
-kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1523 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1523.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _723.conf.ptcl_alloc.offset;
+    param.offset = _1523.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -507,7 +964,7 @@
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_202.mem_error == 0u;
+    bool mem_ok = v_278.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -516,7 +973,7 @@
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_202).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
         if (tag == 0u)
         {
             break;
@@ -527,7 +984,7 @@
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -540,7 +997,7 @@
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_202);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -563,7 +1020,7 @@
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -576,7 +1033,7 @@
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -620,7 +1077,7 @@
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -632,7 +1089,7 @@
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_202);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -647,7 +1104,7 @@
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -657,10 +1114,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _1298 = fromsRGB(param_29);
-                    fg_rgba.x = _1298.x;
-                    fg_rgba.y = _1298.y;
-                    fg_rgba.z = _1298.z;
+                    float3 _2094 = fromsRGB(param_29);
+                    fg_rgba.x = _2094.x;
+                    fg_rgba.y = _2094.y;
+                    fg_rgba.z = _2094.z;
                     rgba[k_9] = fg_rgba;
                 }
                 cmd_ref.offset += 20u;
@@ -670,7 +1127,7 @@
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202);
+                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
                 uint2 param_32 = xy_uint;
                 CmdImage param_33 = fill_img;
                 spvUnsafeArray<float4, 8> img;
@@ -689,8 +1146,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_34 = float4(rgba[k_11]);
-                    uint _1390 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _1390;
+                    uint _2186 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _2186;
                     rgba[k_11] = float4(0.0);
                 }
                 clip_depth++;
@@ -699,23 +1156,43 @@
             }
             case 9u:
             {
+                Alloc param_35 = cmd_alloc;
+                CmdRef param_36 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                uint blend_mode = end_clip.blend >> uint(8);
+                uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
                 for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_35 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_35);
+                    uint param_37 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_37);
                     float4 fg_1 = rgba[k_12] * area[k_12];
-                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
+                    float3 param_38 = bg.xyz;
+                    float3 param_39 = fg_1.xyz;
+                    uint param_40 = blend_mode;
+                    float3 blend = mix_blend(param_38, param_39, param_40);
+                    float4 _2253 = fg_1;
+                    float _2257 = fg_1.w;
+                    float3 _2264 = mix(_2253.xyz, blend, float3(float((_2257 * bg.w) > 0.0)));
+                    fg_1.x = _2264.x;
+                    fg_1.y = _2264.y;
+                    fg_1.z = _2264.z;
+                    float3 param_41 = bg.xyz;
+                    float3 param_42 = fg_1.xyz;
+                    float param_43 = bg.w;
+                    float param_44 = fg_1.w;
+                    uint param_45 = comp_mode;
+                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
                 }
-                cmd_ref.offset += 4u;
+                cmd_ref.offset += 8u;
                 break;
             }
             case 10u:
             {
-                Alloc param_36 = cmd_alloc;
-                CmdRef param_37 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -723,8 +1200,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_38 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38))));
+        uint param_48 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 42964c8..eb7385f 100644
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl
index bb10cf0..c03e830 100644
--- a/piet-gpu/shader/gen/tile_alloc.msl
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@@ -26,6 +26,7 @@
 struct AnnoEndClip
 {
     float4 bbox;
+    uint blend;
 };
 
 struct AnnotatedRef
@@ -145,8 +146,12 @@
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
     uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_92, v_92BufferSize);
     AnnoEndClip s;
     s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.blend = raw4;
     return s;
 }
 
@@ -221,20 +226,20 @@
     write_mem(param_6, param_7, param_8, v_92, v_92BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _305 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _314 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
     threadgroup uint sh_tile_count[256];
     threadgroup MallocResult sh_tile_alloc;
     constant uint& v_92BufferSize = spvBufferSizeConstants[0];
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef{ _305.conf.tile_alloc.offset + (element_ix * 12u) };
-    AnnotatedRef ref = AnnotatedRef{ _305.conf.anno_alloc.offset + (element_ix * 40u) };
+    PathRef path_ref = PathRef{ _314.conf.tile_alloc.offset + (element_ix * 12u) };
+    AnnotatedRef ref = AnnotatedRef{ _314.conf.anno_alloc.offset + (element_ix * 40u) };
     uint tag = 0u;
-    if (element_ix < _305.conf.n_elements)
+    if (element_ix < _314.conf.n_elements)
     {
         Alloc param;
-        param.offset = _305.conf.anno_alloc.offset;
+        param.offset = _314.conf.anno_alloc.offset;
         AnnotatedRef param_1 = ref;
         tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag;
     }
@@ -251,7 +256,7 @@
         case 5u:
         {
             Alloc param_2;
-            param_2.offset = _305.conf.anno_alloc.offset;
+            param_2.offset = _314.conf.anno_alloc.offset;
             AnnotatedRef param_3 = ref;
             AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize);
             x0 = int(floor(clip.bbox.x * 0.0625));
@@ -261,10 +266,10 @@
             break;
         }
     }
-    x0 = clamp(x0, 0, int(_305.conf.width_in_tiles));
-    y0 = clamp(y0, 0, int(_305.conf.height_in_tiles));
-    x1 = clamp(x1, 0, int(_305.conf.width_in_tiles));
-    y1 = clamp(y1, 0, int(_305.conf.height_in_tiles));
+    x0 = clamp(x0, 0, int(_314.conf.width_in_tiles));
+    y0 = clamp(y0, 0, int(_314.conf.height_in_tiles));
+    x1 = clamp(x1, 0, int(_314.conf.width_in_tiles));
+    y1 = clamp(y1, 0, int(_314.conf.height_in_tiles));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
@@ -287,43 +292,43 @@
     if (th_ix == 255u)
     {
         uint param_4 = total_tile_count * 8u;
-        MallocResult _476 = malloc(param_4, v_92, v_92BufferSize);
-        sh_tile_alloc = _476;
+        MallocResult _485 = malloc(param_4, v_92, v_92BufferSize);
+        sh_tile_alloc = _485;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     MallocResult alloc_start = sh_tile_alloc;
-    bool _487;
+    bool _496;
     if (!alloc_start.failed)
     {
-        _487 = v_92.mem_error != 0u;
+        _496 = v_92.mem_error != 0u;
     }
     else
     {
-        _487 = alloc_start.failed;
+        _496 = alloc_start.failed;
     }
-    if (_487)
+    if (_496)
     {
         return;
     }
-    if (element_ix < _305.conf.n_elements)
+    if (element_ix < _314.conf.n_elements)
     {
-        uint _500;
+        uint _509;
         if (th_ix > 0u)
         {
-            _500 = sh_tile_count[th_ix - 1u];
+            _509 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _500 = 0u;
+            _509 = 0u;
         }
-        uint tile_subix = _500;
+        uint tile_subix = _509;
         Alloc param_5 = alloc_start.alloc;
         uint param_6 = 8u * tile_subix;
         uint param_7 = 8u * tile_count;
         Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
         path.tiles = TileRef{ tiles_alloc.offset };
         Alloc param_8;
-        param_8.offset = _305.conf.tile_alloc.offset;
+        param_8.offset = _314.conf.tile_alloc.offset;
         PathRef param_9 = path_ref;
         Path param_10 = path;
         Path_write(param_8, param_9, param_10, v_92, v_92BufferSize);
diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv
index 12277f1..cf2f01c 100644
--- a/piet-gpu/shader/gen/tile_alloc.spv
+++ b/piet-gpu/shader/gen/tile_alloc.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index dd4a855..a97715a 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -35,6 +35,7 @@
 
 #include "ptcl.h"
 #include "tile.h"
+#include "blend.h"
 
 #define MAX_BLEND_STACK 128
 mediump vec3 tosRGB(mediump vec3 rgb) {
@@ -216,14 +217,20 @@
             cmd_ref.offset += 4;
             break;
         case Cmd_EndClip:
+            CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
+            uint blend_mode = uint(end_clip.blend >> 8);
+            uint comp_mode = uint(end_clip.blend & 0xFF);
             clip_depth--;
             for (uint k = 0; k < CHUNK; k++) {
                 uint d = min(clip_depth, MAX_BLEND_STACK - 1);
                 mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
                 mediump vec4 fg = rgba[k] * area[k];
-                rgba[k] = bg * (1.0 - fg.a) + fg;
+                vec3 blend = mix_blend(bg.rgb, fg.rgb, blend_mode);
+                // Apply the blend color only where the foreground and background overlap.
+                fg.rgb = mix(fg.rgb, blend, float((fg.a * bg.a) > 0.0));
+                rgba[k] = mix_compose(bg.rgb, fg.rgb, bg.a, fg.a, comp_mode);
             }
-            cmd_ref.offset += 4;
+            cmd_ref.offset += 4 + CmdEndClip_size;
             break;
         case Cmd_Jump:
             cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index 936c431..9b9b341 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -26,6 +26,10 @@
     uint offset;
 };
 
+struct CmdEndClipRef {
+    uint offset;
+};
+
 struct CmdJumpRef {
     uint offset;
 };
@@ -100,6 +104,16 @@
     return CmdAlphaRef(ref.offset + index * CmdAlpha_size);
 }
 
+struct CmdEndClip {
+    uint blend;
+};
+
+#define CmdEndClip_size 4
+
+CmdEndClipRef CmdEndClip_index(CmdEndClipRef ref, uint index) {
+    return CmdEndClipRef(ref.offset + index * CmdEndClip_size);
+}
+
 struct CmdJump {
     uint new_ref;
 };
@@ -228,6 +242,19 @@
     write_mem(a, ix + 0, floatBitsToUint(s.alpha));
 }
 
+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = read_mem(a, ix + 0);
+    CmdEndClip s;
+    s.blend = raw0;
+    return s;
+}
+
+void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) {
+    uint ix = ref.offset >> 2;
+    write_mem(a, ix + 0, s.blend);
+}
+
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = read_mem(a, ix + 0);
@@ -270,6 +297,10 @@
     return CmdImage_read(a, CmdImageRef(ref.offset + 4));
 }
 
+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) {
+    return CmdEndClip_read(a, CmdEndClipRef(ref.offset + 4));
+}
+
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) {
     return CmdJump_read(a, CmdJumpRef(ref.offset + 4));
 }
@@ -316,8 +347,9 @@
     write_mem(a, ref.offset >> 2, Cmd_BeginClip);
 }
 
-void Cmd_EndClip_write(Alloc a, CmdRef ref) {
+void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) {
     write_mem(a, ref.offset >> 2, Cmd_EndClip);
+    CmdEndClip_write(a, CmdEndClipRef(ref.offset + 4), s);
 }
 
 void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
diff --git a/piet-gpu/shader/scene.h b/piet-gpu/shader/scene.h
index 254d4aa..3e74b69 100644
--- a/piet-gpu/shader/scene.h
+++ b/piet-gpu/shader/scene.h
@@ -138,9 +138,10 @@
 
 struct Clip {
     vec4 bbox;
+    uint blend;
 };
 
-#define Clip_size 16
+#define Clip_size 20
 
 ClipRef Clip_index(ClipRef ref, uint index) {
     return ClipRef(ref.offset + index * Clip_size);
@@ -286,6 +287,7 @@
     uint raw3 = scene[ix + 3];
     Clip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.blend = scene[ix + 4];
     return s;
 }
 
diff --git a/piet-gpu/src/blend.rs b/piet-gpu/src/blend.rs
new file mode 100644
index 0000000..6f1e791
--- /dev/null
+++ b/piet-gpu/src/blend.rs
@@ -0,0 +1,99 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+#[repr(C)]
+pub enum BlendMode {
+    Normal = 0,
+    Multiply = 1,
+    Screen = 2,
+    Overlay = 3,
+    Darken = 4,
+    Lighten = 5,
+    ColorDodge = 6,
+    ColorBurn = 7,
+    HardLight = 8,
+    SoftLight = 9,
+    Difference = 10,
+    Exclusion = 11,
+    Hue = 12,
+    Saturation = 13,
+    Color = 14,
+    Luminosity = 15,
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+#[repr(C)]
+pub enum CompositionMode {
+    Clear = 0,
+    Copy = 1,
+    Dest = 2,
+    SrcOver = 3,
+    DestOver = 4,
+    SrcIn = 5,
+    DestIn = 6,
+    SrcOut = 7,
+    DestOut = 8,
+    SrcAtop = 9,
+    DestAtop = 10,
+    Xor = 11,
+    Plus = 12,
+    PlusDarker = 13,
+    PlusLighter = 14,
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub struct Blend {
+    pub mode: BlendMode,
+    pub composition_mode: CompositionMode,
+}
+
+impl Blend {
+    pub fn new(mode: BlendMode, composition_mode: CompositionMode) -> Self {
+        Self { mode, composition_mode }
+    }
+
+    pub(crate) fn pack(&self) -> u32 {
+        (self.mode as u32) << 8 | self.composition_mode as u32
+    }
+}
+
+impl Default for Blend {
+    fn default() -> Self {
+        Self {
+            mode: BlendMode::Normal,
+            composition_mode: CompositionMode::SrcOver,
+        }
+    }
+}
+
+impl From<BlendMode> for Blend {
+    fn from(mode: BlendMode) -> Self {
+        Self {
+            mode,
+            composition_mode: CompositionMode::SrcOver,
+        }
+    }
+}
+
+impl From<CompositionMode> for Blend {
+    fn from(mode: CompositionMode) -> Self {
+        Self {
+            mode: BlendMode::Normal,
+            composition_mode: mode,
+        }
+    }
+}
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index 767f4ba..c24615e 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -16,6 +16,7 @@
 
 //! Low-level scene encoding.
 
+use crate::Blend;
 use bytemuck::{Pod, Zeroable};
 use piet_gpu_hal::BufWrite;
 
@@ -87,7 +88,8 @@
 pub struct Clip {
     tag: u32,
     bbox: [f32; 4],
-    padding: [u32; 4],
+    blend: u32,
+    padding: [u32; 3],
 }
 
 impl Encoder {
@@ -151,10 +153,11 @@
     }
 
     /// Start a clip and return a save point to be filled in later.
-    pub fn begin_clip(&mut self) -> usize {
+    pub fn begin_clip(&mut self, blend: Option<Blend>) -> usize {
         let saved = self.drawobj_stream.len();
         let element = Clip {
             tag: ELEMENT_BEGINCLIP,
+            blend: blend.unwrap_or(Blend::default()).pack(),
             ..Default::default()
         };
         self.drawobj_stream.extend(bytemuck::bytes_of(&element));
@@ -162,10 +165,11 @@
         saved
     }
 
-    pub fn end_clip(&mut self, bbox: [f32; 4], save_point: usize) {
+    pub fn end_clip(&mut self, bbox: [f32; 4], blend: Option<Blend>, save_point: usize) {
         let element = Clip {
             tag: ELEMENT_ENDCLIP,
             bbox,
+            blend: blend.unwrap_or(Blend::default()).pack(),
             ..Default::default()
         };
         self.drawobj_stream[save_point + 4..save_point + 20]
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index b8b7532..bd26d45 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -1,3 +1,4 @@
+mod blend;
 mod encoder;
 pub mod glyph_render;
 mod gradient;
@@ -9,6 +10,7 @@
 
 use std::convert::TryInto;
 
+pub use blend::{Blend, BlendMode, CompositionMode};
 pub use render_ctx::PietGpuRenderContext;
 
 use piet::kurbo::Vec2;
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index ef0a3a7..1fe1ce9 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -16,6 +16,7 @@
 use crate::gradient::{LinearGradient, RampCache};
 use crate::text::Font;
 pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
+use crate::Blend;
 
 pub struct PietGpuImage;
 
@@ -66,6 +67,7 @@
     /// Byte offset of BeginClip element in element vec, for bbox fixup.
     save_point: usize,
     bbox: Option<Rect>,
+    blend: Option<Blend>,
 }
 
 const TOLERANCE: f64 = 0.25;
@@ -230,13 +232,14 @@
         self.encode_linewidth(-1.0);
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
-        let save_point = self.new_encoder.begin_clip();
+        let save_point = self.new_encoder.begin_clip(None);
         if self.clip_stack.len() >= MAX_BLEND_STACK {
             panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
         }
         self.clip_stack.push(ClipElement {
             bbox: None,
             save_point,
+            blend: None,
         });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
@@ -333,6 +336,25 @@
 }
 
 impl PietGpuRenderContext {
+    pub fn blend(&mut self, shape: impl Shape, blend: Blend) {
+        self.encode_linewidth(-1.0);
+        let path = shape.path_elements(TOLERANCE);
+        self.encode_path(path, true);
+        let save_point = self.new_encoder.begin_clip(Some(blend));
+        if self.clip_stack.len() >= MAX_BLEND_STACK {
+            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
+        }
+        self.clip_stack.push(ClipElement {
+            bbox: None,
+            save_point,
+            blend: Some(blend),
+        });
+        self.accumulate_bbox(|| shape.bounding_box());
+        if let Some(tos) = self.state_stack.last_mut() {
+            tos.n_clip += 1;
+        }
+    }
+
     fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
         if is_fill {
             self.encode_path_inner(
@@ -386,7 +408,7 @@
         let tos = self.clip_stack.pop().unwrap();
         let bbox = tos.bbox.unwrap_or_default();
         let bbox_f32_4 = rect_to_f32_4(bbox);
-        self.new_encoder.end_clip(bbox_f32_4, tos.save_point);
+        self.new_encoder.end_clip(bbox_f32_4, tos.blend, tos.save_point);
         if let Some(bbox) = tos.bbox {
             self.union_bbox(bbox);
         }
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index 47ace66..118b727 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -2,7 +2,8 @@
 
 use rand::{Rng, RngCore};
 
-use piet::kurbo::{BezPath, Circle, Line, Point, Rect, Shape};
+use crate::{PietGpuRenderContext, Blend, BlendMode, CompositionMode};
+use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
     Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
 };
@@ -11,6 +12,18 @@
 
 const N_CIRCLES: usize = 0;
 
+pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend) {
+    rc.fill(
+        Rect::new(400., 400., 800., 800.),
+        &Color::rgb8(0, 0, 200),
+    );
+    rc.save().unwrap();
+    rc.blend(Rect::new(0., 0., 1000., 1000.), blend);
+    rc.transform(Affine::translate(Vec2::new(600., 600.)) * Affine::rotate(0.01 * i as f64));
+    rc.fill(Rect::new(0., 0., 400., 400.), &Color::rgba8(255, 0, 0, 255));
+    rc.restore().unwrap();
+}
+
 pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
     let xml_str = std::fs::read_to_string(filename).unwrap();
     let start = std::time::Instant::now();