Merge pull request #172 from linebender/query_pool_size

Fix query pool size
diff --git a/Cargo.lock b/Cargo.lock
index 1f80fa3..1b9a6e3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -43,6 +43,12 @@
 checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
 
 [[package]]
+name = "arrayvec"
+version = "0.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
+
+[[package]]
 name = "ash"
 version = "0.33.3+1.2.191"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -596,7 +602,16 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "16cb54cd28cb3d2e964d9444ca185676a94fd9b7cce5f02b22c717947ed8e9a2"
 dependencies = [
- "arrayvec",
+ "arrayvec 0.5.2",
+]
+
+[[package]]
+name = "kurbo"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a53776d271cfb873b17c618af0298445c88afc52837f3e948fa3fafd131f449"
+dependencies = [
+ "arrayvec 0.7.2",
 ]
 
 [[package]]
@@ -755,8 +770,7 @@
 [[package]]
 name = "moscato"
 version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8372f6cdc8b2c431750a9c4edbc8d9c511ef1a68472aaa02500493414a407c64"
+source = "git+https://github.com/dfrg/pinot#59db153ff83420449a909dfaace18466bddbf814"
 dependencies = [
  "pinot",
 ]
@@ -954,7 +968,7 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f00543608fb5ee6063f5ff1259246ae23073c1a5e413e643d0469da3d4b7b4de"
 dependencies = [
- "kurbo",
+ "kurbo 0.7.1",
  "unic-bidi",
 ]
 
@@ -1012,7 +1026,7 @@
 dependencies = [
  "bytemuck",
  "clap",
- "kurbo",
+ "kurbo 0.7.1",
  "piet-gpu",
  "piet-gpu-hal",
  "rand",
@@ -1031,8 +1045,8 @@
 version = "0.1.0"
 dependencies = [
  "bytemuck",
+ "kurbo 0.8.3",
  "moscato",
- "pinot",
  "smallvec",
 ]
 
diff --git a/pgpu-render/src/lib.rs b/pgpu-render/src/lib.rs
index 7d4c60b..50462e7 100644
--- a/pgpu-render/src/lib.rs
+++ b/pgpu-render/src/lib.rs
@@ -215,7 +215,10 @@
 /// Computes the bounding box for the glyph after applying the specified
 /// transform.
 #[no_mangle]
-pub unsafe extern "C" fn pgpu_glyph_bbox(glyph: *const PgpuGlyph, transform: &[f32; 6]) -> PgpuRect {
+pub unsafe extern "C" fn pgpu_glyph_bbox(
+    glyph: *const PgpuGlyph,
+    transform: &[f32; 6],
+) -> PgpuRect {
     let transform = piet_scene::geometry::Affine::new(transform);
     let rect = (*glyph).bbox(Some(transform));
     PgpuRect {
diff --git a/pgpu-render/src/render.rs b/pgpu-render/src/render.rs
index 361ef42..5b5d328 100644
--- a/pgpu-render/src/render.rs
+++ b/pgpu-render/src/render.rs
@@ -16,8 +16,8 @@
 
 use piet_gpu::{EncodedSceneRef, PixelFormat, RenderConfig};
 use piet_gpu_hal::{QueryPool, Session};
-use piet_scene::glyph::pinot::{types::Tag, FontDataRef};
 use piet_scene::geometry::{Affine, Rect};
+use piet_scene::glyph::pinot::{types::Tag, FontDataRef};
 use piet_scene::glyph::{GlyphContext, GlyphProvider};
 use piet_scene::resource::ResourceContext;
 use piet_scene::scene::{Fragment, Scene};
@@ -214,7 +214,12 @@
 impl PgpuGlyph {
     pub fn bbox(&self, transform: Option<Affine>) -> Rect {
         if let Some(transform) = &transform {
-            Rect::from_points(self.fragment.points().iter().map(|p| p.transform(transform)))
+            Rect::from_points(
+                self.fragment
+                    .points()
+                    .iter()
+                    .map(|p| p.transform(transform)),
+            )
         } else {
             Rect::from_points(self.fragment.points())
         }
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 4c785eb..79914bf 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -249,7 +249,8 @@
             println!("parsing time: {:?}", start.elapsed());
             test_scenes::render_svg(&mut ctx, &svg);
         } else {
-            test_scenes::render_scene(&mut ctx);
+            //test_scenes::render_scene(&mut ctx);
+            test_scenes::render_blend_grid(&mut ctx);
         }
 
         let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?;
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index 3341d3a..8f84da4 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -125,7 +125,7 @@
                     }
 
                     let mut ctx = PietGpuRenderContext::new();
-                    let test_blend = false;
+                    let test_blend = true;
                     if let Some(svg) = &svg {
                         test_scenes::render_svg(&mut ctx, svg);
                     } else if test_blend {
diff --git a/piet-gpu/shader/blend.h b/piet-gpu/shader/blend.h
index 1ac4bd6..7366006 100644
--- a/piet-gpu/shader/blend.h
+++ b/piet-gpu/shader/blend.h
@@ -18,6 +18,7 @@
 #define Blend_Saturation 13
 #define Blend_Color 14
 #define Blend_Luminosity 15
+#define Blend_Clip 128
 
 vec3 screen(vec3 cb, vec3 cs) {
 	return cb + cs - (cb * cs);
@@ -45,7 +46,7 @@
 	return mix(
 		screen(cb, 2.0 * cs - 1.0),
 		cb * 2.0 * cs, 
-		vec3(lessThanEqual(cs, vec3(0.5)))
+		lessThanEqual(cs, vec3(0.5))
 	);
 }
 
@@ -53,12 +54,12 @@
 	vec3 d = mix(
 		sqrt(cb),
 		((16.0 * cb - vec3(12.0)) * cb + vec3(4.0)) * cb,
-		vec3(lessThanEqual(cb, vec3(0.25)))
+		lessThanEqual(cb, vec3(0.25))
 	);
 	return mix(
 		cb + (2.0 * cs - vec3(1.0)) * (d - cb),
 		cb - (vec3(1.0) - 2.0 * cs) * cb * (vec3(1.0) - cb),
-		vec3(lessThanEqual(cs, vec3(0.5)))
+		lessThanEqual(cs, vec3(0.5))
 	);
 }
 
@@ -122,6 +123,8 @@
     return c;
 }
 
+// Blends two RGB colors together. The colors are assumed to be in sRGB
+// color space, and this function does not take alpha into account.
 vec3 mix_blend(vec3 cb, vec3 cs, uint mode) {
 	vec3 b = vec3(0.0);
 	switch (mode) {
@@ -190,9 +193,10 @@
 #define Comp_DestAtop 10
 #define Comp_Xor 11
 #define Comp_Plus 12
-#define Comp_PlusDarker 13
-#define Comp_PlusLighter 14
+#define Comp_PlusLighter 13
 
+// Apply general compositing operation.
+// Inputs are separated colors and alpha, output is premultiplied.
 vec4 mix_compose(vec3 cb, vec3 cs, float ab, float as, uint mode) {
 	float fa = 0.0;
 	float fb = 0.0;
@@ -245,16 +249,43 @@
 		fa = 1.0;
 		fb = 1.0;
 		break;
-	case Comp_PlusDarker:
-		return vec4(max(vec4(0.0), 1.0 - as * vec4(cs, as) + 1.0 - ab * vec4(cb, ab)).xyz, 
-			    max(0.0, 1.0 - as + 1.0 - ab));
 	case Comp_PlusLighter:
-		return vec4(min(vec4(1.0), as * vec4(cs, as) + ab * vec4(cb, ab)).xyz,
-			    min(1.0, as + ab));
+		return min(vec4(1.0), vec4(as * cs + ab * cb, as + ab));
 	default:
 		break;
 	}
-	return as * fa * vec4(cs, as) + ab * fb * vec4(cb, ab);
+	float as_fa = as * fa;
+	float ab_fb = ab * fb;
+	vec3 co = as_fa * cs + ab_fb * cb;
+	return vec4(co, as_fa + ab_fb);
 }
 
 #define BlendComp_default (Blend_Normal << 8 | Comp_SrcOver)
+#define BlendComp_clip (Blend_Clip << 8 | Comp_SrcOver)
+
+// This is added to alpha to prevent divide-by-zero
+#define EPSILON 1e-15
+
+// Apply blending and composition. Both input and output colors are
+// premultiplied RGB.
+vec4 mix_blend_compose(vec4 backdrop, vec4 src, uint mode) {
+	if ((mode & 0x7fff) == BlendComp_default) {
+		// Both normal+src_over blend and clip case
+		return backdrop * (1.0 - src.a) + src;
+	}
+	// Un-premultiply colors for blending
+	float inv_src_a = 1.0 / (src.a + EPSILON);
+	vec3 cs = src.rgb * inv_src_a;
+	float inv_backdrop_a = 1.0 / (backdrop.a + EPSILON);
+	vec3 cb = backdrop.rgb * inv_backdrop_a;
+	uint blend_mode = mode >> 8;
+	vec3 blended = mix_blend(cb, cs, blend_mode);
+	cs = mix(cs, blended, backdrop.a);
+	uint comp_mode = mode & 0xff;
+	if (comp_mode == Comp_SrcOver) {
+		vec3 co = mix(backdrop.rgb, cs, src.a);
+		return vec4(co, src.a + backdrop.a * (1 - src.a));
+	} else {
+		return mix_compose(cb, cs, backdrop.a, src.a, comp_mode);
+	}
+}
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 60e5582..09b0683 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -58,7 +58,7 @@
 build gen/kernel4.dxil: dxil gen/kernel4.hlsl
 build gen/kernel4.msl: msl gen/kernel4.spv
 
-build gen/kernel4_gray.spv: glsl kernel4.comp | ptcl.h setup.h mem.h
+build gen/kernel4_gray.spv: glsl kernel4.comp | blend.h ptcl.h setup.h mem.h
   flags = -DGRAY
 build gen/kernel4_gray.hlsl: hlsl gen/kernel4_gray.spv
 build gen/kernel4_gray.dxil: dxil gen/kernel4_gray.hlsl
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 3abb2e0..1b3f252 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -303,7 +303,7 @@
                     uint scene_offset = memory[drawmonoid_base + 2];
                     uint dd = drawdata_start + (scene_offset >> 2);
                     uint blend = scene[dd];
-                    is_blend = (blend != BlendComp_default);
+                    is_blend = (blend != BlendComp_clip);
                 }
                 include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
                     || is_blend;
diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
index 0fb9622..df2be88 100644
--- a/piet-gpu/shader/gen/backdrop.dxil
+++ b/piet-gpu/shader/gen/backdrop.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
index e24a6d3..81f9b65 100644
--- a/piet-gpu/shader/gen/backdrop_lg.dxil
+++ b/piet-gpu/shader/gen/backdrop_lg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
index 6655b7f..6b3efaf 100644
--- a/piet-gpu/shader/gen/bbox_clear.dxil
+++ b/piet-gpu/shader/gen/bbox_clear.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil
index 29a158e..b681a65 100644
--- a/piet-gpu/shader/gen/clip_leaf.dxil
+++ b/piet-gpu/shader/gen/clip_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil
index 0dff71b..0ccaac9 100644
--- a/piet-gpu/shader/gen/clip_reduce.dxil
+++ b/piet-gpu/shader/gen/clip_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index fdab444..c91fcdf 100644
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index 04529bb..0331e33 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -919,26 +919,26 @@
                     uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                     uint dd = drawdata_start + (scene_offset >> uint(2));
                     uint blend = _1372.Load(dd * 4 + 0);
-                    is_blend = blend != 3u;
+                    is_blend = blend != 32771u;
                 }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1693 = tile.tile.offset != 0u;
+                bool _1702;
+                if (!_1693)
                 {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1702 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1701 = _1692;
+                    _1702 = _1693;
                 }
-                include_tile = _1701 || is_blend;
+                include_tile = _1702 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1723;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
+                uint _1724;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1724);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -967,9 +967,9 @@
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1801 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1800;
+                TileRef param_28 = _1801;
                 Tile tile_1 = Tile_read(param_27, param_28);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
                 uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@@ -984,11 +984,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1849 = alloc_cmd(param_29, param_30, param_31);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1849)
                         {
                             break;
                         }
@@ -999,10 +999,10 @@
                         write_fill(param_32, param_33, param_34, param_35);
                         cmd_ref = param_33;
                         uint rgba = _1372.Load(dd_1 * 4 + 0);
-                        CmdColor _1871 = { rgba };
+                        CmdColor _1872 = { rgba };
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1871;
+                        CmdColor param_38 = _1872;
                         Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
@@ -1012,11 +1012,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1890 = alloc_cmd(param_39, param_40, param_41);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1890)
                         {
                             break;
                         }
@@ -1043,11 +1043,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1954 = alloc_cmd(param_49, param_50, param_51);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1954)
                         {
                             break;
                         }
@@ -1077,11 +1077,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61);
+                        bool _2060 = alloc_cmd(param_59, param_60, param_61);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2060)
                         {
                             break;
                         }
@@ -1094,27 +1094,27 @@
                         uint index = _1372.Load(dd_1 * 4 + 0);
                         uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2098 = { index, offset_1 };
+                        CmdImage _2099 = { index, offset_1 };
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = _2098;
+                        CmdImage param_68 = _2099;
                         Cmd_Image_write(param_66, param_67, param_68);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2113 = tile_1.tile.offset == 0u;
+                        bool _2119;
+                        if (_2113)
                         {
-                            _2118 = tile_1.backdrop == 0;
+                            _2119 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2118 = _2112;
+                            _2119 = _2113;
                         }
-                        if (_2118)
+                        if (_2119)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1123,11 +1123,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71);
+                            bool _2131 = alloc_cmd(param_69, param_70, param_71);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2131)
                             {
                                 break;
                             }
@@ -1145,11 +1145,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76);
+                        bool _2159 = alloc_cmd(param_74, param_75, param_76);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2159)
                         {
                             break;
                         }
@@ -1160,10 +1160,10 @@
                         write_fill(param_77, param_78, param_79, param_80);
                         cmd_ref = param_78;
                         uint blend_1 = _1372.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2181 = { blend_1 };
+                        CmdEndClip _2182 = { blend_1 };
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = _2181;
+                        CmdEndClip param_83 = _2182;
                         Cmd_EndClip_write(param_81, param_82, param_83);
                         cmd_ref.offset += 8u;
                         break;
@@ -1198,17 +1198,17 @@
             break;
         }
     }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
-    bool _2237;
-    if (_2228)
+    bool _2229 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2238;
+    if (_2229)
     {
-        _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
+        _2238 = (bin_tile_y + tile_y) < _1005.Load(12);
     }
     else
     {
-        _2237 = _2228;
+        _2238 = _2229;
     }
-    if (_2237)
+    if (_2238)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 55812d4..854d243 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -942,25 +942,25 @@
                     uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                     uint dd = drawdata_start + (scene_offset >> uint(2));
                     uint blend = _1372.scene[dd];
-                    is_blend = blend != 3u;
+                    is_blend = blend != 32771u;
                 }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1693 = tile.tile.offset != 0u;
+                bool _1702;
+                if (!_1693)
                 {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1702 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1701 = _1692;
+                    _1702 = _1693;
                 }
-                include_tile = _1701 || is_blend;
+                include_tile = _1702 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1724 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -1005,11 +1005,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
+                        bool _1849 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1849)
                         {
                             break;
                         }
@@ -1032,11 +1032,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
+                        bool _1890 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1890)
                         {
                             break;
                         }
@@ -1063,11 +1063,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
+                        bool _1954 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1954)
                         {
                             break;
                         }
@@ -1097,11 +1097,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        bool _2060 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2060)
                         {
                             break;
                         }
@@ -1123,17 +1123,17 @@
                     }
                     case 5u:
                     {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2113 = tile_1.tile.offset == 0u;
+                        bool _2119;
+                        if (_2113)
                         {
-                            _2118 = tile_1.backdrop == 0;
+                            _2119 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2118 = _2112;
+                            _2119 = _2113;
                         }
-                        if (_2118)
+                        if (_2119)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1142,11 +1142,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            bool _2131 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2131)
                             {
                                 break;
                             }
@@ -1164,11 +1164,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        bool _2159 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2159)
                         {
                             break;
                         }
@@ -1216,17 +1216,17 @@
             break;
         }
     }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
-    bool _2237;
-    if (_2228)
+    bool _2229 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2238;
+    if (_2229)
     {
-        _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
+        _2238 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
     }
     else
     {
-        _2237 = _2228;
+        _2238 = _2229;
     }
-    if (_2237)
+    if (_2238)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index 6d33ee7..56a87e5 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index e6eccc1..7399fe4 100644
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 92fe05b..4839db2 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -161,8 +161,8 @@
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _291 : register(u0, space0);
-ByteAddressBuffer _1666 : register(t1, space0);
+RWByteAddressBuffer _297 : register(u0, space0);
+ByteAddressBuffer _1681 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@@ -189,8 +189,8 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _304 = { a.offset + offset };
-    return _304;
+    Alloc _310 = { a.offset + offset };
+    return _310;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -206,7 +206,7 @@
     {
         return 0u;
     }
-    uint v = _291.Load(offset * 4 + 8);
+    uint v = _297.Load(offset * 4 + 8);
     return v;
 }
 
@@ -215,8 +215,8 @@
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _663;
+    CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _669;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -236,9 +236,9 @@
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _679 = { ref.offset + 4u };
+    CmdStrokeRef _685 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _679;
+    CmdStrokeRef param_1 = _685;
     return CmdStroke_read(param, param_1);
 }
 
@@ -274,8 +274,8 @@
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _820 = { raw5 };
-    s.next = _820;
+    TileSegRef _826 = { raw5 };
+    s.next = _826;
     return s;
 }
 
@@ -301,9 +301,9 @@
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _669 = { ref.offset + 4u };
+    CmdFillRef _675 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _669;
+    CmdFillRef param_1 = _675;
     return CmdFill_read(param, param_1);
 }
 
@@ -320,9 +320,9 @@
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _689 = { ref.offset + 4u };
+    CmdAlphaRef _695 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _689;
+    CmdAlphaRef param_1 = _695;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -339,18 +339,15 @@
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _699 = { ref.offset + 4u };
+    CmdColorRef _705 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _699;
+    CmdColorRef param_1 = _705;
     return CmdColor_read(param, param_1);
 }
 
 float3 fromsRGB(float3 srgb)
 {
-    bool3 cutoff = bool3(srgb.x >= 0.040449999272823333740234375f.xxx.x, srgb.y >= 0.040449999272823333740234375f.xxx.y, srgb.z >= 0.040449999272823333740234375f.xxx.z);
-    float3 below = srgb / 12.9200000762939453125f.xxx;
-    float3 above = pow((srgb + 0.054999999701976776123046875f.xxx) / 1.05499994754791259765625f.xxx, 2.400000095367431640625f.xxx);
-    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+    return srgb;
 }
 
 float4 unpacksRGB(uint srgba)
@@ -385,9 +382,9 @@
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _709 = { ref.offset + 4u };
+    CmdLinGradRef _715 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _709;
+    CmdLinGradRef param_1 = _715;
     return CmdLinGrad_read(param, param_1);
 }
 
@@ -439,9 +436,9 @@
 
 CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
 {
-    CmdRadGradRef _719 = { ref.offset + 4u };
+    CmdRadGradRef _725 = { ref.offset + 4u };
     Alloc param = a;
-    CmdRadGradRef param_1 = _719;
+    CmdRadGradRef param_1 = _725;
     return CmdRadGrad_read(param, param_1);
 }
 
@@ -462,9 +459,9 @@
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _729 = { ref.offset + 4u };
+    CmdImageRef _735 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _729;
+    CmdImageRef param_1 = _735;
     return CmdImage_read(param, param_1);
 }
 
@@ -477,10 +474,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1638 = fromsRGB(param_1);
-        fg_rgba.x = _1638.x;
-        fg_rgba.y = _1638.y;
-        fg_rgba.z = _1638.z;
+        float3 _1653 = fromsRGB(param_1);
+        fg_rgba.x = _1653.x;
+        fg_rgba.y = _1653.y;
+        fg_rgba.z = _1653.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -488,10 +485,7 @@
 
 float3 tosRGB(float3 rgb)
 {
-    bool3 cutoff = bool3(rgb.x >= 0.003130800090730190277099609375f.xxx.x, rgb.y >= 0.003130800090730190277099609375f.xxx.y, rgb.z >= 0.003130800090730190277099609375f.xxx.z);
-    float3 below = 12.9200000762939453125f.xxx * rgb;
-    float3 above = (1.05499994754791259765625f.xxx * pow(rgb, 0.416660010814666748046875f.xxx)) - 0.054999999701976776123046875f.xxx;
-    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+    return rgb;
 }
 
 uint packsRGB(inout float4 rgba)
@@ -514,9 +508,9 @@
 
 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _739 = { ref.offset + 4u };
+    CmdEndClipRef _745 = { ref.offset + 4u };
     Alloc param = a;
-    CmdEndClipRef param_1 = _739;
+    CmdEndClipRef param_1 = _745;
     return CmdEndClip_read(param, param_1);
 }
 
@@ -529,7 +523,10 @@
 {
     float3 param = cb;
     float3 param_1 = (cs * 2.0f) - 1.0f.xxx;
-    return lerp(screen(param, param_1), (cb * 2.0f) * cs, float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+    float3 _889 = screen(param, param_1);
+    float3 _893 = (cb * 2.0f) * cs;
+    bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+    return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? _893.z : _889.z);
 }
 
 float color_dodge(float cb, float cs)
@@ -572,8 +569,14 @@
 
 float3 soft_light(float3 cb, float3 cs)
 {
-    float3 d = lerp(sqrt(cb), ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb, float3(bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z)));
-    return lerp(cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)), cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)), float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+    float3 _904 = sqrt(cb);
+    float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb;
+    bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z);
+    float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z);
+    float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb));
+    float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb));
+    bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+    return float3(_944.x ? _942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? _942.z : _932.z);
 }
 
 float sat(float3 c)
@@ -706,8 +709,8 @@
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _1046 = clip_color(param_1);
-    return _1046;
+    float3 _1048 = clip_color(param_1);
+    return _1048;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -795,9 +798,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1337 = set_sat(param_21, param_22);
+            float3 _1340 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1337;
+            float3 param_24 = _1340;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -807,9 +810,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1351 = set_sat(param_27, param_28);
+            float3 _1354 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1351;
+            float3 param_30 = _1354;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -919,18 +922,50 @@
         }
         case 13u:
         {
-            return float4(max(0.0f.xxxx, ((1.0f.xxxx - (float4(cs, as) * as)) + 1.0f.xxxx) - (float4(cb, ab) * ab)).xyz, max(0.0f, ((1.0f - as) + 1.0f) - ab));
-        }
-        case 14u:
-        {
-            return float4(min(1.0f.xxxx, (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, min(1.0f, as + ab));
+            return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
         default:
         {
             break;
         }
     }
-    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+    float as_fa = as * fa;
+    float ab_fb = ab * fb;
+    float3 co = (cs * as_fa) + (cb * ab_fb);
+    return float4(co, as_fa + ab_fb);
+}
+
+float4 mix_blend_compose(float4 backdrop, float4 src, uint mode)
+{
+    if ((mode & 32767u) == 3u)
+    {
+        return (backdrop * (1.0f - src.w)) + src;
+    }
+    float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f);
+    float3 cs = src.xyz * inv_src_a;
+    float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f);
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+    uint blend_mode = mode >> uint(8);
+    float3 param = cb;
+    float3 param_1 = cs;
+    uint param_2 = blend_mode;
+    float3 blended = mix_blend(param, param_1, param_2);
+    cs = lerp(cs, blended, backdrop.w.xxx);
+    uint comp_mode = mode & 255u;
+    if (comp_mode == 3u)
+    {
+        float3 co = lerp(backdrop.xyz, cs, src.w.xxx);
+        return float4(co, src.w + (backdrop.w * (1.0f - src.w)));
+    }
+    else
+    {
+        float3 param_3 = cb;
+        float3 param_4 = cs;
+        float param_5 = backdrop.w;
+        float param_6 = src.w;
+        uint param_7 = comp_mode;
+        return mix_compose(param_3, param_4, param_5, param_6, param_7);
+    }
 }
 
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
@@ -946,24 +981,24 @@
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _749 = { ref.offset + 4u };
+    CmdJumpRef _755 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _749;
+    CmdJumpRef param_1 = _755;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1681;
-    _1681.offset = _1666.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1696;
+    _1696.offset = _1681.Load(24);
     Alloc param;
-    param.offset = _1681.offset;
+    param.offset = _1696.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1690 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1690;
+    CmdRef _1705 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1705;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -972,7 +1007,7 @@
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _291.Load(4) == 0u;
+    bool mem_ok = _297.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -997,8 +1032,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1784 = { stroke.tile_ref };
-                tile_seg_ref = _1784;
+                TileSegRef _1800 = { stroke.tile_ref };
+                tile_seg_ref = _1800;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1034,8 +1069,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1904 = { fill.tile_ref };
-                tile_seg_ref = _1904;
+                TileSegRef _1920 = { fill.tile_ref };
+                tile_seg_ref = _1920;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1124,10 +1159,10 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2238 = fromsRGB(param_29);
-                    fg_rgba.x = _2238.x;
-                    fg_rgba.y = _2238.y;
-                    fg_rgba.z = _2238.z;
+                    float3 _2254 = fromsRGB(param_29);
+                    fg_rgba.x = _2254.x;
+                    fg_rgba.y = _2254.y;
+                    fg_rgba.z = _2254.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1150,10 +1185,10 @@
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2348 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2348.x;
-                    fg_rgba_1.y = _2348.y;
-                    fg_rgba_1.z = _2348.z;
+                    float3 _2364 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2364.x;
+                    fg_rgba_1.y = _2364.y;
+                    fg_rgba_1.z = _2364.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1167,9 +1202,9 @@
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2391[8];
-                fillImage(_2391, param_36, param_37);
-                float4 img[8] = _2391;
+                float4 _2407[8];
+                fillImage(_2407, param_36, param_37);
+                float4 img[8] = _2407;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1184,8 +1219,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2454 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2454;
+                    uint _2470 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2470;
                     rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
@@ -1206,32 +1241,20 @@
                     uint param_41 = blend_stack[d_3][k_13];
                     float4 bg = unpacksRGB(param_41);
                     float4 fg_1 = rgba[k_13] * area[k_13];
-                    float3 param_42 = bg.xyz;
-                    float3 param_43 = fg_1.xyz;
-                    uint param_44 = blend_mode;
-                    float3 blend = mix_blend(param_42, param_43, param_44);
-                    float4 _2521 = fg_1;
-                    float _2525 = fg_1.w;
-                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2532.x;
-                    fg_1.y = _2532.y;
-                    fg_1.z = _2532.z;
-                    float3 param_45 = bg.xyz;
-                    float3 param_46 = fg_1.xyz;
-                    float param_47 = bg.w;
-                    float param_48 = fg_1.w;
-                    uint param_49 = comp_mode;
-                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
+                    float4 param_42 = bg;
+                    float4 param_43 = fg_1;
+                    uint param_44 = end_clip.blend;
+                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_50 = cmd_alloc;
-                CmdRef param_51 = cmd_ref;
-                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
-                cmd_ref = _2569;
+                Alloc param_45 = cmd_alloc;
+                CmdRef param_46 = cmd_ref;
+                CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref };
+                cmd_ref = _2548;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1239,9 +1262,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_52 = i_1;
-        float3 param_53 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_52))] = float4(tosRGB(param_53), rgba[i_1].w);
+        uint param_47 = i_1;
+        float3 param_48 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 6489563..4caeaf0 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -237,7 +237,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -245,29 +245,29 @@
     {
         return 0u;
     }
-    uint v = v_291.memory[offset];
+    uint v = v_297.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_291);
+    uint tag_and_flags = read_mem(param, param_1, v_297);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -275,11 +275,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_291);
+    return CmdStroke_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -291,27 +291,27 @@
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_291);
+    uint raw4 = read_mem(param_8, param_9, v_297);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_291);
+    uint raw5 = read_mem(param_10, param_11, v_297);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -327,15 +327,15 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -343,60 +343,57 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_291);
+    return CmdFill_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_291);
+    return CmdAlpha_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_291);
+    return CmdColor_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
 float3 fromsRGB(thread const float3& srgb)
 {
-    bool3 cutoff = srgb >= float3(0.040449999272823333740234375);
-    float3 below = srgb / float3(12.9200000762939453125);
-    float3 above = pow((srgb + float3(0.054999999701976776123046875)) / float3(1.05499994754791259765625), float3(2.400000095367431640625));
-    return select(below, above, cutoff);
+    return srgb;
 }
 
 static inline __attribute__((always_inline))
@@ -408,21 +405,21 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -432,50 +429,50 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_291);
+    return CmdLinGrad_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_291);
+    uint raw4 = read_mem(param_8, param_9, v_297);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_291);
+    uint raw5 = read_mem(param_10, param_11, v_297);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_291);
+    uint raw6 = read_mem(param_12, param_13, v_297);
     Alloc param_14 = a;
     uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_291);
+    uint raw7 = read_mem(param_14, param_15, v_297);
     Alloc param_16 = a;
     uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_291);
+    uint raw8 = read_mem(param_16, param_17, v_297);
     Alloc param_18 = a;
     uint param_19 = ix + 9u;
-    uint raw9 = read_mem(param_18, param_19, v_291);
+    uint raw9 = read_mem(param_18, param_19, v_297);
     Alloc param_20 = a;
     uint param_21 = ix + 10u;
-    uint raw10 = read_mem(param_20, param_21, v_291);
+    uint raw10 = read_mem(param_20, param_21, v_297);
     CmdRadGrad s;
     s.index = raw0;
     s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
@@ -487,23 +484,23 @@
 }
 
 static inline __attribute__((always_inline))
-CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
-    return CmdRadGrad_read(param, param_1, v_291);
+    return CmdRadGrad_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -511,11 +508,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_291);
+    return CmdImage_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -528,10 +525,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1638 = fromsRGB(param_1);
-        fg_rgba.x = _1638.x;
-        fg_rgba.y = _1638.y;
-        fg_rgba.z = _1638.z;
+        float3 _1653 = fromsRGB(param_1);
+        fg_rgba.x = _1653.x;
+        fg_rgba.y = _1653.y;
+        fg_rgba.z = _1653.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -540,10 +537,7 @@
 static inline __attribute__((always_inline))
 float3 tosRGB(thread const float3& rgb)
 {
-    bool3 cutoff = rgb >= float3(0.003130800090730190277099609375);
-    float3 below = float3(12.9200000762939453125) * rgb;
-    float3 above = (float3(1.05499994754791259765625) * pow(rgb, float3(0.416660010814666748046875))) - float3(0.054999999701976776123046875);
-    return select(below, above, cutoff);
+    return rgb;
 }
 
 static inline __attribute__((always_inline))
@@ -555,23 +549,23 @@
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdEndClip s;
     s.blend = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_291);
+    return CmdEndClip_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -585,7 +579,7 @@
 {
     float3 param = cb;
     float3 param_1 = (cs * 2.0) - float3(1.0);
-    return mix(screen(param, param_1), (cb * 2.0) * cs, float3(cs <= float3(0.5)));
+    return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5));
 }
 
 static inline __attribute__((always_inline))
@@ -631,8 +625,8 @@
 static inline __attribute__((always_inline))
 float3 soft_light(thread const float3& cb, thread const float3& cs)
 {
-    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, float3(cb <= float3(0.25)));
-    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), float3(cs <= float3(0.5)));
+    float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25));
+    return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5));
 }
 
 static inline __attribute__((always_inline))
@@ -771,8 +765,8 @@
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _1046 = clip_color(param_1);
-    return _1046;
+    float3 _1048 = clip_color(param_1);
+    return _1048;
 }
 
 static inline __attribute__((always_inline))
@@ -861,9 +855,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1337 = set_sat(param_21, param_22);
+            float3 _1340 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1337;
+            float3 param_24 = _1340;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -873,9 +867,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1351 = set_sat(param_27, param_28);
+            float3 _1354 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1351;
+            float3 param_30 = _1354;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -986,45 +980,78 @@
         }
         case 13u:
         {
-            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
-        }
-        case 14u:
-        {
-            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+            return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
         default:
         {
             break;
         }
     }
-    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+    float as_fa = as * fa;
+    float ab_fb = ab * fb;
+    float3 co = (cs * as_fa) + (cb * ab_fb);
+    return float4(co, as_fa + ab_fb);
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
+float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode)
+{
+    if ((mode & 32767u) == 3u)
+    {
+        return (backdrop * (1.0 - src.w)) + src;
+    }
+    float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15);
+    float3 cs = src.xyz * inv_src_a;
+    float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15);
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+    uint blend_mode = mode >> uint(8);
+    float3 param = cb;
+    float3 param_1 = cs;
+    uint param_2 = blend_mode;
+    float3 blended = mix_blend(param, param_1, param_2);
+    cs = mix(cs, blended, float3(backdrop.w));
+    uint comp_mode = mode & 255u;
+    if (comp_mode == 3u)
+    {
+        float3 co = mix(backdrop.xyz, cs, float3(src.w));
+        return float4(co, src.w + (backdrop.w * (1.0 - src.w)));
+    }
+    else
+    {
+        float3 param_3 = cb;
+        float3 param_4 = cs;
+        float param_5 = backdrop.w;
+        float param_6 = src.w;
+        uint param_7 = comp_mode;
+        return mix_compose(param_3, param_4, param_5, param_6, param_7);
+    }
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_291);
+    return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1666.conf.ptcl_alloc.offset;
+    param.offset = _1681.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -1037,7 +1064,7 @@
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_291.mem_error == 0u;
+    bool mem_ok = v_297.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -1046,7 +1073,7 @@
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_297).tag;
         if (tag == 0u)
         {
             break;
@@ -1057,7 +1084,7 @@
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -1070,7 +1097,7 @@
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_297);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -1093,7 +1120,7 @@
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -1106,7 +1133,7 @@
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -1150,7 +1177,7 @@
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -1162,7 +1189,7 @@
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_297);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -1177,7 +1204,7 @@
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -1187,10 +1214,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2238 = fromsRGB(param_29);
-                    fg_rgba.x = _2238.x;
-                    fg_rgba.y = _2238.y;
-                    fg_rgba.z = _2238.z;
+                    float3 _2254 = fromsRGB(param_29);
+                    fg_rgba.x = _2254.x;
+                    fg_rgba.y = _2254.y;
+                    fg_rgba.z = _2254.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1201,7 +1228,7 @@
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
                     uint param_32 = k_10;
@@ -1213,10 +1240,10 @@
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2348 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2348.x;
-                    fg_rgba_1.y = _2348.y;
-                    fg_rgba_1.z = _2348.z;
+                    float3 _2364 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2364.x;
+                    fg_rgba_1.y = _2364.y;
+                    fg_rgba_1.z = _2364.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1227,7 +1254,7 @@
             {
                 Alloc param_34 = cmd_alloc;
                 CmdRef param_35 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
                 spvUnsafeArray<float4, 8> img;
@@ -1246,8 +1273,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2454 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2454;
+                    uint _2470 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2470;
                     rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
@@ -1258,7 +1285,7 @@
             {
                 Alloc param_39 = cmd_alloc;
                 CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
@@ -1268,31 +1295,19 @@
                     uint param_41 = blend_stack[d_3][k_13];
                     float4 bg = unpacksRGB(param_41);
                     float4 fg_1 = rgba[k_13] * area[k_13];
-                    float3 param_42 = bg.xyz;
-                    float3 param_43 = fg_1.xyz;
-                    uint param_44 = blend_mode;
-                    float3 blend = mix_blend(param_42, param_43, param_44);
-                    float4 _2521 = fg_1;
-                    float _2525 = fg_1.w;
-                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
-                    fg_1.x = _2532.x;
-                    fg_1.y = _2532.y;
-                    fg_1.z = _2532.z;
-                    float3 param_45 = bg.xyz;
-                    float3 param_46 = fg_1.xyz;
-                    float param_47 = bg.w;
-                    float param_48 = fg_1.w;
-                    uint param_49 = comp_mode;
-                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
+                    float4 param_42 = bg;
+                    float4 param_43 = fg_1;
+                    uint param_44 = end_clip.blend;
+                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_50 = cmd_alloc;
-                CmdRef param_51 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
+                Alloc param_45 = cmd_alloc;
+                CmdRef param_46 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1300,9 +1315,9 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_52 = i_1;
-        float3 param_53 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_53), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
+        uint param_47 = i_1;
+        float3 param_48 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index 7061263..f0e2963 100644
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index 046045f..7b7c19f 100644
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index 019a73c..5d9b88d 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -161,8 +161,8 @@
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _291 : register(u0, space0);
-ByteAddressBuffer _1666 : register(t1, space0);
+RWByteAddressBuffer _297 : register(u0, space0);
+ByteAddressBuffer _1681 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@@ -189,8 +189,8 @@
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _304 = { a.offset + offset };
-    return _304;
+    Alloc _310 = { a.offset + offset };
+    return _310;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -206,7 +206,7 @@
     {
         return 0u;
     }
-    uint v = _291.Load(offset * 4 + 8);
+    uint v = _297.Load(offset * 4 + 8);
     return v;
 }
 
@@ -215,8 +215,8 @@
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _663;
+    CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _669;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -236,9 +236,9 @@
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _679 = { ref.offset + 4u };
+    CmdStrokeRef _685 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _679;
+    CmdStrokeRef param_1 = _685;
     return CmdStroke_read(param, param_1);
 }
 
@@ -274,8 +274,8 @@
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _820 = { raw5 };
-    s.next = _820;
+    TileSegRef _826 = { raw5 };
+    s.next = _826;
     return s;
 }
 
@@ -301,9 +301,9 @@
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _669 = { ref.offset + 4u };
+    CmdFillRef _675 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _669;
+    CmdFillRef param_1 = _675;
     return CmdFill_read(param, param_1);
 }
 
@@ -320,9 +320,9 @@
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _689 = { ref.offset + 4u };
+    CmdAlphaRef _695 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _689;
+    CmdAlphaRef param_1 = _695;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -339,18 +339,15 @@
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _699 = { ref.offset + 4u };
+    CmdColorRef _705 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _699;
+    CmdColorRef param_1 = _705;
     return CmdColor_read(param, param_1);
 }
 
 float3 fromsRGB(float3 srgb)
 {
-    bool3 cutoff = bool3(srgb.x >= 0.040449999272823333740234375f.xxx.x, srgb.y >= 0.040449999272823333740234375f.xxx.y, srgb.z >= 0.040449999272823333740234375f.xxx.z);
-    float3 below = srgb / 12.9200000762939453125f.xxx;
-    float3 above = pow((srgb + 0.054999999701976776123046875f.xxx) / 1.05499994754791259765625f.xxx, 2.400000095367431640625f.xxx);
-    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+    return srgb;
 }
 
 float4 unpacksRGB(uint srgba)
@@ -385,9 +382,9 @@
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _709 = { ref.offset + 4u };
+    CmdLinGradRef _715 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _709;
+    CmdLinGradRef param_1 = _715;
     return CmdLinGrad_read(param, param_1);
 }
 
@@ -439,9 +436,9 @@
 
 CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
 {
-    CmdRadGradRef _719 = { ref.offset + 4u };
+    CmdRadGradRef _725 = { ref.offset + 4u };
     Alloc param = a;
-    CmdRadGradRef param_1 = _719;
+    CmdRadGradRef param_1 = _725;
     return CmdRadGrad_read(param, param_1);
 }
 
@@ -462,9 +459,9 @@
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _729 = { ref.offset + 4u };
+    CmdImageRef _735 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _729;
+    CmdImageRef param_1 = _735;
     return CmdImage_read(param, param_1);
 }
 
@@ -477,10 +474,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1638 = fromsRGB(param_1);
-        fg_rgba.x = _1638.x;
-        fg_rgba.y = _1638.y;
-        fg_rgba.z = _1638.z;
+        float3 _1653 = fromsRGB(param_1);
+        fg_rgba.x = _1653.x;
+        fg_rgba.y = _1653.y;
+        fg_rgba.z = _1653.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -488,10 +485,7 @@
 
 float3 tosRGB(float3 rgb)
 {
-    bool3 cutoff = bool3(rgb.x >= 0.003130800090730190277099609375f.xxx.x, rgb.y >= 0.003130800090730190277099609375f.xxx.y, rgb.z >= 0.003130800090730190277099609375f.xxx.z);
-    float3 below = 12.9200000762939453125f.xxx * rgb;
-    float3 above = (1.05499994754791259765625f.xxx * pow(rgb, 0.416660010814666748046875f.xxx)) - 0.054999999701976776123046875f.xxx;
-    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+    return rgb;
 }
 
 uint packsRGB(inout float4 rgba)
@@ -514,9 +508,9 @@
 
 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _739 = { ref.offset + 4u };
+    CmdEndClipRef _745 = { ref.offset + 4u };
     Alloc param = a;
-    CmdEndClipRef param_1 = _739;
+    CmdEndClipRef param_1 = _745;
     return CmdEndClip_read(param, param_1);
 }
 
@@ -529,7 +523,10 @@
 {
     float3 param = cb;
     float3 param_1 = (cs * 2.0f) - 1.0f.xxx;
-    return lerp(screen(param, param_1), (cb * 2.0f) * cs, float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+    float3 _889 = screen(param, param_1);
+    float3 _893 = (cb * 2.0f) * cs;
+    bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+    return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? _893.z : _889.z);
 }
 
 float color_dodge(float cb, float cs)
@@ -572,8 +569,14 @@
 
 float3 soft_light(float3 cb, float3 cs)
 {
-    float3 d = lerp(sqrt(cb), ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb, float3(bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z)));
-    return lerp(cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)), cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)), float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+    float3 _904 = sqrt(cb);
+    float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb;
+    bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z);
+    float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z);
+    float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb));
+    float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb));
+    bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+    return float3(_944.x ? _942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? _942.z : _932.z);
 }
 
 float sat(float3 c)
@@ -706,8 +709,8 @@
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _1046 = clip_color(param_1);
-    return _1046;
+    float3 _1048 = clip_color(param_1);
+    return _1048;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -795,9 +798,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1337 = set_sat(param_21, param_22);
+            float3 _1340 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1337;
+            float3 param_24 = _1340;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -807,9 +810,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1351 = set_sat(param_27, param_28);
+            float3 _1354 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1351;
+            float3 param_30 = _1354;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -919,18 +922,50 @@
         }
         case 13u:
         {
-            return float4(max(0.0f.xxxx, ((1.0f.xxxx - (float4(cs, as) * as)) + 1.0f.xxxx) - (float4(cb, ab) * ab)).xyz, max(0.0f, ((1.0f - as) + 1.0f) - ab));
-        }
-        case 14u:
-        {
-            return float4(min(1.0f.xxxx, (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, min(1.0f, as + ab));
+            return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
         default:
         {
             break;
         }
     }
-    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+    float as_fa = as * fa;
+    float ab_fb = ab * fb;
+    float3 co = (cs * as_fa) + (cb * ab_fb);
+    return float4(co, as_fa + ab_fb);
+}
+
+float4 mix_blend_compose(float4 backdrop, float4 src, uint mode)
+{
+    if ((mode & 32767u) == 3u)
+    {
+        return (backdrop * (1.0f - src.w)) + src;
+    }
+    float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f);
+    float3 cs = src.xyz * inv_src_a;
+    float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f);
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+    uint blend_mode = mode >> uint(8);
+    float3 param = cb;
+    float3 param_1 = cs;
+    uint param_2 = blend_mode;
+    float3 blended = mix_blend(param, param_1, param_2);
+    cs = lerp(cs, blended, backdrop.w.xxx);
+    uint comp_mode = mode & 255u;
+    if (comp_mode == 3u)
+    {
+        float3 co = lerp(backdrop.xyz, cs, src.w.xxx);
+        return float4(co, src.w + (backdrop.w * (1.0f - src.w)));
+    }
+    else
+    {
+        float3 param_3 = cb;
+        float3 param_4 = cs;
+        float param_5 = backdrop.w;
+        float param_6 = src.w;
+        uint param_7 = comp_mode;
+        return mix_compose(param_3, param_4, param_5, param_6, param_7);
+    }
 }
 
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
@@ -946,24 +981,24 @@
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _749 = { ref.offset + 4u };
+    CmdJumpRef _755 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _749;
+    CmdJumpRef param_1 = _755;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1681;
-    _1681.offset = _1666.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1696;
+    _1696.offset = _1681.Load(24);
     Alloc param;
-    param.offset = _1681.offset;
+    param.offset = _1696.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1690 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1690;
+    CmdRef _1705 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1705;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -972,7 +1007,7 @@
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _291.Load(4) == 0u;
+    bool mem_ok = _297.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -997,8 +1032,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1784 = { stroke.tile_ref };
-                tile_seg_ref = _1784;
+                TileSegRef _1800 = { stroke.tile_ref };
+                tile_seg_ref = _1800;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1034,8 +1069,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1904 = { fill.tile_ref };
-                tile_seg_ref = _1904;
+                TileSegRef _1920 = { fill.tile_ref };
+                tile_seg_ref = _1920;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1124,10 +1159,10 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2238 = fromsRGB(param_29);
-                    fg_rgba.x = _2238.x;
-                    fg_rgba.y = _2238.y;
-                    fg_rgba.z = _2238.z;
+                    float3 _2254 = fromsRGB(param_29);
+                    fg_rgba.x = _2254.x;
+                    fg_rgba.y = _2254.y;
+                    fg_rgba.z = _2254.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1150,10 +1185,10 @@
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2348 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2348.x;
-                    fg_rgba_1.y = _2348.y;
-                    fg_rgba_1.z = _2348.z;
+                    float3 _2364 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2364.x;
+                    fg_rgba_1.y = _2364.y;
+                    fg_rgba_1.z = _2364.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1167,9 +1202,9 @@
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2391[8];
-                fillImage(_2391, param_36, param_37);
-                float4 img[8] = _2391;
+                float4 _2407[8];
+                fillImage(_2407, param_36, param_37);
+                float4 img[8] = _2407;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1184,8 +1219,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2454 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2454;
+                    uint _2470 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2470;
                     rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
@@ -1206,32 +1241,20 @@
                     uint param_41 = blend_stack[d_3][k_13];
                     float4 bg = unpacksRGB(param_41);
                     float4 fg_1 = rgba[k_13] * area[k_13];
-                    float3 param_42 = bg.xyz;
-                    float3 param_43 = fg_1.xyz;
-                    uint param_44 = blend_mode;
-                    float3 blend = mix_blend(param_42, param_43, param_44);
-                    float4 _2521 = fg_1;
-                    float _2525 = fg_1.w;
-                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2532.x;
-                    fg_1.y = _2532.y;
-                    fg_1.z = _2532.z;
-                    float3 param_45 = bg.xyz;
-                    float3 param_46 = fg_1.xyz;
-                    float param_47 = bg.w;
-                    float param_48 = fg_1.w;
-                    uint param_49 = comp_mode;
-                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
+                    float4 param_42 = bg;
+                    float4 param_43 = fg_1;
+                    uint param_44 = end_clip.blend;
+                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_50 = cmd_alloc;
-                CmdRef param_51 = cmd_ref;
-                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
-                cmd_ref = _2569;
+                Alloc param_45 = cmd_alloc;
+                CmdRef param_46 = cmd_ref;
+                CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref };
+                cmd_ref = _2548;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1239,8 +1262,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_52 = i_1;
-        image[int2(xy_uint + chunk_offset(param_52))] = rgba[i_1].w.x;
+        uint param_47 = i_1;
+        image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 6402c6f..8c608c3 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -237,7 +237,7 @@
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -245,29 +245,29 @@
     {
         return 0u;
     }
-    uint v = v_291.memory[offset];
+    uint v = v_297.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_291);
+    uint tag_and_flags = read_mem(param, param_1, v_297);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -275,11 +275,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_291);
+    return CmdStroke_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -291,27 +291,27 @@
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_291);
+    uint raw4 = read_mem(param_8, param_9, v_297);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_291);
+    uint raw5 = read_mem(param_10, param_11, v_297);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -327,15 +327,15 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -343,60 +343,57 @@
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_291);
+    return CmdFill_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_291);
+    return CmdAlpha_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_291);
+    return CmdColor_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
 float3 fromsRGB(thread const float3& srgb)
 {
-    bool3 cutoff = srgb >= float3(0.040449999272823333740234375);
-    float3 below = srgb / float3(12.9200000762939453125);
-    float3 above = pow((srgb + float3(0.054999999701976776123046875)) / float3(1.05499994754791259765625), float3(2.400000095367431640625));
-    return select(below, above, cutoff);
+    return srgb;
 }
 
 static inline __attribute__((always_inline))
@@ -408,21 +405,21 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -432,50 +429,50 @@
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_291);
+    return CmdLinGrad_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_291);
+    uint raw4 = read_mem(param_8, param_9, v_297);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_291);
+    uint raw5 = read_mem(param_10, param_11, v_297);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_291);
+    uint raw6 = read_mem(param_12, param_13, v_297);
     Alloc param_14 = a;
     uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_291);
+    uint raw7 = read_mem(param_14, param_15, v_297);
     Alloc param_16 = a;
     uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_291);
+    uint raw8 = read_mem(param_16, param_17, v_297);
     Alloc param_18 = a;
     uint param_19 = ix + 9u;
-    uint raw9 = read_mem(param_18, param_19, v_291);
+    uint raw9 = read_mem(param_18, param_19, v_297);
     Alloc param_20 = a;
     uint param_21 = ix + 10u;
-    uint raw10 = read_mem(param_20, param_21, v_291);
+    uint raw10 = read_mem(param_20, param_21, v_297);
     CmdRadGrad s;
     s.index = raw0;
     s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
@@ -487,23 +484,23 @@
 }
 
 static inline __attribute__((always_inline))
-CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
-    return CmdRadGrad_read(param, param_1, v_291);
+    return CmdRadGrad_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -511,11 +508,11 @@
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_291);
+    return CmdImage_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -528,10 +525,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1638 = fromsRGB(param_1);
-        fg_rgba.x = _1638.x;
-        fg_rgba.y = _1638.y;
-        fg_rgba.z = _1638.z;
+        float3 _1653 = fromsRGB(param_1);
+        fg_rgba.x = _1653.x;
+        fg_rgba.y = _1653.y;
+        fg_rgba.z = _1653.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -540,10 +537,7 @@
 static inline __attribute__((always_inline))
 float3 tosRGB(thread const float3& rgb)
 {
-    bool3 cutoff = rgb >= float3(0.003130800090730190277099609375);
-    float3 below = float3(12.9200000762939453125) * rgb;
-    float3 above = (float3(1.05499994754791259765625) * pow(rgb, float3(0.416660010814666748046875))) - float3(0.054999999701976776123046875);
-    return select(below, above, cutoff);
+    return rgb;
 }
 
 static inline __attribute__((always_inline))
@@ -555,23 +549,23 @@
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdEndClip s;
     s.blend = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_291);
+    return CmdEndClip_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -585,7 +579,7 @@
 {
     float3 param = cb;
     float3 param_1 = (cs * 2.0) - float3(1.0);
-    return mix(screen(param, param_1), (cb * 2.0) * cs, float3(cs <= float3(0.5)));
+    return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5));
 }
 
 static inline __attribute__((always_inline))
@@ -631,8 +625,8 @@
 static inline __attribute__((always_inline))
 float3 soft_light(thread const float3& cb, thread const float3& cs)
 {
-    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, float3(cb <= float3(0.25)));
-    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), float3(cs <= float3(0.5)));
+    float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25));
+    return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5));
 }
 
 static inline __attribute__((always_inline))
@@ -771,8 +765,8 @@
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _1046 = clip_color(param_1);
-    return _1046;
+    float3 _1048 = clip_color(param_1);
+    return _1048;
 }
 
 static inline __attribute__((always_inline))
@@ -861,9 +855,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1337 = set_sat(param_21, param_22);
+            float3 _1340 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1337;
+            float3 param_24 = _1340;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -873,9 +867,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1351 = set_sat(param_27, param_28);
+            float3 _1354 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1351;
+            float3 param_30 = _1354;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -986,45 +980,78 @@
         }
         case 13u:
         {
-            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
-        }
-        case 14u:
-        {
-            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+            return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
         default:
         {
             break;
         }
     }
-    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+    float as_fa = as * fa;
+    float ab_fb = ab * fb;
+    float3 co = (cs * as_fa) + (cb * ab_fb);
+    return float4(co, as_fa + ab_fb);
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
+float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode)
+{
+    if ((mode & 32767u) == 3u)
+    {
+        return (backdrop * (1.0 - src.w)) + src;
+    }
+    float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15);
+    float3 cs = src.xyz * inv_src_a;
+    float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15);
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+    uint blend_mode = mode >> uint(8);
+    float3 param = cb;
+    float3 param_1 = cs;
+    uint param_2 = blend_mode;
+    float3 blended = mix_blend(param, param_1, param_2);
+    cs = mix(cs, blended, float3(backdrop.w));
+    uint comp_mode = mode & 255u;
+    if (comp_mode == 3u)
+    {
+        float3 co = mix(backdrop.xyz, cs, float3(src.w));
+        return float4(co, src.w + (backdrop.w * (1.0 - src.w)));
+    }
+    else
+    {
+        float3 param_3 = cb;
+        float3 param_4 = cs;
+        float param_5 = backdrop.w;
+        float param_6 = src.w;
+        uint param_7 = comp_mode;
+        return mix_compose(param_3, param_4, param_5, param_6, param_7);
+    }
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_291);
+    return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1666.conf.ptcl_alloc.offset;
+    param.offset = _1681.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -1037,7 +1064,7 @@
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_291.mem_error == 0u;
+    bool mem_ok = v_297.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -1046,7 +1073,7 @@
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_297).tag;
         if (tag == 0u)
         {
             break;
@@ -1057,7 +1084,7 @@
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -1070,7 +1097,7 @@
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_297);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -1093,7 +1120,7 @@
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -1106,7 +1133,7 @@
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -1150,7 +1177,7 @@
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -1162,7 +1189,7 @@
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_297);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -1177,7 +1204,7 @@
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -1187,10 +1214,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2238 = fromsRGB(param_29);
-                    fg_rgba.x = _2238.x;
-                    fg_rgba.y = _2238.y;
-                    fg_rgba.z = _2238.z;
+                    float3 _2254 = fromsRGB(param_29);
+                    fg_rgba.x = _2254.x;
+                    fg_rgba.y = _2254.y;
+                    fg_rgba.z = _2254.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1201,7 +1228,7 @@
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
                     uint param_32 = k_10;
@@ -1213,10 +1240,10 @@
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2348 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2348.x;
-                    fg_rgba_1.y = _2348.y;
-                    fg_rgba_1.z = _2348.z;
+                    float3 _2364 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2364.x;
+                    fg_rgba_1.y = _2364.y;
+                    fg_rgba_1.z = _2364.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1227,7 +1254,7 @@
             {
                 Alloc param_34 = cmd_alloc;
                 CmdRef param_35 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
                 spvUnsafeArray<float4, 8> img;
@@ -1246,8 +1273,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2454 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2454;
+                    uint _2470 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2470;
                     rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
@@ -1258,7 +1285,7 @@
             {
                 Alloc param_39 = cmd_alloc;
                 CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
@@ -1268,31 +1295,19 @@
                     uint param_41 = blend_stack[d_3][k_13];
                     float4 bg = unpacksRGB(param_41);
                     float4 fg_1 = rgba[k_13] * area[k_13];
-                    float3 param_42 = bg.xyz;
-                    float3 param_43 = fg_1.xyz;
-                    uint param_44 = blend_mode;
-                    float3 blend = mix_blend(param_42, param_43, param_44);
-                    float4 _2521 = fg_1;
-                    float _2525 = fg_1.w;
-                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
-                    fg_1.x = _2532.x;
-                    fg_1.y = _2532.y;
-                    fg_1.z = _2532.z;
-                    float3 param_45 = bg.xyz;
-                    float3 param_46 = fg_1.xyz;
-                    float param_47 = bg.w;
-                    float param_48 = fg_1.w;
-                    uint param_49 = comp_mode;
-                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
+                    float4 param_42 = bg;
+                    float4 param_43 = fg_1;
+                    uint param_44 = end_clip.blend;
+                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_50 = cmd_alloc;
-                CmdRef param_51 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
+                Alloc param_45 = cmd_alloc;
+                CmdRef param_46 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1300,8 +1315,8 @@
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_52 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
+        uint param_47 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 4633401..6ff1791 100644
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil
index 9fd593c..b6c9398 100644
--- a/piet-gpu/shader/gen/path_coarse.dxil
+++ b/piet-gpu/shader/gen/path_coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 6130712..7ce4684 100644
--- a/piet-gpu/shader/gen/pathseg.dxil
+++ b/piet-gpu/shader/gen/pathseg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index 4c2bd23..ff544b8 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.dxil
+++ b/piet-gpu/shader/gen/pathtag_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil
index 77f12e6..48584bd 100644
--- a/piet-gpu/shader/gen/pathtag_root.dxil
+++ b/piet-gpu/shader/gen/pathtag_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index f9f31e6..0c1e376 100644
--- a/piet-gpu/shader/gen/transform_leaf.dxil
+++ b/piet-gpu/shader/gen/transform_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index 978dd98..fc3a311 100644
--- a/piet-gpu/shader/gen/transform_reduce.dxil
+++ b/piet-gpu/shader/gen/transform_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil
index 5b4f059..a33ff7f 100644
--- a/piet-gpu/shader/gen/transform_root.dxil
+++ b/piet-gpu/shader/gen/transform_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index c49e2fa..99fd22e 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -9,6 +9,11 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
+// We can do rendering either in sRGB colorspace (for compatibility)
+// or in a linear colorspace, with conversions to sRGB (which will give
+// higher quality antialiasing among other things).
+#define DO_SRGB_CONVERSION 0
+
 #include "mem.h"
 #include "setup.h"
 
@@ -39,18 +44,26 @@
 
 #define MAX_BLEND_STACK 128
 mediump vec3 tosRGB(mediump vec3 rgb) {
+#if DO_SRGB_CONVERSION
     bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308));
     mediump vec3 below = vec3(12.92) * rgb;
     mediump vec3 above = vec3(1.055) * pow(rgb, vec3(0.41666)) - vec3(0.055);
     return mix(below, above, cutoff);
+#else
+    return rgb;
+#endif
 }
 
 mediump vec3 fromsRGB(mediump vec3 srgb) {
+#if DO_SRGB_CONVERSION
     // Formula from EXT_sRGB.
     bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045));
     mediump vec3 below = srgb / vec3(12.92);
     mediump vec3 above = pow((srgb + vec3(0.055)) / vec3(1.055), vec3(2.4));
     return mix(below, above, cutoff);
+#else
+    return srgb;
+#endif
 }
 
 // unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color
@@ -242,10 +255,7 @@
                 uint d = min(clip_depth, MAX_BLEND_STACK - 1);
                 mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
                 mediump vec4 fg = rgba[k] * area[k];
-                vec3 blend = mix_blend(bg.rgb, fg.rgb, blend_mode);
-                // Apply the blend color only where the foreground and background overlap.
-                fg.rgb = mix(fg.rgb, blend, float((fg.a * bg.a) > 0.0));
-                rgba[k] = mix_compose(bg.rgb, fg.rgb, bg.a, fg.a, comp_mode);
+                rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
             }
             cmd_ref.offset += 4 + CmdEndClip_size;
             break;
diff --git a/piet-gpu/src/blend.rs b/piet-gpu/src/blend.rs
index aacf597..f0ca002 100644
--- a/piet-gpu/src/blend.rs
+++ b/piet-gpu/src/blend.rs
@@ -33,6 +33,8 @@
     Saturation = 13,
     Color = 14,
     Luminosity = 15,
+    // Clip is the same as normal, but doesn't always push a blend group.
+    Clip = 128,
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
@@ -51,8 +53,7 @@
     DestAtop = 10,
     Xor = 11,
     Plus = 12,
-    PlusDarker = 13,
-    PlusLighter = 14,
+    PlusLighter = 13,
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
@@ -77,7 +78,7 @@
 impl Default for Blend {
     fn default() -> Self {
         Self {
-            mode: BlendMode::Normal,
+            mode: BlendMode::Clip,
             composition_mode: CompositionMode::SrcOver,
         }
     }
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index a24ddbc..d0ef1eb 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -306,16 +306,21 @@
         self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
-
     /// Encode a fill radial gradient draw object.
     ///
     /// This should be encoded after a path.
     pub fn fill_rad_gradient(&mut self, index: u32, p0: [f32; 2], p1: [f32; 2], r0: f32, r1: f32) {
         self.drawtag_stream.push(DRAWTAG_FILLRADGRADIENT);
-        let element = FillRadGradient { index, p0, p1, r0, r1 };
+        let element = FillRadGradient {
+            index,
+            p0,
+            p1,
+            r0,
+            r1,
+        };
         self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
-    
+
     /// Start a clip.
     pub fn begin_clip(&mut self, blend: Option<Blend>) {
         self.drawtag_stream.push(DRAWTAG_BEGINCLIP);
diff --git a/piet-gpu/src/gradient.rs b/piet-gpu/src/gradient.rs
index e655908..443eaec 100644
--- a/piet-gpu/src/gradient.rs
+++ b/piet-gpu/src/gradient.rs
@@ -19,7 +19,7 @@
 use std::collections::hash_map::{Entry, HashMap};
 
 use piet::kurbo::Point;
-use piet::{Color, FixedLinearGradient, GradientStop, FixedRadialGradient};
+use piet::{Color, FixedLinearGradient, FixedRadialGradient, GradientStop};
 
 /// Radial gradient compatible with COLRv1 spec
 #[derive(Debug, Clone)]
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index f5e91d5..c75f41f 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -13,8 +13,8 @@
 
 pub use blend::{Blend, BlendMode, CompositionMode};
 pub use encoder::EncodedSceneRef;
-pub use render_ctx::PietGpuRenderContext;
 pub use gradient::Colrv1RadialGradient;
+pub use render_ctx::PietGpuRenderContext;
 
 use piet::kurbo::Vec2;
 use piet::{ImageFormat, RenderContext};
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index dca03eb..14f2561 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -1,9 +1,12 @@
+// This should match the value in kernel4.comp for correct rendering.
+const DO_SRGB_CONVERSION: bool = false;
+
 use std::borrow::Cow;
 
 use crate::encoder::GlyphEncoder;
 use crate::stages::{Config, Transform};
 use crate::MAX_BLEND_STACK;
-use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
+use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
 use piet::{
     Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
     StrokeStyle,
@@ -13,7 +16,7 @@
 use piet_gpu_types::encoder::{Encode, Encoder};
 use piet_gpu_types::scene::Element;
 
-use crate::gradient::{LinearGradient, RadialGradient, RampCache, Colrv1RadialGradient};
+use crate::gradient::{Colrv1RadialGradient, LinearGradient, RadialGradient, RampCache};
 use crate::text::Font;
 pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
 use crate::Blend;
@@ -471,19 +474,27 @@
 }
 
 fn to_srgb(f: f64) -> f64 {
-    if f <= 0.0031308 {
-        f * 12.92
+    if DO_SRGB_CONVERSION {
+        if f <= 0.0031308 {
+            f * 12.92
+        } else {
+            let a = 0.055;
+            (1. + a) * f64::powf(f, f64::recip(2.4)) - a
+        }
     } else {
-        let a = 0.055;
-        (1. + a) * f64::powf(f, f64::recip(2.4)) - a
+        f
     }
 }
 
 fn from_srgb(f: f64) -> f64 {
-    if f <= 0.04045 {
-        f / 12.92
+    if DO_SRGB_CONVERSION {
+        if f <= 0.04045 {
+            f / 12.92
+        } else {
+            let a = 0.055;
+            f64::powf((f + a) * f64::recip(1. + a), 2.4)
+        }
     } else {
-        let a = 0.055;
-        f64::powf((f + a) * f64::recip(1. + a), 2.4)
+        f
     }
 }
diff --git a/piet-gpu/src/stages/clip.rs b/piet-gpu/src/stages/clip.rs
index 2fd195b..b7b77eb 100644
--- a/piet-gpu/src/stages/clip.rs
+++ b/piet-gpu/src/stages/clip.rs
@@ -16,7 +16,9 @@
 
 //! The clip processing stage (includes substages).
 
-use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
+use piet_gpu_hal::{
+    include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session,
+};
 
 // Note that this isn't the code/stage/binding pattern of most of the other stages
 // in the new element processing pipeline. We want to move those temporary buffers
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index bfd2af2..e3aeaba 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -2,10 +2,10 @@
 
 use rand::{Rng, RngCore};
 
-use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext, Colrv1RadialGradient};
+use crate::{Blend, BlendMode, Colrv1RadialGradient, CompositionMode, PietGpuRenderContext};
 use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
-    Color, FixedGradient, FixedRadialGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
+    Color, GradientStop, LinearGradient, Text, TextAttribute, TextLayoutBuilder, UnitPoint,
 };
 
 use crate::{PicoSvg, RenderContext, Vec2};
@@ -200,6 +200,113 @@
     println!("flattening and encoding time: {:?}", start.elapsed());
 }
 
+pub fn render_blend_square(rc: &mut PietGpuRenderContext, blend: Blend) {
+    // Inspired by https://developer.mozilla.org/en-US/docs/Web/CSS/mix-blend-mode
+    let rect = Rect::new(0., 0., 200., 200.);
+    let stops = vec![
+        GradientStop {
+            color: Color::BLACK,
+            pos: 0.0,
+        },
+        GradientStop {
+            color: Color::WHITE,
+            pos: 1.0,
+        },
+    ];
+    let linear = LinearGradient::new(UnitPoint::LEFT, UnitPoint::RIGHT, stops);
+    rc.fill(rect, &linear);
+    const GRADIENTS: &[(f64, f64, Color)] = &[
+        (150., 0., Color::rgb8(255, 240, 64)),
+        (175., 100., Color::rgb8(255, 96, 240)),
+        (125., 200., Color::rgb8(64, 192, 255)),
+    ];
+    for (x, y, c) in GRADIENTS {
+        let stops = vec![
+            GradientStop {
+                color: c.clone(),
+                pos: 0.0,
+            },
+            GradientStop {
+                color: Color::rgba8(0, 0, 0, 0),
+                pos: 1.0,
+            },
+        ];
+        let rad = Colrv1RadialGradient {
+            center0: Point::new(*x, *y),
+            center1: Point::new(*x, *y),
+            radius0: 0.0,
+            radius1: 100.0,
+            stops,
+        };
+        let brush = rc.radial_gradient_colrv1(&rad);
+        rc.fill(Rect::new(0., 0., 200., 200.), &brush);
+    }
+    const COLORS: &[Color] = &[
+        Color::rgb8(255, 0, 0),
+        Color::rgb8(0, 255, 0),
+        Color::rgb8(0, 0, 255),
+    ];
+    let _ = rc.with_save(|rc| {
+        // Isolation (this can be removed for non-isolated version)
+        rc.blend(rect, BlendMode::Normal.into());
+        for (i, c) in COLORS.iter().enumerate() {
+            let stops = vec![
+                GradientStop {
+                    color: Color::WHITE,
+                    pos: 0.0,
+                },
+                GradientStop {
+                    color: c.clone(),
+                    pos: 1.0,
+                },
+            ];
+            // squash the ellipse
+            let a = Affine::translate((100., 100.))
+                * Affine::rotate(std::f64::consts::FRAC_PI_3 * (i * 2 + 1) as f64)
+                * Affine::scale_non_uniform(1.0, 0.357)
+                * Affine::translate((-100., -100.));
+            let linear = LinearGradient::new(UnitPoint::TOP, UnitPoint::BOTTOM, stops);
+            let _ = rc.with_save(|rc| {
+                rc.blend(rect, blend);
+                rc.transform(a);
+                rc.fill(Circle::new((100., 100.), 90.), &linear);
+                Ok(())
+            });
+        }
+        Ok(())
+    });
+}
+
+pub fn render_blend_grid(rc: &mut PietGpuRenderContext) {
+    const BLEND_MODES: &[BlendMode] = &[
+        BlendMode::Normal,
+        BlendMode::Multiply,
+        BlendMode::Darken,
+        BlendMode::Screen,
+        BlendMode::Lighten,
+        BlendMode::Overlay,
+        BlendMode::ColorDodge,
+        BlendMode::ColorBurn,
+        BlendMode::HardLight,
+        BlendMode::SoftLight,
+        BlendMode::Difference,
+        BlendMode::Exclusion,
+        BlendMode::Hue,
+        BlendMode::Saturation,
+        BlendMode::Color,
+        BlendMode::Luminosity,
+    ];
+    for (ix, &blend) in BLEND_MODES.iter().enumerate() {
+        let _ = rc.with_save(|rc| {
+            let i = ix % 4;
+            let j = ix / 4;
+            rc.transform(Affine::translate((i as f64 * 225., j as f64 * 225.)));
+            render_blend_square(rc, blend.into());
+            Ok(())
+        });
+    }
+}
+
 pub fn render_anim_frame(rc: &mut impl RenderContext, i: usize) {
     rc.fill(
         Rect::new(0.0, 0.0, 1000.0, 1000.0),
diff --git a/piet-scene/Cargo.toml b/piet-scene/Cargo.toml
index df66483..e80cbb8 100644
--- a/piet-scene/Cargo.toml
+++ b/piet-scene/Cargo.toml
@@ -7,5 +7,5 @@
 [dependencies]
 bytemuck = { version = "1.7.2", features = ["derive"] }
 smallvec = "1.8.0"
-pinot = "0.1.5"
-moscato = "0.1.2"
+moscato = { git = "https://github.com/dfrg/pinot" }
+kurbo = { version = "0.8.3", optional = true }
diff --git a/piet-scene/src/glyph/mod.rs b/piet-scene/src/glyph/mod.rs
index 3bfa36c..f6ebf14 100644
--- a/piet-scene/src/glyph/mod.rs
+++ b/piet-scene/src/glyph/mod.rs
@@ -14,7 +14,7 @@
 //
 // Also licensed under MIT license, at your choice.
 
-pub use pinot;
+pub use moscato::pinot;
 
 use crate::brush::{Brush, Color};
 use crate::geometry::Affine;
@@ -114,7 +114,9 @@
                     };
                     xform_stack.push(xform);
                 }
-                Command::PopTransform => { xform_stack.pop(); },
+                Command::PopTransform => {
+                    xform_stack.pop();
+                }
                 Command::PushClip(path_index) => {
                     let path = glyph.path(*path_index)?;
                     if let Some(xform) = xform_stack.last() {
diff --git a/piet-scene/src/lib.rs b/piet-scene/src/lib.rs
index a72ff54..8f436b2 100644
--- a/piet-scene/src/lib.rs
+++ b/piet-scene/src/lib.rs
@@ -20,3 +20,103 @@
 pub mod path;
 pub mod resource;
 pub mod scene;
+
+/// Implement conversions to and from Kurbo types when the `kurbo` feature is
+/// enabled.
+#[cfg(feature = "kurbo")]
+mod kurbo_conv {
+    use super::geometry::{Affine, Point, Rect};
+    use super::path::Element;
+
+    impl Point {
+        /// Creates a new point from the equivalent kurbo type.
+        pub fn from_kurbo(point: kurbo::Point) -> Self {
+            Self::new(point.x as f32, point.y as f32)
+        }
+    }
+
+    impl From<Point> for kurbo::Point {
+        fn from(p: Point) -> kurbo::Point {
+            Self::new(p.x as f64, p.y as f64)
+        }
+    }
+
+    impl Affine {
+        /// Creates a new affine transformation from the equivalent kurbo type.
+        pub fn from_kurbo(affine: kurbo::Affine) -> Self {
+            let c = affine.as_coeffs();
+            Self {
+                xx: c[0] as f32,
+                yx: c[1] as f32,
+                xy: c[2] as f32,
+                yy: c[3] as f32,
+                dx: c[4] as f32,
+                dy: c[5] as f32,
+            }
+        }
+    }
+
+    impl From<Affine> for kurbo::Affine {
+        fn from(a: Affine) -> Self {
+            Self::new([
+                a.xx as f64, // coefficient order mirrors `Affine::from_kurbo`:
+                a.yx as f64, // [xx, yx, xy, yy, dx, dy]
+                a.xy as f64, // third coefficient is `xy`, not a second copy of `yx`
+                a.yy as f64,
+                a.dx as f64,
+                a.dy as f64,
+            ])
+        }
+    }
+
+    impl Rect {
+        /// Creates a new rectangle from the equivalent kurbo type.
+        pub fn from_kurbo(rect: kurbo::Rect) -> Self {
+            Self {
+                min: Point::new(rect.x0 as f32, rect.y0 as f32),
+                max: Point::new(rect.x1 as f32, rect.y1 as f32),
+            }
+        }
+    }
+
+    impl From<Rect> for kurbo::Rect {
+        fn from(r: Rect) -> Self {
+            Self {
+                x0: r.min.x as f64,
+                y0: r.min.y as f64,
+                x1: r.max.x as f64,
+                y1: r.max.y as f64,
+            }
+        }
+    }
+
+    impl Element {
+        /// Creates a new path element from the equivalent kurbo type.
+        pub fn from_kurbo(el: kurbo::PathEl) -> Self {
+            use kurbo::PathEl::*;
+            let from_kurbo = Point::from_kurbo; // associated fns cannot be imported with `use`
+            match el {
+                MoveTo(p0) => Self::MoveTo(from_kurbo(p0)),
+                LineTo(p0) => Self::LineTo(from_kurbo(p0)),
+                QuadTo(p0, p1) => Self::QuadTo(from_kurbo(p0), from_kurbo(p1)),
+                CurveTo(p0, p1, p2) => {
+                    Self::CurveTo(from_kurbo(p0), from_kurbo(p1), from_kurbo(p2))
+                }
+                ClosePath => Self::Close,
+            }
+        }
+    }
+
+    impl From<Element> for kurbo::PathEl {
+        fn from(e: Element) -> Self {
+            use Element::*;
+            match e {
+                MoveTo(p0) => Self::MoveTo(p0.into()),
+                LineTo(p0) => Self::LineTo(p0.into()),
+                QuadTo(p0, p1) => Self::QuadTo(p0.into(), p1.into()),
+                CurveTo(p0, p1, p2) => Self::CurveTo(p0.into(), p1.into(), p2.into()),
+                Close => Self::ClosePath,
+            }
+        }
+    }
+}
diff --git a/piet-scene/src/scene/blend.rs b/piet-scene/src/scene/blend.rs
index 7edc6cd..d6aa080 100644
--- a/piet-scene/src/scene/blend.rs
+++ b/piet-scene/src/scene/blend.rs
@@ -34,6 +34,8 @@
     Saturation = 13,
     Color = 14,
     Luminosity = 15,
+    // Clip is the same as normal, but doesn't always push a blend group.
+    Clip = 128,
 }
 
 /// Defines the layer composition function for a blend operation.
@@ -53,8 +55,7 @@
     DestAtop = 10,
     Xor = 11,
     Plus = 12,
-    PlusDarker = 13,
-    PlusLighter = 14,
+    PlusLighter = 13,
 }
 
 /// Blend mode consisting of mixing and composition functions.
@@ -77,7 +78,7 @@
 impl Default for Blend {
     fn default() -> Self {
         Self {
-            mix: Mix::Normal,
+            mix: Mix::Clip,
             compose: Compose::SrcOver,
         }
     }
diff --git a/piet-scene/src/scene/builder.rs b/piet-scene/src/scene/builder.rs
index 85d75b2..8aa1bf5 100644
--- a/piet-scene/src/scene/builder.rs
+++ b/piet-scene/src/scene/builder.rs
@@ -23,12 +23,14 @@
 
 const MAX_BLEND_STACK: usize = 256;
 
-/// Creates a new builder for constructing a scene.
-pub fn build_scene<'a>(scene: &'a mut Scene, resources: &'a mut ResourceContext) -> Builder<'a> {
-    Builder::new(&mut scene.data, ResourceData::Scene(resources))
+/// Creates a new builder for filling a scene. Any current content in the scene
+/// will be cleared.
+pub fn build_scene<'a>(scene: &'a mut Scene, rcx: &'a mut ResourceContext) -> Builder<'a> {
+    Builder::new(&mut scene.data, ResourceData::Scene(rcx))
 }
 
-/// Creates a new builder for construction a scene fragment.
+/// Creates a new builder for filling a scene fragment. Any current content in
+/// the fragment will be cleared.
 pub fn build_fragment<'a>(fragment: &'a mut Fragment) -> Builder<'a> {
     Builder::new(
         &mut fragment.data,
diff --git a/piet-scene/src/scene/mod.rs b/piet-scene/src/scene/mod.rs
index 9f7be2f..577f81e 100644
--- a/piet-scene/src/scene/mod.rs
+++ b/piet-scene/src/scene/mod.rs
@@ -23,11 +23,12 @@
 pub use style::*;
 
 use super::brush::*;
-use super::geometry::{Affine, Point, Rect};
+use super::geometry::{Affine, Point};
 use super::path::Element;
 
 use core::ops::Range;
 
+/// Raw data streams describing an encoded scene.
 #[derive(Default)]
 pub struct SceneData {
     pub transform_stream: Vec<Affine>,
@@ -83,6 +84,7 @@
 }
 
 impl Scene {
+    /// Returns the raw encoded scene data streams.
     pub fn data(&self) -> &SceneData {
         &self.data
     }
@@ -96,6 +98,8 @@
 }
 
 impl Fragment {
+    /// Returns the underlying stream of points that defined all encoded path
+    /// segments.
     pub fn points(&self) -> &[Point] {
         bytemuck::cast_slice(&self.data.pathseg_stream)
     }