More blend mode fixes

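Adds a test to visualize the blend modes. Fixes bugs in blend.h: the composition mode was extracted with `mode * 0xff` instead of `mode & 0xff`, and the backdrop and source arguments to `mix_blend` were passed in the wrong order. Also fixes a more subtle issue: default blending is not the same as clipping, because the former must always push a blend group (to cause isolation) and the latter does not. Clipping now gets its own `Blend_Clip` mode to tell the two apart. This distinction might be something we need to get back to.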

With these changes the rendering should fairly closely resemble the Mozilla reference image. There's also a compile-time switch to disable sRGB conversion, which is (sadly) needed for compatible rendering.
diff --git a/pgpu-render/src/lib.rs b/pgpu-render/src/lib.rs
index 7d4c60b..50462e7 100644
--- a/pgpu-render/src/lib.rs
+++ b/pgpu-render/src/lib.rs
@@ -215,7 +215,10 @@
 /// Computes the bounding box for the glyph after applying the specified
 /// transform.
 #[no_mangle]
-pub unsafe extern "C" fn pgpu_glyph_bbox(glyph: *const PgpuGlyph, transform: &[f32; 6]) -> PgpuRect {
+pub unsafe extern "C" fn pgpu_glyph_bbox(
+    glyph: *const PgpuGlyph,
+    transform: &[f32; 6],
+) -> PgpuRect {
     let transform = piet_scene::geometry::Affine::new(transform);
     let rect = (*glyph).bbox(Some(transform));
     PgpuRect {
diff --git a/pgpu-render/src/render.rs b/pgpu-render/src/render.rs
index 361ef42..5b5d328 100644
--- a/pgpu-render/src/render.rs
+++ b/pgpu-render/src/render.rs
@@ -16,8 +16,8 @@
 
 use piet_gpu::{EncodedSceneRef, PixelFormat, RenderConfig};
 use piet_gpu_hal::{QueryPool, Session};
-use piet_scene::glyph::pinot::{types::Tag, FontDataRef};
 use piet_scene::geometry::{Affine, Rect};
+use piet_scene::glyph::pinot::{types::Tag, FontDataRef};
 use piet_scene::glyph::{GlyphContext, GlyphProvider};
 use piet_scene::resource::ResourceContext;
 use piet_scene::scene::{Fragment, Scene};
@@ -214,7 +214,12 @@
 impl PgpuGlyph {
     pub fn bbox(&self, transform: Option<Affine>) -> Rect {
         if let Some(transform) = &transform {
-            Rect::from_points(self.fragment.points().iter().map(|p| p.transform(transform)))
+            Rect::from_points(
+                self.fragment
+                    .points()
+                    .iter()
+                    .map(|p| p.transform(transform)),
+            )
         } else {
             Rect::from_points(self.fragment.points())
         }
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index abe6ae1..7d577d3 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -249,7 +249,8 @@
             println!("parsing time: {:?}", start.elapsed());
             test_scenes::render_svg(&mut ctx, &svg);
         } else {
-            test_scenes::render_scene(&mut ctx);
+            //test_scenes::render_scene(&mut ctx);
+            test_scenes::render_blend_grid(&mut ctx);
         }
 
         let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?;
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index 1642026..78867f5 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -125,7 +125,7 @@
                     }
 
                     let mut ctx = PietGpuRenderContext::new();
-                    let test_blend = false;
+                    let test_blend = true;
                     if let Some(svg) = &svg {
                         test_scenes::render_svg(&mut ctx, svg);
                     } else if test_blend {
diff --git a/piet-gpu/shader/blend.h b/piet-gpu/shader/blend.h
index c0ae6af..7366006 100644
--- a/piet-gpu/shader/blend.h
+++ b/piet-gpu/shader/blend.h
@@ -18,6 +18,7 @@
 #define Blend_Saturation 13
 #define Blend_Color 14
 #define Blend_Luminosity 15
+#define Blend_Clip 128
 
 vec3 screen(vec3 cb, vec3 cs) {
 	return cb + cs - (cb * cs);
@@ -45,7 +46,7 @@
 	return mix(
 		screen(cb, 2.0 * cs - 1.0),
 		cb * 2.0 * cs, 
-		vec3(lessThanEqual(cs, vec3(0.5)))
+		lessThanEqual(cs, vec3(0.5))
 	);
 }
 
@@ -53,12 +54,12 @@
 	vec3 d = mix(
 		sqrt(cb),
 		((16.0 * cb - vec3(12.0)) * cb + vec3(4.0)) * cb,
-		vec3(lessThanEqual(cb, vec3(0.25)))
+		lessThanEqual(cb, vec3(0.25))
 	);
 	return mix(
 		cb + (2.0 * cs - vec3(1.0)) * (d - cb),
 		cb - (vec3(1.0) - 2.0 * cs) * cb * (vec3(1.0) - cb),
-		vec3(lessThanEqual(cs, vec3(0.5)))
+		lessThanEqual(cs, vec3(0.5))
 	);
 }
 
@@ -260,6 +261,7 @@
 }
 
 #define BlendComp_default (Blend_Normal << 8 | Comp_SrcOver)
+#define BlendComp_clip (Blend_Clip << 8 | Comp_SrcOver)
 
 // This is added to alpha to prevent divide-by-zero
 #define EPSILON 1e-15
@@ -267,7 +269,8 @@
 // Apply blending and composition. Both input and output colors are
 // premultiplied RGB.
 vec4 mix_blend_compose(vec4 backdrop, vec4 src, uint mode) {
-	if (mode == BlendComp_default) {
+	if ((mode & 0x7fff) == BlendComp_default) {
+		// Both normal+src_over blend and clip case
 		return backdrop * (1.0 - src.a) + src;
 	}
 	// Un-premultiply colors for blending
@@ -276,9 +279,9 @@
 	float inv_backdrop_a = 1.0 / (backdrop.a + EPSILON);
 	vec3 cb = backdrop.rgb * inv_backdrop_a;
 	uint blend_mode = mode >> 8;
-	vec3 blended = mix_blend(cs, cb, blend_mode);
+	vec3 blended = mix_blend(cb, cs, blend_mode);
 	cs = mix(cs, blended, backdrop.a);
-	uint comp_mode = mode * 0xff;
+	uint comp_mode = mode & 0xff;
 	if (comp_mode == Comp_SrcOver) {
 		vec3 co = mix(backdrop.rgb, cs, src.a);
 		return vec4(co, src.a + backdrop.a * (1 - src.a));
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 3abb2e0..1b3f252 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -303,7 +303,7 @@
                     uint scene_offset = memory[drawmonoid_base + 2];
                     uint dd = drawdata_start + (scene_offset >> 2);
                     uint blend = scene[dd];
-                    is_blend = (blend != BlendComp_default);
+                    is_blend = (blend != BlendComp_clip);
                 }
                 include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
                     || is_blend;
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index 910925d..9187e01 100644
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index 04529bb..0331e33 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -919,26 +919,26 @@
                     uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                     uint dd = drawdata_start + (scene_offset >> uint(2));
                     uint blend = _1372.Load(dd * 4 + 0);
-                    is_blend = blend != 3u;
+                    is_blend = blend != 32771u;
                 }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1693 = tile.tile.offset != 0u;
+                bool _1702;
+                if (!_1693)
                 {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1702 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1701 = _1692;
+                    _1702 = _1693;
                 }
-                include_tile = _1701 || is_blend;
+                include_tile = _1702 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1723;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
+                uint _1724;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1724);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -967,9 +967,9 @@
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1801 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1800;
+                TileRef param_28 = _1801;
                 Tile tile_1 = Tile_read(param_27, param_28);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
                 uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@@ -984,11 +984,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1849 = alloc_cmd(param_29, param_30, param_31);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1849)
                         {
                             break;
                         }
@@ -999,10 +999,10 @@
                         write_fill(param_32, param_33, param_34, param_35);
                         cmd_ref = param_33;
                         uint rgba = _1372.Load(dd_1 * 4 + 0);
-                        CmdColor _1871 = { rgba };
+                        CmdColor _1872 = { rgba };
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1871;
+                        CmdColor param_38 = _1872;
                         Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
@@ -1012,11 +1012,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1890 = alloc_cmd(param_39, param_40, param_41);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1890)
                         {
                             break;
                         }
@@ -1043,11 +1043,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1954 = alloc_cmd(param_49, param_50, param_51);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1954)
                         {
                             break;
                         }
@@ -1077,11 +1077,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61);
+                        bool _2060 = alloc_cmd(param_59, param_60, param_61);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2060)
                         {
                             break;
                         }
@@ -1094,27 +1094,27 @@
                         uint index = _1372.Load(dd_1 * 4 + 0);
                         uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2098 = { index, offset_1 };
+                        CmdImage _2099 = { index, offset_1 };
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = _2098;
+                        CmdImage param_68 = _2099;
                         Cmd_Image_write(param_66, param_67, param_68);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2113 = tile_1.tile.offset == 0u;
+                        bool _2119;
+                        if (_2113)
                         {
-                            _2118 = tile_1.backdrop == 0;
+                            _2119 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2118 = _2112;
+                            _2119 = _2113;
                         }
-                        if (_2118)
+                        if (_2119)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1123,11 +1123,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71);
+                            bool _2131 = alloc_cmd(param_69, param_70, param_71);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2131)
                             {
                                 break;
                             }
@@ -1145,11 +1145,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76);
+                        bool _2159 = alloc_cmd(param_74, param_75, param_76);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2159)
                         {
                             break;
                         }
@@ -1160,10 +1160,10 @@
                         write_fill(param_77, param_78, param_79, param_80);
                         cmd_ref = param_78;
                         uint blend_1 = _1372.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2181 = { blend_1 };
+                        CmdEndClip _2182 = { blend_1 };
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = _2181;
+                        CmdEndClip param_83 = _2182;
                         Cmd_EndClip_write(param_81, param_82, param_83);
                         cmd_ref.offset += 8u;
                         break;
@@ -1198,17 +1198,17 @@
             break;
         }
     }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
-    bool _2237;
-    if (_2228)
+    bool _2229 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2238;
+    if (_2229)
     {
-        _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
+        _2238 = (bin_tile_y + tile_y) < _1005.Load(12);
     }
     else
     {
-        _2237 = _2228;
+        _2238 = _2229;
     }
-    if (_2237)
+    if (_2238)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 55812d4..854d243 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -942,25 +942,25 @@
                     uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                     uint dd = drawdata_start + (scene_offset >> uint(2));
                     uint blend = _1372.scene[dd];
-                    is_blend = blend != 3u;
+                    is_blend = blend != 32771u;
                 }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1693 = tile.tile.offset != 0u;
+                bool _1702;
+                if (!_1693)
                 {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1702 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1701 = _1692;
+                    _1702 = _1693;
                 }
-                include_tile = _1701 || is_blend;
+                include_tile = _1702 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1724 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -1005,11 +1005,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
+                        bool _1849 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1849)
                         {
                             break;
                         }
@@ -1032,11 +1032,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
+                        bool _1890 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1890)
                         {
                             break;
                         }
@@ -1063,11 +1063,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
+                        bool _1954 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1954)
                         {
                             break;
                         }
@@ -1097,11 +1097,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        bool _2060 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2060)
                         {
                             break;
                         }
@@ -1123,17 +1123,17 @@
                     }
                     case 5u:
                     {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2113 = tile_1.tile.offset == 0u;
+                        bool _2119;
+                        if (_2113)
                         {
-                            _2118 = tile_1.backdrop == 0;
+                            _2119 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2118 = _2112;
+                            _2119 = _2113;
                         }
-                        if (_2118)
+                        if (_2119)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1142,11 +1142,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            bool _2131 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2131)
                             {
                                 break;
                             }
@@ -1164,11 +1164,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        bool _2159 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2159)
                         {
                             break;
                         }
@@ -1216,17 +1216,17 @@
             break;
         }
     }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
-    bool _2237;
-    if (_2228)
+    bool _2229 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2238;
+    if (_2229)
     {
-        _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
+        _2238 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
     }
     else
     {
-        _2237 = _2228;
+        _2238 = _2229;
     }
-    if (_2237)
+    if (_2238)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index 6d33ee7..56a87e5 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index da6c563..0322bf6 100644
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 5d6f839..4839db2 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -162,7 +162,7 @@
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
 RWByteAddressBuffer _297 : register(u0, space0);
-ByteAddressBuffer _1749 : register(t1, space0);
+ByteAddressBuffer _1681 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@@ -347,10 +347,7 @@
 
 float3 fromsRGB(float3 srgb)
 {
-    bool3 cutoff = bool3(srgb.x >= 0.040449999272823333740234375f.xxx.x, srgb.y >= 0.040449999272823333740234375f.xxx.y, srgb.z >= 0.040449999272823333740234375f.xxx.z);
-    float3 below = srgb / 12.9200000762939453125f.xxx;
-    float3 above = pow((srgb + 0.054999999701976776123046875f.xxx) / 1.05499994754791259765625f.xxx, 2.400000095367431640625f.xxx);
-    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+    return srgb;
 }
 
 float4 unpacksRGB(uint srgba)
@@ -477,10 +474,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1653 = fromsRGB(param_1);
+        fg_rgba.x = _1653.x;
+        fg_rgba.y = _1653.y;
+        fg_rgba.z = _1653.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -488,10 +485,7 @@
 
 float3 tosRGB(float3 rgb)
 {
-    bool3 cutoff = bool3(rgb.x >= 0.003130800090730190277099609375f.xxx.x, rgb.y >= 0.003130800090730190277099609375f.xxx.y, rgb.z >= 0.003130800090730190277099609375f.xxx.z);
-    float3 below = 12.9200000762939453125f.xxx * rgb;
-    float3 above = (1.05499994754791259765625f.xxx * pow(rgb, 0.416660010814666748046875f.xxx)) - 0.054999999701976776123046875f.xxx;
-    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+    return rgb;
 }
 
 uint packsRGB(inout float4 rgba)
@@ -529,7 +523,10 @@
 {
     float3 param = cb;
     float3 param_1 = (cs * 2.0f) - 1.0f.xxx;
-    return lerp(screen(param, param_1), (cb * 2.0f) * cs, float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+    float3 _889 = screen(param, param_1);
+    float3 _893 = (cb * 2.0f) * cs;
+    bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+    return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? _893.z : _889.z);
 }
 
 float color_dodge(float cb, float cs)
@@ -572,8 +569,14 @@
 
 float3 soft_light(float3 cb, float3 cs)
 {
-    float3 d = lerp(sqrt(cb), ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb, float3(bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z)));
-    return lerp(cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)), cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)), float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+    float3 _904 = sqrt(cb);
+    float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb;
+    bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z);
+    float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z);
+    float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb));
+    float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb));
+    bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+    return float3(_944.x ? _942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? _942.z : _932.z);
 }
 
 float sat(float3 c)
@@ -706,8 +709,8 @@
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _1052 = clip_color(param_1);
-    return _1052;
+    float3 _1048 = clip_color(param_1);
+    return _1048;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -795,9 +798,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1343 = set_sat(param_21, param_22);
+            float3 _1340 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1343;
+            float3 param_24 = _1340;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -807,9 +810,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1357 = set_sat(param_27, param_28);
+            float3 _1354 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1357;
+            float3 param_30 = _1354;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -919,12 +922,6 @@
         }
         case 13u:
         {
-            float rev_as = 1.0f - as;
-            float rev_ab = 1.0f - ab;
-            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
-        {
             return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
         default:
@@ -940,7 +937,7 @@
 
 float4 mix_blend_compose(float4 backdrop, float4 src, uint mode)
 {
-    if (mode == 3u)
+    if ((mode & 32767u) == 3u)
     {
         return (backdrop * (1.0f - src.w)) + src;
     }
@@ -949,12 +946,12 @@
     float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f);
     float3 cb = backdrop.xyz * inv_backdrop_a;
     uint blend_mode = mode >> uint(8);
-    float3 param = cs;
-    float3 param_1 = cb;
+    float3 param = cb;
+    float3 param_1 = cs;
     uint param_2 = blend_mode;
     float3 blended = mix_blend(param, param_1, param_2);
     cs = lerp(cs, blended, backdrop.w.xxx);
-    uint comp_mode = mode * 255u;
+    uint comp_mode = mode & 255u;
     if (comp_mode == 3u)
     {
         float3 co = lerp(backdrop.xyz, cs, src.w.xxx);
@@ -992,16 +989,16 @@
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1764;
-    _1764.offset = _1749.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1696;
+    _1696.offset = _1681.Load(24);
     Alloc param;
-    param.offset = _1764.offset;
+    param.offset = _1696.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1773 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1773;
+    CmdRef _1705 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1705;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1035,8 +1032,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1867 = { stroke.tile_ref };
-                tile_seg_ref = _1867;
+                TileSegRef _1800 = { stroke.tile_ref };
+                tile_seg_ref = _1800;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1072,8 +1069,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1987 = { fill.tile_ref };
-                tile_seg_ref = _1987;
+                TileSegRef _1920 = { fill.tile_ref };
+                tile_seg_ref = _1920;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1162,10 +1159,10 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2254 = fromsRGB(param_29);
+                    fg_rgba.x = _2254.x;
+                    fg_rgba.y = _2254.y;
+                    fg_rgba.z = _2254.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1188,10 +1185,10 @@
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2364 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2364.x;
+                    fg_rgba_1.y = _2364.y;
+                    fg_rgba_1.z = _2364.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1205,9 +1202,9 @@
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2474[8];
-                fillImage(_2474, param_36, param_37);
-                float4 img[8] = _2474;
+                float4 _2407[8];
+                fillImage(_2407, param_36, param_37);
+                float4 img[8] = _2407;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1222,8 +1219,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
+                    uint _2470 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2470;
                     rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
@@ -1256,8 +1253,8 @@
             {
                 Alloc param_45 = cmd_alloc;
                 CmdRef param_46 = cmd_ref;
-                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2615;
+                CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref };
+                cmd_ref = _2548;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 796043b..4caeaf0 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -393,10 +393,7 @@
 static inline __attribute__((always_inline))
 float3 fromsRGB(thread const float3& srgb)
 {
-    bool3 cutoff = srgb >= float3(0.040449999272823333740234375);
-    float3 below = srgb / float3(12.9200000762939453125);
-    float3 above = pow((srgb + float3(0.054999999701976776123046875)) / float3(1.05499994754791259765625), float3(2.400000095367431640625));
-    return select(below, above, cutoff);
+    return srgb;
 }
 
 static inline __attribute__((always_inline))
@@ -528,10 +525,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1653 = fromsRGB(param_1);
+        fg_rgba.x = _1653.x;
+        fg_rgba.y = _1653.y;
+        fg_rgba.z = _1653.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -540,10 +537,7 @@
 static inline __attribute__((always_inline))
 float3 tosRGB(thread const float3& rgb)
 {
-    bool3 cutoff = rgb >= float3(0.003130800090730190277099609375);
-    float3 below = float3(12.9200000762939453125) * rgb;
-    float3 above = (float3(1.05499994754791259765625) * pow(rgb, float3(0.416660010814666748046875))) - float3(0.054999999701976776123046875);
-    return select(below, above, cutoff);
+    return rgb;
 }
 
 static inline __attribute__((always_inline))
@@ -585,7 +579,7 @@
 {
     float3 param = cb;
     float3 param_1 = (cs * 2.0) - float3(1.0);
-    return mix(screen(param, param_1), (cb * 2.0) * cs, float3(cs <= float3(0.5)));
+    return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5));
 }
 
 static inline __attribute__((always_inline))
@@ -631,8 +625,8 @@
 static inline __attribute__((always_inline))
 float3 soft_light(thread const float3& cb, thread const float3& cs)
 {
-    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, float3(cb <= float3(0.25)));
-    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), float3(cs <= float3(0.5)));
+    float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25));
+    return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5));
 }
 
 static inline __attribute__((always_inline))
@@ -771,8 +765,8 @@
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _1052 = clip_color(param_1);
-    return _1052;
+    float3 _1048 = clip_color(param_1);
+    return _1048;
 }
 
 static inline __attribute__((always_inline))
@@ -861,9 +855,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1343 = set_sat(param_21, param_22);
+            float3 _1340 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1343;
+            float3 param_24 = _1340;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -873,9 +867,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1357 = set_sat(param_27, param_28);
+            float3 _1354 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1357;
+            float3 param_30 = _1354;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -986,12 +980,6 @@
         }
         case 13u:
         {
-            float rev_as = 1.0 - as;
-            float rev_ab = 1.0 - ab;
-            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
-        {
             return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
         default:
@@ -1008,7 +996,7 @@
 static inline __attribute__((always_inline))
 float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode)
 {
-    if (mode == 3u)
+    if ((mode & 32767u) == 3u)
     {
         return (backdrop * (1.0 - src.w)) + src;
     }
@@ -1017,12 +1005,12 @@
     float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15);
     float3 cb = backdrop.xyz * inv_backdrop_a;
     uint blend_mode = mode >> uint(8);
-    float3 param = cs;
-    float3 param_1 = cb;
+    float3 param = cb;
+    float3 param_1 = cs;
     uint param_2 = blend_mode;
     float3 blended = mix_blend(param, param_1, param_2);
     cs = mix(cs, blended, float3(backdrop.w));
-    uint comp_mode = mode * 255u;
+    uint comp_mode = mode & 255u;
     if (comp_mode == 3u)
     {
         float3 co = mix(backdrop.xyz, cs, float3(src.w));
@@ -1059,11 +1047,11 @@
     return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1749.conf.ptcl_alloc.offset;
+    param.offset = _1681.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -1226,10 +1214,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2254 = fromsRGB(param_29);
+                    fg_rgba.x = _2254.x;
+                    fg_rgba.y = _2254.y;
+                    fg_rgba.z = _2254.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1252,10 +1240,10 @@
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2364 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2364.x;
+                    fg_rgba_1.y = _2364.y;
+                    fg_rgba_1.z = _2364.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1285,8 +1273,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
+                    uint _2470 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2470;
                     rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index b145245..f0e2963 100644
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index abe1d22..d48974d 100644
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index f402268..5d9b88d 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -162,7 +162,7 @@
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
 RWByteAddressBuffer _297 : register(u0, space0);
-ByteAddressBuffer _1749 : register(t1, space0);
+ByteAddressBuffer _1681 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@@ -347,10 +347,7 @@
 
 float3 fromsRGB(float3 srgb)
 {
-    bool3 cutoff = bool3(srgb.x >= 0.040449999272823333740234375f.xxx.x, srgb.y >= 0.040449999272823333740234375f.xxx.y, srgb.z >= 0.040449999272823333740234375f.xxx.z);
-    float3 below = srgb / 12.9200000762939453125f.xxx;
-    float3 above = pow((srgb + 0.054999999701976776123046875f.xxx) / 1.05499994754791259765625f.xxx, 2.400000095367431640625f.xxx);
-    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+    return srgb;
 }
 
 float4 unpacksRGB(uint srgba)
@@ -477,10 +474,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1653 = fromsRGB(param_1);
+        fg_rgba.x = _1653.x;
+        fg_rgba.y = _1653.y;
+        fg_rgba.z = _1653.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -488,10 +485,7 @@
 
 float3 tosRGB(float3 rgb)
 {
-    bool3 cutoff = bool3(rgb.x >= 0.003130800090730190277099609375f.xxx.x, rgb.y >= 0.003130800090730190277099609375f.xxx.y, rgb.z >= 0.003130800090730190277099609375f.xxx.z);
-    float3 below = 12.9200000762939453125f.xxx * rgb;
-    float3 above = (1.05499994754791259765625f.xxx * pow(rgb, 0.416660010814666748046875f.xxx)) - 0.054999999701976776123046875f.xxx;
-    return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
+    return rgb;
 }
 
 uint packsRGB(inout float4 rgba)
@@ -529,7 +523,10 @@
 {
     float3 param = cb;
     float3 param_1 = (cs * 2.0f) - 1.0f.xxx;
-    return lerp(screen(param, param_1), (cb * 2.0f) * cs, float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+    float3 _889 = screen(param, param_1);
+    float3 _893 = (cb * 2.0f) * cs;
+    bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+    return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? _893.z : _889.z);
 }
 
 float color_dodge(float cb, float cs)
@@ -572,8 +569,14 @@
 
 float3 soft_light(float3 cb, float3 cs)
 {
-    float3 d = lerp(sqrt(cb), ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb, float3(bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z)));
-    return lerp(cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)), cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)), float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+    float3 _904 = sqrt(cb);
+    float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb;
+    bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z);
+    float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z);
+    float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb));
+    float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb));
+    bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+    return float3(_944.x ? _942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? _942.z : _932.z);
 }
 
 float sat(float3 c)
@@ -706,8 +709,8 @@
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _1052 = clip_color(param_1);
-    return _1052;
+    float3 _1048 = clip_color(param_1);
+    return _1048;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -795,9 +798,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1343 = set_sat(param_21, param_22);
+            float3 _1340 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1343;
+            float3 param_24 = _1340;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -807,9 +810,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1357 = set_sat(param_27, param_28);
+            float3 _1354 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1357;
+            float3 param_30 = _1354;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -919,12 +922,6 @@
         }
         case 13u:
         {
-            float rev_as = 1.0f - as;
-            float rev_ab = 1.0f - ab;
-            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
-        {
             return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
         default:
@@ -940,7 +937,7 @@
 
 float4 mix_blend_compose(float4 backdrop, float4 src, uint mode)
 {
-    if (mode == 3u)
+    if ((mode & 32767u) == 3u)
     {
         return (backdrop * (1.0f - src.w)) + src;
     }
@@ -949,12 +946,12 @@
     float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f);
     float3 cb = backdrop.xyz * inv_backdrop_a;
     uint blend_mode = mode >> uint(8);
-    float3 param = cs;
-    float3 param_1 = cb;
+    float3 param = cb;
+    float3 param_1 = cs;
     uint param_2 = blend_mode;
     float3 blended = mix_blend(param, param_1, param_2);
     cs = lerp(cs, blended, backdrop.w.xxx);
-    uint comp_mode = mode * 255u;
+    uint comp_mode = mode & 255u;
     if (comp_mode == 3u)
     {
         float3 co = lerp(backdrop.xyz, cs, src.w.xxx);
@@ -992,16 +989,16 @@
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1764;
-    _1764.offset = _1749.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1696;
+    _1696.offset = _1681.Load(24);
     Alloc param;
-    param.offset = _1764.offset;
+    param.offset = _1696.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1773 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1773;
+    CmdRef _1705 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1705;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1035,8 +1032,8 @@
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1867 = { stroke.tile_ref };
-                tile_seg_ref = _1867;
+                TileSegRef _1800 = { stroke.tile_ref };
+                tile_seg_ref = _1800;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1072,8 +1069,8 @@
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1987 = { fill.tile_ref };
-                tile_seg_ref = _1987;
+                TileSegRef _1920 = { fill.tile_ref };
+                tile_seg_ref = _1920;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1162,10 +1159,10 @@
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2254 = fromsRGB(param_29);
+                    fg_rgba.x = _2254.x;
+                    fg_rgba.y = _2254.y;
+                    fg_rgba.z = _2254.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1188,10 +1185,10 @@
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2364 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2364.x;
+                    fg_rgba_1.y = _2364.y;
+                    fg_rgba_1.z = _2364.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1205,9 +1202,9 @@
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2474[8];
-                fillImage(_2474, param_36, param_37);
-                float4 img[8] = _2474;
+                float4 _2407[8];
+                fillImage(_2407, param_36, param_37);
+                float4 img[8] = _2407;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1222,8 +1219,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
+                    uint _2470 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2470;
                     rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
@@ -1256,8 +1253,8 @@
             {
                 Alloc param_45 = cmd_alloc;
                 CmdRef param_46 = cmd_ref;
-                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2615;
+                CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref };
+                cmd_ref = _2548;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 9647001..8c608c3 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -393,10 +393,7 @@
 static inline __attribute__((always_inline))
 float3 fromsRGB(thread const float3& srgb)
 {
-    bool3 cutoff = srgb >= float3(0.040449999272823333740234375);
-    float3 below = srgb / float3(12.9200000762939453125);
-    float3 above = pow((srgb + float3(0.054999999701976776123046875)) / float3(1.05499994754791259765625), float3(2.400000095367431640625));
-    return select(below, above, cutoff);
+    return srgb;
 }
 
 static inline __attribute__((always_inline))
@@ -528,10 +525,10 @@
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1653 = fromsRGB(param_1);
+        fg_rgba.x = _1653.x;
+        fg_rgba.y = _1653.y;
+        fg_rgba.z = _1653.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -540,10 +537,7 @@
 static inline __attribute__((always_inline))
 float3 tosRGB(thread const float3& rgb)
 {
-    bool3 cutoff = rgb >= float3(0.003130800090730190277099609375);
-    float3 below = float3(12.9200000762939453125) * rgb;
-    float3 above = (float3(1.05499994754791259765625) * pow(rgb, float3(0.416660010814666748046875))) - float3(0.054999999701976776123046875);
-    return select(below, above, cutoff);
+    return rgb;
 }
 
 static inline __attribute__((always_inline))
@@ -585,7 +579,7 @@
 {
     float3 param = cb;
     float3 param_1 = (cs * 2.0) - float3(1.0);
-    return mix(screen(param, param_1), (cb * 2.0) * cs, float3(cs <= float3(0.5)));
+    return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5));
 }
 
 static inline __attribute__((always_inline))
@@ -631,8 +625,8 @@
 static inline __attribute__((always_inline))
 float3 soft_light(thread const float3& cb, thread const float3& cs)
 {
-    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, float3(cb <= float3(0.25)));
-    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), float3(cs <= float3(0.5)));
+    float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25));
+    return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5));
 }
 
 static inline __attribute__((always_inline))
@@ -771,8 +765,8 @@
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _1052 = clip_color(param_1);
-    return _1052;
+    float3 _1048 = clip_color(param_1);
+    return _1048;
 }
 
 static inline __attribute__((always_inline))
@@ -861,9 +855,9 @@
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1343 = set_sat(param_21, param_22);
+            float3 _1340 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1343;
+            float3 param_24 = _1340;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -873,9 +867,9 @@
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1357 = set_sat(param_27, param_28);
+            float3 _1354 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1357;
+            float3 param_30 = _1354;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -986,12 +980,6 @@
         }
         case 13u:
         {
-            float rev_as = 1.0 - as;
-            float rev_ab = 1.0 - ab;
-            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
-        {
             return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
         default:
@@ -1008,7 +996,7 @@
 static inline __attribute__((always_inline))
 float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode)
 {
-    if (mode == 3u)
+    if ((mode & 32767u) == 3u)
     {
         return (backdrop * (1.0 - src.w)) + src;
     }
@@ -1017,12 +1005,12 @@
     float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15);
     float3 cb = backdrop.xyz * inv_backdrop_a;
     uint blend_mode = mode >> uint(8);
-    float3 param = cs;
-    float3 param_1 = cb;
+    float3 param = cb;
+    float3 param_1 = cs;
     uint param_2 = blend_mode;
     float3 blended = mix_blend(param, param_1, param_2);
     cs = mix(cs, blended, float3(backdrop.w));
-    uint comp_mode = mode * 255u;
+    uint comp_mode = mode & 255u;
     if (comp_mode == 3u)
     {
         float3 co = mix(backdrop.xyz, cs, float3(src.w));
@@ -1059,11 +1047,11 @@
     return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1749.conf.ptcl_alloc.offset;
+    param.offset = _1681.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -1226,10 +1214,10 @@
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2254 = fromsRGB(param_29);
+                    fg_rgba.x = _2254.x;
+                    fg_rgba.y = _2254.y;
+                    fg_rgba.z = _2254.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1252,10 +1240,10 @@
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2364 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2364.x;
+                    fg_rgba_1.y = _2364.y;
+                    fg_rgba_1.z = _2364.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1285,8 +1273,8 @@
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
+                    uint _2470 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2470;
                     rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 2dd46c0..6ff1791 100644
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index a0710d2..99fd22e 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -9,6 +9,11 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
+// Rendering can be done either directly in the sRGB colorspace (set to 0,
+// for compatibility) or in a linear colorspace with conversions from and to
+// sRGB (set to 1, which gives higher quality antialiasing among other things).
+#define DO_SRGB_CONVERSION 0
+
 #include "mem.h"
 #include "setup.h"
 
@@ -39,18 +44,26 @@
 
 #define MAX_BLEND_STACK 128
 mediump vec3 tosRGB(mediump vec3 rgb) {
+#if DO_SRGB_CONVERSION
     bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308));
     mediump vec3 below = vec3(12.92) * rgb;
     mediump vec3 above = vec3(1.055) * pow(rgb, vec3(0.41666)) - vec3(0.055);
     return mix(below, above, cutoff);
+#else
+    return rgb;
+#endif
 }
 
 mediump vec3 fromsRGB(mediump vec3 srgb) {
+#if DO_SRGB_CONVERSION
     // Formula from EXT_sRGB.
     bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045));
     mediump vec3 below = srgb / vec3(12.92);
     mediump vec3 above = pow((srgb + vec3(0.055)) / vec3(1.055), vec3(2.4));
     return mix(below, above, cutoff);
+#else
+    return srgb;
+#endif
 }
 
 // unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color
diff --git a/piet-gpu/src/blend.rs b/piet-gpu/src/blend.rs
index 7edcb4e..f0ca002 100644
--- a/piet-gpu/src/blend.rs
+++ b/piet-gpu/src/blend.rs
@@ -33,6 +33,8 @@
     Saturation = 13,
     Color = 14,
     Luminosity = 15,
+    // Clip behaves like Normal, but does not force a blend group (isolation) to be pushed.
+    Clip = 128,
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
@@ -76,7 +78,7 @@
 impl Default for Blend {
     fn default() -> Self {
         Self {
-            mode: BlendMode::Normal,
+            mode: BlendMode::Clip,
             composition_mode: CompositionMode::SrcOver,
         }
     }
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index a24ddbc..d0ef1eb 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -306,16 +306,21 @@
         self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
-
     /// Encode a fill radial gradient draw object.
     ///
     /// This should be encoded after a path.
     pub fn fill_rad_gradient(&mut self, index: u32, p0: [f32; 2], p1: [f32; 2], r0: f32, r1: f32) {
         self.drawtag_stream.push(DRAWTAG_FILLRADGRADIENT);
-        let element = FillRadGradient { index, p0, p1, r0, r1 };
+        let element = FillRadGradient {
+            index,
+            p0,
+            p1,
+            r0,
+            r1,
+        };
         self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
-    
+
     /// Start a clip.
     pub fn begin_clip(&mut self, blend: Option<Blend>) {
         self.drawtag_stream.push(DRAWTAG_BEGINCLIP);
diff --git a/piet-gpu/src/gradient.rs b/piet-gpu/src/gradient.rs
index e655908..443eaec 100644
--- a/piet-gpu/src/gradient.rs
+++ b/piet-gpu/src/gradient.rs
@@ -19,7 +19,7 @@
 use std::collections::hash_map::{Entry, HashMap};
 
 use piet::kurbo::Point;
-use piet::{Color, FixedLinearGradient, GradientStop, FixedRadialGradient};
+use piet::{Color, FixedLinearGradient, FixedRadialGradient, GradientStop};
 
 /// Radial gradient compatible with COLRv1 spec
 #[derive(Debug, Clone)]
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index d32a9c5..ba06e71 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -13,8 +13,8 @@
 
 pub use blend::{Blend, BlendMode, CompositionMode};
 pub use encoder::EncodedSceneRef;
-pub use render_ctx::PietGpuRenderContext;
 pub use gradient::Colrv1RadialGradient;
+pub use render_ctx::PietGpuRenderContext;
 
 use piet::kurbo::Vec2;
 use piet::{ImageFormat, RenderContext};
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index dca03eb..14f2561 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -1,9 +1,12 @@
+// Must match the `DO_SRGB_CONVERSION` define in shader/kernel4.comp for correct rendering.
+const DO_SRGB_CONVERSION: bool = false;
+
 use std::borrow::Cow;
 
 use crate::encoder::GlyphEncoder;
 use crate::stages::{Config, Transform};
 use crate::MAX_BLEND_STACK;
-use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
+use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
 use piet::{
     Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
     StrokeStyle,
@@ -13,7 +16,7 @@
 use piet_gpu_types::encoder::{Encode, Encoder};
 use piet_gpu_types::scene::Element;
 
-use crate::gradient::{LinearGradient, RadialGradient, RampCache, Colrv1RadialGradient};
+use crate::gradient::{Colrv1RadialGradient, LinearGradient, RadialGradient, RampCache};
 use crate::text::Font;
 pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
 use crate::Blend;
@@ -471,19 +474,27 @@
 }
 
 fn to_srgb(f: f64) -> f64 {
-    if f <= 0.0031308 {
-        f * 12.92
+    if DO_SRGB_CONVERSION {
+        if f <= 0.0031308 {
+            f * 12.92 // linear segment for small values
+        } else {
+            let a = 0.055;
+            (1. + a) * f64::powf(f, f64::recip(2.4)) - a // power segment, exponent 1/2.4
+        }
     } else {
-        let a = 0.055;
-        (1. + a) * f64::powf(f, f64::recip(2.4)) - a
+        f // conversion disabled: the input is returned unchanged
     }
 }
 
 fn from_srgb(f: f64) -> f64 {
-    if f <= 0.04045 {
-        f / 12.92
+    if DO_SRGB_CONVERSION {
+        if f <= 0.04045 {
+            f / 12.92 // linear segment for small values
+        } else {
+            let a = 0.055;
+            f64::powf((f + a) * f64::recip(1. + a), 2.4) // power segment, exponent 2.4
+        }
     } else {
-        let a = 0.055;
-        f64::powf((f + a) * f64::recip(1. + a), 2.4)
+        f // conversion disabled: the input is returned unchanged
     }
 }
diff --git a/piet-gpu/src/stages/clip.rs b/piet-gpu/src/stages/clip.rs
index 2fd195b..b7b77eb 100644
--- a/piet-gpu/src/stages/clip.rs
+++ b/piet-gpu/src/stages/clip.rs
@@ -16,7 +16,9 @@
 
 //! The clip processing stage (includes substages).
 
-use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
+use piet_gpu_hal::{
+    include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session,
+};
 
 // Note that this isn't the code/stage/binding pattern of most of the other stages
 // in the new element processing pipeline. We want to move those temporary buffers
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index bfd2af2..e3aeaba 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -2,10 +2,10 @@
 
 use rand::{Rng, RngCore};
 
-use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext, Colrv1RadialGradient};
+use crate::{Blend, BlendMode, Colrv1RadialGradient, CompositionMode, PietGpuRenderContext};
 use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
-    Color, FixedGradient, FixedRadialGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
+    Color, GradientStop, LinearGradient, Text, TextAttribute, TextLayoutBuilder, UnitPoint,
 };
 
 use crate::{PicoSvg, RenderContext, Vec2};
@@ -200,6 +200,113 @@
     println!("flattening and encoding time: {:?}", start.elapsed());
 }
 
+pub fn render_blend_square(rc: &mut PietGpuRenderContext, blend: Blend) {
+    // Inspired by https://developer.mozilla.org/en-US/docs/Web/CSS/mix-blend-mode
+    let rect = Rect::new(0., 0., 200., 200.); // 200x200 cell; reused as fill and blend bounds
+    let stops = vec![
+        GradientStop {
+            color: Color::BLACK,
+            pos: 0.0,
+        },
+        GradientStop {
+            color: Color::WHITE,
+            pos: 1.0,
+        },
+    ];
+    let linear = LinearGradient::new(UnitPoint::LEFT, UnitPoint::RIGHT, stops);
+    rc.fill(rect, &linear); // backdrop: black-to-white gradient, left to right
+    const GRADIENTS: &[(f64, f64, Color)] = &[
+        (150., 0., Color::rgb8(255, 240, 64)), // entries are (center x, center y, inner color)
+        (175., 100., Color::rgb8(255, 96, 240)),
+        (125., 200., Color::rgb8(64, 192, 255)),
+    ];
+    for (x, y, c) in GRADIENTS {
+        let stops = vec![
+            GradientStop {
+                color: c.clone(),
+                pos: 0.0,
+            },
+            GradientStop {
+                color: Color::rgba8(0, 0, 0, 0), // alpha 0 at the outer stop
+                pos: 1.0,
+            },
+        ];
+        let rad = Colrv1RadialGradient {
+            center0: Point::new(*x, *y),
+            center1: Point::new(*x, *y), // same center for both circles
+            radius0: 0.0,
+            radius1: 100.0,
+            stops,
+        };
+        let brush = rc.radial_gradient_colrv1(&rad);
+        rc.fill(Rect::new(0., 0., 200., 200.), &brush); // each radial fill covers the whole cell
+    }
+    const COLORS: &[Color] = &[
+        Color::rgb8(255, 0, 0), // red, green, blue: one per ellipse drawn below
+        Color::rgb8(0, 255, 0),
+        Color::rgb8(0, 0, 255),
+    ];
+    let _ = rc.with_save(|rc| {
+        // Isolation (this can be removed for non-isolated version)
+        rc.blend(rect, BlendMode::Normal.into());
+        for (i, c) in COLORS.iter().enumerate() {
+            let stops = vec![
+                GradientStop {
+                    color: Color::WHITE,
+                    pos: 0.0,
+                },
+                GradientStop {
+                    color: c.clone(),
+                    pos: 1.0,
+                },
+            ];
+            // squash the circle into an ellipse about (100, 100); rotated 60, 180, 300 degrees
+            let a = Affine::translate((100., 100.))
+                * Affine::rotate(std::f64::consts::FRAC_PI_3 * (i * 2 + 1) as f64)
+                * Affine::scale_non_uniform(1.0, 0.357)
+                * Affine::translate((-100., -100.));
+            let linear = LinearGradient::new(UnitPoint::TOP, UnitPoint::BOTTOM, stops);
+            let _ = rc.with_save(|rc| {
+                rc.blend(rect, blend); // the caller-supplied blend under test
+                rc.transform(a);
+                rc.fill(Circle::new((100., 100.), 90.), &linear);
+                Ok(())
+            });
+        }
+        Ok(())
+    });
+}
+
+pub fn render_blend_grid(rc: &mut PietGpuRenderContext) {
+    const BLEND_MODES: &[BlendMode] = &[
+        BlendMode::Normal, // 16 modes in total, laid out four per row below
+        BlendMode::Multiply,
+        BlendMode::Darken,
+        BlendMode::Screen,
+        BlendMode::Lighten,
+        BlendMode::Overlay,
+        BlendMode::ColorDodge,
+        BlendMode::ColorBurn,
+        BlendMode::HardLight,
+        BlendMode::SoftLight,
+        BlendMode::Difference,
+        BlendMode::Exclusion,
+        BlendMode::Hue,
+        BlendMode::Saturation,
+        BlendMode::Color,
+        BlendMode::Luminosity,
+    ];
+    for (ix, &blend) in BLEND_MODES.iter().enumerate() {
+        let _ = rc.with_save(|rc| {
+            let i = ix % 4; // column index; cells are spaced 225 units apart
+            let j = ix / 4; // row index
+            rc.transform(Affine::translate((i as f64 * 225., j as f64 * 225.)));
+            render_blend_square(rc, blend.into()); // one square per blend mode
+            Ok(())
+        });
+    }
+}
+
 pub fn render_anim_frame(rc: &mut impl RenderContext, i: usize) {
     rc.fill(
         Rect::new(0.0, 0.0, 1000.0, 1000.0),
diff --git a/piet-scene/src/glyph/mod.rs b/piet-scene/src/glyph/mod.rs
index 3bfa36c..81d9735 100644
--- a/piet-scene/src/glyph/mod.rs
+++ b/piet-scene/src/glyph/mod.rs
@@ -114,7 +114,9 @@
                     };
                     xform_stack.push(xform);
                 }
-                Command::PopTransform => { xform_stack.pop(); },
+                Command::PopTransform => {
+                    xform_stack.pop();
+                }
                 Command::PushClip(path_index) => {
                     let path = glyph.path(*path_index)?;
                     if let Some(xform) = xform_stack.last() {