Merge branch 'master' into mtl_guest
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index 1086d3b..02ac7cb 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -225,6 +225,12 @@
     /// Prepare the timestamps for reading. This isn't required on Vulkan but
     /// is required on (at least) DX12.
     unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
+
+    /// Begin a labeled section for debugging and profiling purposes.
+    unsafe fn begin_debug_label(&mut self, _label: &str) {}
+
+    /// End a section opened by `begin_debug_label`.
+    unsafe fn end_debug_label(&mut self) {}
 }
 
 /// A builder for descriptor sets with more complex layouts.
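The two hooks above ship with empty default bodies, so backends without native label support keep compiling unchanged (the same pattern as the `_pool` parameter of `finish_timestamps`); only backends that opt in forward the calls, as the Vulkan hunk further down does via `DebugUtils`. A minimal sketch of a convenience wrapper over the trait — `with_debug_label` is a hypothetical helper, not part of this change:

```rust
use crate::backend::{CmdBuf, Device};

/// Hypothetical helper: run `f` inside a balanced begin/end label span on any
/// backend command buffer, so the closing call cannot be forgotten.
pub unsafe fn with_debug_label<D: Device, C: CmdBuf<D>, R>(
    cmd_buf: &mut C,
    label: &str,
    f: impl FnOnce(&mut C) -> R,
) -> R {
    cmd_buf.begin_debug_label(label);
    let result = f(cmd_buf);
    // Labels are expected to be balanced and strictly nested per command buffer.
    cmd_buf.end_debug_label();
    result
}
```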
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 8c5926a..cc09832 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -598,6 +598,16 @@
         self.cmd_buf().finish_timestamps(pool);
     }
 
+    /// Begin a labeled section for debugging and profiling purposes.
+    pub unsafe fn begin_debug_label(&mut self, label: &str) {
+        self.cmd_buf().begin_debug_label(label);
+    }
+
+    /// End a section opened by `begin_debug_label`.
+    pub unsafe fn end_debug_label(&mut self) {
+        self.cmd_buf().end_debug_label();
+    }
+
     /// Make sure the resource lives until the command buffer completes.
     ///
     /// The submitted command buffer will hold this reference until the corresponding
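At the hub level these methods simply forward to the `mux::CmdBuf` (and from there to the active backend), so callers only need to keep the begin/end pairs balanced. A short usage sketch mirroring the piet-gpu/src/lib.rs hunk further down in this diff; `dispatch_labeled` is a hypothetical helper, and the `CmdBuf`, `Pipeline`, and `DescriptorSet` re-exports from the crate root are assumed:

```rust
use piet_gpu_hal::{CmdBuf, DescriptorSet, Pipeline};

/// Hypothetical helper: bracket a single dispatch with a debug label.
unsafe fn dispatch_labeled(
    cmd_buf: &mut CmdBuf,
    label: &str,
    pipeline: &Pipeline,
    descriptor_set: &DescriptorSet,
    workgroup_count: (u32, u32, u32),
    workgroup_size: (u32, u32, u32),
) {
    cmd_buf.begin_debug_label(label);
    cmd_buf.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
    cmd_buf.end_debug_label();
}
```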
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index 4a54e96..af1702d 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -772,6 +772,22 @@
             CmdBuf::Mtl(c) => c.finish_timestamps(pool.mtl()),
         }
     }
+
+    pub unsafe fn begin_debug_label(&mut self, label: &str) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.begin_debug_label(label),
+            CmdBuf::Dx12(c) => c.begin_debug_label(label),
+            CmdBuf::Mtl(c) => c.begin_debug_label(label),
+        }
+    }
+
+    pub unsafe fn end_debug_label(&mut self) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.end_debug_label(),
+            CmdBuf::Dx12(c) => c.end_debug_label(),
+            CmdBuf::Mtl(c) => c.end_debug_label(),
+        }
+    }
 }
 
 impl Buffer {
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 924e2d6..8392899 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -8,6 +8,7 @@
 
 use ash::extensions::{ext::DebugUtils, khr};
 use ash::{vk, Device, Entry, Instance};
+use ash::vk::DebugUtilsLabelEXT;
 
 use smallvec::SmallVec;
 
@@ -23,7 +24,7 @@
     entry: Entry,
     instance: Instance,
     vk_version: u32,
-    _dbg_loader: Option<DebugUtils>,
+    dbg_loader: Option<DebugUtils>,
     _dbg_callbk: Option<vk::DebugUtilsMessengerEXT>,
 }
 
@@ -39,6 +40,7 @@
 
 struct RawDevice {
     device: Device,
+    dbg_loader: Option<DebugUtils>,
 }
 
 pub struct VkSurface {
@@ -202,7 +204,7 @@
                 None,
             )?;
 
-            let (_dbg_loader, _dbg_callbk) = if has_debug_ext {
+            let (dbg_loader, _dbg_callbk) = if has_debug_ext {
                 let dbg_info = vk::DebugUtilsMessengerCreateInfoEXT::builder()
                     .message_severity(
                         vk::DebugUtilsMessageSeverityFlagsEXT::ERROR
@@ -231,7 +233,7 @@
                 entry,
                 instance,
                 vk_version,
-                _dbg_loader,
+                dbg_loader,
                 _dbg_callbk,
             };
 
@@ -317,7 +319,7 @@
         let queue_index = 0;
         let queue = device.get_device_queue(qfi, queue_index);
 
-        let device = Arc::new(RawDevice { device });
+        let device = Arc::new(RawDevice { device, dbg_loader: self.dbg_loader.clone() });
 
         let props = self.instance.get_physical_device_properties(pdevice);
         let timestamp_period = props.limits.timestamp_period;
@@ -1112,6 +1114,20 @@
             query,
         );
     }
+
+    unsafe fn begin_debug_label(&mut self, label: &str) {
+        if let Some(utils) = &self.device.dbg_loader {
+            let label_cstr = CString::new(label).unwrap();
+            let label_ext = DebugUtilsLabelEXT::builder().label_name(&label_cstr).build();
+            utils.cmd_begin_debug_utils_label(self.cmd_buf, &label_ext);
+        }
+    }
+
+    unsafe fn end_debug_label(&mut self) {
+        if let Some(utils) = &self.device.dbg_loader {
+            utils.cmd_end_debug_utils_label(self.cmd_buf);
+        }
+    }
 }
 
 impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
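With this wiring, labels are only emitted when the instance was created with the debug_utils extension (`dbg_loader` is `Some`); otherwise both calls are silent no-ops. Note that `CString::new(label).unwrap()` panics only if a label ever contains an embedded NUL byte. A small optional refinement, sketched below and not part of this change: `DebugUtilsLabelEXT` also carries a `color` field, and ash's generated builder is assumed to expose a setter for it (if your ash version differs, fill the field directly), which lets capture tools that honor the color tint the span:

```rust
use std::ffi::CString;

use ash::extensions::ext::DebugUtils;
use ash::vk;

/// Sketch only: begin a debug label with an RGBA color attached.
unsafe fn begin_colored_label(
    dbg: &DebugUtils,
    cmd_buf: vk::CommandBuffer,
    label: &str,
    rgba: [f32; 4],
) {
    // Skip the label rather than panic if it contains an interior NUL.
    if let Ok(label_cstr) = CString::new(label) {
        let label_ext = vk::DebugUtilsLabelEXT::builder()
            .label_name(&label_cstr)
            .color(rgba)
            .build();
        dbg.cmd_begin_debug_utils_label(cmd_buf, &label_ext);
    }
}
```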
diff --git a/piet-gpu/shader/.clang-format b/piet-gpu/shader/.clang-format
new file mode 100644
index 0000000..9801ccd
--- /dev/null
+++ b/piet-gpu/shader/.clang-format
@@ -0,0 +1,5 @@
+BasedOnStyle: LLVM
+IndentWidth: 4
+ColumnLimit: 120
+AllowShortFunctionsOnASingleLine: None
+SortIncludes: false
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index e4140cd..0c698b1 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -57,10 +57,10 @@
         if (element_ix < conf.n_elements) {
             AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
             switch (tag.tag) {
-                case Annotated_Image:
-                case Annotated_LinGradient:
-                case Annotated_BeginClip:
-                case Annotated_Color:
+            case Annotated_Image:
+            case Annotated_LinGradient:
+            case Annotated_BeginClip:
+            case Annotated_Color:
                 if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
                     break;
                 }
@@ -77,7 +77,8 @@
                     // long as it doesn't cross the left edge.
                     row_count = 0;
                 }
-                Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+                Alloc path_alloc = new_alloc(
+                    path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
                 sh_row_alloc[th_ix] = path_alloc;
             }
         }
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index c2b81fd..a3a8ffd 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -75,13 +75,14 @@
     // trying to keep divergence low.
     // Right now, it's just a bbox, but we'll get finer with
     // segments.
-    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
-    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
+    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
+    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1) / N_TILE_Y;
     x0 = clamp(x0, 0, int(width_in_bins));
     x1 = clamp(x1, x0, int(width_in_bins));
     y0 = clamp(y0, 0, int(height_in_bins));
     y1 = clamp(y1, y0, int(height_in_bins));
-    if (x0 == x1) y1 = y0;
+    if (x0 == x1)
+        y1 = y0;
     int x = x0, y = y0;
     uint my_slice = gl_LocalInvocationID.x / 32;
     uint my_mask = 1u << (gl_LocalInvocationID.x & 31);
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index e79908a..448caf2 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -118,3 +118,7 @@
 build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
 build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
 build gen/draw_leaf.msl: msl gen/draw_leaf.spv
+
+build spv: phony gen/backdrop_lg.spv gen/backdrop.spv gen/bbox_clear.spv gen/binning.spv gen/coarse.spv gen/draw_leaf.spv gen/draw_reduce.spv gen/draw_root.spv gen/kernel4.spv gen/path_coarse.spv gen/pathseg.spv gen/pathtag_reduce.spv gen/pathtag_root.spv gen/tile_alloc.spv gen/transform_leaf.spv gen/transform_reduce.spv gen/transform_root.spv
+build dxil: phony gen/backdrop.hlsl gen/backdrop_lg.hlsl gen/bbox_clear.hlsl gen/binning.hlsl gen/coarse.hlsl gen/draw_leaf.hlsl gen/draw_reduce.hlsl gen/draw_root.hlsl gen/kernel4.hlsl gen/path_coarse.hlsl gen/pathseg.hlsl gen/pathtag_reduce.hlsl gen/pathtag_root.hlsl gen/tile_alloc.hlsl gen/transform_leaf.hlsl gen/transform_reduce.hlsl gen/transform_root.hlsl
+build msl: phony gen/backdrop_lg.msl gen/backdrop.msl gen/bbox_clear.msl gen/binning.msl gen/coarse.msl gen/draw_leaf.msl gen/draw_reduce.msl gen/draw_root.msl gen/kernel4.msl gen/path_coarse.msl gen/pathseg.msl gen/pathtag_reduce.msl gen/pathtag_root.msl gen/tile_alloc.msl gen/transform_leaf.msl gen/transform_reduce.msl gen/transform_root.msl
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 31a64e4..bf5f949 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -8,7 +8,8 @@
 // Each workgroup operating on one bin by stream compacting
 // the elements corresponding to the bin.
 //
-// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be encoded.
+// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be
+// encoded.
 
 #version 450
 #extension GL_GOOGLE_include_directive : enable
@@ -66,7 +67,7 @@
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
     // All memory.
-    return new_alloc(0, memory.length()*4, mem_ok);
+    return new_alloc(0, memory.length() * 4, mem_ok);
 }
 #endif
 
@@ -111,7 +112,7 @@
 void main() {
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
-    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
+    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
     uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
     uint partition_ix = 0;
     uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
@@ -163,7 +164,7 @@
                     uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
                     count = read_mem(conf.bin_alloc, in_ix);
                     uint offset = read_mem(conf.bin_alloc, in_ix + 1);
-                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size, mem_ok);
+                    sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, mem_ok);
                 }
                 // prefix sum of counts
                 for (uint i = 0; i < LG_N_PART_READ; i++) {
@@ -245,7 +246,8 @@
             // base relative to bin
             uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
             sh_tile_base[th_ix] = base;
-            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+            Alloc path_alloc = new_alloc(path.tiles.offset,
+                                         (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
             write_tile_alloc(th_ix, path_alloc);
             break;
         default:
@@ -284,7 +286,8 @@
             if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
                 include_tile = true;
             } else if (mem_ok) {
-                Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
+                Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
+                                      TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
                 // Include the path in the tile if
                 // - the tile contains at least a segment (tile offset non-zero)
                 // - the tile is completely covered (backdrop non-zero)
@@ -329,8 +332,9 @@
             if (clip_zero_depth == 0) {
                 switch (tag.tag) {
                 case Annotated_Color:
-                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
-                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                                          TileRef(sh_tile_base[element_ref_ix] +
+                                                  (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
@@ -340,8 +344,9 @@
                     cmd_ref.offset += 4 + CmdColor_size;
                     break;
                 case Annotated_LinGradient:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
-                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                                     TileRef(sh_tile_base[element_ref_ix] +
+                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoLinGradient lin = Annotated_LinGradient_read(conf.anno_alloc, ref);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
@@ -356,8 +361,9 @@
                     cmd_ref.offset += 4 + CmdLinGrad_size;
                     break;
                 case Annotated_Image:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
-                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                                     TileRef(sh_tile_base[element_ref_ix] +
+                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
@@ -367,8 +373,9 @@
                     cmd_ref.offset += 4 + CmdImage_size;
                     break;
                 case Annotated_BeginClip:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
-                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                                     TileRef(sh_tile_base[element_ref_ix] +
+                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     if (tile.tile.offset == 0 && tile.backdrop == 0) {
                         clip_zero_depth = clip_depth + 1;
                     } else if (tile.tile.offset == 0 && clip_depth < 32) {
@@ -418,7 +425,8 @@
         barrier();
 
         rd_ix += N_TILE;
-        if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
+        if (rd_ix >= ready_ix && partition_ix >= n_partitions)
+            break;
     }
     if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
         Cmd_End_write(cmd_alloc, cmd_ref);
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
index 5de2652..c020847 100644
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -3,7 +3,6 @@
 // The leaf scan pass for draw tag scan implemented as a tree reduction.
 // This stage can be fused with its consumer but is separate now.
 
-
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
@@ -62,7 +61,7 @@
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
-    
+
     barrier();
     Monoid row = tag_monoid_identity();
     if (gl_WorkGroupID.x > 0) {
diff --git a/piet-gpu/shader/draw_scan.comp b/piet-gpu/shader/draw_scan.comp
index 2afc9ba..1c26c26 100644
--- a/piet-gpu/shader/draw_scan.comp
+++ b/piet-gpu/shader/draw_scan.comp
@@ -51,7 +51,7 @@
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
-    
+
     barrier();
     // This could be a semigroup instead of a monoid if we reworked the
     // conditional logic, but that might impact performance.
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 23353bc..9aba204 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -14,7 +14,7 @@
 
 #define CHUNK_X 2
 #define CHUNK_Y 4
-#define CHUNK CHUNK_X * CHUNK_Y
+#define CHUNK (CHUNK_X * CHUNK_Y)
 #define CHUNK_DX (TILE_WIDTH_PX / CHUNK_X)
 #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
 layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
@@ -39,16 +39,16 @@
 #define MAX_BLEND_STACK 128
 mediump vec3 tosRGB(mediump vec3 rgb) {
     bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308));
-    mediump vec3 below = vec3(12.92)*rgb;
-    mediump vec3 above = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055);
+    mediump vec3 below = vec3(12.92) * rgb;
+    mediump vec3 above = vec3(1.055) * pow(rgb, vec3(0.41666)) - vec3(0.055);
     return mix(below, above, cutoff);
 }
 
 mediump vec3 fromsRGB(mediump vec3 srgb) {
     // Formula from EXT_sRGB.
     bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045));
-    mediump vec3 below = srgb/vec3(12.92);
-    mediump vec3 above = pow((srgb + vec3(0.055))/vec3(1.055), vec3(2.4));
+    mediump vec3 below = srgb / vec3(12.92);
+    mediump vec3 above = pow((srgb + vec3(0.055)) / vec3(1.055), vec3(2.4));
     return mix(below, above, cutoff);
 }
 
@@ -86,7 +86,8 @@
     Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
 
-    uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
+    uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x,
+                          gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
     vec2 xy = vec2(xy_uint);
     mediump vec4 rgba[CHUNK];
     uint blend_stack[MAX_BLEND_STACK][CHUNK];
@@ -108,7 +109,8 @@
             // Calculate distance field from all the line segments in this tile.
             CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
             mediump float df[CHUNK];
-            for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
+            for (uint k = 0; k < CHUNK; k++)
+                df[k] = 1e9;
             TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
             do {
                 TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
@@ -128,7 +130,8 @@
             break;
         case Cmd_Fill:
             CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
-            for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop);
+            for (uint k = 0; k < CHUNK; k++)
+                area[k] = float(fill.backdrop);
             tile_seg_ref = TileSegRef(fill.tile_ref);
             // Calculate coverage based on backdrop + coverage of each line segment
             do {
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index 1bd06f9..c6d3815 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -139,7 +139,8 @@
         bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
         uint path_ix = cubic.path_ix;
         Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
-        Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+        Alloc path_alloc =
+            new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
         ivec4 bbox = ivec4(path.bbox);
         vec2 p0 = cubic.p0;
         qp0 = cubic.p0;
@@ -206,8 +207,8 @@
 
                 TileSeg tile_seg;
 
-                int xray = int(floor(p0.x*SX));
-                int last_xray = int(floor(p1.x*SX));
+                int xray = int(floor(p0.x * SX));
+                int last_xray = int(floor(p1.x * SX));
                 if (p0.y > p1.y) {
                     int tmp = xray;
                     xray = last_xray;
@@ -231,7 +232,7 @@
                     if (y < y1 - 1) {
                         float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
                         float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
-                        next_xray = int(floor(x_edge*SX));
+                        next_xray = int(floor(x_edge * SX));
                     }
 
                     int min_xray = min(xray, next_xray);
@@ -265,7 +266,7 @@
                                 // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
                                 // Nudge zeroes towards the intended sign.
                                 if (tile_seg.vector.x == 0) {
-                                    tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
+                                    tile_seg.vector.x = sign(p1.x - p0.x) * 1e-9;
                                 }
                             }
                             if (x <= min_xray || max_xray < x) {
diff --git a/piet-gpu/shader/pathseg.comp b/piet-gpu/shader/pathseg.comp
index 12104eb..a2ea86e 100644
--- a/piet-gpu/shader/pathseg.comp
+++ b/piet-gpu/shader/pathseg.comp
@@ -46,8 +46,7 @@
     if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
         c.bbox = a.bbox;
     } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
-        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
-    {
+               (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
         c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
         c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
     }
@@ -246,7 +245,7 @@
     }
     // sh_scratch is the partition-wide inclusive scan of the bbox monoid,
     // sampled at the end of the N_SEQ sub-partition.
-    
+
     barrier();
     uint path_ix = save_path_ix;
     uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 6;
diff --git a/piet-gpu/shader/pathtag_scan.comp b/piet-gpu/shader/pathtag_scan.comp
index 7c1e74b..798622e 100644
--- a/piet-gpu/shader/pathtag_scan.comp
+++ b/piet-gpu/shader/pathtag_scan.comp
@@ -51,7 +51,7 @@
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
-    
+
     barrier();
     // This could be a semigroup instead of a monoid if we reworked the
     // conditional logic, but that might impact performance.
diff --git a/piet-gpu/shader/transform_leaf.comp b/piet-gpu/shader/transform_leaf.comp
index c51dfe6..a5e4003 100644
--- a/piet-gpu/shader/transform_leaf.comp
+++ b/piet-gpu/shader/transform_leaf.comp
@@ -68,7 +68,7 @@
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
-    
+
     barrier();
     Monoid row = monoid_identity();
     if (gl_WorkGroupID.x > 0) {
diff --git a/piet-gpu/shader/transform_scan.comp b/piet-gpu/shader/transform_scan.comp
index c4d6745..20b2a8a 100644
--- a/piet-gpu/shader/transform_scan.comp
+++ b/piet-gpu/shader/transform_scan.comp
@@ -66,7 +66,7 @@
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
-    
+
     barrier();
     // This could be a semigroup instead of a monoid if we reworked the
     // conditional logic, but that might impact performance.
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 3c1e27f..97e1f28 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -388,6 +388,7 @@
         cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
         cmd_buf.reset_query_pool(&query_pool);
         cmd_buf.write_timestamp(&query_pool, 0);
+        cmd_buf.begin_debug_label("Element bounding box calculation");
         self.element_stage.record(
             cmd_buf,
             &self.element_code,
@@ -397,43 +398,53 @@
             self.n_pathtag as u32,
             self.n_drawobj as u64,
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Tile allocation");
         cmd_buf.dispatch(
             &self.tile_pipeline,
             &self.tile_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 2);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Path flattening");
         cmd_buf.dispatch(
             &self.path_pipeline,
             &self.path_ds,
             (((self.n_pathseg + 31) / 32) as u32, 1, 1),
             (32, 1, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Backdrop propagation");
         cmd_buf.dispatch(
             &self.backdrop_pipeline,
             &self.backdrop_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, self.backdrop_y, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 4);
         // Note: this barrier is not needed as an actual dependency between
         // pipeline stages, but I am keeping it in so that timer queries are
         // easier to interpret.
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Element binning");
         cmd_buf.dispatch(
             &self.bin_pipeline,
             &self.bin_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 5);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Coarse raster");
         cmd_buf.dispatch(
             &self.coarse_pipeline,
             &self.coarse_ds,
@@ -444,8 +455,10 @@
             ),
             (256, 1, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Fine raster");
         cmd_buf.dispatch(
             &self.k4_pipeline,
             &self.k4_ds,
@@ -456,6 +469,7 @@
             ),
             (8, 4, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);