diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index a4422b9..496a6f0 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -218,6 +218,12 @@
     /// Prepare the timestamps for reading. This isn't required on Vulkan but
     /// is required on (at least) DX12.
     unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
+
+    /// Begin a labeled section for debugging and profiling purposes; a no-op by default.
+    unsafe fn begin_debug_label(&mut self, _label: &str) {}
+
+    /// End a section opened by `begin_debug_label`; a no-op by default.
+    unsafe fn end_debug_label(&mut self) {}
 }
 
 /// A builder for descriptor sets with more complex layouts.
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 7b93372..2a7290d 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -569,6 +569,16 @@
         self.cmd_buf().finish_timestamps(pool);
     }
 
+    /// Begin a labeled debug/profiling section by forwarding `label` to the command buffer.
+    pub unsafe fn begin_debug_label(&mut self, label: &str) {
+        self.cmd_buf().begin_debug_label(label);
+    }
+
+    /// Forward the end of a `begin_debug_label` section to the command buffer.
+    pub unsafe fn end_debug_label(&mut self) {
+        self.cmd_buf().end_debug_label();
+    }
+
     /// Make sure the resource lives until the command buffer completes.
     ///
     /// The submitted command buffer will hold this reference until the corresponding
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index 24fef5c..c67de86 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -734,6 +734,22 @@
             CmdBuf::Mtl(c) => c.finish_timestamps(pool.mtl()),
         }
     }
+
+    pub unsafe fn begin_debug_label(&mut self, label: &str) {
+        mux_match! { self; // each arm forwards `label` to that backend's command buffer
+            CmdBuf::Vk(c) => c.begin_debug_label(label),
+            CmdBuf::Dx12(c) => c.begin_debug_label(label),
+            CmdBuf::Mtl(c) => c.begin_debug_label(label),
+        }
+    }
+
+    pub unsafe fn end_debug_label(&mut self) {
+        mux_match! { self; // each arm closes the label on that backend's command buffer
+            CmdBuf::Vk(c) => c.end_debug_label(),
+            CmdBuf::Dx12(c) => c.end_debug_label(),
+            CmdBuf::Mtl(c) => c.end_debug_label(),
+        }
+    }
 }
 
 impl Buffer {
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index d5b31cb..e34981e 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -8,6 +8,7 @@
 
 use ash::extensions::{ext::DebugUtils, khr};
 use ash::{vk, Device, Entry, Instance};
+use ash::vk::DebugUtilsLabelEXT;
 
 use smallvec::SmallVec;
 
@@ -23,7 +24,7 @@
     entry: Entry,
     instance: Instance,
     vk_version: u32,
-    _dbg_loader: Option<DebugUtils>,
+    dbg_loader: Option<DebugUtils>,
     _dbg_callbk: Option<vk::DebugUtilsMessengerEXT>,
 }
 
@@ -39,6 +40,7 @@
 
 struct RawDevice {
     device: Device,
+    dbg_loader: Option<DebugUtils>, // cloned from the creating instance's `dbg_loader`
 }
 
 pub struct VkSurface {
@@ -202,7 +204,7 @@
                 None,
             )?;
 
-            let (_dbg_loader, _dbg_callbk) = if has_debug_ext {
+            let (dbg_loader, _dbg_callbk) = if has_debug_ext {
                 let dbg_info = vk::DebugUtilsMessengerCreateInfoEXT::builder()
                     .message_severity(
                         vk::DebugUtilsMessageSeverityFlagsEXT::ERROR
@@ -231,7 +233,7 @@
                 entry,
                 instance,
                 vk_version,
-                _dbg_loader,
+                dbg_loader,
                 _dbg_callbk,
             };
 
@@ -317,7 +319,7 @@
         let queue_index = 0;
         let queue = device.get_device_queue(qfi, queue_index);
 
-        let device = Arc::new(RawDevice { device });
+        let device = Arc::new(RawDevice { device, dbg_loader: self.dbg_loader.clone() });
 
         let props = self.instance.get_physical_device_properties(pdevice);
         let timestamp_period = props.limits.timestamp_period;
@@ -1108,6 +1110,20 @@
             query,
         );
     }
+
+    unsafe fn begin_debug_label(&mut self, label: &str) {
+        if let Some(utils) = &self.device.dbg_loader {
+            let label_cstr = CString::new(label).unwrap_or_default(); // NUL => empty label
+            let label_ext = DebugUtilsLabelEXT::builder().label_name(&label_cstr).build();
+            utils.cmd_begin_debug_utils_label(self.cmd_buf, &label_ext);
+        }
+    }
+
+    unsafe fn end_debug_label(&mut self) {
+        if let Some(utils) = &self.device.dbg_loader { // nothing recorded without a loader
+            utils.cmd_end_debug_utils_label(self.cmd_buf);
+        }
+    }
 }
 
 impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index f045d65..aa06c3f 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -342,6 +342,7 @@
         cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
         cmd_buf.reset_query_pool(&query_pool);
         cmd_buf.write_timestamp(&query_pool, 0);
+        cmd_buf.begin_debug_label("Element bounding box calculation");
         self.element_stage.record(
             cmd_buf,
             &self.element_code,
@@ -351,43 +352,53 @@
             self.n_pathtag as u32,
             self.n_drawobj as u64,
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Tile allocation");
         cmd_buf.dispatch(
             &self.tile_pipeline,
             &self.tile_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 2);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Path flattening");
         cmd_buf.dispatch(
             &self.path_pipeline,
             &self.path_ds,
             (((self.n_pathseg + 31) / 32) as u32, 1, 1),
             (32, 1, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Backdrop propagation");
         cmd_buf.dispatch(
             &self.backdrop_pipeline,
             &self.backdrop_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, self.backdrop_y, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 4);
         // Note: this barrier is not needed as an actual dependency between
         // pipeline stages, but I am keeping it in so that timer queries are
         // easier to interpret.
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Element binning");
         cmd_buf.dispatch(
             &self.bin_pipeline,
             &self.bin_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 5);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Coarse raster");
         cmd_buf.dispatch(
             &self.coarse_pipeline,
             &self.coarse_ds,
@@ -398,8 +409,10 @@
             ),
             (256, 1, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Fine raster");
         cmd_buf.dispatch(
             &self.k4_pipeline,
             &self.k4_ds,
@@ -410,6 +423,7 @@
             ),
             (8, 4, 1),
         );
+        cmd_buf.end_debug_label();
         cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
