Merge pull request #164 from linebender/metal_timer

Timer queries on more platforms, particularly Metal
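
With this change, timer queries are requested through a ComputePassDescriptor
when beginning a compute pass, rather than by calling write_timestamp directly
on the command buffer. Sketch of the usage, adapted from the updated collatz
example below (error handling elided):

    let mut cmd_buf = session.cmd_buf().unwrap();
    cmd_buf.begin();
    cmd_buf.reset_query_pool(&query_pool);
    // Query 0 is sampled at the start of the pass, query 1 at the end.
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
    pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
    pass.end();
    cmd_buf.finish_timestamps(&query_pool);
    cmd_buf.host_barrier();
    cmd_buf.finish();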
diff --git a/Cargo.lock b/Cargo.lock
index 737c033..66c793b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -921,6 +921,7 @@
  "block",
  "bytemuck",
  "cocoa-foundation",
+ "foreign-types",
  "metal",
  "objc",
  "raw-window-handle",
diff --git a/piet-gpu-hal/Cargo.toml b/piet-gpu-hal/Cargo.toml
index 29b51bd..f9b844a 100644
--- a/piet-gpu-hal/Cargo.toml
+++ b/piet-gpu-hal/Cargo.toml
@@ -28,3 +28,4 @@
 objc = "0.2.5"
 block = "0.1.6"
 cocoa-foundation = "0.1"
+foreign-types = "0.3.2"
diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs
index dae5b31..7aff938 100644
--- a/piet-gpu-hal/examples/collatz.rs
+++ b/piet-gpu-hal/examples/collatz.rs
@@ -1,4 +1,4 @@
-use piet_gpu_hal::{include_shader, BindType};
+use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
 use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
 
 fn main() {
@@ -20,9 +20,9 @@
         let mut cmd_buf = session.cmd_buf().unwrap();
         cmd_buf.begin();
         cmd_buf.reset_query_pool(&query_pool);
-        cmd_buf.write_timestamp(&query_pool, 0);
-        cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
-        cmd_buf.write_timestamp(&query_pool, 1);
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
+        pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
+        pass.end();
         cmd_buf.finish_timestamps(&query_pool);
         cmd_buf.host_barrier();
         cmd_buf.finish();
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index 02ac7cb..f2c67a1 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -17,7 +17,8 @@
 //! The generic trait for backends to implement.
 
 use crate::{
-    BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams,
+    BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
+    MapMode, SamplerParams,
 };
 
 pub trait Device: Sized {
@@ -159,14 +160,32 @@
     unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
 }
 
+/// The trait for backend command buffer implementations.
+///
+/// Valid encoding is represented by a state machine (currently not validated,
+/// though it is easy to imagine adding at least debug validation). Most
+/// methods are valid only in a particular state, and some transition the
+/// command buffer to another state.
 pub trait CmdBuf<D: Device> {
+    /// Begin encoding.
+    ///
+    /// State: init -> ready
     unsafe fn begin(&mut self);
 
+    /// State: ready -> finished
     unsafe fn finish(&mut self);
 
     /// Return true if the command buffer is suitable for reuse.
     unsafe fn reset(&mut self) -> bool;
 
+    /// Begin a compute pass.
+    ///
+    /// State: ready -> in_compute_pass
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
+
+    /// Dispatch a compute shader.
+    ///
+    /// State: in_compute_pass
     unsafe fn dispatch(
         &mut self,
         pipeline: &D::Pipeline,
@@ -175,6 +194,9 @@
         workgroup_size: (u32, u32, u32),
     );
 
+    /// State: in_compute_pass -> ready
+    unsafe fn end_compute_pass(&mut self);
+
     /// Insert an execution and memory barrier.
     ///
     /// Compute kernels (and other actions) after this barrier may read from buffers
@@ -202,16 +224,16 @@
     /// This is readily supported in Vulkan, but for portability it is remarkably
     /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
     /// kernel, or organize the code not to need it.
-    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
+    unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);
 
-    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
+    unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);
 
-    unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
+    unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);
 
-    unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
+    unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);
 
     // low portability, dx12 doesn't support it natively
-    unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
+    unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);
 
     /// Reset the query pool.
     ///
@@ -227,7 +249,7 @@
     unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
 
     /// Begin a labeled section for debugging and profiling purposes.
-    unsafe fn begin_debug_label(&mut self, label: &str) {}
+    unsafe fn begin_debug_label(&mut self, _label: &str) {}
 
     /// End a section opened by `begin_debug_label`.
     unsafe fn end_debug_label(&mut self) {}
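
The encoding state machine described in the new CmdBuf docs is not enforced by
this patch. A hypothetical sketch of what debug validation could look like
(names are illustrative, not part of the PR):

    /// Hypothetical tracking of the CmdBuf encoding states named in the docs.
    #[derive(PartialEq, Debug)]
    enum CmdBufState {
        Init,          // before begin()
        Ready,         // begin() called; copies, barriers, and pass begins allowed
        InComputePass, // begin_compute_pass() called; dispatches allowed
        Finished,      // finish() called; ready for submission
    }

    impl CmdBufState {
        fn on_begin_compute_pass(&mut self) {
            debug_assert_eq!(*self, CmdBufState::Ready);
            *self = CmdBufState::InComputePass;
        }

        fn on_end_compute_pass(&mut self) {
            debug_assert_eq!(*self, CmdBufState::InComputePass);
            *self = CmdBufState::Ready;
        }
    }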
diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index 78ad449..c5e1e04 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -21,7 +21,7 @@
 
 use smallvec::SmallVec;
 
-use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
+use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
 
 use self::{
     descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@@ -76,6 +76,7 @@
     c: wrappers::GraphicsCommandList,
     allocator: CommandAllocator,
     needs_reset: bool,
+    end_query: Option<(wrappers::QueryHeap, u32)>,
 }
 
 pub struct Pipeline {
@@ -360,6 +361,7 @@
                 c,
                 allocator,
                 needs_reset: false,
+                end_query: None,
             })
         }
     }
@@ -388,11 +390,10 @@
         let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
         std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
         self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
-        let ts0 = buf[0];
         let tsp = (self.ts_freq as f64).recip();
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -610,6 +611,16 @@
         self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Dx12(pool) = pool {
+                self.write_timestamp(pool, *start);
+                self.end_query = Some((pool.heap.clone(), *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -628,6 +639,12 @@
             .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((heap, end)) = self.end_query.take() {
+            self.c.end_timing_query(&heap, end);
+        }
+    }
+
     unsafe fn memory_barrier(&mut self) {
         // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
         // The "proper" way to do this would be to name the actual buffers participating
@@ -666,7 +683,7 @@
         self.memory_barrier();
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
         let (gpu_ref, heap) = buffer
             .gpu_ref
@@ -684,23 +701,23 @@
         );
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         // TODO: consider using copy_resource here (if sizes match)
         let size = src.size.min(dst.size);
         self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         self.c
             .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         self.c
             .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         self.c.copy_resource(&src.resource, &dst.resource);
     }
 
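On DX12 (and likewise Vulkan further down), a compute pass is emulated rather
than native: begin_compute_pass writes the start timestamp immediately and
stashes the end query in end_query, which end_compute_pass then writes. A
minimal sketch of that bracketing pattern (write_timestamp is a stand-in for
the backend call, not the actual code):

    struct PassQueries {
        end_query: Option<u32>,
    }

    impl PassQueries {
        fn begin(&mut self, start: u32, end: u32) {
            write_timestamp(start); // sampled before any dispatch
            self.end_query = Some(end);
        }

        fn end(&mut self) {
            if let Some(end) = self.end_query.take() {
                write_timestamp(end); // sampled after the last dispatch
            }
        }
    }

    fn write_timestamp(query: u32) {
        println!("write timestamp into query {}", query);
    }
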
diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs
index 4bbb86c..9a3fb90 100644
--- a/piet-gpu-hal/src/dx12/wrappers.rs
+++ b/piet-gpu-hal/src/dx12/wrappers.rs
@@ -79,7 +79,6 @@
 #[derive(Clone)]
 pub struct ShaderByteCode {
     pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
-    blob: Option<Blob>,
 }
 
 #[derive(Clone)]
@@ -741,7 +740,6 @@
                 BytecodeLength: blob.0.GetBufferSize(),
                 pShaderBytecode: blob.0.GetBufferPointer(),
             },
-            blob: Some(blob),
         }
     }
 
@@ -810,7 +808,6 @@
                 BytecodeLength: bytecode.len(),
                 pShaderBytecode: bytecode.as_ptr() as *const _,
             },
-            blob: None,
         }
     }
 }
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index cc09832..ea17754 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -13,7 +13,7 @@
 use bytemuck::Pod;
 use smallvec::SmallVec;
 
-use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode};
+use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
 
 use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
 
@@ -135,6 +135,11 @@
     size: u64,
 }
 
+/// A sub-object of a command buffer for a sequence of compute dispatches.
+pub struct ComputePass<'a> {
+    cmd_buf: &'a mut CmdBuf,
+}
+
 impl Session {
     /// Create a new session, choosing the best backend.
     pub fn new(device: mux::Device) -> Session {
@@ -370,8 +375,17 @@
     ///
     /// This should be called after waiting on the command buffer that wrote the
     /// timer queries.
+    ///
+    /// The returned vector is one shorter than the number of timer queries in
+    /// the pool; the first timestamp is subtracted from each of the rest. It may
+    /// be better to return raw timestamps, but that change should be consistent.
     pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
-        self.0.device.fetch_query_pool(pool)
+        let result = self.0.device.fetch_query_pool(pool)?;
+        // Subtract off first timestamp.
+        Ok(result[1..]
+            .iter()
+            .map(|ts| *ts - result[0])
+            .collect())
     }
 
     #[doc(hidden)]
@@ -471,23 +485,10 @@
         self.cmd_buf().finish();
     }
 
-    /// Dispatch a compute shader.
-    ///
-    /// Request a compute shader to be run, using the pipeline to specify the
-    /// code, and the descriptor set to address the resources read and written.
-    ///
-    /// Both the workgroup count (number of workgroups) and the workgroup size
-    /// (number of threads in a workgroup) must be specified here, though not
-    /// all back-ends require the latter info.
-    pub unsafe fn dispatch(
-        &mut self,
-        pipeline: &Pipeline,
-        descriptor_set: &DescriptorSet,
-        workgroup_count: (u32, u32, u32),
-        workgroup_size: (u32, u32, u32),
-    ) {
-        self.cmd_buf()
-            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    /// Begin a compute pass.
+    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
+        self.cmd_buf().begin_compute_pass(desc);
+        ComputePass { cmd_buf: self }
     }
 
     /// Insert an execution and memory barrier.
@@ -582,13 +583,6 @@
         self.cmd_buf().reset_query_pool(pool);
     }
 
-    /// Write a timestamp.
-    ///
-    /// The query index must be less than the size of the query pool on creation.
-    pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        self.cmd_buf().write_timestamp(pool, query);
-    }
-
     /// Prepare the timestamps for reading. This isn't required on Vulkan but
     /// is required on (at least) DX12.
     ///
@@ -692,6 +686,51 @@
     }
 }
 
+impl<'a> ComputePass<'a> {
+    /// Dispatch a compute shader.
+    ///
+    /// Request a compute shader to be run, using the pipeline to specify the
+    /// code, and the descriptor set to address the resources read and written.
+    ///
+    /// Both the workgroup count (number of workgroups) and the workgroup size
+    /// (number of threads in a workgroup) must be specified here, though not
+    /// all back-ends require the latter info.
+    pub unsafe fn dispatch(
+        &mut self,
+        pipeline: &Pipeline,
+        descriptor_set: &DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    ) {
+        self.cmd_buf
+            .cmd_buf()
+            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    }
+
+    /// Add a memory barrier.
+    ///
+    /// Inserts a memory barrier in the compute encoder. This is a convenience
+    /// function for calling the same function on the underlying command buffer,
+    /// avoiding borrow check issues.
+    pub unsafe fn memory_barrier(&mut self) {
+        self.cmd_buf.memory_barrier();
+    }
+
+    /// Begin a labeled section for debugging and profiling purposes.
+    pub unsafe fn begin_debug_label(&mut self, label: &str) {
+        self.cmd_buf.begin_debug_label(label);
+    }
+
+    /// End a section opened by `begin_debug_label`.
+    pub unsafe fn end_debug_label(&mut self) {
+        self.cmd_buf.end_debug_label();
+    }
+
+    pub unsafe fn end(self) {
+        self.cmd_buf.cmd_buf().end_compute_pass();
+    }
+}
+
 impl Drop for BufferInner {
     fn drop(&mut self) {
         if let Some(session) = Weak::upgrade(&self.session) {
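
With the hub-level change above, fetch_query_pool now returns times relative to
the first query: entry i of the result corresponds to query i + 1 in the pool.
Sketch of the consumer-side arithmetic (matching how winit.rs below reads the
query pairs):

    // ts[i] is seconds from query 0 to query i + 1, so a stage bracketed by
    // queries (2, 3) took ts[2] - ts[1] seconds.
    let ts = unsafe { session.fetch_query_pool(&query_pool)? };
    if !ts.is_empty() {
        println!("total: {:.3}ms", ts.last().unwrap() * 1e3);
    }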
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index fab7d65..a1073f4 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -21,8 +21,8 @@
 };
 pub use bufwrite::BufWrite;
 pub use hub::{
-    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
-    Session, SubmittedCmdBuf,
+    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
+    RetainResource, Session, SubmittedCmdBuf,
 };
 
 // TODO: because these are conditionally included, "cargo fmt" does not
@@ -189,3 +189,23 @@
     /// dimension.
     pub max_invocations: u32,
 }
+
+/// Options for creating a compute pass.
+#[derive(Default)]
+pub struct ComputePassDescriptor<'a> {
+    // Maybe label should go here? It does in wgpu and wgpu_hal.
+    /// Timer query parameters.
+    ///
+    /// To record timer queries for a compute pass, set the query pool, start
+    /// query index, and end query index here. The indices must be less than
+    /// the size of the query pool.
+    timer_queries: Option<(&'a QueryPool, u32, u32)>,
+}
+
+impl<'a> ComputePassDescriptor<'a> {
+    pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
+        ComputePassDescriptor {
+            timer_queries: Some((pool, start_query, end_query)),
+        }
+    }
+}
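
Note the query accounting this descriptor implies: each timed pass consumes a
(start, end) pair of indices, so a pool covering N passes needs 2 * N queries.
That is why winit.rs below grows its pools from 8 to 12 for six timed passes.
Sketch of the pairing (illustrative only):

    let n_passes: u32 = 6;
    let query_pool = session.create_query_pool(2 * n_passes)?;
    for i in 0..n_passes {
        // Pass i samples query 2*i at its start and 2*i + 1 at its end.
        let desc = ComputePassDescriptor::timer(&query_pool, 2 * i, 2 * i + 1);
        // ... begin the pass with `desc`, record dispatches, end the pass ...
    }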
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index e3157d4..307def8 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -15,25 +15,32 @@
 // Also licensed under MIT license, at your choice.
 
 mod clear;
+mod timer;
 mod util;
 
 use std::mem;
 use std::sync::{Arc, Mutex};
 
+use block::Block;
 use cocoa_foundation::base::id;
 use cocoa_foundation::foundation::{NSInteger, NSUInteger};
+use foreign_types::ForeignType;
 use objc::rc::autoreleasepool;
 use objc::runtime::{Object, BOOL, YES};
 use objc::{class, msg_send, sel, sel_impl};
 
-use metal::{CGFloat, MTLFeatureSet};
+use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
 
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
-use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits};
+use crate::{
+    BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
+};
 
 use util::*;
 
+use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
+
 pub struct MtlInstance;
 
 pub struct MtlDevice {
@@ -41,6 +48,18 @@
     cmd_queue: Arc<Mutex<metal::CommandQueue>>,
     gpu_info: GpuInfo,
     helpers: Arc<Helpers>,
+    timer_set: Option<CounterSet>,
+    counter_style: CounterStyle,
+}
+
+/// Type of counter sampling.
+///
+/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+enum CounterStyle {
+    None,
+    Stage,
+    Command,
 }
 
 pub struct MtlSurface {
@@ -81,9 +100,22 @@
 pub struct CmdBuf {
     cmd_buf: metal::CommandBuffer,
     helpers: Arc<Helpers>,
+    cur_encoder: Encoder,
+    time_calibration: Arc<Mutex<TimeCalibration>>,
+    counter_style: CounterStyle,
 }
 
-pub struct QueryPool;
+enum Encoder {
+    None,
+    Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
+    Blit(metal::BlitCommandEncoder),
+}
+
+#[derive(Default)]
+pub struct QueryPool {
+    counter_sample_buf: Option<CounterSampleBuffer>,
+    calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
+}
 
 pub struct Pipeline(metal::ComputePipelineState);
 
@@ -209,18 +241,43 @@
         let helpers = Arc::new(Helpers {
             clear_pipeline: clear::make_clear_pipeline(&device),
         });
+        // Timer stuff
+        let timer_set = CounterSet::get_timer_counter_set(&device);
+        let counter_style = if timer_set.is_some() {
+            if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
+                CounterStyle::Stage
+            } else if device
+                .supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
+            {
+                CounterStyle::Command
+            } else {
+                CounterStyle::None
+            }
+        } else {
+            CounterStyle::None
+        };
         MtlDevice {
             device,
             cmd_queue: Arc::new(Mutex::new(cmd_queue)),
             gpu_info,
             helpers,
+            timer_set,
+            counter_style,
         }
     }
 
     pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
         let cmd_buf = raw_cmd_buf;
         let helpers = self.helpers.clone();
-        CmdBuf { cmd_buf, helpers }
+        let cur_encoder = Encoder::None;
+        let time_calibration = Default::default();
+        CmdBuf {
+            cmd_buf,
+            helpers,
+            cur_encoder,
+            time_calibration,
+            counter_style: self.counter_style,
+        }
     }
 
     pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@@ -330,11 +387,35 @@
 
     fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
         let cmd_queue = self.cmd_queue.lock().unwrap();
+        // A discussion about autorelease pools.
+        //
+        // Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
+        // you can have any two of correctness, ergonomics, and performance. Here we've
+        // chosen the first two, using the pattern of a fine-grained autorelease pool
+        // to give Obj-C objects Rust-like lifetime semantics whenever they are
+        // created as autoreleased (by convention, any object creation through an
+        // Obj-C method whose name doesn't begin with "new" or "alloc").
+        //
+        // To gain back some of the performance, we'd need a way to wrap an autorelease
+        // pool around a chunk of work - that could be one frame of rendering, but for
+        // tests that iterate a number of command buffer submissions, it would need to
+        // wrap those iterations. On non-mac platforms, it would be a no-op.
+        //
+        // In any case, this way the caller doesn't need to worry, and the performance
+        // hit might not be so bad (perhaps we should measure).
+
         // consider new_command_buffer_with_unretained_references for performance
-        let cmd_buf = cmd_queue.new_command_buffer();
-        let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
+        let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
         let helpers = self.helpers.clone();
-        Ok(CmdBuf { cmd_buf, helpers })
+        let cur_encoder = Encoder::None;
+        let time_calibration = Default::default();
+        Ok(CmdBuf {
+            cmd_buf,
+            helpers,
+            cur_encoder,
+            time_calibration,
+            counter_style: self.counter_style,
+        })
     }
 
     unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
@@ -342,12 +423,31 @@
     }
 
     fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
-        // TODO
-        Ok(QueryPool)
+        if let Some(timer_set) = &self.timer_set {
+            let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
+                .ok_or("error creating timer query pool")?;
+            return Ok(QueryPool {
+                counter_sample_buf: Some(pool),
+                calibration: Default::default(),
+            });
+        }
+        Ok(QueryPool::default())
     }
 
     unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
-        // TODO
+        if let Some(raw) = &pool.counter_sample_buf {
+            let resolved = raw.resolve();
+            let calibration = pool.calibration.lock().unwrap();
+            if let Some(calibration) = &*calibration {
+                let calibration = calibration.lock().unwrap();
+                let result = resolved
+                    .iter()
+                    .map(|time_ns| calibration.correlate(*time_ns))
+                    .collect();
+                return Ok(result);
+            }
+        }
+        // Maybe this should return None to indicate failure? But that might break callers.
         Ok(Vec::new())
     }
 
@@ -358,7 +458,37 @@
         _signal_semaphores: &[&Self::Semaphore],
         fence: Option<&mut Self::Fence>,
     ) -> Result<(), Error> {
+        unsafe fn add_scheduled_handler(
+            cmd_buf: &metal::CommandBufferRef,
+            block: &Block<(&CommandBufferRef,), ()>,
+        ) {
+            msg_send![cmd_buf, addScheduledHandler: block]
+        }
         for cmd_buf in cmd_bufs {
+            let time_calibration = cmd_buf.time_calibration.clone();
+            let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
+                let device: id = msg_send![buffer, device];
+                let mut time_calibration = time_calibration.lock().unwrap();
+                let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
+                let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
+                // TODO: only do this if supported.
+                let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
+            })
+            .copy();
+            add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
+            let time_calibration = cmd_buf.time_calibration.clone();
+            let completed_block =
+                block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
+                    let device: id = msg_send![buffer, device];
+                    let mut time_calibration = time_calibration.lock().unwrap();
+                    let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
+                    let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
+                    // TODO: only do this if supported.
+                    let () =
+                        msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
+                })
+                .copy();
+            cmd_buf.cmd_buf.add_completed_handler(&completed_block);
             cmd_buf.cmd_buf.commit();
         }
         if let Some(last_cmd_buf) = cmd_bufs.last() {
@@ -439,12 +569,70 @@
 impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
     unsafe fn begin(&mut self) {}
 
-    unsafe fn finish(&mut self) {}
+    unsafe fn finish(&mut self) {
+        self.flush_encoder();
+    }
 
     unsafe fn reset(&mut self) -> bool {
         false
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        // TODO: we might want better validation here, but the following
+        // assert is likely to trigger, and a case can also be made that
+        // validation should be done at the hub level, for consistency.
+        //debug_assert!(matches!(self.cur_encoder, Encoder::None));
+        self.flush_encoder();
+        autoreleasepool(|| {
+            let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
+                (Some(queries), CounterStyle::Stage) => {
+                    let descriptor: id =
+                        msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
+                    let attachments: id = msg_send![descriptor, sampleBufferAttachments];
+                    let index: NSUInteger = 0;
+                    let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
+                    // Here we break the hub/mux separation a bit, for expedience
+                    #[allow(irrefutable_let_patterns)]
+                    if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                        if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                            let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+                        }
+                    }
+                    let start_index = queries.1 as NSUInteger;
+                    let end_index = queries.2 as NSUInteger;
+                    let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
+                    let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
+                    (
+                        msg_send![
+                            self.cmd_buf,
+                            computeCommandEncoderWithDescriptor: descriptor
+                        ],
+                        None,
+                    )
+                }
+                (Some(queries), CounterStyle::Command) => {
+                    let encoder = self.cmd_buf.new_compute_command_encoder();
+                    #[allow(irrefutable_let_patterns)]
+                    let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                        if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                            let sample_index = queries.1 as NSUInteger;
+                            let sample_buf = sample_buf.id();
+                            let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
+                            Some((sample_buf, queries.2))
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    };
+                    (encoder, end_query)
+                }
+                _ => (self.cmd_buf.new_compute_command_encoder(), None),
+            };
+            self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
+        });
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -452,7 +640,7 @@
         workgroup_count: (u32, u32, u32),
         workgroup_size: (u32, u32, u32),
     ) {
-        let encoder = self.cmd_buf.new_compute_command_encoder();
+        let encoder = self.compute_command_encoder();
         encoder.set_compute_pipeline_state(&pipeline.0);
         let mut buf_ix = 0;
         for buffer in &descriptor_set.buffers {
@@ -475,7 +663,11 @@
             depth: workgroup_size.2 as u64,
         };
         encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
-        encoder.end_encoding();
+    }
+
+    unsafe fn end_compute_pass(&mut self) {
+        // TODO: might validate that we are in a compute encoder state
+        self.flush_encoder();
     }
 
     unsafe fn memory_barrier(&mut self) {
@@ -494,22 +686,23 @@
         // I think these are being tracked.
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let size = size.unwrap_or(buffer.size);
-        let encoder = self.cmd_buf.new_compute_command_encoder();
-        clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size);
-        encoder.end_encoding()
+        let _ = self.compute_command_encoder();
+        // Getting this directly is a workaround for a borrow checker issue.
+        if let Encoder::Compute(e, _) = &self.cur_encoder {
+            clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
+        }
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
+        let encoder = self.blit_command_encoder();
         let size = src.size.min(dst.size);
         encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
-        encoder.end_encoding();
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
+        let encoder = self.blit_command_encoder();
         assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
         let bytes_per_row = (src.width * 4) as NSUInteger;
         let src_size = metal::MTLSize {
@@ -530,11 +723,10 @@
             bytes_per_row * src.height as NSUInteger,
             metal::MTLBlitOption::empty(),
         );
-        encoder.end_encoding();
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
+        let encoder = self.blit_command_encoder();
         assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
         let bytes_per_row = (dst.width * 4) as NSUInteger;
         let src_size = metal::MTLSize {
@@ -555,11 +747,10 @@
             origin,
             metal::MTLBlitOption::empty(),
         );
-        encoder.end_encoding();
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
+        let encoder = self.blit_command_encoder();
         let src_size = metal::MTLSize {
             width: src.width.min(dst.width) as NSUInteger,
             height: src.width.min(dst.height) as NSUInteger,
@@ -577,15 +768,79 @@
             0,
             origin,
         );
-        encoder.end_encoding();
     }
 
-    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
+    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
+        let mut calibration = pool.calibration.lock().unwrap();
+        *calibration = Some(self.time_calibration.clone());
+    }
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        // TODO
-        // This really a PITA because it's pretty different than Vulkan.
-        // See https://developer.apple.com/documentation/metal/counter_sampling
+        if let Some(buf) = &pool.counter_sample_buf {
+            if matches!(self.cur_encoder, Encoder::None) {
+                self.cur_encoder =
+                    Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
+            }
+            let sample_index = query as NSUInteger;
+            if self.counter_style == CounterStyle::Command {
+                match &self.cur_encoder {
+                    Encoder::Compute(e, _) => {
+                        let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
+                    }
+                    Encoder::None => unreachable!(),
+                    _ => todo!(),
+                }
+            } else if self.counter_style == CounterStyle::Stage {
+                match &self.cur_encoder {
+                    Encoder::Compute(_e, _) => {
+                        println!("write_timestamp is not supported for stage-style encoders");
+                    }
+                    _ => (),
+                }
+            }
+        }
+    }
+}
+
+impl CmdBuf {
+    fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
+        if !matches!(self.cur_encoder, Encoder::Compute(..)) {
+            self.flush_encoder();
+            self.cur_encoder =
+                Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
+        }
+        if let Encoder::Compute(e, _) = &self.cur_encoder {
+            e
+        } else {
+            unreachable!()
+        }
+    }
+
+    fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
+        if !matches!(self.cur_encoder, Encoder::Blit(_)) {
+            self.flush_encoder();
+            self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
+        }
+        if let Encoder::Blit(e) = &self.cur_encoder {
+            e
+        } else {
+            unreachable!()
+        }
+    }
+
+    fn flush_encoder(&mut self) {
+        match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
+            Encoder::Compute(e, Some((sample_buf, end_query))) => {
+                let sample_index = end_query as NSUInteger;
+                unsafe {
+                    let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
+                }
+                e.end_encoding();
+            }
+            Encoder::Compute(e, None) => e.end_encoding(),
+            Encoder::Blit(e) => e.end_encoding(),
+            Encoder::None => (),
+        }
     }
 }
 
diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs
new file mode 100644
index 0000000..65c8026
--- /dev/null
+++ b/piet-gpu-hal/src/metal/timer.rs
@@ -0,0 +1,172 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Support for timer queries.
+//!
+//! Likely some of this should be upstreamed into metal-rs.
+
+use std::{ffi::CStr, ptr::null_mut};
+
+use cocoa_foundation::{
+    base::id,
+    foundation::{NSRange, NSUInteger},
+};
+use metal::{DeviceRef, MTLStorageMode};
+use objc::{class, msg_send, sel, sel_impl};
+
+pub struct CounterSampleBuffer {
+    id: id,
+    count: u64,
+}
+
+pub struct CounterSet {
+    id: id,
+}
+
+#[derive(Default)]
+pub struct TimeCalibration {
+    pub cpu_start_ts: u64,
+    pub gpu_start_ts: u64,
+    pub cpu_end_ts: u64,
+    pub gpu_end_ts: u64,
+}
+
+impl Drop for CounterSampleBuffer {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
+impl Clone for CounterSampleBuffer {
+    fn clone(&self) -> CounterSampleBuffer {
+        unsafe {
+            CounterSampleBuffer {
+                id: msg_send![self.id, retain],
+                count: self.count,
+            }
+        }
+    }
+}
+
+impl CounterSampleBuffer {
+    pub fn id(&self) -> id {
+        self.id
+    }
+}
+
+impl Drop for CounterSet {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
+impl CounterSet {
+    pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
+        unsafe {
+            // TODO: version check
+            let sets: id = msg_send![device, counterSets];
+            let count: NSUInteger = msg_send![sets, count];
+            for i in 0..count {
+                let set: id = msg_send![sets, objectAtIndex: i];
+                let name: id = msg_send![set, name];
+                let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
+                if name_cstr.to_bytes() == b"timestamp" {
+                    return Some(CounterSet { id: set });
+                }
+            }
+            None
+        }
+    }
+}
+
+// Copied from metal-rs; maybe this should live in common utilities?
+fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
+    let bytes = unsafe {
+        let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
+        bytes as *const u8
+    };
+    let len: NSUInteger = unsafe { msg_send![nsstr, length] };
+    unsafe {
+        let bytes = std::slice::from_raw_parts(bytes, len as usize);
+        std::str::from_utf8(bytes).unwrap()
+    }
+}
+
+impl CounterSampleBuffer {
+    pub fn new(
+        device: &DeviceRef,
+        count: u64,
+        counter_set: &CounterSet,
+    ) -> Option<CounterSampleBuffer> {
+        unsafe {
+            let desc_cls = class!(MTLCounterSampleBufferDescriptor);
+            let descriptor: id = msg_send![desc_cls, alloc];
+            let _: id = msg_send![descriptor, init];
+            let count = count as NSUInteger;
+            let () = msg_send![descriptor, setSampleCount: count];
+            let () = msg_send![descriptor, setCounterSet: counter_set.id];
+            let () = msg_send![
+                descriptor,
+                setStorageMode: MTLStorageMode::Shared as NSUInteger
+            ];
+            let mut error: id = null_mut();
+            let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
+            let () = msg_send![descriptor, release];
+            if !error.is_null() {
+                let description = msg_send![error, localizedDescription];
+                println!(
+                    "error allocating sample buffer, code = {}",
+                    nsstring_as_str(description)
+                );
+                let () = msg_send![error, release];
+                return None;
+            }
+            Some(CounterSampleBuffer { id: buf, count })
+        }
+    }
+
+    // Read the timestamps.
+    //
+    // Safety: the lifetime of the returned slice is wrong; the underlying data is autoreleased.
+    pub unsafe fn resolve(&self) -> &[u64] {
+        let range = NSRange::new(0, self.count);
+        let data: id = msg_send![self.id, resolveCounterRange: range];
+        if data.is_null() {
+            &[]
+        } else {
+            let bytes: *const u64 = msg_send![data, bytes];
+            std::slice::from_raw_parts(bytes, self.count as usize)
+        }
+    }
+}
+
+impl TimeCalibration {
+    /// Convert GPU timestamp into CPU time base.
+    ///
+    /// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
+    pub fn correlate(&self, raw_ts: u64) -> f64 {
+        let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
+        let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
+        let adj_ts = if delta_gpu > 0 {
+            let scale = delta_cpu as f64 / delta_gpu as f64;
+            self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
+        } else {
+            // Default is ns on Apple Silicon; on other hardware this will be wrong
+            raw_ts as f64
+        };
+        adj_ts * 1e-9
+    }
+}
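
TimeCalibration::correlate is a linear interpolation between the (CPU, GPU)
timestamp pairs sampled when the command buffer is scheduled and when it
completes. A worked example with made-up numbers:

    // GPU ticks run at half the CPU (ns) rate in this example, so
    // scale = delta_cpu / delta_gpu = 1000 / 500 = 2.0 ns per tick.
    let calib = TimeCalibration {
        cpu_start_ts: 1_000_000_000, // ns
        gpu_start_ts: 500,           // ticks
        cpu_end_ts: 1_000_001_000,
        gpu_end_ts: 1_000,
    };
    // correlate(750) = (1e9 + (750 - 500) * 2.0) * 1e-9 = 1.0000005 s
    assert!((calib.correlate(750) - 1.000_000_5).abs() < 1e-9);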
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index af1702d..9795193 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -35,6 +35,7 @@
 use crate::backend::Device as DeviceTrait;
 use crate::BackendType;
 use crate::BindType;
+use crate::ComputePassDescriptor;
 use crate::ImageFormat;
 use crate::MapMode;
 use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
@@ -658,6 +659,14 @@
         }
     }
 
+    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.begin_compute_pass(desc),
+            CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
+            CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
+        }
+    }
+
     /// Dispatch a compute shader.
     ///
     /// Note that both the number of workgroups (`workgroup_count`) and the number of
@@ -680,6 +689,14 @@
         }
     }
 
+    pub unsafe fn end_compute_pass(&mut self) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.end_compute_pass(),
+            CmdBuf::Dx12(c) => c.end_compute_pass(),
+            CmdBuf::Mtl(c) => c.end_compute_pass(),
+        }
+    }
+
     pub unsafe fn memory_barrier(&mut self) {
         mux_match! { self;
             CmdBuf::Vk(c) => c.memory_barrier(),
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 8392899..504d947 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -15,7 +15,7 @@
 use crate::backend::Device as DeviceTrait;
 use crate::{
     BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
-    WorkgroupLimits,
+    WorkgroupLimits, ComputePassDescriptor,
 };
 
 pub struct VkInstance {
@@ -92,6 +92,7 @@
     cmd_buf: vk::CommandBuffer,
     cmd_pool: vk::CommandPool,
     device: Arc<RawDevice>,
+    end_query: Option<(vk::QueryPool, u32)>,
 }
 
 pub struct QueryPool {
@@ -738,6 +739,7 @@
                 cmd_buf,
                 cmd_pool,
                 device: self.device.clone(),
+                end_query: None,
             })
         }
     }
@@ -770,11 +772,10 @@
         // results (Windows 10, AMD 5700 XT).
         let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
         device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
-        let ts0 = buf[0];
         let tsp = self.timestamp_period as f64 * 1e-9;
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -902,6 +903,16 @@
         true
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Vk(pool) = pool {
+                self.write_timestamp_raw(pool.pool, *start);
+                self.end_query = Some((pool.pool, *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -931,6 +942,12 @@
         );
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((pool, end)) = self.end_query.take() {
+            self.write_timestamp_raw(pool, end);
+        }
+    }
+
     /// Insert a pipeline barrier for all memory accesses.
     unsafe fn memory_barrier(&mut self) {
         let device = &self.device.device;
@@ -995,13 +1012,13 @@
         );
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let device = &self.device.device;
         let size = size.unwrap_or(vk::WHOLE_SIZE);
         device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         let device = &self.device.device;
         let size = src.size.min(dst.size);
         device.cmd_copy_buffer(
@@ -1012,7 +1029,7 @@
         );
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         let device = &self.device.device;
         device.cmd_copy_image_to_buffer(
             self.cmd_buf,
@@ -1035,7 +1052,7 @@
         );
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         let device = &self.device.device;
         device.cmd_copy_buffer_to_image(
             self.cmd_buf,
@@ -1058,7 +1075,7 @@
         );
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         let device = &self.device.device;
         device.cmd_blit_image(
             self.cmd_buf,
@@ -1106,13 +1123,7 @@
     }
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        let device = &self.device.device;
-        device.cmd_write_timestamp(
-            self.cmd_buf,
-            vk::PipelineStageFlags::COMPUTE_SHADER,
-            pool.pool,
-            query,
-        );
+        self.write_timestamp_raw(pool.pool, query);
     }
 
     unsafe fn begin_debug_label(&mut self, label: &str) {
@@ -1130,6 +1141,18 @@
     }
 }
 
+impl CmdBuf {
+    unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
+        let device = &self.device.device;
+        device.cmd_write_timestamp(
+            self.cmd_buf,
+            vk::PipelineStageFlags::COMPUTE_SHADER,
+            pool,
+            query,
+        );
+    }
+}
+
 impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.buffers.extend(buffers.iter().map(|b| b.buffer));
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index b1db5e0..1642026 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -70,7 +70,7 @@
             .map(|_| session.create_semaphore())
             .collect::<Result<Vec<_>, Error>>()?;
         let query_pools = (0..NUM_FRAMES)
-            .map(|_| session.create_query_pool(8))
+            .map(|_| session.create_query_pool(12))
             .collect::<Result<Vec<_>, Error>>()?;
         let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
         let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
@@ -112,22 +112,23 @@
                         if !ts.is_empty() {
                             info_string = format!(
                                 "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
-                                ts[6] * 1e3,
+                                ts[10] * 1e3,
                                 ts[0] * 1e3,
                                 (ts[1] - ts[0]) * 1e3,
                                 (ts[2] - ts[1]) * 1e3,
-                                (ts[3] - ts[2]) * 1e3,
                                 (ts[4] - ts[3]) * 1e3,
-                                (ts[5] - ts[4]) * 1e3,
                                 (ts[6] - ts[5]) * 1e3,
+                                (ts[8] - ts[7]) * 1e3,
+                                (ts[10] - ts[9]) * 1e3,
                             );
                         }
                     }
 
                     let mut ctx = PietGpuRenderContext::new();
+                    let test_blend = false;
                     if let Some(svg) = &svg {
                         test_scenes::render_svg(&mut ctx, svg);
-                    } else {
+                    } else if test_blend {
                         use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
                         let blends = [
                             Blend::new(Normal, SrcOver),
@@ -163,6 +164,8 @@
                         let blend = blends[mode % blends.len()];
                         test_scenes::render_blend_test(&mut ctx, current_frame, blend);
                         info_string = format!("{:?}", blend);
+                    } else {
+                        test_scenes::render_anim_frame(&mut ctx, current_frame);
                     }
                     render_info_string(&mut ctx, &info_string);
                     if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 45275a5..773007d 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -18,8 +18,8 @@
 use piet::{ImageFormat, RenderContext};
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image,
-    ImageLayout, Pipeline, QueryPool, Session,
+    include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
+    Error, Image, ImageLayout, Pipeline, QueryPool, Session,
 };
 
 pub use pico_svg::PicoSvg;
@@ -424,10 +424,10 @@
         cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
         cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
         cmd_buf.reset_query_pool(&query_pool);
-        cmd_buf.write_timestamp(&query_pool, 0);
         cmd_buf.begin_debug_label("Element bounding box calculation");
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
         self.element_stage.record(
-            cmd_buf,
+            &mut pass,
             &self.element_code,
             &self.element_bindings[buf_ix],
             self.n_transform as u64,
@@ -435,56 +435,59 @@
             self.n_pathtag as u32,
             self.n_drawobj as u64,
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
-        cmd_buf.begin_debug_label("Clip bounding box calculation");
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
+        pass.begin_debug_label("Clip bounding box calculation");
         self.clip_binding
-            .record(cmd_buf, &self.clip_code, self.n_clip as u32);
-        cmd_buf.end_debug_label();
-        cmd_buf.begin_debug_label("Element binning");
-        cmd_buf.dispatch(
+            .record(&mut pass, &self.clip_code, self.n_clip as u32);
+        pass.end_debug_label();
+        pass.begin_debug_label("Element binning");
+        pass.dispatch(
             &self.bin_pipeline,
             &self.bin_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
-        cmd_buf.end_debug_label();
-        cmd_buf.memory_barrier();
-        cmd_buf.begin_debug_label("Tile allocation");
-        cmd_buf.dispatch(
+        pass.end_debug_label();
+        pass.memory_barrier();
+        pass.begin_debug_label("Tile allocation");
+        pass.dispatch(
             &self.tile_pipeline,
             &self.tile_ds[buf_ix],
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
-        cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 2);
-        cmd_buf.memory_barrier();
+        pass.end_debug_label();
+        pass.end();
         cmd_buf.begin_debug_label("Path flattening");
-        cmd_buf.dispatch(
+        cmd_buf.memory_barrier();
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
+        pass.dispatch(
             &self.path_pipeline,
             &self.path_ds,
             (((self.n_pathseg + 31) / 32) as u32, 1, 1),
             (32, 1, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Backdrop propagation");
-        cmd_buf.dispatch(
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
+        pass.dispatch(
             &self.backdrop_pipeline,
             &self.backdrop_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, self.backdrop_y, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 4);
         // TODO: redo query accounting
-        cmd_buf.write_timestamp(&query_pool, 5);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Coarse raster");
-        cmd_buf.dispatch(
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
+        pass.dispatch(
             &self.coarse_pipeline,
             &self.coarse_ds[buf_ix],
             (
@@ -494,11 +497,13 @@
             ),
             (256, 1, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Fine raster");
-        cmd_buf.dispatch(
+        let mut pass =
+            cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
+        pass.dispatch(
             &self.k4_pipeline,
             &self.k4_ds,
             (
@@ -508,8 +513,8 @@
             ),
             (8, 4, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }
diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs
index 52b8bf1..5442ba3 100644
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@@ -26,7 +26,7 @@
 pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
 pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
 pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
-use piet_gpu_hal::{Buffer, CmdBuf, Session};
+use piet_gpu_hal::{Buffer, ComputePass, Session};
 pub use transform::{
     Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
 };
@@ -140,7 +140,7 @@
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &ElementCode,
         binding: &ElementBinding,
         n_transform: u64,
@@ -149,14 +149,14 @@
         n_drawobj: u64,
     ) {
         self.transform_stage.record(
-            cmd_buf,
+            pass,
             &code.transform_code,
             &binding.transform_binding,
             n_transform,
         );
         // No memory barrier needed here; path has at least one before pathseg
         self.path_stage.record(
-            cmd_buf,
+            pass,
             &code.path_code,
             &binding.path_binding,
             n_paths,
@@ -164,6 +164,6 @@
         );
         // No memory barrier needed here; draw has at least one before draw_leaf
         self.draw_stage
-            .record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj);
+            .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
     }
 }
diff --git a/piet-gpu/src/stages/clip.rs b/piet-gpu/src/stages/clip.rs
index e4bc3db..2fd195b 100644
--- a/piet-gpu/src/stages/clip.rs
+++ b/piet-gpu/src/stages/clip.rs
@@ -16,7 +16,7 @@
 
 //! The clip processing stage (includes substages).
 
-use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session};
+use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
 
 // Note that this isn't the code/stage/binding pattern of most of the other stages
 // in the new element processing pipeline. We want to move those temporary buffers
@@ -69,26 +69,26 @@
     /// Record the clip dispatches.
     ///
     /// Assumes memory barrier on entry. Provides memory barrier on exit.
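+    ///
+    /// Dispatches are recorded into the caller's open compute pass, e.g.
+    /// `binding.record(&mut pass, &code, n_clip)`.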
-    pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) {
+    pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
         let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
         if n_wg_reduce > 0 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &self.reduce_ds,
                 (n_wg_reduce, 1, 1),
                 (CLIP_PART_SIZE, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
         let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
         if n_wg > 0 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.leaf_pipeline,
                 &self.leaf_ds,
                 (n_wg, 1, 1),
                 (CLIP_PART_SIZE, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
     }
 }
diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs
index 21312a4..f0ee2b6 100644
--- a/piet-gpu/src/stages/draw.rs
+++ b/piet-gpu/src/stages/draw.rs
@@ -19,7 +19,7 @@
 use bytemuck::{Pod, Zeroable};
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 /// The output element of the draw object stage.
@@ -130,7 +130,7 @@
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &DrawCode,
         binding: &DrawBinding,
         size: u64,
@@ -140,22 +140,22 @@
         }
         let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
         if n_workgroups > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_workgroups as u32, 1, 1),
                 (DRAW_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.root_pipeline,
                 &self.root_ds,
                 (1, 1, 1),
                 (DRAW_WG as u32, 1, 1),
             );
         }
-        cmd_buf.memory_barrier();
-        cmd_buf.dispatch(
+        pass.memory_barrier();
+        pass.dispatch(
             &code.leaf_pipeline,
             &binding.leaf_ds,
             (n_workgroups as u32, 1, 1),
diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs
index 6c524a2..be33041 100644
--- a/piet-gpu/src/stages/path.rs
+++ b/piet-gpu/src/stages/path.rs
@@ -17,7 +17,7 @@
 //! The path stage (includes substages).
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 pub struct PathCode {
@@ -148,7 +148,7 @@
     /// those are consumed. Result is written without barrier.
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &PathCode,
         binding: &PathBinding,
         n_paths: u32,
@@ -166,15 +166,15 @@
         let reduce_part_tags = REDUCE_PART_SIZE * 4;
         let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
         if n_wg_tag_reduce > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_wg_tag_reduce, 1, 1),
                 (REDUCE_WG, 1, 1),
             );
             // I think we can skip root if n_wg_tag_reduce == 2
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.tag_root_pipeline,
                 &self.tag_root_ds,
                 (1, 1, 1),
@@ -183,15 +183,15 @@
             // No barrier needed here; clear doesn't depend on path tags
         }
         let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.clear_pipeline,
             &binding.clear_ds,
             (n_wg_clear, 1, 1),
             (CLEAR_WG, 1, 1),
         );
-        cmd_buf.memory_barrier();
+        pass.memory_barrier();
         let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.pathseg_pipeline,
             &binding.path_ds,
             (n_wg_pathseg, 1, 1),
diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs
index b21712f..8de7cee 100644
--- a/piet-gpu/src/stages/transform.rs
+++ b/piet-gpu/src/stages/transform.rs
@@ -20,7 +20,7 @@
 
 use piet::kurbo::Affine;
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 /// An affine transform.
@@ -132,7 +132,7 @@
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &TransformCode,
         binding: &TransformBinding,
         size: u64,
@@ -142,22 +142,22 @@
         }
         let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
         if n_workgroups > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_workgroups as u32, 1, 1),
                 (TRANSFORM_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.root_pipeline,
                 &self.root_ds,
                 (1, 1, 1),
                 (TRANSFORM_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.leaf_pipeline,
             &binding.leaf_ds,
             (n_workgroups as u32, 1, 1),
diff --git a/tests/src/clear.rs b/tests/src/clear.rs
index fc6f063..af4b8ea 100644
--- a/tests/src/clear.rs
+++ b/tests/src/clear.rs
@@ -16,11 +16,11 @@
 
 //! Utilities (and a benchmark) for clearing buffers with compute shaders.
 
-use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
+use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
 use piet_gpu_hal::{Buffer, Pipeline};
 
 use crate::config::Config;
-use crate::runner::{Commands, Runner};
+use crate::runner::Runner;
 use crate::test_result::TestResult;
 
 const WG_SIZE: u64 = 256;
@@ -52,9 +52,9 @@
     let mut total_elapsed = 0.0;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -108,17 +108,12 @@
         ClearBinding { descriptor_set }
     }
 
-    pub unsafe fn record(
-        &self,
-        commands: &mut Commands,
-        code: &ClearCode,
-        bindings: &ClearBinding,
-    ) {
+    pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
         let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
         // An issue: for clearing large buffers (>16M elements), we need to check the
         // number of workgroups against the (dynamically detected) limit, and
         // potentially issue multiple dispatches.
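+        // A portable approach could split the work into chunks of at most the
+        // detected limit, issuing one dispatch per chunk.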
-        commands.cmd_buf.dispatch(
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
diff --git a/tests/src/clip.rs b/tests/src/clip.rs
index 4a38949..b1f8613 100644
--- a/tests/src/clip.rs
+++ b/tests/src/clip.rs
@@ -58,11 +58,11 @@
     let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
 
     let mut commands = runner.commands();
-    commands.write_timestamp(0);
     commands.upload(&memory);
-    binding.record(&mut commands.cmd_buf, &code, n_clip as u32);
+    let mut pass = commands.compute_pass(0, 1);
+    binding.record(&mut pass, &code, n_clip as u32);
+    pass.end();
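+    // The timer query pair now covers only the clip dispatches, not the
+    // surrounding upload and download.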
     commands.download(&memory);
-    commands.write_timestamp(1);
     runner.submit(commands);
     let dst = memory.map_read(..);
     if let Some(failure) = data.verify(&dst) {
diff --git a/tests/src/draw.rs b/tests/src/draw.rs
index 4372da4..dc82572 100644
--- a/tests/src/draw.rs
+++ b/tests/src/draw.rs
@@ -77,9 +77,9 @@
     let n_iter = config.n_iter;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands.cmd_buf, &code, &binding, n_tag);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding, n_tag);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);
diff --git a/tests/src/linkedlist.rs b/tests/src/linkedlist.rs
index 5767806..e24adcb 100644
--- a/tests/src/linkedlist.rs
+++ b/tests/src/linkedlist.rs
@@ -45,9 +45,7 @@
     for i in 0..n_iter {
         let mut commands = runner.commands();
         // Might clear only buckets to save time.
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&mem_buf);
@@ -107,12 +105,14 @@
         commands.cmd_buf.clear_buffer(out_buf, None);
         commands.cmd_buf.memory_barrier();
         let n_workgroups = N_BUCKETS / WG_SIZE;
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
+        pass.end();
     }
 }
 
diff --git a/tests/src/message_passing.rs b/tests/src/message_passing.rs
index c5d989b..39e71dc 100644
--- a/tests/src/message_passing.rs
+++ b/tests/src/message_passing.rs
@@ -59,9 +59,7 @@
     let mut failures = 0;
     for _ in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
-        commands.write_timestamp(1);
         commands.cmd_buf.memory_barrier();
         commands.download(&out_buf);
         total_elapsed += runner.submit(commands);
@@ -128,11 +126,13 @@
         commands.cmd_buf.clear_buffer(&self.data_buf, None);
         commands.cmd_buf.clear_buffer(out_buf, None);
         commands.cmd_buf.memory_barrier();
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (256, 1, 1),
             (256, 1, 1),
         );
+        pass.end();
     }
 }
diff --git a/tests/src/path.rs b/tests/src/path.rs
index bf72c68..9d794e1 100644
--- a/tests/src/path.rs
+++ b/tests/src/path.rs
@@ -105,15 +105,15 @@
         let mut commands = runner.commands();
         commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
         commands.cmd_buf.memory_barrier();
-        commands.write_timestamp(0);
+        let mut pass = commands.compute_pass(0, 1);
         stage.record(
-            &mut commands.cmd_buf,
+            &mut pass,
             &code,
             &binding,
             path_data.n_path,
             path_data.tags.len() as u32,
         );
-        commands.write_timestamp(1);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);
diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs
index 4174d8d..dbaf256 100644
--- a/tests/src/prefix.rs
+++ b/tests/src/prefix.rs
@@ -85,9 +85,7 @@
     let mut total_elapsed = 0.0;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -159,12 +157,14 @@
         let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
         commands.cmd_buf.clear_buffer(&self.state_buf, None);
         commands.cmd_buf.memory_barrier();
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
+        pass.end();
         // One thing that's missing here is registering the buffers so
         // they can be safely dropped by Rust code before the execution
         // of the command buffer completes.
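+        // (For example, Commands could hold references to them until the
+        // submit has completed.)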
diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs
index 24be2af..3c9c813 100644
--- a/tests/src/prefix_tree.rs
+++ b/tests/src/prefix_tree.rs
@@ -66,9 +66,7 @@
         let mut commands = runner.commands();
         commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
         commands.cmd_buf.memory_barrier();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -175,33 +173,35 @@
         code: &PrefixTreeCode,
         bindings: &PrefixTreeBinding,
     ) {
+        let mut pass = commands.compute_pass(0, 1);
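+        // The whole reduce/root/scan tree runs inside a single compute pass,
+        // with pass-level memory barriers between dependent dispatches.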
         let n = self.tmp_bufs.len();
         for i in 0..n {
             let n_workgroups = self.sizes[i + 1];
-            commands.cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &bindings.descriptor_sets[i],
                 (n_workgroups as u32, 1, 1),
                 (WG_SIZE as u32, 1, 1),
             );
-            commands.cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
-        commands.cmd_buf.dispatch(
+        pass.dispatch(
             &code.root_pipeline,
             &bindings.descriptor_sets[n],
             (1, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
         for i in (0..n).rev() {
-            commands.cmd_buf.memory_barrier();
+            pass.memory_barrier();
             let n_workgroups = self.sizes[i + 1];
-            commands.cmd_buf.dispatch(
+            pass.dispatch(
                 &code.scan_pipeline,
                 &bindings.descriptor_sets[2 * n - i],
                 (n_workgroups as u32, 1, 1),
                 (WG_SIZE as u32, 1, 1),
             );
         }
+        pass.end();
     }
 }
 
diff --git a/tests/src/runner.rs b/tests/src/runner.rs
index 1fd6774..3ba8223 100644
--- a/tests/src/runner.rs
+++ b/tests/src/runner.rs
@@ -20,8 +20,8 @@
 
 use bytemuck::Pod;
 use piet_gpu_hal::{
-    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
-    QueryPool, Session,
+    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
+    ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
 };
 
 pub struct Runner {
@@ -118,8 +118,14 @@
 }
 
 impl Commands {
-    pub unsafe fn write_timestamp(&mut self, query: u32) {
-        self.cmd_buf.write_timestamp(&self.query_pool, query);
+    /// Start a compute pass with timer queries.
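+    ///
+    /// The pass records its start timestamp to `start_query` and its end
+    /// timestamp to `end_query` in the runner's query pool.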
+    pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
+        self.cmd_buf
+            .begin_compute_pass(&ComputePassDescriptor::timer(
+                &self.query_pool,
+                start_query,
+                end_query,
+            ))
     }
 
     pub unsafe fn upload(&mut self, buf: &BufStage) {
diff --git a/tests/src/transform.rs b/tests/src/transform.rs
index 6edcc3f..43bfc67 100644
--- a/tests/src/transform.rs
+++ b/tests/src/transform.rs
@@ -61,9 +61,9 @@
     let n_iter = config.n_iter;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding, n_elements);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);