Rebase on timer query patch

Rework command buffer encoding around an explicit compute pass: dispatches
now happen between begin_compute_pass and end(), and an optional
ComputePassDescriptor requests timer queries bracketing the pass. Implement
timer queries on Metal using counter sample buffers, calibrating GPU
timestamps against the CPU time base at command buffer scheduling and
completion. The Vulkan and DX12 backends now report raw timestamps, with the
subtraction of the start time moved up into the hub.

Also hoist SVG parsing out of the render loop in the cli and winit binaries,
and include blend tiles in coarse rasterization even when they are not
clipped.
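
The new encoding pattern looks like this (taken from the updated collatz
example):

    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
    pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
    pass.end();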
diff --git a/Cargo.lock b/Cargo.lock
index 4e98c9e..cb6b76a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -925,6 +925,7 @@
  "block",
  "bytemuck",
  "cocoa-foundation",
+ "foreign-types",
  "metal",
  "objc",
  "raw-window-handle 0.3.4",
diff --git a/piet-gpu-hal/Cargo.toml b/piet-gpu-hal/Cargo.toml
index 29b51bd..f9b844a 100644
--- a/piet-gpu-hal/Cargo.toml
+++ b/piet-gpu-hal/Cargo.toml
@@ -28,3 +28,4 @@
 objc = "0.2.5"
 block = "0.1.6"
 cocoa-foundation = "0.1"
+foreign-types = "0.3.2"
diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs
index dae5b31..7aff938 100644
--- a/piet-gpu-hal/examples/collatz.rs
+++ b/piet-gpu-hal/examples/collatz.rs
@@ -1,4 +1,4 @@
-use piet_gpu_hal::{include_shader, BindType};
+use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
 use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
 
 fn main() {
@@ -20,9 +20,9 @@
         let mut cmd_buf = session.cmd_buf().unwrap();
         cmd_buf.begin();
         cmd_buf.reset_query_pool(&query_pool);
-        cmd_buf.write_timestamp(&query_pool, 0);
-        cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
-        cmd_buf.write_timestamp(&query_pool, 1);
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
+        pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
+        pass.end();
         cmd_buf.finish_timestamps(&query_pool);
         cmd_buf.host_barrier();
         cmd_buf.finish();
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index 02ac7cb..f2c67a1 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -17,7 +17,8 @@
 //! The generic trait for backends to implement.
 
 use crate::{
-    BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams,
+    BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
+    MapMode, SamplerParams,
 };
 
 pub trait Device: Sized {
@@ -159,14 +160,32 @@
     unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
 }
 
+/// The trait implemented by backend command buffer implementations.
+///
+/// Valid encoding is represented by a state machine (currently not validated,
+/// but it is easy to imagine adding at least debug validation). Most methods
+/// are only valid in a particular state, and some transition the state machine
+/// to another state.
 pub trait CmdBuf<D: Device> {
+    /// Begin encoding.
+    ///
+    /// State: init -> ready
     unsafe fn begin(&mut self);
 
+    /// State: ready -> finished
     unsafe fn finish(&mut self);
 
     /// Return true if the command buffer is suitable for reuse.
     unsafe fn reset(&mut self) -> bool;
 
+    /// Begin a compute pass.
+    ///
+    /// State: ready -> in_compute_pass
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
+
+    /// Dispatch a compute shader.
+    ///
+    /// State: in_compute_pass
     unsafe fn dispatch(
         &mut self,
         pipeline: &D::Pipeline,
@@ -175,6 +194,9 @@
         workgroup_size: (u32, u32, u32),
     );
 
+    /// State: in_compute_pass -> ready
+    unsafe fn end_compute_pass(&mut self);
+
     /// Insert an execution and memory barrier.
     ///
     /// Compute kernels (and other actions) after this barrier may read from buffers
@@ -202,16 +224,16 @@
     /// This is readily supported in Vulkan, but for portability it is remarkably
     /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
     /// kernel, or organize the code not to need it.
-    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
+    unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);
 
-    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
+    unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);
 
-    unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
+    unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);
 
-    unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
+    unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);
 
     // low portability, dx12 doesn't support it natively
-    unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
+    unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);
 
     /// Reset the query pool.
     ///
@@ -227,7 +249,7 @@
     unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
 
     /// Begin a labeled section for debugging and profiling purposes.
-    unsafe fn begin_debug_label(&mut self, label: &str) {}
+    unsafe fn begin_debug_label(&mut self, _label: &str) {}
 
     /// End a section opened by `begin_debug_label`.
     unsafe fn end_debug_label(&mut self) {}
diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index 78ad449..c5e1e04 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -21,7 +21,7 @@
 
 use smallvec::SmallVec;
 
-use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
+use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
 
 use self::{
     descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@@ -76,6 +76,7 @@
     c: wrappers::GraphicsCommandList,
     allocator: CommandAllocator,
     needs_reset: bool,
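+    /// The pending end-of-pass timer query, written when the compute pass ends.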
+    end_query: Option<(wrappers::QueryHeap, u32)>,
 }
 
 pub struct Pipeline {
@@ -360,6 +361,7 @@
                 c,
                 allocator,
                 needs_reset: false,
+                end_query: None,
             })
         }
     }
@@ -388,11 +390,10 @@
         let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
         std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
         self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
-        let ts0 = buf[0];
         let tsp = (self.ts_freq as f64).recip();
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -610,6 +611,16 @@
         self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Dx12(pool) = pool {
+                self.write_timestamp(pool, *start);
+                self.end_query = Some((pool.heap.clone(), *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -628,6 +639,12 @@
             .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((heap, end)) = self.end_query.take() {
+            self.c.end_timing_query(&heap, end);
+        }
+    }
+
     unsafe fn memory_barrier(&mut self) {
         // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
         // The "proper" way to do this would be to name the actual buffers participating
@@ -666,7 +683,7 @@
         self.memory_barrier();
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
         let (gpu_ref, heap) = buffer
             .gpu_ref
@@ -684,23 +701,23 @@
         );
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         // TODO: consider using copy_resource here (if sizes match)
         let size = src.size.min(dst.size);
         self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         self.c
             .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         self.c
             .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         self.c.copy_resource(&src.resource, &dst.resource);
     }
 
diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs
index 4bbb86c..9a3fb90 100644
--- a/piet-gpu-hal/src/dx12/wrappers.rs
+++ b/piet-gpu-hal/src/dx12/wrappers.rs
@@ -79,7 +79,6 @@
 #[derive(Clone)]
 pub struct ShaderByteCode {
     pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
-    blob: Option<Blob>,
 }
 
 #[derive(Clone)]
@@ -741,7 +740,6 @@
                 BytecodeLength: blob.0.GetBufferSize(),
                 pShaderBytecode: blob.0.GetBufferPointer(),
             },
-            blob: Some(blob),
         }
     }
 
@@ -810,7 +808,6 @@
                 BytecodeLength: bytecode.len(),
                 pShaderBytecode: bytecode.as_ptr() as *const _,
             },
-            blob: None,
         }
     }
 }
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index cc09832..ea17754 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -13,7 +13,7 @@
 use bytemuck::Pod;
 use smallvec::SmallVec;
 
-use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode};
+use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
 
 use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
 
@@ -135,6 +135,11 @@
     size: u64,
 }
 
+/// A sub-object of a command buffer for a sequence of compute dispatches.
+pub struct ComputePass<'a> {
+    cmd_buf: &'a mut CmdBuf,
+}
+
 impl Session {
     /// Create a new session, choosing the best backend.
     pub fn new(device: mux::Device) -> Session {
@@ -370,8 +375,17 @@
     ///
     /// This should be called after waiting on the command buffer that wrote the
     /// timer queries.
+    ///
+    /// The returned vector is one shorter than the number of timer queries in the
+    /// pool; the first value is subtracted off. It would likely be better to return
+    /// the raw timestamps, but that change should be made consistently.
     pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
-        self.0.device.fetch_query_pool(pool)
+        let result = self.0.device.fetch_query_pool(pool)?;
+        // Subtract off the first timestamp; the result may be empty if the
+        // backend doesn't support timer queries.
+        Ok(result
+            .split_first()
+            .map(|(first, rest)| rest.iter().map(|ts| *ts - *first).collect())
+            .unwrap_or_default())
     }
 
     #[doc(hidden)]
@@ -471,23 +485,10 @@
         self.cmd_buf().finish();
     }
 
-    /// Dispatch a compute shader.
-    ///
-    /// Request a compute shader to be run, using the pipeline to specify the
-    /// code, and the descriptor set to address the resources read and written.
-    ///
-    /// Both the workgroup count (number of workgroups) and the workgroup size
-    /// (number of threads in a workgroup) must be specified here, though not
-    /// all back-ends require the latter info.
-    pub unsafe fn dispatch(
-        &mut self,
-        pipeline: &Pipeline,
-        descriptor_set: &DescriptorSet,
-        workgroup_count: (u32, u32, u32),
-        workgroup_size: (u32, u32, u32),
-    ) {
-        self.cmd_buf()
-            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    /// Begin a compute pass.
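+    ///
+    /// All dispatch calls must happen within a pass; end it by calling `end`
+    /// on the returned ComputePass before encoding further commands.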
+    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
+        self.cmd_buf().begin_compute_pass(desc);
+        ComputePass { cmd_buf: self }
     }
 
     /// Insert an execution and memory barrier.
@@ -582,13 +583,6 @@
         self.cmd_buf().reset_query_pool(pool);
     }
 
-    /// Write a timestamp.
-    ///
-    /// The query index must be less than the size of the query pool on creation.
-    pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        self.cmd_buf().write_timestamp(pool, query);
-    }
-
     /// Prepare the timestamps for reading. This isn't required on Vulkan but
     /// is required on (at least) DX12.
     ///
@@ -692,6 +686,51 @@
     }
 }
 
+impl<'a> ComputePass<'a> {
+    /// Dispatch a compute shader.
+    ///
+    /// Request a compute shader to be run, using the pipeline to specify the
+    /// code, and the descriptor set to address the resources read and written.
+    ///
+    /// Both the workgroup count (number of workgroups) and the workgroup size
+    /// (number of threads in a workgroup) must be specified here, though not
+    /// all back-ends require the latter info.
+    pub unsafe fn dispatch(
+        &mut self,
+        pipeline: &Pipeline,
+        descriptor_set: &DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    ) {
+        self.cmd_buf
+            .cmd_buf()
+            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    }
+
+    /// Add a memory barrier.
+    ///
+    /// Inserts a memory barrier in the compute encoder. This is a convenience
+    /// function for calling the same function on the underlying command buffer,
+    /// avoiding borrow check issues.
+    pub unsafe fn memory_barrier(&mut self) {
+        self.cmd_buf.memory_barrier();
+    }
+
+    /// Begin a labeled section for debugging and profiling purposes.
+    pub unsafe fn begin_debug_label(&mut self, label: &str) {
+        self.cmd_buf.begin_debug_label(label);
+    }
+
+    /// End a section opened by `begin_debug_label`.
+    pub unsafe fn end_debug_label(&mut self) {
+        self.cmd_buf.end_debug_label();
+    }
+
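+    /// End the compute pass.
+    ///
+    /// This consumes the pass and releases the mutable borrow of the command
+    /// buffer.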
+    pub unsafe fn end(self) {
+        self.cmd_buf.cmd_buf().end_compute_pass();
+    }
+}
+
 impl Drop for BufferInner {
     fn drop(&mut self) {
         if let Some(session) = Weak::upgrade(&self.session) {
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index fab7d65..a1073f4 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -21,8 +21,8 @@
 };
 pub use bufwrite::BufWrite;
 pub use hub::{
-    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
-    Session, SubmittedCmdBuf,
+    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
+    RetainResource, Session, SubmittedCmdBuf,
 };
 
 // TODO: because these are conditionally included, "cargo fmt" does not
@@ -189,3 +189,23 @@
     /// dimension.
     pub max_invocations: u32,
 }
+
+/// Options for creating a compute pass.
+#[derive(Default)]
+pub struct ComputePassDescriptor<'a> {
+    // Maybe label should go here? It does in wgpu and wgpu_hal.
+    /// Timer query parameters.
+    ///
+    /// To record timer queries for a compute pass, set the query pool, start
+    /// query index, and end query index here. The indices must be less than
+    /// the size of the query pool.
+    timer_queries: Option<(&'a QueryPool, u32, u32)>,
+}
+
+impl<'a> ComputePassDescriptor<'a> {
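+    /// Create a descriptor with a timer query for the pass.
+    ///
+    /// The pass start and end times are written to the given query indices.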
+    pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
+        ComputePassDescriptor {
+            timer_queries: Some((pool, start_query, end_query)),
+        }
+    }
+}
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index e3157d4..307def8 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -15,25 +15,32 @@
 // Also licensed under MIT license, at your choice.
 
 mod clear;
+mod timer;
 mod util;
 
 use std::mem;
 use std::sync::{Arc, Mutex};
 
+use block::Block;
 use cocoa_foundation::base::id;
 use cocoa_foundation::foundation::{NSInteger, NSUInteger};
+use foreign_types::ForeignType;
 use objc::rc::autoreleasepool;
 use objc::runtime::{Object, BOOL, YES};
 use objc::{class, msg_send, sel, sel_impl};
 
-use metal::{CGFloat, MTLFeatureSet};
+use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
 
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
-use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits};
+use crate::{
+    BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
+};
 
 use util::*;
 
+use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
+
 pub struct MtlInstance;
 
 pub struct MtlDevice {
@@ -41,6 +48,18 @@
     cmd_queue: Arc<Mutex<metal::CommandQueue>>,
     gpu_info: GpuInfo,
     helpers: Arc<Helpers>,
+    timer_set: Option<CounterSet>,
+    counter_style: CounterStyle,
+}
+
+/// Type of counter sampling.
+///
+/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+enum CounterStyle {
+    None,
+    Stage,
+    Command,
 }
 
 pub struct MtlSurface {
@@ -81,9 +100,22 @@
 pub struct CmdBuf {
     cmd_buf: metal::CommandBuffer,
     helpers: Arc<Helpers>,
+    cur_encoder: Encoder,
+    time_calibration: Arc<Mutex<TimeCalibration>>,
+    counter_style: CounterStyle,
 }
 
-pub struct QueryPool;
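+/// The current encoder state of the command buffer.
+///
+/// The compute variant carries an optional end-of-pass timer query (sample
+/// buffer and index) to write when the encoder is flushed.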
+enum Encoder {
+    None,
+    Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
+    Blit(metal::BlitCommandEncoder),
+}
+
+#[derive(Default)]
+pub struct QueryPool {
+    counter_sample_buf: Option<CounterSampleBuffer>,
+    calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
+}
 
 pub struct Pipeline(metal::ComputePipelineState);
 
@@ -209,18 +241,43 @@
         let helpers = Arc::new(Helpers {
             clear_pipeline: clear::make_clear_pipeline(&device),
         });
+        // Detect timer query support.
+        let timer_set = CounterSet::get_timer_counter_set(&device);
+        let counter_style = if timer_set.is_some() {
+            if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
+                CounterStyle::Stage
+            } else if device
+                .supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
+            {
+                CounterStyle::Command
+            } else {
+                CounterStyle::None
+            }
+        } else {
+            CounterStyle::None
+        };
         MtlDevice {
             device,
             cmd_queue: Arc::new(Mutex::new(cmd_queue)),
             gpu_info,
             helpers,
+            timer_set,
+            counter_style,
         }
     }
 
     pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
         let cmd_buf = raw_cmd_buf;
         let helpers = self.helpers.clone();
-        CmdBuf { cmd_buf, helpers }
+        let cur_encoder = Encoder::None;
+        let time_calibration = Default::default();
+        CmdBuf {
+            cmd_buf,
+            helpers,
+            cur_encoder,
+            time_calibration,
+            counter_style: self.counter_style,
+        }
     }
 
     pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@@ -330,11 +387,35 @@
 
     fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
         let cmd_queue = self.cmd_queue.lock().unwrap();
+        // A discussion about autorelease pools.
+        //
+        // Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
+        // you can have any two of correctness, ergonomics, and performance. Here we've
+        // chosen the first two, using the pattern of a fine-grained autorelease pool
+        // to give Obj-C objects Rust-like lifetime semantics whenever they are
+        // created as autoreleased (by convention, any object creation through an
+        // Obj-C method whose name doesn't begin with "new" or "alloc").
+        //
+        // To gain back some of the performance, we'd need a way to wrap an autorelease
+        // pool around a chunk of work - that could be one frame of rendering, but for
+        // tests that iterate a number of command buffer submissions, it would need to
+        // wrap the whole loop. On non-mac platforms, the wrapper would be a no-op.
+        //
+        // In any case, this way the caller doesn't need to worry, and the performance
+        // hit might not be so bad (perhaps we should measure).
+
         // consider new_command_buffer_with_unretained_references for performance
-        let cmd_buf = cmd_queue.new_command_buffer();
-        let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
+        let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
         let helpers = self.helpers.clone();
-        Ok(CmdBuf { cmd_buf, helpers })
+        let cur_encoder = Encoder::None;
+        let time_calibration = Default::default();
+        Ok(CmdBuf {
+            cmd_buf,
+            helpers,
+            cur_encoder,
+            time_calibration,
+            counter_style: self.counter_style,
+        })
     }
 
     unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
@@ -342,12 +423,31 @@
     }
 
     fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
-        // TODO
-        Ok(QueryPool)
+        if let Some(timer_set) = &self.timer_set {
+            let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
+                .ok_or("error creating timer query pool")?;
+            return Ok(QueryPool {
+                counter_sample_buf: Some(pool),
+                calibration: Default::default(),
+            });
+        }
+        Ok(QueryPool::default())
     }
 
     unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
-        // TODO
+        if let Some(raw) = &pool.counter_sample_buf {
+            let resolved = raw.resolve();
+            let calibration = pool.calibration.lock().unwrap();
+            if let Some(calibration) = &*calibration {
+                let calibration = calibration.lock().unwrap();
+                let result = resolved
+                    .iter()
+                    .map(|time_ns| calibration.correlate(*time_ns))
+                    .collect();
+                return Ok(result);
+            }
+        }
+        // Maybe this should return None to indicate the query wasn't successful?
+        // But that might break callers.
         Ok(Vec::new())
     }
 
@@ -358,7 +458,37 @@
         _signal_semaphores: &[&Self::Semaphore],
         fence: Option<&mut Self::Fence>,
     ) -> Result<(), Error> {
+        unsafe fn add_scheduled_handler(
+            cmd_buf: &metal::CommandBufferRef,
+            block: &Block<(&CommandBufferRef,), ()>,
+        ) {
+            msg_send![cmd_buf, addScheduledHandler: block]
+        }
         for cmd_buf in cmd_bufs {
+            let time_calibration = cmd_buf.time_calibration.clone();
+            let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
+                let device: id = msg_send![buffer, device];
+                let mut time_calibration = time_calibration.lock().unwrap();
+                let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
+                let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
+                // TODO: only do this if supported.
+                let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
+            })
+            .copy();
+            add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
+            let time_calibration = cmd_buf.time_calibration.clone();
+            let completed_block =
+                block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
+                    let device: id = msg_send![buffer, device];
+                    let mut time_calibration = time_calibration.lock().unwrap();
+                    let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
+                    let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
+                    // TODO: only do this if supported.
+                    let () =
+                        msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
+                })
+                .copy();
+            cmd_buf.cmd_buf.add_completed_handler(&completed_block);
             cmd_buf.cmd_buf.commit();
         }
         if let Some(last_cmd_buf) = cmd_bufs.last() {
@@ -439,12 +569,70 @@
 impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
     unsafe fn begin(&mut self) {}
 
-    unsafe fn finish(&mut self) {}
+    unsafe fn finish(&mut self) {
+        self.flush_encoder();
+    }
 
     unsafe fn reset(&mut self) -> bool {
         false
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        // TODO: we might want to get better about validation, but the following
+        // assert is likely to trigger, and a case can also be made that
+        // validation should be done at the hub level, for consistency.
+        //debug_assert!(matches!(self.cur_encoder, Encoder::None));
+        self.flush_encoder();
+        autoreleasepool(|| {
+            let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
+                (Some(queries), CounterStyle::Stage) => {
+                    let descriptor: id =
+                        msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
+                    let attachments: id = msg_send![descriptor, sampleBufferAttachments];
+                    let index: NSUInteger = 0;
+                    let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
+                    // Here we break the hub/mux separation a bit, for expedience
+                    #[allow(irrefutable_let_patterns)]
+                    if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                        if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                            let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+                        }
+                    }
+                    let start_index = queries.1 as NSUInteger;
+                    let end_index = queries.2 as NSUInteger;
+                    let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
+                    let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
+                    (
+                        msg_send![
+                            self.cmd_buf,
+                            computeCommandEncoderWithDescriptor: descriptor
+                        ],
+                        None,
+                    )
+                }
+                (Some(queries), CounterStyle::Command) => {
+                    let encoder = self.cmd_buf.new_compute_command_encoder();
+                    #[allow(irrefutable_let_patterns)]
+                    let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                        if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                            let sample_index = queries.1 as NSUInteger;
+                            let sample_buf = sample_buf.id();
+                            let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
+                            Some((sample_buf, queries.2))
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    };
+                    (encoder, end_query)
+                }
+                _ => (self.cmd_buf.new_compute_command_encoder(), None),
+            };
+            self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
+        });
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -452,7 +640,7 @@
         workgroup_count: (u32, u32, u32),
         workgroup_size: (u32, u32, u32),
     ) {
-        let encoder = self.cmd_buf.new_compute_command_encoder();
+        let encoder = self.compute_command_encoder();
         encoder.set_compute_pipeline_state(&pipeline.0);
         let mut buf_ix = 0;
         for buffer in &descriptor_set.buffers {
@@ -475,7 +663,11 @@
             depth: workgroup_size.2 as u64,
         };
         encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
-        encoder.end_encoding();
+    }
+
+    unsafe fn end_compute_pass(&mut self) {
+        // TODO: might validate that we are in a compute encoder state
+        self.flush_encoder();
     }
 
     unsafe fn memory_barrier(&mut self) {
@@ -494,22 +686,23 @@
         // I think these are being tracked.
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let size = size.unwrap_or(buffer.size);
-        let encoder = self.cmd_buf.new_compute_command_encoder();
-        clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size);
-        encoder.end_encoding()
+        let _ = self.compute_command_encoder();
+        // Getting this directly is a workaround for a borrow checker issue.
+        if let Encoder::Compute(e, _) = &self.cur_encoder {
+            clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
+        }
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
+        let encoder = self.blit_command_encoder();
         let size = src.size.min(dst.size);
         encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
-        encoder.end_encoding();
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
+        let encoder = self.blit_command_encoder();
         assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
         let bytes_per_row = (src.width * 4) as NSUInteger;
         let src_size = metal::MTLSize {
@@ -530,11 +723,10 @@
             bytes_per_row * src.height as NSUInteger,
             metal::MTLBlitOption::empty(),
         );
-        encoder.end_encoding();
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
+        let encoder = self.blit_command_encoder();
         assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
         let bytes_per_row = (dst.width * 4) as NSUInteger;
         let src_size = metal::MTLSize {
@@ -555,11 +747,10 @@
             origin,
             metal::MTLBlitOption::empty(),
         );
-        encoder.end_encoding();
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
+        let encoder = self.blit_command_encoder();
         let src_size = metal::MTLSize {
             width: src.width.min(dst.width) as NSUInteger,
             height: src.height.min(dst.height) as NSUInteger,
@@ -577,15 +768,79 @@
             0,
             origin,
         );
-        encoder.end_encoding();
     }
 
-    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
+    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
+        let mut calibration = pool.calibration.lock().unwrap();
+        *calibration = Some(self.time_calibration.clone());
+    }
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        // TODO
-        // This really a PITA because it's pretty different than Vulkan.
-        // See https://developer.apple.com/documentation/metal/counter_sampling
+        if let Some(buf) = &pool.counter_sample_buf {
+            if matches!(self.cur_encoder, Encoder::None) {
+                self.cur_encoder =
+                    Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
+            }
+            let sample_index = query as NSUInteger;
+            if self.counter_style == CounterStyle::Command {
+                match &self.cur_encoder {
+                    Encoder::Compute(e, _) => {
+                        let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
+                    }
+                    Encoder::None => unreachable!(),
+                    _ => todo!(),
+                }
+            } else if self.counter_style == CounterStyle::Stage {
+                match &self.cur_encoder {
+                    Encoder::Compute(_e, _) => {
+                        println!("write_timestamp is not supported for stage-style encoders");
+                    }
+                    _ => (),
+                }
+            }
+        }
+    }
+}
+
+impl CmdBuf {
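+    /// Get a compute encoder, reusing the current one if it is a compute
+    /// encoder, and flushing any other encoder state.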
+    fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
+        if !matches!(self.cur_encoder, Encoder::Compute(..)) {
+            self.flush_encoder();
+            self.cur_encoder =
+                Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
+        }
+        if let Encoder::Compute(e, _) = &self.cur_encoder {
+            e
+        } else {
+            unreachable!()
+        }
+    }
+
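+    /// Get a blit encoder, reusing the current one if possible.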
+    fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
+        if !matches!(self.cur_encoder, Encoder::Blit(_)) {
+            self.flush_encoder();
+            self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
+        }
+        if let Encoder::Blit(e) = &self.cur_encoder {
+            e
+        } else {
+            unreachable!()
+        }
+    }
+
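+    /// End the current encoder, if any, first writing the pending end-of-pass
+    /// timer query when one was requested.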
+    fn flush_encoder(&mut self) {
+        match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
+            Encoder::Compute(e, Some((sample_buf, end_query))) => {
+                let sample_index = end_query as NSUInteger;
+                unsafe {
+                    let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
+                }
+                e.end_encoding();
+            }
+            Encoder::Compute(e, None) => e.end_encoding(),
+            Encoder::Blit(e) => e.end_encoding(),
+            Encoder::None => (),
+        }
     }
 }
 
diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs
new file mode 100644
index 0000000..65c8026
--- /dev/null
+++ b/piet-gpu-hal/src/metal/timer.rs
@@ -0,0 +1,172 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Support for timer queries.
+//!
+//! Likely some of this should be upstreamed into metal-rs.
+
+use std::{ffi::CStr, ptr::null_mut};
+
+use cocoa_foundation::{
+    base::id,
+    foundation::{NSRange, NSUInteger},
+};
+use metal::{DeviceRef, MTLStorageMode};
+use objc::{class, msg_send, sel, sel_impl};
+
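+/// An owned wrapper for an MTLCounterSampleBuffer object.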
+pub struct CounterSampleBuffer {
+    id: id,
+    count: u64,
+}
+
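+/// An owned wrapper for an MTLCounterSet object.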
+pub struct CounterSet {
+    id: id,
+}
+
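+/// CPU and GPU timestamps sampled at command buffer scheduling and completion,
+/// used to convert raw GPU timestamps into the CPU time base.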
+#[derive(Default)]
+pub struct TimeCalibration {
+    pub cpu_start_ts: u64,
+    pub gpu_start_ts: u64,
+    pub cpu_end_ts: u64,
+    pub gpu_end_ts: u64,
+}
+
+impl Drop for CounterSampleBuffer {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
+impl Clone for CounterSampleBuffer {
+    fn clone(&self) -> CounterSampleBuffer {
+        unsafe {
+            CounterSampleBuffer {
+                id: msg_send![self.id, retain],
+                count: self.count,
+            }
+        }
+    }
+}
+
+impl CounterSampleBuffer {
+    pub fn id(&self) -> id {
+        self.id
+    }
+}
+
+impl Drop for CounterSet {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
+impl CounterSet {
+    pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
+        unsafe {
+            // TODO: version check
+            let sets: id = msg_send![device, counterSets];
+            let count: NSUInteger = msg_send![sets, count];
+            for i in 0..count {
+                let set: id = msg_send![sets, objectAtIndex: i];
+                let name: id = msg_send![set, name];
+                let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
+                if name_cstr.to_bytes() == b"timestamp" {
+                    return Some(CounterSet { id: set });
+                }
+            }
+            None
+        }
+    }
+}
+
+// copied from metal-rs; should be in common utilities maybe?
+fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
+    let bytes = unsafe {
+        let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
+        bytes as *const u8
+    };
+    let len: NSUInteger = unsafe { msg_send![nsstr, length] };
+    unsafe {
+        let bytes = std::slice::from_raw_parts(bytes, len as usize);
+        std::str::from_utf8(bytes).unwrap()
+    }
+}
+
+impl CounterSampleBuffer {
+    pub fn new(
+        device: &DeviceRef,
+        count: u64,
+        counter_set: &CounterSet,
+    ) -> Option<CounterSampleBuffer> {
+        unsafe {
+            let desc_cls = class!(MTLCounterSampleBufferDescriptor);
+            let descriptor: id = msg_send![desc_cls, alloc];
+            let _: id = msg_send![descriptor, init];
+            let count = count as NSUInteger;
+            let () = msg_send![descriptor, setSampleCount: count];
+            let () = msg_send![descriptor, setCounterSet: counter_set.id];
+            let () = msg_send![
+                descriptor,
+                setStorageMode: MTLStorageMode::Shared as NSUInteger
+            ];
+            let mut error: id = null_mut();
+            let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
+            let () = msg_send![descriptor, release];
+            if !error.is_null() {
+                let description = msg_send![error, localizedDescription];
+                println!(
+                    "error allocating sample buffer, code = {}",
+                    nsstring_as_str(description)
+                );
+                let () = msg_send![error, release];
+                return None;
+            }
+            Some(CounterSampleBuffer { id: buf, count })
+        }
+    }
+
+    // Read the timestamps.
+    //
+    // Safety: the lifetime of the returned slice is wrong; the underlying data
+    // is autoreleased, so it must be copied out before the pool drains.
+    pub unsafe fn resolve(&self) -> &[u64] {
+        let range = NSRange::new(0, self.count);
+        let data: id = msg_send![self.id, resolveCounterRange: range];
+        if data.is_null() {
+            &[]
+        } else {
+            let bytes: *const u64 = msg_send![data, bytes];
+            std::slice::from_raw_parts(bytes, self.count as usize)
+        }
+    }
+}
+
+impl TimeCalibration {
+    /// Convert GPU timestamp into CPU time base.
+    ///
+    /// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
+    pub fn correlate(&self, raw_ts: u64) -> f64 {
+        let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
+        let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
+        let adj_ts = if delta_gpu > 0 {
+            let scale = delta_cpu as f64 / delta_gpu as f64;
+            self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
+        } else {
+            // Default is ns on Apple Silicon; on other hardware this will be wrong
+            raw_ts as f64
+        };
+        adj_ts * 1e-9
+    }
+}
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index af1702d..9795193 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -35,6 +35,7 @@
 use crate::backend::Device as DeviceTrait;
 use crate::BackendType;
 use crate::BindType;
+use crate::ComputePassDescriptor;
 use crate::ImageFormat;
 use crate::MapMode;
 use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
@@ -658,6 +659,14 @@
         }
     }
 
+    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.begin_compute_pass(desc),
+            CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
+            CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
+        }
+    }
+
     /// Dispatch a compute shader.
     ///
     /// Note that both the number of workgroups (`workgroup_count`) and the number of
@@ -680,6 +689,14 @@
         }
     }
 
+    pub unsafe fn end_compute_pass(&mut self) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.end_compute_pass(),
+            CmdBuf::Dx12(c) => c.end_compute_pass(),
+            CmdBuf::Mtl(c) => c.end_compute_pass(),
+        }
+    }
+
     pub unsafe fn memory_barrier(&mut self) {
         mux_match! { self;
             CmdBuf::Vk(c) => c.memory_barrier(),
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 8392899..504d947 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -15,7 +15,7 @@
 use crate::backend::Device as DeviceTrait;
 use crate::{
     BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
-    WorkgroupLimits,
+    WorkgroupLimits, ComputePassDescriptor,
 };
 
 pub struct VkInstance {
@@ -92,6 +92,7 @@
     cmd_buf: vk::CommandBuffer,
     cmd_pool: vk::CommandPool,
     device: Arc<RawDevice>,
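+    /// The pending end-of-pass timer query, written when the compute pass ends.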
+    end_query: Option<(vk::QueryPool, u32)>,
 }
 
 pub struct QueryPool {
@@ -738,6 +739,7 @@
                 cmd_buf,
                 cmd_pool,
                 device: self.device.clone(),
+                end_query: None,
             })
         }
     }
@@ -770,11 +772,10 @@
         // results (Windows 10, AMD 5700 XT).
         let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
         device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
-        let ts0 = buf[0];
         let tsp = self.timestamp_period as f64 * 1e-9;
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -902,6 +903,16 @@
         true
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Vk(pool) = pool {
+                self.write_timestamp_raw(pool.pool, *start);
+                self.end_query = Some((pool.pool, *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -931,6 +942,12 @@
         );
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((pool, end)) = self.end_query.take() {
+            self.write_timestamp_raw(pool, end);
+        }
+    }
+
     /// Insert a pipeline barrier for all memory accesses.
     unsafe fn memory_barrier(&mut self) {
         let device = &self.device.device;
@@ -995,13 +1012,13 @@
         );
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let device = &self.device.device;
         let size = size.unwrap_or(vk::WHOLE_SIZE);
         device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         let device = &self.device.device;
         let size = src.size.min(dst.size);
         device.cmd_copy_buffer(
@@ -1012,7 +1029,7 @@
         );
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         let device = &self.device.device;
         device.cmd_copy_image_to_buffer(
             self.cmd_buf,
@@ -1035,7 +1052,7 @@
         );
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         let device = &self.device.device;
         device.cmd_copy_buffer_to_image(
             self.cmd_buf,
@@ -1058,7 +1075,7 @@
         );
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         let device = &self.device.device;
         device.cmd_blit_image(
             self.cmd_buf,
@@ -1106,13 +1123,7 @@
     }
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        let device = &self.device.device;
-        device.cmd_write_timestamp(
-            self.cmd_buf,
-            vk::PipelineStageFlags::COMPUTE_SHADER,
-            pool.pool,
-            query,
-        );
+        self.write_timestamp_raw(pool.pool, query);
     }
 
     unsafe fn begin_debug_label(&mut self, label: &str) {
@@ -1130,6 +1141,18 @@
     }
 }
 
+impl CmdBuf {
+    unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
+        let device = &self.device.device;
+        device.cmd_write_timestamp(
+            self.cmd_buf,
+            vk::PipelineStageFlags::COMPUTE_SHADER,
+            pool,
+            query,
+        );
+    }
+}
+
 impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.buffers.extend(buffers.iter().map(|b| b.buffer));
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 70023af..abe6ae1 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -6,7 +6,7 @@
 
 use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};
 
-use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
+use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
 
 const WIDTH: usize = 2048;
 const HEIGHT: usize = 1536;
@@ -243,7 +243,11 @@
             if matches.is_present("flip") {
                 scale = -scale;
             }
-            test_scenes::render_svg(&mut ctx, input, scale);
+            let xml_str = std::fs::read_to_string(input).unwrap();
+            let start = std::time::Instant::now();
+            let svg = PicoSvg::load(&xml_str, scale).unwrap();
+            println!("parsing time: {:?}", start.elapsed());
+            test_scenes::render_svg(&mut ctx, &svg);
         } else {
             test_scenes::render_scene(&mut ctx);
         }
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index 3ca0742..1642026 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -2,7 +2,7 @@
 use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
 use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
 
-use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
+use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
 
 use clap::{App, Arg};
 
@@ -29,6 +29,25 @@
         )
         .get_matches();
 
+    // Parse the SVG up front if an input file was given, so it can be reused each frame.
+    let svg = match matches.value_of("INPUT") {
+        Some(file) => {
+            let mut scale = matches
+                .value_of("scale")
+                .map(|scale| scale.parse().unwrap())
+                .unwrap_or(8.0);
+            if matches.is_present("flip") {
+                scale = -scale;
+            }
+            let xml_str = std::fs::read_to_string(file).unwrap();
+            let start = std::time::Instant::now();
+            let svg = PicoSvg::load(&xml_str, scale).unwrap();
+            println!("parsing time: {:?}", start.elapsed());
+            Some(svg)
+        }
+        None => None,
+    };
+
     let event_loop = EventLoop::new();
     let window = WindowBuilder::new()
         .with_inner_size(winit::dpi::LogicalSize {
@@ -51,7 +70,7 @@
             .map(|_| session.create_semaphore())
             .collect::<Result<Vec<_>, Error>>()?;
         let query_pools = (0..NUM_FRAMES)
-            .map(|_| session.create_query_pool(8))
+            .map(|_| session.create_query_pool(12))
             .collect::<Result<Vec<_>, Error>>()?;
         let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
         let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
@@ -93,29 +112,23 @@
                         if !ts.is_empty() {
                             info_string = format!(
                                 "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
-                                ts[6] * 1e3,
+                                ts[10] * 1e3,
                                 ts[0] * 1e3,
                                 (ts[1] - ts[0]) * 1e3,
                                 (ts[2] - ts[1]) * 1e3,
-                                (ts[3] - ts[2]) * 1e3,
                                 (ts[4] - ts[3]) * 1e3,
-                                (ts[5] - ts[4]) * 1e3,
                                 (ts[6] - ts[5]) * 1e3,
+                                (ts[8] - ts[7]) * 1e3,
+                                (ts[10] - ts[9]) * 1e3,
                             );
                         }
                     }
 
                     let mut ctx = PietGpuRenderContext::new();
-                    if let Some(input) = matches.value_of("INPUT") {
-                        let mut scale = matches
-                            .value_of("scale")
-                            .map(|scale| scale.parse().unwrap())
-                            .unwrap_or(8.0);
-                        if matches.is_present("flip") {
-                            scale = -scale;
-                        }
-                        test_scenes::render_svg(&mut ctx, input, scale);
-                    } else {
+                    let test_blend = false;
+                    if let Some(svg) = &svg {
+                        test_scenes::render_svg(&mut ctx, svg);
+                    } else if test_blend {
                         use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
                         let blends = [
                             Blend::new(Normal, SrcOver),
@@ -151,6 +164,8 @@
                         let blend = blends[mode % blends.len()];
                         test_scenes::render_blend_test(&mut ctx, current_frame, blend);
                         info_string = format!("{:?}", blend);
+                    } else {
+                        test_scenes::render_anim_frame(&mut ctx, current_frame);
                     }
                     render_info_string(&mut ctx, &info_string);
                     if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index adbedfd..3abb2e0 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -306,7 +306,7 @@
                     is_blend = (blend != BlendComp_default);
                 }
                 include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
-                    || (is_clip && is_blend);
+                    || is_blend;
             }
             if (include_tile) {
                 uint el_slice = el_ix / 32;
diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil
index 4a4f073..3050aa8 100644
--- a/piet-gpu/shader/gen/binning.dxil
+++ b/piet-gpu/shader/gen/binning.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index 879b7c8..fdab444 100644
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index 1e610ec..04529bb 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -931,23 +931,14 @@
                 {
                     _1701 = _1692;
                 }
-                bool _1708;
-                if (!_1701)
-                {
-                    _1708 = is_clip && is_blend;
-                }
-                else
-                {
-                    _1708 = _1701;
-                }
-                include_tile = _1708;
+                include_tile = _1701 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1728;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1728);
+                uint _1723;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -976,9 +967,9 @@
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                TileRef _1805 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1805;
+                TileRef param_28 = _1800;
                 Tile tile_1 = Tile_read(param_27, param_28);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
                 uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@@ -993,11 +984,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1853 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1848 = alloc_cmd(param_29, param_30, param_31);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1853)
+                        if (!_1848)
                         {
                             break;
                         }
@@ -1008,10 +999,10 @@
                         write_fill(param_32, param_33, param_34, param_35);
                         cmd_ref = param_33;
                         uint rgba = _1372.Load(dd_1 * 4 + 0);
-                        CmdColor _1876 = { rgba };
+                        CmdColor _1871 = { rgba };
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1876;
+                        CmdColor param_38 = _1871;
                         Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
@@ -1021,11 +1012,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1894 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1889 = alloc_cmd(param_39, param_40, param_41);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1894)
+                        if (!_1889)
                         {
                             break;
                         }
@@ -1052,11 +1043,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1958 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1953 = alloc_cmd(param_49, param_50, param_51);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1958)
+                        if (!_1953)
                         {
                             break;
                         }
@@ -1086,11 +1077,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2064 = alloc_cmd(param_59, param_60, param_61);
+                        bool _2059 = alloc_cmd(param_59, param_60, param_61);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2064)
+                        if (!_2059)
                         {
                             break;
                         }
@@ -1103,27 +1094,27 @@
                         uint index = _1372.Load(dd_1 * 4 + 0);
                         uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2103 = { index, offset_1 };
+                        CmdImage _2098 = { index, offset_1 };
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = _2103;
+                        CmdImage param_68 = _2098;
                         Cmd_Image_write(param_66, param_67, param_68);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2117 = tile_1.tile.offset == 0u;
-                        bool _2123;
-                        if (_2117)
+                        bool _2112 = tile_1.tile.offset == 0u;
+                        bool _2118;
+                        if (_2112)
                         {
-                            _2123 = tile_1.backdrop == 0;
+                            _2118 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2123 = _2117;
+                            _2118 = _2112;
                         }
-                        if (_2123)
+                        if (_2118)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1132,11 +1123,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2135 = alloc_cmd(param_69, param_70, param_71);
+                            bool _2130 = alloc_cmd(param_69, param_70, param_71);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2135)
+                            if (!_2130)
                             {
                                 break;
                             }
@@ -1154,11 +1145,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2163 = alloc_cmd(param_74, param_75, param_76);
+                        bool _2158 = alloc_cmd(param_74, param_75, param_76);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2163)
+                        if (!_2158)
                         {
                             break;
                         }
@@ -1169,10 +1160,10 @@
                         write_fill(param_77, param_78, param_79, param_80);
                         cmd_ref = param_78;
                         uint blend_1 = _1372.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2186 = { blend_1 };
+                        CmdEndClip _2181 = { blend_1 };
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = _2186;
+                        CmdEndClip param_83 = _2181;
                         Cmd_EndClip_write(param_81, param_82, param_83);
                         cmd_ref.offset += 8u;
                         break;
@@ -1207,17 +1198,17 @@
             break;
         }
     }
-    bool _2233 = (bin_tile_x + tile_x) < _1005.Load(8);
-    bool _2242;
-    if (_2233)
+    bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2237;
+    if (_2228)
     {
-        _2242 = (bin_tile_y + tile_y) < _1005.Load(12);
+        _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
     }
     else
     {
-        _2242 = _2233;
+        _2237 = _2228;
     }
-    if (_2242)
+    if (_2237)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index abd636b..55812d4 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -954,22 +954,13 @@
                 {
                     _1701 = _1692;
                 }
-                bool _1708;
-                if (!_1701)
-                {
-                    _1708 = is_clip && is_blend;
-                }
-                else
-                {
-                    _1708 = _1701;
-                }
-                include_tile = _1708;
+                include_tile = _1701 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1728 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -1014,11 +1005,11 @@
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1853 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
+                        bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1853)
+                        if (!_1848)
                         {
                             break;
                         }
@@ -1041,11 +1032,11 @@
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1894 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
+                        bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1894)
+                        if (!_1889)
                         {
                             break;
                         }
@@ -1072,11 +1063,11 @@
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1958 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
+                        bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1958)
+                        if (!_1953)
                         {
                             break;
                         }
@@ -1106,11 +1097,11 @@
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2064 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2064)
+                        if (!_2059)
                         {
                             break;
                         }
@@ -1132,17 +1123,17 @@
                     }
                     case 5u:
                     {
-                        bool _2117 = tile_1.tile.offset == 0u;
-                        bool _2123;
-                        if (_2117)
+                        bool _2112 = tile_1.tile.offset == 0u;
+                        bool _2118;
+                        if (_2112)
                         {
-                            _2123 = tile_1.backdrop == 0;
+                            _2118 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2123 = _2117;
+                            _2118 = _2112;
                         }
-                        if (_2123)
+                        if (_2118)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1151,11 +1142,11 @@
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2135 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2135)
+                            if (!_2130)
                             {
                                 break;
                             }
@@ -1173,11 +1164,11 @@
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2163 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2163)
+                        if (!_2158)
                         {
                             break;
                         }
@@ -1225,17 +1216,17 @@
             break;
         }
     }
-    bool _2233 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
-    bool _2242;
-    if (_2233)
+    bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2237;
+    if (_2228)
     {
-        _2242 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
+        _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
     }
     else
     {
-        _2242 = _2233;
+        _2237 = _2228;
     }
-    if (_2242)
+    if (_2237)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index fdc10a0..6d33ee7 100644
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index 6353f19..200f169 100644
--- a/piet-gpu/shader/gen/draw_leaf.dxil
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
index c101fc8..be69aad 100644
--- a/piet-gpu/shader/gen/draw_reduce.dxil
+++ b/piet-gpu/shader/gen/draw_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil
index 873fa29..4ea23f7 100644
--- a/piet-gpu/shader/gen/draw_root.dxil
+++ b/piet-gpu/shader/gen/draw_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index 9f8080b..e6eccc1 100644
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index a594d50..046045f 100644
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
index 7b130e0..7759910 100644
--- a/piet-gpu/shader/gen/tile_alloc.dxil
+++ b/piet-gpu/shader/gen/tile_alloc.dxil
Binary files differ
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index 8f21485..2f4b85e 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -37,147 +37,6 @@
     n_clip: u32,
 }
 
-#[derive(Copy, Clone, Debug)]
-pub struct EncodedSceneRef<'a, T: Copy + Pod> {
-    pub transform_stream: &'a [T],
-    pub tag_stream: &'a [u8],
-    pub pathseg_stream: &'a [u8],
-    pub linewidth_stream: &'a [f32],
-    pub drawtag_stream: &'a [u32],
-    pub drawdata_stream: &'a [u8],
-    pub n_path: u32,
-    pub n_pathseg: u32,
-    pub n_clip: u32,
-    pub ramp_data: &'a [u32],
-}
-
-impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
-    /// Return a config for the element processing pipeline.
-    ///
-    /// This does not include further pipeline processing. Also returns the
-    /// beginning of free memory.
-    pub fn stage_config(&self) -> (Config, usize) {
-        // Layout of scene buffer
-        let drawtag_offset = 0;
-        let n_drawobj = self.n_drawobj();
-        let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
-        let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
-        let trans_offset = drawdata_offset + self.drawdata_stream.len();
-        let n_trans = self.transform_stream.len();
-        let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
-        let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
-        let n_linewidth = self.linewidth_stream.len();
-        let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
-        let n_pathtag = self.tag_stream.len();
-        let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
-        let pathseg_offset = pathtag_offset + n_pathtag_padded;
-
-        // Layout of memory
-        let mut alloc = 0;
-        let trans_alloc = alloc;
-        alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
-        let pathseg_alloc = alloc;
-        alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
-        let path_bbox_alloc = alloc;
-        let n_path = self.n_path as usize;
-        alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
-        let drawmonoid_alloc = alloc;
-        alloc += n_drawobj_padded * DRAWMONOID_SIZE;
-        let anno_alloc = alloc;
-        alloc += n_drawobj * ANNOTATED_SIZE;
-        let clip_alloc = alloc;
-        let n_clip = self.n_clip as usize;
-        const CLIP_SIZE: usize = 4;
-        alloc += n_clip * CLIP_SIZE;
-        let clip_bic_alloc = alloc;
-        const CLIP_BIC_SIZE: usize = 8;
-        // This can round down, as we only reduce the prefix
-        alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
-        let clip_stack_alloc = alloc;
-        const CLIP_EL_SIZE: usize = 20;
-        alloc += n_clip * CLIP_EL_SIZE;
-        let clip_bbox_alloc = alloc;
-        const CLIP_BBOX_SIZE: usize = 16;
-        alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
-        let draw_bbox_alloc = alloc;
-        alloc += n_drawobj * DRAW_BBOX_SIZE;
-        let drawinfo_alloc = alloc;
-        // TODO: not optimized; it can be accumulated during encoding or summed from drawtags
-        const MAX_DRAWINFO_SIZE: usize = 44;
-        alloc += n_drawobj * MAX_DRAWINFO_SIZE;
-
-        let config = Config {
-            n_elements: n_drawobj as u32,
-            n_pathseg: self.n_pathseg,
-            pathseg_alloc: pathseg_alloc as u32,
-            anno_alloc: anno_alloc as u32,
-            trans_alloc: trans_alloc as u32,
-            path_bbox_alloc: path_bbox_alloc as u32,
-            drawmonoid_alloc: drawmonoid_alloc as u32,
-            clip_alloc: clip_alloc as u32,
-            clip_bic_alloc: clip_bic_alloc as u32,
-            clip_stack_alloc: clip_stack_alloc as u32,
-            clip_bbox_alloc: clip_bbox_alloc as u32,
-            draw_bbox_alloc: draw_bbox_alloc as u32,
-            drawinfo_alloc: drawinfo_alloc as u32,
-            n_trans: n_trans as u32,
-            n_path: self.n_path,
-            n_clip: self.n_clip,
-            trans_offset: trans_offset as u32,
-            linewidth_offset: linewidth_offset as u32,
-            pathtag_offset: pathtag_offset as u32,
-            pathseg_offset: pathseg_offset as u32,
-            drawtag_offset: drawtag_offset as u32,
-            drawdata_offset: drawdata_offset as u32,
-            ..Default::default()
-        };
-        (config, alloc)
-    }
-
-    pub fn write_scene(&self, buf: &mut BufWrite) {
-        buf.extend_slice(&self.drawtag_stream);
-        let n_drawobj = self.drawtag_stream.len();
-        buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE);
-        buf.extend_slice(&self.drawdata_stream);
-        buf.extend_slice(&self.transform_stream);
-        let n_trans = self.transform_stream.len();
-        buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE);
-        buf.extend_slice(&self.linewidth_stream);
-        buf.extend_slice(&self.tag_stream);
-        let n_pathtag = self.tag_stream.len();
-        buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
-        buf.extend_slice(&self.pathseg_stream);
-    }
-
-    /// The number of draw objects in the draw object stream.
-    pub(crate) fn n_drawobj(&self) -> usize {
-        self.drawtag_stream.len()
-    }
-
-    /// The number of paths.
-    pub(crate) fn n_path(&self) -> u32 {
-        self.n_path
-    }
-
-    /// The number of path segments.
-    pub(crate) fn n_pathseg(&self) -> u32 {
-        self.n_pathseg
-    }
-
-    pub(crate) fn n_transform(&self) -> usize {
-        self.transform_stream.len()
-    }
-
-    /// The number of tags in the path stream.
-    pub(crate) fn n_pathtag(&self) -> usize {
-        self.tag_stream.len()
-    }
-
-    pub(crate) fn n_clip(&self) -> u32 {
-        self.n_clip
-    }
-}
-
 /// A scene fragment encoding a glyph.
 ///
 /// This is a reduced version of the full encoder.
@@ -471,21 +330,6 @@
         self.n_path += glyph.n_path;
         self.n_pathseg += glyph.n_pathseg;
     }
-
-    pub(crate) fn scene_ref(&self) -> EncodedSceneRef<stages::Transform> {
-        EncodedSceneRef {
-            transform_stream: &self.transform_stream,
-            tag_stream: &self.tag_stream,
-            pathseg_stream: &self.pathseg_stream,
-            linewidth_stream: &self.linewidth_stream,
-            drawtag_stream: &self.drawtag_stream,
-            drawdata_stream: &self.drawdata_stream,
-            n_path: self.n_path,
-            n_pathseg: self.n_pathseg,
-            n_clip: self.n_clip,
-            ramp_data: &[],
-        }
-    }
 }
 
 fn align_up(x: usize, align: usize) -> usize {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index b3ead90..773007d 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -10,10 +10,7 @@
 
 use std::convert::TryInto;
 
-use bytemuck::Pod;
-
 pub use blend::{Blend, BlendMode, CompositionMode};
-pub use encoder::EncodedSceneRef;
 pub use render_ctx::PietGpuRenderContext;
 pub use gradient::Colrv1RadialGradient;
 
@@ -21,11 +18,11 @@
 use piet::{ImageFormat, RenderContext};
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image,
-    ImageLayout, Pipeline, QueryPool, Session,
+    include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
+    Error, Image, ImageLayout, Pipeline, QueryPool, Session,
 };
 
-use pico_svg::PicoSvg;
+pub use pico_svg::PicoSvg;
 use stages::{ClipBinding, ElementBinding, ElementCode};
 
 use crate::stages::{ClipCode, Config, ElementStage};
@@ -358,27 +355,16 @@
         render_ctx: &mut PietGpuRenderContext,
         buf_ix: usize,
     ) -> Result<(), Error> {
-        let mut scene = render_ctx.encoded_scene();
-        let ramp_data = render_ctx.get_ramp_data();
-        scene.ramp_data = &ramp_data;
-        self.upload_scene(&scene, buf_ix)
-    }
-
-    pub fn upload_scene<T: Copy + Pod>(
-        &mut self,
-        scene: &EncodedSceneRef<T>,
-        buf_ix: usize,
-    ) -> Result<(), Error> {
-        let (mut config, mut alloc) = scene.stage_config();
-        let n_drawobj = scene.n_drawobj();
+        let (mut config, mut alloc) = render_ctx.stage_config();
+        let n_drawobj = render_ctx.n_drawobj();
         // TODO: be more consistent in size types
-        let n_path = scene.n_path() as usize;
+        let n_path = render_ctx.n_path() as usize;
         self.n_paths = n_path;
-        self.n_transform = scene.n_transform();
-        self.n_drawobj = scene.n_drawobj();
-        self.n_pathseg = scene.n_pathseg() as usize;
-        self.n_pathtag = scene.n_pathtag();
-        self.n_clip = scene.n_clip();
+        self.n_transform = render_ctx.n_transform();
+        self.n_drawobj = render_ctx.n_drawobj();
+        self.n_pathseg = render_ctx.n_pathseg() as usize;
+        self.n_pathtag = render_ctx.n_pathtag();
+        self.n_clip = render_ctx.n_clip();
 
         // These constants depend on encoding and may need to be updated.
         // Perhaps we can plumb these from piet-gpu-derive?
@@ -402,18 +388,19 @@
             // TODO: reallocate scene buffer if size is inadequate
             {
                 let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
-                scene.write_scene(&mut mapped_scene);
+                render_ctx.write_scene(&mut mapped_scene);
             }
             self.config_bufs[buf_ix].write(&[config])?;
             self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
 
             // Upload gradient data.
-            if !scene.ramp_data.is_empty() {
+            let ramp_data = render_ctx.get_ramp_data();
+            if !ramp_data.is_empty() {
                 assert!(
                     self.gradient_bufs[buf_ix].size() as usize
-                        >= std::mem::size_of_val(&*scene.ramp_data)
+                        >= std::mem::size_of_val(&*ramp_data)
                 );
-                self.gradient_bufs[buf_ix].write(scene.ramp_data)?;
+                self.gradient_bufs[buf_ix].write(&ramp_data)?;
             }
         }
         Ok(())
@@ -437,10 +424,10 @@
         cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
         cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
         cmd_buf.reset_query_pool(&query_pool);
-        cmd_buf.write_timestamp(&query_pool, 0);
         cmd_buf.begin_debug_label("Element bounding box calculation");
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
         self.element_stage.record(
-            cmd_buf,
+            &mut pass,
             &self.element_code,
             &self.element_bindings[buf_ix],
             self.n_transform as u64,
@@ -448,56 +435,59 @@
             self.n_pathtag as u32,
             self.n_drawobj as u64,
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
-        cmd_buf.begin_debug_label("Clip bounding box calculation");
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
+        pass.begin_debug_label("Clip bounding box calculation");
         self.clip_binding
-            .record(cmd_buf, &self.clip_code, self.n_clip as u32);
-        cmd_buf.end_debug_label();
-        cmd_buf.begin_debug_label("Element binning");
-        cmd_buf.dispatch(
+            .record(&mut pass, &self.clip_code, self.n_clip as u32);
+        pass.end_debug_label();
+        pass.begin_debug_label("Element binning");
+        pass.dispatch(
             &self.bin_pipeline,
             &self.bin_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
-        cmd_buf.end_debug_label();
-        cmd_buf.memory_barrier();
-        cmd_buf.begin_debug_label("Tile allocation");
-        cmd_buf.dispatch(
+        pass.end_debug_label();
+        pass.memory_barrier();
+        pass.begin_debug_label("Tile allocation");
+        pass.dispatch(
             &self.tile_pipeline,
             &self.tile_ds[buf_ix],
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
-        cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 2);
-        cmd_buf.memory_barrier();
+        pass.end_debug_label();
+        pass.end();
         cmd_buf.begin_debug_label("Path flattening");
-        cmd_buf.dispatch(
+        cmd_buf.memory_barrier();
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
+        pass.dispatch(
             &self.path_pipeline,
             &self.path_ds,
             (((self.n_pathseg + 31) / 32) as u32, 1, 1),
             (32, 1, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Backdrop propagation");
-        cmd_buf.dispatch(
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
+        pass.dispatch(
             &self.backdrop_pipeline,
             &self.backdrop_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, self.backdrop_y, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 4);
         // TODO: redo query accounting
-        cmd_buf.write_timestamp(&query_pool, 5);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Coarse raster");
-        cmd_buf.dispatch(
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
+        pass.dispatch(
             &self.coarse_pipeline,
             &self.coarse_ds[buf_ix],
             (
@@ -507,11 +497,13 @@
             ),
             (256, 1, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Fine raster");
-        cmd_buf.dispatch(
+        let mut pass =
+            cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
+        pass.dispatch(
             &self.k4_pipeline,
             &self.k4_ds,
             (
@@ -521,8 +513,8 @@
             ),
             (8, 4, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }
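
The lib.rs hunks above replace each write_timestamp pair with a timed compute pass. A minimal sketch of the per-stage pattern that results (the pipeline, descriptor set, workgroup counts, and query indices here are illustrative placeholders, not taken from the patch):

    // Each stage gets a pass whose start/end timestamps land in a pair of
    // queries; dispatch and end are recorded on the pass, while barriers
    // between stages stay on the command buffer.
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
    pass.dispatch(&pipeline, &descriptor_set, (n_workgroups, 1, 1), (256, 1, 1));
    pass.end();
    cmd_buf.memory_barrier();
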
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index ad608ca..dca03eb 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 
-use crate::encoder::{EncodedSceneRef, GlyphEncoder};
+use crate::encoder::GlyphEncoder;
 use crate::stages::{Config, Transform};
 use crate::MAX_BLEND_STACK;
 use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
@@ -97,10 +97,6 @@
         self.new_encoder.stage_config()
     }
 
-    pub fn encoded_scene(&self) -> EncodedSceneRef<crate::stages::Transform> {
-        self.new_encoder.scene_ref()
-    }
-
     /// Number of draw objects.
     ///
     /// This is for the new element processing pipeline. It's not necessarily the
diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs
index 52b8bf1..5442ba3 100644
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@@ -26,7 +26,7 @@
 pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
 pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
 pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
-use piet_gpu_hal::{Buffer, CmdBuf, Session};
+use piet_gpu_hal::{Buffer, ComputePass, Session};
 pub use transform::{
     Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
 };
@@ -140,7 +140,7 @@
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &ElementCode,
         binding: &ElementBinding,
         n_transform: u64,
@@ -149,14 +149,14 @@
         n_drawobj: u64,
     ) {
         self.transform_stage.record(
-            cmd_buf,
+            pass,
             &code.transform_code,
             &binding.transform_binding,
             n_transform,
         );
         // No memory barrier needed here; path has at least one before pathseg
         self.path_stage.record(
-            cmd_buf,
+            pass,
             &code.path_code,
             &binding.path_binding,
             n_paths,
@@ -164,6 +164,6 @@
         );
         // No memory barrier needed here; draw has at least one before draw_leaf
         self.draw_stage
-            .record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj);
+            .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
     }
 }
diff --git a/piet-gpu/src/stages/clip.rs b/piet-gpu/src/stages/clip.rs
index e4bc3db..2fd195b 100644
--- a/piet-gpu/src/stages/clip.rs
+++ b/piet-gpu/src/stages/clip.rs
@@ -16,7 +16,7 @@
 
 //! The clip processing stage (includes substages).
 
-use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session};
+use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
 
 // Note that this isn't the code/stage/binding pattern of most of the other stages
 // in the new element processing pipeline. We want to move those temporary buffers
@@ -69,26 +69,26 @@
     /// Record the clip dispatches.
     ///
     /// Assumes memory barrier on entry. Provides memory barrier on exit.
-    pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) {
+    pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
         let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
         if n_wg_reduce > 0 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &self.reduce_ds,
                 (n_wg_reduce, 1, 1),
                 (CLIP_PART_SIZE, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
         let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
         if n_wg > 0 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.leaf_pipeline,
                 &self.leaf_ds,
                 (n_wg, 1, 1),
                 (CLIP_PART_SIZE, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
     }
 }
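
One point the clip stage above makes visible: memory barriers between dependent dispatches are now recorded on the ComputePass itself, so the whole reduce-then-leaf sequence fits inside a single timed pass. A hedged caller-side sketch, using the query indices the lib.rs hunk assigns to this stage:

    // Caller owns the pass; ClipBinding::record issues both dispatches
    // and the pass-level barrier between them.
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
    clip_binding.record(&mut pass, &clip_code, n_clip);
    pass.end();
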
diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs
index 21312a4..f0ee2b6 100644
--- a/piet-gpu/src/stages/draw.rs
+++ b/piet-gpu/src/stages/draw.rs
@@ -19,7 +19,7 @@
 use bytemuck::{Pod, Zeroable};
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 /// The output element of the draw object stage.
@@ -130,7 +130,7 @@
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &DrawCode,
         binding: &DrawBinding,
         size: u64,
@@ -140,22 +140,22 @@
         }
         let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
         if n_workgroups > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_workgroups as u32, 1, 1),
                 (DRAW_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.root_pipeline,
                 &self.root_ds,
                 (1, 1, 1),
                 (DRAW_WG as u32, 1, 1),
             );
         }
-        cmd_buf.memory_barrier();
-        cmd_buf.dispatch(
+        pass.memory_barrier();
+        pass.dispatch(
             &code.leaf_pipeline,
             &binding.leaf_ds,
             (n_workgroups as u32, 1, 1),
diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs
index 6c524a2..be33041 100644
--- a/piet-gpu/src/stages/path.rs
+++ b/piet-gpu/src/stages/path.rs
@@ -17,7 +17,7 @@
 //! The path stage (includes substages).
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 pub struct PathCode {
@@ -148,7 +148,7 @@
     /// those are consumed. Result is written without barrier.
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &PathCode,
         binding: &PathBinding,
         n_paths: u32,
@@ -166,15 +166,15 @@
         let reduce_part_tags = REDUCE_PART_SIZE * 4;
         let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
         if n_wg_tag_reduce > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_wg_tag_reduce, 1, 1),
                 (REDUCE_WG, 1, 1),
             );
             // I think we can skip root if n_wg_tag_reduce == 2
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.tag_root_pipeline,
                 &self.tag_root_ds,
                 (1, 1, 1),
@@ -183,15 +183,15 @@
             // No barrier needed here; clear doesn't depend on path tags
         }
         let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.clear_pipeline,
             &binding.clear_ds,
             (n_wg_clear, 1, 1),
             (CLEAR_WG, 1, 1),
         );
-        cmd_buf.memory_barrier();
+        pass.memory_barrier();
         let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.pathseg_pipeline,
             &binding.path_ds,
             (n_wg_pathseg, 1, 1),
diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs
index b21712f..8de7cee 100644
--- a/piet-gpu/src/stages/transform.rs
+++ b/piet-gpu/src/stages/transform.rs
@@ -20,7 +20,7 @@
 
 use piet::kurbo::Affine;
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 /// An affine transform.
@@ -132,7 +132,7 @@
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &TransformCode,
         binding: &TransformBinding,
         size: u64,
@@ -142,22 +142,22 @@
         }
         let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
         if n_workgroups > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_workgroups as u32, 1, 1),
                 (TRANSFORM_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.root_pipeline,
                 &self.root_ds,
                 (1, 1, 1),
                 (TRANSFORM_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.leaf_pipeline,
             &binding.leaf_ds,
             (n_workgroups as u32, 1, 1),
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index cf5a50d..bfd2af2 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -21,12 +21,7 @@
     rc.restore().unwrap();
 }
 
-pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
-    let xml_str = std::fs::read_to_string(filename).unwrap();
-    let start = std::time::Instant::now();
-    let svg = PicoSvg::load(&xml_str, scale).unwrap();
-    println!("parsing time: {:?}", start.elapsed());
-
+pub fn render_svg(rc: &mut impl RenderContext, svg: &PicoSvg) {
     let start = std::time::Instant::now();
     svg.render(rc);
     println!("flattening and encoding time: {:?}", start.elapsed());
diff --git a/tests/src/clear.rs b/tests/src/clear.rs
index fc6f063..af4b8ea 100644
--- a/tests/src/clear.rs
+++ b/tests/src/clear.rs
@@ -16,11 +16,11 @@
 
 //! Utilities (and a benchmark) for clearing buffers with compute shaders.
 
-use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
+use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
 use piet_gpu_hal::{Buffer, Pipeline};
 
 use crate::config::Config;
-use crate::runner::{Commands, Runner};
+use crate::runner::Runner;
 use crate::test_result::TestResult;
 
 const WG_SIZE: u64 = 256;
@@ -52,9 +52,9 @@
     let mut total_elapsed = 0.0;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -108,17 +108,12 @@
         ClearBinding { descriptor_set }
     }
 
-    pub unsafe fn record(
-        &self,
-        commands: &mut Commands,
-        code: &ClearCode,
-        bindings: &ClearBinding,
-    ) {
+    pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
         let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
         // An issue: for clearing large buffers (>16M), we need to check the
         // number of workgroups against the (dynamically detected) limit, and
         // potentially issue multiple dispatches.
-        commands.cmd_buf.dispatch(
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
diff --git a/tests/src/clip.rs b/tests/src/clip.rs
index 4a38949..b1f8613 100644
--- a/tests/src/clip.rs
+++ b/tests/src/clip.rs
@@ -58,11 +58,11 @@
     let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
 
     let mut commands = runner.commands();
-    commands.write_timestamp(0);
     commands.upload(&memory);
-    binding.record(&mut commands.cmd_buf, &code, n_clip as u32);
+    let mut pass = commands.compute_pass(0, 1);
+    binding.record(&mut pass, &code, n_clip as u32);
+    pass.end();
     commands.download(&memory);
-    commands.write_timestamp(1);
     runner.submit(commands);
     let dst = memory.map_read(..);
     if let Some(failure) = data.verify(&dst) {
diff --git a/tests/src/draw.rs b/tests/src/draw.rs
index 4372da4..dc82572 100644
--- a/tests/src/draw.rs
+++ b/tests/src/draw.rs
@@ -77,9 +77,9 @@
     let n_iter = config.n_iter;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands.cmd_buf, &code, &binding, n_tag);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding, n_tag);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);
diff --git a/tests/src/linkedlist.rs b/tests/src/linkedlist.rs
index 5767806..e24adcb 100644
--- a/tests/src/linkedlist.rs
+++ b/tests/src/linkedlist.rs
@@ -45,9 +45,7 @@
     for i in 0..n_iter {
         let mut commands = runner.commands();
         // Might clear only buckets to save time.
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&mem_buf);
@@ -107,12 +105,14 @@
         commands.cmd_buf.clear_buffer(out_buf, None);
         commands.cmd_buf.memory_barrier();
         let n_workgroups = N_BUCKETS / WG_SIZE;
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
+        pass.end();
     }
 }
 
diff --git a/tests/src/message_passing.rs b/tests/src/message_passing.rs
index c5d989b..39e71dc 100644
--- a/tests/src/message_passing.rs
+++ b/tests/src/message_passing.rs
@@ -59,9 +59,7 @@
     let mut failures = 0;
     for _ in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
-        commands.write_timestamp(1);
         commands.cmd_buf.memory_barrier();
         commands.download(&out_buf);
         total_elapsed += runner.submit(commands);
@@ -128,11 +126,13 @@
         commands.cmd_buf.clear_buffer(&self.data_buf, None);
         commands.cmd_buf.clear_buffer(out_buf, None);
         commands.cmd_buf.memory_barrier();
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (256, 1, 1),
             (256, 1, 1),
         );
+        pass.end();
     }
 }
diff --git a/tests/src/path.rs b/tests/src/path.rs
index bf72c68..9d794e1 100644
--- a/tests/src/path.rs
+++ b/tests/src/path.rs
@@ -105,15 +105,15 @@
         let mut commands = runner.commands();
         commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
         commands.cmd_buf.memory_barrier();
-        commands.write_timestamp(0);
+        let mut pass = commands.compute_pass(0, 1);
         stage.record(
-            &mut commands.cmd_buf,
+            &mut pass,
             &code,
             &binding,
             path_data.n_path,
             path_data.tags.len() as u32,
         );
-        commands.write_timestamp(1);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);
diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs
index 4174d8d..dbaf256 100644
--- a/tests/src/prefix.rs
+++ b/tests/src/prefix.rs
@@ -85,9 +85,7 @@
     let mut total_elapsed = 0.0;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -159,12 +157,14 @@
         let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
         commands.cmd_buf.clear_buffer(&self.state_buf, None);
         commands.cmd_buf.memory_barrier();
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
+        pass.end();
         // One thing that's missing here is registering the buffers so
         // they can be safely dropped by Rust code before the execution
         // of the command buffer completes.
diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs
index 24be2af..3c9c813 100644
--- a/tests/src/prefix_tree.rs
+++ b/tests/src/prefix_tree.rs
@@ -66,9 +66,7 @@
         let mut commands = runner.commands();
         commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
         commands.cmd_buf.memory_barrier();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -175,33 +173,35 @@
         code: &PrefixTreeCode,
         bindings: &PrefixTreeBinding,
     ) {
+        let mut pass = commands.compute_pass(0, 1);
         let n = self.tmp_bufs.len();
         for i in 0..n {
             let n_workgroups = self.sizes[i + 1];
-            commands.cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &bindings.descriptor_sets[i],
                 (n_workgroups as u32, 1, 1),
                 (WG_SIZE as u32, 1, 1),
             );
-            commands.cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
-        commands.cmd_buf.dispatch(
+        pass.dispatch(
             &code.root_pipeline,
             &bindings.descriptor_sets[n],
             (1, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
         for i in (0..n).rev() {
-            commands.cmd_buf.memory_barrier();
+            pass.memory_barrier();
             let n_workgroups = self.sizes[i + 1];
-            commands.cmd_buf.dispatch(
+            pass.dispatch(
                 &code.scan_pipeline,
                 &bindings.descriptor_sets[2 * n - i],
                 (n_workgroups as u32, 1, 1),
                 (WG_SIZE as u32, 1, 1),
             );
         }
+        pass.end();
     }
 }
 
diff --git a/tests/src/runner.rs b/tests/src/runner.rs
index 1fd6774..3ba8223 100644
--- a/tests/src/runner.rs
+++ b/tests/src/runner.rs
@@ -20,8 +20,8 @@
 
 use bytemuck::Pod;
 use piet_gpu_hal::{
-    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
-    QueryPool, Session,
+    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
+    ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
 };
 
 pub struct Runner {
@@ -118,8 +118,14 @@
 }
 
 impl Commands {
-    pub unsafe fn write_timestamp(&mut self, query: u32) {
-        self.cmd_buf.write_timestamp(&self.query_pool, query);
+    /// Start a compute pass with timer queries.
+    pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
+        self.cmd_buf
+            .begin_compute_pass(&ComputePassDescriptor::timer(
+                &self.query_pool,
+                start_query,
+                end_query,
+            ))
     }
 
     pub unsafe fn upload(&mut self, buf: &BufStage) {
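
The compute_pass helper above wraps begin_compute_pass with a timer descriptor over the runner's query pool; callers end the pass before touching cmd_buf again. A minimal usage sketch following the test files in this patch (stage, code, binding, and out_buf are placeholders):

    let mut commands = runner.commands();
    let mut pass = commands.compute_pass(0, 1);
    stage.record(&mut pass, &code, &binding);
    pass.end(); // end the pass before using commands.cmd_buf again
    commands.cmd_buf.memory_barrier();
    commands.download(&out_buf);
    let elapsed = runner.submit(commands);
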
diff --git a/tests/src/transform.rs b/tests/src/transform.rs
index 6edcc3f..43bfc67 100644
--- a/tests/src/transform.rs
+++ b/tests/src/transform.rs
@@ -61,9 +61,9 @@
     let n_iter = config.n_iter;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding, n_elements);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);