Fix timer queries in Vulkan and DX12 backends

Current status: the piet-gpu-hal module (including the collatz example)
has the new API (with queries set on the compute pass) implemented. The
other call sites have not yet been updated.

On Metal, only M1 is tested. The "command" counter style is partly
implemented, but not fully wired up.
diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index 78ad449..c5e1e04 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -21,7 +21,7 @@
 
 use smallvec::SmallVec;
 
-use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
+use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
 
 use self::{
     descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@@ -76,6 +76,7 @@
     c: wrappers::GraphicsCommandList,
     allocator: CommandAllocator,
     needs_reset: bool,
+    end_query: Option<(wrappers::QueryHeap, u32)>,
 }
 
 pub struct Pipeline {
@@ -360,6 +361,7 @@
                 c,
                 allocator,
                 needs_reset: false,
+                end_query: None,
             })
         }
     }
@@ -388,11 +390,10 @@
         let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
         std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
         self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
-        let ts0 = buf[0];
         let tsp = (self.ts_freq as f64).recip();
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -610,6 +611,16 @@
         self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Dx12(pool) = pool {
+                self.write_timestamp(pool, *start);
+                self.end_query = Some((pool.heap.clone(), *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -628,6 +639,12 @@
             .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((heap, end)) = self.end_query.take() {
+            self.c.end_timing_query(&heap, end);
+        }
+    }
+
     unsafe fn memory_barrier(&mut self) {
         // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
         // The "proper" way to do this would be to name the actual buffers participating
@@ -666,7 +683,7 @@
         self.memory_barrier();
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
         let (gpu_ref, heap) = buffer
             .gpu_ref
@@ -684,23 +701,23 @@
         );
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         // TODO: consider using copy_resource here (if sizes match)
         let size = src.size.min(dst.size);
         self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         self.c
             .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         self.c
             .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         self.c.copy_resource(&src.resource, &dst.resource);
     }
 
diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs
index 4bbb86c..9a3fb90 100644
--- a/piet-gpu-hal/src/dx12/wrappers.rs
+++ b/piet-gpu-hal/src/dx12/wrappers.rs
@@ -79,7 +79,6 @@
 #[derive(Clone)]
 pub struct ShaderByteCode {
     pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
-    blob: Option<Blob>,
 }
 
 #[derive(Clone)]
@@ -741,7 +740,6 @@
                 BytecodeLength: blob.0.GetBufferSize(),
                 pShaderBytecode: blob.0.GetBufferPointer(),
             },
-            blob: Some(blob),
         }
     }
 
@@ -810,7 +808,6 @@
                 BytecodeLength: bytecode.len(),
                 pShaderBytecode: bytecode.as_ptr() as *const _,
             },
-            blob: None,
         }
     }
 }
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 37c59df..5c7122a 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -375,8 +375,17 @@
     ///
     /// This should be called after waiting on the command buffer that wrote the
     /// timer queries.
+    ///
+    /// The returned vector is one shorter than the number of timer queries in the
+    /// pool; the first value is subtracted off. It would likely be better to return
+    /// the raw timestamps, but that change should be made consistently.
     pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
-        self.0.device.fetch_query_pool(pool)
+        let result = self.0.device.fetch_query_pool(pool)?;
+        // Subtract off first timestamp.
+        Ok(result[1..]
+            .iter()
+            .map(|ts| *ts as f64 - result[0])
+            .collect())
     }
 
     #[doc(hidden)]
@@ -602,6 +611,10 @@
     /// Write a timestamp.
     ///
     /// The query index must be less than the size of the query pool on creation.
+    ///
+    /// Deprecation: for greater portability, set timestamp queries on compute
+    /// passes instead.
+    #[deprecated(note = "use compute pass descriptor instead")]
     pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
         self.cmd_buf().write_timestamp(pool, query);
     }
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 241cdfd..18f6390 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -190,9 +190,15 @@
     pub max_invocations: u32,
 }
 
+/// Options for creating a compute pass.
 #[derive(Default)]
 pub struct ComputePassDescriptor<'a> {
     // Maybe label should go here? It does in wgpu and wgpu_hal.
+    /// Timer query parameters.
+    ///
+    /// To record timer queries for a compute pass, set the query pool, start
+    /// query index, and end query index here. The indices must be less than
+    /// the size of the query pool.
     timer_queries: Option<(&'a QueryPool, u32, u32)>,
 }
 
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 8392899..504d947 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -15,7 +15,7 @@
 use crate::backend::Device as DeviceTrait;
 use crate::{
     BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
-    WorkgroupLimits,
+    WorkgroupLimits, ComputePassDescriptor,
 };
 
 pub struct VkInstance {
@@ -92,6 +92,7 @@
     cmd_buf: vk::CommandBuffer,
     cmd_pool: vk::CommandPool,
     device: Arc<RawDevice>,
+    end_query: Option<(vk::QueryPool, u32)>,
 }
 
 pub struct QueryPool {
@@ -738,6 +739,7 @@
                 cmd_buf,
                 cmd_pool,
                 device: self.device.clone(),
+                end_query: None,
             })
         }
     }
@@ -770,11 +772,10 @@
         // results (Windows 10, AMD 5700 XT).
         let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
         device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
-        let ts0 = buf[0];
         let tsp = self.timestamp_period as f64 * 1e-9;
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -902,6 +903,16 @@
         true
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Vk(pool) = pool {
+                self.write_timestamp_raw(pool.pool, *start);
+                self.end_query = Some((pool.pool, *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -931,6 +942,12 @@
         );
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((pool, end)) = self.end_query.take() {
+            self.write_timestamp_raw(pool, end);
+        }
+    }
+
     /// Insert a pipeline barrier for all memory accesses.
     unsafe fn memory_barrier(&mut self) {
         let device = &self.device.device;
@@ -995,13 +1012,13 @@
         );
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let device = &self.device.device;
         let size = size.unwrap_or(vk::WHOLE_SIZE);
         device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         let device = &self.device.device;
         let size = src.size.min(dst.size);
         device.cmd_copy_buffer(
@@ -1012,7 +1029,7 @@
         );
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         let device = &self.device.device;
         device.cmd_copy_image_to_buffer(
             self.cmd_buf,
@@ -1035,7 +1052,7 @@
         );
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         let device = &self.device.device;
         device.cmd_copy_buffer_to_image(
             self.cmd_buf,
@@ -1058,7 +1075,7 @@
         );
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         let device = &self.device.device;
         device.cmd_blit_image(
             self.cmd_buf,
@@ -1106,13 +1123,7 @@
     }
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        let device = &self.device.device;
-        device.cmd_write_timestamp(
-            self.cmd_buf,
-            vk::PipelineStageFlags::COMPUTE_SHADER,
-            pool.pool,
-            query,
-        );
+        self.write_timestamp_raw(pool.pool, query);
     }
 
     unsafe fn begin_debug_label(&mut self, label: &str) {
@@ -1130,6 +1141,18 @@
     }
 }
 
+impl CmdBuf {
+    unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
+        let device = &self.device.device;
+        device.cmd_write_timestamp(
+            self.cmd_buf,
+            vk::PipelineStageFlags::COMPUTE_SHADER,
+            pool,
+            query,
+        );
+    }
+}
+
 impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.buffers.extend(buffers.iter().map(|b| b.buffer));