Clean up command buffers

This patch deallocates command buffers after command submission completes (at the same time as other resources are released).

It should be portable and robust on all back-ends, though not necessarily the most efficient. Reuse of command buffers, as well as more efficient allocation on Vulkan and DX12, is left for follow-up work.
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index 7b1d59f..926f43a 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -16,7 +16,7 @@
 
 //! The generic trait for backends to implement.
 
-use crate::{mux::ShaderCode, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
 
 pub trait Device: Sized {
     type Buffer: 'static;
@@ -105,6 +105,9 @@
 
     fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error>;
 
+    /// If the command buffer was submitted, it must complete before this is called.
+    unsafe fn destroy_cmd_buf(&self, cmd_buf: Self::CmdBuf) -> Result<(), Error>;
+
     fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error>;
 
     /// Get results from query pool, destroying it in the process.
@@ -158,6 +161,7 @@
 
     unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error>;
     unsafe fn create_fence(&self, signaled: bool) -> Result<Self::Fence, Error>;
+    unsafe fn destroy_fence(&self, fence: Self::Fence) -> Result<(), Error>;
     unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error>;
     unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error>;
 
diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index bcef409..3fa57b4 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -316,6 +316,10 @@
         }
     }
 
+    unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
+        Ok(())
+    }
+
     fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
         unsafe {
             let heap = self
@@ -409,6 +413,10 @@
         Ok(Fence { fence, event, val })
     }
 
+    unsafe fn destroy_fence(&self, _fence: Self::Fence) -> Result<(), Error> {
+        Ok(())
+    }
+
     unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error> {
         for fence in fences {
             // TODO: probably handle errors here.
diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs
index 800959f..7acd8ac 100644
--- a/piet-gpu-hal/src/dx12/wrappers.rs
+++ b/piet-gpu-hal/src/dx12/wrappers.rs
@@ -661,6 +661,10 @@
         Ok(Fence(ComPtr::from_raw(fence)))
     }
 
+    pub unsafe fn destroy_fence(&self, fence: &Fence) -> Result<(), Error> {
+        Ok(())
+    }
+
     pub unsafe fn create_committed_resource(
         &self,
         heap_properties: &d3d12::D3D12_HEAP_PROPERTIES,
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index ec6de6d..d53833f 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -30,8 +30,13 @@
 
 struct SessionInner {
     device: mux::Device,
+    /// A pool of command buffers that can be reused.
+    ///
+    /// Currently this is not used, as it only works well on Vulkan. At some
+    /// point, we will want to efficiently reuse command buffers rather than
+    /// allocating them each time, but that is a TODO.
     cmd_buf_pool: Mutex<Vec<(mux::CmdBuf, Fence)>>,
-    /// Command buffers that are still pending (so resources can't be freed).
+    /// Command buffers that are still pending (so resources can't be freed yet).
     pending: Mutex<Vec<SubmittedCmdBufInner>>,
     /// A command buffer that is used for copying from staging buffers.
     staging_cmd_buf: Mutex<Option<CmdBuf>>,
@@ -148,10 +153,8 @@
         let (cmd_buf, fence) = if let Some(cf) = self.0.cmd_buf_pool.lock().unwrap().pop() {
             cf
         } else {
-            println!("allocating cmd buf..");
             let cmd_buf = self.0.device.create_cmd_buf()?;
             let fence = unsafe { self.0.device.create_fence(false)? };
-            println!("done");
             (cmd_buf, fence)
         };
         Ok(CmdBuf {
@@ -171,19 +174,7 @@
                     let mut item = pending.swap_remove(i);
                     // TODO: wait is superfluous, can just reset
                     let _ = self.0.device.wait_and_reset(vec![&mut item.fence]);
-
-                    // Reuse of command buffers works on Vulkan, but not at all on
-                    // Metal and is problematic on DX12 (the allocator is returned)
-                    // to the pool. Punt for now.
-                    
-                    let mut pool = self.0.cmd_buf_pool.lock().unwrap();
-                    pool.push((item.cmd_buf, item.fence));
-                    std::mem::drop(item.resources);
-                    if let Some(staging_cmd_buf) = item.staging_cmd_buf {
-                        pool.push((staging_cmd_buf.cmd_buf, staging_cmd_buf.fence));
-                        std::mem::drop(staging_cmd_buf.resources);
-                    }
-                    
+                    self.0.cleanup_submitted_cmd_buf(item);
                 } else {
                     i += 1;
                 }
@@ -395,6 +386,25 @@
     }
 }
 
+impl SessionInner {
+    /// Clean up a submitted command buffer.
+    ///
+    /// This drops the resources used by the command buffer and also cleans up the command
+    /// buffer itself. Currently that means destroying it, but at some point we'll want to
+    /// be better at reuse.
+    unsafe fn cleanup_submitted_cmd_buf(&self, item: SubmittedCmdBufInner) {
+        let _should_handle_err = self.device.destroy_cmd_buf(item.cmd_buf);
+        let _should_handle_err = self.device.destroy_fence(item.fence);
+
+        std::mem::drop(item.resources);
+        if let Some(staging_cmd_buf) = item.staging_cmd_buf {
+            let _should_handle_err = self.device.destroy_cmd_buf(staging_cmd_buf.cmd_buf);
+            let _should_handle_err = self.device.destroy_fence(staging_cmd_buf.fence);
+            std::mem::drop(staging_cmd_buf.resources);
+        }
+    }
+}
+
 impl CmdBuf {
     /// Begin recording into a command buffer.
     ///
@@ -569,14 +579,8 @@
         if let Some(session) = Weak::upgrade(&self.1) {
             unsafe {
                 session.device.wait_and_reset(vec![&mut item.fence])?;
+                session.cleanup_submitted_cmd_buf(item);
             }
-            // See discussion in `poll_cleanup`
-            session
-                .cmd_buf_pool
-                .lock()
-                .unwrap()
-                .push((item.cmd_buf, item.fence));
-            std::mem::drop(item.resources);
         }
         // else session dropped error?
         Ok(())
diff --git a/piet-gpu-hal/src/macros.rs b/piet-gpu-hal/src/macros.rs
index 8131e50..38897a8 100644
--- a/piet-gpu-hal/src/macros.rs
+++ b/piet-gpu-hal/src/macros.rs
@@ -72,6 +72,16 @@
                     }
                 }
             }
+            $crate::mux_cfg! {
+                #[cfg(vk)]
+                #[allow(unused)]
+                fn vk_owned(self) -> $vk {
+                    match self {
+                        $name::Vk(x) => x,
+                        _ => panic!("downcast error")
+                    }
+                }
+            }
 
             $crate::mux_cfg! {
                 #[cfg(dx12)]
@@ -93,6 +103,16 @@
                     }
                 }
             }
+            $crate::mux_cfg! {
+                #[cfg(dx12)]
+                #[allow(unused)]
+                fn dx12_owned(self) -> $dx12 {
+                    match self {
+                        $name::Dx12(x) => x,
+                        _ => panic!("downcast error")
+                    }
+                }
+            }
 
             $crate::mux_cfg! {
                 #[cfg(mtl)]
@@ -112,6 +132,15 @@
                     }
                 }
             }
+            $crate::mux_cfg! {
+                #[cfg(mtl)]
+                #[allow(unused)]
+                fn mtl_owned(self) -> $mtl {
+                    match self {
+                        $name::Mtl(x) => x,
+                    }
+                }
+            }
         }
     };
 }
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index b8a1bf9..dbfc8d9 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -289,6 +289,10 @@
         Ok(CmdBuf { cmd_buf })
     }
 
+    unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
+        Ok(())
+    }
+
     fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
         // TODO
         Ok(QueryPool)
@@ -365,6 +369,10 @@
         Ok(Fence::Idle)
     }
 
+    unsafe fn destroy_fence(&self, _fence: Self::Fence) -> Result<(), Error> {
+        Ok(())
+    }
+
     unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error> {
         for fence in fences {
             match fence {
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index 8f93eb6..31d2f4c 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -242,6 +242,14 @@
         }
     }
 
+    pub unsafe fn destroy_fence(&self, fence: Fence) -> Result<(), Error> {
+        mux_match! { self;
+            Device::Vk(d) => d.destroy_fence(fence.vk_owned()),
+            Device::Dx12(d) => d.destroy_fence(fence.dx12_owned()),
+            Device::Mtl(d) => d.destroy_fence(fence.mtl_owned()),
+        }
+    }
+
     // Consider changing Vec to iterator (as is done in gfx-hal)
     pub unsafe fn wait_and_reset(&self, fences: Vec<&mut Fence>) -> Result<(), Error> {
         mux_match! { self;
@@ -309,6 +317,14 @@
         }
     }
 
+    pub unsafe fn destroy_cmd_buf(&self, cmd_buf: CmdBuf) -> Result<(), Error> {
+        mux_match! { self;
+            Device::Vk(d) => d.destroy_cmd_buf(cmd_buf.vk_owned()),
+            Device::Dx12(d) => d.destroy_cmd_buf(cmd_buf.dx12_owned()),
+            Device::Mtl(d) => d.destroy_cmd_buf(cmd_buf.mtl_owned()),
+        }
+    }
+
     pub fn create_query_pool(&self, n_queries: u32) -> Result<QueryPool, Error> {
         mux_match! { self;
             Device::Vk(d) => d.create_query_pool(n_queries).map(QueryPool::Vk),
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 96cbc54..5b0dfd6 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -89,6 +89,7 @@
 
 pub struct CmdBuf {
     cmd_buf: vk::CommandBuffer,
+    cmd_pool: vk::CommandPool,
     device: Arc<RawDevice>,
 }
 
@@ -620,6 +621,12 @@
         Ok(device.create_fence(&vk::FenceCreateInfo::builder().flags(flags).build(), None)?)
     }
 
+    unsafe fn destroy_fence(&self, fence: Self::Fence) -> Result<(), Error> {
+        let device = &self.device.device;
+        device.destroy_fence(fence, None);
+        Ok(())
+    }
+
     unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error> {
         let device = &self.device.device;
         Ok(device.create_semaphore(&vk::SemaphoreCreateInfo::default(), None)?)
@@ -658,7 +665,7 @@
     fn create_cmd_buf(&self) -> Result<CmdBuf, Error> {
         unsafe {
             let device = &self.device.device;
-            let command_pool = device.create_command_pool(
+            let cmd_pool = device.create_command_pool(
                 &vk::CommandPoolCreateInfo::builder()
                     .flags(vk::CommandPoolCreateFlags::RESET_COMMAND_BUFFER)
                     .queue_family_index(self.qfi),
@@ -666,17 +673,24 @@
             )?;
             let cmd_buf = device.allocate_command_buffers(
                 &vk::CommandBufferAllocateInfo::builder()
-                    .command_pool(command_pool)
+                    .command_pool(cmd_pool)
                     .level(vk::CommandBufferLevel::PRIMARY)
                     .command_buffer_count(1),
             )?[0];
             Ok(CmdBuf {
                 cmd_buf,
+                cmd_pool,
                 device: self.device.clone(),
             })
         }
     }
 
+    unsafe fn destroy_cmd_buf(&self, cmd_buf: CmdBuf) -> Result<(), Error> {
+        let device = &self.device.device;
+        device.destroy_command_pool(cmd_buf.cmd_pool, None);
+        Ok(())
+    }
+
     /// Create a query pool for timestamp queries.
     fn create_query_pool(&self, n_queries: u32) -> Result<QueryPool, Error> {
         unsafe {
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index 580d591..47ace66 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -2,19 +2,12 @@
 
 use rand::{Rng, RngCore};
 
-<<<<<<< HEAD
 use piet::kurbo::{BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
     Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
 };
 
 use crate::{PicoSvg, RenderContext, Vec2};
-=======
-use piet::{Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder};
-use piet::kurbo::{BezPath, Circle, Line, Point, Rect, Shape};
-
-use crate::{RenderContext, PicoSvg, Vec2};
->>>>>>> Animating scene
 
 const N_CIRCLES: usize = 0;
 
@@ -199,14 +192,10 @@
 }
 
 pub fn render_anim_frame(rc: &mut impl RenderContext, i: usize) {
-<<<<<<< HEAD
     rc.fill(
         Rect::new(0.0, 0.0, 1000.0, 1000.0),
         &Color::rgb8(128, 128, 128),
     );
-=======
-    rc.fill(Rect::new(0.0, 0.0, 1000.0, 1000.0), &Color::rgb8(128, 128, 128));
->>>>>>> Animating scene
     let text_size = 60.0 + 40.0 * (0.01 * i as f64).sin();
     rc.save().unwrap();
     //rc.transform(Affine::new([0.2, 0.0, 0.0, -0.2, 200.0, 800.0]));
@@ -224,8 +213,4 @@
     let p1 = center + 400.0 * Vec2::from_angle(th);
     let line = Line::new(center, p1);
     rc.stroke(line, &Color::rgb8(128, 0, 0), 5.0);
-<<<<<<< HEAD
 }
-=======
-}
->>>>>>> Animating scene