Fixes for mac

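Fix incorrect workgroup sizes, and change the strategy for assigning
binding numbers in the Metal backend: texture indices now continue on
from the last buffer index instead of restarting at 0. Ultimately we
should get correct values for those from shader compilation, but this
works for now.

Also make destroy_image a no-op instead of hitting todo!(), and skip the
per-kernel timing output when the query pool returns no timestamps.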
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index e12cef2..45e0406 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -277,7 +277,8 @@
     }
 
     unsafe fn destroy_image(&self, _image: &Self::Image) -> Result<(), Error> {
-        todo!()
+        // TODO figure out what we want to do here
+        Ok(())
     }
 
     unsafe fn create_compute_pipeline(
@@ -429,7 +430,7 @@
             encoder.set_buffer(buf_ix, Some(&buffer.buffer), 0);
             buf_ix += 1;
         }
-        let mut img_ix = 0;
+        let mut img_ix = buf_ix;
         for image in &descriptor_set.images {
             encoder.set_texture(img_ix, Some(&image.texture));
             img_ix += 1;
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 469a01e..70023af 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -264,22 +264,23 @@
         submitted.wait()?;
         println!("elapsed = {:?}", start.elapsed());
         let ts = session.fetch_query_pool(&query_pool).unwrap();
-        println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
-        println!(
-            "Tile allocation kernel time: {:.3}ms",
-            (ts[1] - ts[0]) * 1e3
-        );
-        println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
-        println!("Backdrop kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
-        println!("Binning kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
-        println!("Coarse raster kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
-        println!("Render kernel time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
+        if !ts.is_empty() {
+            println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
+            println!(
+                "Tile allocation kernel time: {:.3}ms",
+                (ts[1] - ts[0]) * 1e3
+            );
+            println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
+            println!("Backdrop kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
+            println!("Binning kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
+            println!("Coarse raster kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
+            println!("Render kernel time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
+        }
 
         /*
         let mut data: Vec<u32> = Default::default();
         renderer.memory_buf_dev.read(&mut data).unwrap();
         piet_gpu::dump_k1_data(&data[2..]);
-        trace_ptcl(&data);
         */
 
         let mut img_data: Vec<u8> = Default::default();
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index bff0f70..f12a1cf 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -79,17 +79,19 @@
                     if let Some(submitted) = submitted[frame_idx].take() {
                         cmd_bufs[frame_idx] = submitted.wait().unwrap();
                         let ts = session.fetch_query_pool(&query_pools[frame_idx]).unwrap();
-                        info_string = format!(
-                            "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
-                            ts[6] * 1e3,
-                            ts[0] * 1e3,
-                            (ts[1] - ts[0]) * 1e3,
-                            (ts[2] - ts[1]) * 1e3,
-                            (ts[3] - ts[2]) * 1e3,
-                            (ts[4] - ts[3]) * 1e3,
-                            (ts[5] - ts[4]) * 1e3,
-                            (ts[6] - ts[5]) * 1e3,
-                        );
+                        if !ts.is_empty() {
+                            info_string = format!(
+                                "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
+                                ts[6] * 1e3,
+                                ts[0] * 1e3,
+                                (ts[1] - ts[0]) * 1e3,
+                                (ts[2] - ts[1]) * 1e3,
+                                (ts[3] - ts[2]) * 1e3,
+                                (ts[4] - ts[3]) * 1e3,
+                                (ts[5] - ts[4]) * 1e3,
+                                (ts[6] - ts[5]) * 1e3,
+                            );
+                        }
                     }
 
                     let mut ctx = PietGpuRenderContext::new();
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 127f805..ad13c24 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -81,6 +81,7 @@
 
     backdrop_pipeline: Pipeline,
     backdrop_ds: DescriptorSet,
+    backdrop_y: u32,
 
     bin_pipeline: Pipeline,
     bin_ds: DescriptorSet,
@@ -170,12 +171,13 @@
         let path_ds = session
             .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
 
-        let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
-            include_shader!(session, "../shader/gen/backdrop_lg")
-        } else {
-            println!("using small workgroup backdrop kernel");
-            include_shader!(session, "../shader/gen/backdrop")
-        };
+        let (backdrop_code, backdrop_y) =
+            if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
+                (include_shader!(session, "../shader/gen/backdrop_lg"), 4)
+            } else {
+                println!("using small workgroup backdrop kernel");
+                (include_shader!(session, "../shader/gen/backdrop"), 1)
+            };
         let backdrop_pipeline = session
             .create_compute_pipeline(backdrop_code, &[BindType::Buffer, BindType::BufReadOnly])?;
         let backdrop_ds = session
@@ -243,6 +245,7 @@
             path_ds,
             backdrop_pipeline,
             backdrop_ds,
+            backdrop_y,
             bin_pipeline,
             bin_ds,
             coarse_pipeline,
@@ -367,7 +370,7 @@
             &self.backdrop_pipeline,
             &self.backdrop_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
-            (256, 1, 1),
+            (256, self.backdrop_y, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 4);
         // Note: this barrier is not needed as an actual dependency between
@@ -390,7 +393,7 @@
                 (self.height as u32 + 255) / 256,
                 1,
             ),
-            (256, 256, 1),
+            (256, 1, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();