Address review feedback

Mostly minor changes in response to review. The main behavior change is to preserve use_cpu when hot-reloading shaders (which reloads only the GPU shaders).
diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index f982a22..b3b6317 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -73,6 +73,7 @@
     let mut render_cx = render_cx;
     #[cfg(not(target_arch = "wasm32"))]
     let mut render_state = None::<RenderState>;
+    let use_cpu = args.use_cpu;
     // The design of `RenderContext` forces delayed renderer initialisation to
     // not work on wasm, as WASM futures effectively must be 'static.
     // Otherwise, this could work by sending the result to event_loop.proxy
@@ -87,7 +88,7 @@
                 &RendererOptions {
                     surface_format: Some(render_state.surface.format),
                     timestamp_period: render_cx.devices[id].queue.get_timestamp_period(),
-                    use_cpu: false,
+                    use_cpu: use_cpu,
                 },
             )
             .expect("Could create renderer"),
@@ -129,7 +130,6 @@
     let mut profile_stored = None;
     let mut prev_scene_ix = scene_ix - 1;
     let mut profile_taken = Instant::now();
-    let use_cpu = args.use_cpu;
     // _event_loop is used on non-wasm platforms to create new windows
     event_loop.run(move |event, _event_loop, control_flow| match event {
         Event::WindowEvent {
diff --git a/src/cpu_dispatch.rs b/src/cpu_dispatch.rs
index 87977af..0b8bbc8 100644
--- a/src/cpu_dispatch.rs
+++ b/src/cpu_dispatch.rs
@@ -1,3 +1,6 @@
+// Copyright 2023 The Vello authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 //! Support for CPU implementations of compute shaders.
 
 use std::{
diff --git a/src/engine.rs b/src/engine.rs
index e30a471..03535b3 100644
--- a/src/engine.rs
+++ b/src/engine.rs
@@ -409,6 +409,14 @@
                     // println!("dispatching {:?} with {} bindings", wg_size, bindings.len());
                     let shader = &self.shaders[shader_id.0];
                     if let Some(cpu_shader) = shader.cpu_shader {
+                        // The current strategy is to run the CPU shader synchronously. This
+                        // works because there is currently the added constraint that data
+                        // can only flow from CPU to GPU, not the other way around. If and
+                        // when we implement that, we will need to defer the execution. Of
+                        // course, we will also need to wire up more async synchronization
+                        // mechanisms, as the CPU dispatch can't run until the preceding
+                        // command buffer submission completes (and, in WebGPU, the async
+                        // mapping operations on the buffers complete).
                         let resources =
                             transient_map.create_cpu_resources(&mut self.bind_map, bindings);
                         cpu_shader(wg_size.0, &resources);
@@ -435,6 +443,7 @@
                 Command::DispatchIndirect(shader_id, proxy, offset, bindings) => {
                     let shader = &self.shaders[shader_id.0];
                     if let Some(cpu_shader) = shader.cpu_shader {
+                        // Same consideration as above about running the CPU shader synchronously.
                         let n_wg;
                         if let CpuBinding::BufferRW(b) = self.bind_map.get_cpu_buf(proxy.id) {
                             let slice = b.borrow();
@@ -496,9 +505,7 @@
                                 if let Some(size) = size {
                                     slice = &mut slice[..size.get() as usize];
                                 }
-                                for x in slice {
-                                    *x = 0;
-                                }
+                                slice.fill(0);
                             }
                         }
                     } else {
@@ -860,6 +867,11 @@
 }
 
 impl BindMapBuffer {
+    // Upload a buffer from CPU to GPU if needed.
+    //
+    // Note data flow is one way only, from CPU to GPU. Once this method is
+    // called, the buffer is no longer materialized on CPU, and cannot be
+    // accessed from a CPU shader.
     fn upload_if_needed(
         &mut self,
         proxy: &BufProxy,
@@ -1031,6 +1043,7 @@
             match resource {
                 ResourceProxy::Buf(buf) => match self.bufs.get(&buf.id) {
                     Some(TransientBuf::Cpu(_)) => (),
+                    Some(TransientBuf::Gpu(_)) => panic!("buffer was already materialized on GPU"),
                     _ => bind_map.materialize_cpu_buf(buf),
                 },
                 ResourceProxy::Image(_) => todo!(),