Start async wiring

Add async versions of the main library entry points (`render_to_texture_async` and `render_to_surface_async`), and invoke them from the `with_winit` example.

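Right now this only prints the contents of the bump buffer and then unconditionally runs the fine dispatch, but in the future it could use those contents to apply conditional logic (for example, deciding whether the coarse phase needs to be rerun).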
diff --git a/examples/with_winit/src/main.rs b/examples/with_winit/src/main.rs
index 4046538..f12f8c3 100644
--- a/examples/with_winit/src/main.rs
+++ b/examples/with_winit/src/main.rs
@@ -22,6 +22,7 @@
 
 use clap::Parser;
 use vello::{
+    block_on_wgpu,
     kurbo::{Affine, Vec2},
     util::RenderContext,
     Renderer, Scene, SceneBuilder,
@@ -187,18 +188,25 @@
                 .surface
                 .get_current_texture()
                 .expect("failed to get surface texture");
-            renderer
-                .render_to_surface(
-                    &device_handle.device,
-                    &device_handle.queue,
-                    &scene,
-                    &surface_texture,
-                    width,
-                    height,
-                )
-                .expect("failed to render to surface");
-            surface_texture.present();
-            device_handle.device.poll(wgpu::Maintain::Wait);
+            let fut = async {
+                renderer
+                    .render_to_surface_async(
+                        &device_handle.device,
+                        &device_handle.queue,
+                        &scene,
+                        &surface_texture,
+                        width,
+                        height,
+                    )
+                    .await
+                    .expect("failed to render to surface");
+                surface_texture.present();
+            };
+            #[cfg(not(target_arch = "wasm32"))]
+            block_on_wgpu(&device_handle.device, fut);
+            #[cfg(target_arch = "wasm32")]
+            wasm_bindgen_futures::spawn_local(fut);
+            device_handle.device.poll(wgpu::Maintain::Poll);
         }
         Event::UserEvent(event) => match event {
             #[cfg(not(target_arch = "wasm32"))]
diff --git a/src/engine.rs b/src/engine.rs
index d4d9c91..76d7524 100644
--- a/src/engine.rs
+++ b/src/engine.rs
@@ -21,12 +21,9 @@
     sync::atomic::{AtomicU64, Ordering},
 };
 
-use futures_intrusive::channel::shared::GenericOneshotReceiver;
-use parking_lot::RawMutex;
 use wgpu::{
-    util::DeviceExt, BindGroup, BindGroupLayout, Buffer, BufferAsyncError, BufferSlice,
-    BufferUsages, BufferView, ComputePipeline, Device, Queue, Texture, TextureAspect,
-    TextureFormat, TextureUsages, TextureView, TextureViewDimension,
+    util::DeviceExt, BindGroup, BindGroupLayout, Buffer, BufferUsages, ComputePipeline, Device,
+    Queue, Texture, TextureAspect, TextureFormat, TextureUsages, TextureView, TextureViewDimension,
 };
 
 pub type Error = Box<dyn std::error::Error>;
@@ -43,6 +40,7 @@
     shaders: Vec<Shader>,
     pool: ResourcePool,
     bind_map: BindMap,
+    downloads: HashMap<Id, Buffer>,
 }
 
 struct Shader {
@@ -101,11 +99,6 @@
     FreeImage(ImageProxy),
 }
 
-#[derive(Default)]
-pub struct Downloads {
-    buf_map: HashMap<Id, Buffer>,
-}
-
 /// The type of resource that will be bound to a slot in a shader.
 #[derive(Clone, Copy, PartialEq, Eq)]
 pub enum BindType {
@@ -153,6 +146,7 @@
             shaders: vec![],
             pool: Default::default(),
             bind_map: Default::default(),
+            downloads: Default::default(),
         }
     }
 
@@ -253,8 +247,7 @@
         queue: &Queue,
         recording: &Recording,
         external_resources: &[ExternalResource],
-    ) -> Result<Downloads, Error> {
-        let mut downloads = Downloads::default();
+    ) -> Result<(), Error> {
         let mut free_bufs: HashSet<Id> = Default::default();
         let mut free_images: HashSet<Id> = Default::default();
 
@@ -264,7 +257,9 @@
                 Command::Upload(buf_proxy, bytes) => {
                     let usage =
                         BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE;
-                    let buf = self.pool.get_buf(buf_proxy, usage, device);
+                    let buf = self
+                        .pool
+                        .get_buf(buf_proxy.size, buf_proxy.name, usage, device);
                     // TODO: if buffer is newly created, might be better to make it mapped at creation
                     // and copy. However, we expect reuse will be most common.
                     queue.write_buffer(&buf, 0, bytes);
@@ -273,7 +268,9 @@
                 Command::UploadUniform(buf_proxy, bytes) => {
                     let usage = BufferUsages::UNIFORM | BufferUsages::COPY_DST;
                     // Same consideration as above
-                    let buf = self.pool.get_buf(buf_proxy, usage, device);
+                    let buf = self
+                        .pool
+                        .get_buf(buf_proxy.size, buf_proxy.name, usage, device);
                     queue.write_buffer(&buf, 0, bytes);
                     self.bind_map.insert_buf(buf_proxy, buf);
                 }
@@ -351,14 +348,10 @@
                         .buf_map
                         .get(&proxy.id)
                         .ok_or("buffer not in map")?;
-                    let buf = device.create_buffer(&wgpu::BufferDescriptor {
-                        label: Some(proxy.name),
-                        size: proxy.size,
-                        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
-                        mapped_at_creation: false,
-                    });
+                    let usage = BufferUsages::MAP_READ | BufferUsages::COPY_DST;
+                    let buf = self.pool.get_buf(proxy.size, "download", usage, device);
                     encoder.copy_buffer_to_buffer(&src_buf.buffer, 0, &buf, 0, proxy.size);
-                    downloads.buf_map.insert(proxy.id, buf);
+                    self.downloads.insert(proxy.id, buf);
                 }
                 Command::Clear(proxy, offset, size) => {
                     let buffer = self
@@ -393,7 +386,15 @@
                 drop(view);
             }
         }
-        Ok(downloads)
+        Ok(())
+    }
+
+    pub fn get_download(&self, buf: BufProxy) -> Option<&Buffer> {
+        self.downloads.get(&buf.id)
+    }
+
+    pub fn free_download(&mut self, buf: BufProxy) {
+        self.downloads.remove(&buf.id);
     }
 }
 
@@ -441,6 +442,10 @@
         ));
     }
 
+    /// Prepare a buffer for downloading.
+    ///
+    /// Currently this copies to a download buffer. The original buffer can be freed
+    /// immediately after.
     pub fn download(&mut self, buf: BufProxy) {
         self.push(Command::Download(buf));
     }
@@ -603,7 +608,7 @@
                     if let Entry::Vacant(v) = self.buf_map.entry(proxy.id) {
                         let usage =
                             BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE;
-                        let buf = pool.get_buf(&proxy, usage, device);
+                        let buf = pool.get_buf(proxy.size, proxy.name, usage, device);
                         v.insert(BindMapBuffer {
                             buffer: buf,
                             label: proxy.name,
@@ -685,7 +690,7 @@
             Entry::Occupied(occupied) => Ok(&occupied.into_mut().buffer),
             Entry::Vacant(vacant) => {
                 let usage = BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE;
-                let buf = pool.get_buf(&proxy, usage, device);
+                let buf = pool.get_buf(proxy.size, proxy.name, usage, device);
                 Ok(&vacant
                     .insert(BindMapBuffer {
                         buffer: buf,
@@ -697,52 +702,22 @@
     }
 }
 
-pub struct DownloadsMapped<'a>(
-    HashMap<
-        Id,
-        (
-            BufferSlice<'a>,
-            GenericOneshotReceiver<RawMutex, Result<(), BufferAsyncError>>,
-        ),
-    >,
-);
-
-impl Downloads {
-    // Discussion: should API change so we get one buffer, rather than mapping all?
-    pub fn map(&self) -> DownloadsMapped {
-        let mut map = HashMap::new();
-        for (id, buf) in &self.buf_map {
-            let buf_slice = buf.slice(..);
-            let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
-            buf_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
-            map.insert(*id, (buf_slice, receiver));
-        }
-        DownloadsMapped(map)
-    }
-}
-
-impl<'a> DownloadsMapped<'a> {
-    pub async fn get_mapped(&self, proxy: BufProxy) -> Result<BufferView, Error> {
-        let (slice, recv) = self.0.get(&proxy.id).ok_or("buffer not in map")?;
-        if let Some(recv_result) = recv.receive().await {
-            recv_result?;
-        } else {
-            return Err("channel was closed".into());
-        }
-        Ok(slice.get_mapped_range())
-    }
-}
-
 const SIZE_CLASS_BITS: u32 = 1;
 
 impl ResourcePool {
     /// Get a buffer from the pool or create one.
-    fn get_buf(&mut self, proxy: &BufProxy, usage: BufferUsages, device: &Device) -> Buffer {
-        let rounded_size = Self::size_class(proxy.size, SIZE_CLASS_BITS);
+    fn get_buf(
+        &mut self,
+        size: u64,
+        name: &'static str,
+        usage: BufferUsages,
+        device: &Device,
+    ) -> Buffer {
+        let rounded_size = Self::size_class(size, SIZE_CLASS_BITS);
         let props = BufferProperties {
             size: rounded_size,
             usages: usage,
-            name: proxy.name,
+            name: name,
         };
         if let Some(buf_vec) = self.bufs.get_mut(&props) {
             if let Some(buf) = buf_vec.pop() {
@@ -751,7 +726,7 @@
         }
         device.create_buffer(&wgpu::BufferDescriptor {
             #[cfg(feature = "buffer_labels")]
-            label: Some(proxy.name),
+            label: Some(name),
             #[cfg(not(feature = "buffer_labels"))]
             label: None,
             size: rounded_size,
diff --git a/src/lib.rs b/src/lib.rs
index 6dc7bb2..db2968f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,9 +29,11 @@
 pub mod glyph;
 pub mod util;
 
+use render::Render;
 pub use scene::{Scene, SceneBuilder, SceneFragment};
+pub use util::block_on_wgpu;
 
-use engine::{Engine, ExternalResource};
+use engine::{Engine, ExternalResource, Recording};
 use shaders::FullShaders;
 
 use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView};
@@ -83,8 +85,7 @@
             *target.as_image().unwrap(),
             texture,
         )];
-        let _ = self
-            .engine
+        self.engine
             .run_recording(device, queue, &recording, &external_resources)?;
         Ok(())
     }
@@ -164,6 +165,105 @@
         self.shaders = shaders;
         Ok(())
     }
+
+    /// Renders a scene to the target texture.
+    ///
+    /// The texture is assumed to be of the specified dimensions and have been created with
+    /// the [wgpu::TextureFormat::Rgba8Unorm] format and the [wgpu::TextureUsages::STORAGE_BINDING]
+    /// flag set.
+    pub async fn render_to_texture_async(
+        &mut self,
+        device: &Device,
+        queue: &Queue,
+        scene: &Scene,
+        texture: &TextureView,
+        width: u32,
+        height: u32,
+    ) -> Result<()> {
+        let mut render = Render::new();
+        let encoding = scene.data();
+        let recording = render.render_encoding_coarse(encoding, &self.shaders, width, height);
+        let target = render.out_image();
+        let bump_buf = render.bump_buf();
+        self.engine.run_recording(device, queue, &recording, &[])?;
+        if let Some(bump_buf) = self.engine.get_download(bump_buf) {
+            let buf_slice = bump_buf.slice(..);
+            let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
+            buf_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
+            if let Some(recv_result) = receiver.receive().await {
+                recv_result?;
+            } else {
+                return Err("channel was closed".into());
+            }
+            let mapped = buf_slice.get_mapped_range();
+            println!("{:?}", bytemuck::cast_slice::<_, u32>(&mapped));
+        }
+        // TODO: apply logic to determine whether we need to rerun coarse, and also
+        // allocate the blend stack as needed.
+        self.engine.free_download(bump_buf);
+        // Maybe clear to reuse allocation?
+        let mut recording = Recording::default();
+        render.record_fine(&self.shaders, &mut recording);
+        let external_resources = [ExternalResource::Image(target, texture)];
+        self.engine
+            .run_recording(device, queue, &recording, &external_resources)?;
+        Ok(())
+    }
+
+    pub async fn render_to_surface_async(
+        &mut self,
+        device: &Device,
+        queue: &Queue,
+        scene: &Scene,
+        surface: &SurfaceTexture,
+        width: u32,
+        height: u32,
+    ) -> Result<()> {
+        let mut target = self
+            .target
+            .take()
+            .unwrap_or_else(|| TargetTexture::new(device, width, height));
+        // TODO: implement clever resizing semantics here to avoid thrashing the memory allocator
+        // during resize, specifically on metal.
+        if target.width != width || target.height != height {
+            target = TargetTexture::new(device, width, height);
+        }
+        self.render_to_texture_async(device, queue, scene, &target.view, width, height)
+            .await?;
+        let mut encoder =
+            device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
+        {
+            let surface_view = surface
+                .texture
+                .create_view(&wgpu::TextureViewDescriptor::default());
+            let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+                label: None,
+                layout: &self.blit.bind_layout,
+                entries: &[wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: wgpu::BindingResource::TextureView(&target.view),
+                }],
+            });
+            let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+                label: None,
+                color_attachments: &[Some(wgpu::RenderPassColorAttachment {
+                    view: &surface_view,
+                    resolve_target: None,
+                    ops: wgpu::Operations {
+                        load: wgpu::LoadOp::Clear(wgpu::Color::default()),
+                        store: true,
+                    },
+                })],
+                depth_stencil_attachment: None,
+            });
+            render_pass.set_pipeline(&self.blit.pipeline);
+            render_pass.set_bind_group(0, &bind_group, &[]);
+            render_pass.draw(0..6, 0..1);
+        }
+        queue.submit(Some(encoder.finish()));
+        self.target = Some(target);
+        Ok(())
+    }
 }
 
 struct TargetTexture {
diff --git a/src/render.rs b/src/render.rs
index fa3d97b..710c77b 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -196,6 +196,8 @@
     height: u32,
 ) -> (Recording, ResourceProxy) {
     let mut render = Render::new();
+    // TODO: leaks the download of the bump buf; a good way to fix would be to conditionalize
+    // that download.
     let mut recording = render.render_encoding_coarse(encoding, shaders, width, height);
     let out_image = render.out_image();
     render.record_fine(shaders, &mut recording);
@@ -524,13 +526,14 @@
             info_bin_data_buf,
             out_image,
         });
+        recording.download(*bump_buf.as_buf().unwrap());
+        recording.free_resource(bump_buf);
         recording
     }
 
     /// Run fine rasterization assuming the coarse phase succeeded.
     pub fn record_fine(&mut self, shaders: &FullShaders, recording: &mut Recording) {
         let fine = self.fine.take().unwrap();
-        recording.free_resource(fine.bump_buf);
         recording.dispatch(
             shaders.fine,
             (self.width_in_tiles, self.height_in_tiles, 1),
@@ -559,4 +562,8 @@
     pub fn out_image(&self) -> ImageProxy {
         self.fine.as_ref().unwrap().out_image
     }
+
+    pub fn bump_buf(&self) -> BufProxy {
+        *self.fine.as_ref().unwrap().bump_buf.as_buf().unwrap()
+    }
 }
diff --git a/src/util.rs b/src/util.rs
index 0245d11..a75f804 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -16,6 +16,8 @@
 
 //! Simple helpers for managing wgpu state and surfaces.
 
+use std::future::Future;
+
 use super::Result;
 
 use raw_window_handle::{HasRawDisplayHandle, HasRawWindowHandle};
@@ -132,3 +134,27 @@
     pub config: SurfaceConfiguration,
     pub dev_id: usize,
 }
+
+struct NullWake;
+
+impl std::task::Wake for NullWake {
+    fn wake(self: std::sync::Arc<Self>) {}
+}
+
+/// Block on a future, polling the device as needed.
+///
+/// This will deadlock if the future is awaiting anything other than GPU progress.
+pub fn block_on_wgpu<F: Future>(device: &Device, mut fut: F) -> F::Output {
+    let waker = std::task::Waker::from(std::sync::Arc::new(NullWake));
+    let mut context = std::task::Context::from_waker(&waker);
+    // SAFETY: `fut` is shadowed by the pin and never moved again, as in `pin_utils::pin_mut!`.
+    let mut fut = unsafe { std::pin::Pin::new_unchecked(&mut fut) };
+    loop {
+        match fut.as_mut().poll(&mut context) {
+            std::task::Poll::Pending => {
+                device.poll(wgpu::Maintain::Wait);
+            }
+            std::task::Poll::Ready(item) => break item,
+        }
+    }
+}