// Copyright 2023 The Vello authors
// SPDX-License-Identifier: Apache-2.0 OR MIT
use std::{
collections::{hash_map::Entry, HashMap, HashSet},
use wgpu::{
BindGroup, BindGroupLayout, Buffer, BufferUsages, CommandEncoder, CommandEncoderDescriptor,
ComputePipeline, Device, Queue, Texture, TextureAspect, TextureUsages, TextureView,
use crate::{
engine::{BindType, Error},
BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId,
/// Compute engine state built on wgpu.
///
/// Holds the registered shaders, the buffer pool, the bindings of proxied
/// resources, and the buffers produced by download commands.
pub struct WgpuEngine {
/// Registered shaders; `run_recording` indexes this with `ShaderId.0`.
shaders: Vec<Shader>,
/// Pool that buffers are taken from via `ResourcePool::get_buf`.
pool: ResourcePool,
/// Buffers and images currently bound to resource proxies.
bind_map: BindMap,
/// Buffers that `Command::Download` copied data into, keyed by id.
downloads: HashMap<Id, Buffer>,
/// A registered compute shader together with its optional CPU implementation.
struct Shader {
/// Compute pipeline for this shader.
pipeline: ComputePipeline,
/// Layout of the shader's bind group.
bind_group_layout: BindGroupLayout,
/// Static label; `run_recording` uses it as the profiler scope name.
label: &'static str,
/// When set, `run_recording` calls this function instead of encoding a GPU dispatch.
cpu_shader: Option<fn(u32, &[CpuBinding])>,
/// A caller-owned resource lent to the engine for a `run_recording` call.
pub enum ExternalResource<'a> {
/// A caller-provided GPU buffer associated with the given buffer proxy.
Buf(BufProxy, &'a Buffer),
/// A caller-provided texture view associated with the given image proxy.
Image(ImageProxy, &'a TextureView),
/// A buffer can exist either on the GPU or on CPU.
///
/// Code in this file matches on the `Gpu` variant (holding a wgpu `Buffer`)
/// and the `Cpu` variant (holding bytes behind a `RefCell`).
enum MaterializedBuffer {
/// Value type of `BindMap::buf_map`: a materialized buffer plus its label.
struct BindMapBuffer {
/// The buffer itself, on GPU or on CPU.
buffer: MaterializedBuffer,
/// Static label; it is read only under the `buffer_labels` feature, hence
/// the conditional `allow(unused)`.
#[cfg_attr(not(feature = "buffer_labels"), allow(unused))]
label: &'static str,
/// Map from resource ids to the engine-owned buffers and images bound to them.
struct BindMap {
/// Buffers, each materialized either on GPU or on CPU.
buf_map: HashMap<Id, BindMapBuffer>,
/// Textures together with the view created for each.
image_map: HashMap<Id, (Texture, TextureView)>,
/// Ids consulted by `create_bind_group` when it first creates a GPU buffer;
/// when an id is removed from this set the new buffer is cleared.
pending_clears: HashSet<Id>,
/// Key under which `ResourcePool` groups its buffers.
#[derive(Hash, PartialEq, Eq)]
struct BufferProperties {
/// Buffer size (`ResourcePool::get_buf` stores the size rounded to a size class).
size: u64,
/// Usage flags of the buffer.
usages: BufferUsages,
/// Buffer label; part of the key only when the `buffer_labels` feature is enabled.
#[cfg(feature = "buffer_labels")]
name: &'static str,
/// Pool of GPU buffers available for reuse.
struct ResourcePool {
/// Buffers grouped by their properties; `get_buf` pops from the matching list.
bufs: HashMap<BufferProperties, Vec<Buffer>>,
/// The transient bind map contains short-lifetime resources.
///
/// In particular, it has resources scoped to a single call of
/// `run_recording()`, including external resources and also buffer
/// uploads.
struct TransientBindMap<'a> {
/// Borrowed buffers by id: either CPU bytes or an external GPU buffer.
bufs: HashMap<Id, TransientBuf<'a>>,
// TODO: create transient image type
/// Borrowed texture views by id, seeded from `ExternalResource::Image`.
images: HashMap<Id, &'a TextureView>,
/// A buffer borrowed by the transient bind map.
enum TransientBuf<'a> {
/// Bytes borrowed from CPU memory.
Cpu(&'a [u8]),
/// A borrowed GPU buffer.
Gpu(&'a Buffer),
impl WgpuEngine {
/// Creates an engine with no shaders and a default-initialized resource
/// pool, bind map and download table.
pub fn new() -> WgpuEngine {
WgpuEngine {
shaders: vec![],
pool: Default::default(),
bind_map: Default::default(),
downloads: Default::default(),
/// Add a shader.
///
/// Compiles `wgsl` into a shader module and builds a compute pipeline whose
/// single bind group is described by `layout`, one binding per `BindType` in
/// order.
///
/// This function is somewhat limited: `label` is applied to the shader module
/// and the compute pipeline but not to the bind group layout or the pipeline
/// layout, it only allows one bind group, it doesn't support push constants,
/// and the entry point is hardcoded as "main".
///
/// Maybe should do template instantiation here? But shader compilation pipeline feels maybe
/// a bit separate.
pub fn add_shader(
&mut self,
device: &Device,
label: &'static str,
wgsl: Cow<'static, str>,
layout: &[BindType],
) -> Result<ShaderId, Error> {
let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some(label),
source: wgpu::ShaderSource::Wgsl(wgsl),
// One layout entry per bind type; the binding index is the position `i`.
let entries = layout
.map(|(i, bind_type)| match bind_type {
// Storage buffers; read-only exactly when the type is `BufReadOnly`.
BindType::Buffer | BindType::BufReadOnly => wgpu::BindGroupLayoutEntry {
binding: i as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage {
read_only: *bind_type == BindType::BufReadOnly,
has_dynamic_offset: false,
min_binding_size: None,
count: None,
// Uniform buffers.
BindType::Uniform => wgpu::BindGroupLayoutEntry {
binding: i as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Uniform,
has_dynamic_offset: false,
min_binding_size: None,
count: None,
// Images: `ImageRead` becomes a sampled 2D texture, `Image` a
// write-only 2D storage texture of the given format.
BindType::Image(format) | BindType::ImageRead(format) => {
wgpu::BindGroupLayoutEntry {
binding: i as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: if *bind_type == BindType::ImageRead(*format) {
wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
} else {
wgpu::BindingType::StorageTexture {
access: wgpu::StorageTextureAccess::WriteOnly,
format: format.to_wgpu(),
view_dimension: wgpu::TextureViewDimension::D2,
count: None,
let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &entries,
// The pipeline layout uses exactly one bind group and no push constants.
let compute_pipeline_layout =
device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some(label),
layout: Some(&compute_pipeline_layout),
module: &shader_module,
entry_point: "main",
// No CPU implementation initially; one can be attached with `set_cpu_shader`.
let cpu_shader = None;
let shader = Shader {
// The number of shaders registered so far.
let id = self.shaders.len();
/// Registers `f` as the CPU implementation of shader `id`.
///
/// Once set, `run_recording` calls `f` for dispatches of this shader instead
/// of encoding a GPU compute pass.
///
/// # Panics
///
/// Panics if `id.0` is out of bounds for the registered shaders.
pub fn set_cpu_shader(&mut self, id: ShaderId, f: fn(u32, &[CpuBinding])) {
self.shaders[id.0].cpu_shader = Some(f);
/// Processes every command in `recording` in order, encoding GPU work into a
/// command encoder labelled `label` and running CPU shaders synchronously.
///
/// * `device` / `queue` - used to create resources and to write buffer contents.
/// * `recording` - the commands to process.
/// * `external_resources` - caller-owned buffers and texture views that seed
///   the transient bind map for this call.
/// * `label` - label of the command encoder and of the outer profiler scope.
/// * `profiler` - only present with the `wgpu-profiler` feature.
///
/// # Errors
///
/// Returns an error when the buffer for an indirect dispatch or the source
/// buffer of a download is not in the bind map.
///
/// # Panics
///
/// Panics if an image format has no block size, or if a CPU shader is
/// dispatched indirectly and its indirect buffer cannot be obtained as a
/// read-write CPU buffer.
pub fn run_recording(
&mut self,
device: &Device,
queue: &Queue,
recording: &Recording,
external_resources: &[ExternalResource],
label: &'static str,
#[cfg(feature = "wgpu-profiler")] profiler: &mut wgpu_profiler::GpuProfiler,
) -> Result<(), Error> {
// Ids collected in these sets are removed from the bind map after all
// commands have been processed (see the loops at the end).
let mut free_bufs: HashSet<Id> = Default::default();
let mut free_images: HashSet<Id> = Default::default();
let mut transient_map = TransientBindMap::new(external_resources);
let mut encoder =
device.create_command_encoder(&CommandEncoderDescriptor { label: Some(label) });
#[cfg(feature = "wgpu-profiler")]
profiler.begin_scope(label, &mut encoder, device);
for command in &recording.commands {
match command {
// Upload: register the bytes as a transient CPU buffer, then write
// them into a storage buffer and bind it to the proxy.
Command::Upload(buf_proxy, bytes) => {
.insert(, TransientBuf::Cpu(bytes));
let usage =
BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE;
let buf = self
.get_buf(buf_proxy.size,, usage, device);
// TODO: if buffer is newly created, might be better to make it mapped at creation
// and copy. However, we expect reuse will be most common.
queue.write_buffer(&buf, 0, bytes);
self.bind_map.insert_buf(buf_proxy, buf);
// Same as `Upload`, but the buffer has uniform usage.
Command::UploadUniform(buf_proxy, bytes) => {
.insert(, TransientBuf::Cpu(bytes));
let usage = BufferUsages::UNIFORM | BufferUsages::COPY_DST;
// Same consideration as above
let buf = self
.get_buf(buf_proxy.size,, usage, device);
queue.write_buffer(&buf, 0, bytes);
self.bind_map.insert_buf(buf_proxy, buf);
// Create a 2D texture and view for the image and store them in the bind map.
Command::UploadImage(image_proxy, bytes) => {
let format = image_proxy.format.to_wgpu();
let block_size = format
.expect("ImageFormat must have a valid block size");
let texture = device.create_texture(&wgpu::TextureDescriptor {
label: None,
size: wgpu::Extent3d {
width: image_proxy.width,
height: image_proxy.height,
depth_or_array_layers: 1,
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
usage: TextureUsages::TEXTURE_BINDING | TextureUsages::COPY_DST,
view_formats: &[],
let texture_view = texture.create_view(&wgpu::TextureViewDescriptor {
label: None,
dimension: Some(TextureViewDimension::D2),
aspect: TextureAspect::All,
mip_level_count: None,
base_mip_level: 0,
base_array_layer: 0,
array_layer_count: None,
format: Some(format),
// Destination: mip level 0 of the new texture, starting at the origin.
wgpu::ImageCopyTexture {
texture: &texture,
mip_level: 0,
origin: wgpu::Origin3d { x: 0, y: 0, z: 0 },
aspect: TextureAspect::All,
// Source layout: each row is `width * block_size` bytes.
wgpu::ImageDataLayout {
offset: 0,
bytes_per_row: Some(image_proxy.width * block_size),
rows_per_image: None,
wgpu::Extent3d {
width: image_proxy.width,
height: image_proxy.height,
depth_or_array_layers: 1,
.insert_image(, texture, texture_view)
// Targets the `width` x `height` region at (`x`, `y`) of the image,
// creating the image first when it is not in the bind map.
Command::WriteImage(proxy, [x, y, width, height], data) => {
if let Ok((texture, _)) = self.bind_map.get_or_create_image(*proxy, device) {
let format = proxy.format.to_wgpu();
let block_size = format
.expect("ImageFormat must have a valid block size");
wgpu::ImageCopyTexture {
mip_level: 0,
origin: wgpu::Origin3d { x: *x, y: *y, z: 0 },
aspect: TextureAspect::All,
wgpu::ImageDataLayout {
offset: 0,
bytes_per_row: Some(*width * block_size),
rows_per_image: None,
wgpu::Extent3d {
width: *width,
height: *height,
depth_or_array_layers: 1,
Command::Dispatch(shader_id, wg_size, bindings) => {
// println!("dispatching {:?} with {} bindings", wg_size, bindings.len());
let shader = &self.shaders[shader_id.0];
if let Some(cpu_shader) = shader.cpu_shader {
// The current strategy is to run the CPU shader synchronously. This
// works because there is currently the added constraint that data
// can only flow from CPU to GPU, not the other way around. If and
// when we implement that, we will need to defer the execution. Of
// course, we will also need to wire up more async synchronization
// mechanisms, as the CPU dispatch can't run until the preceding
// command buffer submission completes (and, in WebGPU, the async
// mapping operations on the buffers complete).
let resources =
transient_map.create_cpu_resources(&mut self.bind_map, bindings);
// Only the first workgroup-count component is passed to the CPU shader.
cpu_shader(wg_size.0, &resources);
} else {
// GPU path: build the bind group, then record one compute pass.
let bind_group = transient_map.create_bind_group(
&mut self.bind_map,
&mut self.pool,
&mut encoder,
let mut cpass = encoder.begin_compute_pass(&Default::default());
#[cfg(feature = "wgpu-profiler")]
profiler.begin_scope(shader.label, &mut cpass, device);
cpass.set_bind_group(0, &bind_group, &[]);
cpass.dispatch_workgroups(wg_size.0, wg_size.1, wg_size.2);
#[cfg(feature = "wgpu-profiler")]
profiler.end_scope(&mut cpass);
Command::DispatchIndirect(shader_id, proxy, offset, bindings) => {
let shader = &self.shaders[shader_id.0];
if let Some(cpu_shader) = shader.cpu_shader {
// Same consideration as above about running the CPU shader synchronously.
// The workgroup count is the first `u32` of the indirect buffer.
let n_wg;
if let CpuBinding::BufferRW(b) = self.bind_map.get_cpu_buf( {
let slice = b.borrow();
let indirect: &[u32] = bytemuck::cast_slice(&slice);
n_wg = indirect[0];
} else {
panic!("indirect buffer missing from bind map");
let resources =
transient_map.create_cpu_resources(&mut self.bind_map, bindings);
cpu_shader(n_wg, &resources);
} else {
let bind_group = transient_map.create_bind_group(
&mut self.bind_map,
&mut self.pool,
&mut encoder,
&mut self.bind_map,
&mut self.pool,
let mut cpass = encoder.begin_compute_pass(&Default::default());
#[cfg(feature = "wgpu-profiler")]
profiler.begin_scope(shader.label, &mut cpass, device);
cpass.set_bind_group(0, &bind_group, &[]);
// The indirect buffer must be present in the bind map, else an error is returned.
let buf = self
.ok_or("buffer for indirect dispatch not in map")?;
cpass.dispatch_workgroups_indirect(buf, *offset);
#[cfg(feature = "wgpu-profiler")]
profiler.end_scope(&mut cpass);
// Copy `proxy.size` bytes from the source buffer into a mappable
// buffer from the pool and record that buffer in `self.downloads`.
Command::Download(proxy) => {
let src_buf = self
.ok_or("buffer not in map")?;
let usage = BufferUsages::MAP_READ | BufferUsages::COPY_DST;
let buf = self.pool.get_buf(proxy.size, "download", usage, device);
encoder.copy_buffer_to_buffer(src_buf, 0, &buf, 0, proxy.size);
self.downloads.insert(, buf);
// GPU buffers are cleared through the encoder; for CPU buffers the
// affected byte range starts at `offset` and is limited to `size` when given.
Command::Clear(proxy, offset, size) => {
if let Some(buf) = self.bind_map.get_buf(*proxy) {
match &buf.buffer {
MaterializedBuffer::Gpu(b) => encoder.clear_buffer(b, *offset, *size),
MaterializedBuffer::Cpu(b) => {
let mut slice = &mut b.borrow_mut()[*offset as usize..];
if let Some(size) = size {
slice = &mut slice[..size.get() as usize];
} else {
Command::FreeBuf(proxy) => {
Command::FreeImage(proxy) => {
#[cfg(feature = "wgpu-profiler")]
profiler.end_scope(&mut encoder);
// Drop freed buffers from the bind map; for GPU buffers the pool key is
// built from the buffer's own size and usage.
for id in free_bufs {
if let Some(buf) = self.bind_map.buf_map.remove(&id) {
if let MaterializedBuffer::Gpu(gpu_buf) = buf.buffer {
let props = BufferProperties {
size: gpu_buf.size(),
usages: gpu_buf.usage(),
#[cfg(feature = "buffer_labels")]
name: buf.label,
// Drop freed images from the bind map.
for id in free_images {
if let Some((texture, view)) = self.bind_map.image_map.remove(&id) {
// TODO: have a pool to avoid needless re-allocation
/// Returns the download buffer associated with `buf`, if any.
///
/// NOTE(review): presumably a lookup in `self.downloads`, which
/// `run_recording` fills for `Command::Download` — confirm against the body.
pub fn get_download(&self, buf: BufProxy) -> Option<&Buffer> {
/// Releases the download associated with `buf`.
///
/// NOTE(review): presumably removes the entry from `self.downloads` —
/// confirm against the body.
pub fn free_download(&mut self, buf: BufProxy) {
impl BindMap {
/// Wraps `buffer` as a GPU-materialized `BindMapBuffer` for `proxy`.
fn insert_buf(&mut self, proxy: &BufProxy, buffer: Buffer) {
BindMapBuffer {
buffer: MaterializedBuffer::Gpu(buffer),
/// Get a buffer, only if it's on GPU.
///
/// Returns `None` when `id` has no entry in `buf_map` or when the entry is
/// not materialized on the GPU.
fn get_gpu_buf(&self, id: Id) -> Option<&Buffer> {
self.buf_map.get(&id).and_then(|b| match &b.buffer {
MaterializedBuffer::Gpu(b) => Some(b),
_ => None,
/// Get a CPU buffer.
///
/// The buffer is returned as a read-write binding (`CpuBinding::BufferRW`).
///
/// # Panics
///
/// Panics if buffer is not present or is on GPU.
fn get_cpu_buf(&self, id: Id) -> CpuBinding {
// Indexing the map panics when `id` has no entry.
match &self.buf_map[&id].buffer {
MaterializedBuffer::Cpu(b) => CpuBinding::BufferRW(b),
_ => panic!("getting cpu buffer, but it's on gpu"),
/// Materializes a CPU-side buffer for `buf` through the `buf_map` entry API.
///
/// The closure builds a zero-filled backing store of `buf.size` bytes.
fn materialize_cpu_buf(&mut self, buf: &BufProxy) {
self.buf_map.entry(|| {
let buffer = MaterializedBuffer::Cpu(RefCell::new(vec![0; buf.size as usize]));
BindMapBuffer {
// TODO: do we need to cfg this?
/// Stores `image` and `image_view` under `id`, replacing any previous entry.
fn insert_image(&mut self, id: Id, image: Texture, image_view: TextureView) {
self.image_map.insert(id, (image, image_view));
/// Returns the bind-map entry for `proxy`, if any.
///
/// `run_recording` uses this to find the buffer targeted by `Command::Clear`.
fn get_buf(&mut self, proxy: BufProxy) -> Option<&BindMapBuffer> {
/// Returns the texture and view stored for `proxy`, creating them if the
/// bind map has no entry yet.
///
/// A newly created texture is 2D, `proxy.width` x `proxy.height`, with one
/// mip level, one sample, and `TEXTURE_BINDING | COPY_DST` usage. Both match
/// arms return `Ok`.
fn get_or_create_image(
&mut self,
proxy: ImageProxy,
device: &Device,
) -> Result<&(Texture, TextureView), Error> {
match self.image_map.entry( {
// Already present: hand back the stored pair.
Entry::Occupied(occupied) => Ok(occupied.into_mut()),
// Absent: create the texture and its view, then store them.
Entry::Vacant(vacant) => {
let format = proxy.format.to_wgpu();
let texture = device.create_texture(&wgpu::TextureDescriptor {
label: None,
size: wgpu::Extent3d {
width: proxy.width,
height: proxy.height,
depth_or_array_layers: 1,
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
usage: TextureUsages::TEXTURE_BINDING | TextureUsages::COPY_DST,
view_formats: &[],
let texture_view = texture.create_view(&wgpu::TextureViewDescriptor {
label: None,
dimension: Some(TextureViewDimension::D2),
aspect: TextureAspect::All,
mip_level_count: None,
base_mip_level: 0,
base_array_layer: 0,
array_layer_count: None,
format: Some(proxy.format.to_wgpu()),
Ok(vacant.insert((texture, texture_view)))
/// Value passed as `bits` to `ResourcePool::size_class` when
/// `ResourcePool::get_buf` rounds requested buffer sizes.
const SIZE_CLASS_BITS: u32 = 1;
impl ResourcePool {
/// Get a buffer from the pool or create one.
///
/// `size` is first rounded up to a size class. If the pool holds a buffer
/// with matching properties it is popped and returned; otherwise a new
/// buffer of the rounded size is created on `device`.
///
/// `name` is only used as the buffer label when the `buffer_labels` feature
/// is enabled, hence the `allow(unused)`.
fn get_buf(
&mut self,
size: u64,
#[allow(unused)] name: &'static str,
usage: BufferUsages,
device: &Device,
) -> Buffer {
let rounded_size = Self::size_class(size, SIZE_CLASS_BITS);
// Lookup key for the pool.
let props = BufferProperties {
size: rounded_size,
usages: usage,
#[cfg(feature = "buffer_labels")]
// Reuse a pooled buffer when one with these properties is available.
if let Some(buf_vec) = self.bufs.get_mut(&props) {
if let Some(buf) = buf_vec.pop() {
return buf;
// Nothing to reuse: create a fresh buffer of the rounded size.
device.create_buffer(&wgpu::BufferDescriptor {
#[cfg(feature = "buffer_labels")]
label: Some(name),
#[cfg(not(feature = "buffer_labels"))]
label: None,
size: rounded_size,
mapped_at_creation: false,
/// Quantize a size up to the nearest size class.
///
/// A size class keeps the most significant set bit of the size plus the
/// `bits` bits directly below it; everything lower is rounded upwards. With
/// `bits == 1` the classes are 2, 3, 4, 6, 8, 12, 16, 24, ... Every size up to
/// and including `1 << bits` maps to `1 << bits`.
///
/// `bits` must be less than 64.
///
/// If the rounded size does not fit in a `u64`, the result saturates at
/// `u64::MAX` rather than overflowing, so the returned class is never smaller
/// than `x`.
fn size_class(x: u64, bits: u32) -> u64 {
    let smallest = 1u64 << bits;
    if x <= smallest {
        return smallest;
    }
    // `x > 1`, so `x - 1` is non-zero and has at most `63 - bits` leading zeros.
    let top = x - 1;
    let lz = top.leading_zeros();
    // Set every bit below the `bits` bits that follow the leading one, then
    // step to the next value: this rounds `x` up to a multiple of the class
    // granularity for its magnitude.
    let filled = top | (((u64::MAX / 2) >> bits) >> lz);
    // `filled` is all ones for sizes above 3 * 2^62; saturate instead of
    // wrapping to zero (release) or panicking (debug).
    filled.saturating_add(1)
}
impl BindMapBuffer {
// Upload a buffer from CPU to GPU if needed.
// Note data flow is one way only, from CPU to GPU. Once this method is
// called, the buffer is no longer materialized on CPU, and cannot be
// accessed from a CPU shader.
//
// `proxy` supplies the size of the GPU buffer, `pool` and `device` provide
// the buffer, and `queue` is used to write the CPU bytes into it. Nothing
// happens when the buffer is not currently a CPU buffer.
fn upload_if_needed(
&mut self,
proxy: &BufProxy,
device: &Device,
queue: &Queue,
pool: &mut ResourcePool,
) {
if let MaterializedBuffer::Cpu(cpu_buf) = &self.buffer {
// Storage buffer that can also be copied from/to and used for indirect dispatch.
let usage = BufferUsages::COPY_SRC
| BufferUsages::COPY_DST
| BufferUsages::STORAGE
| BufferUsages::INDIRECT;
let buf = pool.get_buf(proxy.size,, usage, device);
queue.write_buffer(&buf, 0, &cpu_buf.borrow());
// From here on the buffer counts as GPU-resident.
self.buffer = MaterializedBuffer::Gpu(buf);
impl<'a> TransientBindMap<'a> {
/// Create new transient bind map, seeded from external resources
///
/// External buffers are registered as `TransientBuf::Gpu`; external images
/// are registered by their texture view.
fn new(external_resources: &'a [ExternalResource]) -> Self {
let mut bufs = HashMap::default();
let mut images = HashMap::default();
for resource in external_resources {
match resource {
ExternalResource::Buf(proxy, gpu_buf) => {
bufs.insert(, TransientBuf::Gpu(gpu_buf));
ExternalResource::Image(proxy, gpu_image) => {
images.insert(, *gpu_image);
TransientBindMap { bufs, images }
/// Makes sure the buffer used for an indirect dispatch is on the GPU.
///
/// When the transient map has no entry for the buffer and the bind map does,
/// the bind-map entry is uploaded to the GPU if it is currently a CPU buffer.
fn materialize_gpu_buf_for_indirect(
&mut self,
bind_map: &mut BindMap,
pool: &mut ResourcePool,
device: &Device,
queue: &Queue,
buf: &BufProxy,
) {
if !self.bufs.contains_key(& {
if let Some(b) = bind_map.buf_map.get_mut(& {
b.upload_if_needed(buf, device, queue, pool);
/// Creates the bind group for a GPU dispatch from `bindings`.
///
/// The first pass makes sure every bound resource exists on the GPU,
/// consulting the transient map before the bind map. The second pass builds
/// one bind group entry per binding, whose binding index is its position in
/// `bindings`.
///
/// * `bind_map` / `pool` - engine-owned resources and the buffer pool.
/// * `device` / `queue` - used to create resources and upload CPU buffers.
/// * `encoder` - used to clear newly created buffers with a pending clear.
/// * `layout` - bind group layout of the shader being dispatched.
///
/// # Errors
///
/// Propagates any error produced while collecting the bind group entries.
fn create_bind_group(
&mut self,
bind_map: &mut BindMap,
pool: &mut ResourcePool,
device: &Device,
queue: &Queue,
encoder: &mut CommandEncoder,
layout: &BindGroupLayout,
bindings: &[ResourceProxy],
) -> Result<BindGroup, Error> {
// First pass: materialize resources on the GPU.
for proxy in bindings {
match proxy {
ResourceProxy::Buf(proxy) => {
if self.bufs.contains_key(& {
match bind_map.buf_map.entry( {
// No buffer yet: take one from the pool, clearing it when a
// clear was pending.
Entry::Vacant(v) => {
// TODO: only some buffers will need indirect, but does it hurt?
let usage = BufferUsages::COPY_SRC
| BufferUsages::COPY_DST
| BufferUsages::STORAGE
| BufferUsages::INDIRECT;
let buf = pool.get_buf(proxy.size,, usage, device);
if bind_map.pending_clears.remove(& {
encoder.clear_buffer(&buf, 0, None);
v.insert(BindMapBuffer {
buffer: MaterializedBuffer::Gpu(buf),
// Existing buffer: move it to the GPU if it is still on the CPU.
Entry::Occupied(mut o) => {
o.get_mut().upload_if_needed(proxy, device, queue, pool)
ResourceProxy::Image(proxy) => {
if self.images.contains_key(& {
// No image yet: create a 2D texture and view and store them.
if let Entry::Vacant(v) = bind_map.image_map.entry( {
let format = proxy.format.to_wgpu();
let texture = device.create_texture(&wgpu::TextureDescriptor {
label: None,
size: wgpu::Extent3d {
width: proxy.width,
height: proxy.height,
depth_or_array_layers: 1,
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
usage: TextureUsages::TEXTURE_BINDING | TextureUsages::COPY_DST,
view_formats: &[],
let texture_view = texture.create_view(&wgpu::TextureViewDescriptor {
label: None,
dimension: Some(TextureViewDimension::D2),
aspect: TextureAspect::All,
mip_level_count: None,
base_mip_level: 0,
base_array_layer: 0,
array_layer_count: None,
format: Some(proxy.format.to_wgpu()),
v.insert((texture, texture_view));
// Second pass: build the bind group entries.
let entries = bindings
.map(|(i, proxy)| match proxy {
ResourceProxy::Buf(proxy) => {
// Prefer a transient GPU buffer; otherwise use the bind map's GPU buffer.
let buf = match self.bufs.get(& {
Some(TransientBuf::Gpu(b)) => b,
_ => bind_map.get_gpu_buf(,
Ok(wgpu::BindGroupEntry {
binding: i as u32,
resource: buf.as_entire_binding(),
ResourceProxy::Image(proxy) => {
// Falls back to the view stored in the bind map.
let view = self
.or_else(|| bind_map.image_map.get(&|v| &v.1))
Ok(wgpu::BindGroupEntry {
binding: i as u32,
resource: wgpu::BindingResource::TextureView(view),
.collect::<Result<Vec<_>, Error>>()?;
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
entries: &entries,
/// Builds the list of CPU bindings passed to a CPU shader for `bindings`.
///
/// Transient CPU buffers are bound as `CpuBinding::Buffer`; all other buffers
/// are taken from `bind_map` via `get_cpu_buf`.
///
/// # Panics
///
/// Panics if a binding refers to a transient GPU buffer. Image bindings are
/// not implemented (`todo!()`).
fn create_cpu_resources(
bind_map: &'a mut BindMap,
bindings: &[ResourceProxy],
) -> Vec<CpuBinding> {
// First pass is mutable; create buffers as needed
for resource in bindings {
match resource {
ResourceProxy::Buf(buf) => match self.bufs.get(& {
Some(TransientBuf::Cpu(_)) => (),
Some(TransientBuf::Gpu(_)) => panic!("buffer was already materialized on GPU"),
_ => bind_map.materialize_cpu_buf(buf),
ResourceProxy::Image(_) => todo!(),
// Second pass takes immutable references
.map(|resource| match resource {
ResourceProxy::Buf(buf) => match self.bufs.get(& {
Some(TransientBuf::Cpu(b)) => CpuBinding::Buffer(b),
_ => bind_map.get_cpu_buf(,
ResourceProxy::Image(_) => todo!(),