capture computation of workgroup and buffer sizes
diff --git a/Cargo.toml b/Cargo.toml
index cdbde01..8ab85a5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [workspace]
 resolver = "2"
 
-members = ["examples/with_winit", "examples/with_bevy", "examples/run_wasm", "examples/usvg_viewer", "vello_shaders"]
+members = ["examples/with_winit", "examples/with_bevy", "examples/run_wasm", "examples/usvg_viewer", "vello_shaders", "vello_native"]
 
 [workspace.package]
 edition = "2021"
diff --git a/src/encoding.rs b/src/encoding.rs
index f24c3bf..9782ddd 100644
--- a/src/encoding.rs
+++ b/src/encoding.rs
@@ -16,6 +16,9 @@
 
 //! Raw scene encoding.
 
+mod binning;
+mod clip;
+mod config;
 mod draw;
 mod encoding;
 mod math;
@@ -25,12 +28,19 @@
 
 pub mod resource;
 
+pub use binning::BinHeader;
+pub use clip::{ClipBbox, ClipBic, ClipElement, Clip};
+pub use config::{
+    BufferSize, BufferSizes, BumpAllocators, CpuConfig, GpuConfig, WorkgroupSize, WorkgroupSizes,
+};
 pub use draw::{
-    DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid, DrawRadialGradient,
-    DrawTag,
+    DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid,
+    DrawRadialGradient, DrawTag,
 };
 pub use encoding::Encoding;
 pub use math::Transform;
 pub use monoid::Monoid;
-pub use packed::{Config, Layout, PackedEncoding};
-pub use path::{PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType, PathTag};
+pub use packed::{Layout, PackedEncoding};
+pub use path::{
+    Cubic, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType, PathTag, Tile,
+};
diff --git a/src/encoding/binning.rs b/src/encoding/binning.rs
new file mode 100644
index 0000000..f9ec7d0
--- /dev/null
+++ b/src/encoding/binning.rs
@@ -0,0 +1,12 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use bytemuck::{Pod, Zeroable};
+
+/// Binning header.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct BinHeader {
+    pub element_count: u32,
+    pub chunk_offset: u32,
+}
diff --git a/src/encoding/clip.rs b/src/encoding/clip.rs
new file mode 100644
index 0000000..b621630
--- /dev/null
+++ b/src/encoding/clip.rs
@@ -0,0 +1,42 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use bytemuck::{Pod, Zeroable};
+
+/// Clip stack element.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct ClipBic {
+    pub a: u32,
+    pub b: u32,
+}
+
+/// Clip element.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct ClipElement {
+    pub parent_ix: u32,
+    pub bbox: [f32; 4],
+}
+
+/// Clip resolution.
+/// 
+/// This is an intermediate element used to match clips to associated paths
+/// and is also used to connect begin and end clip pairs.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct Clip {
+    // Index of the draw object.
+    pub ix: u32,
+    /// This is a packed encoding of an enum with the sign bit as the tag. If positive,
+    /// this entry is a BeginClip and contains the associated path index. If negative,
+    /// it is an EndClip and contains the bitwise-not of the EndClip draw object index.    
+    pub path_ix: i32,
+}
+
+/// Clip bounding box.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct ClipBbox {
+    pub bbox: [f32; 4],
+}
diff --git a/src/encoding/config.rs b/src/encoding/config.rs
new file mode 100644
index 0000000..dfe894b
--- /dev/null
+++ b/src/encoding/config.rs
@@ -0,0 +1,301 @@
+use super::{
+    BinHeader, ClipBbox, ClipBic, ClipElement, Clip, Cubic, DrawBbox, DrawMonoid, Layout,
+    PackedEncoding, Path, PathBbox, PathMonoid, PathSegment, Tile,
+};
+use bytemuck::{Pod, Zeroable};
+use std::mem;
+
+/// Counters for tracking dynamic allocation on the GPU.
+///
+/// This must be kept in sync with the struct in shader/shared/bump.wgsl
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct BumpAllocators {
+    pub failed: u32,
+    // Final needed dynamic size of the buffers. If any of these are larger
+    // than the corresponding `_size` element reallocation needs to occur.
+    pub binning: u32,
+    pub ptcl: u32,
+    pub tile: u32,
+    pub segments: u32,
+    pub blend: u32,
+}
+
+/// GPU side configuration.
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+#[repr(C)]
+pub struct GpuConfig {
+    /// Width of the scene in tiles.
+    pub width_in_tiles: u32,
+    /// Height of the scene in tiles.
+    pub height_in_tiles: u32,
+    /// Width of the target in pixels.
+    pub target_width: u32,
+    /// Height of the target in pixels.
+    pub target_height: u32,
+    /// Layout of packed scene data.
+    pub layout: Layout,
+    /// Size of binning buffer allocation (in u32s).
+    pub binning_size: u32,
+    /// Size of tile buffer allocation (in Tiles).
+    pub tiles_size: u32,
+    /// Size of segment buffer allocation (in PathSegments).
+    pub segments_size: u32,
+    /// Size of per-tile command list buffer allocation (in u32s).
+    pub ptcl_size: u32,
+}
+
+/// CPU side setup and configuration.
+#[derive(Default)]
+pub struct CpuConfig {
+    /// GPU side configuration.
+    pub gpu: GpuConfig,
+    /// Workgroup sizes for all compute pipelines.
+    pub workgroup_sizes: WorkgroupSizes,
+    /// Sizes of all buffer resources.
+    pub buffer_sizes: BufferSizes,
+}
+
+impl CpuConfig {
+    pub fn new(encoding: &PackedEncoding, width: u32, height: u32) -> Self {
+        let new_width = next_multiple_of(width, 16);
+        let new_height = next_multiple_of(height, 16);
+        let mut config = GpuConfig {
+            width_in_tiles: new_width / 16,
+            height_in_tiles: new_height / 16,
+            target_width: width,
+            target_height: height,
+            binning_size: 0,
+            tiles_size: 0,
+            segments_size: 0,
+            ptcl_size: 0,
+            layout: encoding.layout,
+        };
+        let n_path_tags = encoding.path_tags().len() as u32;
+        let workgroup_sizes = WorkgroupSizes::new(&config, n_path_tags);
+        let buffer_sizes = BufferSizes::new(&config, &workgroup_sizes, n_path_tags);
+        config.binning_size = buffer_sizes.bin_data.len();
+        config.tiles_size = buffer_sizes.tiles.len();
+        config.segments_size = buffer_sizes.tiles.len();
+        config.ptcl_size = buffer_sizes.tiles.len();
+        Self {
+            gpu: config,
+            workgroup_sizes,
+            buffer_sizes,
+        }
+    }
+}
+
+const PATH_REDUCE_WG: u32 = 256;
+const PATH_BBOX_WG: u32 = 256;
+const PATH_COARSE_WG: u32 = 256;
+const CLIP_REDUCE_WG: u32 = 256;
+
+/// Type alias for a workgroup size.
+pub type WorkgroupSize = (u32, u32, u32);
+
+/// Computed sizes for all dispatches.
+#[derive(Copy, Clone, Debug, Default)]
+pub struct WorkgroupSizes {
+    pub use_large_path_scan: bool,
+    pub path_reduce: WorkgroupSize,
+    pub path_reduce2: WorkgroupSize,
+    pub path_scan1: WorkgroupSize,
+    pub path_scan: WorkgroupSize,
+    pub bbox_clear: WorkgroupSize,
+    pub path_seg: WorkgroupSize,
+    pub draw_reduce: WorkgroupSize,
+    pub draw_leaf: WorkgroupSize,
+    pub clip_reduce: WorkgroupSize,
+    pub clip_leaf: WorkgroupSize,
+    pub binning: WorkgroupSize,
+    pub tile_alloc: WorkgroupSize,
+    pub path_coarse: WorkgroupSize,
+    pub backdrop: WorkgroupSize,
+    pub coarse: WorkgroupSize,
+    pub fine: WorkgroupSize,
+}
+
+impl WorkgroupSizes {
+    pub fn new(config: &GpuConfig, n_path_tags: u32) -> Self {
+        let n_paths = config.layout.n_paths;
+        let n_draw_objects = config.layout.n_draw_objects;
+        let n_clips = config.layout.n_clips;
+        let path_tag_padded = align_up(n_path_tags, 4 * PATH_REDUCE_WG);
+        let path_tag_wgs = path_tag_padded / (4 * PATH_REDUCE_WG);
+        let use_large_path_scan = path_tag_wgs > PATH_REDUCE_WG;
+        let path_reduce_wgs = if use_large_path_scan {
+            align_up(path_tag_wgs, PATH_REDUCE_WG)
+        } else {
+            path_tag_wgs
+        };
+        let draw_object_wgs = (n_draw_objects + PATH_BBOX_WG - 1) / PATH_BBOX_WG;
+        let path_coarse_wgs = (n_path_tags + PATH_COARSE_WG - 1) / PATH_COARSE_WG;
+        let clip_reduce_wgs = n_clips.saturating_sub(1) / CLIP_REDUCE_WG;
+        let clip_wgs = (n_clips + CLIP_REDUCE_WG - 1) / CLIP_REDUCE_WG;
+        let path_wgs = (n_paths + PATH_BBOX_WG - 1) / PATH_BBOX_WG;
+        let width_in_bins = (config.width_in_tiles + 15) / 16;
+        let height_in_bins = (config.height_in_tiles + 15) / 16;
+        Self {
+            use_large_path_scan,
+            path_reduce: (path_reduce_wgs, 1, 1),
+            path_reduce2: (PATH_REDUCE_WG, 1, 1),
+            path_scan1: (path_reduce_wgs / PATH_REDUCE_WG, 1, 1),
+            path_scan: (path_tag_wgs, 1, 1),
+            bbox_clear: (draw_object_wgs, 1, 1),
+            path_seg: (path_coarse_wgs, 1, 1),
+            draw_reduce: (draw_object_wgs, 1, 1),
+            draw_leaf: (draw_object_wgs, 1, 1),
+            clip_reduce: (clip_reduce_wgs, 1, 1),
+            clip_leaf: (clip_wgs, 1, 1),
+            binning: (draw_object_wgs, 1, 1),
+            tile_alloc: (path_wgs, 1, 1),
+            path_coarse: (path_coarse_wgs, 1, 1),
+            backdrop: (path_wgs, 1, 1),
+            coarse: (width_in_bins, height_in_bins, 1),
+            fine: (config.width_in_tiles, config.height_in_tiles, 1),
+        }
+    }
+}
+
+/// Typed buffer size primitive.
+#[derive(Copy, Clone, Eq, Ord, Default, Debug)]
+pub struct BufferSize<T: Sized> {
+    len: u32,
+    _phantom: std::marker::PhantomData<T>,
+}
+
+impl<T: Sized> BufferSize<T> {
+    /// Creates a new buffer size from number of elements.
+    pub const fn new(len: u32) -> Self {
+        Self {
+            len,
+            _phantom: std::marker::PhantomData,
+        }
+    }
+
+    /// Creates a new buffer size from size in bytes.
+    pub const fn from_size_in_bytes(size: u32) -> Self {
+        Self::new(size / mem::size_of::<T>() as u32)
+    }
+
+    /// Returns the number of elements.
+    pub const fn len(self) -> u32 {
+        self.len
+    }
+
+    /// Returns the size in bytes.
+    pub const fn size_in_bytes(self) -> u32 {
+        mem::size_of::<T>() as u32 * self.len
+    }
+
+    /// Returns the size in bytes aligned up to the given value.
+    pub const fn aligned_in_bytes(self, alignment: u32) -> u32 {
+        align_up(self.size_in_bytes(), alignment)
+    }
+}
+
+impl<T: Sized> PartialEq for BufferSize<T> {
+    fn eq(&self, other: &Self) -> bool {
+        self.len == other.len
+    }
+}
+
+impl<T: Sized> PartialOrd for BufferSize<T> {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        self.len.partial_cmp(&other.len)
+    }
+}
+
+/// Computed sizes for all buffers.
+#[derive(Copy, Clone, Debug, Default)]
+pub struct BufferSizes {
+    // Known size buffers
+    pub path_reduced: BufferSize<PathMonoid>,
+    pub path_reduced2: BufferSize<PathMonoid>,
+    pub path_reduced_scan: BufferSize<PathMonoid>,
+    pub path_monoids: BufferSize<PathMonoid>,
+    pub path_bboxes: BufferSize<PathBbox>,
+    pub cubics: BufferSize<Cubic>,
+    pub draw_reduced: BufferSize<DrawMonoid>,
+    pub draw_monoids: BufferSize<DrawMonoid>,
+    pub info: BufferSize<u32>,
+    pub clip_inps: BufferSize<Clip>,
+    pub clip_els: BufferSize<ClipElement>,
+    pub clip_bics: BufferSize<ClipBic>,
+    pub clip_bboxes: BufferSize<ClipBbox>,
+    pub draw_bboxes: BufferSize<DrawBbox>,
+    pub bin_headers: BufferSize<BinHeader>,
+    pub paths: BufferSize<Path>,
+    // Bump allocated buffers
+    pub bin_data: BufferSize<u32>,
+    pub tiles: BufferSize<Tile>,
+    pub segments: BufferSize<PathSegment>,
+    pub ptcl: BufferSize<u32>,
+}
+
+impl BufferSizes {
+    pub fn new(config: &GpuConfig, workgroups: &WorkgroupSizes, n_path_tags: u32) -> Self {
+        let n_paths = config.layout.n_paths;
+        let n_draw_objects = config.layout.n_draw_objects;
+        let n_clips = config.layout.n_clips;
+        let path_tag_wgs = workgroups.path_reduce.0;
+        let path_reduced = BufferSize::new(path_tag_wgs);
+        let path_reduced2 = BufferSize::new(PATH_REDUCE_WG);
+        let path_reduced_scan = BufferSize::new(path_tag_wgs);
+        let path_monoids = BufferSize::new(path_tag_wgs * PATH_REDUCE_WG);
+        let path_bboxes = BufferSize::new(n_paths);
+        let cubics = BufferSize::new(n_path_tags);
+        let draw_object_wgs = workgroups.draw_reduce.0;
+        let draw_reduced = BufferSize::new(draw_object_wgs);
+        let draw_monoids = BufferSize::new(n_draw_objects);
+        let info = BufferSize::new(config.layout.bin_data_start);
+        let clip_inps = BufferSize::new(n_clips);
+        let clip_els = BufferSize::new(n_clips);
+        let clip_bics = BufferSize::new(n_clips / CLIP_REDUCE_WG);
+        let clip_bboxes = BufferSize::new(n_clips);
+        let draw_bboxes = BufferSize::new(n_paths);
+        let bin_headers = BufferSize::new(draw_object_wgs * 256);
+        let n_paths_aligned = align_up(n_paths, 256);
+        let paths = BufferSize::new(n_paths_aligned);
+        // TODO: better heuristics. Just use 128k for now
+        let initial_bump_size = 128 * 1024;
+        let bin_data = BufferSize::from_size_in_bytes(initial_bump_size);
+        let tiles = BufferSize::from_size_in_bytes(initial_bump_size);
+        let segments = BufferSize::from_size_in_bytes(initial_bump_size);
+        let ptcl = BufferSize::from_size_in_bytes(initial_bump_size);
+        Self {
+            path_reduced,
+            path_reduced2,
+            path_reduced_scan,
+            path_monoids,
+            path_bboxes,
+            cubics,
+            draw_reduced,
+            draw_monoids,
+            info,
+            clip_inps,
+            clip_els,
+            clip_bics,
+            clip_bboxes,
+            draw_bboxes,
+            bin_headers,
+            paths,
+            bin_data,
+            tiles,
+            segments,
+            ptcl,
+        }
+    }
+}
+
+const fn align_up(len: u32, alignment: u32) -> u32 {
+    len + (len.wrapping_neg() & (alignment - 1))
+}
+
+const fn next_multiple_of(val: u32, rhs: u32) -> u32 {
+    match val % rhs {
+        0 => val,
+        r => val + (rhs - r),
+    }
+}
diff --git a/src/encoding/draw.rs b/src/encoding/draw.rs
index 1ddaead..5a9b07a 100644
--- a/src/encoding/draw.rs
+++ b/src/encoding/draw.rs
@@ -54,6 +54,13 @@
     }
 }
 
+/// Draw object bounding box.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct DrawBbox {
+    pub bbox: [f32; 4],
+}
+
 /// Draw data for a solid color.
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 #[repr(C)]
@@ -131,7 +138,7 @@
 }
 
 /// Monoid for the draw tag stream.
-#[derive(Copy, Clone, PartialEq, Eq, Pod, Zeroable, Default)]
+#[derive(Copy, Clone, PartialEq, Eq, Pod, Zeroable, Default, Debug)]
 #[repr(C)]
 pub struct DrawMonoid {
     // The number of paths preceding this draw object.
diff --git a/src/encoding/packed.rs b/src/encoding/packed.rs
index 55ffc17..00a9e3a 100644
--- a/src/encoding/packed.rs
+++ b/src/encoding/packed.rs
@@ -48,30 +48,6 @@
     pub linewidth_base: u32,
 }
 
-/// Scene configuration.
-#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
-#[repr(C)]
-pub struct Config {
-    /// Width of the scene in tiles.
-    pub width_in_tiles: u32,
-    /// Height of the scene in tiles.
-    pub height_in_tiles: u32,
-    /// Width of the target in pixels.
-    pub target_width: u32,
-    /// Height of the target in pixels.
-    pub target_height: u32,
-    /// Layout of packed scene data.
-    pub layout: Layout,
-    /// Size of binning buffer allocation (in u32s).
-    pub binning_size: u32,
-    /// Size of tile buffer allocation (in Tiles).
-    pub tiles_size: u32,
-    /// Size of segment buffer allocation (in PathSegments).
-    pub segments_size: u32,
-    /// Size of per-tile command list buffer allocation (in u32s).
-    pub ptcl_size: u32,
-}
-
 /// Packed encoding of scene data.
 #[derive(Default)]
 pub struct PackedEncoding {
diff --git a/src/encoding/path.rs b/src/encoding/path.rs
index 760eb32..0ebfcda 100644
--- a/src/encoding/path.rs
+++ b/src/encoding/path.rs
@@ -20,7 +20,7 @@
 use super::Monoid;
 
 /// Path segment.
-#[derive(Clone, Copy, Debug, Zeroable, Pod)]
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
 #[repr(C)]
 pub struct PathSegment {
     pub origin: [f32; 2],
@@ -170,6 +170,19 @@
     }
 }
 
+/// Cubic path segment.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct Cubic {
+    pub p0: [f32; 2],
+    pub p1: [f32; 2],
+    pub p2: [f32; 2],
+    pub p3: [f32; 2],
+    pub stroke: [f32; 2],
+    pub path_ix: u32,
+    pub flags: u32,
+}
+
 /// Path bounding box.
 #[derive(Copy, Clone, Pod, Zeroable, Default, Debug)]
 #[repr(C)]
@@ -188,6 +201,26 @@
     pub trans_ix: u32,
 }
 
+/// Tiled path object.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct Path {
+    /// Bounding box in tiles.
+    pub bbox: [f32; 4],
+    /// Offset (in u32s) to tile rectangle.
+    pub tiles: u32,
+}
+
+/// Tile object.
+#[derive(Copy, Clone, Pod, Zeroable, Debug, Default)]
+#[repr(C)]
+pub struct Tile {
+    /// Accumulated backdrop at the left edge of the tile.
+    pub backdrop: i32,
+    /// Index of first path segment.
+    pub segments: u32,
+}
+
 /// Encoder for path segments.
 pub struct PathEncoder<'a> {
     tags: &'a mut Vec<PathTag>,
diff --git a/src/render.rs b/src/render.rs
index 9136931..8d6f627 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -191,7 +191,7 @@
     let new_height = next_multiple_of(height, 16);
 
     let info_size = packed.layout.bin_data_start;
-    let config = crate::encoding::Config {
+    let config = crate::encoding::GpuConfig {
         width_in_tiles: new_width / 16,
         height_in_tiles: new_height / 16,
         target_width: width,
diff --git a/vello_native/Cargo.toml b/vello_native/Cargo.toml
new file mode 100644
index 0000000..233ed7d
--- /dev/null
+++ b/vello_native/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "vello_native"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+vello = { path = "../" }
+vello_shaders = { path = "../vello_shaders" }
+bytemuck = { version = "1.12.1", features = ["derive"] }
+
+[target.'cfg(target_os="macos")'.dependencies]
+metal = "0.24"
+objc = "0.2"
+block = "0.1"
+cocoa-foundation = "0.1"
+# Note: foreign-types is up to 0.5 but metal hasn't upgraded to it
+foreign-types = "0.3"
+core-graphics-types = "0.1"
diff --git a/vello_native/src/lib.rs b/vello_native/src/lib.rs
new file mode 100644
index 0000000..0367c24
--- /dev/null
+++ b/vello_native/src/lib.rs
@@ -0,0 +1 @@
+pub mod metal;
diff --git a/vello_native/src/metal.rs b/vello_native/src/metal.rs
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/vello_native/src/metal.rs
@@ -0,0 +1 @@
+
diff --git a/vello_native/src/metal/util.rs b/vello_native/src/metal/util.rs
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/vello_native/src/metal/util.rs