Merge branch 'main' into async
diff --git a/examples/with_winit/src/main.rs b/examples/with_winit/src/main.rs
index f12f8c3..706c3de 100644
--- a/examples/with_winit/src/main.rs
+++ b/examples/with_winit/src/main.rs
@@ -233,6 +233,9 @@
 
 fn main() {
     let args = Args::parse();
+    // TODO: initializing both env_logger and console_logger fails on wasm.
+    // Figure out a more principled approach.
+    #[cfg(not(target_arch = "wasm32"))]
     env_logger::init();
     #[cfg(not(target_arch = "wasm32"))]
     {
@@ -269,6 +272,6 @@
             .and_then(|doc| doc.body())
             .and_then(|body| body.append_child(&web_sys::Element::from(canvas)).ok())
             .expect("couldn't append canvas to document body");
-        wasm_bindgen_futures::spawn_local(run(event_loop, window));
+        wasm_bindgen_futures::spawn_local(run(event_loop, window, args));
     }
 }
diff --git a/examples/with_winit/src/test_scene.rs b/examples/with_winit/src/test_scene.rs
index cd74f15..96fe0c2 100644
--- a/examples/with_winit/src/test_scene.rs
+++ b/examples/with_winit/src/test_scene.rs
@@ -82,9 +82,11 @@
 ) {
     let scene_frag = scene.get_or_insert_with(|| {
         use super::pico_svg::*;
+        #[cfg(not(target_arch = "wasm32"))]
         let start = Instant::now();
         eprintln!("Starting to parse svg");
         let svg = PicoSvg::load(svg, scale).unwrap();
+        #[cfg(not(target_arch = "wasm32"))]
         eprintln!("Parsing svg took {:?}", start.elapsed());
         let mut new_scene = SceneFragment::new();
         let mut builder = SceneBuilder::for_fragment(&mut new_scene);
diff --git a/shader/coarse.wgsl b/shader/coarse.wgsl
index cea3637..df09de9 100644
--- a/shader/coarse.wgsl
+++ b/shader/coarse.wgsl
@@ -148,7 +148,17 @@
     // Exit early if prior stages failed, as we can't run this stage.
     // We need to check only prior stages, as if this stage has failed in another workgroup, 
     // we still want to know this workgroup's memory requirement.   
-    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) != 0u {
+    if local_id.x == 0u {
+        // Reuse sh_part_count[0] to hold the failed flag, since shared memory is tight.
+        sh_part_count[0] = atomicLoad(&bump.failed);
+    }
+#ifdef have_uniform
+    let failed = workgroupUniformLoad(&sh_part_count[0]);
+#else
+    workgroupBarrier();
+    let failed = sh_part_count[0];
+#endif
+    if (failed & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) != 0u {
         return;
     }
     let width_in_bins = (config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X;
@@ -207,8 +217,12 @@
                     workgroupBarrier();
                 }
                 sh_part_count[local_id.x] = part_start_ix + count;
+#ifdef have_uniform
+                ready_ix = workgroupUniformLoad(&sh_part_count[WG_SIZE - 1u]);
+#else
                 workgroupBarrier();
                 ready_ix = sh_part_count[WG_SIZE - 1u];
+#endif
                 partition_ix += WG_SIZE;
             }
             // use binary search to find draw object to read
diff --git a/shader/tile_alloc.wgsl b/shader/tile_alloc.wgsl
index b28166e..8d39e7c 100644
--- a/shader/tile_alloc.wgsl
+++ b/shader/tile_alloc.wgsl
@@ -29,6 +29,7 @@
 
 var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
 var<workgroup> sh_tile_offset: u32;
+var<workgroup> sh_atomic_failed: u32;
 
 @compute @workgroup_size(256)
 fn main(
@@ -37,8 +38,17 @@
 ) {
     // Exit early if prior stages failed, as we can't run this stage.
     // We need to check only prior stages, as if this stage has failed in another workgroup, 
-    // we still want to know this workgroup's memory requirement.   
-    if (atomicLoad(&bump.failed) & STAGE_BINNING) != 0u {
+    // we still want to know this workgroup's memory requirement.
+    if local_id.x == 0u {
+        sh_atomic_failed = atomicLoad(&bump.failed);
+    }
+#ifdef have_uniform
+    let failed = workgroupUniformLoad(&sh_atomic_failed);
+#else
+    workgroupBarrier();
+    let failed = sh_atomic_failed;
+#endif
+    if (failed & STAGE_BINNING) != 0u {
         return;
     }    
     // scale factors useful for converting coordinates to tiles
diff --git a/src/engine.rs b/src/engine.rs
index 1a0e9f8..d3f152e 100644
--- a/src/engine.rs
+++ b/src/engine.rs
@@ -357,7 +357,19 @@
                     let buffer = self
                         .bind_map
                         .get_or_create(*proxy, device, &mut self.pool)?;
+                    #[cfg(not(target_arch = "wasm32"))]
                     encoder.clear_buffer(buffer, *offset, *size);
+                    #[cfg(target_arch = "wasm32")]
+                    {
+                        // TODO: remove this workaround when wgpu implements clear_buffer
+                        // Also note: the semantics are wrong; this is ordered by the queue rather than the encoder.
+                        let size = match size {
+                            Some(size) => size.get(),
+                            None => proxy.size,
+                        };
+                        let zeros = vec![0; size as usize];
+                        queue.write_buffer(buffer, *offset, &zeros);
+                    }
                 }
                 Command::FreeBuf(proxy) => {
                     free_bufs.insert(proxy.id);
diff --git a/src/shaders.rs b/src/shaders.rs
index 6e09a97..4c7b3da 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -160,6 +160,11 @@
     let mut small_config = HashSet::new();
     small_config.insert("full".into());
     small_config.insert("small".into());
+    // TODO: remove this workaround when workgroupUniformLoad lands in naga
+    #[allow(unused_mut)]
+    let mut uniform = HashSet::new();
+    #[cfg(target_arch = "wasm32")]
+    uniform.insert("have_uniform".into());
     let pathtag_reduce = engine.add_shader(
         device,
         "pathtag_reduce",
@@ -286,7 +291,7 @@
     let tile_alloc = engine.add_shader(
         device,
         "tile_alloc",
-        preprocess::preprocess(shader!("tile_alloc"), &empty, &imports).into(),
+        preprocess::preprocess(shader!("tile_alloc"), &uniform, &imports).into(),
         &[
             BindType::Uniform,
             BindType::BufReadOnly,
@@ -321,7 +326,7 @@
     let coarse = engine.add_shader(
         device,
         "coarse",
-        preprocess::preprocess(shader!("coarse"), &empty, &imports).into(),
+        preprocess::preprocess(shader!("coarse"), &uniform, &imports).into(),
         &[
             BindType::Uniform,
             BindType::BufReadOnly,