Allow deeper blend stacks by spilling to a buffer (#657)

This brings in support for blend spilling (which was supported in the
old piet-gpu).

I don't have a good heuristic for how big to make the buffer. That is
something which will need to be addressed in #606 (or its successors). I
just guessed that 4096 spills would be fine. I think this is probably too
small - I suspect we'll get feedback from @TrueDoctor about this.

I have confirmed that the robustness works as expected with the GPU
shaders.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a0f9b2..76dfa8f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,8 @@
 
 ### Added
 
+- Support blends more than four layers deep ([#657][] by [@DJMcNab][])
+
 ### Changed
 
 - Breaking: Updated `wgpu` to 22.1.0. ([#635] by [@waywardmonkeys])
@@ -119,6 +121,7 @@
 [#630]: https://github.com/linebender/vello/pull/630
 [#631]: https://github.com/linebender/vello/pull/631
 [#635]: https://github.com/linebender/vello/pull/635
+[#657]: https://github.com/linebender/vello/pull/657
 
 <!-- Note that this still comparing against 0.2.0, because 0.2.1 is a cherry-picked patch -->
 [Unreleased]: https://github.com/linebender/vello/compare/v0.2.0...HEAD
diff --git a/examples/scenes/src/test_scenes.rs b/examples/scenes/src/test_scenes.rs
index 93e59c7..6fc9189 100644
--- a/examples/scenes/src/test_scenes.rs
+++ b/examples/scenes/src/test_scenes.rs
@@ -67,6 +67,7 @@
     two_point_radial(two_point_radial),
     brush_transform(brush_transform: animated),
     blend_grid(blend_grid),
+    deep_blend(deep_blend),
     conflation_artifacts(conflation_artifacts),
     labyrinth(labyrinth),
     robust_paths(robust_paths),
@@ -1057,6 +1058,42 @@
         }
     }
 
+    /// Pushes a stack of nested layers, to a depth chosen by `params.complexity`.
+    pub(super) fn deep_blend(scene: &mut Scene, params: &mut SceneParams) {
+        params.resolution = Some(Vec2::new(1000., 1000.));
+        let full_rect = Rect::from_origin_size((10., 10.), (900., 900.));
+        scene.fill(
+            Fill::EvenOdd,
+            Affine::IDENTITY,
+            Color::RED,
+            None,
+            &full_rect,
+        );
+        // (side length of the layer's square, fill colour) pairs, in push order.
+        let layer_specs = [
+            (800., Color::AQUA),
+            (700., Color::RED),
+            (600., Color::ALICE_BLUE),
+            (500., Color::YELLOW),
+            (400., Color::GREEN),
+            (300., Color::BLUE),
+            (200., Color::ORANGE),
+            (100., Color::WHITE),
+        ];
+        // The bound is capped at `len() - 1`, so the final entry is never used.
+        let layer_count = params.complexity.min(layer_specs.len() - 1);
+        let layers = &layer_specs[..layer_count];
+        for (side, colour) in layers {
+            let layer_rect = Rect::from_origin_size((10., 10.), (*side, *side));
+            scene.push_layer(Mix::Normal, 0.9, Affine::IDENTITY, &layer_rect);
+            scene.fill(Fill::EvenOdd, Affine::IDENTITY, colour, None, &full_rect);
+        }
+        // Every push happens before the first pop, then one pop per pushed layer.
+        for _ in layers {
+            scene.pop_layer();
+        }
+    }
+
     // Support functions
 
     pub(super) fn render_cardioid(scene: &mut Scene) {
diff --git a/vello/src/render.rs b/vello/src/render.rs
index bbd1c8c..952ef49 100644
--- a/vello/src/render.rs
+++ b/vello/src/render.rs
@@ -46,6 +46,7 @@
     gradient_image: ResourceProxy,
     info_bin_data_buf: ResourceProxy,
     image_atlas: ResourceProxy,
+    blend_spill_buf: ResourceProxy,
 
     out_image: ImageProxy,
 }
@@ -450,6 +451,10 @@
         recording.free_resource(bin_header_buf);
         recording.free_resource(path_buf);
         let out_image = ImageProxy::new(params.width, params.height, ImageFormat::Rgba8);
+        let blend_spill_buf = BufferProxy::new(
+            buffer_sizes.blend_spill.size_in_bytes().into(),
+            "blend_spill",
+        );
         self.fine_wg_count = Some(wg_counts.fine);
         self.fine_resources = Some(FineResources {
             aa_config: params.antialiasing_method,
@@ -460,6 +465,7 @@
             ptcl_buf,
             gradient_image,
             info_bin_data_buf,
+            blend_spill_buf: ResourceProxy::Buffer(blend_spill_buf),
             image_atlas: ResourceProxy::Image(image_atlas),
             out_image,
         });
@@ -510,6 +516,7 @@
                         fine.segments_buf,
                         fine.ptcl_buf,
                         fine.info_bin_data_buf,
+                        fine.blend_spill_buf,
                         ResourceProxy::Image(fine.out_image),
                         fine.gradient_image,
                         fine.image_atlas,
@@ -543,6 +550,7 @@
                         fine.segments_buf,
                         fine.ptcl_buf,
                         fine.info_bin_data_buf,
+                        fine.blend_spill_buf,
                         ResourceProxy::Image(fine.out_image),
                         fine.gradient_image,
                         fine.image_atlas,
diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs
index bf34bad..a58e0ed 100644
--- a/vello/src/shaders.rs
+++ b/vello/src/shaders.rs
@@ -211,6 +211,7 @@
         BindType::BufReadOnly,
         BindType::BufReadOnly,
         BindType::BufReadOnly,
+        BindType::Buffer,
         BindType::Image(ImageFormat::Rgba8),
         BindType::ImageRead(ImageFormat::Rgba8),
         BindType::ImageRead(ImageFormat::Rgba8),
diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs
index 8b44bb9..88da7fd 100644
--- a/vello_encoding/src/config.rs
+++ b/vello_encoding/src/config.rs
@@ -147,6 +147,9 @@
     pub seg_counts_size: u32,
     /// Size of segment buffer allocation (in [`PathSegment`]s).
     pub segments_size: u32,
+    /// Size of blend spill buffer (in `u32` pixels).
+    // TODO: Maybe store in TILE_WIDTH * TILE_HEIGHT blocks of pixels instead?
+    pub blend_size: u32,
     /// Size of per-tile command list buffer allocation (in `u32`s).
     pub ptcl_size: u32,
 }
@@ -184,6 +187,7 @@
                 tiles_size: buffer_sizes.tiles.len(),
                 seg_counts_size: buffer_sizes.seg_counts.len(),
                 segments_size: buffer_sizes.segments.len(),
+                blend_size: buffer_sizes.blend_spill.len(),
                 ptcl_size: buffer_sizes.ptcl.len(),
                 layout: *layout,
             },
@@ -352,6 +356,7 @@
     pub tiles: BufferSize<Tile>,
     pub seg_counts: BufferSize<SegmentCount>,
     pub segments: BufferSize<PathSegment>,
+    pub blend_spill: BufferSize<u32>,
     pub ptcl: BufferSize<u32>,
 }
 
@@ -395,6 +400,8 @@
         let lines = BufferSize::new(1 << 21);
         let seg_counts = BufferSize::new(1 << 21);
         let segments = BufferSize::new(1 << 21);
+        // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills.
+        let blend_spill = BufferSize::new(1 << 20);
         let ptcl = BufferSize::new(1 << 23);
         Self {
             path_reduced,
@@ -419,6 +426,7 @@
             tiles,
             seg_counts,
             segments,
+            blend_spill,
             ptcl,
         }
     }
diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl
index c28f8d2..6856396 100644
--- a/vello_shaders/shader/coarse.wgsl
+++ b/vello_shaders/shader/coarse.wgsl
@@ -444,8 +444,11 @@
         ptcl[cmd_offset] = CMD_END;
         var blend_ix = 0u;
         if max_blend_depth > BLEND_STACK_SPLIT {
-            let scratch_size = max_blend_depth * TILE_WIDTH * TILE_HEIGHT;
+            let scratch_size = (max_blend_depth - BLEND_STACK_SPLIT) * TILE_WIDTH * TILE_HEIGHT;
             blend_ix = atomicAdd(&bump.blend, scratch_size);
+            if blend_ix + scratch_size > config.blend_size {
+                atomicOr(&bump.failed, STAGE_COARSE);
+            }
         }
         ptcl[blend_offset] = blend_ix;
     }
diff --git a/vello_shaders/shader/fine.wgsl b/vello_shaders/shader/fine.wgsl
index 5af82f4..810f416 100644
--- a/vello_shaders/shader/fine.wgsl
+++ b/vello_shaders/shader/fine.wgsl
@@ -39,6 +39,9 @@
 var<storage> info: array<u32>;
 
 @group(0) @binding(4)
+var<storage, read_write> blend_spill: array<u32>;
+
+@group(0) @binding(5)
 #ifdef r8
 var output: texture_storage_2d<r8unorm, write>;
 #else
@@ -46,10 +49,10 @@
 #endif
 
 #ifdef full
-@group(0) @binding(5)
+@group(0) @binding(6)
 var gradients: texture_2d<f32>;
 
-@group(0) @binding(6)
+@group(0) @binding(7)
 var image_atlas: texture_2d<f32>;
 #endif
 
@@ -57,9 +60,9 @@
 #ifdef msaa
 
 #ifdef full
-const MASK_LUT_INDEX: u32 = 7;
+const MASK_LUT_INDEX: u32 = 8;
 #else
-const MASK_LUT_INDEX: u32 = 5;
+const MASK_LUT_INDEX: u32 = 6;
 #endif
 
 #ifdef msaa8
@@ -947,7 +950,13 @@
                         rgba[i] = vec4(0.0);
                     }
                 } else {
-                    // TODO: spill to memory
+                    let blend_in_scratch = clip_depth - BLEND_STACK_SPLIT;
+                    let local_tile_ix = local_id.x * PIXELS_PER_THREAD + local_id.y * TILE_WIDTH;
+                    let local_blend_start = blend_offset + blend_in_scratch * TILE_WIDTH * TILE_HEIGHT + local_tile_ix;
+                    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                        blend_spill[local_blend_start + i] = pack4x8unorm(rgba[i]);
+                        rgba[i] = vec4(0.0);
+                    }
                 }
                 clip_depth += 1u;
                 cmd_ix += 1u;
@@ -960,7 +969,10 @@
                     if clip_depth < BLEND_STACK_SPLIT {
                         bg_rgba = blend_stack[clip_depth][i];
                     } else {
-                        // load from memory
+                        let blend_in_scratch = clip_depth - BLEND_STACK_SPLIT;
+                        let local_tile_ix = local_id.x * PIXELS_PER_THREAD + local_id.y * TILE_WIDTH;
+                        let local_blend_start = blend_offset + blend_in_scratch * TILE_WIDTH * TILE_HEIGHT + local_tile_ix;
+                        bg_rgba = blend_spill[local_blend_start + i];
                     }
                     let bg = unpack4x8unorm(bg_rgba);
                     let fg = rgba[i] * area[i] * end_clip.alpha;
diff --git a/vello_shaders/shader/shared/config.wgsl b/vello_shaders/shader/shared/config.wgsl
index ef7b928..3391afd 100644
--- a/vello_shaders/shader/shared/config.wgsl
+++ b/vello_shaders/shader/shared/config.wgsl
@@ -1,7 +1,7 @@
 // Copyright 2022 the Vello Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
-// This must be kept in sync with the struct in src/encoding/resolve.rs
+// This must be kept in sync with `ConfigUniform` in `vello_encoding/src/config.rs`
 struct Config {
     width_in_tiles: u32,
     height_in_tiles: u32,
@@ -38,6 +38,7 @@
     tiles_size: u32,
     seg_counts_size: u32,
     segments_size: u32,
+    blend_size: u32,
     ptcl_size: u32,
 }
 
@@ -54,6 +55,9 @@
 // Not currently supporting non-square tiles
 let TILE_SCALE = 0.0625;
 
+// The "split" point between using local memory in fine for the blend stack and spilling to the blend_spill buffer.
+// A higher value will increase vgpr ("register") pressure in fine, but decrease required dynamic memory allocation.
+// If changing, also change in vello_shaders/src/cpu/coarse.rs.
 let BLEND_STACK_SPLIT = 4u;
 
 // The following are computed in draw_leaf from the generic gradient parameters
diff --git a/vello_shaders/src/cpu/coarse.rs b/vello_shaders/src/cpu/coarse.rs
index dddcd4b..88ec603 100644
--- a/vello_shaders/src/cpu/coarse.rs
+++ b/vello_shaders/src/cpu/coarse.rs
@@ -1,6 +1,8 @@
 // Copyright 2023 the Vello Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
+use std::cmp::max;
+
 use vello_encoding::{
     BinHeader, BumpAllocators, ConfigUniform, DrawMonoid, DrawTag, Path, Tile,
     DRAW_INFO_FLAGS_FILL_RULE_BIT,
@@ -11,10 +13,18 @@
     CMD_LIN_GRAD, CMD_RAD_GRAD, CMD_SOLID, CMD_SWEEP_GRAD, PTCL_INITIAL_ALLOC,
 };
 
+// Tiles per bin
 const N_TILE_X: usize = 16;
 const N_TILE_Y: usize = 16;
 const N_TILE: usize = N_TILE_X * N_TILE_Y;
 
+// If changing, also change in vello_shaders/shader/shared/config.wgsl.
+const BLEND_STACK_SPLIT: u32 = 4;
+
+// Pixels per tile
+const TILE_WIDTH: u32 = 16;
+const TILE_HEIGHT: u32 = 16;
+
 const PTCL_INCREMENT: u32 = 256;
 const PTCL_HEADROOM: u32 = 2;
 
@@ -219,6 +229,8 @@
             let blend_offset = tile_state.cmd_offset;
             tile_state.cmd_offset += 1;
             let mut clip_depth = 0;
+            let mut render_blend_depth = 0;
+            let mut max_blend_depth = 0_u32;
             let mut clip_zero_depth = 0;
             for drawobj_ix in &compacted[tile_ix] {
                 let drawtag = scene[(drawtag_base + drawobj_ix) as usize];
@@ -306,7 +318,10 @@
                                     clip_zero_depth = clip_depth + 1;
                                 } else {
                                     tile_state.write_begin_clip(config, bump, ptcl);
-                                    // TODO: update blend depth
+                                    // TODO: Do we need to track this separately? It seems to
+                                    // always be the same as `clip_depth` in this code path.
+                                    render_blend_depth += 1;
+                                    max_blend_depth = max(render_blend_depth, max_blend_depth);
                                 }
                                 clip_depth += 1;
                             }
@@ -317,6 +332,7 @@
                                 let blend = scene[dd as usize];
                                 let alpha = f32::from_bits(scene[dd as usize + 1]);
                                 tile_state.write_end_clip(config, bump, ptcl, blend, alpha);
+                                render_blend_depth -= 1;
                             }
                             _ => todo!(),
                         }
@@ -338,7 +354,8 @@
 
             if bin_tile_x + tile_x < width_in_tiles && bin_tile_y + tile_y < height_in_tiles {
                 ptcl[tile_state.cmd_offset as usize] = CMD_END;
-                let scratch_size = 0; // TODO: actually compute blend depth
+                let scratch_size =
+                    (max_blend_depth.saturating_sub(BLEND_STACK_SPLIT)) * TILE_WIDTH * TILE_HEIGHT;
                 ptcl[blend_offset as usize] = bump.blend;
                 bump.blend += scratch_size;
             }
diff --git a/vello_tests/snapshots/deep_blend.png b/vello_tests/snapshots/deep_blend.png
new file mode 100644
index 0000000..8375462
--- /dev/null
+++ b/vello_tests/snapshots/deep_blend.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69f822f9fbe2de48420f5dee7ab0a502e022e65e08f99f2f16823927e8b63f95
+size 7000
diff --git a/vello_tests/tests/compare_gpu_cpu.rs b/vello_tests/tests/compare_gpu_cpu.rs
index e906f31..860728f 100644
--- a/vello_tests/tests/compare_gpu_cpu.rs
+++ b/vello_tests/tests/compare_gpu_cpu.rs
@@ -77,7 +77,15 @@
 #[cfg_attr(skip_gpu_tests, ignore)]
 fn compare_fill_types() {
     let test_scene = test_scenes::fill_types();
-    assert_eq!(test_scene.config.name, "fill_types");
     let params = TestParams::new("compare_fill_types", 1400, 700);
     compare_test_scene(test_scene, params);
 }
+
+#[test]
+#[cfg_attr(skip_gpu_tests, ignore)]
+fn compare_deep_blend() {
+    let scene_under_test = test_scenes::deep_blend();
+    // Check that the scene handed back is the one this test is named after.
+    assert_eq!(scene_under_test.config.name, "deep_blend");
+    compare_test_scene(scene_under_test, TestParams::new("compare_deep_blend", 150, 150));
+}
diff --git a/vello_tests/tests/snapshots.rs b/vello_tests/tests/snapshots.rs
index 033af03..4abcc70 100644
--- a/vello_tests/tests/snapshots.rs
+++ b/vello_tests/tests/snapshots.rs
@@ -71,7 +71,14 @@
 #[cfg_attr(skip_gpu_tests, ignore)]
 fn snapshot_fill_types() {
     let test_scene = test_scenes::fill_types();
-    assert_eq!(test_scene.config.name, "fill_types");
     let params = TestParams::new("fill_types", 700, 350);
     snapshot_test_scene(test_scene, params);
 }
+
+#[test]
+#[cfg_attr(skip_gpu_tests, ignore)]
+fn snapshot_deep_blend() {
+    let test_scene = test_scenes::deep_blend();
+    assert_eq!(test_scene.config.name, "deep_blend"); // Guard against a mix-up of scenes.
+    snapshot_test_scene(test_scene, TestParams::new("deep_blend", 200, 200));
+}