prototype gpu merge
diff --git a/Cargo.lock b/Cargo.lock
index 82f33ab..079fa00 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3922,6 +3922,8 @@
name = "vello_hybrid_winit"
version = "0.0.0"
dependencies = [
+ "bytemuck",
+ "futures-intrusive",
"pollster",
"vello_common",
"vello_example_scenes",
diff --git a/sparse_strips/vello_bench/src/data.rs b/sparse_strips/vello_bench/src/data.rs
index 989eeec..b860060 100644
--- a/sparse_strips/vello_bench/src/data.rs
+++ b/sparse_strips/vello_bench/src/data.rs
@@ -147,6 +147,7 @@
/// Get the alpha buffer and rendered strips.
pub fn strips(&self) -> (Vec<u8>, Vec<Strip>) {
let mut strip_buf = vec![];
+ let mut pmt_buf = vec![];
let mut alpha_buf = vec![];
let lines = self.lines();
let tiles = self.sorted_tiles();
@@ -155,6 +156,7 @@
Level::fallback(),
&tiles,
&mut strip_buf,
+ &mut pmt_buf,
&mut alpha_buf,
Fill::NonZero,
None,
diff --git a/sparse_strips/vello_bench/src/strip.rs b/sparse_strips/vello_bench/src/strip.rs
index 8690941..f395dec 100644
--- a/sparse_strips/vello_bench/src/strip.rs
+++ b/sparse_strips/vello_bench/src/strip.rs
@@ -17,6 +17,7 @@
g.bench_function(format!("{}_{}", $item.name.clone(), $suffix), |b| {
let mut strip_buf = vec![];
+ let mut pmt_buf = vec![];
let mut alpha_buf = vec![];
b.iter(|| {
@@ -27,6 +28,7 @@
$level,
&tiles,
&mut strip_buf,
+ &mut pmt_buf,
&mut alpha_buf,
Fill::NonZero,
None,
diff --git a/sparse_strips/vello_common/src/recording.rs b/sparse_strips/vello_common/src/recording.rs
index e840e84..481364f 100644
--- a/sparse_strips/vello_common/src/recording.rs
+++ b/sparse_strips/vello_common/src/recording.rs
@@ -11,7 +11,7 @@
#[cfg(feature = "text")]
use crate::peniko::FontData;
use crate::peniko::{BlendMode, Fill};
-use crate::strip::Strip;
+use crate::strip::{PreMergeTile, Strip};
use crate::strip_generator::StripStorage;
use alloc::vec::Vec;
@@ -59,6 +59,10 @@
&self.strip_storage.strips
}
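+
+    /// Get pre-merge tiles as a slice.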
+ pub fn pre_merge_tiles(&self) -> &[PreMergeTile] {
+ &self.strip_storage.pre_merge_tiles
+ }
+
/// Get alphas as slice
pub fn alphas(&self) -> &[u8] {
&self.strip_storage.alphas
@@ -177,8 +181,12 @@
}
/// Get cached strips.
- pub fn get_cached_strips(&self) -> (&[Strip], &[u8]) {
- (self.cached_strips.strips(), self.cached_strips.alphas())
+ pub fn get_cached_strips(&self) -> (&[Strip], &[PreMergeTile], &[u8]) {
+ (
+ self.cached_strips.strips(),
+ self.cached_strips.pre_merge_tiles(),
+ self.cached_strips.alphas(),
+ )
}
/// Takes cached strip buffers.
diff --git a/sparse_strips/vello_common/src/strip.rs b/sparse_strips/vello_common/src/strip.rs
index f2e5b7e..c8f99be 100644
--- a/sparse_strips/vello_common/src/strip.rs
+++ b/sparse_strips/vello_common/src/strip.rs
@@ -2,13 +2,17 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT
//! Rendering strips.
-
use crate::flatten::Line;
use crate::peniko::Fill;
use crate::tile::{Tile, Tiles};
use crate::util::f32_to_u8;
use alloc::vec::Vec;
+use bytemuck::{Pod, Zeroable};
use fearless_simd::*;
+use std::format;
+use std::println;
+use std::string::String;
+use std::vec;
/// A strip.
#[derive(Debug, Clone, Copy)]
@@ -85,17 +89,115 @@
}
}
+/// A pre-merge tile: the per-tile record consumed by the GPU merge and stitch compute passes.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Zeroable, Pod)]
+pub struct PreMergeTile {
+ /// The index into the alpha buffer that an "end tile" should write to.
+ pub alpha_index: u32,
+    /// Packed flags and block-local start indices; see the bit layout below.
+ pub packed_info: u32,
+    /// The exclusive prefix sum of the signed winding number; uploading it lets the shader skip that scan.
+ pub scanned_winding: i32,
+ /// Necessary padding for alignment
+ pub padding: u32,
+    /// Line start point, translated into tile-local coordinates.
+ pub p0: [f32; 2],
+    /// Line end point, translated into tile-local coordinates.
+ pub p1: [f32; 2],
+}
+
+// MSB LSB
+// 31------------------21|20------------------12|11-------------------3|2|1|0|
+// | Unused (11) | Seg Start ID (9) | Tile Start ID (9) |F|T|E|
+//
+// F = FILL_RULE_MASK
+// T = IS_TILE_FIRST_COL_MASK
+// E = IS_END_TILE_MASK
+const IS_END_TILE_MASK: u32 = 1 << 0;
+const IS_TILE_FIRST_COL_MASK: u32 = 1 << 1;
+const FILL_RULE_MASK: u32 = 1 << 2;
+const TILE_START_ID_SHIFT: u32 = 3;
+const SEG_START_ID_SHIFT: u32 = 12;
+const ID_MASK: u32 = 0x1ff;
+const INVALID_ID: u32 = 256;
+const BLOCK_DIM_LG: u32 = 8;
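+//
+// For example, `packed_info = (3 << SEG_START_ID_SHIFT) | (2 << TILE_START_ID_SHIFT) | FILL_RULE_MASK | IS_END_TILE_MASK`
+// decodes as: an end tile, not in the first tile column, non-zero fill rule, `tile_start_id == 2`, and `seg_start_id == 3`.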
+
+impl PreMergeTile {
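+    /// Sets or clears the flag marking the last pre-merge tile at a given tile location.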
+ pub fn set_is_end_tile(&mut self, is_end: bool) {
+ if is_end {
+ self.packed_info |= IS_END_TILE_MASK;
+ } else {
+ self.packed_info &= !IS_END_TILE_MASK;
+ }
+ }
+
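+    /// Returns whether this is the last pre-merge tile at its tile location.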
+ pub fn is_end_tile(&self) -> bool {
+ (self.packed_info & IS_END_TILE_MASK) != 0
+ }
+
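+    /// Marks this tile as lying in the first tile column (`tile.x == 0`); the bit is never cleared.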
+ pub fn set_is_tile_first_col(&mut self, is_first: bool) {
+ if is_first {
+ self.packed_info |= IS_TILE_FIRST_COL_MASK;
+ }
+ }
+
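+    /// Returns whether this tile lies in the first tile column.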
+ pub fn is_tile_first_col(&self) -> bool {
+ (self.packed_info & IS_TILE_FIRST_COL_MASK) != 0
+ }
+
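+    /// Encodes the fill rule: `true` sets the non-zero bit, while leaving it unset means even-odd.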
+    pub fn set_fill_rule(&mut self, is_fill_rule_non_zero: bool) {
+ if is_fill_rule_non_zero {
+ self.packed_info |= FILL_RULE_MASK;
+ }
+ }
+
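+    /// Packs the block-local index of the first pre-merge tile at this tile's location
+    /// (or `INVALID_ID` if it lies in an earlier block). Only ORs bits in, so the field must still be zero.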
+ pub fn set_tile_start_id(&mut self, id: u32) {
+ self.packed_info |= (id & ID_MASK) << TILE_START_ID_SHIFT;
+ }
+
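+    /// Returns the block-local index of the first pre-merge tile at this tile's location.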
+ pub fn tile_start_id(&self) -> u32 {
+ (self.packed_info >> TILE_START_ID_SHIFT) & ID_MASK
+ }
+
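+    /// Packs the block-local index of the pre-merge tile that starts the current segment
+    /// (or `INVALID_ID` if it lies in an earlier block). Only ORs bits in, so the field must still be zero.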
+ pub fn set_seg_start_id(&mut self, id: u32) {
+ self.packed_info |= (id & ID_MASK) << SEG_START_ID_SHIFT;
+ }
+
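+    /// Returns the block-local index of the pre-merge tile that starts the current segment.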
+ pub fn seg_start_id(&self) -> u32 {
+ (self.packed_info >> SEG_START_ID_SHIFT) & ID_MASK
+ }
+}
+
/// Render the tiles stored in `tiles` into the strip and alpha buffer.
+/// The SIMD level and aliasing threshold are currently unused by this path.
pub fn render(
- level: Level,
+ _level: Level,
tiles: &Tiles,
- strip_buf: &mut Vec<Strip>,
- alpha_buf: &mut Vec<u8>,
+ strip_buf: &mut Vec<Strip>, // Output parameter for strips
+    pmt_buf: &mut Vec<PreMergeTile>, // Output parameter for pre-merge tiles
+ alpha_buf: &mut Vec<u8>, // Output parameter for alpha bytes
fill_rule: Fill,
aliasing_threshold: Option<u8>,
lines: &[Line],
) {
- dispatch!(level, simd => render_impl(simd, tiles, strip_buf, alpha_buf, fill_rule, aliasing_threshold, lines));
+ // let mut winding_fine_ref: Vec<[[f32; 4]; 4]> = Vec::new();
+ // let mut winding_acc_ref: Vec<[f32; 4]> = Vec::new();
+ // let mut winding_coarse_ref: Vec<i32> = Vec::new();
+ // let mut winding_fine_comp: Vec<[[f32; 4]; 4]> = Vec::new();
+ // let mut winding_acc_comp: Vec<[f32; 4]> = Vec::new();
+ // let mut winding_coarse_comp: Vec<i32> = Vec::new();
+
+ prepare_gpu_inputs(tiles, strip_buf, pmt_buf, alpha_buf, fill_rule, lines);
+
+ // compare_windings(
+ // &winding_fine_ref,
+ // &winding_coarse_ref,
+ // &winding_acc_ref,
+ // &winding_fine_comp,
+ // &winding_coarse_comp,
+ // &winding_acc_comp,
+ // );
}
fn render_impl<S: Simd>(
@@ -106,6 +208,9 @@
fill_rule: Fill,
aliasing_threshold: Option<u8>,
lines: &[Line],
+ winding_fine_ref: &mut Vec<[[f32; 4]; 4]>,
+ winding_acc_ref: &mut Vec<[f32; 4]>,
+ winding_coarse_ref: &mut Vec<i32>,
) {
if tiles.is_empty() {
return;
@@ -155,6 +260,10 @@
// Push out the winding as an alpha mask when we move to the next location (i.e., a tile
// without the same location).
if !prev_tile.same_loc(&tile) {
+ //winding_fine_ref.push(location_winding.map(|v| v.val));
+ // winding_acc_ref.push(accumulated_winding.val);
+ // winding_coarse_ref.push(winding_delta);
+
match fill_rule {
Fill::NonZero => {
let p1 = f32x4::splat(s, 0.5);
@@ -203,7 +312,7 @@
);
}
- alpha_buf.extend_from_slice(&u8_vals.val);
+ //alpha_buf.extend_from_slice(&u8_vals.val);
#[expect(clippy::needless_range_loop, reason = "dimension clarity")]
for x in 0..Tile::WIDTH as usize {
@@ -213,26 +322,26 @@
// Push out the strip if we're moving to a next strip.
if !prev_tile.same_loc(&tile) && !prev_tile.prev_loc(&tile) {
- debug_assert_eq!(
- (prev_tile.x + 1) * Tile::WIDTH - strip.x,
- ((alpha_buf.len() - strip.alpha_idx() as usize) / usize::from(Tile::HEIGHT)) as u16,
- "The number of columns written to the alpha buffer should equal the number of columns spanned by this strip."
- );
- strip_buf.push(strip);
+ // debug_assert_eq!(
+ // (prev_tile.x + 1) * Tile::WIDTH - strip.x,
+ // ((alpha_buf.len() - strip.alpha_idx() as usize) / usize::from(Tile::HEIGHT)) as u16,
+ // "The number of columns written to the alpha buffer should equal the number of columns spanned by this strip."
+ // );
+ //strip_buf.push(strip);
let is_sentinel = tile_idx == tiles.len() as usize;
if !prev_tile.same_row(&tile) {
// Emit a final strip in the row if there is non-zero winding for the sparse fill,
// or unconditionally if we've reached the sentinel tile to end the path (the
// `alpha_idx` field is used for width calculations).
- if winding_delta != 0 || is_sentinel {
- strip_buf.push(Strip::new(
- u16::MAX,
- prev_tile.y * Tile::HEIGHT,
- alpha_buf.len() as u32,
- should_fill(winding_delta),
- ));
- }
+ // if winding_delta != 0 || is_sentinel {
+ // strip_buf.push(Strip::new(
+ // u16::MAX,
+ // prev_tile.y * Tile::HEIGHT,
+ // alpha_buf.len() as u32,
+ // should_fill(winding_delta),
+ // ));
+ // }
winding_delta = 0;
accumulated_winding = f32x4::splat(s, 0.0);
@@ -405,10 +514,739 @@
// The trapezoidal area enclosed between the line and the right edge of the pixel
// square.
let area = 0.5 * h * (2. * px_right_x - line_px_right_yx - line_px_left_yx);
+ let local_contribution = acc.madd(sign, area);
+ if x_idx == 0 {
+ //println!("{}: {:<10.4} ", tile_idx, local_contribution.val[0]);
+ }
+ location_winding[x_idx as usize] = location_winding[x_idx as usize] + local_contribution;
+ acc = acc.madd(sign, h);
location_winding[x_idx as usize] += area.madd(sign, acc);
acc = h.madd(sign, acc);
}
- accumulated_winding += acc;
+ accumulated_winding = accumulated_winding + acc;
+ }
+}
+
+/// Prepares the GPU inputs: emits strips and one `PreMergeTile` per tile for the merge/stitch compute passes.
+fn prepare_gpu_inputs(
+ tiles: &Tiles,
+ strip_buf: &mut Vec<Strip>,
+ pre_merge_buf: &mut Vec<PreMergeTile>,
+ alpha_buf: &mut Vec<u8>,
+ fill_rule: Fill,
+ lines: &[Line],
+) {
+ if tiles.is_empty() {
+ return;
+ }
+
+ let should_fill = |winding: i32| match fill_rule {
+ Fill::NonZero => winding != 0,
+ Fill::EvenOdd => winding % 2 != 0,
+ };
+
+ let mut winding_delta: i32 = 0;
+ let mut prev_tile = *tiles.get(0);
+ let mut alpha_offset: u32 = 0;
+ let initial_alpha_len = alpha_buf.len() as u32;
+
+ const SENTINEL: Tile = Tile::new(u16::MAX, u16::MAX, 0, false);
+
+ let mut start_tile_idx = 0;
+ let mut start_seg_idx = 0;
+ let mut strip = Strip::new(
+ prev_tile.x * Tile::WIDTH,
+ prev_tile.y * Tile::HEIGHT,
+ initial_alpha_len,
+ false,
+ );
+ for (tile_idx, tile) in tiles.iter().copied().chain([SENTINEL]).enumerate() {
+ let is_start_tile = tile_idx == 0 || !prev_tile.same_loc(&tile);
+ let is_start_segment = tile_idx == 0 || (is_start_tile && !prev_tile.prev_loc(&tile));
+ if is_start_tile {
+ start_tile_idx = pre_merge_buf.len();
+ if tile_idx > 0 {
+ alpha_offset += (Tile::WIDTH * Tile::HEIGHT) as u32;
+ }
+ }
+
+ if is_start_segment {
+ start_seg_idx = pre_merge_buf.len();
+ strip_buf.push(strip);
+
+ let is_sentinel = tile_idx == tiles.len() as usize;
+ if !prev_tile.same_row(&tile) {
+ if winding_delta != 0 || is_sentinel {
+ strip_buf.push(Strip::new(
+ u16::MAX,
+ prev_tile.y * Tile::HEIGHT,
+ initial_alpha_len + alpha_offset,
+ should_fill(winding_delta),
+ ));
+ }
+ winding_delta = 0;
+ }
+
+ if is_sentinel {
+ break;
+ }
+
+ strip = Strip::new(
+ tile.x * Tile::WIDTH,
+ tile.y * Tile::HEIGHT,
+ initial_alpha_len + alpha_offset,
+ should_fill(winding_delta),
+ );
+ }
+
+ let line = lines[tile.line_idx() as usize];
+ let tile_left_x = f32::from(tile.x) * f32::from(Tile::WIDTH);
+ let tile_top_y = f32::from(tile.y) * f32::from(Tile::HEIGHT);
+ let p0_x = line.p0.x - tile_left_x;
+ let p0_y = line.p0.y - tile_top_y;
+ let p1_x = line.p1.x - tile_left_x;
+ let p1_y = line.p1.y - tile_top_y;
+
+ let mut pmt = PreMergeTile {
+ alpha_index: initial_alpha_len + alpha_offset,
+ packed_info: 0,
+ scanned_winding: winding_delta,
+ padding: 0,
+ p0: [p0_x, p0_y],
+ p1: [p1_x, p1_y],
+ };
+
+ let sign = (p0_y - p1_y).signum();
+ let signed_winding = sign as i32 * tile.winding() as i32;
+ winding_delta += signed_winding;
+
+ pmt.set_is_end_tile(
+ tile_idx == tiles.len() as usize - 1 ||
+ !tiles.get(tile_idx as u32 + 1).same_loc(&tile),
+ );
+ pmt.set_is_tile_first_col(tile.x == 0);
+
+ let block_idx = pre_merge_buf.len() >> BLOCK_DIM_LG << BLOCK_DIM_LG;
+ let start_tile_pack = if start_tile_idx < block_idx {
+ INVALID_ID
+ } else {
+ (start_tile_idx - block_idx) as u32
+ };
+ pmt.set_tile_start_id(start_tile_pack);
+
+ let start_seg_pack = if start_seg_idx < block_idx {
+ INVALID_ID
+ } else {
+ (start_seg_idx - block_idx) as u32
+ };
+ pmt.set_seg_start_id(start_seg_pack);
+ pmt.set_fill_rule(fill_rule == Fill::NonZero);
+
+ pre_merge_buf.push(pmt);
+ prev_tile = tile;
+ }
+
+    // TODO: the alpha buffer should be removed in favor of a single accumulated u32; only its length matters here.
+ alpha_buf.resize(alpha_buf.len() + alpha_offset as usize, 0);
+}
+
+
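+/// Per-tile record of whether stitching across partitions is still required, plus the partition that processed the tile.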
+#[derive(Debug, Clone, Copy, Default)]
+pub struct StitchIndicator {
+ /// True if the tile's location dependency must be stitched.
+ pub loc_stitch_required: bool,
+ /// True if the tile's accumulation dependency must be stitched.
+ pub acc_stitch_required: bool,
+ /// The ID of the partition (workgroup) that processed this tile.
+ pub partition_id: usize,
+}
+
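+/// Per-partition (workgroup) summary consulted during the stitch lookback.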
+#[derive(Debug, Clone, Copy, Default)]
+pub struct PartitionIndicator {
+ /// Does this partition contain a `seg_start`?
+ pub has_seg_start: bool,
+ /// Does this partition contain a `tile_start`?
+ pub has_tile_start: bool,
+}
+
+/// Simulates the merge shader on the CPU using the `PreMergeTile` intermediate representation.
+/// Note: this will NOT work with the current `prepare_gpu_inputs`, because it expects indexes that
+/// are local to a single `strip::render` call, whereas `prepare_gpu_inputs` now emits global
+/// indexes based on the pre-merge buffer length.
+fn cpu_merge(
+ pmt_buf: &mut Vec<PreMergeTile>,
+ alpha_buf: &mut Vec<u8>,
+ fill_rule: Fill,
+ winding_fine_comp: &mut Vec<[[f32; 4]; 4]>,
+ winding_acc_comp: &mut Vec<[f32; 4]>,
+ winding_coarse_comp: &mut Vec<i32>,
+) {
+ if pmt_buf.is_empty() {
+ return;
+ }
+
+ let pmt_count = pmt_buf.len();
+ let mut temp_acc = vec![[0f32; Tile::HEIGHT as usize]; pmt_count];
+ let mut temp_fine = vec![[[0f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize]; pmt_count];
+
+ for gid in 0..pmt_buf.len() {
+ let pmt = pmt_buf[gid];
+ let p0_x = pmt.p0[0];
+ let p0_y = pmt.p0[1];
+ let p1_x = pmt.p1[0];
+ let p1_y = pmt.p1[1];
+
+ if (p0_y - p1_y).abs() < 1e-6 {
+ continue;
+ }
+
+ let (line_top_y, line_top_x, line_bottom_y, line_bottom_x) = if p0_y < p1_y {
+ (p0_y, p0_x, p1_y, p1_x)
+ } else {
+ (p1_y, p1_x, p0_y, p0_x)
+ };
+ let (line_left_x, line_left_y, line_right_x) = if p0_x < p1_x {
+ (p0_x, p0_y, p1_x)
+ } else {
+ (p1_x, p1_y, p0_x)
+ };
+
+
+        let dx = line_bottom_x - line_top_x; // dx drives the slope computations below.
+        let is_vertical = dx.abs() < 1e-6; // Use an epsilon so near-vertical lines take the vertical path.
+        let dy = line_bottom_y - line_top_y; // Horizontal (and near-horizontal) lines were skipped above.
+ let y_slope = if dx == 0.0 { f32::MAX } else { dy / dx };
+ let x_slope = if dy == 0.0 { f32::MAX } else { dx / dy };
+ let sign = (p0_y - p1_y).signum();
+
+ if pmt.is_tile_first_col() && line_left_x < 0. {
+ let (ymin, ymax) = if is_vertical {
+ (line_top_y, line_bottom_y)
+ } else {
+ let line_viewport_left_y = (line_top_y - line_top_x * y_slope)
+ .max(line_top_y)
+ .min(line_bottom_y);
+ (
+ f32::min(line_left_y, line_viewport_left_y),
+ f32::max(line_left_y, line_viewport_left_y),
+ )
+ };
+
+ let px_top_y_arr = [0.0, 1.0, 2.0, 3.0];
+ let mut h = [0.0; Tile::HEIGHT as usize];
+ for y in 0..Tile::HEIGHT as usize {
+ let px_top_y: f32 = px_top_y_arr[y];
+ let px_bottom_y: f32 = 1.0 + px_top_y;
+ let ymin_clamped = px_top_y.max(ymin);
+ let ymax_clamped = px_bottom_y.min(ymax);
+ h[y] = (ymax_clamped - ymin_clamped).max(0.0);
+ temp_acc[gid][y] = sign * h[y];
+ }
+
+ if line_right_x < 0. {
+ for x_idx in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ temp_fine[gid][x_idx][y] = temp_acc[gid][y];
+ }
+ }
+ continue;
+ }
+ }
+
+ if is_vertical {
+ let line_x = line_top_x;
+ let event_x_idx = line_x.floor() as usize;
+ for x_idx in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ let mut h = 0.0;
+ let mut area = 0.0;
+ if x_idx == event_x_idx {
+ let px_top_y = y as f32;
+ let px_bottom_y = 1.0 + px_top_y;
+
+ let ymin = line_top_y.max(px_top_y);
+ let ymax = line_bottom_y.min(px_bottom_y);
+ let coverage_right = (x_idx as f32 + 1.0) - line_x;
+ h = (ymax - ymin).max(0.0);
+ area = h * coverage_right;
+ }
+ temp_fine[gid][x_idx][y] = temp_acc[gid][y] + sign * area;
+ temp_acc[gid][y] += sign * h;
+ }
+ }
+ } else {
+ for x_idx in 0..Tile::WIDTH {
+ let x_idx_f = x_idx as f32;
+ let px_left_x = x_idx_f;
+ let px_right_x = 1.0 + x_idx_f;
+
+ for y in 0..Tile::HEIGHT as usize {
+ let px_top_y = y as f32;
+ let px_bottom_y = 1.0 + px_top_y;
+ let ymin = line_top_y.max(px_top_y);
+ let ymax = line_bottom_y.min(px_bottom_y);
+
+ let line_px_left_y = (line_top_y + (px_left_x - line_top_x) * y_slope)
+ .max(ymin)
+ .min(ymax);
+ let line_px_right_y = (line_top_y + (px_right_x - line_top_x) * y_slope)
+ .max(ymin)
+ .min(ymax);
+
+ let line_px_left_yx = line_top_x + (line_px_left_y - line_top_y) * x_slope;
+ let line_px_right_yx = line_top_x + (line_px_right_y - line_top_y) * x_slope;
+ let h = (line_px_right_y - line_px_left_y).abs();
+ let area = 0.5 * h * (2. * px_right_x - line_px_right_yx - line_px_left_yx);
+
+ temp_fine[gid][x_idx as usize][y] = temp_acc[gid][y] + sign * area;
+ temp_acc[gid][y] += sign * h;
+ }
+ }
+ }
+ }
+
+ const BLOCK_DIM: usize = 256;
+    let grid_dim: usize = (pmt_buf.len() + BLOCK_DIM - 1) / BLOCK_DIM;
+ let mut stitch_indicator: Vec<StitchIndicator> = Vec::new();
+ let mut part_indicator: Vec<PartitionIndicator> = Vec::new();
+ let mut stitch_loc: Vec<[[f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize]> = Vec::new();
+ let mut part_loc: Vec<[[f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize]> = Vec::new();
+ let mut part_acc_seg: Vec<[f32; Tile::HEIGHT as usize]> = Vec::new();
+ let mut part_acc_mine: Vec<[f32; Tile::HEIGHT as usize]> = Vec::new();
+
+    for wgid in 0..grid_dim {
+ let mut wg_loc = [[[0f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize]; BLOCK_DIM];
+ let mut wg_acc = [[0f32; Tile::HEIGHT as usize]; BLOCK_DIM];
+
+ // dirty, blend the scan with the stitch preparation
+ for tid in 0..BLOCK_DIM {
+ let gid = tid + wgid * BLOCK_DIM;
+ if gid < pmt_count {
+ wg_acc[tid] = temp_acc[gid];
+ wg_loc[tid] = temp_fine[gid];
+ }
+
+ if tid > 0 {
+ for y in 0..Tile::HEIGHT as usize {
+ wg_acc[tid][y] += wg_acc[tid - 1][y];
+ }
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ wg_loc[tid][x][y] += wg_loc[tid - 1][x][y];
+ }
+ }
+ }
+ // barrier
+
+ // now duplicate previous logic. . .
+ if gid < pmt_count {
+ let pmt = pmt_buf[gid];
+ if pmt.is_end_tile() || tid == BLOCK_DIM - 1 {
+ let mut my_acc = wg_acc[tid];
+ let mut my_loc = wg_loc[tid];
+
+ let tile_start_id = pmt.tile_start_id() as usize;
+ let seg_start_id = pmt.seg_start_id() as usize;
+ let tile_start_valid = tile_start_id != INVALID_ID as usize;
+ let seg_start_valid = seg_start_id != INVALID_ID as usize;
+
+ if tile_start_valid {
+ if tile_start_id != 0 {
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ my_loc[x][y] -= wg_loc[tile_start_id - 1][x][y];
+ }
+ }
+
+ my_acc = wg_acc[tile_start_id - 1];
+ } else {
+ my_acc = [0f32; 4];
+ }
+ }
+
+ if seg_start_valid {
+ let scanned_winding = pmt_buf[wgid * BLOCK_DIM + seg_start_id].scanned_winding as f32;
+ if seg_start_id != 0 {
+ for y in 0..Tile::HEIGHT as usize {
+ my_acc[y] += scanned_winding - wg_acc[seg_start_id - 1][y];
+ }
+ } else {
+ for y in 0..Tile::HEIGHT as usize {
+ my_acc[y] += scanned_winding;
+ }
+ }
+ }
+
+ if tile_start_valid {
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ my_loc[x][y] += my_acc[y];
+ }
+ }
+ }
+
+ let mut stitch = StitchIndicator {
+ loc_stitch_required: false,
+ acc_stitch_required: false,
+ partition_id: 0,
+ };
+
+ // only end tiles participate in the write out
+ if pmt.is_end_tile() {
+ if tile_start_valid {
+ if seg_start_valid {
+ temp_fine[gid] = my_loc;
+ } else {
+ stitch.acc_stitch_required = true;
+ stitch.partition_id = wgid;
+ }
+ } else {
+ stitch.loc_stitch_required = true;
+ stitch.partition_id = wgid;
+ }
+
+ }
+ stitch_loc.push(my_loc);
+ stitch_indicator.push(stitch);
+
+ if tid == BLOCK_DIM - 1 {
+ // Full OR tile - seg_start
+ part_acc_mine.push(my_acc);
+
+ // Push Full
+ let mut seg_acc = [0f32; Tile::HEIGHT as usize];
+ if seg_start_valid {
+ let scanned_winding = pmt_buf[wgid * BLOCK_DIM + seg_start_id].scanned_winding as f32;
+ if seg_start_id != 0 {
+ for y in 0..Tile::HEIGHT as usize {
+ seg_acc[y] = scanned_winding + wg_acc[tid][y] -
+ wg_acc[seg_start_id - 1][y];
+ }
+ } else {
+ for y in 0..Tile::HEIGHT as usize {
+ seg_acc[y] = scanned_winding + wg_acc[tid][y];
+ }
+ }
+ part_acc_seg.push(seg_acc);
+ } else {
+ part_acc_seg.push(wg_acc[tid]);
+ }
+
+ if tile_start_valid && tile_start_id != 0 {
+ let mut p_loc = [[0f32; Tile::HEIGHT as usize]; Tile::WIDTH as usize];
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ p_loc[x][y] += wg_loc[tid][x][y] - wg_loc[pmt.tile_start_id() as usize - 1][x][y];
+ }
+ }
+ part_loc.push(p_loc);
+ } else {
+ part_loc.push(wg_loc[tid]);
+ }
+ let p_ind = PartitionIndicator {
+ has_seg_start: seg_start_valid,
+ has_tile_start: tile_start_valid,
+ };
+ part_indicator.push(p_ind);
+ }
+ } else {
+ let stitch = StitchIndicator {
+ loc_stitch_required: false,
+ acc_stitch_required: false,
+ partition_id: 0,
+ };
+ stitch_loc.push([[0f32; 4]; 4]);
+ stitch_indicator.push(stitch);
+ }
+ }
+ }
+ }
+
+    // Stitching is not done in a grid-blocked manner; it iterates over all tiles directly.
+ for gid in 0..pmt_count {
+ let s_ind = stitch_indicator[gid];
+ if s_ind.loc_stitch_required && !s_ind.acc_stitch_required {
+ let mut loc = stitch_loc[gid];
+ let mut lookback_id = s_ind.partition_id - 1;
+            let mut part_ind: PartitionIndicator;
+ loop {
+ // Sum the prev
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ loc[x][y] += part_loc[lookback_id][x][y];
+ }
+ }
+
+ // Did we hit a tile_start?
+ part_ind = part_indicator[lookback_id];
+ if part_ind.has_tile_start {
+ break;
+ } else {
+ lookback_id -= 1;
+ }
+ }
+
+            // Once we hit a tile start, we then have to stitch the accumulator,
+            // since this partition is guaranteed to contain the tile start.
+ let mut acc = part_acc_mine[lookback_id];
+
+ // If this partition also had a seg start, we're done, otherwise we need to continue
+ // the traversal
+ if !part_ind.has_seg_start {
+ lookback_id -= 1;
+ loop {
+ // Sum the prev
+ for y in 0..Tile::HEIGHT as usize {
+ acc[y] += part_acc_seg[lookback_id][y];
+ }
+
+ // Did we hit a seg_start?
+ part_ind = part_indicator[lookback_id];
+ if part_ind.has_seg_start {
+ break;
+ } else {
+ lookback_id -= 1;
+ }
+ }
+ }
+
+ //Push out
+ temp_acc[gid] = acc;
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ temp_fine[gid][x][y] = loc[x][y] + acc[y];
+ }
+ }
+ }
+
+ if s_ind.acc_stitch_required && !s_ind.loc_stitch_required {
+ let mut acc = [0f32; Tile::HEIGHT as usize];
+ let mut lookback_id = s_ind.partition_id - 1;
+ loop {
+ let s_acc = part_acc_seg[lookback_id];
+ for y in 0..Tile::HEIGHT as usize {
+ acc[y] += s_acc[y];
+ }
+ if part_indicator[lookback_id].has_seg_start {
+ break;
+ } else {
+ lookback_id -= 1;
+ }
+ }
+
+ // Push out
+ let loc = stitch_loc[gid];
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ temp_fine[gid][x][y] = loc[x][y] + acc[y];
+ }
+ }
+ }
+ }
+
+ for gid in 0..(pmt_count - 1) {
+ let pmt = pmt_buf[gid];
+ if pmt.is_end_tile() {
+ winding_fine_comp.push(temp_fine[gid]);
+ winding_acc_comp.push(temp_acc[gid + 1]);
+ winding_coarse_comp.push(pmt_buf[gid + 1].scanned_winding);
+
+ match fill_rule {
+ Fill::NonZero => {
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ let area = temp_fine[gid][x][y];
+ let coverage = area.abs();
+ let mulled = 0.5 + coverage * 255.0;
+ temp_fine[gid][x][y] = mulled.min(255.0);
+ }
+ }
+ }
+ Fill::EvenOdd => {
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ let area = temp_fine[gid][x][y];
+ let im1 = (0.5 + area * 0.5).floor();
+ let coverage = (im1 + area * -2.0).abs();
+ let mulled = 0.5 + coverage * 255.0;
+ temp_fine[gid][x][y] = mulled.min(255.0);
+ }
+ }
+ }
+ };
+
+ let mut u8_vals = [0u8; 16];
+ let mut i = 0;
+ for x in 0..Tile::WIDTH as usize {
+ for y in 0..Tile::HEIGHT as usize {
+ u8_vals[i] = temp_fine[gid][x][y].round() as u8;
+ i += 1;
+ }
+ }
+
+ // this will be indexed into
+ alpha_buf.extend_from_slice(&u8_vals);
+ }
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
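+/// Compares reference winding buffers against recomputed ones, printing details for any mismatch; returns `true` only if all three comparisons pass.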
+pub fn compare_windings(
+ winding_fine_ref: &[[[f32; 4]; 4]],
+ winding_coarse_ref: &[i32],
+ winding_acc_ref: &[[f32; 4]],
+ winding_fine_comp: &[[[f32; 4]; 4]],
+ winding_coarse_comp: &[i32],
+ winding_acc_comp: &[[f32; 4]],
+) -> bool {
+ let coarse_match = compare_coarse_windings(winding_coarse_ref, winding_coarse_comp);
+ if !coarse_match {
+ println!("\nComparison halted due to coarse winding mismatch.");
+ return false;
+ }
+ let fine_match = compare_fine_windings(winding_fine_ref, winding_fine_comp);
+ if !fine_match {
+ println!("\nComparison failed at fine winding stage.");
+ return false;
+ }
+ let acc_match = compare_acc_windings(winding_acc_ref, winding_acc_comp);
+
+ acc_match && coarse_match && fine_match
+}
+
+fn compare_coarse_windings(reference: &[i32], comp: &[i32]) -> bool {
+ if reference.len() != comp.len() {
+ println!(
+ "❌ FATAL ERROR: Coarse winding vectors have different lengths! Reference: {}, Comp: {}",
+ reference.len(),
+ comp.len()
+ );
+ return false;
+ }
+ let mut mismatches = 0;
+ for (i, (ref_val, comp_val)) in reference.iter().zip(comp.iter()).enumerate() {
+ if ref_val != comp_val {
+ if mismatches == 0 {
+ println!("\n--- Coarse Winding Mismatches Found ---");
+ }
+ println!(
+ "Mismatch at tile index {}: Reference = {}, Comp = {}",
+ i, ref_val, comp_val
+ );
+ mismatches += 1;
+ }
+ }
+ if mismatches == 0 {
+ true
+ } else {
+ println!(
+ "❌ Coarse winding comparison FAILED. Found {} mismatches out of {} tiles.",
+ mismatches,
+ reference.len()
+ );
+ false
+ }
+}
+
+fn compare_fine_windings(reference: &[[[f32; 4]; 4]], comp: &[[[f32; 4]; 4]]) -> bool {
+ if reference.len() != comp.len() {
+ println!(
+ "❌ FATAL ERROR: Fine winding vectors have different lengths! Reference: {}, Comp: {}",
+ reference.len(),
+ comp.len()
+ );
+ return false;
+ }
+
+ let mut total_mismatches = 0;
+ let mut first_mismatch_tile_index = None;
+
+ for i in 0..reference.len() {
+ let ref_tile = reference[i];
+ let comp_tile = comp[i];
+ let mut tile_has_mismatch = false;
+ for x in 0..4 {
+ for y in 0..4 {
+ let ref_val = ref_tile[x][y];
+ let comp_val = comp_tile[x][y];
+ if (ref_val - comp_val).abs() > 0.01 {
+ total_mismatches += 1;
+ tile_has_mismatch = true;
+ }
+ }
+ }
+ if tile_has_mismatch && first_mismatch_tile_index.is_none() {
+ first_mismatch_tile_index = Some(i);
+ }
+ }
+
+ if total_mismatches == 0 {
+ true
+ } else {
+ println!(
+ "❌ Fine winding comparison FAILED. Found {} total mismatches.",
+ total_mismatches
+ );
+ if let Some(i) = first_mismatch_tile_index {
+ println!(
+ "\n--- Detailed Mismatch Report for First Failing Tile (Index {}) ---",
+ i
+ );
+ println!("Column-major format: pixel (x, y)");
+ println!("{:<45} {:<45}", "Reference (cpu_merge_ref)", "Comp");
+ println!("{:-<45} {:-<45}", "-", "-");
+
+ let ref_tile = reference[i];
+ let comp_tile = comp[i];
+
+ for y in 0..4 {
+ let mut ref_row_str = String::new();
+ let mut comp_row_str = String::new();
+ for x in 0..4 {
+ ref_row_str.push_str(&format!("({x},{y})={:<+8.4} ", ref_tile[x][y]));
+ comp_row_str.push_str(&format!("({x},{y})={:<+8.4} ", comp_tile[x][y]));
+ }
+ println!("{} {}", ref_row_str.trim_end(), comp_row_str.trim_end());
+ }
+ }
+ false
+ }
+}
+
+fn compare_acc_windings(reference: &[[f32; 4]], comp: &[[f32; 4]]) -> bool {
+ if reference.len() != comp.len() {
+ println!(
+ "❌ FATAL ERROR: Accumulator winding vectors have different lengths! Reference: {}, Comp: {}",
+ reference.len(),
+ comp.len()
+ );
+ return false;
+ }
+ let mut mismatches = 0;
+ for i in 0..reference.len() {
+ let ref_tile_acc = reference[i];
+ let comp_tile_acc = comp[i];
+ for y in 0..4 {
+ if (ref_tile_acc[y] - comp_tile_acc[y]).abs() > 0.01 {
+ if mismatches == 0 {
+ println!("\n--- Accumulator Winding Mismatches Found ---");
+ }
+ println!(
+ "Mismatch at tile index {} row {}: Reference = {:.4}, Comp = {:.4}",
+ i, y, ref_tile_acc[y], comp_tile_acc[y]
+ );
+ mismatches += 1;
+ }
+ }
+ }
+ if mismatches == 0 {
+ true
+ } else {
+ println!(
+ "❌ Accumulator winding comparison FAILED. Found {} mismatches.",
+ mismatches,
+ );
+ false
}
}
diff --git a/sparse_strips/vello_common/src/strip_generator.rs b/sparse_strips/vello_common/src/strip_generator.rs
index e7bae82..2198c98 100644
--- a/sparse_strips/vello_common/src/strip_generator.rs
+++ b/sparse_strips/vello_common/src/strip_generator.rs
@@ -7,7 +7,7 @@
use crate::flatten::{FlattenCtx, Line};
use crate::kurbo::{Affine, PathEl, Stroke};
use crate::peniko::Fill;
-use crate::strip::Strip;
+use crate::strip::{PreMergeTile, Strip};
use crate::tile::Tiles;
use crate::{flatten, strip};
use alloc::vec::Vec;
@@ -20,6 +20,9 @@
pub strips: Vec<Strip>,
/// The alphas in the storage.
pub alphas: Vec<u8>,
+    /// The pre-merge tiles in the storage.
+ pub pre_merge_tiles: Vec<PreMergeTile>,
+
generation_mode: GenerationMode,
}
@@ -141,6 +144,7 @@
self.level,
&self.tiles,
&mut strip_storage.strips,
+ &mut strip_storage.pre_merge_tiles,
&mut strip_storage.alphas,
fill_rule,
aliasing_threshold,
diff --git a/sparse_strips/vello_cpu/src/render.rs b/sparse_strips/vello_cpu/src/render.rs
index ebcb5ef..e5e497b 100644
--- a/sparse_strips/vello_cpu/src/render.rs
+++ b/sparse_strips/vello_cpu/src/render.rs
@@ -27,7 +27,7 @@
use vello_common::peniko::{BlendMode, Compose, Fill, Mix};
use vello_common::pixmap::Pixmap;
use vello_common::recording::{PushLayerCommand, Recordable, Recorder, Recording, RenderCommand};
-use vello_common::strip::Strip;
+use vello_common::strip::{PreMergeTile, Strip};
use vello_common::strip_generator::{GenerationMode, StripGenerator, StripStorage};
#[cfg(feature = "text")]
use vello_common::{
@@ -691,8 +691,8 @@
}
fn execute_recording(&mut self, recording: &Recording) {
- let (cached_strips, cached_alphas) = recording.get_cached_strips();
- let adjusted_strips = self.prepare_cached_strips(cached_strips, cached_alphas);
+ let (cached_strips, cached_pmt, cached_alphas) = recording.get_cached_strips();
+ let adjusted_strips = self.prepare_cached_strips(cached_strips, cached_pmt, cached_alphas);
// Use pre-calculated strip start indices from when we generated the cache.
let strip_start_indices = recording.get_strip_start_indices();
@@ -910,18 +910,23 @@
fn prepare_cached_strips(
&mut self,
cached_strips: &[Strip],
+ cached_pmt: &[PreMergeTile],
cached_alphas: &[u8],
) -> Vec<Strip> {
// Calculate offset for alpha indices based on current dispatcher's alpha buffer size.
let alpha_offset = {
let storage = self.dispatcher.strip_storage_mut();
let offset = storage.alphas.len() as u32;
- // Extend the dispatcher's alpha buffer with cached alphas.
- storage.alphas.extend(cached_alphas);
+ storage.alphas.extend_from_slice(cached_alphas);
+ storage.pre_merge_tiles.extend(cached_pmt.iter().map(|pmt| {
+ let mut adjusted_pmt = *pmt;
+ adjusted_pmt.alpha_index += offset;
+ adjusted_pmt
+ }));
offset
};
- // Create adjusted strips with corrected alpha indices.
+
cached_strips
.iter()
.map(move |strip| {
diff --git a/sparse_strips/vello_hybrid/examples/winit/Cargo.toml b/sparse_strips/vello_hybrid/examples/winit/Cargo.toml
index 23e8426..20a0dcd 100644
--- a/sparse_strips/vello_hybrid/examples/winit/Cargo.toml
+++ b/sparse_strips/vello_hybrid/examples/winit/Cargo.toml
@@ -16,3 +16,5 @@
vello_hybrid = { workspace = true }
vello_example_scenes = { workspace = true }
pollster = { workspace = true }
+futures-intrusive = "0.5"
+bytemuck = { version = "1.15", features = ["derive"] }
\ No newline at end of file
diff --git a/sparse_strips/vello_hybrid/examples/winit/src/main.rs b/sparse_strips/vello_hybrid/examples/winit/src/main.rs
index 5d429ff..8843a8c 100644
--- a/sparse_strips/vello_hybrid/examples/winit/src/main.rs
+++ b/sparse_strips/vello_hybrid/examples/winit/src/main.rs
@@ -22,6 +22,8 @@
window::{Window, WindowId},
};
+use futures_intrusive::channel::shared::oneshot_channel;
+
const ZOOM_STEP: f64 = 0.1;
struct App<'s> {
@@ -306,9 +308,58 @@
)
.unwrap();
+ let renderer = self.renderers[surface.dev_id].as_mut().unwrap();
+ let source_buffer = renderer.get_stitch_indicator_buffer();
+ let buffer_size = source_buffer.size();
+
+ let staging_buffer = device_handle.device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("Stitch Indicator Readback Buffer"),
+ size: buffer_size,
+ usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
+ mapped_at_creation: false,
+ });
+
+ encoder.copy_buffer_to_buffer(
+ source_buffer,
+ 0, // source offset
+ &staging_buffer,
+ 0, // destination offset
+ buffer_size,
+ );
+
device_handle.queue.submit([encoder.finish()]);
surface_texture.present();
+ let buffer_slice = staging_buffer.slice(..);
+ let (sender, receiver) = oneshot_channel();
+ buffer_slice.map_async(wgpu::MapMode::Read, move |result| {
+ sender.send(result).unwrap();
+ });
+
+ let _ = device_handle.device.poll(wgpu::PollType::Wait);
+
+ // if let Some(Ok(())) = pollster::block_on(receiver.receive()) {
+ // let data = buffer_slice.get_mapped_range();
+ // let u32_data: &[u32] = bytemuck::cast_slice(&data);
+ // let num_to_print = u32_data.len().min(256);
+ // println!("--- Readback Buffer (First {} u32s) ---", num_to_print);
+ // for (i, val) in u32_data.iter().take(num_to_print).enumerate() {
+ // println!("{}: {:<10} ", i, val)
+ // }
+ // println!("\n--- End of Readback ---");
+ // }
+
+ // if let Some(Ok(())) = pollster::block_on(receiver.receive()) {
+ // let data = buffer_slice.get_mapped_range();
+ // let f32_data: &[f32] = bytemuck::cast_slice(&data);
+ // let num_to_print = f32_data.len().min(128);
+ // println!("--- Readback Buffer (First {} f32s) ---", num_to_print);
+ // for (i, val) in f32_data.iter().take(num_to_print).enumerate() {
+ // println!("{}: {:<10.4} ", i, val);
+ // }
+ // println!("\n--- End of Readback ---");
+ // }
+
device_handle.device.poll(wgpu::PollType::Poll).unwrap();
}
_ => {}
diff --git a/sparse_strips/vello_hybrid/src/lib.rs b/sparse_strips/vello_hybrid/src/lib.rs
index 1865cc4..87c1e27 100644
--- a/sparse_strips/vello_hybrid/src/lib.rs
+++ b/sparse_strips/vello_hybrid/src/lib.rs
@@ -29,7 +29,7 @@
//!
//! See the individual module documentation for more details on usage and implementation.
-#![no_std]
+//#![no_std]
extern crate alloc;
diff --git a/sparse_strips/vello_hybrid/src/render/common.rs b/sparse_strips/vello_hybrid/src/render/common.rs
index 253445d..bbe9aa0 100644
--- a/sparse_strips/vello_hybrid/src/render/common.rs
+++ b/sparse_strips/vello_hybrid/src/render/common.rs
@@ -44,6 +44,20 @@
pub alphas_tex_width_bits: u32,
}
+/// Configuration for the Merge and Stitch compute shaders
+#[repr(C)]
+#[derive(Debug, Copy, Clone, Pod, Zeroable)]
+pub struct ComputeConfig {
+    /// Pre-merge tile count; 1:1 with tiles.
+ pub pmt_count: u32,
+    /// The total number of writing ("end") tiles: `alpha_buffer.len() / (Tile::WIDTH * Tile::HEIGHT)`.
+ pub end_tile_count: u32,
+ /// Placeholder
+ pub c: u32,
+ /// Placeholder
+ pub d: u32,
+}
+
/// Represents a GPU strip for rendering.
///
/// This struct corresponds to the `StripInstance` struct in the shader.
diff --git a/sparse_strips/vello_hybrid/src/render/wgpu.rs b/sparse_strips/vello_hybrid/src/render/wgpu.rs
index a862776..51756e4 100644
--- a/sparse_strips/vello_hybrid/src/render/wgpu.rs
+++ b/sparse_strips/vello_hybrid/src/render/wgpu.rs
@@ -22,6 +22,7 @@
use alloc::{sync::Arc, vec};
use core::{fmt::Debug, mem, num::NonZeroU64};
use wgpu::Extent3d;
+use std::println;
use crate::AtlasConfig;
use crate::multi_atlas::AtlasId;
@@ -32,7 +33,7 @@
render::{
Config,
common::{
- GPU_ENCODED_IMAGE_SIZE_TEXELS, GPU_LINEAR_GRADIENT_SIZE_TEXELS,
+ ComputeConfig, GPU_ENCODED_IMAGE_SIZE_TEXELS, GPU_LINEAR_GRADIENT_SIZE_TEXELS,
GPU_RADIAL_GRADIENT_SIZE_TEXELS, GPU_SWEEP_GRADIENT_SIZE_TEXELS, GpuEncodedImage,
GpuEncodedPaint, GpuLinearGradient, GpuRadialGradient, GpuSweepGradient,
pack_image_offset, pack_image_params, pack_image_size, pack_radial_kind_and_swapped,
@@ -50,6 +51,7 @@
paint::ImageSource,
peniko,
pixmap::Pixmap,
+ strip::PreMergeTile,
tile::Tile,
};
use wgpu::{
@@ -99,6 +101,11 @@
Self::new_with(device, render_target_config, RenderSettings::default())
}
+    /// Returns the stitch-indicator buffer, exposed so examples can read it back for debugging.
+ pub fn get_stitch_indicator_buffer(&self) -> &wgpu::Buffer {
+ &self.programs.resources.stitch_indicator_buffer
+ }
+
/// Creates a new renderer with specific settings.
pub fn new_with(
device: &Device,
@@ -146,12 +153,41 @@
self.programs.prepare(
device,
queue,
+ &scene.strip_storage.pre_merge_tiles,
&mut self.gradient_cache,
&self.encoded_paints,
&scene.strip_storage.alphas,
render_size,
&self.paint_idxs,
);
+
+ const BLOCK_DIM: u32 = 256;
+ {
+ let tile_count = scene.strip_storage.pre_merge_tiles.len() as u32;
+ println!("PMT Size {}\n", tile_count);
+ println!("Alpha Size {}\n", scene.strip_storage.alphas.len());
+ let workgroup_x = (tile_count + BLOCK_DIM - 1) / BLOCK_DIM;
+
+ let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+ label: Some("Merge Compute Pass"),
+ timestamp_writes: None,
+ });
+ compute_pass.set_pipeline(&self.programs.merge_pipeline);
+ compute_pass.set_bind_group(0, &self.programs.resources.merge_bind_group, &[]);
+ compute_pass.dispatch_workgroups(workgroup_x, 1, 1);
+ }
+ {
+ let end_tile_count = (scene.strip_storage.alphas.len() / 16) as u32;
+ let workgroup_x = (end_tile_count + BLOCK_DIM - 1) / BLOCK_DIM;
+ let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+ label: Some("Stitch Compute Pass"),
+ timestamp_writes: None,
+ });
+ compute_pass.set_pipeline(&self.programs.stitch_pipeline);
+ compute_pass.set_bind_group(0, &self.programs.resources.merge_bind_group, &[]);
+ compute_pass.dispatch_workgroups(workgroup_x, 1, 1);
+ }
+
let mut junk = RendererContext {
programs: &mut self.programs,
device,
@@ -447,10 +483,19 @@
gradient_bind_group_layout: BindGroupLayout,
/// Bind group layout for atlas textures
atlas_bind_group_layout: BindGroupLayout,
+ /// Bind group layout for merge and stitch shaders
+ merge_bind_group_layout: BindGroupLayout,
+
/// Pipeline for clearing slots in slot textures.
clear_pipeline: RenderPipeline,
/// Pipeline for clearing atlas regions.
atlas_clear_pipeline: RenderPipeline,
+
+ /// Merge compute pipeline
+ merge_pipeline: wgpu::ComputePipeline,
+ /// Stitch compute pipeline
+ stitch_pipeline: wgpu::ComputePipeline,
+
/// GPU resources for rendering (created during prepare)
resources: GpuResources,
/// Dimensions of the rendering target
@@ -487,6 +532,16 @@
view_config_buffer: Buffer,
/// Config buffer for rendering wide tile commands into a slot texture.
slot_config_buffer: Buffer,
+ /// Config buffer for Stitch and Merge compute shaders.
+ compute_config_buffer: Buffer,
+
+ // Merge and stitch
+ premerge_tile_buffer: Buffer,
+ stitch_indicator_buffer: Buffer,
+ stitch_loc_buffer: Buffer,
+ part_indicator_buffer: Buffer,
+ part_acc_buffer: Buffer,
+ part_loc_buffer: Buffer,
/// Buffer for slot indices used in `clear_slots`
clear_slot_indices_buffer: Buffer,
@@ -497,6 +552,8 @@
/// Bind group for clear slots operation
clear_bind_group: BindGroup,
+
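+    /// Bind group for the merge and stitch compute passes.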
+ merge_bind_group: BindGroup,
}
const SIZE_OF_CONFIG: NonZeroU64 = NonZeroU64::new(size_of::<Config>() as u64).unwrap();
@@ -633,6 +690,93 @@
}],
});
+ let merge_bind_group_layout =
+ device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+ label: Some("Merge/Stitch Bind Group Layout"),
+ entries: &[
+ wgpu::BindGroupLayoutEntry {
+ binding: 0, // config
+ visibility: wgpu::ShaderStages::COMPUTE,
+ ty: wgpu::BindingType::Buffer {
+ ty: wgpu::BufferBindingType::Uniform,
+ has_dynamic_offset: false,
+ min_binding_size: None,
+ },
+ count: None,
+ },
+ wgpu::BindGroupLayoutEntry {
+ binding: 1, // pmt_in
+ visibility: wgpu::ShaderStages::COMPUTE,
+ ty: wgpu::BindingType::Buffer {
+ ty: wgpu::BufferBindingType::Storage { read_only: true },
+ has_dynamic_offset: false,
+ min_binding_size: None,
+ },
+ count: None,
+ },
+ wgpu::BindGroupLayoutEntry {
+ binding: 2, // stitch_indicator
+ visibility: wgpu::ShaderStages::COMPUTE,
+ ty: wgpu::BindingType::Buffer {
+ ty: wgpu::BufferBindingType::Storage { read_only: false },
+ has_dynamic_offset: false,
+ min_binding_size: None,
+ },
+ count: None,
+ },
+ wgpu::BindGroupLayoutEntry {
+ binding: 3, // part_indicator
+ visibility: wgpu::ShaderStages::COMPUTE,
+ ty: wgpu::BindingType::Buffer {
+ ty: wgpu::BufferBindingType::Storage { read_only: false },
+ has_dynamic_offset: false,
+ min_binding_size: None,
+ },
+ count: None,
+ },
+ wgpu::BindGroupLayoutEntry {
+ binding: 4, // stitch_loc
+ visibility: wgpu::ShaderStages::COMPUTE,
+ ty: wgpu::BindingType::Buffer {
+ ty: wgpu::BufferBindingType::Storage { read_only: false },
+ has_dynamic_offset: false,
+ min_binding_size: None,
+ },
+ count: None,
+ },
+ wgpu::BindGroupLayoutEntry {
+ binding: 5, // part_acc
+ visibility: wgpu::ShaderStages::COMPUTE,
+ ty: wgpu::BindingType::Buffer {
+ ty: wgpu::BufferBindingType::Storage { read_only: false },
+ has_dynamic_offset: false,
+ min_binding_size: None,
+ },
+ count: None,
+ },
+ wgpu::BindGroupLayoutEntry {
+ binding: 6, // part_loc
+ visibility: wgpu::ShaderStages::COMPUTE,
+ ty: wgpu::BindingType::Buffer {
+ ty: wgpu::BufferBindingType::Storage { read_only: false },
+ has_dynamic_offset: false,
+ min_binding_size: None,
+ },
+ count: None,
+ },
+ wgpu::BindGroupLayoutEntry {
+ binding: 7, // output (alphas texture)
+ visibility: wgpu::ShaderStages::COMPUTE,
+ ty: wgpu::BindingType::StorageTexture {
+ access: wgpu::StorageTextureAccess::WriteOnly,
+ format: wgpu::TextureFormat::Rgba32Uint,
+ view_dimension: wgpu::TextureViewDimension::D2,
+ },
+ count: None,
+ },
+ ],
+ });
+
let strip_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some("Strip Shader"),
source: wgpu::ShaderSource::Wgsl(vello_sparse_shaders::wgsl::RENDER_STRIPS.into()),
@@ -643,6 +787,9 @@
source: wgpu::ShaderSource::Wgsl(vello_sparse_shaders::wgsl::CLEAR_SLOTS.into()),
});
+ let merge_shader =
+ device.create_shader_module(wgpu::include_wgsl!("../shaders/merge.wgsl"));
+
let strip_pipeline_layout =
device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: Some("Strip Pipeline Layout"),
@@ -770,6 +917,31 @@
cache: None,
});
+ let merge_pipeline_layout =
+ device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+ label: Some("Merge/Stitch Pipeline Layout"),
+ bind_group_layouts: &[&merge_bind_group_layout],
+ push_constant_ranges: &[],
+ });
+
+ let merge_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+ label: Some("Merge Pipeline"),
+ layout: Some(&merge_pipeline_layout),
+ module: &merge_shader,
+ entry_point: Some("merge"),
+ compilation_options: Default::default(),
+ cache: None,
+ });
+
+ let stitch_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+ label: Some("Stitch Pipeline"),
+ layout: Some(&merge_pipeline_layout),
+ module: &merge_shader,
+ entry_point: Some("stitch"),
+ compilation_options: Default::default(),
+ cache: None,
+ });
+
let slot_texture_views: [TextureView; 2] = core::array::from_fn(|_| {
device
.create_texture(&wgpu::TextureDescriptor {
@@ -896,7 +1068,54 @@
&slot_texture_views,
);
- let resources = GpuResources {
+ let compute_config_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+ label: Some("Compute Config Buffer"),
+ contents: bytemuck::bytes_of(&ComputeConfig {
+ pmt_count: 0,
+ end_tile_count: 0,
+ c: 0,
+ d: 0,
+ }),
+ usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+ });
+ let premerge_tile_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("Pre-merge Tile Buffer"),
+ size: 128,
+ usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
+ mapped_at_creation: false,
+ });
+ let stitch_indicator_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("Stitch Indicator Buffer"),
+ size: 128,
+ usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+ mapped_at_creation: false,
+ });
+ let stitch_loc_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("Stitch Location Buffer"),
+ size: 128,
+ usage: wgpu::BufferUsages::STORAGE,
+ mapped_at_creation: false,
+ });
+ let part_indicator_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("Partition Indicator Buffer"),
+ size: 128,
+ usage: wgpu::BufferUsages::STORAGE,
+ mapped_at_creation: false,
+ });
+ let part_acc_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("Partition Accumulator Buffer"),
+ size: 128,
+ usage: wgpu::BufferUsages::STORAGE,
+ mapped_at_creation: false,
+ });
+ let part_loc_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("Partition Location Buffer"),
+ size: 128,
+ usage: wgpu::BufferUsages::STORAGE,
+ mapped_at_creation: false,
+ });
+
+ let mut resources = GpuResources {
strips_buffer: Self::create_strips_buffer(device, 0),
clear_slot_indices_buffer,
slot_texture_views,
@@ -912,13 +1131,35 @@
gradient_texture,
gradient_bind_group,
view_config_buffer,
+ compute_config_buffer,
+ premerge_tile_buffer,
+ stitch_indicator_buffer,
+ stitch_loc_buffer,
+ part_indicator_buffer,
+ part_acc_buffer,
+ part_loc_buffer,
+ merge_bind_group: Self::create_placeholder_bind_group(device, &merge_bind_group_layout),
};
+ resources.merge_bind_group = Self::create_merge_bind_group(
+ device,
+ &merge_bind_group_layout,
+ &resources.compute_config_buffer,
+ &resources.premerge_tile_buffer,
+ &resources.stitch_indicator_buffer,
+ &resources.part_indicator_buffer,
+ &resources.stitch_loc_buffer,
+ &resources.part_acc_buffer,
+ &resources.part_loc_buffer,
+ &resources.alphas_texture,
+ );
+
Self {
strip_pipeline,
strip_bind_group_layout,
encoded_paints_bind_group_layout,
gradient_bind_group_layout,
+ merge_bind_group_layout,
atlas_bind_group_layout,
resources,
alpha_data,
@@ -929,9 +1170,110 @@
},
clear_pipeline,
atlas_clear_pipeline,
+ merge_pipeline,
+ stitch_pipeline,
}
}
+    fn create_placeholder_bind_group(
+        device: &wgpu::Device,
+        layout: &wgpu::BindGroupLayout,
+    ) -> wgpu::BindGroup {
+        // Create dummy resources that match the binding types
+        let dummy_uniform_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Dummy Uniform"),
+            size: 16,
+            usage: wgpu::BufferUsages::UNIFORM,
+            mapped_at_creation: false,
+        });
+        let dummy_storage_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Dummy Storage"),
+            size: 16,
+            usage: wgpu::BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
+        let dummy_texture = device.create_texture(&wgpu::TextureDescriptor {
+            label: Some("Dummy Texture"),
+            size: wgpu::Extent3d { width: 1, height: 1, depth_or_array_layers: 1 },
+            mip_level_count: 1,
+            sample_count: 1,
+            dimension: wgpu::TextureDimension::D2,
+            format: wgpu::TextureFormat::Rgba32Uint,
+            usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::STORAGE_BINDING,
+            view_formats: &[],
+        });
+        let dummy_texture_view = dummy_texture.create_view(&Default::default());
+
+        device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Placeholder Merge Bind Group"),
+            layout,
+            entries: &[
+                wgpu::BindGroupEntry { binding: 0, resource: dummy_uniform_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 1, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 2, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 3, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 4, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 5, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 6, resource: dummy_storage_buffer.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 7, resource: wgpu::BindingResource::TextureView(&dummy_texture_view) },
+            ],
+        })
+    }
+
+ fn create_merge_bind_group(
+ device: &wgpu::Device,
+ layout: &wgpu::BindGroupLayout,
+ config: &wgpu::Buffer,
+ pmt: &wgpu::Buffer,
+ stitch_indicator: &wgpu::Buffer,
+ part_indicator: &wgpu::Buffer,
+ stitch_loc: &wgpu::Buffer,
+ part_acc: &wgpu::Buffer,
+ part_loc: &wgpu::Buffer,
+ alphas: &wgpu::Texture,
+ ) -> wgpu::BindGroup {
+ device.create_bind_group(&wgpu::BindGroupDescriptor {
+ label: Some("Merge/Stitch Bind Group"),
+ layout,
+ entries: &[
+ wgpu::BindGroupEntry {
+ binding: 0,
+ resource: config.as_entire_binding(),
+ },
+ wgpu::BindGroupEntry {
+ binding: 1,
+ resource: pmt.as_entire_binding(),
+ },
+ wgpu::BindGroupEntry {
+ binding: 2,
+ resource: stitch_indicator.as_entire_binding(),
+ },
+ wgpu::BindGroupEntry {
+ binding: 3,
+ resource: part_indicator.as_entire_binding(),
+ },
+ wgpu::BindGroupEntry {
+ binding: 4,
+ resource: stitch_loc.as_entire_binding(),
+ },
+ wgpu::BindGroupEntry {
+ binding: 5,
+ resource: part_acc.as_entire_binding(),
+ },
+ wgpu::BindGroupEntry {
+ binding: 6,
+ resource: part_loc.as_entire_binding(),
+ },
+ wgpu::BindGroupEntry {
+ binding: 7,
+ resource: wgpu::BindingResource::TextureView(
+ &alphas.create_view(&Default::default()),
+ ),
+ },
+ ],
+ })
+ }
+
fn create_strips_buffer(device: &Device, required_strips_size: u64) -> Buffer {
device.create_buffer(&wgpu::BufferDescriptor {
label: Some("Strips Buffer"),
@@ -979,7 +1321,9 @@
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba32Uint,
- usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
+ usage: wgpu::TextureUsages::TEXTURE_BINDING
+ | wgpu::TextureUsages::COPY_DST
+ | wgpu::TextureUsages::STORAGE_BINDING,
view_formats: &[],
})
}
@@ -1172,18 +1516,99 @@
&mut self,
device: &Device,
queue: &Queue,
+ pmt: &[PreMergeTile],
gradient_cache: &mut GradientRampCache,
encoded_paints: &[GpuEncodedPaint],
alphas: &[u8],
new_render_size: &RenderSize,
paint_idxs: &[u32],
) {
+ let mut new_bg = false;
let max_texture_dimension_2d = device.limits().max_texture_dimension_2d;
- self.maybe_resize_alphas_tex(device, max_texture_dimension_2d, alphas);
+ new_bg |= self.maybe_resize_alphas_tex(device, max_texture_dimension_2d, alphas);
self.maybe_resize_encoded_paints_tex(device, max_texture_dimension_2d, paint_idxs);
self.maybe_update_config_buffer(queue, max_texture_dimension_2d, new_render_size);
- self.upload_alpha_texture(queue, alphas);
+ new_bg |= Self::maybe_resize_buffer(
+ device,
+ &mut self.resources.premerge_tile_buffer,
+ pmt.len() as u64 * std::mem::size_of::<PreMergeTile>() as u64,
+ "PreMergeTile Buffer",
+ );
+
+ let end_tiles = (alphas.len() / 16) as u64;
+ new_bg |= Self::maybe_resize_buffer(
+ device,
+ &mut self.resources.stitch_indicator_buffer,
+ end_tiles * 4,
+ "StitchIndicator",
+ );
+
+ new_bg |= Self::maybe_resize_buffer(
+ device,
+ &mut self.resources.stitch_loc_buffer,
+ end_tiles * 64,
+ "StitchLoc",
+ );
+
+ const BLOCK_DIM: u64 = 256;
+ let num_partitions = (pmt.len() as u64 + BLOCK_DIM - 1) / BLOCK_DIM;
+ new_bg |= Self::maybe_resize_buffer(
+ device,
+ &mut self.resources.part_indicator_buffer,
+ num_partitions * 4,
+ "PartIndicator",
+ );
+ new_bg |= Self::maybe_resize_buffer(
+ device,
+ &mut self.resources.part_acc_buffer,
+ num_partitions * 32,
+ "PartAcc",
+ );
+ new_bg |= Self::maybe_resize_buffer(
+ device,
+ &mut self.resources.part_loc_buffer,
+ num_partitions * 64,
+ "PartLoc",
+ );
+
+ if new_bg {
+ self.resources.merge_bind_group = Self::create_merge_bind_group(
+ device,
+ &self.merge_bind_group_layout,
+ &self.resources.compute_config_buffer,
+ &self.resources.premerge_tile_buffer,
+ &self.resources.stitch_indicator_buffer,
+ &self.resources.part_indicator_buffer,
+ &self.resources.stitch_loc_buffer,
+ &self.resources.part_acc_buffer,
+ &self.resources.part_loc_buffer,
+ &self.resources.alphas_texture,
+ );
+ }
+
+ //TODO maybe_update_compute_config_buffer
+ queue.write_buffer(
+ &self.resources.compute_config_buffer,
+ 0,
+ bytemuck::bytes_of(&ComputeConfig {
+ pmt_count: pmt.len() as u32,
+ end_tile_count: alphas.len() as u32 / 16,
+ c: 0,
+ d: 0,
+ }),
+ );
+
+ if !pmt.is_empty() {
+ queue.write_buffer(
+ &self.resources.premerge_tile_buffer,
+ 0,
+ bytemuck::cast_slice(pmt),
+ );
+ }
+
+
+ //self.upload_alpha_texture(queue, alphas); //delete this when alpha_buff is removed
self.upload_encoded_paints_texture(queue, encoded_paints);
if gradient_cache.has_changed() {
@@ -1199,7 +1624,7 @@
device: &Device,
max_texture_dimension_2d: u32,
alphas: &[u8],
- ) {
+ ) -> bool {
let required_alpha_height = u32::try_from(alphas.len())
.unwrap()
// There are 16 1-byte alpha values per texel.
@@ -1239,7 +1664,9 @@
&self.resources.view_config_buffer,
&self.resources.slot_texture_views,
);
+            return true;
}
+ false
}
/// Update the encoded paints texture size if needed.
@@ -1345,6 +1772,26 @@
}
}
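+    /// Recreates `buffer` (preserving its usage flags) if it is smaller than `required_size`,
+    /// rounding the new size up to a multiple of 16 bytes. Returns `true` if the buffer was recreated.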
+ fn maybe_resize_buffer(
+ device: &Device,
+ buffer: &mut Buffer,
+ required_size: u64,
+ label: &'static str,
+ ) -> bool {
+ if required_size > buffer.size() {
+ let new_size = (required_size + 15) & !15;
+ *buffer = device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some(label),
+ size: new_size,
+ usage: buffer.usage(),
+ mapped_at_creation: false,
+ });
+ true
+ } else {
+ false
+ }
+ }
+
/// Resize the texture array to accommodate more atlases.
fn maybe_resize_atlas_texture_array(
device: &Device,
diff --git a/sparse_strips/vello_hybrid/src/scene.rs b/sparse_strips/vello_hybrid/src/scene.rs
index 40234d8..986aced 100644
--- a/sparse_strips/vello_hybrid/src/scene.rs
+++ b/sparse_strips/vello_hybrid/src/scene.rs
@@ -16,7 +16,7 @@
use vello_common::peniko::color::palette::css::BLACK;
use vello_common::peniko::{BlendMode, Compose, Fill, Mix};
use vello_common::recording::{PushLayerCommand, Recordable, Recorder, Recording, RenderCommand};
-use vello_common::strip::Strip;
+use vello_common::strip::{PreMergeTile, Strip};
use vello_common::strip_generator::{GenerationMode, StripGenerator, StripStorage};
use crate::AtlasConfig;
@@ -439,8 +439,8 @@
}
fn execute_recording(&mut self, recording: &Recording) {
- let (cached_strips, cached_alphas) = recording.get_cached_strips();
- let adjusted_strips = self.prepare_cached_strips(cached_strips, cached_alphas);
+ let (cached_strips, cached_pmt, cached_alphas) = recording.get_cached_strips();
+ let adjusted_strips = self.prepare_cached_strips(cached_strips, cached_pmt, cached_alphas);
// Use pre-calculated strip start indices from when we generated the cache
let strip_start_indices = recording.get_strip_start_indices();
@@ -637,13 +637,23 @@
fn prepare_cached_strips(
&mut self,
cached_strips: &[Strip],
+ cached_pmt: &[PreMergeTile],
cached_alphas: &[u8],
) -> Vec<Strip> {
// Calculate offset for alpha indices based on current buffer size.
let alpha_offset = self.strip_storage.alphas.len() as u32;
- // Extend current alpha buffer with cached alphas.
- self.strip_storage.alphas.extend(cached_alphas);
- // Create adjusted strips with corrected alpha indices
+
+ // Extend current alpha and pre-merge tile buffers with cached data.
+ self.strip_storage.alphas.extend_from_slice(cached_alphas);
+ self.strip_storage
+ .pre_merge_tiles
+ .extend(cached_pmt.iter().map(move |pmt| {
+ let mut adjusted_pmt = *pmt;
+ adjusted_pmt.alpha_index += alpha_offset;
+ adjusted_pmt
+ }));
+
+ // Create adjusted strips with corrected alpha indices.
cached_strips
.iter()
.map(move |strip| {
diff --git a/sparse_strips/vello_hybrid/src/shaders/merge.wgsl b/sparse_strips/vello_hybrid/src/shaders/merge.wgsl
new file mode 100644
index 0000000..a027467
--- /dev/null
+++ b/sparse_strips/vello_hybrid/src/shaders/merge.wgsl
@@ -0,0 +1,536 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+struct Config {
+ // Count of pre merge tiles
+ pmt_count: u32,
+ // Count of tiles which will write to the alpha texture
+ end_tile_count: u32,
+ c: u32,
+ d: u32,
+};
+
+struct PreMergeTile {
+ // The index into the alpha buffer that an "end tile" should write to.
+ alpha_index: u32,
+    // Packed flags and block-local start indices; see the bit layout below.
+ packed_info: u32,
+ // The exclusive prefix sum of the signed winding number.
+ // Uploading it saves doing the scan here
+ scanned_winding: i32,
+ padding: u32,
+ // Line points, adjusted by the tile position
+ p0: vec2f,
+ p1: vec2f,
+};
+
+const TILE_HEIGHT = 4u;
+const TILE_WIDTH = 4u;
+
+@group(0) @binding(0)
+var<uniform> config : Config;
+
+@group(0) @binding(1)
+var<storage, read> pmt_in: array<PreMergeTile>;
+
+@group(0) @binding(2)
+var<storage, read_write> stitch_indicator: array<u32>;
+
+@group(0) @binding(3)
+var<storage, read_write> part_indicator: array<u32>;
+
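+// Per end tile: partially merged areas handed off to the stitch pass when a tile
+// could not be fully resolved within a single partition.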
+@group(0) @binding(4)
+var<storage, read_write> stitch_loc: array<array<array<f32, TILE_HEIGHT>, TILE_WIDTH>>;
+
+// This buffer is double-strided because we need either:
+// 1) the reduction from the last segment start to the end of the partition, or
+// 2) the reduction from the last segment start to the last tile start.
+// These are not mutually exclusive; stitching requires both.
+@group(0) @binding(5)
+var<storage, read_write> part_acc: array<array<f32, TILE_HEIGHT>>;
+
+@group(0) @binding(6)
+var<storage, read_write> part_loc: array<array<array<f32, TILE_HEIGHT>, TILE_WIDTH>>;
+
+@group(0) @binding(7)
+var output: texture_storage_2d<rgba32uint, write>;
+
+// @group(0) @binding(7)
+// var<storage, read_write> alpha_buff: array<vec4<u32>>;
+
+// MSB LSB
+// 31------------------21|20------------------12|11-------------------3|2|1|0|
+// | Unused (11) | Seg Start ID (9) | Tile Start ID (9) |F|T|E|
+//
+// F = FILL_RULE_MASK
+// T = IS_TILE_FIRST_COL_MASK
+// E = IS_END_TILE_MASK
+const IS_END_TILE_MASK: u32 = 1u;
+const IS_TILE_FIRST_COL_MASK: u32 = 2u;
+const FILL_RULE_MASK: u32 = 4u;
+const TILE_START_ID_SHIFT: u32 = 3u;
+const SEG_START_ID_SHIFT: u32 = 12u;
+const ID_MASK: u32 = 0x1ffu;
+const INVALID_ID: u32 = 256u;
+
+fn is_end_tile(packed_info: u32) -> bool {
+ return (packed_info & IS_END_TILE_MASK) != 0u;
+}
+
+fn is_tile_first_col(packed_info: u32) -> bool {
+ return (packed_info & IS_TILE_FIRST_COL_MASK) != 0u;
+}
+
+fn is_fill_rule_non_zero(packed_info: u32) -> bool {
+ return (packed_info & FILL_RULE_MASK) != 0u;
+}
+
+fn get_tile_start_id(packed_info: u32) -> u32 {
+ return (packed_info >> TILE_START_ID_SHIFT) & ID_MASK;
+}
+
+fn get_seg_start_id(packed_info: u32) -> u32 {
+ return (packed_info >> SEG_START_ID_SHIFT) & ID_MASK;
+}
+
+// MSB LSB
+// 31|30|29--------------------------------------------------------------0|
+// |L|A| Partition ID (30 bits) |
+//
+// L = STITCH_LOC_MASK
+// A = STITCH_ACC_MASK
+const STITCH_LOC_MASK = 1u << 31u;
+const STITCH_ACC_MASK = 1u << 30u;
+const PART_ID_MASK = 0x3fffffffu;
+
+fn loc_stitch_required(in: u32) -> bool {
+ return (in & STITCH_LOC_MASK) != 0u;
+}
+
+fn acc_stitch_required(in: u32) -> bool {
+ return (in & STITCH_ACC_MASK) != 0u;
+}
+
+fn get_part_id(in: u32) -> u32 {
+ return in & PART_ID_MASK;
+}
+
+// MSB LSB
+// 31-----------------------------------------------------------------2|1|0|
+// | Unused (30 bits) |T|S|
+//
+// S = PART_SEG_START_MASK
+// T = PART_TILE_START_MASK
+const PART_SEG_START_MASK: u32 = 1u << 0u;
+const PART_TILE_START_MASK: u32 = 1u << 1u;
+fn part_has_seg_start(in: u32) -> bool {
+ return (in & PART_SEG_START_MASK) != 0u;
+}
+
+fn part_has_tile_start(in: u32) -> bool {
+ return (in & PART_TILE_START_MASK) != 0u;
+}
+
+const BLOCK_DIM = 256u;
+const LG_BLOCK_DIM = 8u;
+const SCAN_CONST = LG_BLOCK_DIM - 1u;
+const TILE_SIZE = TILE_HEIGHT * TILE_WIDTH;
+const EPSILON = 1e-6;
+
+// TODO! Do not use more than 4096
+var<workgroup> wg_acc: array<array<f32, TILE_HEIGHT>, BLOCK_DIM>;
+var<workgroup> wg_loc: array<array<array<f32, TILE_HEIGHT>, TILE_WIDTH>, BLOCK_DIM>;
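+// Workgroup scratch: per-thread column sums (wg_acc) and per-pixel area sums (wg_loc)
+// used by the cooperative scan below.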
+
+// 1 thread per pre-merge tile
+@compute @workgroup_size(BLOCK_DIM, 1, 1)
+fn merge(@builtin(local_invocation_id) tid: vec3<u32>,
+ @builtin(global_invocation_id) gid: vec3<u32>,
+ @builtin(workgroup_id) wgid: vec3<u32>) {
+ var pmt: PreMergeTile;
+ if (gid.x < config.pmt_count) {
+ pmt = pmt_in[gid.x];
+ } else {
+ pmt = PreMergeTile(
+ 0xffffffffu,
+ 0u,
+ 0,
+ 0u,
+ vec2f(0.0, 0.0),
+ vec2f(0.0, 0.0),
+ );
+ }
+
+ var acc = array<f32, TILE_HEIGHT>();
+ var loc = array<array<f32, TILE_HEIGHT>, TILE_WIDTH>();
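+    // acc[y]: running signed coverage height per pixel row; loc[x][y]: signed area
+    // contribution accumulated for pixel (x, y) of this tile.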
+    if (abs(pmt.p0.y - pmt.p1.y) >= EPSILON) { // If the line is not horizontal...
+ var line_top_y: f32;
+ var line_top_x: f32;
+ var line_bottom_y: f32;
+ var line_bottom_x: f32;
+ if (pmt.p0.y < pmt.p1.y) {
+ line_top_y = pmt.p0.y;
+ line_top_x = pmt.p0.x;
+ line_bottom_y = pmt.p1.y;
+ line_bottom_x = pmt.p1.x;
+ } else {
+ line_top_y = pmt.p1.y;
+ line_top_x = pmt.p1.x;
+ line_bottom_y = pmt.p0.y;
+ line_bottom_x = pmt.p0.x;
+ }
+
+ var line_left_x: f32;
+ var line_left_y: f32;
+ var line_right_x: f32;
+ if (pmt.p0.x < pmt.p1.x) {
+ line_left_x = pmt.p0.x;
+ line_left_y = pmt.p0.y;
+ line_right_x = pmt.p1.x;
+ } else {
+ line_left_x = pmt.p1.x;
+ line_left_y = pmt.p1.y;
+ line_right_x = pmt.p0.x;
+ }
+
+ let dx = line_bottom_x - line_top_x;
+ let dy = line_bottom_y - line_top_y;
+ let is_vertical = abs(dx) < EPSILON;
+ let y_slope = dy / dx;
+ let x_slope = dx / dy;
+
+        // The bit-level sign extraction below is unnecessary: horizontal lines are filtered
+        // out above, so a plain comparison suffices (the sign of zero does not matter).
+ //let sign = select(1.0f, -1.0f, (bitcast<u32>(pmt.p0.y - pmt.p1.y) & 0x80000000u) != 0u);
+ let sign = select(1.0f, -1.0f, pmt.p0.y < pmt.p1.y);
+
+ if (is_tile_first_col(pmt.packed_info) && line_left_x < 0.0f) {
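+            // First-column tile with the line starting left of it: fold the line's vertical
+            // extent left of x = 0 into acc as the winding carried into this tile.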
+ var ymin: f32;
+ var ymax: f32;
+ if (is_vertical) {
+ ymin = line_top_y;
+ ymax = line_bottom_y;
+ } else {
+ let line_viewport_left_y =
+ min(max((line_top_y - line_top_x * y_slope), line_top_y), line_bottom_y);
+ ymin = min(line_left_y, line_viewport_left_y);
+ ymax = max(line_left_y, line_viewport_left_y);
+ }
+
+ var h = array<f32, TILE_HEIGHT>();
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ let px_top_y = f32(y);
+ let px_bottom_y = 1.0f + px_top_y;
+ let ymin_clamped = max(ymin, px_top_y);
+ let ymax_clamped = min(ymax, px_bottom_y);
+ h[y] = max(ymax_clamped - ymin_clamped, 0.0f);
+ acc[y] = sign * h[y];
+ }
+ } else {
+            // No need to clear in WGSL (variables are zero-initialized), but other shader
+            // languages may require it.
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ acc[y] = 0.0f;
+ }
+ }
+
+ if (line_right_x >= 0.0f) {
+ if (is_vertical) {
+ let x_int = u32(floor(line_top_x));
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ var h = 0.0f;
+ var area = 0.0f;
+ if x == x_int {
+ let px_top_y = f32(y);
+ let px_bottom_y = 1.0f + px_top_y;
+ let ymin = max(px_top_y, line_top_y);
+ let ymax = min(px_bottom_y, line_bottom_y);
+ let coverage_right = f32(x) + 1.0f - line_top_x;
+ h = max(ymax - ymin, 0.0f);
+ area = h * coverage_right;
+ }
+ loc[x][y] = acc[y] + sign * area;
+ acc[y] += sign * h;
+ }
+ }
+ }
+
+ if (!is_vertical) {
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ let px_left_x = f32(x);
+ let px_right_x = 1.0f + px_left_x;
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ let px_top_y = f32(y);
+ let px_bottom_y = 1.0f + px_top_y;
+ let ymin = max(line_top_y, px_top_y);
+ let ymax = min(line_bottom_y, px_bottom_y);
+
+ let line_px_left_y =
+ min(max((line_top_y + (px_left_x - line_top_x) * y_slope), ymin), ymax);
+ let line_px_right_y =
+ min(max((line_top_y + (px_right_x - line_top_x) * y_slope), ymin), ymax);
+ let line_px_left_yx =
+ line_top_x + (line_px_left_y - line_top_y) * x_slope;
+ let line_px_right_yx =
+ line_top_x + (line_px_right_y - line_top_y) * x_slope;
+ let h = abs(line_px_right_y - line_px_left_y);
+ let area =
+ 0.5f * h * (2.0f * px_right_x - line_px_right_yx - line_px_left_yx);
+ loc[x][y] = acc[y] + sign * area;
+ acc[y] += sign * h;
+ }
+ }
+ }
+
+ wg_loc[tid.x] = loc;
+ } else {
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ wg_loc[tid.x][x] = acc;
+ }
+ }
+ wg_acc[tid.x] = acc;
+ } else {
+        // TODO: for non-WGSL targets, acc and loc need an explicit clear here.
+ wg_acc[tid.x] = acc;
+ wg_loc[tid.x] = loc;
+ }
+
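+    // Workgroup-wide inclusive prefix scan (Hillis-Steele style) of acc and loc across threads.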
+ for (var i = 1u; i <= BLOCK_DIM; i <<= 1u) {
+ workgroupBarrier();
+ let ii = tid.x - i;
+ if (tid.x >= i) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ acc[y] += wg_acc[ii][y];
+ }
+
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ loc[x][y] += wg_loc[ii][x][y];
+ }
+ }
+ }
+ workgroupBarrier();
+
+ if (tid.x >= i) {
+ wg_acc[tid.x] = acc;
+ wg_loc[tid.x] = loc;
+ }
+ }
+ workgroupBarrier();
+
+ var scanned_winding = 0.0f;
+ let end_tile = is_end_tile(pmt.packed_info);
+ let seg_start_id = get_seg_start_id(pmt.packed_info);
+ let tile_start_id = get_tile_start_id(pmt.packed_info);
+ let seg_is_valid = seg_start_id != INVALID_ID;
+ let tile_is_valid = tile_start_id != INVALID_ID;
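+    // End tiles and the last thread in the workgroup rebase their scanned values onto the
+    // most recent tile start and segment start in this partition (when present).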
+ if (end_tile || tid.x == BLOCK_DIM - 1u) {
+ if (tile_is_valid) {
+ if (tile_start_id != 0u) {
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ loc[x][y] -= wg_loc[tile_start_id - 1u][x][y];
+ }
+ }
+ acc = wg_acc[tile_start_id - 1u];
+ } else {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ acc[y] = 0.0f;
+ }
+ }
+ }
+
+ if (seg_is_valid) {
+ scanned_winding = f32(pmt_in[(wgid.x << LG_BLOCK_DIM) + seg_start_id].scanned_winding);
+ if (seg_start_id != 0u) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ acc[y] += scanned_winding - wg_acc[seg_start_id - 1u][y];
+ }
+ } else {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ acc[y] += scanned_winding;
+ }
+ }
+ }
+
+        // At this point, if this is the last thread in the workgroup, loc contains the
+        // reduction from the last tile start.
+ if (tid.x == BLOCK_DIM - 1u) {
+ part_loc[wgid.x] = loc;
+ }
+
+        // Add acc into loc for the cases we want: tile_is_valid, and tile_is_valid && seg_is_valid.
+ if (tile_is_valid) {
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ loc[x][y] += acc[y];
+ }
+ }
+ }
+
+ // Only end tiles participate in the write out
+ if (end_tile) {
+ var s_ind = 0u;
+ if (tile_is_valid && seg_is_valid) { // Safe to write out
+ var final_alphas: vec4<u32>;
+ if (is_fill_rule_non_zero(pmt.packed_info)) {
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ final_alphas[x] = pack4x8unorm(abs(vec4<f32>(loc[x][0], loc[x][1],
+ loc[x][2], loc[x][3])));
+ }
+ } else {
+ // EvenOdd fill rule logic
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ let area = vec4<f32>(loc[x][0], loc[x][1], loc[x][2], loc[x][3]);
+ let im1 = floor(area * 0.5 + 0.5);
+ let coverage = abs(area - 2.0 * im1);
+ final_alphas[x] = pack4x8unorm(coverage);
+ }
+ }
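+                // Each rgba32uint texel packs one 4x4 tile of 8-bit alphas (16 values),
+                // so alpha_index / 16 selects the tile's texel.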
+ let tex_dims = textureDimensions(output);
+ let tex_width = tex_dims.x;
+ let output_coords = vec2<u32>(
+                    (pmt.alpha_index >> 4u) % tex_width,
+                    (pmt.alpha_index >> 4u) / tex_width,
+ );
+ if (output_coords.y < tex_dims.y) {
+ textureStore(output, output_coords, final_alphas);
+ }
+ } else {
+ // TODO fill rule on stitch
+ s_ind = wgid.x |
+ select(0u, STITCH_ACC_MASK, tile_is_valid && !seg_is_valid) |
+ select(0u, STITCH_LOC_MASK, !tile_is_valid);
+ stitch_loc[pmt.alpha_index >> 4u] = loc;
+ }
+ stitch_indicator[pmt.alpha_index >> 4u] = s_ind;
+ }
+ }
+
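+    // The last thread publishes this partition's column reductions and start flags,
+    // which the stitch pass walks during its lookback.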
+    if (tid.x == BLOCK_DIM - 1u) {
+ part_acc[wgid.x << 1u] = acc;
+ var seg_acc = wg_acc[tid.x];
+ if (seg_is_valid) {
+ if (seg_start_id != 0u) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ seg_acc[y] += scanned_winding - wg_acc[seg_start_id - 1u][y];
+ }
+ } else {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ seg_acc[y] += scanned_winding;
+ }
+ }
+ }
+ part_acc[(wgid.x << 1u) + 1u] = seg_acc;
+
+ part_indicator[wgid.x] = select(0u, PART_TILE_START_MASK, tile_is_valid) |
+ select(0u, PART_SEG_START_MASK, seg_is_valid);
+ }
+}
+
+// 1 thread per end tile
+// TODO: once subgroups are available, this should be 1 subgroup per tile
+@compute @workgroup_size(BLOCK_DIM, 1, 1)
+fn stitch(@builtin(local_invocation_id) tid: vec3<u32>,
+ @builtin(global_invocation_id) gid: vec3<u32>,
+ @builtin(workgroup_id) wgid: vec3<u32>) {
+ var s_indicator = 0u;
+ if (gid.x < config.end_tile_count) {
+ s_indicator = stitch_indicator[gid.x];
+ }
+
+    // No stitching needed, or out of bounds
+ if (s_indicator == 0u) {
+ return;
+ }
+ let part_id = get_part_id(s_indicator);
+ var loc = stitch_loc[gid.x];
+ var acc = array<f32, TILE_HEIGHT>();
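+    // Resolve this tile by looking back across preceding partitions' published reductions.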
+ if (loc_stitch_required(s_indicator)) {
+ var lookback_id = part_id - 1u;
+ var part_ind: u32;
+ while (true) {
+ let p_loc = part_loc[lookback_id];
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ loc[x][y] += p_loc[x][y];
+ }
+ }
+
+ // Did we hit a tile start?
+ part_ind = part_indicator[lookback_id];
+ if (part_has_tile_start(part_ind)) {
+ break;
+ } else {
+ lookback_id -= 1u;
+ }
+ }
+
+ lookback_id <<= 1u;
+ acc = part_acc[lookback_id];
+
+ // If the tile start also included a seg start, we're done. Else we will have to traverse to
+ // the previous seg start
+ if (!part_has_seg_start(part_ind)) {
+            lookback_id -= 1u; // Move to the previous partition's seg-relative entry
+ while (true) {
+ let p_acc = part_acc[lookback_id];
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ acc[y] += p_acc[y];
+ }
+
+ if (part_has_seg_start(part_indicator[lookback_id >> 1u])) {
+ break;
+ } else {
+ lookback_id -= 2u;
+ }
+ }
+ }
+ }
+
+ if (acc_stitch_required(s_indicator)) {
+ var lookback_id = (part_id - 1u) << 1u;
+ while (true) {
+ var s_acc = part_acc[lookback_id + 1u];
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ acc[y] += s_acc[y];
+ }
+
+ if (part_has_seg_start(part_indicator[lookback_id >> 1u])) {
+ break;
+ } else {
+ lookback_id -= 2u;
+ }
+ }
+ }
+
+ // Combine acc and loc
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ for (var y = 0u; y < TILE_HEIGHT; y += 1u) {
+ loc[x][y] += acc[y];
+ }
+ }
+
+ var final_alphas: vec4<u32>;
+    if (true) { // TODO: plumb the fill rule into the stitch pass; the EvenOdd branch is unreachable until then
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ final_alphas[x] = pack4x8unorm(abs(vec4<f32>(loc[x][0], loc[x][1],
+ loc[x][2], loc[x][3])));
+ }
+ } else {
+ // EvenOdd fill rule logic
+ for (var x = 0u; x < TILE_WIDTH; x += 1u) {
+ let area = vec4<f32>(loc[x][0], loc[x][1], loc[x][2], loc[x][3]);
+ let im1 = floor(area * 0.5 + 0.5);
+ let coverage = abs(area - 2.0 * im1);
+ final_alphas[x] = pack4x8unorm(coverage);
+ }
+ }
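+    // gid.x enumerates end tiles in alpha-index order (alpha_index / 16), matching the
+    // texels addressed by the merge pass.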
+ let tex_dims = textureDimensions(output);
+ let tex_width = tex_dims.x;
+ let output_coords = vec2<u32>(
+ gid.x % tex_width,
+ gid.x / tex_width,
+ );
+ if (output_coords.y < tex_dims.y) {
+ textureStore(output, output_coords, final_alphas);
+ }
+}
diff --git a/sparse_strips/vello_toy/src/debug.rs b/sparse_strips/vello_toy/src/debug.rs
index 01732a8..f2b390e 100644
--- a/sparse_strips/vello_toy/src/debug.rs
+++ b/sparse_strips/vello_toy/src/debug.rs
@@ -33,6 +33,7 @@
let mut line_buf = vec![];
let mut tiles = Tiles::new(Level::new());
let mut strip_buf = vec![];
+ let mut pmt_buf = vec![];
let mut alpha_buf = vec![];
let mut wide = Wide::<MODE_CPU>::new(args.width, args.height);
@@ -79,6 +80,7 @@
Level::new(),
&tiles,
&mut strip_buf,
+ &mut pmt_buf,
&mut alpha_buf,
args.fill_rule,
None,