// Copyright 2024 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Merge tiles
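//
// This stage scans a merge monoid over the minitile stream to locate strip
// and column-segment boundaries, builds per-column histograms, and then
// distributes rendering work (one item per tile/column pair), accumulating
// per-column signed areas into packed 8-bit alpha values.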

// TODO: lots of rework here
struct Minitile {
    path_ix: u32,
    // tile coordinates
    x: u32,
    y: u32,
    // Line segment endpoints within the tile, packed as two 16-bit
    // fixed-point values (x in the low half-word, y in the high half-word)
    // with 13 fractional bits; see unpack_point.
    p0: u32, // packed
    p1: u32, // packed
}

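// Unpack a point from 16-bit fixed point with 13 fractional bits; for
// example, unpack_point(0x20001000u) is vec2(0.5, 1.0).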
fn unpack_point(p: u32) -> vec2f {
    let x = f32(p & 0xffffu) * (1.0 / 8192.0);
    let y = f32(p >> 16u) * (1.0 / 8192.0);
    return vec2(x, y);
}

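// Input minitiles, assumed to be grouped by (path_ix, y, x) so that strip
// and column-segment boundaries can be detected by comparing neighbors.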
@group(0) @binding(0)
var<storage> input: array<Minitile>;

struct MergeMonoid {
    // accumulated winding delta
    winding: i32,
    // number of strip starts
    n_strips: u32,
    // this probably doesn't belong in reduced
    // local index of the most recent strip start
    start: u32,
    // local index of the most recent column-segment (tile x) start
    start_x: u32,
}

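// Per-workgroup merge-monoid values from a preceding reduce/scan pass;
// entry i - 1 is used as the exclusive prefix for workgroup i.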
@group(1) @binding(1)
var<storage> reduced_mm: array<MergeMonoid>;

fn combine_merge_monoid(a: MergeMonoid, b: MergeMonoid) -> MergeMonoid {
    var c: MergeMonoid;
    c.winding = a.winding + b.winding;
    c.n_strips = a.n_strips + b.n_strips;
    c.start = max(a.start, b.start);
    // Also propagate the most recent segment start; without this, start_x
    // would be zero-initialized and lost during the scan.
    c.start_x = max(a.start_x, b.start_x);
    return c;
}

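// Winding delta for a tile: +1 when the segment's end point lies on the
// tile's y = 0 edge, -1 when its start point does.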
fn mt_delta(t: Minitile) -> i32 {
    return i32((t.p1 >> 16u) == 0u) - i32((t.p0 >> 16u) == 0u);
}

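// Per-column coverage histogram for a tile, packed one count per byte for
// the 4 columns. For example, a segment spanning x from 0.5 to 2.25 covers
// columns 0, 1, and 2 (xmin = 0, xmax = 3) and yields 0x00010101.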
fn mt_histogram(t: Minitile) -> u32 {
    let x0 = f32(t.p0 & 0xffffu) * (1.0 / 8192.0);
    let x1 = f32(t.p1 & 0xffffu) * (1.0 / 8192.0);
    let xmin = u32(floor(min(x0, x1)));
    let xmax = u32(ceil(max(x0, x1)));
    let rshift = (4u - (xmax - xmin)) * 8u;
    let lshift = xmin * 8u;
    return (0x01010101u >> rshift) << lshift;
}

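// Horizontal sum of the 4 packed byte counts; for example,
// reduce_histo(0x00010101u) is 3.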
fn reduce_histo(histo: u32) -> u32 {
    let tmp = (histo & 0xff00ffu) + ((histo >> 8u) & 0xff00ffu);
    return (tmp >> 16u) + (tmp & 0xffffu);
}

const WG_SIZE = 256u;

// inclusive scan of merge monoids within the workgroup
var<workgroup> sh_mm: array<MergeMonoid, WG_SIZE>;
// inclusive scan of column histograms, later rewritten to histograms
// relative to the containing column segment
var<workgroup> sh_histo: array<u32, WG_SIZE>;
// last tile index of each column segment, indexed by segment start
var<workgroup> sh_seg_end: array<u32, WG_SIZE>;
// inclusive prefix sum of per-tile column counts
var<workgroup> sh_inclusive_cols: array<u32, WG_SIZE>;
// per-column accumulation of signed areas
var<workgroup> sh_area: array<atomic<i32>, WG_SIZE>;
// per-row area carried across block boundaries within a column
var<workgroup> sh_carryover: array<i32, 4>;

@compute @workgroup_size(256)
fn main(
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
) {
    // scan merge monoid
    var first = false;
    var first_x = false;
    // predicate? or pad?
    let global_ix = wg_id.x * WG_SIZE + local_id.x;
    let tile = input[global_ix];
    if global_ix != 0u {
        let prev = input[global_ix - 1u];
        // A tile starts a new strip when the path or row changes, and a new
        // column segment when, additionally, the tile x changes.
        first = tile.path_ix != prev.path_ix || tile.y != prev.y;
        first_x = first || tile.x != prev.x;
    }
    let winding = mt_delta(tile);
    let n_strips = u32(first);
    let start = select(0u, local_id.x, first);
    let start_x = select(0u, local_id.x, first_x);
    var agg = MergeMonoid(winding, n_strips, start, start_x);

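    // Hillis-Steele inclusive scan of the merge monoid and the column
    // histogram across the workgroup.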
    let local_histo = mt_histogram(tile);
    var histo = local_histo;
    sh_mm[local_id.x] = agg;
    sh_histo[local_id.x] = histo;
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
        workgroupBarrier();
        if local_id.x >= 1u << i {
            let other = sh_mm[local_id.x - (1u << i)];
            agg = combine_merge_monoid(agg, other);
            histo += sh_histo[local_id.x - (1u << i)];
        }
        workgroupBarrier();
        sh_mm[local_id.x] = agg;
        sh_histo[local_id.x] = histo;
    }
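    // Apply the exclusive prefix from preceding workgroups.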
    if wg_id.x > 0u {
        let prefix = reduced_mm[wg_id.x - 1u];
        agg = combine_merge_monoid(prefix, agg);
    }
    workgroupBarrier();
    // TODO: subtract off the start-of-scanline winding number

    // This is a workaround for overflow at 256. One alternative approach is to
    // only do 255 items per workgroup.
    sh_inclusive_cols[local_id.x] = reduce_histo(local_histo) + reduce_histo(histo - local_histo);
    let seg_rel_histo = histo - select(0u, sh_histo[agg.start_x - 1u], agg.start_x > 0u);
    workgroupBarrier();
    // sh_histo now contains histograms relative to segment
    sh_histo[local_id.x] = seg_rel_histo;

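    // Record, at each segment-start slot, the index of the last tile in
    // that column segment.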
    let last_x = local_id.x == WG_SIZE - 1u || sh_mm[local_id.x + 1u].start_x != agg.start_x;
    if last_x {
        sh_seg_end[agg.start_x] = local_id.x;
    }

    let total_cols = workgroupUniformLoad(&sh_inclusive_cols[WG_SIZE - 1u]);
    // maybe don't need to fill this; consumers can only read from starts
    if agg.start_x != local_id.x {
        sh_seg_end[local_id.x] = sh_seg_end[agg.start_x];
    }
    let n_blocks = (total_cols + WG_SIZE - 1u) / WG_SIZE;
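    // Each work item is one (tile, touched column) pair; total_cols of them
    // are processed in blocks of WG_SIZE, one item per thread.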
    for (var block_ix = 0u; block_ix < n_blocks; block_ix++) {
        let ix = block_ix * WG_SIZE + local_id.x;
        // Binary search the column prefix sums for the tile whose column
        // range contains work item ix; >= maps an item that lands exactly
        // on a boundary into the following segment.
        var el_ix = 0u;
        for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
            let probe = el_ix + ((WG_SIZE / 2u) >> i);
            if ix >= sh_inclusive_cols[probe - 1u] {
                el_ix = probe;
            }
        }
        let seg_start = sh_mm[el_ix].start_x;
        let prefix_cols = select(0u, sh_inclusive_cols[seg_start - 1u], seg_start > 0u);
        let col_within_segment = ix - prefix_cols;
        // Now choose a column within the segment by walking the packed byte
        // counts; this can fail in the 256 case (byte overflow).
        let seg_end = sh_seg_end[seg_start];
        let last_histo = sh_histo[seg_end];
        var tile_within_col = col_within_segment;
        var col = 0u;
        while col < 3u {
            let hist_val = (last_histo >> (col * 8u)) & 0xffu;
            if tile_within_col >= hist_val {
                tile_within_col -= hist_val;
                col++;
            } else {
                break;
            }
        }
        // do binary search to find tile within column
        // (search is in seg_start..=seg_end)
        var lo = seg_start;
        var hi = seg_end + 1u;
        let goal = tile_within_col;
        while hi > lo + 1u {
            let mid = (lo + hi) >> 1u;
            if goal >= ((sh_histo[mid - 1u] >> (col * 8u)) & 0xffu) {
                lo = mid;
            } else {
                hi = mid;
            }
        }
        // at this point, lo should index our tile
        // TODO: predicate on ix < total_cols?
        let render_tile = input[wg_id.x * WG_SIZE + lo];
        var alphas = 0u;
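        // For each of the 4 rows, each thread accumulates its contribution
        // (currently only the carried-over value; the per-tile area is a
        // TODO) into the column leader's slot, i.e. the thread with
        // tile_within_col == 0; the leader then adds the winding term and
        // packs an 8-bit alpha. sh_carryover forwards a column's partial
        // sum across block boundaries.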
        for (var y = 0u; y < 4u; y++) {
            if tile_within_col == 0u {
                atomicStore(&sh_area[local_id.x], 0);
            }
            workgroupBarrier();
            var area_init = 0;
            if local_id.x == 0u && block_ix != 0u {
                area_init = sh_carryover[y];
            }
            let area = area_init; // TODO: compute from tile
            atomicAdd(&sh_area[local_id.x - tile_within_col], area);
            workgroupBarrier();
            if tile_within_col == 0u {
                let summed_area = atomicLoad(&sh_area[local_id.x]);
                if seg_end == WG_SIZE - 1u {
                    // TODO: only if last column
                    sh_carryover[y] = summed_area;
                }
                let winding_area = sh_mm[seg_end].winding * 256;
                let alpha_u8 = u32(min(abs(summed_area + winding_area), 255));
                alphas = (alphas >> 8u) + (alpha_u8 << 24u);
            }
        }
        if tile_within_col == 0u {
            // TODO: store alphas
        }
    }
}