First cut at bbox improvement. This works but is currently limited to 64k path segments.
diff --git a/.vscode/settings.json b/.vscode/settings.json index 883465a..6deaef7 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json
@@ -1,6 +1,7 @@ { "wgsl-analyzer.customImports": { "bbox": "${workspaceFolder}/shader/shared/bbox.wgsl", + "bbox_monoid": "${workspaceFolder}/shader/shared/bbox_monoid.wgsl", "blend": "${workspaceFolder}/shader/shared/blend.wgsl", "bump": "${workspaceFolder}/shader/shared/bump.wgsl", "clip": "${workspaceFolder}/shader/shared/clip.wgsl",
diff --git a/shader/bbox_clear.wgsl b/shader/bbox_clear.wgsl deleted file mode 100644 index fe8cceb..0000000 --- a/shader/bbox_clear.wgsl +++ /dev/null
@@ -1,31 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -#import config - -@group(0) @binding(0) -var<uniform> config: Config; - -struct PathBbox { - x0: i32, - y0: i32, - x1: i32, - y1: i32, - linewidth: f32, - trans_ix: u32, -} - -@group(0) @binding(1) -var<storage, read_write> path_bboxes: array<PathBbox>; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3<u32>, -) { - let ix = global_id.x; - if ix < config.n_path { - path_bboxes[ix].x0 = 0x7fffffff; - path_bboxes[ix].y0 = 0x7fffffff; - path_bboxes[ix].x1 = -0x80000000; - path_bboxes[ix].y1 = -0x80000000; - } -}
diff --git a/shader/bbox_fixup.wgsl b/shader/bbox_fixup.wgsl new file mode 100644 index 0000000..d336466 --- /dev/null +++ b/shader/bbox_fixup.wgsl
@@ -0,0 +1,75 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +#import config +#import pathtag +#import bbox +#import bbox_monoid + +@group(0) @binding(0) +var<uniform> config: Config; + +@group(0) @binding(1) +var<storage> tag_monoids: array<TagMonoid>; + +@group(0) @binding(2) +var<storage> bbox_reduced: array<BboxMonoid>; + +@group(0) @binding(3) +var<storage, read_write> path_bboxes: array<PathBbox>; + +let WG_SIZE = 256u; +var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>; + +fn round_down(x: f32) -> i32 { + return i32(floor(x)); +} + +fn round_up(x: f32) -> i32 { + return i32(ceil(x)); +} + +// In the configuration with <= 64k pathtags, there's only one +// workgroup here, so the distinction between global and local is +// not meaningful. But we'll probably want to #ifdef a larger +// configuration, in which we also bind a doubly reduced buffer. +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3<u32>, + @builtin(local_invocation_id) local_id: vec3<u32>, +) { + var agg: BboxMonoid; + if global_id.x * WG_SIZE < config.n_pathtag { + agg = bbox_reduced[global_id.x]; + } + sh_bbox[local_id.x] = agg; + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) { + workgroupBarrier(); + if local_id.x >= 1u << i { + let other = sh_bbox[local_id.x - (1u << i)]; + agg = combine_bbox_monoid(other, agg); + } + workgroupBarrier(); + sh_bbox[local_id.x] = agg; + } + // Explanation of this trick: we don't need to fix up first bbox. + // By offsetting the index, we can use the inclusive scan. + let ix = global_id.x + 1u; + if ix * WG_SIZE < config.n_pathtag { + // First path of the workgroup. 
+ let path_ix = tag_monoids[ix * (WG_SIZE / 4u)].path_ix; + if (agg.flags & FLAG_RESET_BBOX) == 0u && (agg.bbox.z > agg.bbox.x || agg.bbox.w > agg.bbox.y) { + let out = &path_bboxes[path_ix]; + // TODO: casting goes away + var bbox = vec4(f32((*out).x0), f32((*out).y0), f32((*out).x1), f32((*out).y1)); + if bbox.z > bbox.x || bbox.w > bbox.y { + bbox = vec4(min(agg.bbox.xy, bbox.xy), max(agg.bbox.zw, bbox.zw)); + } else { + bbox = agg.bbox; + } + (*out).x0 = round_down(bbox.x); + (*out).y0 = round_down(bbox.y); + (*out).x1 = round_up(bbox.z); + (*out).y1 = round_up(bbox.w); + } + } +}
diff --git a/shader/pathseg.wgsl b/shader/pathseg.wgsl index ec059ab..a8f32f2 100644 --- a/shader/pathseg.wgsl +++ b/shader/pathseg.wgsl
@@ -11,6 +11,8 @@ // There's some duplication of the decoding code but we won't worry about // that just now. Perhaps it could be factored more nicely later. +#import bbox +#import bbox_monoid #import config #import pathtag #import cubic @@ -24,51 +26,14 @@ @group(0) @binding(2) var<storage> tag_monoids: array<TagMonoid>; -struct AtomicPathBbox { - x0: atomic<i32>, - y0: atomic<i32>, - x1: atomic<i32>, - y1: atomic<i32>, - linewidth: f32, - trans_ix: u32, -} - @group(0) @binding(3) -var<storage, read_write> path_bboxes: array<AtomicPathBbox>; - +var<storage, read_write> path_bboxes: array<PathBbox>; @group(0) @binding(4) var<storage, read_write> cubics: array<Cubic>; -// Monoid is yagni, for future optimization - -// struct BboxMonoid { -// bbox: vec4<f32>, -// flags: u32, -// } - -// let FLAG_RESET_BBOX = 1u; -// let FLAG_SET_BBOX = 2u; - -// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid { -// var c: BboxMonoid; -// c.bbox = b.bbox; -// // TODO: previous-me thought this should be gated on b & SET_BBOX == false also -// if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y { -// c.bbox = a.bbox; -// } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u || -// (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y) -// { -// c.bbox = vec4<f32>(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw)); -// } -// c.flags = (a.flags & FLAG_SET_BBOX) | b.flags; -// c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u; -// return c; -// } - -// fn bbox_monoid_identity() -> BboxMonoid { -// return BboxMonoid(); -// } +@group(0) @binding(5) +var<storage, read_write> bbox_reduced: array<BboxMonoid>; var<private> pathdata_base: u32; @@ -115,10 +80,14 @@ return i32(ceil(x)); } +let WG_SIZE = 256u; +var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>; + @compute @workgroup_size(256) fn main( @builtin(global_invocation_id) global_id: vec3<u32>, @builtin(local_invocation_id) local_id: vec3<u32>, + @builtin(workgroup_id) wg_id: 
vec3<u32>, ) { let ix = global_id.x; let tag_word = scene[config.pathtag_base + (ix >> 2u)]; @@ -130,12 +99,10 @@ let out = &path_bboxes[tm.path_ix]; let linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]); - if (tag_byte & PATH_TAG_PATH) != 0u { - (*out).linewidth = linewidth; - (*out).trans_ix = tm.trans_ix; - } + let bbox_flags = u32((tag_byte & PATH_TAG_PATH) != 0u); // Decode path data let seg_type = tag_byte & PATH_TAG_SEG_TYPE; + var bbox: vec4<f32>; if seg_type != 0u { var p0: vec2<f32>; var p1: vec2<f32>; @@ -163,7 +130,7 @@ let transform = read_transform(config.transform_base, tm.trans_ix); p0 = transform_apply(transform, p0); p1 = transform_apply(transform, p1); - var bbox = vec4(min(p0, p1), max(p0, p1)); + bbox = vec4(min(p0, p1), max(p0, p1)); // Degree-raise if seg_type == PATH_TAG_LINETO { p3 = p1; @@ -191,13 +158,29 @@ } let flags = u32(linewidth >= 0.0); cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags); - // Update bounding box using atomics only. Computing a monoid is a - // potential future optimization. 
- if bbox.z > bbox.x || bbox.w > bbox.y { - atomicMin(&(*out).x0, round_down(bbox.x)); - atomicMin(&(*out).y0, round_down(bbox.y)); - atomicMax(&(*out).x1, round_up(bbox.z)); - atomicMax(&(*out).y1, round_up(bbox.w)); + } + var agg = BboxMonoid(bbox, bbox_flags); + sh_bbox[local_id.x] = agg; + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) { + workgroupBarrier(); + if local_id.x >= 1u << i { + let other = sh_bbox[local_id.x - (1u << i)]; + agg = combine_bbox_monoid(other, agg); } + workgroupBarrier(); + sh_bbox[local_id.x] = agg; + } + if local_id.x == WG_SIZE - 1u { + bbox_reduced[wg_id.x] = agg; + } + if bbox_flags != 0u { + let out = &path_bboxes[tm.path_ix]; + // TODO: now that we're not atomic, don't need fixed-point + (*out).x0 = round_down(agg.bbox.x); + (*out).y0 = round_down(agg.bbox.y); + (*out).x1 = round_up(agg.bbox.z); + (*out).y1 = round_up(agg.bbox.w); + (*out).linewidth = linewidth; + (*out).trans_ix = tm.trans_ix; } }
diff --git a/shader/shared/bbox_monoid.wgsl b/shader/shared/bbox_monoid.wgsl new file mode 100644 index 0000000..1a301d2 --- /dev/null +++ b/shader/shared/bbox_monoid.wgsl
@@ -0,0 +1,25 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +struct BboxMonoid { + bbox: vec4<f32>, + flags: u32, +} + +let FLAG_RESET_BBOX = 1u; +let FLAG_SET_BBOX = 2u; + +// Technically this is a semigroup with a left identity rather than a +// true monoid, but that is good enough for our purposes. +fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid { + var bbox = b.bbox; + if (b.flags & FLAG_SET_BBOX) == 0u && (a.flags & FLAG_RESET_BBOX) == 0u { + if bbox.z <= bbox.x && bbox.w <= bbox.y { + bbox = a.bbox; + } else if a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y { + bbox = vec4(min(a.bbox.xy, bbox.xy), max(a.bbox.zw, bbox.zw)); + } + } + let flags = ((a.flags | (a.flags << 1u)) & FLAG_SET_BBOX) | b.flags; + return BboxMonoid(bbox, flags); +} +
diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl index 0cb56d8..6ef9434 100644 --- a/shader/shared/config.wgsl +++ b/shader/shared/config.wgsl
@@ -10,6 +10,7 @@ n_drawobj: u32, n_path: u32, n_clip: u32, + n_pathtag: u32, // To reduce the number of bindings, info and bin data are combined // into one buffer.
diff --git a/src/encoding/packed.rs b/src/encoding/packed.rs index 7004477..34f7aef 100644 --- a/src/encoding/packed.rs +++ b/src/encoding/packed.rs
@@ -32,6 +32,8 @@ pub n_paths: u32, /// Number of clips. pub n_clips: u32, + /// Number of path tags. + pub n_pathtag: u32, /// Start of binning data. pub bin_data_start: u32, /// Start of path tag stream. @@ -139,13 +141,14 @@ // Pack encoded data. let layout = &mut self.layout; *layout = Layout::default(); + let n_path_tags = encoding.path_tags.len(); layout.n_paths = encoding.n_paths; layout.n_draw_objects = encoding.n_paths; layout.n_clips = encoding.n_clips; + layout.n_pathtag = n_path_tags as u32; let data = &mut self.data; data.clear(); // Path tag stream - let n_path_tags = encoding.path_tags.len(); let path_tag_padded = align_up(n_path_tags, 4 * shaders::PATHTAG_REDUCE_WG); let capacity = path_tag_padded + slice_size_in_bytes(&encoding.path_data)
diff --git a/src/render.rs b/src/render.rs index 0ee195a..fb5c253 100644 --- a/src/render.rs +++ b/src/render.rs
@@ -23,7 +23,10 @@ const DRAW_BBOX_SIZE: u64 = 16; const BUMP_SIZE: u64 = 16; const BIN_HEADER_SIZE: u64 = 8; +const BBOX_MONOID_SIZE: u64 = 32; +// Note: this is defined here as it's still used by the reduced pipeline, +// but for the full pipeline, use the version in `encoding`. #[repr(C)] #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] struct Config { @@ -34,6 +37,7 @@ n_drawobj: u32, n_path: u32, n_clip: u32, + n_pathtag: u32, bin_data_start: u32, pathtag_base: u32, pathdata_base: u32, @@ -202,14 +206,10 @@ ); let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG; let path_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE); - recording.dispatch( - shaders.bbox_clear, - (drawobj_wgs, 1, 1), - [config_buf, path_bbox_buf], - ); let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE); let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG; + let bbox_reduced_buf = ResourceProxy::new_buf(path_coarse_wgs as u64 * BBOX_MONOID_SIZE); recording.dispatch( shaders.pathseg, (path_coarse_wgs, 1, 1), @@ -219,8 +219,14 @@ tagmonoid_buf, path_bbox_buf, cubic_buf, + bbox_reduced_buf, ], ); + recording.dispatch( + shaders.bbox_fixup, + (1, 1, 1), + [config_buf, tagmonoid_buf, bbox_reduced_buf, path_bbox_buf], + ); let draw_reduced_buf = ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE); recording.dispatch( shaders.draw_reduce,
diff --git a/src/shaders.rs b/src/shaders.rs index 1df6b14..9371d1a 100644 --- a/src/shaders.rs +++ b/src/shaders.rs
@@ -48,8 +48,8 @@ pub struct FullShaders { pub pathtag_reduce: ShaderId, pub pathtag_scan: ShaderId, - pub bbox_clear: ShaderId, pub pathseg: ShaderId, + pub bbox_fixup: ShaderId, pub draw_reduce: ShaderId, pub draw_leaf: ShaderId, pub clip_reduce: ShaderId, @@ -144,11 +144,6 @@ BindType::Buffer, ], )?; - let bbox_clear = engine.add_shader( - device, - preprocess::preprocess(shader!("bbox_clear"), &empty, &imports).into(), - &[BindType::Uniform, BindType::Buffer], - )?; let pathseg = engine.add_shader( device, preprocess::preprocess(shader!("pathseg"), &full_config, &imports).into(), @@ -158,6 +153,17 @@ BindType::BufReadOnly, BindType::Buffer, BindType::Buffer, + BindType::Buffer, + ], + )?; + let bbox_fixup = engine.add_shader( + device, + preprocess::preprocess(shader!("bbox_fixup"), &full_config, &imports).into(), + &[ + BindType::Uniform, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Buffer, ], )?; let draw_reduce = engine.add_shader( @@ -279,8 +285,8 @@ Ok(FullShaders { pathtag_reduce, pathtag_scan, - bbox_clear, pathseg, + bbox_fixup, draw_reduce, draw_leaf, clip_reduce, @@ -305,6 +311,7 @@ const SHARED_SHADERS: &[(&str, &str)] = &[ shared_shader!("bbox"), + shared_shader!("bbox_monoid"), shared_shader!("blend"), shared_shader!("bump"), shared_shader!("clip"),