First cut at bbox improvement
This works but is currently limited to 64k path segments.
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 883465a..6deaef7 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,7 @@
{
"wgsl-analyzer.customImports": {
"bbox": "${workspaceFolder}/shader/shared/bbox.wgsl",
+ "bbox_monoid": "${workspaceFolder}/shader/shared/bbox_monoid.wgsl",
"blend": "${workspaceFolder}/shader/shared/blend.wgsl",
"bump": "${workspaceFolder}/shader/shared/bump.wgsl",
"clip": "${workspaceFolder}/shader/shared/clip.wgsl",
diff --git a/shader/bbox_clear.wgsl b/shader/bbox_clear.wgsl
deleted file mode 100644
index fe8cceb..0000000
--- a/shader/bbox_clear.wgsl
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-#import config
-
-@group(0) @binding(0)
-var<uniform> config: Config;
-
-struct PathBbox {
- x0: i32,
- y0: i32,
- x1: i32,
- y1: i32,
- linewidth: f32,
- trans_ix: u32,
-}
-
-@group(0) @binding(1)
-var<storage, read_write> path_bboxes: array<PathBbox>;
-
-@compute @workgroup_size(256)
-fn main(
- @builtin(global_invocation_id) global_id: vec3<u32>,
-) {
- let ix = global_id.x;
- if ix < config.n_path {
- path_bboxes[ix].x0 = 0x7fffffff;
- path_bboxes[ix].y0 = 0x7fffffff;
- path_bboxes[ix].x1 = -0x80000000;
- path_bboxes[ix].y1 = -0x80000000;
- }
-}
diff --git a/shader/bbox_fixup.wgsl b/shader/bbox_fixup.wgsl
new file mode 100644
index 0000000..d336466
--- /dev/null
+++ b/shader/bbox_fixup.wgsl
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+#import config
+#import pathtag
+#import bbox
+#import bbox_monoid
+
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+@group(0) @binding(1)
+var<storage> tag_monoids: array<TagMonoid>;
+
+@group(0) @binding(2)
+var<storage> bbox_reduced: array<BboxMonoid>;
+
+@group(0) @binding(3)
+var<storage, read_write> path_bboxes: array<PathBbox>;
+
+let WG_SIZE = 256u;
+var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>;
+
+fn round_down(x: f32) -> i32 {
+ return i32(floor(x));
+}
+
+fn round_up(x: f32) -> i32 {
+ return i32(ceil(x));
+}
+
+// In the configuration with <= 64k pathtags, there's only one
+// workgroup here, so the distinction between global and local is
+// not meaningful. But we'll probably want to #ifdef a larger
+// configuration, in which we also bind a doubly reduced buffer.
+@compute @workgroup_size(256)
+fn main(
+ @builtin(global_invocation_id) global_id: vec3<u32>,
+ @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+ var agg: BboxMonoid;
+ if global_id.x * WG_SIZE < config.n_pathtag {
+ agg = bbox_reduced[global_id.x];
+ }
+ sh_bbox[local_id.x] = agg;
+ for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
+ workgroupBarrier();
+ if local_id.x >= 1u << i {
+ let other = sh_bbox[local_id.x - (1u << i)];
+ agg = combine_bbox_monoid(other, agg);
+ }
+ workgroupBarrier();
+ sh_bbox[local_id.x] = agg;
+ }
+ // Explanation of this trick: we don't need to fix up the first bbox.
+ // By offsetting the index, we can use the inclusive scan.
+ let ix = global_id.x + 1u;
+ if ix * WG_SIZE < config.n_pathtag {
+ // First path of the workgroup.
+ let path_ix = tag_monoids[ix * (WG_SIZE / 4u)].path_ix;
+ if (agg.flags & FLAG_RESET_BBOX) == 0u && (agg.bbox.z > agg.bbox.x || agg.bbox.w > agg.bbox.y) {
+ let out = &path_bboxes[path_ix];
+ // TODO: this casting goes away once path bboxes are stored as f32 instead of fixed point
+ var bbox = vec4(f32((*out).x0), f32((*out).y0), f32((*out).x1), f32((*out).y1));
+ if bbox.z > bbox.x || bbox.w > bbox.y {
+ bbox = vec4(min(agg.bbox.xy, bbox.xy), max(agg.bbox.zw, bbox.zw));
+ } else {
+ bbox = agg.bbox;
+ }
+ (*out).x0 = round_down(bbox.x);
+ (*out).y0 = round_down(bbox.y);
+ (*out).x1 = round_up(bbox.z);
+ (*out).y1 = round_up(bbox.w);
+ }
+ }
+}
diff --git a/shader/pathseg.wgsl b/shader/pathseg.wgsl
index ec059ab..a8f32f2 100644
--- a/shader/pathseg.wgsl
+++ b/shader/pathseg.wgsl
@@ -11,6 +11,8 @@
// There's some duplication of the decoding code but we won't worry about
// that just now. Perhaps it could be factored more nicely later.
+#import bbox
+#import bbox_monoid
#import config
#import pathtag
#import cubic
@@ -24,51 +26,14 @@
@group(0) @binding(2)
var<storage> tag_monoids: array<TagMonoid>;
-struct AtomicPathBbox {
- x0: atomic<i32>,
- y0: atomic<i32>,
- x1: atomic<i32>,
- y1: atomic<i32>,
- linewidth: f32,
- trans_ix: u32,
-}
-
@group(0) @binding(3)
-var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
-
+var<storage, read_write> path_bboxes: array<PathBbox>;
@group(0) @binding(4)
var<storage, read_write> cubics: array<Cubic>;
-// Monoid is yagni, for future optimization
-
-// struct BboxMonoid {
-// bbox: vec4<f32>,
-// flags: u32,
-// }
-
-// let FLAG_RESET_BBOX = 1u;
-// let FLAG_SET_BBOX = 2u;
-
-// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
-// var c: BboxMonoid;
-// c.bbox = b.bbox;
-// // TODO: previous-me thought this should be gated on b & SET_BBOX == false also
-// if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y {
-// c.bbox = a.bbox;
-// } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u ||
-// (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)
-// {
-// c.bbox = vec4<f32>(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw));
-// }
-// c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
-// c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u;
-// return c;
-// }
-
-// fn bbox_monoid_identity() -> BboxMonoid {
-// return BboxMonoid();
-// }
+@group(0) @binding(5)
+var<storage, read_write> bbox_reduced: array<BboxMonoid>;
var<private> pathdata_base: u32;
@@ -115,10 +80,14 @@
return i32(ceil(x));
}
+let WG_SIZE = 256u;
+var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>;
+
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
+ @builtin(workgroup_id) wg_id: vec3<u32>,
) {
let ix = global_id.x;
let tag_word = scene[config.pathtag_base + (ix >> 2u)];
@@ -130,12 +99,10 @@
let out = &path_bboxes[tm.path_ix];
let linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]);
- if (tag_byte & PATH_TAG_PATH) != 0u {
- (*out).linewidth = linewidth;
- (*out).trans_ix = tm.trans_ix;
- }
+ let bbox_flags = u32((tag_byte & PATH_TAG_PATH) != 0u);
// Decode path data
let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
+ var bbox: vec4<f32>;
if seg_type != 0u {
var p0: vec2<f32>;
var p1: vec2<f32>;
@@ -163,7 +130,7 @@
let transform = read_transform(config.transform_base, tm.trans_ix);
p0 = transform_apply(transform, p0);
p1 = transform_apply(transform, p1);
- var bbox = vec4(min(p0, p1), max(p0, p1));
+ bbox = vec4(min(p0, p1), max(p0, p1));
// Degree-raise
if seg_type == PATH_TAG_LINETO {
p3 = p1;
@@ -191,13 +158,29 @@
}
let flags = u32(linewidth >= 0.0);
cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags);
- // Update bounding box using atomics only. Computing a monoid is a
- // potential future optimization.
- if bbox.z > bbox.x || bbox.w > bbox.y {
- atomicMin(&(*out).x0, round_down(bbox.x));
- atomicMin(&(*out).y0, round_down(bbox.y));
- atomicMax(&(*out).x1, round_up(bbox.z));
- atomicMax(&(*out).y1, round_up(bbox.w));
+ }
+ var agg = BboxMonoid(bbox, bbox_flags);
+ sh_bbox[local_id.x] = agg;
+ for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
+ workgroupBarrier();
+ if local_id.x >= 1u << i {
+ let other = sh_bbox[local_id.x - (1u << i)];
+ agg = combine_bbox_monoid(other, agg);
}
+ workgroupBarrier();
+ sh_bbox[local_id.x] = agg;
+ }
+ if local_id.x == WG_SIZE - 1u {
+ bbox_reduced[wg_id.x] = agg;
+ }
+ if bbox_flags != 0u {
+ let out = &path_bboxes[tm.path_ix];
+ // TODO: now that updates are no longer atomic, fixed-point storage is no longer needed
+ (*out).x0 = round_down(agg.bbox.x);
+ (*out).y0 = round_down(agg.bbox.y);
+ (*out).x1 = round_up(agg.bbox.z);
+ (*out).y1 = round_up(agg.bbox.w);
+ (*out).linewidth = linewidth;
+ (*out).trans_ix = tm.trans_ix;
}
}
diff --git a/shader/shared/bbox_monoid.wgsl b/shader/shared/bbox_monoid.wgsl
new file mode 100644
index 0000000..1a301d2
--- /dev/null
+++ b/shader/shared/bbox_monoid.wgsl
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+struct BboxMonoid {
+ bbox: vec4<f32>,
+ flags: u32,
+}
+
+let FLAG_RESET_BBOX = 1u;
+let FLAG_SET_BBOX = 2u;
+
+// Technically this is a semigroup with a left identity rather than a
+// true monoid, but that is good enough for our purposes.
+fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
+ var bbox = b.bbox;
+ if (b.flags & FLAG_SET_BBOX) == 0u && (a.flags & FLAG_RESET_BBOX) == 0u {
+ if bbox.z <= bbox.x && bbox.w <= bbox.y {
+ bbox = a.bbox;
+ } else if a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y {
+ bbox = vec4(min(a.bbox.xy, bbox.xy), max(a.bbox.zw, bbox.zw));
+ }
+ }
+ let flags = ((a.flags | (a.flags << 1u)) & FLAG_SET_BBOX) | b.flags;
+ return BboxMonoid(bbox, flags);
+}
+
diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl
index 0cb56d8..6ef9434 100644
--- a/shader/shared/config.wgsl
+++ b/shader/shared/config.wgsl
@@ -10,6 +10,7 @@
n_drawobj: u32,
n_path: u32,
n_clip: u32,
+ n_pathtag: u32,
// To reduce the number of bindings, info and bin data are combined
// into one buffer.
diff --git a/src/encoding/packed.rs b/src/encoding/packed.rs
index 7004477..34f7aef 100644
--- a/src/encoding/packed.rs
+++ b/src/encoding/packed.rs
@@ -32,6 +32,8 @@
pub n_paths: u32,
/// Number of clips.
pub n_clips: u32,
+ /// Number of path tags.
+ pub n_pathtag: u32,
/// Start of binning data.
pub bin_data_start: u32,
/// Start of path tag stream.
@@ -139,13 +141,14 @@
// Pack encoded data.
let layout = &mut self.layout;
*layout = Layout::default();
+ let n_path_tags = encoding.path_tags.len();
layout.n_paths = encoding.n_paths;
layout.n_draw_objects = encoding.n_paths;
layout.n_clips = encoding.n_clips;
+ layout.n_pathtag = n_path_tags as u32;
let data = &mut self.data;
data.clear();
// Path tag stream
- let n_path_tags = encoding.path_tags.len();
let path_tag_padded = align_up(n_path_tags, 4 * shaders::PATHTAG_REDUCE_WG);
let capacity = path_tag_padded
+ slice_size_in_bytes(&encoding.path_data)
diff --git a/src/render.rs b/src/render.rs
index 0ee195a..fb5c253 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -23,7 +23,10 @@
const DRAW_BBOX_SIZE: u64 = 16;
const BUMP_SIZE: u64 = 16;
const BIN_HEADER_SIZE: u64 = 8;
+const BBOX_MONOID_SIZE: u64 = 32;
+// Note: the Config struct below is kept here because the reduced pipeline
+// still uses it; the full pipeline uses the version in `encoding`.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
struct Config {
@@ -34,6 +37,7 @@
n_drawobj: u32,
n_path: u32,
n_clip: u32,
+ n_pathtag: u32,
bin_data_start: u32,
pathtag_base: u32,
pathdata_base: u32,
@@ -202,14 +206,10 @@
);
let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
let path_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE);
- recording.dispatch(
- shaders.bbox_clear,
- (drawobj_wgs, 1, 1),
- [config_buf, path_bbox_buf],
- );
let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE);
let path_coarse_wgs =
(n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
+ let bbox_reduced_buf = ResourceProxy::new_buf(path_coarse_wgs as u64 * BBOX_MONOID_SIZE);
recording.dispatch(
shaders.pathseg,
(path_coarse_wgs, 1, 1),
@@ -219,8 +219,14 @@
tagmonoid_buf,
path_bbox_buf,
cubic_buf,
+ bbox_reduced_buf,
],
);
+ recording.dispatch(
+ shaders.bbox_fixup,
+ (1, 1, 1),
+ [config_buf, tagmonoid_buf, bbox_reduced_buf, path_bbox_buf],
+ );
let draw_reduced_buf = ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE);
recording.dispatch(
shaders.draw_reduce,
diff --git a/src/shaders.rs b/src/shaders.rs
index 1df6b14..9371d1a 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -48,8 +48,8 @@
pub struct FullShaders {
pub pathtag_reduce: ShaderId,
pub pathtag_scan: ShaderId,
- pub bbox_clear: ShaderId,
pub pathseg: ShaderId,
+ pub bbox_fixup: ShaderId,
pub draw_reduce: ShaderId,
pub draw_leaf: ShaderId,
pub clip_reduce: ShaderId,
@@ -144,11 +144,6 @@
BindType::Buffer,
],
)?;
- let bbox_clear = engine.add_shader(
- device,
- preprocess::preprocess(shader!("bbox_clear"), &empty, &imports).into(),
- &[BindType::Uniform, BindType::Buffer],
- )?;
let pathseg = engine.add_shader(
device,
preprocess::preprocess(shader!("pathseg"), &full_config, &imports).into(),
@@ -158,6 +153,17 @@
BindType::BufReadOnly,
BindType::Buffer,
BindType::Buffer,
+ BindType::Buffer,
+ ],
+ )?;
+ let bbox_fixup = engine.add_shader(
+ device,
+ preprocess::preprocess(shader!("bbox_fixup"), &full_config, &imports).into(),
+ &[
+ BindType::Uniform,
+ BindType::BufReadOnly,
+ BindType::BufReadOnly,
+ BindType::Buffer,
],
)?;
let draw_reduce = engine.add_shader(
@@ -279,8 +285,8 @@
Ok(FullShaders {
pathtag_reduce,
pathtag_scan,
- bbox_clear,
pathseg,
+ bbox_fixup,
draw_reduce,
draw_leaf,
clip_reduce,
@@ -305,6 +311,7 @@
const SHARED_SHADERS: &[(&str, &str)] = &[
shared_shader!("bbox"),
+ shared_shader!("bbox_monoid"),
shared_shader!("blend"),
shared_shader!("bump"),
shared_shader!("clip"),