Add bounding box intersection test
diff --git a/tests/shader/build.ninja b/tests/shader/build.ninja
index 56518d6..2d6504d 100644
--- a/tests/shader/build.ninja
+++ b/tests/shader/build.ninja
@@ -90,3 +90,13 @@
 build gen/union_leaf.hlsl: hlsl gen/union_leaf.spv
 build gen/union_leaf.dxil: dxil gen/union_leaf.hlsl
 build gen/union_leaf.msl: msl gen/union_leaf.spv
+
+build gen/intersection_reduce.spv: glsl intersection_reduce.comp
+build gen/intersection_reduce.hlsl: hlsl gen/intersection_reduce.spv
+build gen/intersection_reduce.dxil: dxil gen/intersection_reduce.hlsl
+build gen/intersection_reduce.msl: msl gen/intersection_reduce.spv
+
+build gen/intersection_leaf.spv: glsl intersection_leaf.comp
+build gen/intersection_leaf.hlsl: hlsl gen/intersection_leaf.spv
+build gen/intersection_leaf.dxil: dxil gen/intersection_leaf.hlsl
+build gen/intersection_leaf.msl: msl gen/intersection_leaf.spv
diff --git a/tests/shader/intersection_leaf.comp b/tests/shader/intersection_leaf.comp
new file mode 100644
index 0000000..c00bd32
--- /dev/null
+++ b/tests/shader/intersection_leaf.comp
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The main phase for the bbox union computation
+
+#version 450
+
+#define LG_N_SEQ 0
+#define N_SEQ (1 << LG_N_SEQ)
+#define LG_WG_SIZE 8
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PART_SIZE (WG_SIZE * N_SEQ)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+// The bicyclic monoid
+struct Bic {
+    uint a;
+    uint b;
+};
+
+Bic bic_combine(Bic x, Bic y) {
+    uint m = min(x.b, y.a);
+    return Bic(x.a + y.a - m, x.b + y.b - m);
+}
+
+// Node type values
+#define OPEN_PAREN 0
+#define CLOSE_PAREN 1
+
+struct Node {
+    uint node_type;
+    uint pad1;
+    uint pad2;
+    uint pad3;
+    vec4 bbox;
+};
+
+#define BBOX_IDENTITY vec4(-1e9, -1e9, 1e9, 1e9)
+
+vec4 bbox_intersect(vec4 a, vec4 b) {
+    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
+}
+
+layout(binding = 0) readonly buffer InBuf {
+    Node[] inbuf;
+};
+
+layout(binding = 1) readonly buffer BicBuf {
+    Bic[] bicbuf;
+};
+
+layout(binding = 2) readonly buffer StackBuf {
+    vec4[] stack;
+};
+
+layout(binding = 3) buffer OutBuf {
+    vec4[] outbuf;
+};
+
+shared Bic sh_bic[WG_SIZE * 2 - 2];
+shared vec4 sh_stack[WG_SIZE];
+shared uint sh_link[WG_SIZE];
+shared vec4 sh_bbox[WG_SIZE];
+
+void main() {
+    uint th = gl_LocalInvocationID.x;
+    // materialize stack snapshot as of the start of this partition
+
+    // start with reverse scan of bic values from first dispatch
+    Bic bic = Bic(0, 0);
+    if (th < gl_WorkGroupID.x) {
+        bic = bicbuf[th];
+    }
+    sh_bic[th] = bic;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        uint other_ix = th + (1u << i);
+        if (other_ix < WG_SIZE) {
+            bic = bic_combine(bic, sh_bic[other_ix]);
+        }
+        barrier();
+        sh_bic[th] = bic;
+    }
+
+    barrier();
+    uint size = sh_bic[0].b;
+
+    // forward scan of "top of stack" values
+    uint bic_next_b = 0;
+    if (th + 1 < WG_SIZE) {
+        bic_next_b = sh_bic[th + 1].b;
+    }
+    vec4 bbox = BBOX_IDENTITY;
+    if (bic.b > bic_next_b) {
+        bbox = stack[th * PART_SIZE + bic.b - bic_next_b - 1];
+    }
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        sh_bbox[th] = bbox;
+        barrier();
+        if (th >= (1u << i)) {
+            bbox = bbox_intersect(sh_bbox[th - (1u << i)], bbox);
+        }
+        barrier();
+    }
+    sh_bbox[th] = bbox;
+    barrier();
+
+    // binary search in stack to do stream compaction
+    uint sp = PART_SIZE - 1 - th;
+    uint ix = 0;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        uint probe = ix + (uint(PART_SIZE / 2) >> i);
+        if (sp < sh_bic[probe].b) {
+            ix = probe;
+        }
+    }
+    // ix is the largest value such that sp < sh_bic[ix].b (if any)
+    uint b = sh_bic[ix].b;
+    if (sp < b) {
+        bbox = stack[ix * PART_SIZE + b - sp - 1];
+        if (ix > 0) {
+            bbox = bbox_intersect(sh_bbox[ix - 1], bbox);
+        }
+        sh_stack[th] = bbox;
+    }
+
+    // Do tree reduction of bicyclic semigroups
+    Node inp = inbuf[gl_GlobalInvocationID.x];
+    uint node_type = inp.node_type;
+    bic = Bic(uint(node_type == CLOSE_PAREN), uint(node_type == OPEN_PAREN));
+    sh_bic[th] = bic;
+    uint inbase = 0;
+    for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
+        uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
+        barrier();
+        if (th < (1u << (LG_WG_SIZE - 1 - i))) {
+            sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
+        }
+        inbase = outbase;
+    }
+    barrier();
+
+    // Search for predecessor node
+    ix = th;
+    bic = Bic(0, 0);
+    uint j = 0;
+    while (j < LG_WG_SIZE) {
+        uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
+        if (((ix >> j) & 1) != 0) {
+            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
+            if (test.b > 0) {
+                break;
+            }
+            bic = test;
+            ix -= 1u << j;
+        }
+        j++;
+    }
+    if (ix > 0) {
+        while (j > 0) {
+            j--;
+            uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
+            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
+            if (test.b == 0) {
+                bic = test;
+                ix -= 1u << j;
+            }
+        }
+    }
+    // ix is the smallest value such that reduce(ix..th).b == 0
+    uint link = ix > 0 ? ix - 1 : ~0u - bic.a;
+    bbox = inp.bbox;
+
+    // scan along parent links
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        sh_link[th] = link;
+        sh_bbox[th] = bbox;
+        barrier();
+        if (int(link) >= 0) {
+            bbox = bbox_intersect(sh_bbox[link], bbox);
+            link = sh_link[link];
+        }
+        barrier();
+    }
+    if (int(link + size) >= 0) {
+        bbox = bbox_intersect(sh_stack[WG_SIZE + link], bbox);
+    }
+    outbuf[gl_GlobalInvocationID.x] = bbox;
+}
diff --git a/tests/shader/intersection_reduce.comp b/tests/shader/intersection_reduce.comp
new file mode 100644
index 0000000..cc1e600
--- /dev/null
+++ b/tests/shader/intersection_reduce.comp
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The reduction phase for the blend/union test
+
+#version 450
+
+// At least for now, N_SEQ is hardwired at 1
+#define LG_WG_SIZE 8
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PART_SIZE WG_SIZE
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+// The bicyclic semigroup (monoid)
+struct Bic {
+    uint a;
+    uint b;
+};
+
+Bic bic_combine(Bic x, Bic y) {
+    uint m = min(x.b, y.a);
+    return Bic(x.a + y.a - m, x.b + y.b - m);
+}
+
+// Node type values
+#define OPEN_PAREN 0
+#define CLOSE_PAREN 1
+
+struct Node {
+    uint node_type;
+    uint pad1;
+    uint pad2;
+    uint pad3;
+    vec4 bbox;
+};
+
+vec4 bbox_intersect(vec4 a, vec4 b) {
+    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
+}
+
+layout(binding = 0) readonly buffer InBuf {
+    Node[] inbuf;
+};
+
+layout(binding = 1) buffer BicBuf {
+    Bic[] bicbuf;
+};
+
+// Output buffer for stack slice data
+layout(binding = 2) buffer StackBuf {
+    // open question: do we need the parent link or are bboxes sufficient?
+    vec4[] stack;
+};
+
+shared Bic sh_bic[WG_SIZE];
+shared vec4 sh_bbox[WG_SIZE];
+
+void main() {
+    uint th = gl_LocalInvocationID.x;
+    Node inp = inbuf[gl_GlobalInvocationID.x];
+    uint node_type = inp.node_type;
+    Bic bic = Bic(uint(node_type == CLOSE_PAREN), uint(node_type == OPEN_PAREN));
+    sh_bic[gl_LocalInvocationID.x] = bic;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        uint other_ix = gl_LocalInvocationID.x + (1u << i);
+        if (other_ix < WG_SIZE) {
+            bic = bic_combine(bic, sh_bic[other_ix]);
+        }
+        barrier();
+        sh_bic[th] = bic;
+    }
+    if (th == 0) {
+        bicbuf[gl_WorkGroupID.x] = bic;
+    }
+    barrier();
+    // Number of elements in stack slice output by this partition
+    uint size = sh_bic[0].b;
+    bic = Bic(0, 0);
+    if (th + 1 < WG_SIZE) {
+        bic = sh_bic[th + 1];
+    }
+    // Do stream compaction
+    if (inp.node_type == OPEN_PAREN && bic.a == 0) {
+        uint out_ix = size - bic.b - 1;
+        sh_bbox[out_ix] = inp.bbox;
+    }
+    barrier();
+    vec4 bbox;
+    if (th < size) {
+        bbox = sh_bbox[th];
+    }
+    // Forward scan of intersection over resulting stack slice
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        if (th < size && th >= (1u << i)) {
+            bbox = bbox_intersect(sh_bbox[th - (1u << i)], bbox);
+        }
+        barrier();
+        if (th < size) {
+            sh_bbox[th] = bbox;
+        }
+        barrier();
+    }
+    if (th < size) {
+        stack[gl_GlobalInvocationID.x] = bbox;
+    }
+}
diff --git a/tests/shader/union_reduce.comp b/tests/shader/union_reduce.comp
index 645cb0b..f584ae3 100644
--- a/tests/shader/union_reduce.comp
+++ b/tests/shader/union_reduce.comp
@@ -73,7 +73,6 @@
         barrier();
         uint other_ix = gl_LocalInvocationID.x + (1u << i);
         if (other_ix < WG_SIZE) {
-            Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)];
             bic = bic_combine(bic, sh_bic[other_ix]);
             bbox = bbox_union(bbox, sh_bbox[other_ix]);
         }
diff --git a/tests/src/bbox_intersection.rs b/tests/src/bbox_intersection.rs
new file mode 100644
index 0000000..ebfed5a
--- /dev/null
+++ b/tests/src/bbox_intersection.rs
@@ -0,0 +1,273 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use rand::Rng;
+
+use bytemuck::{Pod, Zeroable};
+use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
+use piet_gpu_hal::{Buffer, Pipeline};
+
+use crate::config::Config;
+use crate::runner::Runner;
+use crate::test_result::TestResult;
+
+const WG_SIZE: u64 = 256;
+const N_ROWS: u64 = 1;
+const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
+
+struct IntersectionCode {
+    reduce_pipeline: Pipeline,
+    leaf_pipeline: Pipeline,
+}
+
+struct IntersectionStage {
+    bic_buf: Buffer,
+    intersection_buf: Buffer,
+}
+
+struct IntersectionBinding {
+    reduce_ds: DescriptorSet,
+    leaf_ds: DescriptorSet,
+}
+
+const OPEN_PAREN: u32 = 0;
+const CLOSE_PAREN: u32 = 1;
+
+#[derive(Clone, Copy, Zeroable, Pod, Default, Debug)]
+#[repr(C)]
+struct Node {
+    node_type: u32,
+    // Note: the padding here is annoying, but it leaves the code in a somewhat
+    // more readable state than hand-marshaling.
+    pad1: u32,
+    pad2: u32,
+    pad3: u32,
+    bbox: [f32; 4],
+}
+
+struct IntersectionData {
+    nodes: Vec<Node>,
+}
+
+pub unsafe fn run_intersection_test(runner: &mut Runner, config: &Config) -> TestResult {
+    let mut result = TestResult::new("Bounding box Intersection");
+    let n_elements: u64 = config.size.choose(1 << 9, 1 << 12, 1 << 18);
+    let data = IntersectionData::new(n_elements);
+    let data_buf = runner
+        .session
+        .create_buffer_init(&data.nodes, BufferUsage::STORAGE)
+        .unwrap();
+    let out_buf = runner.buf_down(data_buf.size(), BufferUsage::empty());
+
+    let code = IntersectionCode::new(runner);
+    let stage = IntersectionStage::new(runner, n_elements);
+    let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
+
+    let mut total_elapsed = 0.0;
+    let n_iter = config.n_iter;
+    for i in 0..n_iter {
+        let mut commands = runner.commands();
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding, n_elements);
+        pass.end();
+        if i == 0 || config.verify_all {
+            commands.cmd_buf.memory_barrier();
+            commands.download(&out_buf);
+        }
+        total_elapsed += runner.submit(commands);
+        if i == 0 || config.verify_all {
+            let dst = out_buf.map_read(..);
+            if let Some(failure) = data.verify(dst.cast_slice()) {
+                result.fail(failure);
+            }
+        }
+    }
+
+    result.timing(total_elapsed, n_elements * n_iter);
+    result
+}
+
+impl IntersectionCode {
+    unsafe fn new(runner: &mut Runner) -> IntersectionCode {
+        let reduce_code = include_shader!(&runner.session, "../shader/gen/intersection_reduce");
+        let reduce_pipeline = runner
+            .session
+            .create_compute_pipeline(
+                reduce_code,
+                &[BindType::BufReadOnly, BindType::Buffer, BindType::Buffer],
+            )
+            .unwrap();
+        let leaf_code = include_shader!(&runner.session, "../shader/gen/intersection_leaf");
+        let leaf_pipeline = runner
+            .session
+            .create_compute_pipeline(
+                leaf_code,
+                &[
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::Buffer,
+                ],
+            )
+            .unwrap();
+        IntersectionCode {
+            reduce_pipeline,
+            leaf_pipeline,
+        }
+    }
+}
+
+impl IntersectionStage {
+    unsafe fn new(runner: &mut Runner, n_elements: u64) -> IntersectionStage {
+        assert!(n_elements <= ELEMENTS_PER_WG.pow(2));
+        let intersection_buf = runner
+            .session
+            .create_buffer(16 * n_elements, BufferUsage::STORAGE)
+            .unwrap();
+        let bic_size = ELEMENTS_PER_WG * 8;
+        let bic_buf = runner
+            .session
+            .create_buffer(bic_size, BufferUsage::STORAGE)
+            .unwrap();
+        IntersectionStage { bic_buf, intersection_buf }
+    }
+
+    unsafe fn bind(
+        &self,
+        runner: &mut Runner,
+        code: &IntersectionCode,
+        in_buf: &Buffer,
+        out_buf: &Buffer,
+    ) -> IntersectionBinding {
+        let reduce_ds = runner
+            .session
+            .create_simple_descriptor_set(
+                &code.reduce_pipeline,
+                &[in_buf, &self.bic_buf, &self.intersection_buf],
+            )
+            .unwrap();
+        let leaf_ds = runner
+            .session
+            .create_simple_descriptor_set(
+                &code.leaf_pipeline,
+                &[in_buf, &self.bic_buf, &self.intersection_buf, out_buf],
+            )
+            .unwrap();
+        IntersectionBinding { reduce_ds, leaf_ds }
+    }
+
+    unsafe fn record(
+        &self,
+        pass: &mut ComputePass,
+        code: &IntersectionCode,
+        binding: &IntersectionBinding,
+        size: u64,
+    ) {
+        let n_workgroups = (size + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
+        pass.dispatch(
+            &code.reduce_pipeline,
+            &binding.reduce_ds,
+            (n_workgroups as u32, 1, 1),
+            (WG_SIZE as u32, 1, 1),
+        );
+        pass.memory_barrier();
+        pass.dispatch(
+            &code.leaf_pipeline,
+            &binding.leaf_ds,
+            (n_workgroups as u32, 1, 1),
+            (WG_SIZE as u32, 1, 1),
+        );
+    }
+}
+
+fn rand_bbox() -> [f32; 4] {
+    let mut rng = rand::thread_rng();
+    const Y_MIN: f32 = -1000.0;
+    const Y_MAX: f32 = 1000.0;
+    let mut x0 = rng.gen_range(Y_MIN, Y_MAX);
+    let mut y0 = rng.gen_range(Y_MIN, Y_MAX);
+    let mut x1 = rng.gen_range(Y_MIN, Y_MAX);
+    let mut y1 = rng.gen_range(Y_MIN, Y_MAX);
+    if x0 > x1 {
+        std::mem::swap(&mut x0, &mut x1);
+    }
+    if y0 > y1 {
+        std::mem::swap(&mut y0, &mut y1);
+    }
+    [x0, y0, x1, y1]
+}
+
+fn bbox_intersection(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    [a[0].max(b[0]), a[1].max(b[1]), a[2].min(b[2]), a[3].min(b[3])]
+}
+
+const EMPTY_BBOX: [f32; 4] = [-1e9, -1e9, 1e9, 1e9];
+
+impl IntersectionData {
+    /// Generate a random scene
+    fn new(n: u64) -> IntersectionData {
+        let mut rng = rand::thread_rng();
+        let mut depth = 0;
+        let nodes = (0..n)
+            .map(|_| {
+                let node_type = if depth < 1 {
+                    OPEN_PAREN
+                } else {
+                    // Note: we'll run this test with paren nodes only, because
+                    // leaf nodes are a bit of an extra pain.
+                    rng.gen_range(0, 2)
+                };
+                if node_type == OPEN_PAREN {
+                    depth += 1;
+                } else if node_type == CLOSE_PAREN {
+                    depth -= 1;
+                }
+                let bbox = if node_type == OPEN_PAREN {
+                    rand_bbox()
+                } else {
+                    EMPTY_BBOX
+                };
+                Node { node_type, bbox, .. Default::default() }
+            })
+            .collect();
+        IntersectionData { nodes }
+    }
+
+    fn verify(&self, data: &[[f32; 4]]) -> Option<String> {
+        let mut stack = Vec::new();
+        let mut tos = EMPTY_BBOX;
+        for (i, (inp, outp)) in self.nodes.iter().zip(data).enumerate() {
+            //println!("{}: {:.1?} {:.1?}", i, inp, outp);
+            let expected = match inp.node_type {
+                OPEN_PAREN => {
+                    tos = bbox_intersection(tos, inp.bbox);
+                    stack.push(tos);
+                    tos
+                }
+                CLOSE_PAREN => {
+                    stack.pop().unwrap();
+                    tos = *stack.last().unwrap_or(&EMPTY_BBOX);
+                    EMPTY_BBOX
+                }
+                _ => unreachable!(),
+            };
+            if inp.node_type == OPEN_PAREN && expected != *outp {
+                println!("{}: {:.1?} {:.1?}", i, expected, outp);
+            }
+        }
+        None
+    }
+}
diff --git a/tests/src/union.rs b/tests/src/bbox_union.rs
similarity index 100%
rename from tests/src/union.rs
rename to tests/src/bbox_union.rs
diff --git a/tests/src/main.rs b/tests/src/main.rs
index e45a843..7f231f1 100644
--- a/tests/src/main.rs
+++ b/tests/src/main.rs
@@ -16,6 +16,8 @@
 
 //! Tests for piet-gpu shaders and GPU capabilities.
 
+mod bbox_intersection;
+mod bbox_union;
 mod clear;
 mod clip;
 mod config;
@@ -27,7 +29,6 @@
 mod runner;
 mod stack;
 mod test_result;
-mod union;
 
 #[cfg(feature = "piet-gpu")]
 mod path;
@@ -139,7 +140,8 @@
         }
         if config.groups.matches("stack") {
             report(&stack::run_stack_test(&mut runner, &config));
-            report(&union::run_union_test(&mut runner, &config));
+            report(&bbox_union::run_union_test(&mut runner, &config));
+            report(&bbox_intersection::run_intersection_test(&mut runner, &config));
         }
         #[cfg(feature = "piet-gpu")]
         if config.groups.matches("piet") {