Measure throughput over problems sizes

Measure both GPU and CPU throughput over a range of problem sizes, formatting the output for gnuplot.
diff --git a/tests/src/bbox_intersection.rs b/tests/src/bbox_intersection.rs
index 66851c7..fac9704 100644
--- a/tests/src/bbox_intersection.rs
+++ b/tests/src/bbox_intersection.rs
@@ -65,7 +65,8 @@
 pub unsafe fn run_intersection_test(runner: &mut Runner, config: &Config) -> TestResult {
     let mut result = TestResult::new("Bounding box Intersection");
     println!("# bounding box intersection");
-    for exp in 10..=16 {
+    let expmax = 2 * WG_SIZE.trailing_zeros();
+    for exp in 10..=expmax {
         let n_elements: u64 = 1 << exp;
         let data = IntersectionData::new(n_elements);
         let data_buf = runner
@@ -101,6 +102,18 @@
         println!("{} {}", n_elements, throughput);
     }
     println!("e");
+    println!("# bounding box intersection, CPU");
+    for exp in 10..=expmax {
+        let n_elements: u64 = 1 << exp;
+        let data = IntersectionData::new(n_elements);
+        let start = std::time::Instant::now();
+        let result = data.run();
+        let elapsed = start.elapsed().as_secs_f64();
+        let throughput = n_elements as f64 / elapsed;
+        println!("{} {}", n_elements, throughput);
+        data.verify(&result);
+    }
+    println!("e");
 
     //result.timing(total_elapsed, n_elements * n_iter);
     result
@@ -148,7 +161,10 @@
             .session
             .create_buffer(bic_size, BufferUsage::STORAGE)
             .unwrap();
-        IntersectionStage { bic_buf, intersection_buf }
+        IntersectionStage {
+            bic_buf,
+            intersection_buf,
+        }
     }
 
     unsafe fn bind(
@@ -217,7 +233,12 @@
 }
 
 fn bbox_intersection(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
-    [a[0].max(b[0]), a[1].max(b[1]), a[2].min(b[2]), a[3].min(b[3])]
+    [
+        a[0].max(b[0]),
+        a[1].max(b[1]),
+        a[2].min(b[2]),
+        a[3].min(b[3]),
+    ]
 }
 
 const EMPTY_BBOX: [f32; 4] = [-1e9, -1e9, 1e9, 1e9];
@@ -246,12 +267,37 @@
                 } else {
                     EMPTY_BBOX
                 };
-                Node { node_type, bbox, .. Default::default() }
+                Node {
+                    node_type,
+                    bbox,
+                    ..Default::default()
+                }
             })
             .collect();
         IntersectionData { nodes }
     }
 
+    fn run(&self) -> Vec<[f32; 4]> {
+        let mut stack = Vec::new();
+        let mut tos = EMPTY_BBOX;
+        self.nodes
+            .iter()
+            .map(|inp| match inp.node_type {
+                OPEN_PAREN => {
+                    tos = bbox_intersection(tos, inp.bbox);
+                    stack.push(tos);
+                    tos
+                }
+                CLOSE_PAREN => {
+                    stack.pop().unwrap();
+                    tos = *stack.last().unwrap_or(&EMPTY_BBOX);
+                    EMPTY_BBOX
+                }
+                _ => unreachable!(),
+            })
+            .collect()
+    }
+
     fn verify(&self, data: &[[f32; 4]]) -> Option<String> {
         let mut stack = Vec::new();
         let mut tos = EMPTY_BBOX;
diff --git a/tests/src/bbox_union.rs b/tests/src/bbox_union.rs
index 9224cb1..ca2224c 100644
--- a/tests/src/bbox_union.rs
+++ b/tests/src/bbox_union.rs
@@ -66,7 +66,8 @@
 pub unsafe fn run_union_test(runner: &mut Runner, config: &Config) -> TestResult {
     let mut result = TestResult::new("Bounding box union");
     println!("# bounding box union");
-    for exp in 10..=16 {
+    let expmax = 2 * WG_SIZE.trailing_zeros();
+    for exp in 10..=expmax {
         let n_elements: u64 = 1 << exp;
         let data = UnionData::new(n_elements);
         let data_buf = runner
@@ -102,6 +103,18 @@
         println!("{} {}", n_elements, throughput);
     }
     println!("e");
+    println!("# bounding box union, CPU");
+    for exp in 10..=expmax {
+        let n_elements: u64 = 1 << exp;
+        let data = UnionData::new(n_elements);
+        let start = std::time::Instant::now();
+        let result = data.run();
+        let elapsed = start.elapsed().as_secs_f64();
+        let throughput = n_elements as f64 / elapsed;
+        println!("{} {}", n_elements, throughput);
+        data.verify(&result);
+    }
+    println!("e");
 
     //result.timing(total_elapsed, n_elements * n_iter);
     result
@@ -218,7 +231,12 @@
 }
 
 fn bbox_union(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
-    [a[0].min(b[0]), a[1].min(b[1]), a[2].max(b[2]), a[3].max(b[3])]
+    [
+        a[0].min(b[0]),
+        a[1].min(b[1]),
+        a[2].max(b[2]),
+        a[3].max(b[3]),
+    ]
 }
 
 const EMPTY_BBOX: [f32; 4] = [1e9, 1e9, -1e9, -1e9];
@@ -246,12 +264,43 @@
                 } else {
                     EMPTY_BBOX
                 };
-                Node { node_type, bbox, .. Default::default() }
+                Node {
+                    node_type,
+                    bbox,
+                    ..Default::default()
+                }
             })
             .collect();
         UnionData { nodes }
     }
 
+    // Run on CPU side, for performance comparison
+    fn run(&self) -> Vec<[f32; 4]> {
+        let mut stack = Vec::new();
+        self.nodes
+            .iter()
+            .map(|inp| {
+                let mut expected = inp.bbox;
+                match inp.node_type {
+                    OPEN_PAREN => stack.push(EMPTY_BBOX),
+                    CLOSE_PAREN => {
+                        let tos = stack.pop().unwrap();
+                        expected = tos;
+                        if let Some(nos) = stack.last_mut() {
+                            *nos = bbox_union(*nos, tos);
+                        }
+                    }
+                    LEAF => {
+                        let tos = stack.last_mut().unwrap();
+                        *tos = bbox_union(*tos, inp.bbox);
+                    }
+                    _ => unreachable!(),
+                }
+                expected
+            })
+            .collect()
+    }
+
     fn verify(&self, data: &[[f32; 4]]) -> Option<String> {
         let mut stack = Vec::new();
         for (i, (inp, outp)) in self.nodes.iter().zip(data).enumerate() {
diff --git a/tests/src/main.rs b/tests/src/main.rs
index 7f231f1..754848d 100644
--- a/tests/src/main.rs
+++ b/tests/src/main.rs
@@ -141,7 +141,10 @@
         if config.groups.matches("stack") {
             report(&stack::run_stack_test(&mut runner, &config));
             report(&bbox_union::run_union_test(&mut runner, &config));
-            report(&bbox_intersection::run_intersection_test(&mut runner, &config));
+            report(&bbox_intersection::run_intersection_test(
+                &mut runner,
+                &config,
+            ));
         }
         #[cfg(feature = "piet-gpu")]
         if config.groups.matches("piet") {
diff --git a/tests/src/stack.rs b/tests/src/stack.rs
index c86e3df..aba875b 100644
--- a/tests/src/stack.rs
+++ b/tests/src/stack.rs
@@ -47,7 +47,8 @@
 pub unsafe fn run_stack_test(runner: &mut Runner, config: &Config) -> TestResult {
     let mut result = TestResult::new("stack monoid");
     println!("# stack monoid (just parentheses matching)");
-    for exp in 10..=18 {
+    let expmax = 2 * (WG_SIZE * N_ROWS).trailing_zeros();
+    for exp in 10..=expmax {
         let n_elements: u64 = 1 << exp;
         let data = StackData::new(n_elements);
         let data_buf = runner
@@ -83,6 +84,18 @@
         println!("{} {}", n_elements, throughput);
     }
     println!("e");
+    println!("# stack monoid, CPU");
+    for exp in 10..=expmax {
+        let n_elements: u64 = 1 << exp;
+        let data = StackData::new(n_elements);
+        let start = std::time::Instant::now();
+        let result = data.run();
+        let elapsed = start.elapsed().as_secs_f64();
+        let throughput = n_elements as f64 / elapsed;
+        println!("{} {}", n_elements, throughput);
+        data.verify(&result);
+    }
+    println!("e");
 
     //result.timing(total_elapsed, n_elements * n_iter);
     result
@@ -208,6 +221,24 @@
         StackData { dyck }
     }
 
+    // Run on CPU side, for performance comparison
+    fn run(&self) -> Vec<u32> {
+        let mut stack = Vec::new();
+        self.dyck
+            .iter()
+            .enumerate()
+            .map(|(i, inp)| {
+                let expected = *stack.last().unwrap_or(&!0);
+                if *inp == 0 {
+                    stack.pop();
+                } else {
+                    stack.push(i as u32);
+                }
+                expected
+            })
+            .collect()
+    }
+
     fn verify(&self, data: &[u32]) -> Option<String> {
         let mut stack = Vec::new();
         for (i, (inp, outp)) in self.dyck.iter().zip(data).enumerate() {