| // Copyright 2021 The piet-gpu authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // Also licensed under MIT license, at your choice. |
| |
| use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet}; |
| use piet_gpu_hal::{Buffer, Pipeline}; |
| |
| use crate::config::Config; |
| use crate::runner::Runner; |
| use crate::test_result::TestResult; |
| |
/// Workgroup size passed as the x dimension of every dispatch in this test.
///
/// NOTE(review): presumably must match the local size declared in the
/// `stack_reduce` / `stack_leaf` shaders — confirm against the shader source.
const WG_SIZE: u64 = 64;

/// Multiplier applied to `WG_SIZE` to get the number of elements handled by
/// one workgroup (presumably the number of elements per thread — confirm).
const N_ROWS: u64 = 8;

/// Number of input elements covered by a single workgroup.
const ELEMENTS_PER_WG: u64 = N_ROWS * WG_SIZE;
| |
| struct StackCode { |
| reduce_pipeline: Pipeline, |
| leaf_pipeline: Pipeline, |
| } |
| |
| struct StackStage { |
| bic_buf: Buffer, |
| stack_buf: Buffer, |
| } |
| |
| struct StackBinding { |
| reduce_ds: DescriptorSet, |
| leaf_ds: DescriptorSet, |
| } |
| |
/// CPU-side copy of the test input.
struct StackData {
    /// Sequence of stack operations, one per element: 1 is push, 0 is pop.
    dyck: Vec<u32>,
}
| |
| pub unsafe fn run_stack_test(runner: &mut Runner, config: &Config) -> TestResult { |
| let mut result = TestResult::new("stack monoid"); |
| println!("# stack monoid (just parentheses matching)"); |
| let expmax = 2 * (WG_SIZE * N_ROWS).trailing_zeros(); |
| for exp in 10..=expmax { |
| let n_elements: u64 = 1 << exp; |
| let data = StackData::new(n_elements); |
| let data_buf = runner |
| .session |
| .create_buffer_init(&data.dyck, BufferUsage::STORAGE) |
| .unwrap(); |
| let out_buf = runner.buf_down(data_buf.size(), BufferUsage::empty()); |
| |
| let code = StackCode::new(runner); |
| let stage = StackStage::new(runner, n_elements); |
| let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf); |
| |
| let mut total_elapsed = 0.0; |
| let n_iter = config.n_iter; |
| for i in 0..n_iter { |
| let mut commands = runner.commands(); |
| let mut pass = commands.compute_pass(0, 1); |
| stage.record(&mut pass, &code, &binding, n_elements); |
| pass.end(); |
| if i == 0 || config.verify_all { |
| commands.cmd_buf.memory_barrier(); |
| commands.download(&out_buf); |
| } |
| total_elapsed += runner.submit(commands); |
| if i == 0 || config.verify_all { |
| let dst = out_buf.map_read(..); |
| if let Some(failure) = data.verify(dst.cast_slice()) { |
| result.fail(failure); |
| } |
| } |
| } |
| let throughput = (n_elements * n_iter) as f64 / total_elapsed; |
| println!("{} {}", n_elements, throughput); |
| } |
| println!("e"); |
| println!("# stack monoid, CPU"); |
| for exp in 10..=expmax { |
| let n_elements: u64 = 1 << exp; |
| let data = StackData::new(n_elements); |
| let start = std::time::Instant::now(); |
| let result = data.run(); |
| let elapsed = start.elapsed().as_secs_f64(); |
| let throughput = n_elements as f64 / elapsed; |
| println!("{} {}", n_elements, throughput); |
| data.verify(&result); |
| } |
| println!("e"); |
| |
| //result.timing(total_elapsed, n_elements * n_iter); |
| result |
| } |
| |
| impl StackCode { |
| unsafe fn new(runner: &mut Runner) -> StackCode { |
| let reduce_code = include_shader!(&runner.session, "../shader/gen/stack_reduce"); |
| let reduce_pipeline = runner |
| .session |
| .create_compute_pipeline( |
| reduce_code, |
| &[BindType::BufReadOnly, BindType::Buffer, BindType::Buffer], |
| ) |
| .unwrap(); |
| let leaf_code = include_shader!(&runner.session, "../shader/gen/stack_leaf"); |
| let leaf_pipeline = runner |
| .session |
| .create_compute_pipeline( |
| leaf_code, |
| &[ |
| BindType::BufReadOnly, |
| BindType::BufReadOnly, |
| BindType::BufReadOnly, |
| BindType::Buffer, |
| ], |
| ) |
| .unwrap(); |
| StackCode { |
| reduce_pipeline, |
| leaf_pipeline, |
| } |
| } |
| } |
| |
| impl StackStage { |
| unsafe fn new(runner: &mut Runner, n_elements: u64) -> StackStage { |
| assert!(n_elements <= ELEMENTS_PER_WG.pow(2)); |
| let stack_buf = runner |
| .session |
| .create_buffer(4 * n_elements, BufferUsage::STORAGE) |
| .unwrap(); |
| let bic_size = ELEMENTS_PER_WG * 8; |
| let bic_buf = runner |
| .session |
| .create_buffer(bic_size, BufferUsage::STORAGE) |
| .unwrap(); |
| StackStage { bic_buf, stack_buf } |
| } |
| |
| unsafe fn bind( |
| &self, |
| runner: &mut Runner, |
| code: &StackCode, |
| in_buf: &Buffer, |
| out_buf: &Buffer, |
| ) -> StackBinding { |
| let reduce_ds = runner |
| .session |
| .create_simple_descriptor_set( |
| &code.reduce_pipeline, |
| &[in_buf, &self.bic_buf, &self.stack_buf], |
| ) |
| .unwrap(); |
| let leaf_ds = runner |
| .session |
| .create_simple_descriptor_set( |
| &code.leaf_pipeline, |
| &[in_buf, &self.bic_buf, &self.stack_buf, out_buf], |
| ) |
| .unwrap(); |
| StackBinding { reduce_ds, leaf_ds } |
| } |
| |
| unsafe fn record( |
| &self, |
| pass: &mut ComputePass, |
| code: &StackCode, |
| binding: &StackBinding, |
| size: u64, |
| ) { |
| let n_workgroups = (size + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG; |
| pass.dispatch( |
| &code.reduce_pipeline, |
| &binding.reduce_ds, |
| (n_workgroups as u32, 1, 1), |
| (WG_SIZE as u32, 1, 1), |
| ); |
| pass.memory_barrier(); |
| pass.dispatch( |
| &code.leaf_pipeline, |
| &binding.leaf_ds, |
| (n_workgroups as u32, 1, 1), |
| (WG_SIZE as u32, 1, 1), |
| ); |
| } |
| } |
| |
| impl StackData { |
| /// Generate a random Dyck sequence. |
| /// |
| /// Here the encoding is: 1 is push, 0 is pop. |
| fn new(n: u64) -> StackData { |
| // Simple LCG random generator, so we don't need to import rand |
| let mut z = 20170705u64; |
| let mut depth = 0; |
| let dyck = (0..n) |
| .map(|_| { |
| let is_push = if depth < 2 { |
| 1 |
| } else { |
| z = z.wrapping_mul(742938285) % ((1 << 31) - 1); |
| (z % 2) as u32 |
| }; |
| if is_push == 1 { |
| depth += 1; |
| } else { |
| depth -= 1; |
| } |
| is_push |
| }) |
| .collect(); |
| StackData { dyck } |
| } |
| |
| // Run on CPU side, for performance comparison |
| fn run(&self) -> Vec<u32> { |
| let mut stack = Vec::new(); |
| self.dyck |
| .iter() |
| .enumerate() |
| .map(|(i, inp)| { |
| let expected = *stack.last().unwrap_or(&!0); |
| if *inp == 0 { |
| stack.pop(); |
| } else { |
| stack.push(i as u32); |
| } |
| expected |
| }) |
| .collect() |
| } |
| |
| fn verify(&self, data: &[u32]) -> Option<String> { |
| let mut stack = Vec::new(); |
| for (i, (inp, outp)) in self.dyck.iter().zip(data).enumerate() { |
| if let Some(tos) = stack.last() { |
| if tos != outp { |
| return Some(format!("mismatch at {}: {} != {}", i, tos, outp)); |
| } |
| } |
| if *inp == 0 { |
| stack.pop(); |
| } else { |
| stack.push(i as u32); |
| } |
| } |
| None |
| } |
| } |