// blob: aba875bba1c6c62c73e6afd811f93e1de4b848cf [file] [log] [blame]
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config;
use crate::runner::Runner;
use crate::test_result::TestResult;
/// Workgroup size passed to both compute dispatches.
const WG_SIZE: u64 = 64;
/// Multiplier that, together with `WG_SIZE`, gives the elements per workgroup.
/// NOTE(review): presumably the number of elements each thread handles —
/// confirm against the shaders.
const N_ROWS: u64 = 8;
/// Number of input elements covered by one workgroup. Used to compute the
/// dispatch size; its square is the largest input size the test accepts.
const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
/// The two compute pipelines used by the stack monoid test.
struct StackCode {
/// Pipeline built from the `stack_reduce` shader.
reduce_pipeline: Pipeline,
/// Pipeline built from the `stack_leaf` shader.
leaf_pipeline: Pipeline,
}
/// Intermediate GPU buffers bound to both the reduce and leaf dispatches.
struct StackStage {
/// Fixed-size buffer of `ELEMENTS_PER_WG * 8` bytes.
/// NOTE(review): presumably one 8-byte summary per workgroup — confirm
/// against the shaders.
bic_buf: Buffer,
/// Buffer holding 4 bytes per input element.
stack_buf: Buffer,
}
/// Descriptor sets tying the input, intermediate and output buffers to the
/// two pipelines.
struct StackBinding {
/// Descriptor set for the reduce pipeline (input, bic and stack buffers).
reduce_ds: DescriptorSet,
/// Descriptor set for the leaf pipeline (input, bic, stack and output
/// buffers).
leaf_ds: DescriptorSet,
}
/// Input data for the stack monoid test.
struct StackData {
/// Sequence of stack operations, one per element: 1 is push, 0 is pop.
dyck: Vec<u32>,
}
pub unsafe fn run_stack_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut result = TestResult::new("stack monoid");
println!("# stack monoid (just parentheses matching)");
let expmax = 2 * (WG_SIZE * N_ROWS).trailing_zeros();
for exp in 10..=expmax {
let n_elements: u64 = 1 << exp;
let data = StackData::new(n_elements);
let data_buf = runner
.session
.create_buffer_init(&data.dyck, BufferUsage::STORAGE)
.unwrap();
let out_buf = runner.buf_down(data_buf.size(), BufferUsage::empty());
let code = StackCode::new(runner);
let stage = StackStage::new(runner, n_elements);
let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
let mut total_elapsed = 0.0;
let n_iter = config.n_iter;
for i in 0..n_iter {
let mut commands = runner.commands();
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding, n_elements);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
}
total_elapsed += runner.submit(commands);
if i == 0 || config.verify_all {
let dst = out_buf.map_read(..);
if let Some(failure) = data.verify(dst.cast_slice()) {
result.fail(failure);
}
}
}
let throughput = (n_elements * n_iter) as f64 / total_elapsed;
println!("{} {}", n_elements, throughput);
}
println!("e");
println!("# stack monoid, CPU");
for exp in 10..=expmax {
let n_elements: u64 = 1 << exp;
let data = StackData::new(n_elements);
let start = std::time::Instant::now();
let result = data.run();
let elapsed = start.elapsed().as_secs_f64();
let throughput = n_elements as f64 / elapsed;
println!("{} {}", n_elements, throughput);
data.verify(&result);
}
println!("e");
//result.timing(total_elapsed, n_elements * n_iter);
result
}
impl StackCode {
    /// Load the `stack_reduce` and `stack_leaf` shaders and build a compute
    /// pipeline for each.
    ///
    /// The reduce pipeline declares one `BufReadOnly` binding followed by two
    /// `Buffer` bindings; the leaf pipeline declares three `BufReadOnly`
    /// bindings followed by one `Buffer` binding.
    ///
    /// Panics if either pipeline cannot be created.
    unsafe fn new(runner: &mut Runner) -> StackCode {
        // Binding layouts, in the order the descriptor sets supply buffers.
        let reduce_bindings = [BindType::BufReadOnly, BindType::Buffer, BindType::Buffer];
        let leaf_bindings = [
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
        ];

        let reduce_shader = include_shader!(&runner.session, "../shader/gen/stack_reduce");
        let reduce = runner
            .session
            .create_compute_pipeline(reduce_shader, &reduce_bindings)
            .unwrap();

        let leaf_shader = include_shader!(&runner.session, "../shader/gen/stack_leaf");
        let leaf = runner
            .session
            .create_compute_pipeline(leaf_shader, &leaf_bindings)
            .unwrap();

        StackCode {
            reduce_pipeline: reduce,
            leaf_pipeline: leaf,
        }
    }
}
impl StackStage {
    /// Allocate the intermediate buffers for an input of `n_elements` items.
    ///
    /// The stack buffer gets 4 bytes per element; the bic buffer has a fixed
    /// size of `ELEMENTS_PER_WG * 8` bytes.
    ///
    /// Panics if `n_elements` exceeds `ELEMENTS_PER_WG`², or if a buffer
    /// cannot be created.
    unsafe fn new(runner: &mut Runner, n_elements: u64) -> StackStage {
        assert!(n_elements <= ELEMENTS_PER_WG.pow(2));
        let stack_bytes = 4 * n_elements;
        let stack_buf = runner
            .session
            .create_buffer(stack_bytes, BufferUsage::STORAGE)
            .unwrap();
        let bic_bytes = ELEMENTS_PER_WG * 8;
        let bic_buf = runner
            .session
            .create_buffer(bic_bytes, BufferUsage::STORAGE)
            .unwrap();
        StackStage { bic_buf, stack_buf }
    }

    /// Create the descriptor sets for both pipelines.
    ///
    /// The reduce set binds `in_buf`, the bic buffer and the stack buffer;
    /// the leaf set binds the same three buffers followed by `out_buf`.
    ///
    /// Panics if a descriptor set cannot be created.
    unsafe fn bind(
        &self,
        runner: &mut Runner,
        code: &StackCode,
        in_buf: &Buffer,
        out_buf: &Buffer,
    ) -> StackBinding {
        // Field initializers run top to bottom, so the reduce set is still
        // created before the leaf set.
        StackBinding {
            reduce_ds: runner
                .session
                .create_simple_descriptor_set(
                    &code.reduce_pipeline,
                    &[in_buf, &self.bic_buf, &self.stack_buf],
                )
                .unwrap(),
            leaf_ds: runner
                .session
                .create_simple_descriptor_set(
                    &code.leaf_pipeline,
                    &[in_buf, &self.bic_buf, &self.stack_buf, out_buf],
                )
                .unwrap(),
        }
    }

    /// Record the reduce dispatch, a memory barrier, and the leaf dispatch
    /// into `pass` for an input of `size` elements.
    ///
    /// Both dispatches use one workgroup per `ELEMENTS_PER_WG` elements
    /// (rounded up) and a workgroup size of `WG_SIZE`.
    unsafe fn record(
        &self,
        pass: &mut ComputePass,
        code: &StackCode,
        binding: &StackBinding,
        size: u64,
    ) {
        // Round up so a partially filled final workgroup is still launched.
        let n_workgroups = (size + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
        let grid: (u32, u32, u32) = (n_workgroups as u32, 1, 1);
        let wg_size: (u32, u32, u32) = (WG_SIZE as u32, 1, 1);

        pass.dispatch(&code.reduce_pipeline, &binding.reduce_ds, grid, wg_size);
        // The leaf stage is bound to buffers the reduce stage is bound to.
        pass.memory_barrier();
        pass.dispatch(&code.leaf_pipeline, &binding.leaf_ds, grid, wg_size);
    }
}
impl StackData {
    /// Generate a pseudo-random sequence of `n` stack operations.
    ///
    /// Here the encoding is: 1 is push, 0 is pop.
    ///
    /// A push is forced whenever the current depth is below 2, so a pop never
    /// empties the stack. The generator is seeded with a fixed value, so the
    /// same `n` always yields the same sequence.
    fn new(n: u64) -> StackData {
        // Simple LCG random generator, so we don't need to import rand
        let mut z = 20170705u64;
        let mut depth = 0;
        let dyck = (0..n)
            .map(|_| {
                let is_push = if depth < 2 {
                    1
                } else {
                    z = z.wrapping_mul(742938285) % ((1 << 31) - 1);
                    (z % 2) as u32
                };
                if is_push == 1 {
                    depth += 1;
                } else {
                    depth -= 1;
                }
                is_push
            })
            .collect();
        StackData { dyck }
    }

    /// Run on CPU side, for performance comparison.
    ///
    /// Returns one value per input element: the index of the push that is on
    /// top of the stack *before* that element's operation is applied, or `!0`
    /// (`u32::MAX`) when the stack is empty at that point.
    fn run(&self) -> Vec<u32> {
        let mut stack: Vec<u32> = Vec::new();
        self.dyck
            .iter()
            .enumerate()
            .map(|(i, &op)| {
                let top = stack.last().copied().unwrap_or(!0);
                if op == 0 {
                    stack.pop();
                } else {
                    // Indices are stored as u32, matching the element type of
                    // the output.
                    stack.push(i as u32);
                }
                top
            })
            .collect()
    }

    /// Check `data` against the expected top-of-stack value at every position.
    ///
    /// Returns `None` when `data` is consistent with the input sequence, or
    /// `Some(description)` for the first problem found. `data` must contain at
    /// least one value per input element; a shorter slice is reported as a
    /// failure rather than being silently accepted. Extra trailing values are
    /// ignored. Positions at which the stack is empty are not checked.
    fn verify(&self, data: &[u32]) -> Option<String> {
        // `zip` below stops at the shorter side, so a truncated output would
        // otherwise pass without ever being compared.
        if data.len() < self.dyck.len() {
            return Some(format!(
                "output too short: {} elements, expected {}",
                data.len(),
                self.dyck.len()
            ));
        }
        let mut stack: Vec<u32> = Vec::new();
        for (i, (inp, outp)) in self.dyck.iter().zip(data).enumerate() {
            if let Some(tos) = stack.last() {
                if tos != outp {
                    return Some(format!("mismatch at {}: {} != {}", i, tos, outp));
                }
            }
            if *inp == 0 {
                stack.pop();
            } else {
                stack.push(i as u32);
            }
        }
        None
    }
}