Parallel permuted version of the CoRR4 litmus test. Trying it to see if I can catch failures.
diff --git a/tests/shader/build.ninja b/tests/shader/build.ninja index 49e0260..0048429 100644 --- a/tests/shader/build.ninja +++ b/tests/shader/build.ninja
@@ -66,6 +66,12 @@ build gen/message_passing_vkmm.spv: glsl message_passing.comp flags = -DVKMM +build gen/corr4.spv: glsl corr4.comp + flags = -DVKMM +build gen/corr4.hlsl: hlsl gen/corr4.spv +build gen/corr4.dxil: dxil gen/corr4.hlsl +build gen/corr4.msl: msl gen/corr4.spv + build gen/linkedlist.spv: glsl linkedlist.comp build gen/linkedlist.hlsl: hlsl gen/linkedlist.spv build gen/linkedlist.dxil: dxil gen/linkedlist.hlsl
diff --git a/tests/shader/corr4.comp b/tests/shader/corr4.comp new file mode 100644 index 0000000..7d28373 --- /dev/null +++ b/tests/shader/corr4.comp
@@ -0,0 +1,68 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Our version of the CoRR4 atomic litmus test. + +#version 450 + +#extension GL_KHR_memory_scope_semantics : enable + +#ifdef VKMM +#pragma use_vulkan_memory_model +#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire +#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease +#else +#define ACQUIRE 0, 0 +#define RELEASE 0, 0 +#endif + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 0) buffer DataBuf +{ + uint data[]; +} data_buf; + +struct Result { + uint r0; + uint r1; + uint r2; + uint r3; +}; + +layout(binding = 1) buffer OutBuf +{ + Result r[]; +} out_buf; + +void main() +{ + uint ix = gl_GlobalInvocationID.x; + + // This code will do all four roles. For any given "x" we want 4 (different) + // threads to map to it. We'll use prime permutations. + + uint role_0_ix = (ix * 661u) & 65535u; + uint role_1_ix = (ix * 1087u) & 65535u; + uint role_2_ix = (ix * 2749u) & 65535u; + uint role_3_ix = (ix * 3433u) & 65535u; + + // Role 0: atomicStore(x, 1) + atomicStore(data_buf.data[role_0_ix], 1u, gl_ScopeDevice, 0, 0); + + // Role 1: two atomic loads + uint r0 = atomicLoad(data_buf.data[role_1_ix], gl_ScopeDevice, 0, 0); + uint r1 = atomicLoad(data_buf.data[role_1_ix], gl_ScopeDevice, 0, 0); + + // Role 2: atomicStore(x, 2) + atomicStore(data_buf.data[role_2_ix], 2u, gl_ScopeDevice, 0, 0); + + // Role 3: two atomic loads + uint r2 = atomicLoad(data_buf.data[role_3_ix], gl_ScopeDevice, 0, 0); + uint r3 = atomicLoad(data_buf.data[role_3_ix], gl_ScopeDevice, 0, 0); + + // Store results in output buffer + out_buf.r[role_1_ix].r0 = r0; + out_buf.r[role_1_ix].r1 = r1; + out_buf.r[role_3_ix].r2 = r2; + out_buf.r[role_3_ix].r3 = r3; +}
diff --git a/tests/shader/gen/corr4.dxil b/tests/shader/gen/corr4.dxil new file mode 100644 index 0000000..6beffd9 --- /dev/null +++ b/tests/shader/gen/corr4.dxil Binary files differ
diff --git a/tests/shader/gen/corr4.hlsl b/tests/shader/gen/corr4.hlsl new file mode 100644 index 0000000..9b51b81 --- /dev/null +++ b/tests/shader/gen/corr4.hlsl
@@ -0,0 +1,54 @@ +struct Result +{ + uint r0; + uint r1; + uint r2; + uint r3; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer data_buf : register(u0); +RWByteAddressBuffer out_buf : register(u1); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x; + uint role_0_ix = (ix * 661u) & 65535u; + uint role_1_ix = (ix * 1087u) & 65535u; + uint role_2_ix = (ix * 2749u) & 65535u; + uint role_3_ix = (ix * 3433u) & 65535u; + uint _89; + data_buf.InterlockedExchange(role_0_ix * 4 + 0, 1u, _89); + uint _52; + data_buf.InterlockedAdd(role_1_ix * 4 + 0, 0, _52); + uint r0 = _52; + uint _56; + data_buf.InterlockedAdd(role_1_ix * 4 + 0, 0, _56); + uint r1 = _56; + uint _90; + data_buf.InterlockedExchange(role_2_ix * 4 + 0, 2u, _90); + uint _63; + data_buf.InterlockedAdd(role_3_ix * 4 + 0, 0, _63); + uint r2 = _63; + uint _67; + data_buf.InterlockedAdd(role_3_ix * 4 + 0, 0, _67); + uint r3 = _67; + out_buf.Store(role_1_ix * 16 + 0, r0); + out_buf.Store(role_1_ix * 16 + 4, r1); + out_buf.Store(role_3_ix * 16 + 8, r2); + out_buf.Store(role_3_ix * 16 + 12, r3); +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +}
diff --git a/tests/shader/gen/corr4.msl b/tests/shader/gen/corr4.msl new file mode 100644 index 0000000..7556b80 --- /dev/null +++ b/tests/shader/gen/corr4.msl
@@ -0,0 +1,51 @@ +#pragma clang diagnostic ignored "-Wunused-variable" + +#include <metal_stdlib> +#include <simd/simd.h> +#include <metal_atomic> + +using namespace metal; + +struct DataBuf +{ + uint data[1]; +}; + +struct Result +{ + uint r0; + uint r1; + uint r2; + uint r3; +}; + +struct OutBuf +{ + Result r[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +kernel void main0(device DataBuf& data_buf [[buffer(0)]], device OutBuf& out_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint ix = gl_GlobalInvocationID.x; + uint role_0_ix = (ix * 661u) & 65535u; + uint role_1_ix = (ix * 1087u) & 65535u; + uint role_2_ix = (ix * 2749u) & 65535u; + uint role_3_ix = (ix * 3433u) & 65535u; + atomic_store_explicit((device atomic_uint*)&data_buf.data[role_0_ix], 1u, memory_order_relaxed); + uint _52 = atomic_load_explicit((device atomic_uint*)&data_buf.data[role_1_ix], memory_order_relaxed); + uint r0 = _52; + uint _56 = atomic_load_explicit((device atomic_uint*)&data_buf.data[role_1_ix], memory_order_relaxed); + uint r1 = _56; + atomic_store_explicit((device atomic_uint*)&data_buf.data[role_2_ix], 2u, memory_order_relaxed); + uint _63 = atomic_load_explicit((device atomic_uint*)&data_buf.data[role_3_ix], memory_order_relaxed); + uint r2 = _63; + uint _67 = atomic_load_explicit((device atomic_uint*)&data_buf.data[role_3_ix], memory_order_relaxed); + uint r3 = _67; + out_buf.r[role_1_ix].r0 = r0; + out_buf.r[role_1_ix].r1 = r1; + out_buf.r[role_3_ix].r2 = r2; + out_buf.r[role_3_ix].r3 = r3; +} +
diff --git a/tests/shader/gen/corr4.spv b/tests/shader/gen/corr4.spv new file mode 100644 index 0000000..bfced5e --- /dev/null +++ b/tests/shader/gen/corr4.spv Binary files differ
diff --git a/tests/src/corr4.rs b/tests/src/corr4.rs new file mode 100644 index 0000000..3feaa9f --- /dev/null +++ b/tests/src/corr4.rs
@@ -0,0 +1,182 @@ +// Copyright 2021 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet}; +use piet_gpu_hal::{Buffer, Pipeline}; + +use crate::clear::{ClearBinding, ClearCode, ClearStage}; +use crate::config::Config; +use crate::runner::{Commands, Runner}; +use crate::test_result::TestResult; + +const N_ELEMENTS: u64 = 65536; + +/// The shader code for corr4 example. +struct Corr4Code { + pipeline: Pipeline, + clear_code: Option<ClearCode>, +} + +/// The stage resources for corr4 example. +struct Corr4Stage { + data_buf: Buffer, + clear_stages: Option<(ClearStage, ClearBinding, ClearStage)>, +} + +/// The binding for corr4 example. 
+struct Corr4Binding { + descriptor_set: DescriptorSet, + clear_binding: Option<ClearBinding>, +} + +pub unsafe fn run_corr4_test( + runner: &mut Runner, + config: &Config, +) -> TestResult { + let mut result = TestResult::new(format!("CoRR4 litmus")); + let out_buf = runner.buf_down(16 * N_ELEMENTS); + let code = Corr4Code::new(runner); + let stage = Corr4Stage::new(runner, &code); + let binding = stage.bind(runner, &code, &out_buf.dev_buf); + let n_iter = config.n_iter; + let mut total_elapsed = 0.0; + let mut failures = 0; + for _ in 0..n_iter { + let mut commands = runner.commands(); + commands.write_timestamp(0); + stage.record(&mut commands, &code, &binding, &out_buf.dev_buf); + commands.write_timestamp(1); + commands.cmd_buf.memory_barrier(); + commands.download(&out_buf); + total_elapsed += runner.submit(commands); + let mut dst: Vec<u32> = Default::default(); + out_buf.read(&mut dst); + failures += analyze(&dst); + } + if failures > 0 { + result.fail(format!("{} failures", failures)); + } + result.timing(total_elapsed, N_ELEMENTS * n_iter); + result +} + +impl Corr4Code { + unsafe fn new(runner: &mut Runner) -> Corr4Code { + let code = include_shader!(&runner.session, "../shader/gen/corr4"); + let pipeline = runner + .session + .create_compute_pipeline(code, &[BindType::Buffer, BindType::Buffer]) + .unwrap(); + // Currently, DX12 and Metal backends don't support buffer clearing, so use a + // compute shader as a workaround. 
+ let clear_code = if runner.backend_type() != BackendType::Vulkan { + Some(ClearCode::new(runner)) + } else { + None + }; + Corr4Code { + pipeline, + clear_code, + } + } +} + +impl Corr4Stage { + unsafe fn new(runner: &mut Runner, code: &Corr4Code) -> Corr4Stage { + let data_buf_size = 4 * N_ELEMENTS; + let data_buf = runner + .session + .create_buffer(data_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST) + .unwrap(); + let clear_stages = if let Some(clear_code) = &code.clear_code { + let stage0 = ClearStage::new(runner, N_ELEMENTS * 2); + let binding0 = stage0.bind(runner, clear_code, &data_buf); + let stage1 = ClearStage::new(runner, 1); + Some((stage0, binding0, stage1)) + } else { + None + }; + Corr4Stage { + data_buf, + clear_stages, + } + } + + unsafe fn bind( + &self, + runner: &mut Runner, + code: &Corr4Code, + out_buf: &Buffer, + ) -> Corr4Binding { + let descriptor_set = runner + .session + .create_simple_descriptor_set(&code.pipeline, &[&self.data_buf, out_buf]) + .unwrap(); + let clear_binding = if let Some(clear_code) = &code.clear_code { + Some( + self.clear_stages + .as_ref() + .unwrap() + .2 + .bind(runner, clear_code, out_buf), + ) + } else { + None + }; + Corr4Binding { + descriptor_set, + clear_binding, + } + } + + unsafe fn record( + &self, + commands: &mut Commands, + code: &Corr4Code, + bindings: &Corr4Binding, + out_buf: &Buffer, + ) { + if let Some((stage0, binding0, stage1)) = &self.clear_stages { + let code = code.clear_code.as_ref().unwrap(); + stage0.record(commands, code, binding0); + stage1.record(commands, code, bindings.clear_binding.as_ref().unwrap()); + } else { + commands.cmd_buf.clear_buffer(&self.data_buf, None); + commands.cmd_buf.clear_buffer(out_buf, None); + } + commands.cmd_buf.memory_barrier(); + commands.cmd_buf.dispatch( + &code.pipeline, + &bindings.descriptor_set, + (256, 1, 1), + (256, 1, 1), + ); + } +} + +fn analyze(data: &[u32]) -> u64 { + let mut failures = 0; + for i in 0..N_ELEMENTS as usize { + let r0 = 
data[i * 4 + 0]; + let r1 = data[i * 4 + 1]; + let r2 = data[i * 4 + 2]; + let r3 = data[i * 4 + 3]; + if (r0 == 1 && r1 == 2 && r2 == 2 && r3 == 1) || (r0 == 2 && r1 == 1 && r2 == 1 && r3 == 2) || (r0 != 0 && r1 == 0) || (r2 != 0 && r3 == 0) { + failures += 1; + } + } + failures +}
diff --git a/tests/src/main.rs b/tests/src/main.rs index dd6f4bd..8301e91 100644 --- a/tests/src/main.rs +++ b/tests/src/main.rs
@@ -18,6 +18,7 @@ mod clear; mod config; +mod corr4; mod linkedlist; mod message_passing; mod prefix; @@ -121,6 +122,7 @@ message_passing::Variant::Vkmm, )); } + report(&corr4::run_corr4_test(&mut runner, &config)); report(&linkedlist::run_linkedlist_test(&mut runner, &config)); } }