Parallel permuted version of CoRR4 litmus

Trying it to see if I can catch coherence failures.
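
For context, CoRR-style litmus tests probe coherence of reads of a single
location: once a thread has read one store to x, a later read by the same
thread must not observe an older value, and two reader threads must not
observe the two stores in contradictory orders. The analyze() function in
corr4.rs below encodes exactly these forbidden outcomes; the following is a
minimal standalone sketch of the same check (plain Rust; the function name is
illustrative and not part of the harness):

    // Illustrative only: the outcomes that count as CoRR4 failures, assuming
    // the location starts at 0 and the two writer roles store 1 and 2.
    // (r0, r1) are reader A's two loads, (r2, r3) are reader B's.
    fn is_coherence_violation(r0: u32, r1: u32, r2: u32, r3: u32) -> bool {
        // The two readers disagree on the order of the stores of 1 and 2.
        let contradictory_orders = (r0 == 1 && r1 == 2 && r2 == 2 && r3 == 1)
            || (r0 == 2 && r1 == 1 && r2 == 1 && r3 == 2);
        // A reader saw a store, then its second load went back to the
        // initial value 0.
        let backwards_read = (r0 != 0 && r1 == 0) || (r2 != 0 && r3 == 0);
        contradictory_orders || backwards_read
    }
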
diff --git a/tests/shader/build.ninja b/tests/shader/build.ninja
index 49e0260..0048429 100644
--- a/tests/shader/build.ninja
+++ b/tests/shader/build.ninja
@@ -66,6 +66,12 @@
 build gen/message_passing_vkmm.spv: glsl message_passing.comp
   flags = -DVKMM
 
+build gen/corr4.spv: glsl corr4.comp
+  flags = -DVKMM
+build gen/corr4.hlsl: hlsl gen/corr4.spv
+build gen/corr4.dxil: dxil gen/corr4.hlsl
+build gen/corr4.msl: msl gen/corr4.spv
+
 build gen/linkedlist.spv: glsl linkedlist.comp
 build gen/linkedlist.hlsl: hlsl gen/linkedlist.spv
 build gen/linkedlist.dxil: dxil gen/linkedlist.hlsl
diff --git a/tests/shader/corr4.comp b/tests/shader/corr4.comp
new file mode 100644
index 0000000..7d28373
--- /dev/null
+++ b/tests/shader/corr4.comp
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Our version of the CoRR4 atomic litmus test.
+
+#version 450
+
+#extension GL_KHR_memory_scope_semantics : enable
+
+#ifdef VKMM
+#pragma use_vulkan_memory_model
+#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
+#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
+#else
+#define ACQUIRE 0, 0
+#define RELEASE 0, 0
+#endif
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) buffer DataBuf
+{
+    uint data[];
+} data_buf;
+
+struct Result {
+    uint r0;
+    uint r1;
+    uint r2;
+    uint r3;
+};
+
+layout(binding = 1) buffer OutBuf
+{
+    Result r[];
+} out_buf;
+
+void main()
+{
+    uint ix = gl_GlobalInvocationID.x;
+
+    // Each invocation plays all four roles, each at its own permuted location.
+    // Multiplying by an odd constant and masking with 0xffff permutes the
+    // 16-bit index space, so any given location "x" gets four (generally
+    // different) invocations, one per role. See the note after this file.
+
+    uint role_0_ix = (ix * 661u) & 65535u;
+    uint role_1_ix = (ix * 1087u) & 65535u;
+    uint role_2_ix = (ix * 2749u) & 65535u;
+    uint role_3_ix = (ix * 3433u) & 65535u;
+
+    // Role 0: atomicStore(x, 1)
+    atomicStore(data_buf.data[role_0_ix], 1u, gl_ScopeDevice, 0, 0);
+
+    // Role 1: two atomic loads
+    uint r0 = atomicLoad(data_buf.data[role_1_ix], gl_ScopeDevice, 0, 0);
+    uint r1 = atomicLoad(data_buf.data[role_1_ix], gl_ScopeDevice, 0, 0);
+
+    // Role 2: atomicStore(x, 2)
+    atomicStore(data_buf.data[role_2_ix], 2u, gl_ScopeDevice, 0, 0);
+
+    // Role 3: two atomic loads
+    uint r2 = atomicLoad(data_buf.data[role_3_ix], gl_ScopeDevice, 0, 0);
+    uint r3 = atomicLoad(data_buf.data[role_3_ix], gl_ScopeDevice, 0, 0);
+
+    // Store results in output buffer
+    out_buf.r[role_1_ix].r0 = r0;
+    out_buf.r[role_1_ix].r1 = r1;
+    out_buf.r[role_3_ix].r2 = r2;
+    out_buf.r[role_3_ix].r3 = r3;
+}
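
A note on the index permutations in corr4.comp above: multiplying by any odd
constant and masking with 0xffff is a bijection on 0..65536, so for a fixed
location x each role is covered by exactly one of the 65536 invocations
(though for a few locations, such as x = 0, the same invocation ends up
playing more than one role). A quick, purely illustrative Rust sketch that
checks this property for the shader's constants:

    // Illustrative only, not part of the test harness: verify that each
    // multiplier permutes the 16-bit index space, i.e. hits every location.
    fn is_permutation(multiplier: u32) -> bool {
        let mut seen = vec![false; 1 << 16];
        for ix in 0u32..(1 << 16) {
            seen[((ix * multiplier) & 0xffff) as usize] = true;
        }
        seen.iter().all(|&hit| hit)
    }

    fn main() {
        for m in [661u32, 1087, 2749, 3433] {
            assert!(is_permutation(m), "{} is not a permutation", m);
        }
    }
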
diff --git a/tests/shader/gen/corr4.dxil b/tests/shader/gen/corr4.dxil
new file mode 100644
index 0000000..6beffd9
--- /dev/null
+++ b/tests/shader/gen/corr4.dxil
Binary files differ
diff --git a/tests/shader/gen/corr4.hlsl b/tests/shader/gen/corr4.hlsl
new file mode 100644
index 0000000..9b51b81
--- /dev/null
+++ b/tests/shader/gen/corr4.hlsl
@@ -0,0 +1,54 @@
+struct Result
+{
+    uint r0;
+    uint r1;
+    uint r2;
+    uint r3;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
+
+RWByteAddressBuffer data_buf : register(u0);
+RWByteAddressBuffer out_buf : register(u1);
+
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x;
+    uint role_0_ix = (ix * 661u) & 65535u;
+    uint role_1_ix = (ix * 1087u) & 65535u;
+    uint role_2_ix = (ix * 2749u) & 65535u;
+    uint role_3_ix = (ix * 3433u) & 65535u;
+    uint _89;
+    data_buf.InterlockedExchange(role_0_ix * 4 + 0, 1u, _89);
+    uint _52;
+    data_buf.InterlockedAdd(role_1_ix * 4 + 0, 0, _52);
+    uint r0 = _52;
+    uint _56;
+    data_buf.InterlockedAdd(role_1_ix * 4 + 0, 0, _56);
+    uint r1 = _56;
+    uint _90;
+    data_buf.InterlockedExchange(role_2_ix * 4 + 0, 2u, _90);
+    uint _63;
+    data_buf.InterlockedAdd(role_3_ix * 4 + 0, 0, _63);
+    uint r2 = _63;
+    uint _67;
+    data_buf.InterlockedAdd(role_3_ix * 4 + 0, 0, _67);
+    uint r3 = _67;
+    out_buf.Store(role_1_ix * 16 + 0, r0);
+    out_buf.Store(role_1_ix * 16 + 4, r1);
+    out_buf.Store(role_3_ix * 16 + 8, r2);
+    out_buf.Store(role_3_ix * 16 + 12, r3);
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/tests/shader/gen/corr4.msl b/tests/shader/gen/corr4.msl
new file mode 100644
index 0000000..7556b80
--- /dev/null
+++ b/tests/shader/gen/corr4.msl
@@ -0,0 +1,51 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct DataBuf
+{
+    uint data[1];
+};
+
+struct Result
+{
+    uint r0;
+    uint r1;
+    uint r2;
+    uint r3;
+};
+
+struct OutBuf
+{
+    Result r[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
+
+kernel void main0(device DataBuf& data_buf [[buffer(0)]], device OutBuf& out_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    uint ix = gl_GlobalInvocationID.x;
+    uint role_0_ix = (ix * 661u) & 65535u;
+    uint role_1_ix = (ix * 1087u) & 65535u;
+    uint role_2_ix = (ix * 2749u) & 65535u;
+    uint role_3_ix = (ix * 3433u) & 65535u;
+    atomic_store_explicit((device atomic_uint*)&data_buf.data[role_0_ix], 1u, memory_order_relaxed);
+    uint _52 = atomic_load_explicit((device atomic_uint*)&data_buf.data[role_1_ix], memory_order_relaxed);
+    uint r0 = _52;
+    uint _56 = atomic_load_explicit((device atomic_uint*)&data_buf.data[role_1_ix], memory_order_relaxed);
+    uint r1 = _56;
+    atomic_store_explicit((device atomic_uint*)&data_buf.data[role_2_ix], 2u, memory_order_relaxed);
+    uint _63 = atomic_load_explicit((device atomic_uint*)&data_buf.data[role_3_ix], memory_order_relaxed);
+    uint r2 = _63;
+    uint _67 = atomic_load_explicit((device atomic_uint*)&data_buf.data[role_3_ix], memory_order_relaxed);
+    uint r3 = _67;
+    out_buf.r[role_1_ix].r0 = r0;
+    out_buf.r[role_1_ix].r1 = r1;
+    out_buf.r[role_3_ix].r2 = r2;
+    out_buf.r[role_3_ix].r3 = r3;
+}
+
diff --git a/tests/shader/gen/corr4.spv b/tests/shader/gen/corr4.spv
new file mode 100644
index 0000000..bfced5e
--- /dev/null
+++ b/tests/shader/gen/corr4.spv
Binary files differ
diff --git a/tests/src/corr4.rs b/tests/src/corr4.rs
new file mode 100644
index 0000000..3feaa9f
--- /dev/null
+++ b/tests/src/corr4.rs
@@ -0,0 +1,182 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet};
+use piet_gpu_hal::{Buffer, Pipeline};
+
+use crate::clear::{ClearBinding, ClearCode, ClearStage};
+use crate::config::Config;
+use crate::runner::{Commands, Runner};
+use crate::test_result::TestResult;
+
+const N_ELEMENTS: u64 = 65536;
+
+/// The shader code for corr4 example.
+struct Corr4Code {
+    pipeline: Pipeline,
+    clear_code: Option<ClearCode>,
+}
+
+/// The stage resources for corr4 example.
+struct Corr4Stage {
+    data_buf: Buffer,
+    clear_stages: Option<(ClearStage, ClearBinding, ClearStage)>,
+}
+
+/// The binding for corr4 example.
+struct Corr4Binding {
+    descriptor_set: DescriptorSet,
+    clear_binding: Option<ClearBinding>,
+}
+
+pub unsafe fn run_corr4_test(runner: &mut Runner, config: &Config) -> TestResult {
+    let mut result = TestResult::new("CoRR4 litmus".to_string());
+    let out_buf = runner.buf_down(16 * N_ELEMENTS);
+    let code = Corr4Code::new(runner);
+    let stage = Corr4Stage::new(runner, &code);
+    let binding = stage.bind(runner, &code, &out_buf.dev_buf);
+    let n_iter = config.n_iter;
+    let mut total_elapsed = 0.0;
+    let mut failures = 0;
+    for _ in 0..n_iter {
+        let mut commands = runner.commands();
+        commands.write_timestamp(0);
+        stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
+        commands.write_timestamp(1);
+        commands.cmd_buf.memory_barrier();
+        commands.download(&out_buf);
+        total_elapsed += runner.submit(commands);
+        let mut dst: Vec<u32> = Default::default();
+        out_buf.read(&mut dst);
+        failures += analyze(&dst);
+    }
+    if failures > 0 {
+        result.fail(format!("{} failures", failures));
+    }
+    result.timing(total_elapsed, N_ELEMENTS * n_iter);
+    result
+}
+
+impl Corr4Code {
+    unsafe fn new(runner: &mut Runner) -> Corr4Code {
+        let code = include_shader!(&runner.session, "../shader/gen/corr4");
+        let pipeline = runner
+            .session
+            .create_compute_pipeline(code, &[BindType::Buffer, BindType::Buffer])
+            .unwrap();
+        // Currently, DX12 and Metal backends don't support buffer clearing, so use a
+        // compute shader as a workaround.
+        let clear_code = if runner.backend_type() != BackendType::Vulkan {
+            Some(ClearCode::new(runner))
+        } else {
+            None
+        };
+        Corr4Code {
+            pipeline,
+            clear_code,
+        }
+    }
+}
+
+impl Corr4Stage {
+    unsafe fn new(runner: &mut Runner, code: &Corr4Code) -> Corr4Stage {
+        let data_buf_size = 4 * N_ELEMENTS;
+        let data_buf = runner
+            .session
+            .create_buffer(data_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST)
+            .unwrap();
+        let clear_stages = if let Some(clear_code) = &code.clear_code {
+            let stage0 = ClearStage::new(runner, N_ELEMENTS * 2);
+            let binding0 = stage0.bind(runner, clear_code, &data_buf);
+            let stage1 = ClearStage::new(runner, 1);
+            Some((stage0, binding0, stage1))
+        } else {
+            None
+        };
+        Corr4Stage {
+            data_buf,
+            clear_stages,
+        }
+    }
+
+    unsafe fn bind(
+        &self,
+        runner: &mut Runner,
+        code: &Corr4Code,
+        out_buf: &Buffer,
+    ) -> Corr4Binding {
+        let descriptor_set = runner
+            .session
+            .create_simple_descriptor_set(&code.pipeline, &[&self.data_buf, out_buf])
+            .unwrap();
+        let clear_binding = if let Some(clear_code) = &code.clear_code {
+            Some(
+                self.clear_stages
+                    .as_ref()
+                    .unwrap()
+                    .2
+                    .bind(runner, clear_code, out_buf),
+            )
+        } else {
+            None
+        };
+        Corr4Binding {
+            descriptor_set,
+            clear_binding,
+        }
+    }
+
+    unsafe fn record(
+        &self,
+        commands: &mut Commands,
+        code: &Corr4Code,
+        bindings: &Corr4Binding,
+        out_buf: &Buffer,
+    ) {
+        if let Some((stage0, binding0, stage1)) = &self.clear_stages {
+            let code = code.clear_code.as_ref().unwrap();
+            stage0.record(commands, code, binding0);
+            stage1.record(commands, code, bindings.clear_binding.as_ref().unwrap());
+        } else {
+            commands.cmd_buf.clear_buffer(&self.data_buf, None);
+            commands.cmd_buf.clear_buffer(out_buf, None);
+        }
+        commands.cmd_buf.memory_barrier();
+        commands.cmd_buf.dispatch(
+            &code.pipeline,
+            &bindings.descriptor_set,
+            (256, 1, 1),
+            (256, 1, 1),
+        );
+    }
+}
+
+fn analyze(data: &[u32]) -> u64 {
+    let mut failures = 0;
+    for i in 0..N_ELEMENTS as usize {
+        let r0 = data[i * 4 + 0];
+        let r1 = data[i * 4 + 1];
+        let r2 = data[i * 4 + 2];
+        let r3 = data[i * 4 + 3];
+        // Forbidden CoRR outcomes: the two readers observe the stores of 1
+        // and 2 in contradictory orders, or a reader's second load goes back
+        // to the initial value 0 after its first load saw a store.
+        if (r0 == 1 && r1 == 2 && r2 == 2 && r3 == 1)
+            || (r0 == 2 && r1 == 1 && r2 == 1 && r3 == 2)
+            || (r0 != 0 && r1 == 0)
+            || (r2 != 0 && r3 == 0)
+        {
+            failures += 1;
+        }
+    }
+    failures
+}
diff --git a/tests/src/main.rs b/tests/src/main.rs
index dd6f4bd..8301e91 100644
--- a/tests/src/main.rs
+++ b/tests/src/main.rs
@@ -18,6 +18,7 @@
 
 mod clear;
 mod config;
+mod corr4;
 mod linkedlist;
 mod message_passing;
 mod prefix;
@@ -121,6 +122,7 @@
                     message_passing::Variant::Vkmm,
                 ));
             }
+            report(&corr4::run_corr4_test(&mut runner, &config));
             report(&linkedlist::run_linkedlist_test(&mut runner, &config));
         }
     }