Revert unintended shader changes

The previous commit picked up some experiments with shaders by mistake.
diff --git a/tests/shader/gen/message_passing.dxil b/tests/shader/gen/message_passing.dxil
index dc2eced..2be73da 100644
--- a/tests/shader/gen/message_passing.dxil
+++ b/tests/shader/gen/message_passing.dxil
Binary files differ
diff --git a/tests/shader/gen/message_passing.hlsl b/tests/shader/gen/message_passing.hlsl
index 3a48808..ba8ce5f 100644
--- a/tests/shader/gen/message_passing.hlsl
+++ b/tests/shader/gen/message_passing.hlsl
@@ -6,7 +6,7 @@
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-globallycoherent RWByteAddressBuffer data_buf : register(u0);
+RWByteAddressBuffer data_buf : register(u0);
 RWByteAddressBuffer control_buf : register(u1);
 
 static uint3 gl_GlobalInvocationID;
@@ -22,12 +22,13 @@
 
 void comp_main()
 {
-    data_buf.Store(gl_GlobalInvocationID.x * 8 + 0, 1u);
+    uint _76;
+    data_buf.InterlockedExchange(gl_GlobalInvocationID.x * 8 + 0, 1u, _76);
     DeviceMemoryBarrier();
     uint param = gl_GlobalInvocationID.x;
     uint write_flag_ix = permute_flag_ix(param);
-    uint _76;
-    data_buf.InterlockedExchange(write_flag_ix * 8 + 4, 1u, _76);
+    uint _77;
+    data_buf.InterlockedExchange(write_flag_ix * 8 + 4, 1u, _77);
     uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u;
     uint param_1 = read_ix;
     uint read_flag_ix = permute_flag_ix(param_1);
@@ -35,7 +36,9 @@
     data_buf.InterlockedAdd(read_flag_ix * 8 + 4, 0, _58);
     uint flag = _58;
     DeviceMemoryBarrier();
-    uint data = data_buf.Load(read_ix * 8 + 0);
+    uint _62;
+    data_buf.InterlockedAdd(read_ix * 8 + 0, 0, _62);
+    uint data = _62;
     if (flag > data)
     {
         uint _73;
diff --git a/tests/shader/gen/message_passing.msl b/tests/shader/gen/message_passing.msl
index 1bda181..e48f48a 100644
--- a/tests/shader/gen/message_passing.msl
+++ b/tests/shader/gen/message_passing.msl
@@ -31,20 +31,21 @@
     return (data_ix * 419u) & 65535u;
 }
 
-kernel void main0(volatile device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    data_buf.data[gl_GlobalInvocationID.x].data = 1u;
+    atomic_store_explicit((device atomic_uint*)&data_buf.data[gl_GlobalInvocationID.x].data, 1u, memory_order_relaxed);
     threadgroup_barrier(mem_flags::mem_device);
     uint param = gl_GlobalInvocationID.x;
     uint write_flag_ix = permute_flag_ix(param);
-    atomic_store_explicit((volatile device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed);
+    atomic_store_explicit((device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed);
     uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u;
     uint param_1 = read_ix;
     uint read_flag_ix = permute_flag_ix(param_1);
-    uint _58 = atomic_load_explicit((volatile device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed);
+    uint _58 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed);
     uint flag = _58;
     threadgroup_barrier(mem_flags::mem_device);
-    uint data = data_buf.data[read_ix].data;
+    uint _62 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_ix].data, memory_order_relaxed);
+    uint data = _62;
     if (flag > data)
     {
         uint _73 = atomic_fetch_add_explicit((device atomic_uint*)&control_buf.failures, 1u, memory_order_relaxed);
diff --git a/tests/shader/gen/message_passing.spv b/tests/shader/gen/message_passing.spv
index fa7fb3e..e5f56d6 100644
--- a/tests/shader/gen/message_passing.spv
+++ b/tests/shader/gen/message_passing.spv
Binary files differ
diff --git a/tests/shader/gen/message_passing_vkmm.spv b/tests/shader/gen/message_passing_vkmm.spv
index 58d3521..8527c2b 100644
--- a/tests/shader/gen/message_passing_vkmm.spv
+++ b/tests/shader/gen/message_passing_vkmm.spv
Binary files differ
diff --git a/tests/shader/gen/prefix.dxil b/tests/shader/gen/prefix.dxil
index 12d0d50..34f3d6a 100644
--- a/tests/shader/gen/prefix.dxil
+++ b/tests/shader/gen/prefix.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix.hlsl b/tests/shader/gen/prefix.hlsl
index ba76110..3af5a96 100644
--- a/tests/shader/gen/prefix.hlsl
+++ b/tests/shader/gen/prefix.hlsl
@@ -12,11 +12,11 @@
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-static const Monoid _183 = { 0u };
+static const Monoid _185 = { 0u };
 
 globallycoherent RWByteAddressBuffer _43 : register(u2);
 ByteAddressBuffer _67 : register(t0);
-RWByteAddressBuffer _367 : register(u1);
+RWByteAddressBuffer _372 : register(u1);
 
 static uint3 gl_LocalInvocationID;
 struct SPIRV_Cross_Input
@@ -76,12 +76,10 @@
     }
     if (gl_LocalInvocationID.x == 511u)
     {
-        uint _378;
-        _43.InterlockedExchange(part_ix * 12 + 8, agg.element, _378);
+        _43.Store(part_ix * 12 + 8, agg.element);
         if (part_ix == 0u)
         {
-            uint _379;
-            _43.InterlockedExchange(12, agg.element, _379);
+            _43.Store(12, agg.element);
         }
     }
     DeviceMemoryBarrier();
@@ -94,11 +92,12 @@
         }
         _43.Store(part_ix * 12 + 4, flag);
     }
-    Monoid exclusive = _183;
+    Monoid exclusive = _185;
     if (part_ix != 0u)
     {
         uint look_back_ix = part_ix - 1u;
         uint their_ix = 0u;
+        Monoid their_prefix;
         Monoid their_agg;
         Monoid m;
         while (true)
@@ -114,10 +113,9 @@
             {
                 if (gl_LocalInvocationID.x == 511u)
                 {
-                    uint _221;
-                    _43.InterlockedAdd(look_back_ix * 12 + 12, 0, _221);
-                    Monoid _222 = { _221 };
-                    Monoid their_prefix = _222;
+                    Monoid _223;
+                    _223.element = _43.Load(look_back_ix * 12 + 12);
+                    their_prefix.element = _223.element;
                     Monoid param_4 = their_prefix;
                     Monoid param_5 = exclusive;
                     exclusive = combine_monoid(param_4, param_5);
@@ -130,9 +128,9 @@
                 {
                     if (gl_LocalInvocationID.x == 511u)
                     {
-                        uint _242;
-                        _43.InterlockedAdd(look_back_ix * 12 + 8, 0, _242);
-                        their_agg.element = _242;
+                        Monoid _245;
+                        _245.element = _43.Load(look_back_ix * 12 + 8);
+                        their_agg.element = _245.element;
                         Monoid param_6 = their_agg;
                         Monoid param_7 = exclusive;
                         exclusive = combine_monoid(param_6, param_7);
@@ -144,9 +142,9 @@
             }
             if (gl_LocalInvocationID.x == 511u)
             {
-                Monoid _263;
-                _263.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
-                m.element = _263.element;
+                Monoid _267;
+                _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
+                m.element = _267.element;
                 if (their_ix == 0u)
                 {
                     their_agg = m;
@@ -187,8 +185,7 @@
             Monoid param_13 = agg;
             Monoid inclusive_prefix = combine_monoid(param_12, param_13);
             sh_prefix = exclusive;
-            uint _380;
-            _43.InterlockedExchange(part_ix * 12 + 12, inclusive_prefix.element, _380);
+            _43.Store(part_ix * 12 + 12, inclusive_prefix.element);
         }
         DeviceMemoryBarrier();
         if (gl_LocalInvocationID.x == 511u)
@@ -214,7 +211,7 @@
         Monoid param_16 = row;
         Monoid param_17 = local[i_2];
         Monoid m_1 = combine_monoid(param_16, param_17);
-        _367.Store((ix + i_2) * 4 + 0, m_1.element);
+        _372.Store((ix + i_2) * 4 + 0, m_1.element);
     }
 }
 
diff --git a/tests/shader/gen/prefix.msl b/tests/shader/gen/prefix.msl
index aaa7158..8e402a9 100644
--- a/tests/shader/gen/prefix.msl
+++ b/tests/shader/gen/prefix.msl
@@ -87,7 +87,7 @@
     return Monoid{ a.element + b.element };
 }
 
-kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _367 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_part_ix;
     threadgroup Monoid sh_scratch[512];
@@ -127,10 +127,10 @@
     }
     if (gl_LocalInvocationID.x == 511u)
     {
-        atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].aggregate.element, agg.element, memory_order_relaxed);
+        _43.state[part_ix].aggregate.element = agg.element;
         if (part_ix == 0u)
         {
-            atomic_store_explicit((volatile device atomic_uint*)&_43.state[0].prefix.element, agg.element, memory_order_relaxed);
+            _43.state[0].prefix.element = agg.element;
         }
     }
     threadgroup_barrier(mem_flags::mem_device);
@@ -148,6 +148,7 @@
     {
         uint look_back_ix = part_ix - 1u;
         uint their_ix = 0u;
+        Monoid their_prefix;
         Monoid their_agg;
         Monoid m;
         while (true)
@@ -163,8 +164,7 @@
             {
                 if (gl_LocalInvocationID.x == 511u)
                 {
-                    uint _221 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].prefix.element, memory_order_relaxed);
-                    Monoid their_prefix = Monoid{ _221 };
+                    their_prefix.element = _43.state[look_back_ix].prefix.element;
                     Monoid param_4 = their_prefix;
                     Monoid param_5 = exclusive;
                     exclusive = combine_monoid(param_4, param_5);
@@ -177,8 +177,7 @@
                 {
                     if (gl_LocalInvocationID.x == 511u)
                     {
-                        uint _242 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].aggregate.element, memory_order_relaxed);
-                        their_agg.element = _242;
+                        their_agg.element = _43.state[look_back_ix].aggregate.element;
                         Monoid param_6 = their_agg;
                         Monoid param_7 = exclusive;
                         exclusive = combine_monoid(param_6, param_7);
@@ -231,7 +230,7 @@
             Monoid param_13 = agg;
             Monoid inclusive_prefix = combine_monoid(param_12, param_13);
             sh_prefix = exclusive;
-            atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].prefix.element, inclusive_prefix.element, memory_order_relaxed);
+            _43.state[part_ix].prefix.element = inclusive_prefix.element;
         }
         threadgroup_barrier(mem_flags::mem_device);
         if (gl_LocalInvocationID.x == 511u)
@@ -257,7 +256,7 @@
         Monoid param_16 = row;
         Monoid param_17 = local[i_2];
         Monoid m_1 = combine_monoid(param_16, param_17);
-        _367.outbuf[ix + i_2].element = m_1.element;
+        _372.outbuf[ix + i_2].element = m_1.element;
     }
 }
 
diff --git a/tests/shader/gen/prefix.spv b/tests/shader/gen/prefix.spv
index 0da9ed8..d2c1aad 100644
--- a/tests/shader/gen/prefix.spv
+++ b/tests/shader/gen/prefix.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_atomic.dxil b/tests/shader/gen/prefix_atomic.dxil
index 80c05a3..68f47e5 100644
--- a/tests/shader/gen/prefix_atomic.dxil
+++ b/tests/shader/gen/prefix_atomic.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix_atomic.hlsl b/tests/shader/gen/prefix_atomic.hlsl
index 6105c01..10f7081 100644
--- a/tests/shader/gen/prefix_atomic.hlsl
+++ b/tests/shader/gen/prefix_atomic.hlsl
@@ -12,11 +12,11 @@
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-static const Monoid _183 = { 0u };
+static const Monoid _185 = { 0u };
 
 globallycoherent RWByteAddressBuffer _43 : register(u2);
 ByteAddressBuffer _67 : register(t0);
-RWByteAddressBuffer _367 : register(u1);
+RWByteAddressBuffer _372 : register(u1);
 
 static uint3 gl_LocalInvocationID;
 struct SPIRV_Cross_Input
@@ -76,12 +76,10 @@
     }
     if (gl_LocalInvocationID.x == 511u)
     {
-        uint _378;
-        _43.InterlockedExchange(part_ix * 12 + 8, agg.element, _378);
+        _43.Store(part_ix * 12 + 8, agg.element);
         if (part_ix == 0u)
         {
-            uint _379;
-            _43.InterlockedExchange(12, agg.element, _379);
+            _43.Store(12, agg.element);
         }
     }
     DeviceMemoryBarrier();
@@ -92,23 +90,24 @@
         {
             flag = 2u;
         }
-        uint _380;
-        _43.InterlockedExchange(part_ix * 12 + 4, flag, _380);
+        uint _383;
+        _43.InterlockedExchange(part_ix * 12 + 4, flag, _383);
     }
-    Monoid exclusive = _183;
+    Monoid exclusive = _185;
     if (part_ix != 0u)
     {
         uint look_back_ix = part_ix - 1u;
         uint their_ix = 0u;
+        Monoid their_prefix;
         Monoid their_agg;
         Monoid m;
         while (true)
         {
             if (gl_LocalInvocationID.x == 511u)
             {
-                uint _206;
-                _43.InterlockedAdd(look_back_ix * 12 + 4, 0, _206);
-                sh_flag = _206;
+                uint _208;
+                _43.InterlockedAdd(look_back_ix * 12 + 4, 0, _208);
+                sh_flag = _208;
             }
             GroupMemoryBarrierWithGroupSync();
             DeviceMemoryBarrier();
@@ -117,10 +116,9 @@
             {
                 if (gl_LocalInvocationID.x == 511u)
                 {
-                    uint _221;
-                    _43.InterlockedAdd(look_back_ix * 12 + 12, 0, _221);
-                    Monoid _222 = { _221 };
-                    Monoid their_prefix = _222;
+                    Monoid _223;
+                    _223.element = _43.Load(look_back_ix * 12 + 12);
+                    their_prefix.element = _223.element;
                     Monoid param_4 = their_prefix;
                     Monoid param_5 = exclusive;
                     exclusive = combine_monoid(param_4, param_5);
@@ -133,9 +131,9 @@
                 {
                     if (gl_LocalInvocationID.x == 511u)
                     {
-                        uint _242;
-                        _43.InterlockedAdd(look_back_ix * 12 + 8, 0, _242);
-                        their_agg.element = _242;
+                        Monoid _245;
+                        _245.element = _43.Load(look_back_ix * 12 + 8);
+                        their_agg.element = _245.element;
                         Monoid param_6 = their_agg;
                         Monoid param_7 = exclusive;
                         exclusive = combine_monoid(param_6, param_7);
@@ -147,9 +145,9 @@
             }
             if (gl_LocalInvocationID.x == 511u)
             {
-                Monoid _263;
-                _263.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
-                m.element = _263.element;
+                Monoid _267;
+                _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
+                m.element = _267.element;
                 if (their_ix == 0u)
                 {
                     their_agg = m;
@@ -190,14 +188,13 @@
             Monoid param_13 = agg;
             Monoid inclusive_prefix = combine_monoid(param_12, param_13);
             sh_prefix = exclusive;
-            uint _381;
-            _43.InterlockedExchange(part_ix * 12 + 12, inclusive_prefix.element, _381);
+            _43.Store(part_ix * 12 + 12, inclusive_prefix.element);
         }
         DeviceMemoryBarrier();
         if (gl_LocalInvocationID.x == 511u)
         {
-            uint _382;
-            _43.InterlockedExchange(part_ix * 12 + 4, 2u, _382);
+            uint _384;
+            _43.InterlockedExchange(part_ix * 12 + 4, 2u, _384);
         }
     }
     GroupMemoryBarrierWithGroupSync();
@@ -218,7 +215,7 @@
         Monoid param_16 = row;
         Monoid param_17 = local[i_2];
         Monoid m_1 = combine_monoid(param_16, param_17);
-        _367.Store((ix + i_2) * 4 + 0, m_1.element);
+        _372.Store((ix + i_2) * 4 + 0, m_1.element);
     }
 }
 
diff --git a/tests/shader/gen/prefix_atomic.msl b/tests/shader/gen/prefix_atomic.msl
index d5de44a..6d7d155 100644
--- a/tests/shader/gen/prefix_atomic.msl
+++ b/tests/shader/gen/prefix_atomic.msl
@@ -87,7 +87,7 @@
     return Monoid{ a.element + b.element };
 }
 
-kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _367 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_part_ix;
     threadgroup Monoid sh_scratch[512];
@@ -127,10 +127,10 @@
     }
     if (gl_LocalInvocationID.x == 511u)
     {
-        atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].aggregate.element, agg.element, memory_order_relaxed);
+        _43.state[part_ix].aggregate.element = agg.element;
         if (part_ix == 0u)
         {
-            atomic_store_explicit((volatile device atomic_uint*)&_43.state[0].prefix.element, agg.element, memory_order_relaxed);
+            _43.state[0].prefix.element = agg.element;
         }
     }
     threadgroup_barrier(mem_flags::mem_device);
@@ -148,14 +148,15 @@
     {
         uint look_back_ix = part_ix - 1u;
         uint their_ix = 0u;
+        Monoid their_prefix;
         Monoid their_agg;
         Monoid m;
         while (true)
         {
             if (gl_LocalInvocationID.x == 511u)
             {
-                uint _206 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].flag, memory_order_relaxed);
-                sh_flag = _206;
+                uint _208 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].flag, memory_order_relaxed);
+                sh_flag = _208;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             threadgroup_barrier(mem_flags::mem_device);
@@ -164,8 +165,7 @@
             {
                 if (gl_LocalInvocationID.x == 511u)
                 {
-                    uint _221 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].prefix.element, memory_order_relaxed);
-                    Monoid their_prefix = Monoid{ _221 };
+                    their_prefix.element = _43.state[look_back_ix].prefix.element;
                     Monoid param_4 = their_prefix;
                     Monoid param_5 = exclusive;
                     exclusive = combine_monoid(param_4, param_5);
@@ -178,8 +178,7 @@
                 {
                     if (gl_LocalInvocationID.x == 511u)
                     {
-                        uint _242 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].aggregate.element, memory_order_relaxed);
-                        their_agg.element = _242;
+                        their_agg.element = _43.state[look_back_ix].aggregate.element;
                         Monoid param_6 = their_agg;
                         Monoid param_7 = exclusive;
                         exclusive = combine_monoid(param_6, param_7);
@@ -232,7 +231,7 @@
             Monoid param_13 = agg;
             Monoid inclusive_prefix = combine_monoid(param_12, param_13);
             sh_prefix = exclusive;
-            atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].prefix.element, inclusive_prefix.element, memory_order_relaxed);
+            _43.state[part_ix].prefix.element = inclusive_prefix.element;
         }
         threadgroup_barrier(mem_flags::mem_device);
         if (gl_LocalInvocationID.x == 511u)
@@ -258,7 +257,7 @@
         Monoid param_16 = row;
         Monoid param_17 = local[i_2];
         Monoid m_1 = combine_monoid(param_16, param_17);
-        _367.outbuf[ix + i_2].element = m_1.element;
+        _372.outbuf[ix + i_2].element = m_1.element;
     }
 }
 
diff --git a/tests/shader/gen/prefix_atomic.spv b/tests/shader/gen/prefix_atomic.spv
index 1b31f67..acca545 100644
--- a/tests/shader/gen/prefix_atomic.spv
+++ b/tests/shader/gen/prefix_atomic.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_vkmm.spv b/tests/shader/gen/prefix_vkmm.spv
index c8b9ea2..0b8d475 100644
--- a/tests/shader/gen/prefix_vkmm.spv
+++ b/tests/shader/gen/prefix_vkmm.spv
Binary files differ
diff --git a/tests/shader/message_passing.comp b/tests/shader/message_passing.comp
index 521dedd..e5e53b6 100644
--- a/tests/shader/message_passing.comp
+++ b/tests/shader/message_passing.comp
@@ -23,7 +23,7 @@
     uint flag;
 };
 
-layout(binding = 0) coherent buffer DataBuf
+layout(binding = 0) buffer DataBuf
 {
     Element data[];
 } data_buf;
@@ -40,7 +40,7 @@
 
 void main()
 {
-    data_buf.data[gl_GlobalInvocationID.x].data = 1u;
+    atomicStore(data_buf.data[gl_GlobalInvocationID.x].data, 1u, gl_ScopeDevice, 0, 0);
 #ifndef VKMM
     memoryBarrierBuffer();
 #endif
@@ -52,7 +52,7 @@
 #ifndef VKMM
     memoryBarrierBuffer();
 #endif
-    uint data = data_buf.data[read_ix].data;
+    uint data = atomicLoad(data_buf.data[read_ix].data, gl_ScopeDevice, 0, 0);
     if (flag > data)
     {
         atomicAdd(control_buf.failures, 1u);
diff --git a/tests/shader/prefix.comp b/tests/shader/prefix.comp
index 0fc9909..a6a0d57 100644
--- a/tests/shader/prefix.comp
+++ b/tests/shader/prefix.comp
@@ -99,9 +99,9 @@
 
     // Publish aggregate for this partition
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-        atomicStore(state[part_ix].aggregate.element, agg.element, gl_ScopeDevice, 0, 0);
+        state[part_ix].aggregate = agg;
         if (part_ix == 0) {
-            atomicStore(state[0].prefix.element, agg.element, gl_ScopeDevice, 0, 0);
+            state[0].prefix = agg;
         }
     }
     // Write flag with release semantics; this is done portably with a barrier.
@@ -147,13 +147,13 @@
 
             if (flag == FLAG_PREFIX_READY) {
                 if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                    Monoid their_prefix = Monoid(atomicLoad(state[look_back_ix].prefix.element, gl_ScopeDevice, 0, 0));
+                    Monoid their_prefix = state[look_back_ix].prefix;
                     exclusive = combine_monoid(their_prefix, exclusive);
                 }
                 break;
             } else if (flag == FLAG_AGGREGATE_READY) {
                 if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                    their_agg.element = atomicLoad(state[look_back_ix].aggregate.element, gl_ScopeDevice, 0, 0);
+                    their_agg = state[look_back_ix].aggregate;
                     exclusive = combine_monoid(their_agg, exclusive);
                 }
                 look_back_ix--;
@@ -193,7 +193,7 @@
         if (gl_LocalInvocationID.x == WG_SIZE - 1) {
             Monoid inclusive_prefix = combine_monoid(exclusive, agg);
             sh_prefix = exclusive;
-            atomicStore(state[part_ix].prefix.element, inclusive_prefix.element, gl_ScopeDevice, 0, 0);
+            state[part_ix].prefix = inclusive_prefix;
         }
 #ifndef VKMM
         memoryBarrierBuffer();