Work around lack of timer query on Metal
diff --git a/tests/shader/gen/message_passing.dxil b/tests/shader/gen/message_passing.dxil
index 2be73da..dc2eced 100644
--- a/tests/shader/gen/message_passing.dxil
+++ b/tests/shader/gen/message_passing.dxil
Binary files differ
diff --git a/tests/shader/gen/message_passing.hlsl b/tests/shader/gen/message_passing.hlsl
index ba8ce5f..3a48808 100644
--- a/tests/shader/gen/message_passing.hlsl
+++ b/tests/shader/gen/message_passing.hlsl
@@ -6,7 +6,7 @@
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer data_buf : register(u0);
+globallycoherent RWByteAddressBuffer data_buf : register(u0);
 RWByteAddressBuffer control_buf : register(u1);
 
 static uint3 gl_GlobalInvocationID;
@@ -22,13 +22,12 @@
 
 void comp_main()
 {
-    uint _76;
-    data_buf.InterlockedExchange(gl_GlobalInvocationID.x * 8 + 0, 1u, _76);
+    data_buf.Store(gl_GlobalInvocationID.x * 8 + 0, 1u);
     DeviceMemoryBarrier();
     uint param = gl_GlobalInvocationID.x;
     uint write_flag_ix = permute_flag_ix(param);
-    uint _77;
-    data_buf.InterlockedExchange(write_flag_ix * 8 + 4, 1u, _77);
+    uint _76;
+    data_buf.InterlockedExchange(write_flag_ix * 8 + 4, 1u, _76);
     uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u;
     uint param_1 = read_ix;
     uint read_flag_ix = permute_flag_ix(param_1);
@@ -36,9 +35,7 @@
     data_buf.InterlockedAdd(read_flag_ix * 8 + 4, 0, _58);
     uint flag = _58;
     DeviceMemoryBarrier();
-    uint _62;
-    data_buf.InterlockedAdd(read_ix * 8 + 0, 0, _62);
-    uint data = _62;
+    uint data = data_buf.Load(read_ix * 8 + 0);
     if (flag > data)
     {
         uint _73;
diff --git a/tests/shader/gen/message_passing.msl b/tests/shader/gen/message_passing.msl
index e48f48a..1bda181 100644
--- a/tests/shader/gen/message_passing.msl
+++ b/tests/shader/gen/message_passing.msl
@@ -31,21 +31,20 @@
     return (data_ix * 419u) & 65535u;
 }
 
-kernel void main0(device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(volatile device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    atomic_store_explicit((device atomic_uint*)&data_buf.data[gl_GlobalInvocationID.x].data, 1u, memory_order_relaxed);
+    data_buf.data[gl_GlobalInvocationID.x].data = 1u;
     threadgroup_barrier(mem_flags::mem_device);
     uint param = gl_GlobalInvocationID.x;
     uint write_flag_ix = permute_flag_ix(param);
-    atomic_store_explicit((device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed);
+    atomic_store_explicit((volatile device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed);
     uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u;
     uint param_1 = read_ix;
     uint read_flag_ix = permute_flag_ix(param_1);
-    uint _58 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed);
+    uint _58 = atomic_load_explicit((volatile device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed);
     uint flag = _58;
     threadgroup_barrier(mem_flags::mem_device);
-    uint _62 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_ix].data, memory_order_relaxed);
-    uint data = _62;
+    uint data = data_buf.data[read_ix].data;
     if (flag > data)
     {
         uint _73 = atomic_fetch_add_explicit((device atomic_uint*)&control_buf.failures, 1u, memory_order_relaxed);
diff --git a/tests/shader/gen/message_passing.spv b/tests/shader/gen/message_passing.spv
index e5f56d6..fa7fb3e 100644
--- a/tests/shader/gen/message_passing.spv
+++ b/tests/shader/gen/message_passing.spv
Binary files differ
diff --git a/tests/shader/gen/message_passing_vkmm.spv b/tests/shader/gen/message_passing_vkmm.spv
index 8527c2b..58d3521 100644
--- a/tests/shader/gen/message_passing_vkmm.spv
+++ b/tests/shader/gen/message_passing_vkmm.spv
Binary files differ
diff --git a/tests/shader/gen/prefix.dxil b/tests/shader/gen/prefix.dxil
index 34f3d6a..12d0d50 100644
--- a/tests/shader/gen/prefix.dxil
+++ b/tests/shader/gen/prefix.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix.hlsl b/tests/shader/gen/prefix.hlsl
index 3af5a96..ba76110 100644
--- a/tests/shader/gen/prefix.hlsl
+++ b/tests/shader/gen/prefix.hlsl
@@ -12,11 +12,11 @@
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-static const Monoid _185 = { 0u };
+static const Monoid _183 = { 0u };
 
 globallycoherent RWByteAddressBuffer _43 : register(u2);
 ByteAddressBuffer _67 : register(t0);
-RWByteAddressBuffer _372 : register(u1);
+RWByteAddressBuffer _367 : register(u1);
 
 static uint3 gl_LocalInvocationID;
 struct SPIRV_Cross_Input
@@ -76,10 +76,12 @@
     }
     if (gl_LocalInvocationID.x == 511u)
     {
-        _43.Store(part_ix * 12 + 8, agg.element);
+        uint _378;
+        _43.InterlockedExchange(part_ix * 12 + 8, agg.element, _378);
         if (part_ix == 0u)
         {
-            _43.Store(12, agg.element);
+            uint _379;
+            _43.InterlockedExchange(12, agg.element, _379);
         }
     }
     DeviceMemoryBarrier();
@@ -92,12 +94,11 @@
         }
         _43.Store(part_ix * 12 + 4, flag);
     }
-    Monoid exclusive = _185;
+    Monoid exclusive = _183;
     if (part_ix != 0u)
     {
         uint look_back_ix = part_ix - 1u;
         uint their_ix = 0u;
-        Monoid their_prefix;
         Monoid their_agg;
         Monoid m;
         while (true)
@@ -113,9 +114,10 @@
             {
                 if (gl_LocalInvocationID.x == 511u)
                 {
-                    Monoid _223;
-                    _223.element = _43.Load(look_back_ix * 12 + 12);
-                    their_prefix.element = _223.element;
+                    uint _221;
+                    _43.InterlockedAdd(look_back_ix * 12 + 12, 0, _221);
+                    Monoid _222 = { _221 };
+                    Monoid their_prefix = _222;
                     Monoid param_4 = their_prefix;
                     Monoid param_5 = exclusive;
                     exclusive = combine_monoid(param_4, param_5);
@@ -128,9 +130,9 @@
                 {
                     if (gl_LocalInvocationID.x == 511u)
                     {
-                        Monoid _245;
-                        _245.element = _43.Load(look_back_ix * 12 + 8);
-                        their_agg.element = _245.element;
+                        uint _242;
+                        _43.InterlockedAdd(look_back_ix * 12 + 8, 0, _242);
+                        their_agg.element = _242;
                         Monoid param_6 = their_agg;
                         Monoid param_7 = exclusive;
                         exclusive = combine_monoid(param_6, param_7);
@@ -142,9 +144,9 @@
             }
             if (gl_LocalInvocationID.x == 511u)
             {
-                Monoid _267;
-                _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
-                m.element = _267.element;
+                Monoid _263;
+                _263.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
+                m.element = _263.element;
                 if (their_ix == 0u)
                 {
                     their_agg = m;
@@ -185,7 +187,8 @@
             Monoid param_13 = agg;
             Monoid inclusive_prefix = combine_monoid(param_12, param_13);
             sh_prefix = exclusive;
-            _43.Store(part_ix * 12 + 12, inclusive_prefix.element);
+            uint _380;
+            _43.InterlockedExchange(part_ix * 12 + 12, inclusive_prefix.element, _380);
         }
         DeviceMemoryBarrier();
         if (gl_LocalInvocationID.x == 511u)
@@ -211,7 +214,7 @@
         Monoid param_16 = row;
         Monoid param_17 = local[i_2];
         Monoid m_1 = combine_monoid(param_16, param_17);
-        _372.Store((ix + i_2) * 4 + 0, m_1.element);
+        _367.Store((ix + i_2) * 4 + 0, m_1.element);
     }
 }
 
diff --git a/tests/shader/gen/prefix.msl b/tests/shader/gen/prefix.msl
index 8e402a9..aaa7158 100644
--- a/tests/shader/gen/prefix.msl
+++ b/tests/shader/gen/prefix.msl
@@ -87,7 +87,7 @@
     return Monoid{ a.element + b.element };
 }
 
-kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _367 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_part_ix;
     threadgroup Monoid sh_scratch[512];
@@ -127,10 +127,10 @@
     }
     if (gl_LocalInvocationID.x == 511u)
     {
-        _43.state[part_ix].aggregate.element = agg.element;
+        atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].aggregate.element, agg.element, memory_order_relaxed);
         if (part_ix == 0u)
         {
-            _43.state[0].prefix.element = agg.element;
+            atomic_store_explicit((volatile device atomic_uint*)&_43.state[0].prefix.element, agg.element, memory_order_relaxed);
         }
     }
     threadgroup_barrier(mem_flags::mem_device);
@@ -148,7 +148,6 @@
     {
         uint look_back_ix = part_ix - 1u;
         uint their_ix = 0u;
-        Monoid their_prefix;
         Monoid their_agg;
         Monoid m;
         while (true)
@@ -164,7 +163,8 @@
             {
                 if (gl_LocalInvocationID.x == 511u)
                 {
-                    their_prefix.element = _43.state[look_back_ix].prefix.element;
+                    uint _221 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].prefix.element, memory_order_relaxed);
+                    Monoid their_prefix = Monoid{ _221 };
                     Monoid param_4 = their_prefix;
                     Monoid param_5 = exclusive;
                     exclusive = combine_monoid(param_4, param_5);
@@ -177,7 +177,8 @@
                 {
                     if (gl_LocalInvocationID.x == 511u)
                     {
-                        their_agg.element = _43.state[look_back_ix].aggregate.element;
+                        uint _242 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].aggregate.element, memory_order_relaxed);
+                        their_agg.element = _242;
                         Monoid param_6 = their_agg;
                         Monoid param_7 = exclusive;
                         exclusive = combine_monoid(param_6, param_7);
@@ -230,7 +231,7 @@
             Monoid param_13 = agg;
             Monoid inclusive_prefix = combine_monoid(param_12, param_13);
             sh_prefix = exclusive;
-            _43.state[part_ix].prefix.element = inclusive_prefix.element;
+            atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].prefix.element, inclusive_prefix.element, memory_order_relaxed);
         }
         threadgroup_barrier(mem_flags::mem_device);
         if (gl_LocalInvocationID.x == 511u)
@@ -256,7 +257,7 @@
         Monoid param_16 = row;
         Monoid param_17 = local[i_2];
         Monoid m_1 = combine_monoid(param_16, param_17);
-        _372.outbuf[ix + i_2].element = m_1.element;
+        _367.outbuf[ix + i_2].element = m_1.element;
     }
 }
 
diff --git a/tests/shader/gen/prefix.spv b/tests/shader/gen/prefix.spv
index d2c1aad..0da9ed8 100644
--- a/tests/shader/gen/prefix.spv
+++ b/tests/shader/gen/prefix.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_atomic.dxil b/tests/shader/gen/prefix_atomic.dxil
index 68f47e5..80c05a3 100644
--- a/tests/shader/gen/prefix_atomic.dxil
+++ b/tests/shader/gen/prefix_atomic.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix_atomic.hlsl b/tests/shader/gen/prefix_atomic.hlsl
index 10f7081..6105c01 100644
--- a/tests/shader/gen/prefix_atomic.hlsl
+++ b/tests/shader/gen/prefix_atomic.hlsl
@@ -12,11 +12,11 @@
 
 static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-static const Monoid _185 = { 0u };
+static const Monoid _183 = { 0u };
 
 globallycoherent RWByteAddressBuffer _43 : register(u2);
 ByteAddressBuffer _67 : register(t0);
-RWByteAddressBuffer _372 : register(u1);
+RWByteAddressBuffer _367 : register(u1);
 
 static uint3 gl_LocalInvocationID;
 struct SPIRV_Cross_Input
@@ -76,10 +76,12 @@
     }
     if (gl_LocalInvocationID.x == 511u)
     {
-        _43.Store(part_ix * 12 + 8, agg.element);
+        uint _378;
+        _43.InterlockedExchange(part_ix * 12 + 8, agg.element, _378);
         if (part_ix == 0u)
         {
-            _43.Store(12, agg.element);
+            uint _379;
+            _43.InterlockedExchange(12, agg.element, _379);
         }
     }
     DeviceMemoryBarrier();
@@ -90,24 +92,23 @@
         {
             flag = 2u;
         }
-        uint _383;
-        _43.InterlockedExchange(part_ix * 12 + 4, flag, _383);
+        uint _380;
+        _43.InterlockedExchange(part_ix * 12 + 4, flag, _380);
     }
-    Monoid exclusive = _185;
+    Monoid exclusive = _183;
     if (part_ix != 0u)
     {
         uint look_back_ix = part_ix - 1u;
         uint their_ix = 0u;
-        Monoid their_prefix;
         Monoid their_agg;
         Monoid m;
         while (true)
         {
             if (gl_LocalInvocationID.x == 511u)
             {
-                uint _208;
-                _43.InterlockedAdd(look_back_ix * 12 + 4, 0, _208);
-                sh_flag = _208;
+                uint _206;
+                _43.InterlockedAdd(look_back_ix * 12 + 4, 0, _206);
+                sh_flag = _206;
             }
             GroupMemoryBarrierWithGroupSync();
             DeviceMemoryBarrier();
@@ -116,9 +117,10 @@
             {
                 if (gl_LocalInvocationID.x == 511u)
                 {
-                    Monoid _223;
-                    _223.element = _43.Load(look_back_ix * 12 + 12);
-                    their_prefix.element = _223.element;
+                    uint _221;
+                    _43.InterlockedAdd(look_back_ix * 12 + 12, 0, _221);
+                    Monoid _222 = { _221 };
+                    Monoid their_prefix = _222;
                     Monoid param_4 = their_prefix;
                     Monoid param_5 = exclusive;
                     exclusive = combine_monoid(param_4, param_5);
@@ -131,9 +133,9 @@
                 {
                     if (gl_LocalInvocationID.x == 511u)
                     {
-                        Monoid _245;
-                        _245.element = _43.Load(look_back_ix * 12 + 8);
-                        their_agg.element = _245.element;
+                        uint _242;
+                        _43.InterlockedAdd(look_back_ix * 12 + 8, 0, _242);
+                        their_agg.element = _242;
                         Monoid param_6 = their_agg;
                         Monoid param_7 = exclusive;
                         exclusive = combine_monoid(param_6, param_7);
@@ -145,9 +147,9 @@
             }
             if (gl_LocalInvocationID.x == 511u)
             {
-                Monoid _267;
-                _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
-                m.element = _267.element;
+                Monoid _263;
+                _263.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
+                m.element = _263.element;
                 if (their_ix == 0u)
                 {
                     their_agg = m;
@@ -188,13 +190,14 @@
             Monoid param_13 = agg;
             Monoid inclusive_prefix = combine_monoid(param_12, param_13);
             sh_prefix = exclusive;
-            _43.Store(part_ix * 12 + 12, inclusive_prefix.element);
+            uint _381;
+            _43.InterlockedExchange(part_ix * 12 + 12, inclusive_prefix.element, _381);
         }
         DeviceMemoryBarrier();
         if (gl_LocalInvocationID.x == 511u)
         {
-            uint _384;
-            _43.InterlockedExchange(part_ix * 12 + 4, 2u, _384);
+            uint _382;
+            _43.InterlockedExchange(part_ix * 12 + 4, 2u, _382);
         }
     }
     GroupMemoryBarrierWithGroupSync();
@@ -215,7 +218,7 @@
         Monoid param_16 = row;
         Monoid param_17 = local[i_2];
         Monoid m_1 = combine_monoid(param_16, param_17);
-        _372.Store((ix + i_2) * 4 + 0, m_1.element);
+        _367.Store((ix + i_2) * 4 + 0, m_1.element);
     }
 }
 
diff --git a/tests/shader/gen/prefix_atomic.msl b/tests/shader/gen/prefix_atomic.msl
index 6d7d155..d5de44a 100644
--- a/tests/shader/gen/prefix_atomic.msl
+++ b/tests/shader/gen/prefix_atomic.msl
@@ -87,7 +87,7 @@
     return Monoid{ a.element + b.element };
 }
 
-kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _367 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_part_ix;
     threadgroup Monoid sh_scratch[512];
@@ -127,10 +127,10 @@
     }
     if (gl_LocalInvocationID.x == 511u)
     {
-        _43.state[part_ix].aggregate.element = agg.element;
+        atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].aggregate.element, agg.element, memory_order_relaxed);
         if (part_ix == 0u)
         {
-            _43.state[0].prefix.element = agg.element;
+            atomic_store_explicit((volatile device atomic_uint*)&_43.state[0].prefix.element, agg.element, memory_order_relaxed);
         }
     }
     threadgroup_barrier(mem_flags::mem_device);
@@ -148,15 +148,14 @@
     {
         uint look_back_ix = part_ix - 1u;
         uint their_ix = 0u;
-        Monoid their_prefix;
         Monoid their_agg;
         Monoid m;
         while (true)
         {
             if (gl_LocalInvocationID.x == 511u)
             {
-                uint _208 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].flag, memory_order_relaxed);
-                sh_flag = _208;
+                uint _206 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].flag, memory_order_relaxed);
+                sh_flag = _206;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             threadgroup_barrier(mem_flags::mem_device);
@@ -165,7 +164,8 @@
             {
                 if (gl_LocalInvocationID.x == 511u)
                 {
-                    their_prefix.element = _43.state[look_back_ix].prefix.element;
+                    uint _221 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].prefix.element, memory_order_relaxed);
+                    Monoid their_prefix = Monoid{ _221 };
                     Monoid param_4 = their_prefix;
                     Monoid param_5 = exclusive;
                     exclusive = combine_monoid(param_4, param_5);
@@ -178,7 +178,8 @@
                 {
                     if (gl_LocalInvocationID.x == 511u)
                     {
-                        their_agg.element = _43.state[look_back_ix].aggregate.element;
+                        uint _242 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].aggregate.element, memory_order_relaxed);
+                        their_agg.element = _242;
                         Monoid param_6 = their_agg;
                         Monoid param_7 = exclusive;
                         exclusive = combine_monoid(param_6, param_7);
@@ -231,7 +232,7 @@
             Monoid param_13 = agg;
             Monoid inclusive_prefix = combine_monoid(param_12, param_13);
             sh_prefix = exclusive;
-            _43.state[part_ix].prefix.element = inclusive_prefix.element;
+            atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].prefix.element, inclusive_prefix.element, memory_order_relaxed);
         }
         threadgroup_barrier(mem_flags::mem_device);
         if (gl_LocalInvocationID.x == 511u)
@@ -257,7 +258,7 @@
         Monoid param_16 = row;
         Monoid param_17 = local[i_2];
         Monoid m_1 = combine_monoid(param_16, param_17);
-        _372.outbuf[ix + i_2].element = m_1.element;
+        _367.outbuf[ix + i_2].element = m_1.element;
     }
 }
 
diff --git a/tests/shader/gen/prefix_atomic.spv b/tests/shader/gen/prefix_atomic.spv
index acca545..1b31f67 100644
--- a/tests/shader/gen/prefix_atomic.spv
+++ b/tests/shader/gen/prefix_atomic.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_vkmm.spv b/tests/shader/gen/prefix_vkmm.spv
index 0b8d475..c8b9ea2 100644
--- a/tests/shader/gen/prefix_vkmm.spv
+++ b/tests/shader/gen/prefix_vkmm.spv
Binary files differ
diff --git a/tests/shader/message_passing.comp b/tests/shader/message_passing.comp
index e5e53b6..521dedd 100644
--- a/tests/shader/message_passing.comp
+++ b/tests/shader/message_passing.comp
@@ -23,7 +23,7 @@
     uint flag;
 };
 
-layout(binding = 0) buffer DataBuf
+layout(binding = 0) coherent buffer DataBuf
 {
     Element data[];
 } data_buf;
@@ -40,7 +40,7 @@
 
 void main()
 {
-    atomicStore(data_buf.data[gl_GlobalInvocationID.x].data, 1u, gl_ScopeDevice, 0, 0);
+    data_buf.data[gl_GlobalInvocationID.x].data = 1u;
 #ifndef VKMM
     memoryBarrierBuffer();
 #endif
@@ -52,7 +52,7 @@
 #ifndef VKMM
     memoryBarrierBuffer();
 #endif
-    uint data = atomicLoad(data_buf.data[read_ix].data, gl_ScopeDevice, 0, 0);
+    uint data = data_buf.data[read_ix].data;
     if (flag > data)
     {
         atomicAdd(control_buf.failures, 1u);
diff --git a/tests/shader/prefix.comp b/tests/shader/prefix.comp
index a6a0d57..0fc9909 100644
--- a/tests/shader/prefix.comp
+++ b/tests/shader/prefix.comp
@@ -99,9 +99,9 @@
 
     // Publish aggregate for this partition
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-        state[part_ix].aggregate = agg;
+        atomicStore(state[part_ix].aggregate.element, agg.element, gl_ScopeDevice, 0, 0);
         if (part_ix == 0) {
-            state[0].prefix = agg;
+            atomicStore(state[0].prefix.element, agg.element, gl_ScopeDevice, 0, 0);
         }
     }
     // Write flag with release semantics; this is done portably with a barrier.
@@ -147,13 +147,13 @@
 
             if (flag == FLAG_PREFIX_READY) {
                 if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                    Monoid their_prefix = state[look_back_ix].prefix;
+                    Monoid their_prefix = Monoid(atomicLoad(state[look_back_ix].prefix.element, gl_ScopeDevice, 0, 0));
                     exclusive = combine_monoid(their_prefix, exclusive);
                 }
                 break;
             } else if (flag == FLAG_AGGREGATE_READY) {
                 if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                    their_agg = state[look_back_ix].aggregate;
+                    their_agg.element = atomicLoad(state[look_back_ix].aggregate.element, gl_ScopeDevice, 0, 0);
                     exclusive = combine_monoid(their_agg, exclusive);
                 }
                 look_back_ix--;
@@ -193,7 +193,7 @@
         if (gl_LocalInvocationID.x == WG_SIZE - 1) {
             Monoid inclusive_prefix = combine_monoid(exclusive, agg);
             sh_prefix = exclusive;
-            state[part_ix].prefix = inclusive_prefix;
+            atomicStore(state[part_ix].prefix.element, inclusive_prefix.element, gl_ScopeDevice, 0, 0);
         }
 #ifndef VKMM
         memoryBarrierBuffer();
diff --git a/tests/src/coherence.rs b/tests/src/coherence.rs
index 8a0bf97..28ee79c 100644
--- a/tests/src/coherence.rs
+++ b/tests/src/coherence.rs
@@ -72,7 +72,13 @@
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
         }
-        total_elapsed += runner.submit(commands);
+        let start_clock = std::time::Instant::now();
+        let mut elapsed = runner.submit(commands);
+        // Work around lack of timer queries on Metal
+        if runner.backend_type() == BackendType::Metal {
+            elapsed = start_clock.elapsed().as_secs_f64();
+        }
+        total_elapsed += elapsed;
         if i == 0 {
             let mut dst: Vec<u32> = Default::default();
             out_buf.read(&mut dst);