Use relaxed atomics for status counters

This change has no performance impact on AMD 5700 XT, and may be less compatible, since atomicLoad/atomicStore depend on the GL_KHR_memory_scope_semantics extension. It's probably worth experimenting with both variants.
diff --git a/tests/shader/prefix_nobar.comp b/tests/shader/prefix_nobar.comp index 7758e72..ee96ff8 100644 --- a/tests/shader/prefix_nobar.comp +++ b/tests/shader/prefix_nobar.comp
@@ -4,7 +4,7 @@ #version 450 -//#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_memory_scope_semantics : enable #define N_ROWS 16 #define LG_WG_SIZE 9 @@ -94,10 +94,8 @@ // Publish aggregate for this partition if (gl_LocalInvocationID.x == WG_SIZE - 1) { uvec2 split_agg = split_monoid(agg); - atomicExchange(state[part_ix].local_0, split_agg.x | FLAG); - atomicExchange(state[part_ix].local_1, split_agg.y | FLAG); - //atomicStore(state[part_ix].local_0, split_agg.x | FLAG, 0, 0); - //atomicStore(state[part_ix].local_1, split_agg.y | FLAG, 0, 0); + atomicStore(state[part_ix].local_0, split_agg.x | FLAG, gl_ScopeDevice, 0, 0); + atomicStore(state[part_ix].local_1, split_agg.y | FLAG, gl_ScopeDevice, 0, 0); } Monoid exclusive = Monoid(0); @@ -110,8 +108,8 @@ while (true) { // Read local status counters if (gl_LocalInvocationID.x == WG_SIZE - 1) { - uint sc_0 = atomicOr(state[look_back_ix].local_0, 0); - uint sc_1 = atomicOr(state[look_back_ix].local_1, 0); + uint sc_0 = atomicLoad(state[look_back_ix].local_0, gl_ScopeDevice, 0, 0); + uint sc_1 = atomicLoad(state[look_back_ix].local_1, gl_ScopeDevice, 0, 0); sh_local_split = uvec2(sc_0, sc_1); } barrier(); @@ -121,8 +119,8 @@ if (look_back_ix != 0) { // Read global status counters if (gl_LocalInvocationID.x == WG_SIZE - 1) { - uint sc_0 = atomicOr(state[look_back_ix].global_0, 0); - uint sc_1 = atomicOr(state[look_back_ix].global_1, 0); + uint sc_0 = atomicLoad(state[look_back_ix].global_0, gl_ScopeDevice, 0, 0); + uint sc_1 = atomicLoad(state[look_back_ix].global_1, gl_ScopeDevice, 0, 0); sh_global_split = uvec2(sc_0, sc_1); } barrier(); // broadcast global split & protect local from WAR @@ -173,8 +171,8 @@ Monoid inclusive_prefix = combine_monoid(exclusive, agg); sh_prefix = exclusive; uvec2 split_inclusive = split_monoid(inclusive_prefix); - atomicExchange(state[part_ix].global_0, split_inclusive.x | FLAG); - atomicExchange(state[part_ix].global_1, split_inclusive.y | FLAG); + 
atomicStore(state[part_ix].global_0, split_inclusive.x | FLAG, gl_ScopeDevice, 0, 0); + atomicStore(state[part_ix].global_1, split_inclusive.y | FLAG, gl_ScopeDevice, 0, 0); } } barrier(); // broadcast sh_prefix and protect sh_scratch from RAW