Set working group size to 512, k = 1 Make working group uniform for stack and bbox tests. Also fixes the write-after-read hazard in stack. This should be in the upstream branch, but oh well.
diff --git a/tests/shader/gen/stack_leaf.dxil b/tests/shader/gen/stack_leaf.dxil index 75423cd..14658c3 100644 --- a/tests/shader/gen/stack_leaf.dxil +++ b/tests/shader/gen/stack_leaf.dxil Binary files differ
diff --git a/tests/shader/gen/stack_leaf.hlsl b/tests/shader/gen/stack_leaf.hlsl index fb56399..4969add 100644 --- a/tests/shader/gen/stack_leaf.hlsl +++ b/tests/shader/gen/stack_leaf.hlsl
@@ -4,14 +4,14 @@ uint b; }; -static const uint3 gl_WorkGroupSize = uint3(64u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); -static const Bic _237 = { 0u, 0u }; +static const Bic _157 = { 0u, 0u }; -ByteAddressBuffer _250 : register(t1); -ByteAddressBuffer _512 : register(t2); -ByteAddressBuffer _593 : register(t0); -RWByteAddressBuffer _751 : register(u3); +ByteAddressBuffer _170 : register(t1); +ByteAddressBuffer _298 : register(t2); +ByteAddressBuffer _314 : register(t0); +RWByteAddressBuffer _399 : register(u3); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -23,40 +23,23 @@ uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared Bic sh_bic[126]; -groupshared uint sh_bitmaps[64]; +groupshared Bic sh_bic[1022]; groupshared uint sh_stack[512]; -groupshared uint sh_link[64]; -groupshared uint sh_next[64]; Bic bic_combine(Bic x, Bic y) { uint m = min(x.b, y.a); - Bic _47 = { (x.a + y.a) - m, (x.b + y.b) - m }; - return _47; -} - -uint search_bit_set(uint bitmask, uint ix) -{ - uint result = 0u; - for (uint j = 0u; j < 5u; j++) - { - uint _step = 1u << (4u - j); - if (uint(int(countbits(bitmask & ((1u << (result + _step)) - 1u)))) <= ix) - { - result += _step; - } - } - return result; + Bic _42 = { (x.a + y.a) - m, (x.b + y.b) - m }; + return _42; } uint search_link(inout Bic bic) { uint ix = gl_LocalInvocationID.x; uint j = 0u; - while (j < 6u) + while (j < 9u) { - uint base = 128u - (2u << (6u - j)); + uint base = 1024u - (2u << (9u - j)); if (((ix >> j) & 1u) != 0u) { Bic param = sh_bic[(base + (ix >> j)) - 1u]; @@ -76,7 +59,7 @@ while (j > 0u) { j--; - uint base_1 = 128u - (2u << (6u - j)); + uint base_1 = 1024u - (2u << (9u - j)); Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u]; Bic param_3 = bic; Bic test_1 = bic_combine(param_2, param_3); @@ -87,55 +70,41 @@ } } } - if (ix > 0u) - { - ix--; - Bic param_4 = sh_bic[ix]; - Bic param_5 = bic; - Bic test_2 = bic_combine(param_4, param_5); - uint param_6 = sh_bitmaps[ix]; - uint param_7 = test_2.b - 1u; - uint ix_in_chunk = search_bit_set(param_6, param_7); - return (ix * 8u) + ix_in_chunk; - } - else - { - return 4294967295u - bic.a; - } + return ix; } void comp_main() { uint th = gl_LocalInvocationID.x; - Bic bic = _237; - if ((th * 8u) < gl_WorkGroupID.x) + Bic bic = _157; + if ((th * 1u) < gl_WorkGroupID.x) { - Bic _255; - _255.a = _250.Load((th * 8u) * 8 + 0); - _255.b = _250.Load((th * 8u) * 8 + 4); - bic.a = _255.a; - bic.b = _255.b; + Bic _175; + _175.a = _170.Load((th * 1u) * 8 + 0); + _175.b = _170.Load((th * 1u) * 8 + 4); + bic.a = _175.a; + bic.b = _175.b; } Bic other; - for (uint i = 1u; i < 8u; i++) + for (uint i = 1u; i < 1u; i++) { - if (((th * 8u) + i) < gl_WorkGroupID.x) + if (((th * 1u) + i) < gl_WorkGroupID.x) { - Bic _283; - _283.a = _250.Load(((th * 8u) + i) * 8 + 0); - _283.b = _250.Load(((th * 8u) + i) * 8 + 4); - other.a = _283.a; - other.b = _283.b; + Bic _203; + _203.a = _170.Load(((th * 1u) + i) * 8 + 0); + _203.b = _170.Load(((th * 1u) + i) * 8 + 4); + other.a = _203.a; + other.b = _203.b; Bic param = bic; Bic param_1 = other; bic = bic_combine(param, param_1); } } sh_bic[th] = bic; - for (uint i_1 = 0u; i_1 < 6u; i_1++) + for (uint i_1 = 0u; i_1 < 9u; i_1++) { GroupMemoryBarrierWithGroupSync(); - if ((th + (1u << i_1)) < 64u) + if ((th + (1u << i_1)) < 512u) { Bic other_1 = sh_bic[th + (1u << i_1)]; Bic param_2 = bic; @@ -146,212 +115,58 @@ sh_bic[th] = bic; } GroupMemoryBarrierWithGroupSync(); - if (th == 63u) - { - bic = _237; - } - else - { - bic = sh_bic[th + 1u]; - } - uint last_b = bic.b; - uint bitmap = 0u; - Bic param_4; - for (uint i_2 = 0u; i_2 < 8u; i_2++) - { - uint this_ix = (((th * 8u) + 8u) - 1u) - i_2; - if (this_ix < gl_WorkGroupID.x) - { - Bic _369; - _369.a = _250.Load(this_ix * 8 + 0); - _369.b = _250.Load(this_ix * 8 + 4); - param_4.a = _369.a; - param_4.b = _369.b; - Bic param_5 = bic; - bic = bic_combine(param_4, param_5); - } - sh_stack[this_ix] = bic.b; - if (bic.b > last_b) - { - bitmap |= (1u << (7u - i_2)); - } - last_b = bic.b; - } - sh_bitmaps[th] = bitmap; - uint link = 0u; - if (bitmap != 0u) - { - link = (th * 8u) + uint(int(firstbithigh(bitmap))); - } - sh_link[th] = link; - for (uint i_3 = 0u; i_3 < 6u; i_3++) - { - GroupMemoryBarrierWithGroupSync(); - if (th >= (1u << i_3)) - { - link = max(link, sh_link[th - (1u << i_3)]); - } - GroupMemoryBarrierWithGroupSync(); - sh_link[th] = link; - } - GroupMemoryBarrierWithGroupSync(); - uint sp = 504u - (th * 8u); + uint sp = 511u - th; uint ix = 0u; - for (uint i_4 = 0u; i_4 < 9u; i_4++) + for (uint i_2 = 0u; i_2 < 9u; i_2++) { - uint probe = ix + (256u >> i_4); - if (sp < sh_stack[probe]) + uint probe = ix + (256u >> i_2); + if (sp < sh_bic[probe].b) { ix = probe; } } - uint b = sh_stack[ix]; - uint local_stack[8]; - for (uint i_5 = 0u; i_5 < 8u; i_5++) + uint b = sh_bic[ix].b; + if (sp < b) { - local_stack[i_5] = 0u; - } - uint i_6 = 0u; - while ((sp + i_6) < b) - { - local_stack[7u - i_6] = _512.Load(((((ix * 512u) + b) - (sp + i_6)) - 1u) * 4 + 0); - i_6++; - if (i_6 == 8u) - { - break; - } - if ((sp + i_6) == b) - { - uint bits = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u); - if (bits == 0u) - { - ix = sh_link[max((ix / 8u), 1u) - 1u]; - } - else - { - ix = (ix & 4294967288u) + uint(int(firstbithigh(bits))); - } - b = sh_stack[ix]; - } + sh_stack[th] = _298.Load(((((ix * 512u) + b) - sp) - 1u) * 4 + 0); } GroupMemoryBarrierWithGroupSync(); - for (uint i_7 = 0u; i_7 < 8u; i_7++) - { - sh_stack[(th * 8u) + i_7] = local_stack[i_7]; - } - uint inp = _593.Load((((gl_GlobalInvocationID.x * 8u) + 8u) - 1u) * 4 + 0); - Bic _605 = { 1u - inp, inp }; - bic = _605; - bitmap = inp << uint(7); - for (uint i_8 = 7u; i_8 > 0u; i_8--) - { - inp = _593.Load((((gl_GlobalInvocationID.x * 8u) + i_8) - 1u) * 4 + 0); - bool _626 = inp == 1u; - bool _632; - if (_626) - { - _632 = bic.a == 0u; - } - else - { - _632 = _626; - } - if (_632) - { - bitmap |= (1u << (i_8 - 1u)); - } - Bic _644 = { 1u - inp, inp }; - Bic other_2 = _644; - Bic param_6 = other_2; - Bic param_7 = bic; - bic = bic_combine(param_6, param_7); - } - sh_bitmaps[th] = bitmap; + uint inp = _314.Load((((gl_GlobalInvocationID.x * 1u) + 1u) - 1u) * 4 + 0); + Bic _326 = { 1u - inp, inp }; + bic = _326; sh_bic[th] = bic; uint inbase = 0u; - for (uint i_9 = 0u; i_9 < 5u; i_9++) + for (uint i_3 = 0u; i_3 < 8u; i_3++) { - uint outbase = 128u - (1u << (6u - i_9)); + uint outbase = 1024u - (1u << (9u - i_3)); GroupMemoryBarrierWithGroupSync(); - if (th < (1u << (5u - i_9))) + if (th < (1u << (8u - i_3))) { - Bic param_8 = sh_bic[inbase + (th * 2u)]; - Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u]; - sh_bic[outbase + th] = bic_combine(param_8, param_9); + Bic param_4 = sh_bic[inbase + (th * 2u)]; + Bic param_5 = sh_bic[(inbase + (th * 2u)) + 1u]; + sh_bic[outbase + th] = bic_combine(param_4, param_5); } inbase = outbase; } GroupMemoryBarrierWithGroupSync(); - bic.b = 0u; - Bic param_10 = bic; - uint _706 = search_link(param_10); - bic = param_10; - sh_link[th] = _706; - bic = _237; - Bic param_11 = bic; - uint _711 = search_link(param_11); - bic = param_11; - ix = _711; - uint loc_sp = 0u; + bic = _157; + Bic param_6 = bic; + uint _377 = search_link(param_6); + bic = param_6; + ix = _377; uint outp; - uint loc_stack[8]; - for (uint i_10 = 0u; i_10 < 8u; i_10++) + if (ix > 0u) { - if (loc_sp > 0u) - { - outp = loc_stack[loc_sp - 1u]; - } - else - { - if (int(ix) >= 0) - { - outp = (gl_WorkGroupID.x * 512u) + ix; - } - else - { - outp = sh_stack[512u + ix]; - } - } - _751.Store(((gl_GlobalInvocationID.x * 8u) + i_10) * 4 + 0, outp); - inp = _593.Load(((gl_GlobalInvocationID.x * 8u) + i_10) * 4 + 0); - if (inp == 1u) - { - loc_stack[loc_sp] = (gl_GlobalInvocationID.x * 8u) + i_10; - loc_sp++; - } - else - { - if (inp == 0u) - { - if (loc_sp > 0u) - { - loc_sp--; - } - else - { - if (int(ix) >= 0) - { - uint bits_1 = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u); - if (bits_1 == 0u) - { - ix = sh_link[ix / 8u]; - } - else - { - ix = (ix & 4294967288u) + uint(int(firstbithigh(bits_1))); - } - } - else - { - ix--; - } - } - } - } + outp = ((gl_WorkGroupID.x * 512u) + ix) - 1u; } + else + { + outp = sh_stack[511u - bic.a]; + } + _399.Store(gl_GlobalInvocationID.x * 4 + 0, outp); } -[numthreads(64, 1, 1)] +[numthreads(512, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID;
diff --git a/tests/shader/gen/stack_leaf.msl b/tests/shader/gen/stack_leaf.msl index cbf72a3..13afb0a 100644 --- a/tests/shader/gen/stack_leaf.msl +++ b/tests/shader/gen/stack_leaf.msl
@@ -1,56 +1,10 @@ #pragma clang diagnostic ignored "-Wmissing-prototypes" -#pragma clang diagnostic ignored "-Wmissing-braces" #include <metal_stdlib> #include <simd/simd.h> using namespace metal; -template<typename T, size_t Num> -struct spvUnsafeArray -{ - T elements[Num ? Num : 1]; - - thread T& operator [] (size_t pos) thread - { - return elements[pos]; - } - constexpr const thread T& operator [] (size_t pos) const thread - { - return elements[pos]; - } - - device T& operator [] (size_t pos) device - { - return elements[pos]; - } - constexpr const device T& operator [] (size_t pos) const device - { - return elements[pos]; - } - - constexpr const constant T& operator [] (size_t pos) const constant - { - return elements[pos]; - } - - threadgroup T& operator [] (size_t pos) threadgroup - { - return elements[pos]; - } - constexpr const threadgroup T& operator [] (size_t pos) const threadgroup - { - return elements[pos]; - } -}; - -// Implementation of the unsigned GLSL findMSB() function -template<typename T> -inline T spvFindUMSB(T x) -{ - return select(clz(T(0)) - (clz(x) + T(1)), T(-1), x == T(0)); -} - struct Bic { uint a; @@ -83,7 +37,7 @@ uint outbuf[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); static inline __attribute__((always_inline)) Bic bic_combine(thread const Bic& x, thread const Bic& y) @@ -93,28 +47,13 @@ } static inline __attribute__((always_inline)) -uint search_bit_set(thread const uint& bitmask, thread const uint& ix) -{ - uint result = 0u; - for (uint j = 0u; j < 5u; j++) - { - uint _step = 1u << (4u - j); - if (uint(int(popcount(bitmask & ((1u << (result + _step)) - 1u)))) <= ix) - { - result += _step; - } - } - return result; -} - -static inline __attribute__((always_inline)) -uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[126], threadgroup uint (&sh_bitmaps)[64]) +uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[1022]) { uint ix = gl_LocalInvocationID.x; uint j = 0u; - while (j < 6u) + while (j < 9u) { - uint base = 128u - (2u << (6u - j)); + uint base = 1024u - (2u << (9u - j)); if (((ix >> j) & 1u) != 0u) { Bic param = sh_bic[(base + (ix >> j)) - 1u]; @@ -134,7 +73,7 @@ while (j > 0u) { j--; - uint base_1 = 128u - (2u << (6u - j)); + uint base_1 = 1024u - (2u << (9u - j)); Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u]; Bic param_3 = bic; Bic test_1 = bic_combine(param_2, param_3); @@ -145,56 +84,39 @@ } } } - if (ix > 0u) - { - ix--; - Bic param_4 = sh_bic[ix]; - Bic param_5 = bic; - Bic test_2 = bic_combine(param_4, param_5); - uint param_6 = sh_bitmaps[ix]; - uint param_7 = test_2.b - 1u; - uint ix_in_chunk = search_bit_set(param_6, param_7); - return (ix * 8u) + ix_in_chunk; - } - else - { - return 4294967295u - bic.a; - } + return ix; } -kernel void main0(const device InBuf& _593 [[buffer(0)]], const device BicBuf& _250 [[buffer(1)]], const device StackBuf& _512 [[buffer(2)]], device OutBuf& _751 [[buffer(3)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +kernel void main0(const device InBuf& _314 [[buffer(0)]], const device BicBuf& _170 [[buffer(1)]], const device StackBuf& _298 [[buffer(2)]], device OutBuf& _399 [[buffer(3)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) { - threadgroup Bic sh_bic[126]; - threadgroup uint sh_bitmaps[64]; + threadgroup Bic sh_bic[1022]; threadgroup uint sh_stack[512]; - threadgroup uint sh_link[64]; - threadgroup uint sh_next[64]; uint th = gl_LocalInvocationID.x; Bic bic = Bic{ 0u, 0u }; - if ((th * 8u) < gl_WorkGroupID.x) + if ((th * 1u) < gl_WorkGroupID.x) { - uint _252 = th * 8u; - bic.a = _250.bicbuf[_252].a; - bic.b = _250.bicbuf[_252].b; + uint _172 = th * 1u; + bic.a = _170.bicbuf[_172].a; + bic.b = _170.bicbuf[_172].b; } Bic other; - for (uint i = 1u; i < 8u; i++) + for (uint i = 1u; i < 1u; i++) { - if (((th * 8u) + i) < gl_WorkGroupID.x) + if (((th * 1u) + i) < gl_WorkGroupID.x) { - uint _281 = (th * 8u) + i; - other.a = _250.bicbuf[_281].a; - other.b = _250.bicbuf[_281].b; + uint _201 = (th * 1u) + i; + other.a = _170.bicbuf[_201].a; + other.b = _170.bicbuf[_201].b; Bic param = bic; Bic param_1 = other; bic = bic_combine(param, param_1); } } sh_bic[th] = bic; - for (uint i_1 = 0u; i_1 < 6u; i_1++) + for (uint i_1 = 0u; i_1 < 9u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); - if ((th + (1u << i_1)) < 64u) + if ((th + (1u << i_1)) < 512u) { Bic other_1 = sh_bic[th + (1u << i_1)]; Bic param_2 = bic; @@ -205,203 +127,53 @@ sh_bic[th] = bic; } threadgroup_barrier(mem_flags::mem_threadgroup); - if (th == 63u) - { - bic = Bic{ 0u, 0u }; - } - else - { - bic = sh_bic[th + 1u]; - } - uint last_b = bic.b; - uint bitmap = 0u; - Bic param_4; - for (uint i_2 = 0u; i_2 < 8u; i_2++) - { - uint this_ix = (((th * 8u) + 8u) - 1u) - i_2; - if (this_ix < gl_WorkGroupID.x) - { - param_4.a = _250.bicbuf[this_ix].a; - param_4.b = _250.bicbuf[this_ix].b; - Bic param_5 = bic; - bic = bic_combine(param_4, param_5); - } - sh_stack[this_ix] = bic.b; - if (bic.b > last_b) - { - bitmap |= (1u << (7u - i_2)); - } - last_b = bic.b; - } - sh_bitmaps[th] = bitmap; - uint link = 0u; - if (bitmap != 0u) - { - link = (th * 8u) + uint(int(spvFindUMSB(bitmap))); - } - sh_link[th] = link; - for (uint i_3 = 0u; i_3 < 6u; i_3++) - { - threadgroup_barrier(mem_flags::mem_threadgroup); - if (th >= (1u << i_3)) - { - link = max(link, sh_link[th - (1u << i_3)]); - } - threadgroup_barrier(mem_flags::mem_threadgroup); - sh_link[th] = link; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - uint sp = 504u - (th * 8u); + uint sp = 511u - th; uint ix = 0u; - for (uint i_4 = 0u; i_4 < 9u; i_4++) + for (uint i_2 = 0u; i_2 < 9u; i_2++) { - uint probe = ix + (256u >> i_4); - if (sp < sh_stack[probe]) + uint probe = ix + (256u >> i_2); + if (sp < sh_bic[probe].b) { ix = probe; } } - uint b = sh_stack[ix]; - spvUnsafeArray<uint, 8> local_stack; - for (uint i_5 = 0u; i_5 < 8u; i_5++) + uint b = sh_bic[ix].b; + if (sp < b) { - local_stack[i_5] = 0u; - } - uint i_6 = 0u; - while ((sp + i_6) < b) - { - local_stack[7u - i_6] = _512.stack[(((ix * 512u) + b) - (sp + i_6)) - 1u]; - i_6++; - if (i_6 == 8u) - { - break; - } - if ((sp + i_6) == b) - { - uint bits = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u); - if (bits == 0u) - { - ix = sh_link[max((ix / 8u), 1u) - 1u]; - } - else - { - ix = (ix & 4294967288u) + uint(int(spvFindUMSB(bits))); - } - b = sh_stack[ix]; - } + sh_stack[th] = _298.stack[(((ix * 512u) + b) - sp) - 1u]; } threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i_7 = 0u; i_7 < 8u; i_7++) - { - sh_stack[(th * 8u) + i_7] = local_stack[i_7]; - } - uint inp = _593.inbuf[((gl_GlobalInvocationID.x * 8u) + 8u) - 1u]; + uint inp = _314.inbuf[((gl_GlobalInvocationID.x * 1u) + 1u) - 1u]; bic = Bic{ 1u - inp, inp }; - bitmap = inp << uint(7); - for (uint i_8 = 7u; i_8 > 0u; i_8--) - { - inp = _593.inbuf[((gl_GlobalInvocationID.x * 8u) + i_8) - 1u]; - bool _626 = inp == 1u; - bool _632; - if (_626) - { - _632 = bic.a == 0u; - } - else - { - _632 = _626; - } - if (_632) - { - bitmap |= (1u << (i_8 - 1u)); - } - Bic other_2 = Bic{ 1u - inp, inp }; - Bic param_6 = other_2; - Bic param_7 = bic; - bic = bic_combine(param_6, param_7); - } - sh_bitmaps[th] = bitmap; sh_bic[th] = bic; uint inbase = 0u; - for (uint i_9 = 0u; i_9 < 5u; i_9++) + for (uint i_3 = 0u; i_3 < 8u; i_3++) { - uint outbase = 128u - (1u << (6u - i_9)); + uint outbase = 1024u - (1u << (9u - i_3)); threadgroup_barrier(mem_flags::mem_threadgroup); - if (th < (1u << (5u - i_9))) + if (th < (1u << (8u - i_3))) { - Bic param_8 = sh_bic[inbase + (th * 2u)]; - Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u]; - sh_bic[outbase + th] = bic_combine(param_8, param_9); + Bic param_4 = sh_bic[inbase + (th * 2u)]; + Bic param_5 = sh_bic[(inbase + (th * 2u)) + 1u]; + sh_bic[outbase + th] = bic_combine(param_4, param_5); } inbase = outbase; } threadgroup_barrier(mem_flags::mem_threadgroup); - bic.b = 0u; - Bic param_10 = bic; - uint _706 = search_link(param_10, gl_LocalInvocationID, sh_bic, sh_bitmaps); - bic = param_10; - sh_link[th] = _706; bic = Bic{ 0u, 0u }; - Bic param_11 = bic; - uint _711 = search_link(param_11, gl_LocalInvocationID, sh_bic, sh_bitmaps); - bic = param_11; - ix = _711; - uint loc_sp = 0u; + Bic param_6 = bic; + uint _377 = search_link(param_6, gl_LocalInvocationID, sh_bic); + bic = param_6; + ix = _377; uint outp; - spvUnsafeArray<uint, 8> loc_stack; - for (uint i_10 = 0u; i_10 < 8u; i_10++) + if (ix > 0u) { - if (loc_sp > 0u) - { - outp = loc_stack[loc_sp - 1u]; - } - else - { - if (int(ix) >= 0) - { - outp = (gl_WorkGroupID.x * 512u) + ix; - } - else - { - outp = sh_stack[512u + ix]; - } - } - _751.outbuf[(gl_GlobalInvocationID.x * 8u) + i_10] = outp; - inp = _593.inbuf[(gl_GlobalInvocationID.x * 8u) + i_10]; - if (inp == 1u) - { - loc_stack[loc_sp] = (gl_GlobalInvocationID.x * 8u) + i_10; - loc_sp++; - } - else - { - if (inp == 0u) - { - if (loc_sp > 0u) - { - loc_sp--; - } - else - { - if (int(ix) >= 0) - { - uint bits_1 = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u); - if (bits_1 == 0u) - { - ix = sh_link[ix / 8u]; - } - else - { - ix = (ix & 4294967288u) + uint(int(spvFindUMSB(bits_1))); - } - } - else - { - ix--; - } - } - } - } + outp = ((gl_WorkGroupID.x * 512u) + ix) - 1u; } + else + { + outp = sh_stack[511u - bic.a]; + } + _399.outbuf[gl_GlobalInvocationID.x] = outp; }
diff --git a/tests/shader/gen/stack_leaf.spv b/tests/shader/gen/stack_leaf.spv index 827ef14..527d20c 100644 --- a/tests/shader/gen/stack_leaf.spv +++ b/tests/shader/gen/stack_leaf.spv Binary files differ
diff --git a/tests/shader/gen/stack_reduce.dxil b/tests/shader/gen/stack_reduce.dxil index fd97e87..50ceb82 100644 --- a/tests/shader/gen/stack_reduce.dxil +++ b/tests/shader/gen/stack_reduce.dxil Binary files differ
diff --git a/tests/shader/gen/stack_reduce.hlsl b/tests/shader/gen/stack_reduce.hlsl index 0eab9cb..5df49cd 100644 --- a/tests/shader/gen/stack_reduce.hlsl +++ b/tests/shader/gen/stack_reduce.hlsl
@@ -4,13 +4,13 @@ uint b; }; -static const uint3 gl_WorkGroupSize = uint3(64u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); -static const Bic _175 = { 0u, 0u }; +static const Bic _174 = { 0u, 0u }; ByteAddressBuffer _48 : register(t0); -RWByteAddressBuffer _160 : register(u1); -RWByteAddressBuffer _223 : register(u2); +RWByteAddressBuffer _159 : register(u1); +RWByteAddressBuffer _221 : register(u2); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -22,7 +22,7 @@ uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared Bic sh_bic[64]; +groupshared Bic sh_bic[512]; Bic bic_combine(Bic x, Bic y) { @@ -33,24 +33,24 @@ void comp_main() { - uint inp[8]; - inp[0] = _48.Load((gl_GlobalInvocationID.x * 8u) * 4 + 0); - Bic _68 = { 1u - inp[0], inp[0] }; - Bic bic = _68; - for (uint i = 1u; i < 8u; i++) + uint inp[1]; + inp[0] = _48.Load((gl_GlobalInvocationID.x * 1u) * 4 + 0); + Bic _67 = { 1u - inp[0], inp[0] }; + Bic bic = _67; + for (uint i = 1u; i < 1u; i++) { - inp[i] = _48.Load(((gl_GlobalInvocationID.x * 8u) + i) * 4 + 0); - Bic _95 = { 1u - inp[i], inp[i] }; - Bic other = _95; + inp[i] = _48.Load(((gl_GlobalInvocationID.x * 1u) + i) * 4 + 0); + Bic _94 = { 1u - inp[i], inp[i] }; + Bic other = _94; Bic param = bic; Bic param_1 = other; bic = bic_combine(param, param_1); } sh_bic[gl_LocalInvocationID.x] = bic; - for (uint i_1 = 0u; i_1 < 6u; i_1++) + for (uint i_1 = 0u; i_1 < 9u; i_1++) { GroupMemoryBarrierWithGroupSync(); - if ((gl_LocalInvocationID.x + (1u << i_1)) < 64u) + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) { Bic other_1 = sh_bic[gl_LocalInvocationID.x + (1u << i_1)]; Bic param_2 = bic; @@ -62,43 +62,43 @@ } if (gl_LocalInvocationID.x == 0u) { - _160.Store(gl_WorkGroupID.x * 8 + 0, bic.a); - _160.Store(gl_WorkGroupID.x * 8 + 4, bic.b); + _159.Store(gl_WorkGroupID.x * 8 + 0, bic.a); + _159.Store(gl_WorkGroupID.x * 8 + 4, bic.b); } GroupMemoryBarrierWithGroupSync(); uint size = sh_bic[0].b; - bic = _175; - if ((gl_LocalInvocationID.x + 1u) < 64u) + bic = _174; + if ((gl_LocalInvocationID.x + 1u) < 512u) { bic = sh_bic[gl_LocalInvocationID.x + 1u]; } uint out_ix = ((gl_WorkGroupID.x * 512u) + size) - bic.b; - for (uint i_2 = 8u; i_2 > 0u; i_2--) + for (uint i_2 = 1u; i_2 > 0u; i_2--) { - bool _209 = inp[i_2 - 1u] == 1u; - bool _215; - if (_209) + bool _207 = inp[i_2 - 1u] == 1u; + bool _213; + if (_207) { - _215 = bic.a == 0u; + _213 = bic.a == 0u; } else { - _215 = _209; + _213 = _207; } - if (_215) + if (_213) { out_ix--; - _223.Store(out_ix * 4 + 0, ((gl_GlobalInvocationID.x * 8u) + i_2) - 1u); + _221.Store(out_ix * 4 + 0, ((gl_GlobalInvocationID.x * 1u) + i_2) - 1u); } - Bic _242 = { 1u - inp[i_2 - 1u], inp[i_2 - 1u] }; - Bic other_2 = _242; + Bic _240 = { 1u - inp[i_2 - 1u], inp[i_2 - 1u] }; + Bic other_2 = _240; Bic param_4 = other_2; Bic param_5 = bic; bic = bic_combine(param_4, param_5); } } -[numthreads(64, 1, 1)] +[numthreads(512, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID;
diff --git a/tests/shader/gen/stack_reduce.msl b/tests/shader/gen/stack_reduce.msl index 2723dee..13ce9fa 100644 --- a/tests/shader/gen/stack_reduce.msl +++ b/tests/shader/gen/stack_reduce.msl
@@ -71,7 +71,7 @@ uint stack[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); static inline __attribute__((always_inline)) Bic bic_combine(thread const Bic& x, thread const Bic& y) @@ -80,25 +80,25 @@ return Bic{ (x.a + y.a) - m, (x.b + y.b) - m }; } -kernel void main0(const device InBuf& _48 [[buffer(0)]], device OutBuf& _160 [[buffer(1)]], device StackBuf& _223 [[buffer(2)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(const device InBuf& _48 [[buffer(0)]], device OutBuf& _159 [[buffer(1)]], device StackBuf& _221 [[buffer(2)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup Bic sh_bic[64]; - spvUnsafeArray<uint, 8> inp; - inp[0] = _48.inbuf[gl_GlobalInvocationID.x * 8u]; + threadgroup Bic sh_bic[512]; + spvUnsafeArray<uint, 1> inp; + inp[0] = _48.inbuf[gl_GlobalInvocationID.x * 1u]; Bic bic = Bic{ 1u - inp[0], inp[0] }; - for (uint i = 1u; i < 8u; i++) + for (uint i = 1u; i < 1u; i++) { - inp[i] = _48.inbuf[(gl_GlobalInvocationID.x * 8u) + i]; + inp[i] = _48.inbuf[(gl_GlobalInvocationID.x * 1u) + i]; Bic other = Bic{ 1u - inp[i], inp[i] }; Bic param = bic; Bic param_1 = other; bic = bic_combine(param, param_1); } sh_bic[gl_LocalInvocationID.x] = bic; - for (uint i_1 = 0u; i_1 < 6u; i_1++) + for (uint i_1 = 0u; i_1 < 9u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); - if ((gl_LocalInvocationID.x + (1u << i_1)) < 64u) + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) { Bic other_1 = sh_bic[gl_LocalInvocationID.x + (1u << i_1)]; Bic param_2 = bic; @@ -110,33 +110,33 @@ } if (gl_LocalInvocationID.x == 0u) { - _160.outbuf[gl_WorkGroupID.x].a = bic.a; - _160.outbuf[gl_WorkGroupID.x].b = bic.b; + _159.outbuf[gl_WorkGroupID.x].a = bic.a; + _159.outbuf[gl_WorkGroupID.x].b = bic.b; } threadgroup_barrier(mem_flags::mem_threadgroup); uint size = sh_bic[0].b; bic = Bic{ 0u, 0u }; - if ((gl_LocalInvocationID.x + 1u) < 64u) + if ((gl_LocalInvocationID.x + 1u) < 512u) { bic = sh_bic[gl_LocalInvocationID.x + 1u]; } uint out_ix = ((gl_WorkGroupID.x * 512u) + size) - bic.b; - for (uint i_2 = 8u; i_2 > 0u; i_2--) + for (uint i_2 = 1u; i_2 > 0u; i_2--) { - bool _209 = inp[i_2 - 1u] == 1u; - bool _215; - if (_209) + bool _207 = inp[i_2 - 1u] == 1u; + bool _213; + if (_207) { - _215 = bic.a == 0u; + _213 = bic.a == 0u; } else { - _215 = _209; + _213 = _207; } - if (_215) + if (_213) { out_ix--; - _223.stack[out_ix] = ((gl_GlobalInvocationID.x * 8u) + i_2) - 1u; + _221.stack[out_ix] = ((gl_GlobalInvocationID.x * 1u) + i_2) - 1u; } Bic other_2 = Bic{ 1u - inp[i_2 - 1u], inp[i_2 - 1u] }; Bic param_4 = other_2;
diff --git a/tests/shader/gen/stack_reduce.spv b/tests/shader/gen/stack_reduce.spv index 90be43a..733c3a0 100644 --- a/tests/shader/gen/stack_reduce.spv +++ b/tests/shader/gen/stack_reduce.spv Binary files differ
diff --git a/tests/shader/intersection_leaf.comp b/tests/shader/intersection_leaf.comp index c3e594b..04918f1 100644 --- a/tests/shader/intersection_leaf.comp +++ b/tests/shader/intersection_leaf.comp
@@ -6,7 +6,7 @@ #define LG_N_SEQ 0 #define N_SEQ (1 << LG_N_SEQ) -#define LG_WG_SIZE 8 +#define LG_WG_SIZE 9 #define WG_SIZE (1 << LG_WG_SIZE) #define PART_SIZE (WG_SIZE * N_SEQ)
diff --git a/tests/shader/intersection_reduce.comp b/tests/shader/intersection_reduce.comp index cc1e600..19e568b 100644 --- a/tests/shader/intersection_reduce.comp +++ b/tests/shader/intersection_reduce.comp
@@ -5,7 +5,7 @@ #version 450 // At least for now, N_SEQ is hardwired at 1 -#define LG_WG_SIZE 8 +#define LG_WG_SIZE 9 #define WG_SIZE (1 << LG_WG_SIZE) #define PART_SIZE WG_SIZE
diff --git a/tests/shader/stack_leaf.comp b/tests/shader/stack_leaf.comp index 1cdb099..23bf47a 100644 --- a/tests/shader/stack_leaf.comp +++ b/tests/shader/stack_leaf.comp
@@ -4,9 +4,9 @@ #version 450 -#define LG_N_SEQ 3 +#define LG_N_SEQ 0 #define N_SEQ (1 << LG_N_SEQ) -#define LG_WG_SIZE 6 +#define LG_WG_SIZE 9 #define WG_SIZE (1 << LG_WG_SIZE) #define PART_SIZE (WG_SIZE * N_SEQ) @@ -232,6 +232,7 @@ sh_stack[th] = stack[ix * PART_SIZE + b - sp - 1]; } #endif + barrier(); // Do tree reduction of bicyclic semigroups (up-sweep) uint inp = inbuf[gl_GlobalInvocationID.x * N_SEQ + N_SEQ - 1];
diff --git a/tests/shader/stack_reduce.comp b/tests/shader/stack_reduce.comp index b066135..71da8b6 100644 --- a/tests/shader/stack_reduce.comp +++ b/tests/shader/stack_reduce.comp
@@ -4,8 +4,8 @@ #version 450 -#define N_SEQ 8 -#define LG_WG_SIZE 6 +#define N_SEQ 1 +#define LG_WG_SIZE 9 #define WG_SIZE (1 << LG_WG_SIZE) #define PART_SIZE (WG_SIZE * N_SEQ)
diff --git a/tests/shader/union_leaf.comp b/tests/shader/union_leaf.comp index e40bc6d..49775eb 100644 --- a/tests/shader/union_leaf.comp +++ b/tests/shader/union_leaf.comp
@@ -6,7 +6,7 @@ #define LG_N_SEQ 0 #define N_SEQ (1 << LG_N_SEQ) -#define LG_WG_SIZE 8 +#define LG_WG_SIZE 9 #define WG_SIZE (1 << LG_WG_SIZE) #define PART_SIZE (WG_SIZE * N_SEQ)
diff --git a/tests/shader/union_reduce.comp b/tests/shader/union_reduce.comp index f584ae3..fec376c 100644 --- a/tests/shader/union_reduce.comp +++ b/tests/shader/union_reduce.comp
@@ -5,7 +5,7 @@ #version 450 // At least for now, N_SEQ is hardwired at 1 -#define LG_WG_SIZE 8 +#define LG_WG_SIZE 9 #define WG_SIZE (1 << LG_WG_SIZE) #define PART_SIZE WG_SIZE
diff --git a/tests/src/bbox_intersection.rs b/tests/src/bbox_intersection.rs index fac9704..2c006a8 100644 --- a/tests/src/bbox_intersection.rs +++ b/tests/src/bbox_intersection.rs
@@ -24,7 +24,7 @@ use crate::runner::Runner; use crate::test_result::TestResult; -const WG_SIZE: u64 = 256; +const WG_SIZE: u64 = 512; const N_ROWS: u64 = 1; const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
diff --git a/tests/src/bbox_union.rs b/tests/src/bbox_union.rs index ca2224c..bf77f59 100644 --- a/tests/src/bbox_union.rs +++ b/tests/src/bbox_union.rs
@@ -24,7 +24,7 @@ use crate::runner::Runner; use crate::test_result::TestResult; -const WG_SIZE: u64 = 256; +const WG_SIZE: u64 = 512; const N_ROWS: u64 = 1; const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
diff --git a/tests/src/stack.rs b/tests/src/stack.rs index aba875b..923010d 100644 --- a/tests/src/stack.rs +++ b/tests/src/stack.rs
@@ -21,8 +21,8 @@ use crate::runner::Runner; use crate::test_result::TestResult; -const WG_SIZE: u64 = 64; -const N_ROWS: u64 = 8; +const WG_SIZE: u64 = 512; +const N_ROWS: u64 = 1; const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS; struct StackCode {