Set working group size to 512, k = 1
Make working group uniform for stack and bbox tests.
Also fixes the write-after-read hazard in stack. This should be in the upstream branch, but oh well.
diff --git a/tests/shader/gen/stack_leaf.dxil b/tests/shader/gen/stack_leaf.dxil
index 75423cd..14658c3 100644
--- a/tests/shader/gen/stack_leaf.dxil
+++ b/tests/shader/gen/stack_leaf.dxil
Binary files differ
diff --git a/tests/shader/gen/stack_leaf.hlsl b/tests/shader/gen/stack_leaf.hlsl
index fb56399..4969add 100644
--- a/tests/shader/gen/stack_leaf.hlsl
+++ b/tests/shader/gen/stack_leaf.hlsl
@@ -4,14 +4,14 @@
uint b;
};
-static const uint3 gl_WorkGroupSize = uint3(64u, 1u, 1u);
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
-static const Bic _237 = { 0u, 0u };
+static const Bic _157 = { 0u, 0u };
-ByteAddressBuffer _250 : register(t1);
-ByteAddressBuffer _512 : register(t2);
-ByteAddressBuffer _593 : register(t0);
-RWByteAddressBuffer _751 : register(u3);
+ByteAddressBuffer _170 : register(t1);
+ByteAddressBuffer _298 : register(t2);
+ByteAddressBuffer _314 : register(t0);
+RWByteAddressBuffer _399 : register(u3);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
@@ -23,40 +23,23 @@
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
-groupshared Bic sh_bic[126];
-groupshared uint sh_bitmaps[64];
+groupshared Bic sh_bic[1022];
groupshared uint sh_stack[512];
-groupshared uint sh_link[64];
-groupshared uint sh_next[64];
Bic bic_combine(Bic x, Bic y)
{
uint m = min(x.b, y.a);
- Bic _47 = { (x.a + y.a) - m, (x.b + y.b) - m };
- return _47;
-}
-
-uint search_bit_set(uint bitmask, uint ix)
-{
- uint result = 0u;
- for (uint j = 0u; j < 5u; j++)
- {
- uint _step = 1u << (4u - j);
- if (uint(int(countbits(bitmask & ((1u << (result + _step)) - 1u)))) <= ix)
- {
- result += _step;
- }
- }
- return result;
+ Bic _42 = { (x.a + y.a) - m, (x.b + y.b) - m };
+ return _42;
}
uint search_link(inout Bic bic)
{
uint ix = gl_LocalInvocationID.x;
uint j = 0u;
- while (j < 6u)
+ while (j < 9u)
{
- uint base = 128u - (2u << (6u - j));
+ uint base = 1024u - (2u << (9u - j));
if (((ix >> j) & 1u) != 0u)
{
Bic param = sh_bic[(base + (ix >> j)) - 1u];
@@ -76,7 +59,7 @@
while (j > 0u)
{
j--;
- uint base_1 = 128u - (2u << (6u - j));
+ uint base_1 = 1024u - (2u << (9u - j));
Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
Bic param_3 = bic;
Bic test_1 = bic_combine(param_2, param_3);
@@ -87,55 +70,41 @@
}
}
}
- if (ix > 0u)
- {
- ix--;
- Bic param_4 = sh_bic[ix];
- Bic param_5 = bic;
- Bic test_2 = bic_combine(param_4, param_5);
- uint param_6 = sh_bitmaps[ix];
- uint param_7 = test_2.b - 1u;
- uint ix_in_chunk = search_bit_set(param_6, param_7);
- return (ix * 8u) + ix_in_chunk;
- }
- else
- {
- return 4294967295u - bic.a;
- }
+ return ix;
}
void comp_main()
{
uint th = gl_LocalInvocationID.x;
- Bic bic = _237;
- if ((th * 8u) < gl_WorkGroupID.x)
+ Bic bic = _157;
+ if ((th * 1u) < gl_WorkGroupID.x)
{
- Bic _255;
- _255.a = _250.Load((th * 8u) * 8 + 0);
- _255.b = _250.Load((th * 8u) * 8 + 4);
- bic.a = _255.a;
- bic.b = _255.b;
+ Bic _175;
+ _175.a = _170.Load((th * 1u) * 8 + 0);
+ _175.b = _170.Load((th * 1u) * 8 + 4);
+ bic.a = _175.a;
+ bic.b = _175.b;
}
Bic other;
- for (uint i = 1u; i < 8u; i++)
+ for (uint i = 1u; i < 1u; i++)
{
- if (((th * 8u) + i) < gl_WorkGroupID.x)
+ if (((th * 1u) + i) < gl_WorkGroupID.x)
{
- Bic _283;
- _283.a = _250.Load(((th * 8u) + i) * 8 + 0);
- _283.b = _250.Load(((th * 8u) + i) * 8 + 4);
- other.a = _283.a;
- other.b = _283.b;
+ Bic _203;
+ _203.a = _170.Load(((th * 1u) + i) * 8 + 0);
+ _203.b = _170.Load(((th * 1u) + i) * 8 + 4);
+ other.a = _203.a;
+ other.b = _203.b;
Bic param = bic;
Bic param_1 = other;
bic = bic_combine(param, param_1);
}
}
sh_bic[th] = bic;
- for (uint i_1 = 0u; i_1 < 6u; i_1++)
+ for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
- if ((th + (1u << i_1)) < 64u)
+ if ((th + (1u << i_1)) < 512u)
{
Bic other_1 = sh_bic[th + (1u << i_1)];
Bic param_2 = bic;
@@ -146,212 +115,58 @@
sh_bic[th] = bic;
}
GroupMemoryBarrierWithGroupSync();
- if (th == 63u)
- {
- bic = _237;
- }
- else
- {
- bic = sh_bic[th + 1u];
- }
- uint last_b = bic.b;
- uint bitmap = 0u;
- Bic param_4;
- for (uint i_2 = 0u; i_2 < 8u; i_2++)
- {
- uint this_ix = (((th * 8u) + 8u) - 1u) - i_2;
- if (this_ix < gl_WorkGroupID.x)
- {
- Bic _369;
- _369.a = _250.Load(this_ix * 8 + 0);
- _369.b = _250.Load(this_ix * 8 + 4);
- param_4.a = _369.a;
- param_4.b = _369.b;
- Bic param_5 = bic;
- bic = bic_combine(param_4, param_5);
- }
- sh_stack[this_ix] = bic.b;
- if (bic.b > last_b)
- {
- bitmap |= (1u << (7u - i_2));
- }
- last_b = bic.b;
- }
- sh_bitmaps[th] = bitmap;
- uint link = 0u;
- if (bitmap != 0u)
- {
- link = (th * 8u) + uint(int(firstbithigh(bitmap)));
- }
- sh_link[th] = link;
- for (uint i_3 = 0u; i_3 < 6u; i_3++)
- {
- GroupMemoryBarrierWithGroupSync();
- if (th >= (1u << i_3))
- {
- link = max(link, sh_link[th - (1u << i_3)]);
- }
- GroupMemoryBarrierWithGroupSync();
- sh_link[th] = link;
- }
- GroupMemoryBarrierWithGroupSync();
- uint sp = 504u - (th * 8u);
+ uint sp = 511u - th;
uint ix = 0u;
- for (uint i_4 = 0u; i_4 < 9u; i_4++)
+ for (uint i_2 = 0u; i_2 < 9u; i_2++)
{
- uint probe = ix + (256u >> i_4);
- if (sp < sh_stack[probe])
+ uint probe = ix + (256u >> i_2);
+ if (sp < sh_bic[probe].b)
{
ix = probe;
}
}
- uint b = sh_stack[ix];
- uint local_stack[8];
- for (uint i_5 = 0u; i_5 < 8u; i_5++)
+ uint b = sh_bic[ix].b;
+ if (sp < b)
{
- local_stack[i_5] = 0u;
- }
- uint i_6 = 0u;
- while ((sp + i_6) < b)
- {
- local_stack[7u - i_6] = _512.Load(((((ix * 512u) + b) - (sp + i_6)) - 1u) * 4 + 0);
- i_6++;
- if (i_6 == 8u)
- {
- break;
- }
- if ((sp + i_6) == b)
- {
- uint bits = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u);
- if (bits == 0u)
- {
- ix = sh_link[max((ix / 8u), 1u) - 1u];
- }
- else
- {
- ix = (ix & 4294967288u) + uint(int(firstbithigh(bits)));
- }
- b = sh_stack[ix];
- }
+ sh_stack[th] = _298.Load(((((ix * 512u) + b) - sp) - 1u) * 4 + 0);
}
GroupMemoryBarrierWithGroupSync();
- for (uint i_7 = 0u; i_7 < 8u; i_7++)
- {
- sh_stack[(th * 8u) + i_7] = local_stack[i_7];
- }
- uint inp = _593.Load((((gl_GlobalInvocationID.x * 8u) + 8u) - 1u) * 4 + 0);
- Bic _605 = { 1u - inp, inp };
- bic = _605;
- bitmap = inp << uint(7);
- for (uint i_8 = 7u; i_8 > 0u; i_8--)
- {
- inp = _593.Load((((gl_GlobalInvocationID.x * 8u) + i_8) - 1u) * 4 + 0);
- bool _626 = inp == 1u;
- bool _632;
- if (_626)
- {
- _632 = bic.a == 0u;
- }
- else
- {
- _632 = _626;
- }
- if (_632)
- {
- bitmap |= (1u << (i_8 - 1u));
- }
- Bic _644 = { 1u - inp, inp };
- Bic other_2 = _644;
- Bic param_6 = other_2;
- Bic param_7 = bic;
- bic = bic_combine(param_6, param_7);
- }
- sh_bitmaps[th] = bitmap;
+ uint inp = _314.Load((((gl_GlobalInvocationID.x * 1u) + 1u) - 1u) * 4 + 0);
+ Bic _326 = { 1u - inp, inp };
+ bic = _326;
sh_bic[th] = bic;
uint inbase = 0u;
- for (uint i_9 = 0u; i_9 < 5u; i_9++)
+ for (uint i_3 = 0u; i_3 < 8u; i_3++)
{
- uint outbase = 128u - (1u << (6u - i_9));
+ uint outbase = 1024u - (1u << (9u - i_3));
GroupMemoryBarrierWithGroupSync();
- if (th < (1u << (5u - i_9)))
+ if (th < (1u << (8u - i_3)))
{
- Bic param_8 = sh_bic[inbase + (th * 2u)];
- Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
- sh_bic[outbase + th] = bic_combine(param_8, param_9);
+ Bic param_4 = sh_bic[inbase + (th * 2u)];
+ Bic param_5 = sh_bic[(inbase + (th * 2u)) + 1u];
+ sh_bic[outbase + th] = bic_combine(param_4, param_5);
}
inbase = outbase;
}
GroupMemoryBarrierWithGroupSync();
- bic.b = 0u;
- Bic param_10 = bic;
- uint _706 = search_link(param_10);
- bic = param_10;
- sh_link[th] = _706;
- bic = _237;
- Bic param_11 = bic;
- uint _711 = search_link(param_11);
- bic = param_11;
- ix = _711;
- uint loc_sp = 0u;
+ bic = _157;
+ Bic param_6 = bic;
+ uint _377 = search_link(param_6);
+ bic = param_6;
+ ix = _377;
uint outp;
- uint loc_stack[8];
- for (uint i_10 = 0u; i_10 < 8u; i_10++)
+ if (ix > 0u)
{
- if (loc_sp > 0u)
- {
- outp = loc_stack[loc_sp - 1u];
- }
- else
- {
- if (int(ix) >= 0)
- {
- outp = (gl_WorkGroupID.x * 512u) + ix;
- }
- else
- {
- outp = sh_stack[512u + ix];
- }
- }
- _751.Store(((gl_GlobalInvocationID.x * 8u) + i_10) * 4 + 0, outp);
- inp = _593.Load(((gl_GlobalInvocationID.x * 8u) + i_10) * 4 + 0);
- if (inp == 1u)
- {
- loc_stack[loc_sp] = (gl_GlobalInvocationID.x * 8u) + i_10;
- loc_sp++;
- }
- else
- {
- if (inp == 0u)
- {
- if (loc_sp > 0u)
- {
- loc_sp--;
- }
- else
- {
- if (int(ix) >= 0)
- {
- uint bits_1 = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u);
- if (bits_1 == 0u)
- {
- ix = sh_link[ix / 8u];
- }
- else
- {
- ix = (ix & 4294967288u) + uint(int(firstbithigh(bits_1)));
- }
- }
- else
- {
- ix--;
- }
- }
- }
- }
+ outp = ((gl_WorkGroupID.x * 512u) + ix) - 1u;
}
+ else
+ {
+ outp = sh_stack[511u - bic.a];
+ }
+ _399.Store(gl_GlobalInvocationID.x * 4 + 0, outp);
}
-[numthreads(64, 1, 1)]
+[numthreads(512, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_WorkGroupID = stage_input.gl_WorkGroupID;
diff --git a/tests/shader/gen/stack_leaf.msl b/tests/shader/gen/stack_leaf.msl
index cbf72a3..13afb0a 100644
--- a/tests/shader/gen/stack_leaf.msl
+++ b/tests/shader/gen/stack_leaf.msl
@@ -1,56 +1,10 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
-#pragma clang diagnostic ignored "-Wmissing-braces"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
-template<typename T, size_t Num>
-struct spvUnsafeArray
-{
- T elements[Num ? Num : 1];
-
- thread T& operator [] (size_t pos) thread
- {
- return elements[pos];
- }
- constexpr const thread T& operator [] (size_t pos) const thread
- {
- return elements[pos];
- }
-
- device T& operator [] (size_t pos) device
- {
- return elements[pos];
- }
- constexpr const device T& operator [] (size_t pos) const device
- {
- return elements[pos];
- }
-
- constexpr const constant T& operator [] (size_t pos) const constant
- {
- return elements[pos];
- }
-
- threadgroup T& operator [] (size_t pos) threadgroup
- {
- return elements[pos];
- }
- constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
- {
- return elements[pos];
- }
-};
-
-// Implementation of the unsigned GLSL findMSB() function
-template<typename T>
-inline T spvFindUMSB(T x)
-{
- return select(clz(T(0)) - (clz(x) + T(1)), T(-1), x == T(0));
-}
-
struct Bic
{
uint a;
@@ -83,7 +37,7 @@
uint outbuf[1];
};
-constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
static inline __attribute__((always_inline))
Bic bic_combine(thread const Bic& x, thread const Bic& y)
@@ -93,28 +47,13 @@
}
static inline __attribute__((always_inline))
-uint search_bit_set(thread const uint& bitmask, thread const uint& ix)
-{
- uint result = 0u;
- for (uint j = 0u; j < 5u; j++)
- {
- uint _step = 1u << (4u - j);
- if (uint(int(popcount(bitmask & ((1u << (result + _step)) - 1u)))) <= ix)
- {
- result += _step;
- }
- }
- return result;
-}
-
-static inline __attribute__((always_inline))
-uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[126], threadgroup uint (&sh_bitmaps)[64])
+uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[1022])
{
uint ix = gl_LocalInvocationID.x;
uint j = 0u;
- while (j < 6u)
+ while (j < 9u)
{
- uint base = 128u - (2u << (6u - j));
+ uint base = 1024u - (2u << (9u - j));
if (((ix >> j) & 1u) != 0u)
{
Bic param = sh_bic[(base + (ix >> j)) - 1u];
@@ -134,7 +73,7 @@
while (j > 0u)
{
j--;
- uint base_1 = 128u - (2u << (6u - j));
+ uint base_1 = 1024u - (2u << (9u - j));
Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
Bic param_3 = bic;
Bic test_1 = bic_combine(param_2, param_3);
@@ -145,56 +84,39 @@
}
}
}
- if (ix > 0u)
- {
- ix--;
- Bic param_4 = sh_bic[ix];
- Bic param_5 = bic;
- Bic test_2 = bic_combine(param_4, param_5);
- uint param_6 = sh_bitmaps[ix];
- uint param_7 = test_2.b - 1u;
- uint ix_in_chunk = search_bit_set(param_6, param_7);
- return (ix * 8u) + ix_in_chunk;
- }
- else
- {
- return 4294967295u - bic.a;
- }
+ return ix;
}
-kernel void main0(const device InBuf& _593 [[buffer(0)]], const device BicBuf& _250 [[buffer(1)]], const device StackBuf& _512 [[buffer(2)]], device OutBuf& _751 [[buffer(3)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(const device InBuf& _314 [[buffer(0)]], const device BicBuf& _170 [[buffer(1)]], const device StackBuf& _298 [[buffer(2)]], device OutBuf& _399 [[buffer(3)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
- threadgroup Bic sh_bic[126];
- threadgroup uint sh_bitmaps[64];
+ threadgroup Bic sh_bic[1022];
threadgroup uint sh_stack[512];
- threadgroup uint sh_link[64];
- threadgroup uint sh_next[64];
uint th = gl_LocalInvocationID.x;
Bic bic = Bic{ 0u, 0u };
- if ((th * 8u) < gl_WorkGroupID.x)
+ if ((th * 1u) < gl_WorkGroupID.x)
{
- uint _252 = th * 8u;
- bic.a = _250.bicbuf[_252].a;
- bic.b = _250.bicbuf[_252].b;
+ uint _172 = th * 1u;
+ bic.a = _170.bicbuf[_172].a;
+ bic.b = _170.bicbuf[_172].b;
}
Bic other;
- for (uint i = 1u; i < 8u; i++)
+ for (uint i = 1u; i < 1u; i++)
{
- if (((th * 8u) + i) < gl_WorkGroupID.x)
+ if (((th * 1u) + i) < gl_WorkGroupID.x)
{
- uint _281 = (th * 8u) + i;
- other.a = _250.bicbuf[_281].a;
- other.b = _250.bicbuf[_281].b;
+ uint _201 = (th * 1u) + i;
+ other.a = _170.bicbuf[_201].a;
+ other.b = _170.bicbuf[_201].b;
Bic param = bic;
Bic param_1 = other;
bic = bic_combine(param, param_1);
}
}
sh_bic[th] = bic;
- for (uint i_1 = 0u; i_1 < 6u; i_1++)
+ for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
- if ((th + (1u << i_1)) < 64u)
+ if ((th + (1u << i_1)) < 512u)
{
Bic other_1 = sh_bic[th + (1u << i_1)];
Bic param_2 = bic;
@@ -205,203 +127,53 @@
sh_bic[th] = bic;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
- if (th == 63u)
- {
- bic = Bic{ 0u, 0u };
- }
- else
- {
- bic = sh_bic[th + 1u];
- }
- uint last_b = bic.b;
- uint bitmap = 0u;
- Bic param_4;
- for (uint i_2 = 0u; i_2 < 8u; i_2++)
- {
- uint this_ix = (((th * 8u) + 8u) - 1u) - i_2;
- if (this_ix < gl_WorkGroupID.x)
- {
- param_4.a = _250.bicbuf[this_ix].a;
- param_4.b = _250.bicbuf[this_ix].b;
- Bic param_5 = bic;
- bic = bic_combine(param_4, param_5);
- }
- sh_stack[this_ix] = bic.b;
- if (bic.b > last_b)
- {
- bitmap |= (1u << (7u - i_2));
- }
- last_b = bic.b;
- }
- sh_bitmaps[th] = bitmap;
- uint link = 0u;
- if (bitmap != 0u)
- {
- link = (th * 8u) + uint(int(spvFindUMSB(bitmap)));
- }
- sh_link[th] = link;
- for (uint i_3 = 0u; i_3 < 6u; i_3++)
- {
- threadgroup_barrier(mem_flags::mem_threadgroup);
- if (th >= (1u << i_3))
- {
- link = max(link, sh_link[th - (1u << i_3)]);
- }
- threadgroup_barrier(mem_flags::mem_threadgroup);
- sh_link[th] = link;
- }
- threadgroup_barrier(mem_flags::mem_threadgroup);
- uint sp = 504u - (th * 8u);
+ uint sp = 511u - th;
uint ix = 0u;
- for (uint i_4 = 0u; i_4 < 9u; i_4++)
+ for (uint i_2 = 0u; i_2 < 9u; i_2++)
{
- uint probe = ix + (256u >> i_4);
- if (sp < sh_stack[probe])
+ uint probe = ix + (256u >> i_2);
+ if (sp < sh_bic[probe].b)
{
ix = probe;
}
}
- uint b = sh_stack[ix];
- spvUnsafeArray<uint, 8> local_stack;
- for (uint i_5 = 0u; i_5 < 8u; i_5++)
+ uint b = sh_bic[ix].b;
+ if (sp < b)
{
- local_stack[i_5] = 0u;
- }
- uint i_6 = 0u;
- while ((sp + i_6) < b)
- {
- local_stack[7u - i_6] = _512.stack[(((ix * 512u) + b) - (sp + i_6)) - 1u];
- i_6++;
- if (i_6 == 8u)
- {
- break;
- }
- if ((sp + i_6) == b)
- {
- uint bits = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u);
- if (bits == 0u)
- {
- ix = sh_link[max((ix / 8u), 1u) - 1u];
- }
- else
- {
- ix = (ix & 4294967288u) + uint(int(spvFindUMSB(bits)));
- }
- b = sh_stack[ix];
- }
+ sh_stack[th] = _298.stack[(((ix * 512u) + b) - sp) - 1u];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
- for (uint i_7 = 0u; i_7 < 8u; i_7++)
- {
- sh_stack[(th * 8u) + i_7] = local_stack[i_7];
- }
- uint inp = _593.inbuf[((gl_GlobalInvocationID.x * 8u) + 8u) - 1u];
+ uint inp = _314.inbuf[((gl_GlobalInvocationID.x * 1u) + 1u) - 1u];
bic = Bic{ 1u - inp, inp };
- bitmap = inp << uint(7);
- for (uint i_8 = 7u; i_8 > 0u; i_8--)
- {
- inp = _593.inbuf[((gl_GlobalInvocationID.x * 8u) + i_8) - 1u];
- bool _626 = inp == 1u;
- bool _632;
- if (_626)
- {
- _632 = bic.a == 0u;
- }
- else
- {
- _632 = _626;
- }
- if (_632)
- {
- bitmap |= (1u << (i_8 - 1u));
- }
- Bic other_2 = Bic{ 1u - inp, inp };
- Bic param_6 = other_2;
- Bic param_7 = bic;
- bic = bic_combine(param_6, param_7);
- }
- sh_bitmaps[th] = bitmap;
sh_bic[th] = bic;
uint inbase = 0u;
- for (uint i_9 = 0u; i_9 < 5u; i_9++)
+ for (uint i_3 = 0u; i_3 < 8u; i_3++)
{
- uint outbase = 128u - (1u << (6u - i_9));
+ uint outbase = 1024u - (1u << (9u - i_3));
threadgroup_barrier(mem_flags::mem_threadgroup);
- if (th < (1u << (5u - i_9)))
+ if (th < (1u << (8u - i_3)))
{
- Bic param_8 = sh_bic[inbase + (th * 2u)];
- Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
- sh_bic[outbase + th] = bic_combine(param_8, param_9);
+ Bic param_4 = sh_bic[inbase + (th * 2u)];
+ Bic param_5 = sh_bic[(inbase + (th * 2u)) + 1u];
+ sh_bic[outbase + th] = bic_combine(param_4, param_5);
}
inbase = outbase;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
- bic.b = 0u;
- Bic param_10 = bic;
- uint _706 = search_link(param_10, gl_LocalInvocationID, sh_bic, sh_bitmaps);
- bic = param_10;
- sh_link[th] = _706;
bic = Bic{ 0u, 0u };
- Bic param_11 = bic;
- uint _711 = search_link(param_11, gl_LocalInvocationID, sh_bic, sh_bitmaps);
- bic = param_11;
- ix = _711;
- uint loc_sp = 0u;
+ Bic param_6 = bic;
+ uint _377 = search_link(param_6, gl_LocalInvocationID, sh_bic);
+ bic = param_6;
+ ix = _377;
uint outp;
- spvUnsafeArray<uint, 8> loc_stack;
- for (uint i_10 = 0u; i_10 < 8u; i_10++)
+ if (ix > 0u)
{
- if (loc_sp > 0u)
- {
- outp = loc_stack[loc_sp - 1u];
- }
- else
- {
- if (int(ix) >= 0)
- {
- outp = (gl_WorkGroupID.x * 512u) + ix;
- }
- else
- {
- outp = sh_stack[512u + ix];
- }
- }
- _751.outbuf[(gl_GlobalInvocationID.x * 8u) + i_10] = outp;
- inp = _593.inbuf[(gl_GlobalInvocationID.x * 8u) + i_10];
- if (inp == 1u)
- {
- loc_stack[loc_sp] = (gl_GlobalInvocationID.x * 8u) + i_10;
- loc_sp++;
- }
- else
- {
- if (inp == 0u)
- {
- if (loc_sp > 0u)
- {
- loc_sp--;
- }
- else
- {
- if (int(ix) >= 0)
- {
- uint bits_1 = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u);
- if (bits_1 == 0u)
- {
- ix = sh_link[ix / 8u];
- }
- else
- {
- ix = (ix & 4294967288u) + uint(int(spvFindUMSB(bits_1)));
- }
- }
- else
- {
- ix--;
- }
- }
- }
- }
+ outp = ((gl_WorkGroupID.x * 512u) + ix) - 1u;
}
+ else
+ {
+ outp = sh_stack[511u - bic.a];
+ }
+ _399.outbuf[gl_GlobalInvocationID.x] = outp;
}
diff --git a/tests/shader/gen/stack_leaf.spv b/tests/shader/gen/stack_leaf.spv
index 827ef14..527d20c 100644
--- a/tests/shader/gen/stack_leaf.spv
+++ b/tests/shader/gen/stack_leaf.spv
Binary files differ
diff --git a/tests/shader/gen/stack_reduce.dxil b/tests/shader/gen/stack_reduce.dxil
index fd97e87..50ceb82 100644
--- a/tests/shader/gen/stack_reduce.dxil
+++ b/tests/shader/gen/stack_reduce.dxil
Binary files differ
diff --git a/tests/shader/gen/stack_reduce.hlsl b/tests/shader/gen/stack_reduce.hlsl
index 0eab9cb..5df49cd 100644
--- a/tests/shader/gen/stack_reduce.hlsl
+++ b/tests/shader/gen/stack_reduce.hlsl
@@ -4,13 +4,13 @@
uint b;
};
-static const uint3 gl_WorkGroupSize = uint3(64u, 1u, 1u);
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
-static const Bic _175 = { 0u, 0u };
+static const Bic _174 = { 0u, 0u };
ByteAddressBuffer _48 : register(t0);
-RWByteAddressBuffer _160 : register(u1);
-RWByteAddressBuffer _223 : register(u2);
+RWByteAddressBuffer _159 : register(u1);
+RWByteAddressBuffer _221 : register(u2);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
@@ -22,7 +22,7 @@
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
-groupshared Bic sh_bic[64];
+groupshared Bic sh_bic[512];
Bic bic_combine(Bic x, Bic y)
{
@@ -33,24 +33,24 @@
void comp_main()
{
- uint inp[8];
- inp[0] = _48.Load((gl_GlobalInvocationID.x * 8u) * 4 + 0);
- Bic _68 = { 1u - inp[0], inp[0] };
- Bic bic = _68;
- for (uint i = 1u; i < 8u; i++)
+ uint inp[1];
+ inp[0] = _48.Load((gl_GlobalInvocationID.x * 1u) * 4 + 0);
+ Bic _67 = { 1u - inp[0], inp[0] };
+ Bic bic = _67;
+ for (uint i = 1u; i < 1u; i++)
{
- inp[i] = _48.Load(((gl_GlobalInvocationID.x * 8u) + i) * 4 + 0);
- Bic _95 = { 1u - inp[i], inp[i] };
- Bic other = _95;
+ inp[i] = _48.Load(((gl_GlobalInvocationID.x * 1u) + i) * 4 + 0);
+ Bic _94 = { 1u - inp[i], inp[i] };
+ Bic other = _94;
Bic param = bic;
Bic param_1 = other;
bic = bic_combine(param, param_1);
}
sh_bic[gl_LocalInvocationID.x] = bic;
- for (uint i_1 = 0u; i_1 < 6u; i_1++)
+ for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
- if ((gl_LocalInvocationID.x + (1u << i_1)) < 64u)
+ if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
{
Bic other_1 = sh_bic[gl_LocalInvocationID.x + (1u << i_1)];
Bic param_2 = bic;
@@ -62,43 +62,43 @@
}
if (gl_LocalInvocationID.x == 0u)
{
- _160.Store(gl_WorkGroupID.x * 8 + 0, bic.a);
- _160.Store(gl_WorkGroupID.x * 8 + 4, bic.b);
+ _159.Store(gl_WorkGroupID.x * 8 + 0, bic.a);
+ _159.Store(gl_WorkGroupID.x * 8 + 4, bic.b);
}
GroupMemoryBarrierWithGroupSync();
uint size = sh_bic[0].b;
- bic = _175;
- if ((gl_LocalInvocationID.x + 1u) < 64u)
+ bic = _174;
+ if ((gl_LocalInvocationID.x + 1u) < 512u)
{
bic = sh_bic[gl_LocalInvocationID.x + 1u];
}
uint out_ix = ((gl_WorkGroupID.x * 512u) + size) - bic.b;
- for (uint i_2 = 8u; i_2 > 0u; i_2--)
+ for (uint i_2 = 1u; i_2 > 0u; i_2--)
{
- bool _209 = inp[i_2 - 1u] == 1u;
- bool _215;
- if (_209)
+ bool _207 = inp[i_2 - 1u] == 1u;
+ bool _213;
+ if (_207)
{
- _215 = bic.a == 0u;
+ _213 = bic.a == 0u;
}
else
{
- _215 = _209;
+ _213 = _207;
}
- if (_215)
+ if (_213)
{
out_ix--;
- _223.Store(out_ix * 4 + 0, ((gl_GlobalInvocationID.x * 8u) + i_2) - 1u);
+ _221.Store(out_ix * 4 + 0, ((gl_GlobalInvocationID.x * 1u) + i_2) - 1u);
}
- Bic _242 = { 1u - inp[i_2 - 1u], inp[i_2 - 1u] };
- Bic other_2 = _242;
+ Bic _240 = { 1u - inp[i_2 - 1u], inp[i_2 - 1u] };
+ Bic other_2 = _240;
Bic param_4 = other_2;
Bic param_5 = bic;
bic = bic_combine(param_4, param_5);
}
}
-[numthreads(64, 1, 1)]
+[numthreads(512, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_WorkGroupID = stage_input.gl_WorkGroupID;
diff --git a/tests/shader/gen/stack_reduce.msl b/tests/shader/gen/stack_reduce.msl
index 2723dee..13ce9fa 100644
--- a/tests/shader/gen/stack_reduce.msl
+++ b/tests/shader/gen/stack_reduce.msl
@@ -71,7 +71,7 @@
uint stack[1];
};
-constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
static inline __attribute__((always_inline))
Bic bic_combine(thread const Bic& x, thread const Bic& y)
@@ -80,25 +80,25 @@
return Bic{ (x.a + y.a) - m, (x.b + y.b) - m };
}
-kernel void main0(const device InBuf& _48 [[buffer(0)]], device OutBuf& _160 [[buffer(1)]], device StackBuf& _223 [[buffer(2)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device InBuf& _48 [[buffer(0)]], device OutBuf& _159 [[buffer(1)]], device StackBuf& _221 [[buffer(2)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
{
- threadgroup Bic sh_bic[64];
- spvUnsafeArray<uint, 8> inp;
- inp[0] = _48.inbuf[gl_GlobalInvocationID.x * 8u];
+ threadgroup Bic sh_bic[512];
+ spvUnsafeArray<uint, 1> inp;
+ inp[0] = _48.inbuf[gl_GlobalInvocationID.x * 1u];
Bic bic = Bic{ 1u - inp[0], inp[0] };
- for (uint i = 1u; i < 8u; i++)
+ for (uint i = 1u; i < 1u; i++)
{
- inp[i] = _48.inbuf[(gl_GlobalInvocationID.x * 8u) + i];
+ inp[i] = _48.inbuf[(gl_GlobalInvocationID.x * 1u) + i];
Bic other = Bic{ 1u - inp[i], inp[i] };
Bic param = bic;
Bic param_1 = other;
bic = bic_combine(param, param_1);
}
sh_bic[gl_LocalInvocationID.x] = bic;
- for (uint i_1 = 0u; i_1 < 6u; i_1++)
+ for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
- if ((gl_LocalInvocationID.x + (1u << i_1)) < 64u)
+ if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
{
Bic other_1 = sh_bic[gl_LocalInvocationID.x + (1u << i_1)];
Bic param_2 = bic;
@@ -110,33 +110,33 @@
}
if (gl_LocalInvocationID.x == 0u)
{
- _160.outbuf[gl_WorkGroupID.x].a = bic.a;
- _160.outbuf[gl_WorkGroupID.x].b = bic.b;
+ _159.outbuf[gl_WorkGroupID.x].a = bic.a;
+ _159.outbuf[gl_WorkGroupID.x].b = bic.b;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint size = sh_bic[0].b;
bic = Bic{ 0u, 0u };
- if ((gl_LocalInvocationID.x + 1u) < 64u)
+ if ((gl_LocalInvocationID.x + 1u) < 512u)
{
bic = sh_bic[gl_LocalInvocationID.x + 1u];
}
uint out_ix = ((gl_WorkGroupID.x * 512u) + size) - bic.b;
- for (uint i_2 = 8u; i_2 > 0u; i_2--)
+ for (uint i_2 = 1u; i_2 > 0u; i_2--)
{
- bool _209 = inp[i_2 - 1u] == 1u;
- bool _215;
- if (_209)
+ bool _207 = inp[i_2 - 1u] == 1u;
+ bool _213;
+ if (_207)
{
- _215 = bic.a == 0u;
+ _213 = bic.a == 0u;
}
else
{
- _215 = _209;
+ _213 = _207;
}
- if (_215)
+ if (_213)
{
out_ix--;
- _223.stack[out_ix] = ((gl_GlobalInvocationID.x * 8u) + i_2) - 1u;
+ _221.stack[out_ix] = ((gl_GlobalInvocationID.x * 1u) + i_2) - 1u;
}
Bic other_2 = Bic{ 1u - inp[i_2 - 1u], inp[i_2 - 1u] };
Bic param_4 = other_2;
diff --git a/tests/shader/gen/stack_reduce.spv b/tests/shader/gen/stack_reduce.spv
index 90be43a..733c3a0 100644
--- a/tests/shader/gen/stack_reduce.spv
+++ b/tests/shader/gen/stack_reduce.spv
Binary files differ
diff --git a/tests/shader/intersection_leaf.comp b/tests/shader/intersection_leaf.comp
index c3e594b..04918f1 100644
--- a/tests/shader/intersection_leaf.comp
+++ b/tests/shader/intersection_leaf.comp
@@ -6,7 +6,7 @@
#define LG_N_SEQ 0
#define N_SEQ (1 << LG_N_SEQ)
-#define LG_WG_SIZE 8
+#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PART_SIZE (WG_SIZE * N_SEQ)
diff --git a/tests/shader/intersection_reduce.comp b/tests/shader/intersection_reduce.comp
index cc1e600..19e568b 100644
--- a/tests/shader/intersection_reduce.comp
+++ b/tests/shader/intersection_reduce.comp
@@ -5,7 +5,7 @@
#version 450
// At least for now, N_SEQ is hardwired at 1
-#define LG_WG_SIZE 8
+#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PART_SIZE WG_SIZE
diff --git a/tests/shader/stack_leaf.comp b/tests/shader/stack_leaf.comp
index 1cdb099..23bf47a 100644
--- a/tests/shader/stack_leaf.comp
+++ b/tests/shader/stack_leaf.comp
@@ -4,9 +4,9 @@
#version 450
-#define LG_N_SEQ 3
+#define LG_N_SEQ 0
#define N_SEQ (1 << LG_N_SEQ)
-#define LG_WG_SIZE 6
+#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PART_SIZE (WG_SIZE * N_SEQ)
@@ -232,6 +232,7 @@
sh_stack[th] = stack[ix * PART_SIZE + b - sp - 1];
}
#endif
+ barrier();
// Do tree reduction of bicyclic semigroups (up-sweep)
uint inp = inbuf[gl_GlobalInvocationID.x * N_SEQ + N_SEQ - 1];
diff --git a/tests/shader/stack_reduce.comp b/tests/shader/stack_reduce.comp
index b066135..71da8b6 100644
--- a/tests/shader/stack_reduce.comp
+++ b/tests/shader/stack_reduce.comp
@@ -4,8 +4,8 @@
#version 450
-#define N_SEQ 8
-#define LG_WG_SIZE 6
+#define N_SEQ 1
+#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PART_SIZE (WG_SIZE * N_SEQ)
diff --git a/tests/shader/union_leaf.comp b/tests/shader/union_leaf.comp
index e40bc6d..49775eb 100644
--- a/tests/shader/union_leaf.comp
+++ b/tests/shader/union_leaf.comp
@@ -6,7 +6,7 @@
#define LG_N_SEQ 0
#define N_SEQ (1 << LG_N_SEQ)
-#define LG_WG_SIZE 8
+#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PART_SIZE (WG_SIZE * N_SEQ)
diff --git a/tests/shader/union_reduce.comp b/tests/shader/union_reduce.comp
index f584ae3..fec376c 100644
--- a/tests/shader/union_reduce.comp
+++ b/tests/shader/union_reduce.comp
@@ -5,7 +5,7 @@
#version 450
// At least for now, N_SEQ is hardwired at 1
-#define LG_WG_SIZE 8
+#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PART_SIZE WG_SIZE
diff --git a/tests/src/bbox_intersection.rs b/tests/src/bbox_intersection.rs
index fac9704..2c006a8 100644
--- a/tests/src/bbox_intersection.rs
+++ b/tests/src/bbox_intersection.rs
@@ -24,7 +24,7 @@
use crate::runner::Runner;
use crate::test_result::TestResult;
-const WG_SIZE: u64 = 256;
+const WG_SIZE: u64 = 512;
const N_ROWS: u64 = 1;
const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
diff --git a/tests/src/bbox_union.rs b/tests/src/bbox_union.rs
index ca2224c..bf77f59 100644
--- a/tests/src/bbox_union.rs
+++ b/tests/src/bbox_union.rs
@@ -24,7 +24,7 @@
use crate::runner::Runner;
use crate::test_result::TestResult;
-const WG_SIZE: u64 = 256;
+const WG_SIZE: u64 = 512;
const N_ROWS: u64 = 1;
const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
diff --git a/tests/src/stack.rs b/tests/src/stack.rs
index aba875b..923010d 100644
--- a/tests/src/stack.rs
+++ b/tests/src/stack.rs
@@ -21,8 +21,8 @@
use crate::runner::Runner;
use crate::test_result::TestResult;
-const WG_SIZE: u64 = 64;
-const N_ROWS: u64 = 8;
+const WG_SIZE: u64 = 512;
+const N_ROWS: u64 = 1;
const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
struct StackCode {