Set working group size to 512, k = 1

Make working group uniform for stack and bbox tests.

Also fixes the write-after-read hazard in stack. This should be in the upstream branch, but oh well.
diff --git a/tests/shader/gen/stack_leaf.dxil b/tests/shader/gen/stack_leaf.dxil
index 75423cd..14658c3 100644
--- a/tests/shader/gen/stack_leaf.dxil
+++ b/tests/shader/gen/stack_leaf.dxil
Binary files differ
diff --git a/tests/shader/gen/stack_leaf.hlsl b/tests/shader/gen/stack_leaf.hlsl
index fb56399..4969add 100644
--- a/tests/shader/gen/stack_leaf.hlsl
+++ b/tests/shader/gen/stack_leaf.hlsl
@@ -4,14 +4,14 @@
     uint b;
 };
 
-static const uint3 gl_WorkGroupSize = uint3(64u, 1u, 1u);
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-static const Bic _237 = { 0u, 0u };
+static const Bic _157 = { 0u, 0u };
 
-ByteAddressBuffer _250 : register(t1);
-ByteAddressBuffer _512 : register(t2);
-ByteAddressBuffer _593 : register(t0);
-RWByteAddressBuffer _751 : register(u3);
+ByteAddressBuffer _170 : register(t1);
+ByteAddressBuffer _298 : register(t2);
+ByteAddressBuffer _314 : register(t0);
+RWByteAddressBuffer _399 : register(u3);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -23,40 +23,23 @@
     uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
 };
 
-groupshared Bic sh_bic[126];
-groupshared uint sh_bitmaps[64];
+groupshared Bic sh_bic[1022];
 groupshared uint sh_stack[512];
-groupshared uint sh_link[64];
-groupshared uint sh_next[64];
 
 Bic bic_combine(Bic x, Bic y)
 {
     uint m = min(x.b, y.a);
-    Bic _47 = { (x.a + y.a) - m, (x.b + y.b) - m };
-    return _47;
-}
-
-uint search_bit_set(uint bitmask, uint ix)
-{
-    uint result = 0u;
-    for (uint j = 0u; j < 5u; j++)
-    {
-        uint _step = 1u << (4u - j);
-        if (uint(int(countbits(bitmask & ((1u << (result + _step)) - 1u)))) <= ix)
-        {
-            result += _step;
-        }
-    }
-    return result;
+    Bic _42 = { (x.a + y.a) - m, (x.b + y.b) - m };
+    return _42;
 }
 
 uint search_link(inout Bic bic)
 {
     uint ix = gl_LocalInvocationID.x;
     uint j = 0u;
-    while (j < 6u)
+    while (j < 9u)
     {
-        uint base = 128u - (2u << (6u - j));
+        uint base = 1024u - (2u << (9u - j));
         if (((ix >> j) & 1u) != 0u)
         {
             Bic param = sh_bic[(base + (ix >> j)) - 1u];
@@ -76,7 +59,7 @@
         while (j > 0u)
         {
             j--;
-            uint base_1 = 128u - (2u << (6u - j));
+            uint base_1 = 1024u - (2u << (9u - j));
             Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
             Bic param_3 = bic;
             Bic test_1 = bic_combine(param_2, param_3);
@@ -87,55 +70,41 @@
             }
         }
     }
-    if (ix > 0u)
-    {
-        ix--;
-        Bic param_4 = sh_bic[ix];
-        Bic param_5 = bic;
-        Bic test_2 = bic_combine(param_4, param_5);
-        uint param_6 = sh_bitmaps[ix];
-        uint param_7 = test_2.b - 1u;
-        uint ix_in_chunk = search_bit_set(param_6, param_7);
-        return (ix * 8u) + ix_in_chunk;
-    }
-    else
-    {
-        return 4294967295u - bic.a;
-    }
+    return ix;
 }
 
 void comp_main()
 {
     uint th = gl_LocalInvocationID.x;
-    Bic bic = _237;
-    if ((th * 8u) < gl_WorkGroupID.x)
+    Bic bic = _157;
+    if ((th * 1u) < gl_WorkGroupID.x)
     {
-        Bic _255;
-        _255.a = _250.Load((th * 8u) * 8 + 0);
-        _255.b = _250.Load((th * 8u) * 8 + 4);
-        bic.a = _255.a;
-        bic.b = _255.b;
+        Bic _175;
+        _175.a = _170.Load((th * 1u) * 8 + 0);
+        _175.b = _170.Load((th * 1u) * 8 + 4);
+        bic.a = _175.a;
+        bic.b = _175.b;
     }
     Bic other;
-    for (uint i = 1u; i < 8u; i++)
+    for (uint i = 1u; i < 1u; i++)
     {
-        if (((th * 8u) + i) < gl_WorkGroupID.x)
+        if (((th * 1u) + i) < gl_WorkGroupID.x)
         {
-            Bic _283;
-            _283.a = _250.Load(((th * 8u) + i) * 8 + 0);
-            _283.b = _250.Load(((th * 8u) + i) * 8 + 4);
-            other.a = _283.a;
-            other.b = _283.b;
+            Bic _203;
+            _203.a = _170.Load(((th * 1u) + i) * 8 + 0);
+            _203.b = _170.Load(((th * 1u) + i) * 8 + 4);
+            other.a = _203.a;
+            other.b = _203.b;
             Bic param = bic;
             Bic param_1 = other;
             bic = bic_combine(param, param_1);
         }
     }
     sh_bic[th] = bic;
-    for (uint i_1 = 0u; i_1 < 6u; i_1++)
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
     {
         GroupMemoryBarrierWithGroupSync();
-        if ((th + (1u << i_1)) < 64u)
+        if ((th + (1u << i_1)) < 512u)
         {
             Bic other_1 = sh_bic[th + (1u << i_1)];
             Bic param_2 = bic;
@@ -146,212 +115,58 @@
         sh_bic[th] = bic;
     }
     GroupMemoryBarrierWithGroupSync();
-    if (th == 63u)
-    {
-        bic = _237;
-    }
-    else
-    {
-        bic = sh_bic[th + 1u];
-    }
-    uint last_b = bic.b;
-    uint bitmap = 0u;
-    Bic param_4;
-    for (uint i_2 = 0u; i_2 < 8u; i_2++)
-    {
-        uint this_ix = (((th * 8u) + 8u) - 1u) - i_2;
-        if (this_ix < gl_WorkGroupID.x)
-        {
-            Bic _369;
-            _369.a = _250.Load(this_ix * 8 + 0);
-            _369.b = _250.Load(this_ix * 8 + 4);
-            param_4.a = _369.a;
-            param_4.b = _369.b;
-            Bic param_5 = bic;
-            bic = bic_combine(param_4, param_5);
-        }
-        sh_stack[this_ix] = bic.b;
-        if (bic.b > last_b)
-        {
-            bitmap |= (1u << (7u - i_2));
-        }
-        last_b = bic.b;
-    }
-    sh_bitmaps[th] = bitmap;
-    uint link = 0u;
-    if (bitmap != 0u)
-    {
-        link = (th * 8u) + uint(int(firstbithigh(bitmap)));
-    }
-    sh_link[th] = link;
-    for (uint i_3 = 0u; i_3 < 6u; i_3++)
-    {
-        GroupMemoryBarrierWithGroupSync();
-        if (th >= (1u << i_3))
-        {
-            link = max(link, sh_link[th - (1u << i_3)]);
-        }
-        GroupMemoryBarrierWithGroupSync();
-        sh_link[th] = link;
-    }
-    GroupMemoryBarrierWithGroupSync();
-    uint sp = 504u - (th * 8u);
+    uint sp = 511u - th;
     uint ix = 0u;
-    for (uint i_4 = 0u; i_4 < 9u; i_4++)
+    for (uint i_2 = 0u; i_2 < 9u; i_2++)
     {
-        uint probe = ix + (256u >> i_4);
-        if (sp < sh_stack[probe])
+        uint probe = ix + (256u >> i_2);
+        if (sp < sh_bic[probe].b)
         {
             ix = probe;
         }
     }
-    uint b = sh_stack[ix];
-    uint local_stack[8];
-    for (uint i_5 = 0u; i_5 < 8u; i_5++)
+    uint b = sh_bic[ix].b;
+    if (sp < b)
     {
-        local_stack[i_5] = 0u;
-    }
-    uint i_6 = 0u;
-    while ((sp + i_6) < b)
-    {
-        local_stack[7u - i_6] = _512.Load(((((ix * 512u) + b) - (sp + i_6)) - 1u) * 4 + 0);
-        i_6++;
-        if (i_6 == 8u)
-        {
-            break;
-        }
-        if ((sp + i_6) == b)
-        {
-            uint bits = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u);
-            if (bits == 0u)
-            {
-                ix = sh_link[max((ix / 8u), 1u) - 1u];
-            }
-            else
-            {
-                ix = (ix & 4294967288u) + uint(int(firstbithigh(bits)));
-            }
-            b = sh_stack[ix];
-        }
+        sh_stack[th] = _298.Load(((((ix * 512u) + b) - sp) - 1u) * 4 + 0);
     }
     GroupMemoryBarrierWithGroupSync();
-    for (uint i_7 = 0u; i_7 < 8u; i_7++)
-    {
-        sh_stack[(th * 8u) + i_7] = local_stack[i_7];
-    }
-    uint inp = _593.Load((((gl_GlobalInvocationID.x * 8u) + 8u) - 1u) * 4 + 0);
-    Bic _605 = { 1u - inp, inp };
-    bic = _605;
-    bitmap = inp << uint(7);
-    for (uint i_8 = 7u; i_8 > 0u; i_8--)
-    {
-        inp = _593.Load((((gl_GlobalInvocationID.x * 8u) + i_8) - 1u) * 4 + 0);
-        bool _626 = inp == 1u;
-        bool _632;
-        if (_626)
-        {
-            _632 = bic.a == 0u;
-        }
-        else
-        {
-            _632 = _626;
-        }
-        if (_632)
-        {
-            bitmap |= (1u << (i_8 - 1u));
-        }
-        Bic _644 = { 1u - inp, inp };
-        Bic other_2 = _644;
-        Bic param_6 = other_2;
-        Bic param_7 = bic;
-        bic = bic_combine(param_6, param_7);
-    }
-    sh_bitmaps[th] = bitmap;
+    uint inp = _314.Load((((gl_GlobalInvocationID.x * 1u) + 1u) - 1u) * 4 + 0);
+    Bic _326 = { 1u - inp, inp };
+    bic = _326;
     sh_bic[th] = bic;
     uint inbase = 0u;
-    for (uint i_9 = 0u; i_9 < 5u; i_9++)
+    for (uint i_3 = 0u; i_3 < 8u; i_3++)
     {
-        uint outbase = 128u - (1u << (6u - i_9));
+        uint outbase = 1024u - (1u << (9u - i_3));
         GroupMemoryBarrierWithGroupSync();
-        if (th < (1u << (5u - i_9)))
+        if (th < (1u << (8u - i_3)))
         {
-            Bic param_8 = sh_bic[inbase + (th * 2u)];
-            Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
-            sh_bic[outbase + th] = bic_combine(param_8, param_9);
+            Bic param_4 = sh_bic[inbase + (th * 2u)];
+            Bic param_5 = sh_bic[(inbase + (th * 2u)) + 1u];
+            sh_bic[outbase + th] = bic_combine(param_4, param_5);
         }
         inbase = outbase;
     }
     GroupMemoryBarrierWithGroupSync();
-    bic.b = 0u;
-    Bic param_10 = bic;
-    uint _706 = search_link(param_10);
-    bic = param_10;
-    sh_link[th] = _706;
-    bic = _237;
-    Bic param_11 = bic;
-    uint _711 = search_link(param_11);
-    bic = param_11;
-    ix = _711;
-    uint loc_sp = 0u;
+    bic = _157;
+    Bic param_6 = bic;
+    uint _377 = search_link(param_6);
+    bic = param_6;
+    ix = _377;
     uint outp;
-    uint loc_stack[8];
-    for (uint i_10 = 0u; i_10 < 8u; i_10++)
+    if (ix > 0u)
     {
-        if (loc_sp > 0u)
-        {
-            outp = loc_stack[loc_sp - 1u];
-        }
-        else
-        {
-            if (int(ix) >= 0)
-            {
-                outp = (gl_WorkGroupID.x * 512u) + ix;
-            }
-            else
-            {
-                outp = sh_stack[512u + ix];
-            }
-        }
-        _751.Store(((gl_GlobalInvocationID.x * 8u) + i_10) * 4 + 0, outp);
-        inp = _593.Load(((gl_GlobalInvocationID.x * 8u) + i_10) * 4 + 0);
-        if (inp == 1u)
-        {
-            loc_stack[loc_sp] = (gl_GlobalInvocationID.x * 8u) + i_10;
-            loc_sp++;
-        }
-        else
-        {
-            if (inp == 0u)
-            {
-                if (loc_sp > 0u)
-                {
-                    loc_sp--;
-                }
-                else
-                {
-                    if (int(ix) >= 0)
-                    {
-                        uint bits_1 = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u);
-                        if (bits_1 == 0u)
-                        {
-                            ix = sh_link[ix / 8u];
-                        }
-                        else
-                        {
-                            ix = (ix & 4294967288u) + uint(int(firstbithigh(bits_1)));
-                        }
-                    }
-                    else
-                    {
-                        ix--;
-                    }
-                }
-            }
-        }
+        outp = ((gl_WorkGroupID.x * 512u) + ix) - 1u;
     }
+    else
+    {
+        outp = sh_stack[511u - bic.a];
+    }
+    _399.Store(gl_GlobalInvocationID.x * 4 + 0, outp);
 }
 
-[numthreads(64, 1, 1)]
+[numthreads(512, 1, 1)]
 void main(SPIRV_Cross_Input stage_input)
 {
     gl_WorkGroupID = stage_input.gl_WorkGroupID;
diff --git a/tests/shader/gen/stack_leaf.msl b/tests/shader/gen/stack_leaf.msl
index cbf72a3..13afb0a 100644
--- a/tests/shader/gen/stack_leaf.msl
+++ b/tests/shader/gen/stack_leaf.msl
@@ -1,56 +1,10 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
-#pragma clang diagnostic ignored "-Wmissing-braces"
 
 #include <metal_stdlib>
 #include <simd/simd.h>
 
 using namespace metal;
 
-template<typename T, size_t Num>
-struct spvUnsafeArray
-{
-    T elements[Num ? Num : 1];
-    
-    thread T& operator [] (size_t pos) thread
-    {
-        return elements[pos];
-    }
-    constexpr const thread T& operator [] (size_t pos) const thread
-    {
-        return elements[pos];
-    }
-    
-    device T& operator [] (size_t pos) device
-    {
-        return elements[pos];
-    }
-    constexpr const device T& operator [] (size_t pos) const device
-    {
-        return elements[pos];
-    }
-    
-    constexpr const constant T& operator [] (size_t pos) const constant
-    {
-        return elements[pos];
-    }
-    
-    threadgroup T& operator [] (size_t pos) threadgroup
-    {
-        return elements[pos];
-    }
-    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
-    {
-        return elements[pos];
-    }
-};
-
-// Implementation of the unsigned GLSL findMSB() function
-template<typename T>
-inline T spvFindUMSB(T x)
-{
-    return select(clz(T(0)) - (clz(x) + T(1)), T(-1), x == T(0));
-}
-
 struct Bic
 {
     uint a;
@@ -83,7 +37,7 @@
     uint outbuf[1];
 };
 
-constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
 
 static inline __attribute__((always_inline))
 Bic bic_combine(thread const Bic& x, thread const Bic& y)
@@ -93,28 +47,13 @@
 }
 
 static inline __attribute__((always_inline))
-uint search_bit_set(thread const uint& bitmask, thread const uint& ix)
-{
-    uint result = 0u;
-    for (uint j = 0u; j < 5u; j++)
-    {
-        uint _step = 1u << (4u - j);
-        if (uint(int(popcount(bitmask & ((1u << (result + _step)) - 1u)))) <= ix)
-        {
-            result += _step;
-        }
-    }
-    return result;
-}
-
-static inline __attribute__((always_inline))
-uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[126], threadgroup uint (&sh_bitmaps)[64])
+uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[1022])
 {
     uint ix = gl_LocalInvocationID.x;
     uint j = 0u;
-    while (j < 6u)
+    while (j < 9u)
     {
-        uint base = 128u - (2u << (6u - j));
+        uint base = 1024u - (2u << (9u - j));
         if (((ix >> j) & 1u) != 0u)
         {
             Bic param = sh_bic[(base + (ix >> j)) - 1u];
@@ -134,7 +73,7 @@
         while (j > 0u)
         {
             j--;
-            uint base_1 = 128u - (2u << (6u - j));
+            uint base_1 = 1024u - (2u << (9u - j));
             Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
             Bic param_3 = bic;
             Bic test_1 = bic_combine(param_2, param_3);
@@ -145,56 +84,39 @@
             }
         }
     }
-    if (ix > 0u)
-    {
-        ix--;
-        Bic param_4 = sh_bic[ix];
-        Bic param_5 = bic;
-        Bic test_2 = bic_combine(param_4, param_5);
-        uint param_6 = sh_bitmaps[ix];
-        uint param_7 = test_2.b - 1u;
-        uint ix_in_chunk = search_bit_set(param_6, param_7);
-        return (ix * 8u) + ix_in_chunk;
-    }
-    else
-    {
-        return 4294967295u - bic.a;
-    }
+    return ix;
 }
 
-kernel void main0(const device InBuf& _593 [[buffer(0)]], const device BicBuf& _250 [[buffer(1)]], const device StackBuf& _512 [[buffer(2)]], device OutBuf& _751 [[buffer(3)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(const device InBuf& _314 [[buffer(0)]], const device BicBuf& _170 [[buffer(1)]], const device StackBuf& _298 [[buffer(2)]], device OutBuf& _399 [[buffer(3)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    threadgroup Bic sh_bic[126];
-    threadgroup uint sh_bitmaps[64];
+    threadgroup Bic sh_bic[1022];
     threadgroup uint sh_stack[512];
-    threadgroup uint sh_link[64];
-    threadgroup uint sh_next[64];
     uint th = gl_LocalInvocationID.x;
     Bic bic = Bic{ 0u, 0u };
-    if ((th * 8u) < gl_WorkGroupID.x)
+    if ((th * 1u) < gl_WorkGroupID.x)
     {
-        uint _252 = th * 8u;
-        bic.a = _250.bicbuf[_252].a;
-        bic.b = _250.bicbuf[_252].b;
+        uint _172 = th * 1u;
+        bic.a = _170.bicbuf[_172].a;
+        bic.b = _170.bicbuf[_172].b;
     }
     Bic other;
-    for (uint i = 1u; i < 8u; i++)
+    for (uint i = 1u; i < 1u; i++)
     {
-        if (((th * 8u) + i) < gl_WorkGroupID.x)
+        if (((th * 1u) + i) < gl_WorkGroupID.x)
         {
-            uint _281 = (th * 8u) + i;
-            other.a = _250.bicbuf[_281].a;
-            other.b = _250.bicbuf[_281].b;
+            uint _201 = (th * 1u) + i;
+            other.a = _170.bicbuf[_201].a;
+            other.b = _170.bicbuf[_201].b;
             Bic param = bic;
             Bic param_1 = other;
             bic = bic_combine(param, param_1);
         }
     }
     sh_bic[th] = bic;
-    for (uint i_1 = 0u; i_1 < 6u; i_1++)
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        if ((th + (1u << i_1)) < 64u)
+        if ((th + (1u << i_1)) < 512u)
         {
             Bic other_1 = sh_bic[th + (1u << i_1)];
             Bic param_2 = bic;
@@ -205,203 +127,53 @@
         sh_bic[th] = bic;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (th == 63u)
-    {
-        bic = Bic{ 0u, 0u };
-    }
-    else
-    {
-        bic = sh_bic[th + 1u];
-    }
-    uint last_b = bic.b;
-    uint bitmap = 0u;
-    Bic param_4;
-    for (uint i_2 = 0u; i_2 < 8u; i_2++)
-    {
-        uint this_ix = (((th * 8u) + 8u) - 1u) - i_2;
-        if (this_ix < gl_WorkGroupID.x)
-        {
-            param_4.a = _250.bicbuf[this_ix].a;
-            param_4.b = _250.bicbuf[this_ix].b;
-            Bic param_5 = bic;
-            bic = bic_combine(param_4, param_5);
-        }
-        sh_stack[this_ix] = bic.b;
-        if (bic.b > last_b)
-        {
-            bitmap |= (1u << (7u - i_2));
-        }
-        last_b = bic.b;
-    }
-    sh_bitmaps[th] = bitmap;
-    uint link = 0u;
-    if (bitmap != 0u)
-    {
-        link = (th * 8u) + uint(int(spvFindUMSB(bitmap)));
-    }
-    sh_link[th] = link;
-    for (uint i_3 = 0u; i_3 < 6u; i_3++)
-    {
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        if (th >= (1u << i_3))
-        {
-            link = max(link, sh_link[th - (1u << i_3)]);
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        sh_link[th] = link;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    uint sp = 504u - (th * 8u);
+    uint sp = 511u - th;
     uint ix = 0u;
-    for (uint i_4 = 0u; i_4 < 9u; i_4++)
+    for (uint i_2 = 0u; i_2 < 9u; i_2++)
     {
-        uint probe = ix + (256u >> i_4);
-        if (sp < sh_stack[probe])
+        uint probe = ix + (256u >> i_2);
+        if (sp < sh_bic[probe].b)
         {
             ix = probe;
         }
     }
-    uint b = sh_stack[ix];
-    spvUnsafeArray<uint, 8> local_stack;
-    for (uint i_5 = 0u; i_5 < 8u; i_5++)
+    uint b = sh_bic[ix].b;
+    if (sp < b)
     {
-        local_stack[i_5] = 0u;
-    }
-    uint i_6 = 0u;
-    while ((sp + i_6) < b)
-    {
-        local_stack[7u - i_6] = _512.stack[(((ix * 512u) + b) - (sp + i_6)) - 1u];
-        i_6++;
-        if (i_6 == 8u)
-        {
-            break;
-        }
-        if ((sp + i_6) == b)
-        {
-            uint bits = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u);
-            if (bits == 0u)
-            {
-                ix = sh_link[max((ix / 8u), 1u) - 1u];
-            }
-            else
-            {
-                ix = (ix & 4294967288u) + uint(int(spvFindUMSB(bits)));
-            }
-            b = sh_stack[ix];
-        }
+        sh_stack[th] = _298.stack[(((ix * 512u) + b) - sp) - 1u];
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i_7 = 0u; i_7 < 8u; i_7++)
-    {
-        sh_stack[(th * 8u) + i_7] = local_stack[i_7];
-    }
-    uint inp = _593.inbuf[((gl_GlobalInvocationID.x * 8u) + 8u) - 1u];
+    uint inp = _314.inbuf[((gl_GlobalInvocationID.x * 1u) + 1u) - 1u];
     bic = Bic{ 1u - inp, inp };
-    bitmap = inp << uint(7);
-    for (uint i_8 = 7u; i_8 > 0u; i_8--)
-    {
-        inp = _593.inbuf[((gl_GlobalInvocationID.x * 8u) + i_8) - 1u];
-        bool _626 = inp == 1u;
-        bool _632;
-        if (_626)
-        {
-            _632 = bic.a == 0u;
-        }
-        else
-        {
-            _632 = _626;
-        }
-        if (_632)
-        {
-            bitmap |= (1u << (i_8 - 1u));
-        }
-        Bic other_2 = Bic{ 1u - inp, inp };
-        Bic param_6 = other_2;
-        Bic param_7 = bic;
-        bic = bic_combine(param_6, param_7);
-    }
-    sh_bitmaps[th] = bitmap;
     sh_bic[th] = bic;
     uint inbase = 0u;
-    for (uint i_9 = 0u; i_9 < 5u; i_9++)
+    for (uint i_3 = 0u; i_3 < 8u; i_3++)
     {
-        uint outbase = 128u - (1u << (6u - i_9));
+        uint outbase = 1024u - (1u << (9u - i_3));
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        if (th < (1u << (5u - i_9)))
+        if (th < (1u << (8u - i_3)))
         {
-            Bic param_8 = sh_bic[inbase + (th * 2u)];
-            Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
-            sh_bic[outbase + th] = bic_combine(param_8, param_9);
+            Bic param_4 = sh_bic[inbase + (th * 2u)];
+            Bic param_5 = sh_bic[(inbase + (th * 2u)) + 1u];
+            sh_bic[outbase + th] = bic_combine(param_4, param_5);
         }
         inbase = outbase;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    bic.b = 0u;
-    Bic param_10 = bic;
-    uint _706 = search_link(param_10, gl_LocalInvocationID, sh_bic, sh_bitmaps);
-    bic = param_10;
-    sh_link[th] = _706;
     bic = Bic{ 0u, 0u };
-    Bic param_11 = bic;
-    uint _711 = search_link(param_11, gl_LocalInvocationID, sh_bic, sh_bitmaps);
-    bic = param_11;
-    ix = _711;
-    uint loc_sp = 0u;
+    Bic param_6 = bic;
+    uint _377 = search_link(param_6, gl_LocalInvocationID, sh_bic);
+    bic = param_6;
+    ix = _377;
     uint outp;
-    spvUnsafeArray<uint, 8> loc_stack;
-    for (uint i_10 = 0u; i_10 < 8u; i_10++)
+    if (ix > 0u)
     {
-        if (loc_sp > 0u)
-        {
-            outp = loc_stack[loc_sp - 1u];
-        }
-        else
-        {
-            if (int(ix) >= 0)
-            {
-                outp = (gl_WorkGroupID.x * 512u) + ix;
-            }
-            else
-            {
-                outp = sh_stack[512u + ix];
-            }
-        }
-        _751.outbuf[(gl_GlobalInvocationID.x * 8u) + i_10] = outp;
-        inp = _593.inbuf[(gl_GlobalInvocationID.x * 8u) + i_10];
-        if (inp == 1u)
-        {
-            loc_stack[loc_sp] = (gl_GlobalInvocationID.x * 8u) + i_10;
-            loc_sp++;
-        }
-        else
-        {
-            if (inp == 0u)
-            {
-                if (loc_sp > 0u)
-                {
-                    loc_sp--;
-                }
-                else
-                {
-                    if (int(ix) >= 0)
-                    {
-                        uint bits_1 = sh_bitmaps[ix / 8u] & ((1u << (ix % 8u)) - 1u);
-                        if (bits_1 == 0u)
-                        {
-                            ix = sh_link[ix / 8u];
-                        }
-                        else
-                        {
-                            ix = (ix & 4294967288u) + uint(int(spvFindUMSB(bits_1)));
-                        }
-                    }
-                    else
-                    {
-                        ix--;
-                    }
-                }
-            }
-        }
+        outp = ((gl_WorkGroupID.x * 512u) + ix) - 1u;
     }
+    else
+    {
+        outp = sh_stack[511u - bic.a];
+    }
+    _399.outbuf[gl_GlobalInvocationID.x] = outp;
 }
 
diff --git a/tests/shader/gen/stack_leaf.spv b/tests/shader/gen/stack_leaf.spv
index 827ef14..527d20c 100644
--- a/tests/shader/gen/stack_leaf.spv
+++ b/tests/shader/gen/stack_leaf.spv
Binary files differ
diff --git a/tests/shader/gen/stack_reduce.dxil b/tests/shader/gen/stack_reduce.dxil
index fd97e87..50ceb82 100644
--- a/tests/shader/gen/stack_reduce.dxil
+++ b/tests/shader/gen/stack_reduce.dxil
Binary files differ
diff --git a/tests/shader/gen/stack_reduce.hlsl b/tests/shader/gen/stack_reduce.hlsl
index 0eab9cb..5df49cd 100644
--- a/tests/shader/gen/stack_reduce.hlsl
+++ b/tests/shader/gen/stack_reduce.hlsl
@@ -4,13 +4,13 @@
     uint b;
 };
 
-static const uint3 gl_WorkGroupSize = uint3(64u, 1u, 1u);
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
 
-static const Bic _175 = { 0u, 0u };
+static const Bic _174 = { 0u, 0u };
 
 ByteAddressBuffer _48 : register(t0);
-RWByteAddressBuffer _160 : register(u1);
-RWByteAddressBuffer _223 : register(u2);
+RWByteAddressBuffer _159 : register(u1);
+RWByteAddressBuffer _221 : register(u2);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -22,7 +22,7 @@
     uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
 };
 
-groupshared Bic sh_bic[64];
+groupshared Bic sh_bic[512];
 
 Bic bic_combine(Bic x, Bic y)
 {
@@ -33,24 +33,24 @@
 
 void comp_main()
 {
-    uint inp[8];
-    inp[0] = _48.Load((gl_GlobalInvocationID.x * 8u) * 4 + 0);
-    Bic _68 = { 1u - inp[0], inp[0] };
-    Bic bic = _68;
-    for (uint i = 1u; i < 8u; i++)
+    uint inp[1];
+    inp[0] = _48.Load((gl_GlobalInvocationID.x * 1u) * 4 + 0);
+    Bic _67 = { 1u - inp[0], inp[0] };
+    Bic bic = _67;
+    for (uint i = 1u; i < 1u; i++)
     {
-        inp[i] = _48.Load(((gl_GlobalInvocationID.x * 8u) + i) * 4 + 0);
-        Bic _95 = { 1u - inp[i], inp[i] };
-        Bic other = _95;
+        inp[i] = _48.Load(((gl_GlobalInvocationID.x * 1u) + i) * 4 + 0);
+        Bic _94 = { 1u - inp[i], inp[i] };
+        Bic other = _94;
         Bic param = bic;
         Bic param_1 = other;
         bic = bic_combine(param, param_1);
     }
     sh_bic[gl_LocalInvocationID.x] = bic;
-    for (uint i_1 = 0u; i_1 < 6u; i_1++)
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
     {
         GroupMemoryBarrierWithGroupSync();
-        if ((gl_LocalInvocationID.x + (1u << i_1)) < 64u)
+        if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
         {
             Bic other_1 = sh_bic[gl_LocalInvocationID.x + (1u << i_1)];
             Bic param_2 = bic;
@@ -62,43 +62,43 @@
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _160.Store(gl_WorkGroupID.x * 8 + 0, bic.a);
-        _160.Store(gl_WorkGroupID.x * 8 + 4, bic.b);
+        _159.Store(gl_WorkGroupID.x * 8 + 0, bic.a);
+        _159.Store(gl_WorkGroupID.x * 8 + 4, bic.b);
     }
     GroupMemoryBarrierWithGroupSync();
     uint size = sh_bic[0].b;
-    bic = _175;
-    if ((gl_LocalInvocationID.x + 1u) < 64u)
+    bic = _174;
+    if ((gl_LocalInvocationID.x + 1u) < 512u)
     {
         bic = sh_bic[gl_LocalInvocationID.x + 1u];
     }
     uint out_ix = ((gl_WorkGroupID.x * 512u) + size) - bic.b;
-    for (uint i_2 = 8u; i_2 > 0u; i_2--)
+    for (uint i_2 = 1u; i_2 > 0u; i_2--)
     {
-        bool _209 = inp[i_2 - 1u] == 1u;
-        bool _215;
-        if (_209)
+        bool _207 = inp[i_2 - 1u] == 1u;
+        bool _213;
+        if (_207)
         {
-            _215 = bic.a == 0u;
+            _213 = bic.a == 0u;
         }
         else
         {
-            _215 = _209;
+            _213 = _207;
         }
-        if (_215)
+        if (_213)
         {
             out_ix--;
-            _223.Store(out_ix * 4 + 0, ((gl_GlobalInvocationID.x * 8u) + i_2) - 1u);
+            _221.Store(out_ix * 4 + 0, ((gl_GlobalInvocationID.x * 1u) + i_2) - 1u);
         }
-        Bic _242 = { 1u - inp[i_2 - 1u], inp[i_2 - 1u] };
-        Bic other_2 = _242;
+        Bic _240 = { 1u - inp[i_2 - 1u], inp[i_2 - 1u] };
+        Bic other_2 = _240;
         Bic param_4 = other_2;
         Bic param_5 = bic;
         bic = bic_combine(param_4, param_5);
     }
 }
 
-[numthreads(64, 1, 1)]
+[numthreads(512, 1, 1)]
 void main(SPIRV_Cross_Input stage_input)
 {
     gl_WorkGroupID = stage_input.gl_WorkGroupID;
diff --git a/tests/shader/gen/stack_reduce.msl b/tests/shader/gen/stack_reduce.msl
index 2723dee..13ce9fa 100644
--- a/tests/shader/gen/stack_reduce.msl
+++ b/tests/shader/gen/stack_reduce.msl
@@ -71,7 +71,7 @@
     uint stack[1];
 };
 
-constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
 
 static inline __attribute__((always_inline))
 Bic bic_combine(thread const Bic& x, thread const Bic& y)
@@ -80,25 +80,25 @@
     return Bic{ (x.a + y.a) - m, (x.b + y.b) - m };
 }
 
-kernel void main0(const device InBuf& _48 [[buffer(0)]], device OutBuf& _160 [[buffer(1)]], device StackBuf& _223 [[buffer(2)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device InBuf& _48 [[buffer(0)]], device OutBuf& _159 [[buffer(1)]], device StackBuf& _221 [[buffer(2)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
-    threadgroup Bic sh_bic[64];
-    spvUnsafeArray<uint, 8> inp;
-    inp[0] = _48.inbuf[gl_GlobalInvocationID.x * 8u];
+    threadgroup Bic sh_bic[512];
+    spvUnsafeArray<uint, 1> inp;
+    inp[0] = _48.inbuf[gl_GlobalInvocationID.x * 1u];
     Bic bic = Bic{ 1u - inp[0], inp[0] };
-    for (uint i = 1u; i < 8u; i++)
+    for (uint i = 1u; i < 1u; i++)
     {
-        inp[i] = _48.inbuf[(gl_GlobalInvocationID.x * 8u) + i];
+        inp[i] = _48.inbuf[(gl_GlobalInvocationID.x * 1u) + i];
         Bic other = Bic{ 1u - inp[i], inp[i] };
         Bic param = bic;
         Bic param_1 = other;
         bic = bic_combine(param, param_1);
     }
     sh_bic[gl_LocalInvocationID.x] = bic;
-    for (uint i_1 = 0u; i_1 < 6u; i_1++)
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        if ((gl_LocalInvocationID.x + (1u << i_1)) < 64u)
+        if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
         {
             Bic other_1 = sh_bic[gl_LocalInvocationID.x + (1u << i_1)];
             Bic param_2 = bic;
@@ -110,33 +110,33 @@
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _160.outbuf[gl_WorkGroupID.x].a = bic.a;
-        _160.outbuf[gl_WorkGroupID.x].b = bic.b;
+        _159.outbuf[gl_WorkGroupID.x].a = bic.a;
+        _159.outbuf[gl_WorkGroupID.x].b = bic.b;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint size = sh_bic[0].b;
     bic = Bic{ 0u, 0u };
-    if ((gl_LocalInvocationID.x + 1u) < 64u)
+    if ((gl_LocalInvocationID.x + 1u) < 512u)
     {
         bic = sh_bic[gl_LocalInvocationID.x + 1u];
     }
     uint out_ix = ((gl_WorkGroupID.x * 512u) + size) - bic.b;
-    for (uint i_2 = 8u; i_2 > 0u; i_2--)
+    for (uint i_2 = 1u; i_2 > 0u; i_2--)
     {
-        bool _209 = inp[i_2 - 1u] == 1u;
-        bool _215;
-        if (_209)
+        bool _207 = inp[i_2 - 1u] == 1u;
+        bool _213;
+        if (_207)
         {
-            _215 = bic.a == 0u;
+            _213 = bic.a == 0u;
         }
         else
         {
-            _215 = _209;
+            _213 = _207;
         }
-        if (_215)
+        if (_213)
         {
             out_ix--;
-            _223.stack[out_ix] = ((gl_GlobalInvocationID.x * 8u) + i_2) - 1u;
+            _221.stack[out_ix] = ((gl_GlobalInvocationID.x * 1u) + i_2) - 1u;
         }
         Bic other_2 = Bic{ 1u - inp[i_2 - 1u], inp[i_2 - 1u] };
         Bic param_4 = other_2;
diff --git a/tests/shader/gen/stack_reduce.spv b/tests/shader/gen/stack_reduce.spv
index 90be43a..733c3a0 100644
--- a/tests/shader/gen/stack_reduce.spv
+++ b/tests/shader/gen/stack_reduce.spv
Binary files differ
diff --git a/tests/shader/intersection_leaf.comp b/tests/shader/intersection_leaf.comp
index c3e594b..04918f1 100644
--- a/tests/shader/intersection_leaf.comp
+++ b/tests/shader/intersection_leaf.comp
@@ -6,7 +6,7 @@
 
 #define LG_N_SEQ 0
 #define N_SEQ (1 << LG_N_SEQ)
-#define LG_WG_SIZE 8
+#define LG_WG_SIZE 9
 #define WG_SIZE (1 << LG_WG_SIZE)
 #define PART_SIZE (WG_SIZE * N_SEQ)
 
diff --git a/tests/shader/intersection_reduce.comp b/tests/shader/intersection_reduce.comp
index cc1e600..19e568b 100644
--- a/tests/shader/intersection_reduce.comp
+++ b/tests/shader/intersection_reduce.comp
@@ -5,7 +5,7 @@
 #version 450
 
 // At least for now, N_SEQ is hardwired at 1
-#define LG_WG_SIZE 8
+#define LG_WG_SIZE 9
 #define WG_SIZE (1 << LG_WG_SIZE)
 #define PART_SIZE WG_SIZE
 
diff --git a/tests/shader/stack_leaf.comp b/tests/shader/stack_leaf.comp
index 1cdb099..23bf47a 100644
--- a/tests/shader/stack_leaf.comp
+++ b/tests/shader/stack_leaf.comp
@@ -4,9 +4,9 @@
 
 #version 450
 
-#define LG_N_SEQ 3
+#define LG_N_SEQ 0
 #define N_SEQ (1 << LG_N_SEQ)
-#define LG_WG_SIZE 6
+#define LG_WG_SIZE 9
 #define WG_SIZE (1 << LG_WG_SIZE)
 #define PART_SIZE (WG_SIZE * N_SEQ)
 
@@ -232,6 +232,7 @@
         sh_stack[th] = stack[ix * PART_SIZE + b - sp - 1];
     }
 #endif
+    barrier();
 
     // Do tree reduction of bicyclic semigroups (up-sweep)
     uint inp = inbuf[gl_GlobalInvocationID.x * N_SEQ + N_SEQ - 1];
diff --git a/tests/shader/stack_reduce.comp b/tests/shader/stack_reduce.comp
index b066135..71da8b6 100644
--- a/tests/shader/stack_reduce.comp
+++ b/tests/shader/stack_reduce.comp
@@ -4,8 +4,8 @@
 
 #version 450
 
-#define N_SEQ 8
-#define LG_WG_SIZE 6
+#define N_SEQ 1
+#define LG_WG_SIZE 9
 #define WG_SIZE (1 << LG_WG_SIZE)
 #define PART_SIZE (WG_SIZE * N_SEQ)
 
diff --git a/tests/shader/union_leaf.comp b/tests/shader/union_leaf.comp
index e40bc6d..49775eb 100644
--- a/tests/shader/union_leaf.comp
+++ b/tests/shader/union_leaf.comp
@@ -6,7 +6,7 @@
 
 #define LG_N_SEQ 0
 #define N_SEQ (1 << LG_N_SEQ)
-#define LG_WG_SIZE 8
+#define LG_WG_SIZE 9
 #define WG_SIZE (1 << LG_WG_SIZE)
 #define PART_SIZE (WG_SIZE * N_SEQ)
 
diff --git a/tests/shader/union_reduce.comp b/tests/shader/union_reduce.comp
index f584ae3..fec376c 100644
--- a/tests/shader/union_reduce.comp
+++ b/tests/shader/union_reduce.comp
@@ -5,7 +5,7 @@
 #version 450
 
 // At least for now, N_SEQ is hardwired at 1
-#define LG_WG_SIZE 8
+#define LG_WG_SIZE 9
 #define WG_SIZE (1 << LG_WG_SIZE)
 #define PART_SIZE WG_SIZE
 
diff --git a/tests/src/bbox_intersection.rs b/tests/src/bbox_intersection.rs
index fac9704..2c006a8 100644
--- a/tests/src/bbox_intersection.rs
+++ b/tests/src/bbox_intersection.rs
@@ -24,7 +24,7 @@
 use crate::runner::Runner;
 use crate::test_result::TestResult;
 
-const WG_SIZE: u64 = 256;
+const WG_SIZE: u64 = 512;
 const N_ROWS: u64 = 1;
 const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
 
diff --git a/tests/src/bbox_union.rs b/tests/src/bbox_union.rs
index ca2224c..bf77f59 100644
--- a/tests/src/bbox_union.rs
+++ b/tests/src/bbox_union.rs
@@ -24,7 +24,7 @@
 use crate::runner::Runner;
 use crate::test_result::TestResult;
 
-const WG_SIZE: u64 = 256;
+const WG_SIZE: u64 = 512;
 const N_ROWS: u64 = 1;
 const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
 
diff --git a/tests/src/stack.rs b/tests/src/stack.rs
index aba875b..923010d 100644
--- a/tests/src/stack.rs
+++ b/tests/src/stack.rs
@@ -21,8 +21,8 @@
 use crate::runner::Runner;
 use crate::test_result::TestResult;
 
-const WG_SIZE: u64 = 64;
-const N_ROWS: u64 = 8;
+const WG_SIZE: u64 = 512;
+const N_ROWS: u64 = 1;
 const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
 
 struct StackCode {