Threadgroups for block transpose. A sad perf drop compared to subgroups.
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 14e67da..ace8617 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja
@@ -4,7 +4,7 @@ glslang_validator = glslangValidator -target = vulkan1.1 +target = vulkan1.0 rule glsl command = $glslang_validator -V -o $out $in --target-env $target
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index d1ea94f..c4eb12e 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp
@@ -2,9 +2,6 @@ #version 450 #extension GL_GOOGLE_include_directive : enable -#extension GL_KHR_shader_subgroup_ballot : enable -#extension GL_KHR_shader_subgroup_arithmetic : enable -#extension GL_KHR_shader_subgroup_shuffle : enable #include "setup.h" @@ -97,11 +94,10 @@ return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign); } -// Implementation of 16 x 16 boolean matrix transpose, using subgroups. -uint block_swap(uint a, uint m, uint s) { - uint b = subgroupShuffleXor(a, s); +// Implementation of 16 x 16 boolean matrix transpose, using threadgroup shared memory. +uint block_swap(uint a, uint b, uint m, uint s) { uint c; - if ((gl_SubgroupInvocationID & s) == 0) { + if ((gl_LocalInvocationID.x & s) == 0) { c = b << s; } else { m = ~m; @@ -112,9 +108,14 @@ const uint masks[4] = uint[4](0x55555555, 0x33333333, 0xf0f0f0f, 0xff00ff); +shared uint tg_bms[N_TILE]; + uint transpose(uint bitmask) { for (uint i = 0; i < 4; i++) { - bitmask = block_swap(bitmask, masks[i], 1 << i); + tg_bms[gl_LocalInvocationID.x] = bitmask; + barrier(); + bitmask = block_swap(bitmask, tg_bms[gl_LocalInvocationID.x ^ (1 << i)], masks[i], 1 << i); + barrier(); } return bitmask; }
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index fc225ae..b601f18 100644 --- a/piet-gpu/shader/coarse.spv +++ b/piet-gpu/shader/coarse.spv Binary files differ