Threadgroups for block transpose
Sadly, this is a performance drop compared to the previous subgroup-based implementation.
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 14e67da..ace8617 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -4,7 +4,7 @@
glslang_validator = glslangValidator
-target = vulkan1.1
+target = vulkan1.0
rule glsl
command = $glslang_validator -V -o $out $in --target-env $target
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index d1ea94f..c4eb12e 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -2,9 +2,6 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
-#extension GL_KHR_shader_subgroup_ballot : enable
-#extension GL_KHR_shader_subgroup_arithmetic : enable
-#extension GL_KHR_shader_subgroup_shuffle : enable
#include "setup.h"
@@ -97,11 +94,10 @@
return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
}
-// Implementation of 16 x 16 boolean matrix transpose, using subgroups.
-uint block_swap(uint a, uint m, uint s) {
- uint b = subgroupShuffleXor(a, s);
+// Implementation of 16 x 16 boolean matrix transpose, using threadgroup shared memory.
+uint block_swap(uint a, uint b, uint m, uint s) {
uint c;
- if ((gl_SubgroupInvocationID & s) == 0) {
+ if ((gl_LocalInvocationID.x & s) == 0) {
c = b << s;
} else {
m = ~m;
@@ -112,9 +108,14 @@
const uint masks[4] = uint[4](0x55555555, 0x33333333, 0xf0f0f0f, 0xff00ff);
+shared uint tg_bms[N_TILE];
+
uint transpose(uint bitmask) {
for (uint i = 0; i < 4; i++) {
- bitmask = block_swap(bitmask, masks[i], 1 << i);
+ tg_bms[gl_LocalInvocationID.x] = bitmask;
+ barrier();
+ bitmask = block_swap(bitmask, tg_bms[gl_LocalInvocationID.x ^ (1 << i)], masks[i], 1 << i);
+ barrier();
}
return bitmask;
}
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index fc225ae..b601f18 100644
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
Binary files differ