Threadgroups for block transpose. A sad perf drop from subgroups.

commit: e23cba84d46510cee2febc638fa062cf37ea3bd2 [log] [tgz]
author: Raph Levien <raph.levien@gmail.com> Tue May 26 13:53:38 2020 -0700
committer: Raph Levien <raph.levien@gmail.com> Tue May 26 13:53:38 2020 -0700
tree: 8838ca3d82f220fdc1a2ec629fba9ea8d46a65cb
parent: c70cb072902f7993e2ad3a43b53d1e53dc470000 [diff]
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 14e67da..ace8617 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja

@@ -4,7 +4,7 @@
 
 glslang_validator = glslangValidator
 
-target = vulkan1.1
+target = vulkan1.0
 
 rule glsl
   command = $glslang_validator -V -o $out $in --target-env $target

diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index d1ea94f..c4eb12e 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp

@@ -2,9 +2,6 @@
 
 #version 450
 #extension GL_GOOGLE_include_directive : enable
-#extension GL_KHR_shader_subgroup_ballot : enable
-#extension GL_KHR_shader_subgroup_arithmetic : enable
-#extension GL_KHR_shader_subgroup_shuffle : enable
 
 #include "setup.h"
 
@@ -97,11 +94,10 @@
     return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
 }
 
-// Implementation of 16 x 16 boolean matrix transpose, using subgroups.
-uint block_swap(uint a, uint m, uint s) {
-    uint b = subgroupShuffleXor(a, s);
+// Implementation of 16 x 16 boolean matrix transpose, using threadgroup shared memory.
+uint block_swap(uint a, uint b, uint m, uint s) {
     uint c;
-    if ((gl_SubgroupInvocationID & s) == 0) {
+    if ((gl_LocalInvocationID.x & s) == 0) {
         c = b << s;
     } else {
         m = ~m;
@@ -112,9 +108,14 @@
 
 const uint masks[4] = uint[4](0x55555555, 0x33333333, 0xf0f0f0f, 0xff00ff);
 
+shared uint tg_bms[N_TILE];
+
 uint transpose(uint bitmask) {
     for (uint i = 0; i < 4; i++) {
-        bitmask = block_swap(bitmask, masks[i], 1 << i);
+        tg_bms[gl_LocalInvocationID.x] = bitmask;
+        barrier();
+        bitmask = block_swap(bitmask, tg_bms[gl_LocalInvocationID.x ^ (1 << i)], masks[i], 1 << i);
+        barrier();
     }
     return bitmask;
 }

diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index fc225ae..b601f18 100644
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
Binary files differ
commit	e23cba84d46510cee2febc638fa062cf37ea3bd2	[log] [tgz]
author	Raph Levien <raph.levien@gmail.com>	Tue May 26 13:53:38 2020 -0700
committer	Raph Levien <raph.levien@gmail.com>	Tue May 26 13:53:38 2020 -0700
tree	8838ca3d82f220fdc1a2ec629fba9ea8d46a65cb
parent	c70cb072902f7993e2ad3a43b53d1e53dc470000 [diff]