Threadgroups for block transpose

Sadly, this is a performance drop compared with the subgroup-based version.
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 14e67da..ace8617 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -4,7 +4,7 @@
 
 glslang_validator = glslangValidator
 
-target = vulkan1.1
+target = vulkan1.0
 
 rule glsl
   command = $glslang_validator -V -o $out $in --target-env $target
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index d1ea94f..c4eb12e 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -2,9 +2,6 @@
 
 #version 450
 #extension GL_GOOGLE_include_directive : enable
-#extension GL_KHR_shader_subgroup_ballot : enable
-#extension GL_KHR_shader_subgroup_arithmetic : enable
-#extension GL_KHR_shader_subgroup_shuffle : enable
 
 #include "setup.h"
 
@@ -97,11 +94,10 @@
     return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
 }
 
-// Implementation of 16 x 16 boolean matrix transpose, using subgroups.
-uint block_swap(uint a, uint m, uint s) {
-    uint b = subgroupShuffleXor(a, s);
+// Implementation of 16 x 16 boolean matrix transpose, using threadgroup shared memory.
+uint block_swap(uint a, uint b, uint m, uint s) {
     uint c;
-    if ((gl_SubgroupInvocationID & s) == 0) {
+    if ((gl_LocalInvocationID.x & s) == 0) {
         c = b << s;
     } else {
         m = ~m;
@@ -112,9 +108,14 @@
 
 const uint masks[4] = uint[4](0x55555555, 0x33333333, 0xf0f0f0f, 0xff00ff);
 
+shared uint tg_bms[N_TILE]; // workgroup-shared scratch array; transpose() indexes it by gl_LocalInvocationID.x
+
 uint transpose(uint bitmask) {
     for (uint i = 0; i < 4; i++) { // four exchange rounds, one per masks[] entry, with shift 1 << i
-        bitmask = block_swap(bitmask, masks[i], 1 << i);
+        tg_bms[gl_LocalInvocationID.x] = bitmask; // publish this invocation's current value
+        barrier(); // wait until the workgroup has finished the writes above before reading a partner's slot
+        bitmask = block_swap(bitmask, tg_bms[gl_LocalInvocationID.x ^ (1 << i)], masks[i], 1 << i); // partner slot: index differs from ours only in bit i
+        barrier(); // keep the next round's write from overwriting a slot that a partner has not read yet
     }
     return bitmask;
 }
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index fc225ae..b601f18 100644
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
Binary files differ