[sksl] Reduce memory-order constraints of workgroupBarrier()

The workgroupBarrier() intrinsic was implemented in Metal with combined
device, threadgroup, and texture address space constraints. This scope
is too broad for many use cases that need the constraints to apply over
a narrower subset (workgroup shared memory in particular).

This changes the barrier intrinsic definitions to follow the WGSL
approach which should map cleanly to Metal, WebGPU, Vulkan, GL, and
D3D12:

1. workgroupBarrier() now only implies memory ordering semantics in the
   workgroup address space (i.e. `mem_flags::mem_threadgroup` in Metal).

2. Introduced a storageBarrier() intrinsic that provides memory
   ordering guarantees in the storage and uniform address spaces (i.e.
   `mem_flags::mem_device` in Metal).

Both intrinsics are intended to be control-barriers that operate in
workgroup execution and memory scope but with different memory
semantics. We can introduce new intrinsics if we want barriers for new
memory semantics (e.g. texture), which is the model used in D3D and
OpenGL.

An alternative design is to have a single intrinsic that accepts a
bitmask (like in Vulkan and Metal) and have the individual bits map to
separate intrinsic calls in WGSL and GLSL.

P.S.: Also re-formatted the intrinsic list X-macro in SkSLIntrinsicList.h.

Change-Id: I00a7450fc27e5feefb318262743c3229dcd4f700
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/608719
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Arman Uguray <armansito@google.com>
diff --git a/gn/sksl_tests.gni b/gn/sksl_tests.gni
index 0b1bcf4..297c22d 100644
--- a/gn/sksl_tests.gni
+++ b/gn/sksl_tests.gni
@@ -305,6 +305,7 @@
 
 sksl_metal_tests = [
   "compute/ArrayAdd.compute",
+  "compute/Barrier.compute",
   "compute/Desaturate.compute",
   "compute/DesaturateFunction.compute",
   "compute/DesaturateReadWrite.compute",
diff --git a/resources/sksl/BUILD.bazel b/resources/sksl/BUILD.bazel
index db3d79b..5d6542b 100644
--- a/resources/sksl/BUILD.bazel
+++ b/resources/sksl/BUILD.bazel
@@ -512,6 +512,7 @@
     name = "sksl_metal_tests",
     srcs = [
         "compute/ArrayAdd.compute",
+        "compute/Barrier.compute",
         "compute/Desaturate.compute",
         "compute/DesaturateFunction.compute",
         "compute/DesaturateReadWrite.compute",
diff --git a/resources/sksl/compute/Barrier.compute b/resources/sksl/compute/Barrier.compute
new file mode 100644
index 0000000..2536720
--- /dev/null
+++ b/resources/sksl/compute/Barrier.compute
@@ -0,0 +1,4 @@
+void main() {
+    workgroupBarrier();
+    storageBarrier();
+}
diff --git a/src/sksl/SkSLIntrinsicList.h b/src/sksl/SkSLIntrinsicList.h
index 3f6b092..d204d0d 100644
--- a/src/sksl/SkSLIntrinsicList.h
+++ b/src/sksl/SkSLIntrinsicList.h
@@ -16,107 +16,108 @@
 
 // A list of every intrinsic supported by SkSL.
 // Using an X-Macro (https://en.wikipedia.org/wiki/X_Macro) to manage the list.
-#define SKSL_INTRINSIC_LIST            \
-    SKSL_INTRINSIC(abs)                \
-    SKSL_INTRINSIC(acosh)              \
-    SKSL_INTRINSIC(acos)               \
-    SKSL_INTRINSIC(all)                \
-    SKSL_INTRINSIC(any)                \
-    SKSL_INTRINSIC(asinh)              \
-    SKSL_INTRINSIC(asin)               \
-    SKSL_INTRINSIC(atanh)              \
-    SKSL_INTRINSIC(atan)               \
-    SKSL_INTRINSIC(bitCount)           \
-    SKSL_INTRINSIC(ceil)               \
-    SKSL_INTRINSIC(clamp)              \
-    SKSL_INTRINSIC(cosh)               \
-    SKSL_INTRINSIC(cos)                \
-    SKSL_INTRINSIC(cross)              \
-    SKSL_INTRINSIC(degrees)            \
-    SKSL_INTRINSIC(determinant)        \
-    SKSL_INTRINSIC(dFdx)               \
-    SKSL_INTRINSIC(dFdy)               \
-    SKSL_INTRINSIC(distance)           \
-    SKSL_INTRINSIC(dot)                \
-    SKSL_INTRINSIC(equal)              \
-    SKSL_INTRINSIC(eval)               \
-    SKSL_INTRINSIC(exp2)               \
-    SKSL_INTRINSIC(exp)                \
-    SKSL_INTRINSIC(faceforward)        \
-    SKSL_INTRINSIC(findLSB)            \
-    SKSL_INTRINSIC(findMSB)            \
-    SKSL_INTRINSIC(floatBitsToInt)     \
-    SKSL_INTRINSIC(floatBitsToUint)    \
-    SKSL_INTRINSIC(floor)              \
-    SKSL_INTRINSIC(fma)                \
-    SKSL_INTRINSIC(fract)              \
-    SKSL_INTRINSIC(frexp)              \
-    SKSL_INTRINSIC(fromLinearSrgb)     \
-    SKSL_INTRINSIC(fwidth)             \
-    SKSL_INTRINSIC(greaterThanEqual)   \
-    SKSL_INTRINSIC(greaterThan)        \
-    SKSL_INTRINSIC(height)             \
-    SKSL_INTRINSIC(intBitsToFloat)     \
-    SKSL_INTRINSIC(inversesqrt)        \
-    SKSL_INTRINSIC(inverse)            \
-    SKSL_INTRINSIC(isinf)              \
-    SKSL_INTRINSIC(isnan)              \
-    SKSL_INTRINSIC(ldexp)              \
-    SKSL_INTRINSIC(length)             \
-    SKSL_INTRINSIC(lessThanEqual)      \
-    SKSL_INTRINSIC(lessThan)           \
-    SKSL_INTRINSIC(log2)               \
-    SKSL_INTRINSIC(log)                \
-    SKSL_INTRINSIC(makeSampler2D)      \
-    SKSL_INTRINSIC(matrixCompMult)     \
-    SKSL_INTRINSIC(matrixInverse)      \
-    SKSL_INTRINSIC(max)                \
-    SKSL_INTRINSIC(min)                \
-    SKSL_INTRINSIC(mix)                \
-    SKSL_INTRINSIC(modf)               \
-    SKSL_INTRINSIC(mod)                \
-    SKSL_INTRINSIC(normalize)          \
-    SKSL_INTRINSIC(notEqual)           \
-    SKSL_INTRINSIC(not)                \
-    SKSL_INTRINSIC(outerProduct)       \
-    SKSL_INTRINSIC(packDouble2x32)     \
-    SKSL_INTRINSIC(packHalf2x16)       \
-    SKSL_INTRINSIC(packSnorm2x16)      \
-    SKSL_INTRINSIC(packSnorm4x8)       \
-    SKSL_INTRINSIC(packUnorm2x16)      \
-    SKSL_INTRINSIC(packUnorm4x8)       \
-    SKSL_INTRINSIC(pow)                \
-    SKSL_INTRINSIC(radians)            \
-    SKSL_INTRINSIC(read)               \
-    SKSL_INTRINSIC(reflect)            \
-    SKSL_INTRINSIC(refract)            \
-    SKSL_INTRINSIC(roundEven)          \
-    SKSL_INTRINSIC(round)              \
-    SKSL_INTRINSIC(sample)             \
-    SKSL_INTRINSIC(sampleGrad)         \
-    SKSL_INTRINSIC(sampleLod)          \
-    SKSL_INTRINSIC(saturate)           \
-    SKSL_INTRINSIC(sign)               \
-    SKSL_INTRINSIC(sinh)               \
-    SKSL_INTRINSIC(sin)                \
-    SKSL_INTRINSIC(smoothstep)         \
-    SKSL_INTRINSIC(sqrt)               \
-    SKSL_INTRINSIC(step)               \
-    SKSL_INTRINSIC(subpassLoad)        \
-    SKSL_INTRINSIC(tanh)               \
-    SKSL_INTRINSIC(tan)                \
-    SKSL_INTRINSIC(toLinearSrgb)       \
-    SKSL_INTRINSIC(transpose)          \
-    SKSL_INTRINSIC(trunc)              \
-    SKSL_INTRINSIC(uintBitsToFloat)    \
-    SKSL_INTRINSIC(unpackDouble2x32)   \
-    SKSL_INTRINSIC(unpackHalf2x16)     \
-    SKSL_INTRINSIC(unpackSnorm2x16)    \
-    SKSL_INTRINSIC(unpackSnorm4x8)     \
-    SKSL_INTRINSIC(unpackUnorm2x16)    \
-    SKSL_INTRINSIC(unpackUnorm4x8)     \
-    SKSL_INTRINSIC(width)              \
-    SKSL_INTRINSIC(workgroupBarrier)   \
+#define SKSL_INTRINSIC_LIST          \
+    SKSL_INTRINSIC(abs)              \
+    SKSL_INTRINSIC(acosh)            \
+    SKSL_INTRINSIC(acos)             \
+    SKSL_INTRINSIC(all)              \
+    SKSL_INTRINSIC(any)              \
+    SKSL_INTRINSIC(asinh)            \
+    SKSL_INTRINSIC(asin)             \
+    SKSL_INTRINSIC(atanh)            \
+    SKSL_INTRINSIC(atan)             \
+    SKSL_INTRINSIC(bitCount)         \
+    SKSL_INTRINSIC(ceil)             \
+    SKSL_INTRINSIC(clamp)            \
+    SKSL_INTRINSIC(cosh)             \
+    SKSL_INTRINSIC(cos)              \
+    SKSL_INTRINSIC(cross)            \
+    SKSL_INTRINSIC(degrees)          \
+    SKSL_INTRINSIC(determinant)      \
+    SKSL_INTRINSIC(dFdx)             \
+    SKSL_INTRINSIC(dFdy)             \
+    SKSL_INTRINSIC(distance)         \
+    SKSL_INTRINSIC(dot)              \
+    SKSL_INTRINSIC(equal)            \
+    SKSL_INTRINSIC(eval)             \
+    SKSL_INTRINSIC(exp2)             \
+    SKSL_INTRINSIC(exp)              \
+    SKSL_INTRINSIC(faceforward)      \
+    SKSL_INTRINSIC(findLSB)          \
+    SKSL_INTRINSIC(findMSB)          \
+    SKSL_INTRINSIC(floatBitsToInt)   \
+    SKSL_INTRINSIC(floatBitsToUint)  \
+    SKSL_INTRINSIC(floor)            \
+    SKSL_INTRINSIC(fma)              \
+    SKSL_INTRINSIC(fract)            \
+    SKSL_INTRINSIC(frexp)            \
+    SKSL_INTRINSIC(fromLinearSrgb)   \
+    SKSL_INTRINSIC(fwidth)           \
+    SKSL_INTRINSIC(greaterThanEqual) \
+    SKSL_INTRINSIC(greaterThan)      \
+    SKSL_INTRINSIC(height)           \
+    SKSL_INTRINSIC(intBitsToFloat)   \
+    SKSL_INTRINSIC(inversesqrt)      \
+    SKSL_INTRINSIC(inverse)          \
+    SKSL_INTRINSIC(isinf)            \
+    SKSL_INTRINSIC(isnan)            \
+    SKSL_INTRINSIC(ldexp)            \
+    SKSL_INTRINSIC(length)           \
+    SKSL_INTRINSIC(lessThanEqual)    \
+    SKSL_INTRINSIC(lessThan)         \
+    SKSL_INTRINSIC(log2)             \
+    SKSL_INTRINSIC(log)              \
+    SKSL_INTRINSIC(makeSampler2D)    \
+    SKSL_INTRINSIC(matrixCompMult)   \
+    SKSL_INTRINSIC(matrixInverse)    \
+    SKSL_INTRINSIC(max)              \
+    SKSL_INTRINSIC(min)              \
+    SKSL_INTRINSIC(mix)              \
+    SKSL_INTRINSIC(modf)             \
+    SKSL_INTRINSIC(mod)              \
+    SKSL_INTRINSIC(normalize)        \
+    SKSL_INTRINSIC(notEqual)         \
+    SKSL_INTRINSIC(not )             \
+    SKSL_INTRINSIC(outerProduct)     \
+    SKSL_INTRINSIC(packDouble2x32)   \
+    SKSL_INTRINSIC(packHalf2x16)     \
+    SKSL_INTRINSIC(packSnorm2x16)    \
+    SKSL_INTRINSIC(packSnorm4x8)     \
+    SKSL_INTRINSIC(packUnorm2x16)    \
+    SKSL_INTRINSIC(packUnorm4x8)     \
+    SKSL_INTRINSIC(pow)              \
+    SKSL_INTRINSIC(radians)          \
+    SKSL_INTRINSIC(read)             \
+    SKSL_INTRINSIC(reflect)          \
+    SKSL_INTRINSIC(refract)          \
+    SKSL_INTRINSIC(roundEven)        \
+    SKSL_INTRINSIC(round)            \
+    SKSL_INTRINSIC(sample)           \
+    SKSL_INTRINSIC(sampleGrad)       \
+    SKSL_INTRINSIC(sampleLod)        \
+    SKSL_INTRINSIC(saturate)         \
+    SKSL_INTRINSIC(sign)             \
+    SKSL_INTRINSIC(sinh)             \
+    SKSL_INTRINSIC(sin)              \
+    SKSL_INTRINSIC(smoothstep)       \
+    SKSL_INTRINSIC(sqrt)             \
+    SKSL_INTRINSIC(step)             \
+    SKSL_INTRINSIC(storageBarrier)   \
+    SKSL_INTRINSIC(subpassLoad)      \
+    SKSL_INTRINSIC(tanh)             \
+    SKSL_INTRINSIC(tan)              \
+    SKSL_INTRINSIC(toLinearSrgb)     \
+    SKSL_INTRINSIC(transpose)        \
+    SKSL_INTRINSIC(trunc)            \
+    SKSL_INTRINSIC(uintBitsToFloat)  \
+    SKSL_INTRINSIC(unpackDouble2x32) \
+    SKSL_INTRINSIC(unpackHalf2x16)   \
+    SKSL_INTRINSIC(unpackSnorm2x16)  \
+    SKSL_INTRINSIC(unpackSnorm4x8)   \
+    SKSL_INTRINSIC(unpackUnorm2x16)  \
+    SKSL_INTRINSIC(unpackUnorm4x8)   \
+    SKSL_INTRINSIC(width)            \
+    SKSL_INTRINSIC(workgroupBarrier) \
     SKSL_INTRINSIC(write)
 
 namespace SkSL {
diff --git a/src/sksl/codegen/SkSLMetalCodeGenerator.cpp b/src/sksl/codegen/SkSLMetalCodeGenerator.cpp
index f505b01..a33e8ac 100644
--- a/src/sksl/codegen/SkSLMetalCodeGenerator.cpp
+++ b/src/sksl/codegen/SkSLMetalCodeGenerator.cpp
@@ -1012,9 +1012,11 @@
             this->write(")");
             return true;
         }
+        case k_storageBarrier_IntrinsicKind:
+            this->write("threadgroup_barrier(mem_flags::mem_device)");
+            return true;
         case k_workgroupBarrier_IntrinsicKind:
-            this->write("threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | "
-                                            "mem_flags::mem_texture)");
+            this->write("threadgroup_barrier(mem_flags::mem_threadgroup)");
             return true;
         default:
             return false;
diff --git a/src/sksl/generated/sksl_compute.minified.sksl b/src/sksl/generated/sksl_compute.minified.sksl
index ded4f93..83781f0 100644
--- a/src/sksl/generated/sksl_compute.minified.sksl
+++ b/src/sksl/generated/sksl_compute.minified.sksl
@@ -1,4 +1,5 @@
 static constexpr char SKSL_MINIFIED_sksl_compute[] =
 "layout(builtin=28)in uint3 sk_ThreadPosition;$pure half4 read($readableTexture2D"
 ",uint2);void write($writableTexture2D,uint2,half4);$pure uint width($genTexture2D"
-");$pure uint height($genTexture2D);void workgroupBarrier();";
+");$pure uint height($genTexture2D);void workgroupBarrier();void storageBarrier"
+"();";
diff --git a/src/sksl/generated/sksl_compute.unoptimized.sksl b/src/sksl/generated/sksl_compute.unoptimized.sksl
index 21fc6a3..8d4dc9d 100644
--- a/src/sksl/generated/sksl_compute.unoptimized.sksl
+++ b/src/sksl/generated/sksl_compute.unoptimized.sksl
@@ -2,4 +2,4 @@
 "layout(builtin=28)in uint3 sk_ThreadPosition;$pure half4 read($readableTexture2D"
 " t,uint2 pos);void write($writableTexture2D t,uint2 pos,half4 color);$pure uint"
 " width($genTexture2D t);$pure uint height($genTexture2D t);void workgroupBarrier"
-"();";
+"();void storageBarrier();";
diff --git a/src/sksl/sksl_compute.sksl b/src/sksl/sksl_compute.sksl
index 11f9683..cfe1b09 100644
--- a/src/sksl/sksl_compute.sksl
+++ b/src/sksl/sksl_compute.sksl
@@ -8,4 +8,10 @@
 $pure uint width($genTexture2D t);
 $pure uint height($genTexture2D t);
 
+// Control-barrier with memory-ordering constraints applied to
+// workgroup shared memory only.
 void workgroupBarrier();
+
+// Control-barrier with memory-ordering constraints applied to
+// uniform and storage-buffer memory.
+void storageBarrier();
diff --git a/tests/sksl/compute/Barrier.metal b/tests/sksl/compute/Barrier.metal
new file mode 100644
index 0000000..c594914
--- /dev/null
+++ b/tests/sksl/compute/Barrier.metal
@@ -0,0 +1,11 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+using namespace metal;
+struct Inputs {
+};
+kernel void computeMain(uint3 sk_ThreadPosition [[thread_position_in_grid]]) {
+    Inputs _in = {  };
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    threadgroup_barrier(mem_flags::mem_device);
+    return;
+}
diff --git a/tests/sksl/compute/Workgroup.metal b/tests/sksl/compute/Workgroup.metal
index 0f6c2f5..8384663 100644
--- a/tests/sksl/compute/Workgroup.metal
+++ b/tests/sksl/compute/Workgroup.metal
@@ -32,14 +32,14 @@
     uint mask;
     _threadgroups.shared_data[id * 2u] = _globals._anonInterface0->in_data[id * 2u];
     _threadgroups.shared_data[id * 2u + 1u] = _globals._anonInterface0->in_data[id * 2u + 1u];
-    threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture);
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     const uint steps = 10u;
     for (uint step = 0u;step < steps; step++) {
         mask = (1u << step) - 1u;
         rd_id = ((id >> step) << step + 1u) + mask;
         wr_id = (rd_id + 1u) + (id & mask);
         store_vIf(_threadgroups, wr_id, _threadgroups.shared_data[wr_id] + _threadgroups.shared_data[rd_id]);
-        threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
     }
     _globals._anonInterface1->out_data[id * 2u] = _threadgroups.shared_data[id * 2u];
     _globals._anonInterface1->out_data[id * 2u + 1u] = _threadgroups.shared_data[id * 2u + 1u];