merge from dev branch - dev
diff --git a/.gitignore b/.gitignore
index e0229c8..6853bbc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,3 @@
**/*.rs.bk
.ninja_deps
.ninja_log
-**/shader/gen
diff --git a/piet-gpu-hal/examples/shader/gen/collatz.dxil b/piet-gpu-hal/examples/shader/gen/collatz.dxil
new file mode 100644
index 0000000..0ce6e9f
--- /dev/null
+++ b/piet-gpu-hal/examples/shader/gen/collatz.dxil
Binary files differ
diff --git a/piet-gpu-hal/examples/shader/gen/collatz.hlsl b/piet-gpu-hal/examples/shader/gen/collatz.hlsl
new file mode 100644
index 0000000..762f06d
--- /dev/null
+++ b/piet-gpu-hal/examples/shader/gen/collatz.hlsl
@@ -0,0 +1,62 @@
+static const uint3 gl_WorkGroupSize = uint3(1u, 1u, 1u); // same 1x1x1 size as the [numthreads] attribute on main
+
+RWByteAddressBuffer _57 : register(u0); // comp_main loads and stores one 32-bit word per invocation here
+
+static uint3 gl_GlobalInvocationID; // main copies this from stage_input before calling comp_main
+struct SPIRV_Cross_Input
+{
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+float mod(float x, float y) // GLSL-style mod: x - y * floor(x / y)
+{
+    return x - (floor(x / y) * y);
+}
+
+float2 mod(float2 x, float2 y) // component-wise form of the scalar overload
+{
+    return x - (floor(x / y) * y);
+}
+
+float3 mod(float3 x, float3 y) // component-wise form of the scalar overload
+{
+    return x - (floor(x / y) * y);
+}
+
+float4 mod(float4 x, float4 y) // component-wise form of the scalar overload
+{
+    return x - (floor(x / y) * y);
+}
+
+// Returns the number of Collatz steps (n -> n / 2 when even, n -> 3n + 1 when
+// odd) needed to bring n down to 1. n is inout and is left at its final value.
+// Parity is tested on the integer itself: float(n) cannot hold odd values
+// above 2^24, so a float mod() test sees every such n as even.
+uint collatz_iterations(inout uint n)
+{
+    uint i = 0u;
+    while (n > 1u)    // also stops at n == 0, which can never reach 1
+    {
+        if ((n & 1u) == 0u)
+            n /= 2u;
+        else
+            n = (3u * n) + 1u;    // wraps modulo 2^32 on overflow
+        i++;
+    }
+    return i;
+}
+
+void comp_main() // Replaces this invocation's 32-bit word in _57 with its Collatz step count.
+{
+    const uint byte_offset = gl_GlobalInvocationID.x * 4;    // one 4-byte word per invocation
+    uint value = _57.Load(byte_offset);
+    // collatz_iterations takes its argument as inout, so it needs a mutable local.
+    _57.Store(byte_offset, collatz_iterations(value));
+}
+
+[numthreads(1, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // entry point: one thread per group
+{
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; // publish the dispatch thread ID to the file-scope static read by comp_main
+ comp_main();
+}
diff --git a/piet-gpu-hal/examples/shader/gen/collatz.msl b/piet-gpu-hal/examples/shader/gen/collatz.msl
new file mode 100644
index 0000000..1b75efe
--- /dev/null
+++ b/piet-gpu-hal/examples/shader/gen/collatz.msl
@@ -0,0 +1,48 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+// GLSL-style mod(): x - y * floor(x / y), which is slightly different from Metal's fmod()
+template<typename Tx, typename Ty>
+inline Tx mod(Tx x, Ty y)
+{
+    return x - (floor(x / y) * y);
+}
+
+struct PrimeIndices
+{
+ uint indices[1]; // declared with length 1; main0 indexes it by gl_GlobalInvocationID.x
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u);
+
+// Returns the number of Collatz steps (n -> n / 2 when even, n -> 3n + 1 when
+// odd) needed to bring n down to 1. n is updated in place through the reference.
+// Parity is tested on the integer itself: float(n) cannot hold odd values
+// above 2^24, so a float mod() test sees every such n as even.
+static inline __attribute__((always_inline))
+uint collatz_iterations(thread uint& n)
+{
+    uint i = 0u;
+    while (n > 1u)    // also stops at n == 0, which can never reach 1
+    {
+        if ((n & 1u) == 0u)
+            n /= 2u;
+        else
+            n = (3u * n) + 1u;    // wraps modulo 2^32 on overflow
+        i++;
+    }
+    return i;
+}
+
+kernel void main0(device PrimeIndices& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    // Each thread replaces the element at its own x position with that element's Collatz step count.
+    device uint& slot = _57.indices[gl_GlobalInvocationID.x];
+    uint value = slot;    // thread-local copy: collatz_iterations takes a thread uint&
+    slot = collatz_iterations(value);
+}
+
diff --git a/piet-gpu-hal/examples/shader/gen/collatz.spv b/piet-gpu-hal/examples/shader/gen/collatz.spv
new file mode 100644
index 0000000..886797e
--- /dev/null
+++ b/piet-gpu-hal/examples/shader/gen/collatz.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
new file mode 100644
index 0000000..69873d8
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl
new file mode 100644
index 0000000..7b64fb4
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop.hlsl
@@ -0,0 +1,256 @@
+struct Alloc
+{
+ uint offset; // byte offset; comp_main shifts it right by 2 to get a 32-bit word index
+};
+
+struct PathRef
+{
+ uint offset; // byte offset of a packed Path record; Path_read shifts it right by 2
+};
+
+struct TileRef
+{
+ uint offset; // comp_main passes this to new_alloc as the offset of the path's tile storage
+};
+
+struct Path
+{
+ uint4 bbox; // four 16-bit values unpacked by Path_read; comp_main uses z - x as width and w - y as row count
+ TileRef tiles;
+};
+
+struct Config // every field is one 32-bit word, so field k would sit at byte 4 * k when a buffer follows this layout
+{
+ uint mem_size;
+ uint n_elements; // second field: byte 4, the offset comp_main reads with _181.Load(4)
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc; // sixth field: byte 20, the offset comp_main reads with _181.Load(20)
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same 256x1x1 size as the [numthreads] attribute on main
+
+RWByteAddressBuffer _59 : register(u0, space0); // read-write memory: status word at byte 4 (check_deps), data words from byte 12 (read_mem / write_mem)
+ByteAddressBuffer _181 : register(t1, space0); // read-only configuration words
+
+static uint3 gl_LocalInvocationID; // these three statics are filled in by main from stage_input
+static uint3 gl_GlobalInvocationID;
+static uint gl_LocalInvocationIndex;
+struct SPIRV_Cross_Input
+{
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+ uint gl_LocalInvocationIndex : SV_GroupIndex;
+};
+
+groupshared uint sh_row_width[256]; // per-slot path width (bbox.z - bbox.x), written by comp_main
+groupshared Alloc sh_row_alloc[256]; // per-slot offset of the path's tile storage
+groupshared uint sh_row_count[256]; // per-slot row count, later overwritten with running totals
+
+bool check_deps(uint dep_stage) // True when none of dep_stage's bits are set in the word at byte 4 of _59.
+{
+    uint flags;
+    _59.InterlockedOr(4, 0u, flags);    // OR with 0 changes nothing; it only fetches the current word atomically
+    return (dep_stage & flags) == 0u;
+}
+
+bool touch_mem(Alloc alloc, uint offset) // always returns true; alloc and offset are not inspected
+{
+ return true;
+}
+
+// Returns the 32-bit word at word index `offset` of _59, or 0 if touch_mem()
+// rejects the access. Word 0 sits at byte 12 of the buffer, hence the
+// `offset * 4 + 12` byte address. `alloc` is only forwarded to touch_mem().
+uint read_mem(Alloc alloc, uint offset)
+{
+    if (touch_mem(alloc, offset))
+    {
+        return _59.Load(offset * 4 + 12);
+    }
+    return 0u;
+}
+
+// Decodes a Path from three consecutive 32-bit words, read through read_mem()
+// starting at word index ref.offset >> 2:
+//   word 0 -> bbox.x (low 16 bits) and bbox.y (high 16 bits)
+//   word 1 -> bbox.z (low 16 bits) and bbox.w (high 16 bits)
+//   word 2 -> tiles.offset
+// `a` is only forwarded to read_mem().
+Path Path_read(Alloc a, PathRef ref)
+{
+    const uint base = ref.offset >> 2u;
+    const uint xy_word = read_mem(a, base);
+    const uint zw_word = read_mem(a, base + 1u);
+    const uint tiles_word = read_mem(a, base + 2u);
+
+    Path result;
+    result.bbox = uint4(xy_word & 0xFFFFu, xy_word >> 16u, zw_word & 0xFFFFu, zw_word >> 16u);
+    result.tiles.offset = tiles_word;
+    return result;
+}
+
+// Wraps `offset` in an Alloc. `size` and `mem_ok` are accepted but not used.
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc result = { offset };
+    return result;
+}
+
+// Stores `val` into the 32-bit word at word index `offset` of _59 (byte address
+// offset * 4 + 12). Does nothing if touch_mem() rejects the access.
+// `alloc` is only forwarded to touch_mem().
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    if (touch_mem(alloc, offset))
+    {
+        _59.Store(offset * 4 + 12, val);
+    }
+}
+
+void comp_main() // Per workgroup: gather path bounding boxes, turn their row counts into running totals, then running-sum one word per tile along each tile row.
+{
+ uint param = 7u;
+ bool _154 = check_deps(param); // false if any of bits 0-2 is set in the status word
+ if (!_154)
+ {
+ return;
+ }
+ uint th_ix = gl_LocalInvocationIndex;
+ uint element_ix = gl_GlobalInvocationID.x;
+ uint row_count = 0u;
+ if (gl_LocalInvocationID.y == 0u) // always true for this 256x1x1 group
+ {
+ if (element_ix < _181.Load(4)) // config byte 4 bounds the element index
+ {
+ PathRef _195 = { _181.Load(20) + (element_ix * 12u) }; // 12 bytes = the three words Path_read consumes per element
+ PathRef path_ref = _195;
+ Alloc _200;
+ _200.offset = _181.Load(20);
+ Alloc param_1;
+ param_1.offset = _200.offset;
+ PathRef param_2 = path_ref;
+ Path path = Path_read(param_1, param_2);
+ sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+ row_count = path.bbox.w - path.bbox.y;
+ bool _225 = row_count == 1u;
+ bool _231;
+ if (_225)
+ {
+ _231 = path.bbox.y > 0u;
+ }
+ else
+ {
+ _231 = _225;
+ }
+ if (_231) // i.e. row_count == 1 && bbox.y > 0: such a path contributes no rows
+ {
+ row_count = 0u;
+ }
+ uint param_3 = path.tiles.offset;
+ uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; // passed as size, which new_alloc ignores
+ bool param_5 = true;
+ Alloc path_alloc = new_alloc(param_3, param_4, param_5);
+ sh_row_alloc[th_ix] = path_alloc;
+ }
+ sh_row_count[th_ix] = row_count; // stays 0 for slots past the element bound
+ }
+ for (uint i = 0u; i < 8u; i++) // 8 doubling rounds (distances 1, 2, ..., 128) turn sh_row_count into inclusive running totals
+ {
+ GroupMemoryBarrierWithGroupSync();
+ bool _276 = gl_LocalInvocationID.y == 0u;
+ bool _283;
+ if (_276)
+ {
+ _283 = th_ix >= (1u << i);
+ }
+ else
+ {
+ _283 = _276;
+ }
+ if (_283)
+ {
+ row_count += sh_row_count[th_ix - (1u << i)];
+ }
+ GroupMemoryBarrierWithGroupSync(); // all reads of this round finish before any slot is overwritten
+ if (gl_LocalInvocationID.y == 0u)
+ {
+ sh_row_count[th_ix] = row_count;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ uint total_rows = sh_row_count[255]; // last running total = rows across all slots of this workgroup
+ uint _360;
+ for (uint row = th_ix; row < total_rows; row += 256u) // rows are dealt out to the 256 threads round-robin
+ {
+ uint el_ix = 0u;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8-step binary search for the first slot whose running total exceeds row
+ {
+ uint probe = el_ix + (128u >> i_1);
+ if (row >= sh_row_count[probe - 1u])
+ {
+ el_ix = probe;
+ }
+ }
+ uint width = sh_row_width[el_ix];
+ if (width > 0u)
+ {
+ Alloc tiles_alloc = sh_row_alloc[el_ix];
+ if (el_ix > 0u)
+ {
+ _360 = sh_row_count[el_ix - 1u];
+ }
+ else
+ {
+ _360 = 0u;
+ }
+ uint seq_ix = row - _360; // row number within the owning slot
+ uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); // two words per tile; the + 1u selects each tile's second word
+ Alloc param_6 = tiles_alloc;
+ uint param_7 = tile_el_ix;
+ uint sum = read_mem(param_6, param_7); // the row's first tile seeds the sum and is not rewritten
+ for (uint x = 1u; x < width; x++)
+ {
+ tile_el_ix += 2u; // step to the next tile's second word
+ Alloc param_8 = tiles_alloc;
+ uint param_9 = tile_el_ix;
+ sum += read_mem(param_8, param_9);
+ Alloc param_10 = tiles_alloc;
+ uint param_11 = tile_el_ix;
+ uint param_12 = sum;
+ write_mem(param_10, param_11, param_12); // word now holds the sum of itself and every earlier tile in the row
+ }
+ }
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // entry point: copies the system-value inputs into file-scope statics, then runs comp_main
+{
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl
new file mode 100644
index 0000000..da32c27
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop.msl
@@ -0,0 +1,262 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct Alloc
+{
+ uint offset; // byte offset; main0 shifts it right by 2 to get an index into Memory::memory
+};
+
+struct PathRef
+{
+ uint offset; // byte offset of a packed Path record; Path_read shifts it right by 2
+};
+
+struct TileRef
+{
+ uint offset; // main0 passes this to new_alloc as the offset of the path's tile storage
+};
+
+struct Path
+{
+ uint4 bbox; // four 16-bit values unpacked by Path_read; main0 uses z - x as width and w - y as row count
+ TileRef tiles;
+};
+
+struct Memory
+{
+ uint mem_offset;
+ uint mem_error; // status flags; check_deps fetches this word atomically
+ uint blend_offset;
+ uint memory[1]; // declared with length 1 but indexed by arbitrary word offsets in read_mem / write_mem
+};
+
+struct Alloc_1
+{
+ uint offset;
+};
+
+struct Config
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc_1 tile_alloc;
+ Alloc_1 bin_alloc;
+ Alloc_1 ptcl_alloc;
+ Alloc_1 pathseg_alloc;
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc;
+ Alloc_1 drawmonoid_alloc;
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc;
+ Alloc_1 draw_bbox_alloc;
+ Alloc_1 drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf
+{
+ Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_59) // True when none of dep_stage's bits are set in mem_error.
+{
+    device atomic_uint* flags = (device atomic_uint*)&v_59.mem_error;
+    return (dep_stage & atomic_fetch_or_explicit(flags, 0u, memory_order_relaxed)) == 0u;    // OR with 0: atomic fetch only
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset) // always returns true; neither argument is inspected
+{
+ return true;
+}
+
+// Returns v_59.memory[offset], or 0 if touch_mem() rejects the access.
+// `offset` indexes 32-bit words of Memory::memory, not bytes.
+// `alloc` is only forwarded to touch_mem().
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_59)
+{
+    if (touch_mem(alloc, offset))
+    {
+        return v_59.memory[offset];
+    }
+    return 0u;
+}
+
+// Decodes a Path from v_59.memory[ix], [ix + 1] and [ix + 2], where
+// ix = ref.offset >> 2: word 0 holds bbox.x / bbox.y in its low / high 16 bits,
+// word 1 holds bbox.z / bbox.w the same way, and word 2 is tiles.offset.
+static inline __attribute__((always_inline))
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_59)
+{
+    const uint ix0 = ref.offset >> 2u;
+    const uint ix1 = ix0 + 1u;
+    const uint ix2 = ix0 + 2u;
+    const uint xy_word = read_mem(a, ix0, v_59);
+    const uint zw_word = read_mem(a, ix1, v_59);
+    const uint tiles_word = read_mem(a, ix2, v_59);
+
+    Path result;
+    result.bbox = uint4(xy_word & 0xFFFFu, xy_word >> 16u, zw_word & 0xFFFFu, zw_word >> 16u);
+    result.tiles = TileRef{ tiles_word };
+    return result;
+}
+
+// Wraps `offset` in an Alloc. `size` and `mem_ok` are accepted but not used.
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    const Alloc result = Alloc{ offset };
+    return result;
+}
+
+// Stores `val` into v_59.memory[offset]; does nothing if touch_mem() rejects
+// the access. `offset` indexes 32-bit words, not bytes.
+// `alloc` is only forwarded to touch_mem().
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_59)
+{
+    if (touch_mem(alloc, offset))
+    {
+        v_59.memory[offset] = val;
+    }
+}
+
+kernel void main0(device Memory& v_59 [[buffer(0)]], const device ConfigBuf& _181 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{ // Per threadgroup: gather path bounding boxes, turn their row counts into running totals, then running-sum one word per tile along each tile row.
+ threadgroup uint sh_row_width[256];
+ threadgroup Alloc sh_row_alloc[256];
+ threadgroup uint sh_row_count[256];
+ uint param = 7u;
+ bool _154 = check_deps(param, v_59); // false if any of bits 0-2 is set in mem_error
+ if (!_154)
+ {
+ return;
+ }
+ uint th_ix = gl_LocalInvocationIndex;
+ uint element_ix = gl_GlobalInvocationID.x;
+ uint row_count = 0u;
+ if (gl_LocalInvocationID.y == 0u) // always true for the 256x1x1 size declared in gl_WorkGroupSize
+ {
+ if (element_ix < _181.conf.n_elements)
+ {
+ PathRef path_ref = PathRef{ _181.conf.tile_alloc.offset + (element_ix * 12u) }; // 12 bytes = the three words Path_read consumes per element
+ Alloc param_1;
+ param_1.offset = _181.conf.tile_alloc.offset;
+ PathRef param_2 = path_ref;
+ Path path = Path_read(param_1, param_2, v_59);
+ sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+ row_count = path.bbox.w - path.bbox.y;
+ bool _225 = row_count == 1u;
+ bool _231;
+ if (_225)
+ {
+ _231 = path.bbox.y > 0u;
+ }
+ else
+ {
+ _231 = _225;
+ }
+ if (_231) // i.e. row_count == 1 && bbox.y > 0: such a path contributes no rows
+ {
+ row_count = 0u;
+ }
+ uint param_3 = path.tiles.offset;
+ uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; // passed as size, which new_alloc ignores
+ bool param_5 = true;
+ Alloc path_alloc = new_alloc(param_3, param_4, param_5);
+ sh_row_alloc[th_ix] = path_alloc;
+ }
+ sh_row_count[th_ix] = row_count; // stays 0 for slots past n_elements
+ }
+ for (uint i = 0u; i < 8u; i++) // 8 doubling rounds (distances 1, 2, ..., 128) turn sh_row_count into inclusive running totals
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ bool _276 = gl_LocalInvocationID.y == 0u;
+ bool _283;
+ if (_276)
+ {
+ _283 = th_ix >= (1u << i);
+ }
+ else
+ {
+ _283 = _276;
+ }
+ if (_283)
+ {
+ row_count += sh_row_count[th_ix - (1u << i)];
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // all reads of this round finish before any slot is overwritten
+ if (gl_LocalInvocationID.y == 0u)
+ {
+ sh_row_count[th_ix] = row_count;
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint total_rows = sh_row_count[255]; // last running total = rows across all slots of this threadgroup
+ uint _360;
+ for (uint row = th_ix; row < total_rows; row += 256u) // rows are dealt out to the 256 threads round-robin
+ {
+ uint el_ix = 0u;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8-step binary search for the first slot whose running total exceeds row
+ {
+ uint probe = el_ix + (128u >> i_1);
+ if (row >= sh_row_count[probe - 1u])
+ {
+ el_ix = probe;
+ }
+ }
+ uint width = sh_row_width[el_ix];
+ if (width > 0u)
+ {
+ Alloc tiles_alloc = sh_row_alloc[el_ix];
+ if (el_ix > 0u)
+ {
+ _360 = sh_row_count[el_ix - 1u];
+ }
+ else
+ {
+ _360 = 0u;
+ }
+ uint seq_ix = row - _360; // row number within the owning slot
+ uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); // two words per tile; the + 1u selects each tile's second word
+ Alloc param_6 = tiles_alloc;
+ uint param_7 = tile_el_ix;
+ uint sum = read_mem(param_6, param_7, v_59); // the row's first tile seeds the sum and is not rewritten
+ for (uint x = 1u; x < width; x++)
+ {
+ tile_el_ix += 2u; // step to the next tile's second word
+ Alloc param_8 = tiles_alloc;
+ uint param_9 = tile_el_ix;
+ sum += read_mem(param_8, param_9, v_59);
+ Alloc param_10 = tiles_alloc;
+ uint param_11 = tile_el_ix;
+ uint param_12 = sum;
+ write_mem(param_10, param_11, param_12, v_59); // word now holds the sum of itself and every earlier tile in the row
+ }
+ }
+ }
+}
+
diff --git a/piet-gpu/shader/gen/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv
new file mode 100644
index 0000000..7a4cbde
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
new file mode 100644
index 0000000..afcf3b8
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop_lg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl
new file mode 100644
index 0000000..f15b566
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop_lg.hlsl
@@ -0,0 +1,256 @@
+struct Alloc
+{
+ uint offset;
+};
+
+struct PathRef
+{
+ uint offset;
+};
+
+struct TileRef
+{
+ uint offset;
+};
+
+struct Path
+{
+ uint4 bbox;
+ TileRef tiles;
+};
+
+struct Config
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u);
+
+RWByteAddressBuffer _59 : register(u0, space0);
+ByteAddressBuffer _181 : register(t1, space0);
+
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+static uint gl_LocalInvocationIndex;
+struct SPIRV_Cross_Input
+{
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+ uint gl_LocalInvocationIndex : SV_GroupIndex;
+};
+
+groupshared uint sh_row_width[256];
+groupshared Alloc sh_row_alloc[256];
+groupshared uint sh_row_count[256];
+
+bool check_deps(uint dep_stage) // True when none of dep_stage's bits are set in the word at byte 4 of _59.
+{
+    uint flags;
+    _59.InterlockedOr(4, 0u, flags);    // OR with 0 changes nothing; it only fetches the current word atomically
+    return (dep_stage & flags) == 0u;
+}
+
+bool touch_mem(Alloc alloc, uint offset) // always returns true; alloc and offset are not inspected
+{
+ return true;
+}
+
+// Returns the 32-bit word at word index `offset` of _59, or 0 if touch_mem()
+// rejects the access. Word 0 sits at byte 12 of the buffer, hence the
+// `offset * 4 + 12` byte address. `alloc` is only forwarded to touch_mem().
+uint read_mem(Alloc alloc, uint offset)
+{
+    if (touch_mem(alloc, offset))
+    {
+        return _59.Load(offset * 4 + 12);
+    }
+    return 0u;
+}
+
+// Decodes a Path from three consecutive 32-bit words, read through read_mem()
+// starting at word index ref.offset >> 2:
+//   word 0 -> bbox.x (low 16 bits) and bbox.y (high 16 bits)
+//   word 1 -> bbox.z (low 16 bits) and bbox.w (high 16 bits)
+//   word 2 -> tiles.offset
+// `a` is only forwarded to read_mem().
+Path Path_read(Alloc a, PathRef ref)
+{
+    const uint base = ref.offset >> 2u;
+    const uint xy_word = read_mem(a, base);
+    const uint zw_word = read_mem(a, base + 1u);
+    const uint tiles_word = read_mem(a, base + 2u);
+
+    Path result;
+    result.bbox = uint4(xy_word & 0xFFFFu, xy_word >> 16u, zw_word & 0xFFFFu, zw_word >> 16u);
+    result.tiles.offset = tiles_word;
+    return result;
+}
+
+// Wraps `offset` in an Alloc. `size` and `mem_ok` are accepted but not used.
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    Alloc result = { offset };
+    return result;
+}
+
+// Stores `val` into the 32-bit word at word index `offset` of _59 (byte address
+// offset * 4 + 12). Does nothing if touch_mem() rejects the access.
+// `alloc` is only forwarded to touch_mem().
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    if (touch_mem(alloc, offset))
+    {
+        _59.Store(offset * 4 + 12, val);
+    }
+}
+
+void comp_main() // Per workgroup: gather path bounding boxes, turn their row counts into running totals, then running-sum one word per tile along each tile row.
+{
+ uint param = 7u;
+ bool _154 = check_deps(param); // false if any of bits 0-2 is set in the status word
+ if (!_154)
+ {
+ return;
+ }
+ uint th_ix = gl_LocalInvocationIndex;
+ uint element_ix = gl_GlobalInvocationID.x;
+ uint row_count = 0u;
+ if (gl_LocalInvocationID.y == 0u) // only the y == 0 threads of the 256x4 group load a path
+ {
+ if (element_ix < _181.Load(4)) // config byte 4 bounds the element index
+ {
+ PathRef _195 = { _181.Load(20) + (element_ix * 12u) }; // 12 bytes = the three words Path_read consumes per element
+ PathRef path_ref = _195;
+ Alloc _200;
+ _200.offset = _181.Load(20);
+ Alloc param_1;
+ param_1.offset = _200.offset;
+ PathRef param_2 = path_ref;
+ Path path = Path_read(param_1, param_2);
+ sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+ row_count = path.bbox.w - path.bbox.y;
+ bool _225 = row_count == 1u;
+ bool _231;
+ if (_225)
+ {
+ _231 = path.bbox.y > 0u;
+ }
+ else
+ {
+ _231 = _225;
+ }
+ if (_231) // i.e. row_count == 1 && bbox.y > 0: such a path contributes no rows
+ {
+ row_count = 0u;
+ }
+ uint param_3 = path.tiles.offset;
+ uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; // passed as size, which new_alloc ignores
+ bool param_5 = true;
+ Alloc path_alloc = new_alloc(param_3, param_4, param_5);
+ sh_row_alloc[th_ix] = path_alloc;
+ }
+ sh_row_count[th_ix] = row_count; // stays 0 for slots past the element bound
+ }
+ for (uint i = 0u; i < 8u; i++) // 8 doubling rounds (distances 1, 2, ..., 128) turn sh_row_count into inclusive running totals
+ {
+ GroupMemoryBarrierWithGroupSync();
+ bool _276 = gl_LocalInvocationID.y == 0u; // threads with y != 0 hit the barriers but never touch the shared arrays here
+ bool _283;
+ if (_276)
+ {
+ _283 = th_ix >= (1u << i);
+ }
+ else
+ {
+ _283 = _276;
+ }
+ if (_283)
+ {
+ row_count += sh_row_count[th_ix - (1u << i)];
+ }
+ GroupMemoryBarrierWithGroupSync(); // all reads of this round finish before any slot is overwritten
+ if (gl_LocalInvocationID.y == 0u)
+ {
+ sh_row_count[th_ix] = row_count;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ uint total_rows = sh_row_count[255]; // last running total = rows across all slots of this workgroup
+ uint _360;
+ for (uint row = th_ix; row < total_rows; row += 1024u) // stride 1024 = 256 * 4: every thread of the group takes rows here
+ {
+ uint el_ix = 0u;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8-step binary search for the first slot whose running total exceeds row
+ {
+ uint probe = el_ix + (128u >> i_1);
+ if (row >= sh_row_count[probe - 1u])
+ {
+ el_ix = probe;
+ }
+ }
+ uint width = sh_row_width[el_ix];
+ if (width > 0u)
+ {
+ Alloc tiles_alloc = sh_row_alloc[el_ix];
+ if (el_ix > 0u)
+ {
+ _360 = sh_row_count[el_ix - 1u];
+ }
+ else
+ {
+ _360 = 0u;
+ }
+ uint seq_ix = row - _360; // row number within the owning slot
+ uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); // two words per tile; the + 1u selects each tile's second word
+ Alloc param_6 = tiles_alloc;
+ uint param_7 = tile_el_ix;
+ uint sum = read_mem(param_6, param_7); // the row's first tile seeds the sum and is not rewritten
+ for (uint x = 1u; x < width; x++)
+ {
+ tile_el_ix += 2u; // step to the next tile's second word
+ Alloc param_8 = tiles_alloc;
+ uint param_9 = tile_el_ix;
+ sum += read_mem(param_8, param_9);
+ Alloc param_10 = tiles_alloc;
+ uint param_11 = tile_el_ix;
+ uint param_12 = sum;
+ write_mem(param_10, param_11, param_12); // word now holds the sum of itself and every earlier tile in the row
+ }
+ }
+ }
+}
+
+[numthreads(256, 4, 1)]
+void main(SPIRV_Cross_Input stage_input) // entry point: copies the system-value inputs into file-scope statics, then runs comp_main
+{
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl
new file mode 100644
index 0000000..07ab9e7
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop_lg.msl
@@ -0,0 +1,262 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct Alloc
+{
+ uint offset;
+};
+
+struct PathRef
+{
+ uint offset;
+};
+
+struct TileRef
+{
+ uint offset;
+};
+
+struct Path
+{
+ uint4 bbox;
+ TileRef tiles;
+};
+
+struct Memory
+{
+ uint mem_offset;
+ uint mem_error;
+ uint blend_offset;
+ uint memory[1];
+};
+
+struct Alloc_1
+{
+ uint offset;
+};
+
+struct Config
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc_1 tile_alloc;
+ Alloc_1 bin_alloc;
+ Alloc_1 ptcl_alloc;
+ Alloc_1 pathseg_alloc;
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc;
+ Alloc_1 drawmonoid_alloc;
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc;
+ Alloc_1 draw_bbox_alloc;
+ Alloc_1 drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf
+{
+ Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u);
+
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_59) // True when none of dep_stage's bits are set in mem_error.
+{
+    device atomic_uint* flags = (device atomic_uint*)&v_59.mem_error;
+    return (dep_stage & atomic_fetch_or_explicit(flags, 0u, memory_order_relaxed)) == 0u;    // OR with 0: atomic fetch only
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset) // always returns true; neither argument is inspected
+{
+ return true;
+}
+
+// Returns v_59.memory[offset], or 0 if touch_mem() rejects the access.
+// `offset` indexes 32-bit words of Memory::memory, not bytes.
+// `alloc` is only forwarded to touch_mem().
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_59)
+{
+    if (touch_mem(alloc, offset))
+    {
+        return v_59.memory[offset];
+    }
+    return 0u;
+}
+
+// Decodes a Path from v_59.memory[ix], [ix + 1] and [ix + 2], where
+// ix = ref.offset >> 2: word 0 holds bbox.x / bbox.y in its low / high 16 bits,
+// word 1 holds bbox.z / bbox.w the same way, and word 2 is tiles.offset.
+static inline __attribute__((always_inline))
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_59)
+{
+    const uint ix0 = ref.offset >> 2u;
+    const uint ix1 = ix0 + 1u;
+    const uint ix2 = ix0 + 2u;
+    const uint xy_word = read_mem(a, ix0, v_59);
+    const uint zw_word = read_mem(a, ix1, v_59);
+    const uint tiles_word = read_mem(a, ix2, v_59);
+
+    Path result;
+    result.bbox = uint4(xy_word & 0xFFFFu, xy_word >> 16u, zw_word & 0xFFFFu, zw_word >> 16u);
+    result.tiles = TileRef{ tiles_word };
+    return result;
+}
+
+// Wraps `offset` in an Alloc. `size` and `mem_ok` are accepted but not used.
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+    const Alloc result = Alloc{ offset };
+    return result;
+}
+
+// Stores `val` into v_59.memory[offset]; does nothing if touch_mem() rejects
+// the access. `offset` indexes 32-bit words, not bytes.
+// `alloc` is only forwarded to touch_mem().
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_59)
+{
+    if (touch_mem(alloc, offset))
+    {
+        v_59.memory[offset] = val;
+    }
+}
+
+kernel void main0(device Memory& v_59 [[buffer(0)]], const device ConfigBuf& _181 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{ // Per threadgroup: gather path bounding boxes, turn their row counts into running totals, then running-sum one word per tile along each tile row.
+ threadgroup uint sh_row_width[256];
+ threadgroup Alloc sh_row_alloc[256];
+ threadgroup uint sh_row_count[256];
+ uint param = 7u;
+ bool _154 = check_deps(param, v_59); // false if any of bits 0-2 is set in mem_error
+ if (!_154)
+ {
+ return;
+ }
+ uint th_ix = gl_LocalInvocationIndex;
+ uint element_ix = gl_GlobalInvocationID.x;
+ uint row_count = 0u;
+ if (gl_LocalInvocationID.y == 0u) // only the y == 0 threads of the 256x4 group load a path
+ {
+ if (element_ix < _181.conf.n_elements)
+ {
+ PathRef path_ref = PathRef{ _181.conf.tile_alloc.offset + (element_ix * 12u) }; // 12 bytes = the three words Path_read consumes per element
+ Alloc param_1;
+ param_1.offset = _181.conf.tile_alloc.offset;
+ PathRef param_2 = path_ref;
+ Path path = Path_read(param_1, param_2, v_59);
+ sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+ row_count = path.bbox.w - path.bbox.y;
+ bool _225 = row_count == 1u;
+ bool _231;
+ if (_225)
+ {
+ _231 = path.bbox.y > 0u;
+ }
+ else
+ {
+ _231 = _225;
+ }
+ if (_231) // i.e. row_count == 1 && bbox.y > 0: such a path contributes no rows
+ {
+ row_count = 0u;
+ }
+ uint param_3 = path.tiles.offset;
+ uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; // passed as size, which new_alloc ignores
+ bool param_5 = true;
+ Alloc path_alloc = new_alloc(param_3, param_4, param_5);
+ sh_row_alloc[th_ix] = path_alloc;
+ }
+ sh_row_count[th_ix] = row_count; // stays 0 for slots past n_elements
+ }
+ for (uint i = 0u; i < 8u; i++) // 8 doubling rounds (distances 1, 2, ..., 128) turn sh_row_count into inclusive running totals
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ bool _276 = gl_LocalInvocationID.y == 0u; // threads with y != 0 hit the barriers but never touch the shared arrays here
+ bool _283;
+ if (_276)
+ {
+ _283 = th_ix >= (1u << i);
+ }
+ else
+ {
+ _283 = _276;
+ }
+ if (_283)
+ {
+ row_count += sh_row_count[th_ix - (1u << i)];
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // all reads of this round finish before any slot is overwritten
+ if (gl_LocalInvocationID.y == 0u)
+ {
+ sh_row_count[th_ix] = row_count;
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint total_rows = sh_row_count[255]; // last running total = rows across all slots of this threadgroup
+ uint _360;
+ for (uint row = th_ix; row < total_rows; row += 1024u) // stride 1024 = 256 * 4: every thread of the group takes rows here
+ {
+ uint el_ix = 0u;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8-step binary search for the first slot whose running total exceeds row
+ {
+ uint probe = el_ix + (128u >> i_1);
+ if (row >= sh_row_count[probe - 1u])
+ {
+ el_ix = probe;
+ }
+ }
+ uint width = sh_row_width[el_ix];
+ if (width > 0u)
+ {
+ Alloc tiles_alloc = sh_row_alloc[el_ix];
+ if (el_ix > 0u)
+ {
+ _360 = sh_row_count[el_ix - 1u];
+ }
+ else
+ {
+ _360 = 0u;
+ }
+ uint seq_ix = row - _360; // row number within the owning slot
+ uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); // two words per tile; the + 1u selects each tile's second word
+ Alloc param_6 = tiles_alloc;
+ uint param_7 = tile_el_ix;
+ uint sum = read_mem(param_6, param_7, v_59); // the row's first tile seeds the sum and is not rewritten
+ for (uint x = 1u; x < width; x++)
+ {
+ tile_el_ix += 2u; // step to the next tile's second word
+ Alloc param_8 = tiles_alloc;
+ uint param_9 = tile_el_ix;
+ sum += read_mem(param_8, param_9, v_59);
+ Alloc param_10 = tiles_alloc;
+ uint param_11 = tile_el_ix;
+ uint param_12 = sum;
+ write_mem(param_10, param_11, param_12, v_59); // word now holds the sum of itself and every earlier tile in the row
+ }
+ }
+ }
+}
+
diff --git a/piet-gpu/shader/gen/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv
new file mode 100644
index 0000000..4222310
--- /dev/null
+++ b/piet-gpu/shader/gen/backdrop_lg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
new file mode 100644
index 0000000..1f8f6f0
--- /dev/null
+++ b/piet-gpu/shader/gen/bbox_clear.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl
new file mode 100644
index 0000000..6ff5a13
--- /dev/null
+++ b/piet-gpu/shader/gen/bbox_clear.hlsl
@@ -0,0 +1,66 @@
+struct Alloc
+{
+ uint offset;
+};
+
+struct Config
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+ByteAddressBuffer _21 : register(t1, space0);
+RWByteAddressBuffer _45 : register(u0, space0);
+
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+// Resets words 0-3 of one 6-word record per invocation: words 0 and 1 become 65535, words 2 and 3 become 0. Words 4-5 are not written.
+void comp_main()
+{
+    const uint ix = gl_GlobalInvocationID.x;
+    if (ix >= _21.Load(76))    // config byte 76 bounds the record index
+    {
+        return;
+    }
+    const uint first = (_21.Load(40) >> 2u) + (ix * 6u);    // word index of this record; config byte 40 holds the base byte offset
+    for (uint k = 0u; k < 4u; k++)
+        _45.Store((first + k) * 4 + 12, (k < 2u) ? 65535u : 0u);
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // entry point: 512 threads per group, one record per thread
+{
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; // publish the dispatch thread ID to the file-scope static read by comp_main
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl
new file mode 100644
index 0000000..394e55d
--- /dev/null
+++ b/piet-gpu/shader/gen/bbox_clear.msl
@@ -0,0 +1,69 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Alloc
+{
+ uint offset;
+};
+
+struct Config
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf
+{
+ Config conf;
+};
+
+struct Memory
+{
+ uint mem_offset;
+ uint mem_error;
+ uint blend_offset;
+ uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+kernel void main0(device Memory& _45 [[buffer(0)]], const device ConfigBuf& _21 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    // Resets words 0-3 of one 6-word record per thread: words 0 and 1 become 65535, words 2 and 3 become 0. Words 4-5 are not written.
+    const uint ix = gl_GlobalInvocationID.x;
+    if (ix >= _21.conf.n_path)
+    {
+        return;
+    }
+    const uint first = (_21.conf.path_bbox_alloc.offset >> 2u) + (ix * 6u);    // word index of this record
+    for (uint k = 0u; k < 4u; k++)
+        _45.memory[first + k] = (k < 2u) ? 65535u : 0u;
+}
+
diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv
new file mode 100644
index 0000000..ce0ee2c
--- /dev/null
+++ b/piet-gpu/shader/gen/bbox_clear.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil
new file mode 100644
index 0000000..f39dee3
--- /dev/null
+++ b/piet-gpu/shader/gen/binning.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl
new file mode 100644
index 0000000..5f43e88
--- /dev/null
+++ b/piet-gpu/shader/gen/binning.hlsl
@@ -0,0 +1,273 @@
+struct Alloc // Handle holding a single uint offset; passed to touch_mem/write_mem and used for the *_alloc members of Config.
+{
+ uint offset; // comp_main fills this from _101.Load(24)
+};
+
+struct DrawMonoid // Four-word record returned by load_draw_monoid; comp_main uses path_ix and clip_ix only.
+{
+ uint path_ix; // passed to load_path_bbox
+ uint clip_ix; // 0 skips the clip lookup; otherwise clip_ix - 1 is passed to load_clip_bbox
+ uint scene_offset;
+ uint info_offset;
+};
+
+struct Config // Not named by the code below, which reads the config via _101.Load(byte offset); offsets noted assume tightly packed 4-byte members.
+{
+ uint mem_size; // byte 0
+ uint n_elements; // byte 4
+ uint n_pathseg;
+ uint width_in_tiles; // byte 12
+ uint height_in_tiles; // byte 16
+ Alloc tile_alloc;
+ Alloc bin_alloc; // byte 24
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc; // byte 40
+ Alloc drawmonoid_alloc; // byte 44
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc; // byte 60
+ Alloc draw_bbox_alloc; // byte 64
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same dimensions as [numthreads(256, 1, 1)] on main
+
+RWByteAddressBuffer _57 : register(u0, space0); // read-write buffer: atomics at bytes 0 and 4, data words addressed as word * 4 + 12
+ByteAddressBuffer _101 : register(t1, space0); // read-only buffer loaded at fixed byte offsets that line up with the Config fields
+
+static uint3 gl_WorkGroupID; // filled in by main from stage_input
+static uint3 gl_LocalInvocationID; // filled in by main from stage_input
+struct SPIRV_Cross_Input
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+};
+
+groupshared uint bitmaps[8][256]; // [slice][bin]: bit (thread & 31) of slice (thread / 32) marks that thread's element as covering the bin
+groupshared uint count[8][256]; // [slice][bin]: running popcount of bitmaps over slices 0..slice
+groupshared uint sh_chunk_offset[256]; // per bin: offset returned by malloc_stage (written only when the bin is non-empty)
+
+DrawMonoid load_draw_monoid(uint element_ix) // Reads the four-word DrawMonoid record for element_ix from _57.
+{
+ uint base = (_101.Load(44) >> uint(2)) + (4u * element_ix); // byte 44 lines up with Config.drawmonoid_alloc; >> 2 turns bytes into a word index
+ uint path_ix = _57.Load(base * 4 + 12); // word index back to bytes, plus 12 - presumably skipping a three-word header in _57; confirm against the buffer layout
+ uint clip_ix = _57.Load((base + 1u) * 4 + 12);
+ uint scene_offset = _57.Load((base + 2u) * 4 + 12);
+ uint info_offset = _57.Load((base + 3u) * 4 + 12);
+ DrawMonoid _136 = { path_ix, clip_ix, scene_offset, info_offset };
+ return _136;
+}
+
+float4 load_clip_bbox(uint clip_ix) // Reads four consecutive words for clip_ix and reinterprets them as floats (x0, y0, x1, y1).
+{
+ uint base = (_101.Load(60) >> uint(2)) + (4u * clip_ix); // byte 60 lines up with Config.clip_bbox_alloc; entries are 4 words apart
+ float x0 = asfloat(_57.Load(base * 4 + 12)); // asfloat: bit reinterpretation, not a numeric conversion
+ float y0 = asfloat(_57.Load((base + 1u) * 4 + 12));
+ float x1 = asfloat(_57.Load((base + 2u) * 4 + 12));
+ float y1 = asfloat(_57.Load((base + 3u) * 4 + 12));
+ float4 bbox = float4(x0, y0, x1, y1);
+ return bbox;
+}
+
+float4 load_path_bbox(uint path_ix) // Reads the first four words of a path's 6-word record and converts them to floats offset by -32768.
+{
+ uint base = (_101.Load(40) >> uint(2)) + (6u * path_ix); // byte 40 lines up with Config.path_bbox_alloc; records are 6 words apart
+ float bbox_l = float(_57.Load(base * 4 + 12)) - 32768.0f; // numeric uint -> float conversion, then remove the 32768 bias
+ float bbox_t = float(_57.Load((base + 1u) * 4 + 12)) - 32768.0f;
+ float bbox_r = float(_57.Load((base + 2u) * 4 + 12)) - 32768.0f;
+ float bbox_b = float(_57.Load((base + 3u) * 4 + 12)) - 32768.0f;
+ float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+ return bbox;
+}
+
+float4 bbox_intersect(float4 a, float4 b) // Intersects two boxes stored as (x0, y0, x1, y1).
+{
+ return float4(max(a.xy, b.xy), min(a.zw, b.zw)); // larger of the mins, smaller of the maxes; the result can have zw < xy when the boxes are disjoint
+}
+
+void store_draw_bbox(uint draw_ix, float4 bbox) // Writes bbox as four raw float bit patterns into the entry for draw_ix.
+{
+ uint base = (_101.Load(64) >> uint(2)) + (4u * draw_ix); // byte 64 lines up with Config.draw_bbox_alloc; entries are 4 words apart
+ _57.Store(base * 4 + 12, asuint(bbox.x)); // asuint: bit reinterpretation, not a numeric conversion
+ _57.Store((base + 1u) * 4 + 12, asuint(bbox.y));
+ _57.Store((base + 2u) * 4 + 12, asuint(bbox.z));
+ _57.Store((base + 3u) * 4 + 12, asuint(bbox.w));
+}
+
+uint malloc_stage(uint size, uint mem_size, uint stage) // Atomically reserves size bytes from the counter at byte 0 of _57; returns the old counter value, or 0 on overflow.
+{
+ uint _65;
+ _57.InterlockedAdd(0, size, _65); // _65 receives the counter value before the add
+ uint offset = _65;
+ if ((offset + size) > mem_size) // request would run past mem_size
+ {
+ uint _76;
+ _57.InterlockedOr(4, stage, _76); // record the failing stage bit in the word at byte 4; the counter is not rolled back
+ offset = 0u; // 0 is the failure value that callers test for
+ }
+ return offset;
+}
+
+bool touch_mem(Alloc alloc, uint offset) // Always returns true and ignores both arguments - presumably a compiled-out bounds check; confirm against the shader source.
+{
+ return true;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val) // Stores val at word index offset in _57; alloc is only forwarded to touch_mem.
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken in this build: touch_mem always returns true
+ {
+ return;
+ }
+ _57.Store(offset * 4 + 12, val); // word index -> byte address, plus the 12-byte offset used for all data words
+}
+
+void comp_main() // Per workgroup: marks which bins each of 256 elements covers, counts elements per bin, allocates a chunk per bin and writes element indices into it.
+{
+ uint my_partition = gl_WorkGroupID.x;
+ for (uint i = 0u; i < 8u; i++)
+ {
+ bitmaps[i][gl_LocalInvocationID.x] = 0u; // each thread clears one column of the shared bitmaps
+ }
+ uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; // one element per thread, 256 per partition
+ int x0 = 0; // all-zero bounds mean "covers no bins" for threads past n_elements
+ int y0 = 0;
+ int x1 = 0;
+ int y1 = 0;
+ if (element_ix < _101.Load(4)) // byte 4 lines up with Config.n_elements
+ {
+ uint param = element_ix;
+ DrawMonoid draw_monoid = load_draw_monoid(param);
+ uint path_ix = draw_monoid.path_ix;
+ float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); // effectively unbounded box used when there is no clip
+ uint clip_ix = draw_monoid.clip_ix;
+ if (clip_ix > 0u) // clip_ix is 1-based here; 0 means no clip lookup
+ {
+ uint param_1 = clip_ix - 1u;
+ clip_bbox = load_clip_bbox(param_1);
+ }
+ uint param_2 = path_ix;
+ float4 path_bbox = load_path_bbox(param_2);
+ float4 param_3 = path_bbox;
+ float4 param_4 = clip_bbox;
+ float4 bbox = bbox_intersect(param_3, param_4);
+ float4 _354 = bbox;
+ float4 _356 = bbox;
+ float2 _358 = max(_354.xy, _356.zw); // force zw >= xy so a disjoint intersection collapses to a zero-size box
+ bbox.z = _358.x;
+ bbox.w = _358.y;
+ uint param_5 = element_ix;
+ float4 param_6 = bbox;
+ store_draw_bbox(param_5, param_6);
+ x0 = int(floor(bbox.x * 0.00390625f)); // 0.00390625 == 1/256: scale coordinates down to bin units
+ y0 = int(floor(bbox.y * 0.00390625f));
+ x1 = int(ceil(bbox.z * 0.00390625f));
+ y1 = int(ceil(bbox.w * 0.00390625f));
+ }
+ uint width_in_bins = ((_101.Load(12) + 16u) - 1u) / 16u; // ceil(width_in_tiles / 16)
+ uint height_in_bins = ((_101.Load(16) + 16u) - 1u) / 16u; // ceil(height_in_tiles / 16)
+ x0 = clamp(x0, 0, int(width_in_bins));
+ x1 = clamp(x1, x0, int(width_in_bins)); // lower bound x0 keeps the range non-negative
+ y0 = clamp(y0, 0, int(height_in_bins));
+ y1 = clamp(y1, y0, int(height_in_bins));
+ if (x0 == x1) // zero-width range: make the row range empty too so both loops below are skipped
+ {
+ y1 = y0;
+ }
+ int x = x0;
+ int y = y0;
+ uint my_slice = gl_LocalInvocationID.x / 32u; // which 32-thread slice this thread's bit lives in
+ uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); // this thread's bit within the slice word
+ while (y < y1) // walk the covered bins row by row
+ {
+ uint _460;
+ InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _460); // mark this element in the bin; the previous value (_460) is unused
+ x++;
+ if (x == x1)
+ {
+ x = x0;
+ y++;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync(); // all bitmap bits must be set before they are counted
+ uint element_count = 0u;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // from here the thread also acts for bin number gl_LocalInvocationID.x
+ {
+ element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x])));
+ count[i_1][gl_LocalInvocationID.x] = element_count; // inclusive running total over slices
+ }
+ uint chunk_offset = 0u;
+ if (element_count != 0u)
+ {
+ uint param_7 = element_count * 4u; // one 4-byte word per element in the bin
+ uint param_8 = _101.Load(0); // byte 0 lines up with Config.mem_size
+ uint param_9 = 1u; // stage bit OR-ed into the error word on failure
+ uint _510 = malloc_stage(param_7, param_8, param_9);
+ chunk_offset = _510;
+ sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
+ }
+ uint out_ix = (_101.Load(24) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); // byte 24 lines up with Config.bin_alloc; two words per (partition, bin)
+ Alloc _532;
+ _532.offset = _101.Load(24);
+ Alloc param_10;
+ param_10.offset = _532.offset;
+ uint param_11 = out_ix;
+ uint param_12 = element_count;
+ write_mem(param_10, param_11, param_12); // word 0: number of elements in the bin
+ Alloc _544;
+ _544.offset = _101.Load(24);
+ Alloc param_13;
+ param_13.offset = _544.offset;
+ uint param_14 = out_ix + 1u;
+ uint param_15 = chunk_offset;
+ write_mem(param_13, param_14, param_15); // word 1: chunk offset (0 when empty or when allocation failed)
+ GroupMemoryBarrierWithGroupSync(); // count[] and sh_chunk_offset[] must be visible before the scatter pass
+ x = x0;
+ y = y0;
+ while (y < y1) // second pass over the same bins, now writing element indices
+ {
+ uint bin_ix = (uint(y) * width_in_bins) + uint(x);
+ uint out_mask = bitmaps[my_slice][bin_ix];
+ if ((out_mask & my_mask) != 0u)
+ {
+ uint idx = uint(int(countbits(out_mask & (my_mask - 1u)))); // rank of this thread among lower-numbered threads of its slice
+ if (my_slice > 0u)
+ {
+ idx += count[my_slice - 1u][bin_ix]; // plus everything contributed by earlier slices
+ }
+ uint chunk_offset_1 = sh_chunk_offset[bin_ix];
+ if (chunk_offset_1 != 0u) // 0 is malloc_stage's failure value: skip the write
+ {
+ _57.Store(((chunk_offset_1 >> uint(2)) + idx) * 4 + 12, element_ix);
+ }
+ }
+ x++;
+ if (x == x1)
+ {
+ x = x0;
+ y++;
+ }
+ }
+}
+
+[numthreads(256, 1, 1)] // 256 threads per group, matching the 256-wide shared arrays
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the built-in inputs into the file-level statics, then runs comp_main.
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl
new file mode 100644
index 0000000..94ce30c
--- /dev/null
+++ b/piet-gpu/shader/gen/binning.msl
@@ -0,0 +1,281 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct Alloc // Thread-side handle with one uint offset; the parameter type of touch_mem and write_mem.
+{
+ uint offset; // main0 fills this from conf.bin_alloc.offset
+};
+
+struct DrawMonoid // Four-word record returned by load_draw_monoid; main0 uses path_ix and clip_ix only.
+{
+ uint path_ix; // passed to load_path_bbox
+ uint clip_ix; // 0 skips the clip lookup; otherwise clip_ix - 1 is passed to load_clip_bbox
+ uint scene_offset;
+ uint info_offset;
+};
+
+struct Memory // Layout of the read-write buffer bound at [[buffer(0)]]: three header words, then the data words.
+{
+ uint mem_offset; // allocation counter bumped atomically by malloc_stage
+ uint mem_error; // stage bits OR-ed in by malloc_stage when an allocation does not fit
+ uint blend_offset;
+ uint memory[1]; // declared with length 1 but indexed with computed offsets - presumably a runtime-sized array; confirm against the shader source
+};
+
+struct Alloc_1 // Same single-field shape as Alloc; this is the type used for the *_alloc members of Config.
+{
+ uint offset;
+};
+
+struct Config // Configuration record read through ConfigBuf; fields used by this file are noted below.
+{
+ uint mem_size; // limit passed to malloc_stage
+ uint n_elements; // threads with element_ix at or past this load nothing
+ uint n_pathseg;
+ uint width_in_tiles; // divided by 16 (rounded up) to get width_in_bins
+ uint height_in_tiles; // divided by 16 (rounded up) to get height_in_bins
+ Alloc_1 tile_alloc;
+ Alloc_1 bin_alloc; // base of the (count, chunk offset) pairs written by main0
+ Alloc_1 ptcl_alloc;
+ Alloc_1 pathseg_alloc;
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc; // read by load_path_bbox
+ Alloc_1 drawmonoid_alloc; // read by load_draw_monoid
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc; // read by load_clip_bbox
+ Alloc_1 draw_bbox_alloc; // written by store_draw_bbox
+ Alloc_1 drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Buffer wrapper around Config; bound read-only at [[buffer(1)]] in main0.
+{
+ Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // workgroup dimensions constant; not referenced by the functions below (hence maybe_unused)
+
+static inline __attribute__((always_inline)) // load_draw_monoid: reads the four-word DrawMonoid record for element_ix from v_57.memory.
+DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_57, const device ConfigBuf& v_101)
+{
+ uint base = (v_101.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix); // byte offset -> word index; records are 4 words apart
+ uint path_ix = v_57.memory[base];
+ uint clip_ix = v_57.memory[base + 1u];
+ uint scene_offset = v_57.memory[base + 2u];
+ uint info_offset = v_57.memory[base + 3u];
+ return DrawMonoid{ path_ix, clip_ix, scene_offset, info_offset };
+}
+
+static inline __attribute__((always_inline)) // load_clip_bbox: reads four consecutive words for clip_ix and reinterprets them as floats (x0, y0, x1, y1).
+float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_57, const device ConfigBuf& v_101)
+{
+ uint base = (v_101.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix); // byte offset -> word index; entries are 4 words apart
+ float x0 = as_type<float>(v_57.memory[base]); // as_type: bit reinterpretation, not a numeric conversion
+ float y0 = as_type<float>(v_57.memory[base + 1u]);
+ float x1 = as_type<float>(v_57.memory[base + 2u]);
+ float y1 = as_type<float>(v_57.memory[base + 3u]);
+ float4 bbox = float4(x0, y0, x1, y1);
+ return bbox;
+}
+
+static inline __attribute__((always_inline)) // load_path_bbox: reads the first four words of a path's 6-word record and converts them to floats offset by -32768.
+float4 load_path_bbox(thread const uint& path_ix, device Memory& v_57, const device ConfigBuf& v_101)
+{
+ uint base = (v_101.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); // byte offset -> word index; records are 6 words apart
+ float bbox_l = float(v_57.memory[base]) - 32768.0; // numeric uint -> float conversion, then remove the 32768 bias
+ float bbox_t = float(v_57.memory[base + 1u]) - 32768.0;
+ float bbox_r = float(v_57.memory[base + 2u]) - 32768.0;
+ float bbox_b = float(v_57.memory[base + 3u]) - 32768.0;
+ float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+ return bbox;
+}
+
+static inline __attribute__((always_inline)) // bbox_intersect: intersects two boxes stored as (x0, y0, x1, y1).
+float4 bbox_intersect(thread const float4& a, thread const float4& b)
+{
+ return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); // larger of the mins, smaller of the maxes; the result can have zw < xy when the boxes are disjoint
+}
+
+static inline __attribute__((always_inline)) // store_draw_bbox: writes bbox as four raw float bit patterns into the entry for draw_ix.
+void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_57, const device ConfigBuf& v_101)
+{
+ uint base = (v_101.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); // byte offset -> word index; entries are 4 words apart
+ v_57.memory[base] = as_type<uint>(bbox.x); // as_type: bit reinterpretation, not a numeric conversion
+ v_57.memory[base + 1u] = as_type<uint>(bbox.y);
+ v_57.memory[base + 2u] = as_type<uint>(bbox.z);
+ v_57.memory[base + 3u] = as_type<uint>(bbox.w);
+}
+
+static inline __attribute__((always_inline)) // malloc_stage: atomically reserves size bytes from v_57.mem_offset; returns the old counter value, or 0 on overflow.
+uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_57)
+{
+ uint _65 = atomic_fetch_add_explicit((device atomic_uint*)&v_57.mem_offset, size, memory_order_relaxed); // _65 is the counter value before the add
+ uint offset = _65;
+ if ((offset + size) > mem_size) // request would run past mem_size
+ {
+ uint _76 = atomic_fetch_or_explicit((device atomic_uint*)&v_57.mem_error, stage, memory_order_relaxed); // record the failing stage bit; the counter is not rolled back
+ offset = 0u; // 0 is the failure value that callers test for
+ }
+ return offset;
+}
+
+static inline __attribute__((always_inline)) // touch_mem: always returns true and ignores both arguments - presumably a compiled-out bounds check; confirm against the shader source.
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+ return true;
+}
+
+static inline __attribute__((always_inline)) // write_mem: stores val at word index offset in v_57.memory; alloc is only forwarded to touch_mem.
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_57)
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken in this build: touch_mem always returns true
+ {
+ return;
+ }
+ v_57.memory[offset] = val;
+}
+
+kernel void main0(device Memory& v_57 [[buffer(0)]], const device ConfigBuf& v_101 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) // Per threadgroup: marks which bins each of 256 elements covers, counts elements per bin, allocates a chunk per bin and writes element indices into it.
+{
+ threadgroup uint bitmaps[8][256]; // [slice][bin]: bit (thread & 31) of slice (thread / 32) marks that thread's element as covering the bin
+ threadgroup uint count[8][256]; // [slice][bin]: running popcount of bitmaps over slices 0..slice
+ threadgroup uint sh_chunk_offset[256]; // per bin: offset returned by malloc_stage (written only when the bin is non-empty)
+ uint my_partition = gl_WorkGroupID.x;
+ for (uint i = 0u; i < 8u; i++)
+ {
+ bitmaps[i][gl_LocalInvocationID.x] = 0u; // each thread clears one column of the shared bitmaps
+ }
+ uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; // one element per thread, 256 per partition
+ int x0 = 0; // all-zero bounds mean "covers no bins" for threads past n_elements
+ int y0 = 0;
+ int x1 = 0;
+ int y1 = 0;
+ if (element_ix < v_101.conf.n_elements)
+ {
+ uint param = element_ix;
+ DrawMonoid draw_monoid = load_draw_monoid(param, v_57, v_101);
+ uint path_ix = draw_monoid.path_ix;
+ float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); // effectively unbounded box used when there is no clip
+ uint clip_ix = draw_monoid.clip_ix;
+ if (clip_ix > 0u) // clip_ix is 1-based here; 0 means no clip lookup
+ {
+ uint param_1 = clip_ix - 1u;
+ clip_bbox = load_clip_bbox(param_1, v_57, v_101);
+ }
+ uint param_2 = path_ix;
+ float4 path_bbox = load_path_bbox(param_2, v_57, v_101);
+ float4 param_3 = path_bbox;
+ float4 param_4 = clip_bbox;
+ float4 bbox = bbox_intersect(param_3, param_4);
+ float4 _354 = bbox;
+ float4 _356 = bbox;
+ float2 _358 = fast::max(_354.xy, _356.zw); // force zw >= xy so a disjoint intersection collapses to a zero-size box
+ bbox.z = _358.x;
+ bbox.w = _358.y;
+ uint param_5 = element_ix;
+ float4 param_6 = bbox;
+ store_draw_bbox(param_5, param_6, v_57, v_101);
+ x0 = int(floor(bbox.x * 0.00390625)); // 0.00390625 == 1/256: scale coordinates down to bin units
+ y0 = int(floor(bbox.y * 0.00390625));
+ x1 = int(ceil(bbox.z * 0.00390625));
+ y1 = int(ceil(bbox.w * 0.00390625));
+ }
+ uint width_in_bins = ((v_101.conf.width_in_tiles + 16u) - 1u) / 16u; // ceil(width_in_tiles / 16)
+ uint height_in_bins = ((v_101.conf.height_in_tiles + 16u) - 1u) / 16u; // ceil(height_in_tiles / 16)
+ x0 = clamp(x0, 0, int(width_in_bins));
+ x1 = clamp(x1, x0, int(width_in_bins)); // lower bound x0 keeps the range non-negative
+ y0 = clamp(y0, 0, int(height_in_bins));
+ y1 = clamp(y1, y0, int(height_in_bins));
+ if (x0 == x1) // zero-width range: make the row range empty too so both loops below are skipped
+ {
+ y1 = y0;
+ }
+ int x = x0;
+ int y = y0;
+ uint my_slice = gl_LocalInvocationID.x / 32u; // which 32-thread slice this thread's bit lives in
+ uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); // this thread's bit within the slice word
+ while (y < y1) // walk the covered bins row by row
+ {
+ uint _460 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); // mark this element in the bin; the previous value (_460) is unused
+ x++;
+ if (x == x1)
+ {
+ x = x0;
+ y++;
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // all bitmap bits must be set before they are counted
+ uint element_count = 0u;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // from here the thread also acts for bin number gl_LocalInvocationID.x
+ {
+ element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
+ count[i_1][gl_LocalInvocationID.x] = element_count; // inclusive running total over slices
+ }
+ uint chunk_offset = 0u;
+ if (element_count != 0u)
+ {
+ uint param_7 = element_count * 4u; // one 4-byte word per element in the bin
+ uint param_8 = v_101.conf.mem_size;
+ uint param_9 = 1u; // stage bit OR-ed into mem_error on failure
+ uint _510 = malloc_stage(param_7, param_8, param_9, v_57);
+ chunk_offset = _510;
+ sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
+ }
+ uint out_ix = (v_101.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); // two words per (partition, bin)
+ Alloc param_10;
+ param_10.offset = v_101.conf.bin_alloc.offset;
+ uint param_11 = out_ix;
+ uint param_12 = element_count;
+ write_mem(param_10, param_11, param_12, v_57); // word 0: number of elements in the bin
+ Alloc param_13;
+ param_13.offset = v_101.conf.bin_alloc.offset;
+ uint param_14 = out_ix + 1u;
+ uint param_15 = chunk_offset;
+ write_mem(param_13, param_14, param_15, v_57); // word 1: chunk offset (0 when empty or when allocation failed)
+ threadgroup_barrier(mem_flags::mem_threadgroup); // count[] and sh_chunk_offset[] must be visible before the scatter pass
+ x = x0;
+ y = y0;
+ while (y < y1) // second pass over the same bins, now writing element indices
+ {
+ uint bin_ix = (uint(y) * width_in_bins) + uint(x);
+ uint out_mask = bitmaps[my_slice][bin_ix];
+ if ((out_mask & my_mask) != 0u)
+ {
+ uint idx = uint(int(popcount(out_mask & (my_mask - 1u)))); // rank of this thread among lower-numbered threads of its slice
+ if (my_slice > 0u)
+ {
+ idx += count[my_slice - 1u][bin_ix]; // plus everything contributed by earlier slices
+ }
+ uint chunk_offset_1 = sh_chunk_offset[bin_ix];
+ if (chunk_offset_1 != 0u) // 0 is malloc_stage's failure value: skip the write
+ {
+ v_57.memory[(chunk_offset_1 >> uint(2)) + idx] = element_ix;
+ }
+ }
+ x++;
+ if (x == x1)
+ {
+ x = x0;
+ y++;
+ }
+ }
+}
+
diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv
new file mode 100644
index 0000000..bcf544c
--- /dev/null
+++ b/piet-gpu/shader/gen/binning.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil
new file mode 100644
index 0000000..e3e943c
--- /dev/null
+++ b/piet-gpu/shader/gen/clip_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_leaf.hlsl b/piet-gpu/shader/gen/clip_leaf.hlsl
new file mode 100644
index 0000000..9a907c4
--- /dev/null
+++ b/piet-gpu/shader/gen/clip_leaf.hlsl
@@ -0,0 +1,371 @@
+struct Bic // Pair of counters combined by bic_combine; comp_main builds leaves as { 1 - is_push, is_push }.
+{
+ uint a; // 1 for a non-push leaf, so presumably the count of unmatched pops - confirm against the shader source
+ uint b; // 1 for a push leaf, so presumably the count of unmatched pushes - confirm against the shader source
+};
+
+struct ClipEl // Five-word record returned by load_clip_el: one index word followed by four float words.
+{
+ uint parent_ix; // stored into sh_stack by comp_main
+ float4 bbox; // (x0, y0, x1, y1) as assembled in load_clip_el
+};
+
+struct Alloc // Handle holding a single uint offset; used for the *_alloc members of Config.
+{
+ uint offset;
+};
+
+struct Config // Not named by the code below, which reads the config via _80.Load(byte offset); offsets noted assume tightly packed 4-byte members.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc; // byte 40
+ Alloc drawmonoid_alloc; // byte 44
+ Alloc clip_alloc; // byte 48
+ Alloc clip_bic_alloc; // byte 52
+ Alloc clip_stack_alloc; // byte 56
+ Alloc clip_bbox_alloc; // byte 60
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip; // byte 80
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same dimensions as [numthreads(256, 1, 1)] on main
+
+static const Bic _394 = { 0u, 0u }; // all-zero Bic used by comp_main as the starting value
+
+ByteAddressBuffer _80 : register(t1, space0); // read-only buffer loaded at fixed byte offsets that line up with the Config fields
+RWByteAddressBuffer _96 : register(u0, space0); // read-write buffer; data words are addressed as word * 4 + 12
+
+static uint3 gl_WorkGroupID; // the three statics below are filled in by main from stage_input
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Bic sh_bic[510]; // first used as a 256-entry scan buffer, later as a reduction tree (256 + 128 + ... + 2 = 510 nodes)
+groupshared uint sh_stack[256]; // parent_ix of the clip elements loaded by comp_main
+groupshared float4 sh_stack_bbox[256]; // running intersection of the loaded clip element boxes
+groupshared uint sh_link[256]; // per-thread link produced by search_link, then advanced each round
+groupshared float4 sh_bbox[256]; // per-thread box, intersected along the links
+
+Bic load_bic(uint ix) // Reads the two-word Bic entry number ix from _96.
+{
+ uint base = (_80.Load(52) >> uint(2)) + (2u * ix); // byte 52 lines up with Config.clip_bic_alloc; entries are 2 words apart
+ Bic _287 = { _96.Load(base * 4 + 12), _96.Load((base + 1u) * 4 + 12) }; // word index -> byte address, plus the 12-byte offset used for all data words
+ return _287;
+}
+
+Bic bic_combine(Bic x, Bic y) // Combines two Bic values in order (x first, then y); the operation is not symmetric in x and y.
+{
+ uint m = min(x.b, y.a); // amount of x.b cancelled against y.a
+ Bic _72 = { (x.a + y.a) - m, (x.b + y.b) - m }; // sum both counters, minus the cancelled amount from each
+ return _72;
+}
+
+ClipEl load_clip_el(uint ix) // Reads the five-word ClipEl record number ix from _96.
+{
+ uint base = (_80.Load(56) >> uint(2)) + (5u * ix); // byte 56 lines up with Config.clip_stack_alloc; records are 5 words apart
+ uint parent_ix = _96.Load(base * 4 + 12);
+ float x0 = asfloat(_96.Load((base + 1u) * 4 + 12)); // asfloat: bit reinterpretation, not a numeric conversion
+ float y0 = asfloat(_96.Load((base + 2u) * 4 + 12));
+ float x1 = asfloat(_96.Load((base + 3u) * 4 + 12));
+ float y1 = asfloat(_96.Load((base + 4u) * 4 + 12));
+ float4 bbox = float4(x0, y0, x1, y1);
+ ClipEl _336 = { parent_ix, bbox };
+ return _336;
+}
+
+float4 bbox_intersect(float4 a, float4 b) // Intersects two boxes stored as (x0, y0, x1, y1).
+{
+ return float4(max(a.xy, b.xy), min(a.zw, b.zw)); // larger of the mins, smaller of the maxes; the result can have zw < xy when the boxes are disjoint
+}
+
+uint load_path_ix(uint ix) // Returns the clip input word for index ix, or 0x80000000 when ix is out of range.
+{
+ if (ix < _80.Load(80)) // byte 80 lines up with Config.n_clip
+ {
+ return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 12); // byte 48 lines up with Config.clip_alloc; one word per entry
+ }
+ else
+ {
+ return 2147483648u; // 0x80000000: negative when viewed as int, so comp_main's int(inp) >= 0 test treats it as not-a-push
+ }
+}
+
+float4 load_path_bbox(uint path_ix) // Reads the first four words of a path's 6-word record and converts them to floats offset by -32768.
+{
+ uint base = (_80.Load(40) >> uint(2)) + (6u * path_ix); // byte 40 lines up with Config.path_bbox_alloc; records are 6 words apart
+ float bbox_l = float(_96.Load(base * 4 + 12)) - 32768.0f; // numeric uint -> float conversion, then remove the 32768 bias
+ float bbox_t = float(_96.Load((base + 1u) * 4 + 12)) - 32768.0f;
+ float bbox_r = float(_96.Load((base + 2u) * 4 + 12)) - 32768.0f;
+ float bbox_b = float(_96.Load((base + 3u) * 4 + 12)) - 32768.0f;
+ float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+ return bbox;
+}
+
+uint search_link(inout Bic bic) // Walks the sh_bic tree leftwards from this thread's position and returns a link value; bic is updated in place with what was absorbed.
+{
+ uint ix = gl_LocalInvocationID.x;
+ uint j = 0u;
+ while (j < 8u) // upward pass over tree levels 0..7
+ {
+ uint base = 512u - (2u << (8u - j)); // start of level j inside sh_bic: 0, 256, 384, ... (same bases comp_main writes to)
+ if (((ix >> j) & 1u) != 0u) // bit j set: there is a node to the left at this level
+ {
+ Bic param = sh_bic[(base + (ix >> j)) - 1u]; // the left neighbour node at level j
+ Bic param_1 = bic;
+ Bic test = bic_combine(param, param_1);
+ if (test.b > 0u) // absorbing this node would leave a surplus in b: stop climbing and search inside it
+ {
+ break;
+ }
+ bic = test;
+ ix -= (1u << j); // step past the 2^j leaves just absorbed
+ }
+ j++;
+ }
+ if (ix > 0u)
+ {
+ while (j > 0u) // downward pass: absorb left nodes only while b stays 0
+ {
+ j--;
+ uint base_1 = 512u - (2u << (8u - j));
+ Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
+ Bic param_3 = bic;
+ Bic test_1 = bic_combine(param_2, param_3);
+ if (test_1.b == 0u)
+ {
+ bic = test_1;
+ ix -= (1u << j);
+ }
+ }
+ }
+ if (ix > 0u)
+ {
+ return ix - 1u; // link to a thread index inside this workgroup
+ }
+ else
+ {
+ return 4294967295u - bic.a; // 0xFFFFFFFF - bic.a; callers use int(link) >= 0 to tell the two cases apart
+ }
+}
+
+void store_clip_bbox(uint ix, float4 bbox) // Writes bbox as four raw float bit patterns into entry ix.
+{
+ uint base = (_80.Load(60) >> uint(2)) + (4u * ix); // byte 60 lines up with Config.clip_bbox_alloc; entries are 4 words apart
+ _96.Store(base * 4 + 12, asuint(bbox.x)); // asuint: bit reinterpretation, not a numeric conversion
+ _96.Store((base + 1u) * 4 + 12, asuint(bbox.y));
+ _96.Store((base + 2u) * 4 + 12, asuint(bbox.z));
+ _96.Store((base + 3u) * 4 + 12, asuint(bbox.w));
+}
+
+void comp_main() // Per workgroup of 256 clip inputs: loads state for earlier workgroups, links each input to an earlier one via the Bic tree, intersects boxes along the links and stores one clip bbox per input.
+{
+ uint th = gl_LocalInvocationID.x;
+ Bic bic = _394;
+ if (th < gl_WorkGroupID.x) // only entries belonging to workgroups before this one are loaded; the rest stay {0, 0}
+ {
+ uint param = th;
+ bic = load_bic(param);
+ }
+ sh_bic[th] = bic;
+ for (uint i = 0u; i < 8u; i++) // 8 doubling rounds: each thread folds in the entry 2^i positions to its right
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if ((th + (1u << i)) < 256u)
+ {
+ Bic other = sh_bic[th + (1u << i)];
+ Bic param_1 = bic;
+ Bic param_2 = other;
+ bic = bic_combine(param_1, param_2);
+ }
+ GroupMemoryBarrierWithGroupSync(); // everyone has read before anyone overwrites
+ sh_bic[th] = bic;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ uint stack_size = sh_bic[0].b; // b of the combination over all loaded entries
+ uint sp = 255u - th; // threads are mapped in reverse order
+ uint ix = 0u;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // binary search over sh_bic with steps 128, 64, ..., 1
+ {
+ uint probe = ix + (128u >> i_1);
+ if (sp < sh_bic[probe].b)
+ {
+ ix = probe;
+ }
+ }
+ uint b = sh_bic[ix].b;
+ float4 bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); // effectively unbounded starting box
+ if (sp < b)
+ {
+ uint param_3 = (((ix * 256u) + b) - sp) - 1u; // index of the clip element to load, derived from the entry found above
+ ClipEl el = load_clip_el(param_3);
+ sh_stack[th] = el.parent_ix;
+ bbox = el.bbox;
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++) // 8 doubling rounds: intersect with the box 2^i positions to the left
+ {
+ sh_stack_bbox[th] = bbox;
+ GroupMemoryBarrierWithGroupSync();
+ if (th >= (1u << i_2))
+ {
+ float4 param_4 = sh_stack_bbox[th - (1u << i_2)];
+ float4 param_5 = bbox;
+ bbox = bbox_intersect(param_4, param_5);
+ }
+ GroupMemoryBarrierWithGroupSync();
+ }
+ sh_stack_bbox[th] = bbox;
+ uint param_6 = gl_GlobalInvocationID.x;
+ uint inp = load_path_ix(param_6); // this thread's own clip input word
+ bool is_push = int(inp) >= 0; // top bit clear means push; out-of-range inputs (0x80000000) count as not-push
+ Bic _560 = { 1u - uint(is_push), uint(is_push) }; // leaf value: {0, 1} for a push, {1, 0} otherwise
+ bic = _560;
+ sh_bic[th] = bic;
+ if (is_push)
+ {
+ uint param_7 = inp;
+ bbox = load_path_bbox(param_7); // for a push the input word is used as a path index
+ }
+ else
+ {
+ bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
+ }
+ uint inbase = 0u;
+ for (uint i_3 = 0u; i_3 < 7u; i_3++) // build 7 upper tree levels on top of the 256 leaves in sh_bic
+ {
+ uint outbase = 512u - (1u << (8u - i_3)); // level bases 256, 384, 448, ...
+ GroupMemoryBarrierWithGroupSync();
+ if (th < (1u << (7u - i_3))) // half as many active threads per level
+ {
+ Bic param_8 = sh_bic[inbase + (th * 2u)];
+ Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
+ sh_bic[outbase + th] = bic_combine(param_8, param_9); // parent = left child combined with right child
+ }
+ inbase = outbase;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ bic = _394;
+ Bic param_10 = bic;
+ uint _619 = search_link(param_10);
+ bic = param_10; // copy back the inout result
+ uint link = _619;
+ sh_link[th] = link;
+ GroupMemoryBarrierWithGroupSync();
+ uint grandparent;
+ if (int(link) >= 0) // non-negative link: index of a thread in this workgroup
+ {
+ grandparent = sh_link[link];
+ }
+ else
+ {
+ grandparent = link - 1u; // negative link: one step further in the same negative encoding
+ }
+ uint parent;
+ if (int(link) >= 0)
+ {
+ parent = (gl_WorkGroupID.x * 256u) + link; // convert the local index to a global one
+ }
+ else
+ {
+ if (int(link + stack_size) >= 0) // negative link that still falls within the loaded stack
+ {
+ parent = sh_stack[256u + link]; // unsigned wrap-around makes 256 + link index from the top of sh_stack
+ }
+ else
+ {
+ parent = 4294967295u; // no parent available
+ }
+ }
+ for (uint i_4 = 0u; i_4 < 8u; i_4++) // 8 rounds: intersect with the linked box, then follow that element's link
+ {
+ if (i_4 != 0u)
+ {
+ sh_link[th] = link; // round 0 already has sh_link populated above
+ }
+ sh_bbox[th] = bbox;
+ GroupMemoryBarrierWithGroupSync();
+ if (int(link) >= 0)
+ {
+ float4 param_11 = sh_bbox[link];
+ float4 param_12 = bbox;
+ bbox = bbox_intersect(param_11, param_12);
+ link = sh_link[link];
+ }
+ GroupMemoryBarrierWithGroupSync();
+ }
+ if (int(link + stack_size) >= 0) // chain ended in the loaded stack: fold in its accumulated box
+ {
+ float4 param_13 = sh_stack_bbox[256u + link];
+ float4 param_14 = bbox;
+ bbox = bbox_intersect(param_13, param_14);
+ }
+ sh_bbox[th] = bbox;
+ GroupMemoryBarrierWithGroupSync();
+ uint path_ix = inp;
+ bool _718 = !is_push;
+ bool _726;
+ if (_718) // expanded form of: !is_push && gl_GlobalInvocationID.x < n_clip
+ {
+ _726 = gl_GlobalInvocationID.x < _80.Load(80);
+ }
+ else
+ {
+ _726 = _718;
+ }
+ if (_726) // in-range non-push input
+ {
+ uint param_15 = parent;
+ path_ix = load_path_ix(param_15); // input word of the linked parent
+ uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (4u * (~inp)); // byte 44 lines up with Config.drawmonoid_alloc; ~inp is used as the record index
+ _96.Store(drawmonoid_out_base * 4 + 12, path_ix); // overwrite word 0 of that 4-word record
+ if (int(grandparent) >= 0)
+ {
+ bbox = sh_bbox[grandparent];
+ }
+ else
+ {
+ if (int(grandparent + stack_size) >= 0)
+ {
+ bbox = sh_stack_bbox[256u + grandparent];
+ }
+ else
+ {
+ bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
+ }
+ }
+ }
+ uint param_16 = gl_GlobalInvocationID.x;
+ float4 param_17 = bbox;
+ store_clip_bbox(param_16, param_17); // every thread stores a box, including threads past n_clip
+}
+
+[numthreads(256, 1, 1)] // 256 threads per group, matching the 256-wide shared arrays
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the built-in inputs into the file-level statics, then runs comp_main.
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/clip_leaf.msl b/piet-gpu/shader/gen/clip_leaf.msl
new file mode 100644
index 0000000..3d93b91
--- /dev/null
+++ b/piet-gpu/shader/gen/clip_leaf.msl
@@ -0,0 +1,371 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Bic // Pair of counters combined by bic_combine; main0 builds leaves as Bic{ 1 - is_push, is_push }.
+{
+ uint a; // 1 for a non-push leaf, so presumably the count of unmatched pops - confirm against the shader source
+ uint b; // 1 for a push leaf, so presumably the count of unmatched pushes - confirm against the shader source
+};
+
+struct ClipEl // Five-word record returned by load_clip_el: one index word followed by four float words.
+{
+ uint parent_ix; // stored into sh_stack by main0
+ float4 bbox; // (x0, y0, x1, y1) as assembled in load_clip_el
+};
+
+struct Alloc // Handle holding a single uint offset; used for every *_alloc member of Config.
+{
+ uint offset; // the loaders shift this right by 2 before indexing Memory::memory, i.e. treat it as a byte offset
+};
+
+struct Config // Configuration record read through ConfigBuf; fields used by this file are noted below.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc; // read by load_path_bbox
+ Alloc drawmonoid_alloc; // written by main0 for in-range non-push inputs
+ Alloc clip_alloc; // read by load_path_ix
+ Alloc clip_bic_alloc; // read by load_bic
+ Alloc clip_stack_alloc; // read by load_clip_el
+ Alloc clip_bbox_alloc; // written by store_clip_bbox
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip; // range limit checked in load_path_ix and main0
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Buffer wrapper around Config; bound read-only at [[buffer(1)]] in main0.
+{
+ Config conf;
+};
+
+struct Memory // Layout of the read-write buffer bound at [[buffer(0)]]: three header words, then the data words.
+{
+ uint mem_offset;
+ uint mem_error;
+ uint blend_offset;
+ uint memory[1]; // declared with length 1 but indexed with computed offsets - presumably a runtime-sized array; confirm against the shader source
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // workgroup dimensions constant; not referenced by the functions below (hence maybe_unused)
+
+static inline __attribute__((always_inline)) // load_bic: reads the two-word Bic entry number ix from v_96.memory.
+Bic load_bic(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
+{
+ uint base = (v_80.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); // byte offset -> word index; entries are 2 words apart
+ return Bic{ v_96.memory[base], v_96.memory[base + 1u] };
+}
+
+static inline __attribute__((always_inline)) // bic_combine: combines two Bic values in order (x first, then y); not symmetric in x and y.
+Bic bic_combine(thread const Bic& x, thread const Bic& y)
+{
+ uint m = min(x.b, y.a); // amount of x.b cancelled against y.a
+ return Bic{ (x.a + y.a) - m, (x.b + y.b) - m }; // sum both counters, minus the cancelled amount from each
+}
+
+static inline __attribute__((always_inline)) // load_clip_el: reads the five-word ClipEl record number ix from v_96.memory.
+ClipEl load_clip_el(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
+{
+ uint base = (v_80.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); // byte offset -> word index; records are 5 words apart
+ uint parent_ix = v_96.memory[base];
+ float x0 = as_type<float>(v_96.memory[base + 1u]); // as_type: bit reinterpretation, not a numeric conversion
+ float y0 = as_type<float>(v_96.memory[base + 2u]);
+ float x1 = as_type<float>(v_96.memory[base + 3u]);
+ float y1 = as_type<float>(v_96.memory[base + 4u]);
+ float4 bbox = float4(x0, y0, x1, y1);
+ return ClipEl{ parent_ix, bbox };
+}
+
+static inline __attribute__((always_inline)) // bbox_intersect: intersects two boxes stored as (x0, y0, x1, y1).
+float4 bbox_intersect(thread const float4& a, thread const float4& b)
+{
+ return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); // larger of the mins, smaller of the maxes; the result can have zw < xy when the boxes are disjoint
+}
+
+static inline __attribute__((always_inline)) // load_path_ix: returns the clip input word for index ix, or 0x80000000 when ix is out of range.
+uint load_path_ix(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
+{
+ if (ix < v_80.conf.n_clip)
+ {
+ return v_96.memory[(v_80.conf.clip_alloc.offset >> uint(2)) + ix]; // byte offset -> word index; one word per entry
+ }
+ else
+ {
+ return 2147483648u; // 0x80000000: negative when viewed as int, so main0's int(inp) >= 0 test treats it as not-a-push
+ }
+}
+
+static inline __attribute__((always_inline)) // load_path_bbox: reads the first four words of a path's 6-word record and converts them to floats offset by -32768.
+float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96)
+{
+ uint base = (v_80.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); // byte offset -> word index; records are 6 words apart
+ float bbox_l = float(v_96.memory[base]) - 32768.0; // numeric uint -> float conversion, then remove the 32768 bias
+ float bbox_t = float(v_96.memory[base + 1u]) - 32768.0;
+ float bbox_r = float(v_96.memory[base + 2u]) - 32768.0;
+ float bbox_b = float(v_96.memory[base + 3u]) - 32768.0;
+ float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+ return bbox;
+}
+
+static inline __attribute__((always_inline)) // search_link: walks the sh_bic tree leftwards from this thread's position and returns a link value; bic is updated in place.
+uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[510])
+{
+ uint ix = gl_LocalInvocationID.x;
+ uint j = 0u;
+ while (j < 8u) // upward pass over tree levels 0..7
+ {
+ uint base = 512u - (2u << (8u - j)); // start of level j inside sh_bic: 0, 256, 384, ... (same bases main0 writes to)
+ if (((ix >> j) & 1u) != 0u) // bit j set: there is a node to the left at this level
+ {
+ Bic param = sh_bic[(base + (ix >> j)) - 1u]; // the left neighbour node at level j
+ Bic param_1 = bic;
+ Bic test = bic_combine(param, param_1);
+ if (test.b > 0u) // absorbing this node would leave a surplus in b: stop climbing and search inside it
+ {
+ break;
+ }
+ bic = test;
+ ix -= (1u << j); // step past the 2^j leaves just absorbed
+ }
+ j++;
+ }
+ if (ix > 0u)
+ {
+ while (j > 0u) // downward pass: absorb left nodes only while b stays 0
+ {
+ j--;
+ uint base_1 = 512u - (2u << (8u - j));
+ Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
+ Bic param_3 = bic;
+ Bic test_1 = bic_combine(param_2, param_3);
+ if (test_1.b == 0u)
+ {
+ bic = test_1;
+ ix -= (1u << j);
+ }
+ }
+ }
+ if (ix > 0u)
+ {
+ return ix - 1u; // link to a thread index inside this threadgroup
+ }
+ else
+ {
+ return 4294967295u - bic.a; // 0xFFFFFFFF - bic.a; callers use int(link) >= 0 to tell the two cases apart
+ }
+}
+
+static inline __attribute__((always_inline)) // store_clip_bbox: writes bbox as four raw float bit patterns into entry ix.
+void store_clip_bbox(thread const uint& ix, thread const float4& bbox, const device ConfigBuf& v_80, device Memory& v_96)
+{
+ uint base = (v_80.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * ix); // byte offset -> word index; entries are 4 words apart
+ v_96.memory[base] = as_type<uint>(bbox.x); // as_type: bit reinterpretation, not a numeric conversion
+ v_96.memory[base + 1u] = as_type<uint>(bbox.y);
+ v_96.memory[base + 2u] = as_type<uint>(bbox.z);
+ v_96.memory[base + 3u] = as_type<uint>(bbox.w);
+}
+
+kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_80 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) // Per threadgroup of 256 clip inputs: loads state for earlier threadgroups, links each input to an earlier one via the Bic tree, intersects boxes along the links and stores one clip bbox per input.
+{
+ threadgroup Bic sh_bic[510]; // first used as a 256-entry scan buffer, later as a reduction tree (256 + 128 + ... + 2 = 510 nodes)
+ threadgroup uint sh_stack[256]; // parent_ix of the clip elements loaded below
+ threadgroup float4 sh_stack_bbox[256]; // running intersection of the loaded clip element boxes
+ threadgroup uint sh_link[256]; // per-thread link produced by search_link, then advanced each round
+ threadgroup float4 sh_bbox[256]; // per-thread box, intersected along the links
+ uint th = gl_LocalInvocationID.x;
+ Bic bic = Bic{ 0u, 0u };
+ if (th < gl_WorkGroupID.x) // only entries belonging to threadgroups before this one are loaded; the rest stay {0, 0}
+ {
+ uint param = th;
+ bic = load_bic(param, v_80, v_96);
+ }
+ sh_bic[th] = bic;
+ for (uint i = 0u; i < 8u; i++) // 8 doubling rounds: each thread folds in the entry 2^i positions to its right
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if ((th + (1u << i)) < 256u)
+ {
+ Bic other = sh_bic[th + (1u << i)];
+ Bic param_1 = bic;
+ Bic param_2 = other;
+ bic = bic_combine(param_1, param_2);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // everyone has read before anyone overwrites
+ sh_bic[th] = bic;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint stack_size = sh_bic[0].b; // b of the combination over all loaded entries
+ uint sp = 255u - th; // threads are mapped in reverse order
+ uint ix = 0u;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // binary search over sh_bic with steps 128, 64, ..., 1
+ {
+ uint probe = ix + (128u >> i_1);
+ if (sp < sh_bic[probe].b)
+ {
+ ix = probe;
+ }
+ }
+ uint b = sh_bic[ix].b;
+ float4 bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); // effectively unbounded starting box
+ if (sp < b)
+ {
+ uint param_3 = (((ix * 256u) + b) - sp) - 1u; // index of the clip element to load, derived from the entry found above
+ ClipEl el = load_clip_el(param_3, v_80, v_96);
+ sh_stack[th] = el.parent_ix;
+ bbox = el.bbox;
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++) // 8 doubling rounds: intersect with the box 2^i positions to the left
+ {
+ sh_stack_bbox[th] = bbox;
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (th >= (1u << i_2))
+ {
+ float4 param_4 = sh_stack_bbox[th - (1u << i_2)];
+ float4 param_5 = bbox;
+ bbox = bbox_intersect(param_4, param_5);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ }
+ sh_stack_bbox[th] = bbox;
+ uint param_6 = gl_GlobalInvocationID.x;
+ uint inp = load_path_ix(param_6, v_80, v_96); // this thread's own clip input word
+ bool is_push = int(inp) >= 0; // top bit clear means push; out-of-range inputs (0x80000000) count as not-push
+ bic = Bic{ 1u - uint(is_push), uint(is_push) }; // leaf value: {0, 1} for a push, {1, 0} otherwise
+ sh_bic[th] = bic;
+ if (is_push)
+ {
+ uint param_7 = inp;
+ bbox = load_path_bbox(param_7, v_80, v_96); // for a push the input word is used as a path index
+ }
+ else
+ {
+ bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
+ }
+ uint inbase = 0u;
+ for (uint i_3 = 0u; i_3 < 7u; i_3++) // build 7 upper tree levels on top of the 256 leaves in sh_bic
+ {
+ uint outbase = 512u - (1u << (8u - i_3)); // level bases 256, 384, 448, ...
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (th < (1u << (7u - i_3))) // half as many active threads per level
+ {
+ Bic param_8 = sh_bic[inbase + (th * 2u)];
+ Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
+ sh_bic[outbase + th] = bic_combine(param_8, param_9); // parent = left child combined with right child
+ }
+ inbase = outbase;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ bic = Bic{ 0u, 0u };
+ Bic param_10 = bic;
+ uint _619 = search_link(param_10, gl_LocalInvocationID, sh_bic);
+ bic = param_10; // copy back the by-reference result
+ uint link = _619;
+ sh_link[th] = link;
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint grandparent;
+ if (int(link) >= 0) // non-negative link: index of a thread in this threadgroup
+ {
+ grandparent = sh_link[link];
+ }
+ else
+ {
+ grandparent = link - 1u; // negative link: one step further in the same negative encoding
+ }
+ uint parent;
+ if (int(link) >= 0)
+ {
+ parent = (gl_WorkGroupID.x * 256u) + link; // convert the local index to a global one
+ }
+ else
+ {
+ if (int(link + stack_size) >= 0) // negative link that still falls within the loaded stack
+ {
+ parent = sh_stack[256u + link]; // unsigned wrap-around makes 256 + link index from the top of sh_stack
+ }
+ else
+ {
+ parent = 4294967295u; // no parent available
+ }
+ }
+ for (uint i_4 = 0u; i_4 < 8u; i_4++) // 8 rounds: intersect with the linked box, then follow that element's link
+ {
+ if (i_4 != 0u)
+ {
+ sh_link[th] = link; // round 0 already has sh_link populated above
+ }
+ sh_bbox[th] = bbox;
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (int(link) >= 0)
+ {
+ float4 param_11 = sh_bbox[link];
+ float4 param_12 = bbox;
+ bbox = bbox_intersect(param_11, param_12);
+ link = sh_link[link];
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ }
+ if (int(link + stack_size) >= 0) // chain ended in the loaded stack: fold in its accumulated box
+ {
+ float4 param_13 = sh_stack_bbox[256u + link];
+ float4 param_14 = bbox;
+ bbox = bbox_intersect(param_13, param_14);
+ }
+ sh_bbox[th] = bbox;
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint path_ix = inp;
+ bool _718 = !is_push;
+ bool _726;
+ if (_718) // expanded form of: !is_push && gl_GlobalInvocationID.x < n_clip
+ {
+ _726 = gl_GlobalInvocationID.x < v_80.conf.n_clip;
+ }
+ else
+ {
+ _726 = _718;
+ }
+ if (_726) // in-range non-push input
+ {
+ uint param_15 = parent;
+ path_ix = load_path_ix(param_15, v_80, v_96); // input word of the linked parent
+ uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * (~inp)); // ~inp is used as the index of a 4-word record
+ v_96.memory[drawmonoid_out_base] = path_ix; // overwrite word 0 of that record
+ if (int(grandparent) >= 0)
+ {
+ bbox = sh_bbox[grandparent];
+ }
+ else
+ {
+ if (int(grandparent + stack_size) >= 0)
+ {
+ bbox = sh_stack_bbox[256u + grandparent];
+ }
+ else
+ {
+ bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
+ }
+ }
+ }
+ uint param_16 = gl_GlobalInvocationID.x;
+ float4 param_17 = bbox;
+ store_clip_bbox(param_16, param_17, v_80, v_96); // every thread stores a box, including threads past n_clip
+}
+
diff --git a/piet-gpu/shader/gen/clip_leaf.spv b/piet-gpu/shader/gen/clip_leaf.spv
new file mode 100644
index 0000000..25014ed
--- /dev/null
+++ b/piet-gpu/shader/gen/clip_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil
new file mode 100644
index 0000000..b18a155
--- /dev/null
+++ b/piet-gpu/shader/gen/clip_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/clip_reduce.hlsl b/piet-gpu/shader/gen/clip_reduce.hlsl
new file mode 100644
index 0000000..b7f0e85
--- /dev/null
+++ b/piet-gpu/shader/gen/clip_reduce.hlsl
@@ -0,0 +1,181 @@
+struct Bic // Pair of counters that bic_combine() merges.
+{
+ uint a; // comp_main seeds this with 1u - uint(is_push).
+ uint b; // comp_main seeds this with uint(is_push).
+};
+
+struct ClipEl // Record that store_clip_el() writes as five consecutive words.
+{
+ uint parent_ix; // Written to word 0 of the record.
+ float4 bbox; // Written bit-for-bit (asuint) to words 1..4 of the record.
+};
+
+struct Alloc // Single-word wrapper; the type of the *_alloc members of Config.
+{
+ uint offset; // The MSL twin of this shader shifts this right by 2 before indexing words, i.e. it is used as a byte offset there.
+};
+
+struct Config // Not referenced by name in this file: the code reads configuration through raw _64.Load(byte) calls. Every member is one 32-bit word.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc; // 11th word = byte 40; matches _64.Load(40) in load_path_bbox().
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc; // Byte 48; matches _64.Load(48) in comp_main().
+ Alloc clip_bic_alloc; // Byte 52; matches _64.Load(52) in store_bic().
+ Alloc clip_stack_alloc; // Byte 56; matches _64.Load(56) in store_clip_el().
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // Same extent as [numthreads(256, 1, 1)] on main().
+
+static const Bic _268 = { 0u, 0u }; // All-zero Bic; comp_main assigns it to `bic` before the guarded read of sh_bic[th + 1u].
+
+ByteAddressBuffer _64 : register(t1, space0); // Read-only; loaded at bytes 40/48/52/56 (the *_alloc offsets named in the MSL twin).
+RWByteAddressBuffer _80 : register(u0, space0); // Read-write; every access adds 12 bytes, the size of the three words that precede memory[] in the MSL Memory struct.
+
+static uint3 gl_WorkGroupID; // Filled from the stage input in main().
+static uint3 gl_LocalInvocationID; // Filled from the stage input in main().
+static uint3 gl_GlobalInvocationID; // Filled from the stage input in main().
+struct SPIRV_Cross_Input // Entry-point parameter of main(); each member is copied into the static of the same name.
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Bic sh_bic[256]; // One slot per thread; holds the running Bic values during the combine loop in comp_main.
+groupshared uint sh_parent[256]; // comp_main stores a thread index `th` at slot local_ix.
+groupshared uint sh_path_ix[256]; // comp_main stores the input word `inp` at slot local_ix.
+groupshared float4 sh_bbox[256]; // Declared but never read or written anywhere in this file.
+
+Bic bic_combine(Bic x, Bic y) // Returns {x.a + y.a - m, x.b + y.b - m} with m = min(x.b, y.a).
+{
+ uint m = min(x.b, y.a); // Amount removed from both summed fields.
+ Bic _56 = { (x.a + y.a) - m, (x.b + y.b) - m };
+ return _56;
+}
+
+void store_bic(uint ix, Bic bic) // Writes bic.a and bic.b to two consecutive words of _80 for entry ix.
+{
+ uint base = (_64.Load(52) >> uint(2)) + (2u * ix); // Word index: config byte 52 (clip_bic_alloc.offset in the MSL twin) / 4, plus 2 words per entry.
+ _80.Store(base * 4 + 12, bic.a); // Back to a byte address, plus the 12-byte offset applied to every _80 access.
+ _80.Store((base + 1u) * 4 + 12, bic.b);
+}
+
+float4 load_path_bbox(uint path_ix) // Reads four consecutive words for path_ix and returns them as floats, each reduced by 32768.
+{
+ uint base = (_64.Load(40) >> uint(2)) + (6u * path_ix); // Word index: config byte 40 (path_bbox_alloc.offset in the MSL twin) / 4, plus 6 words per path; only the first 4 are read here.
+ float bbox_l = float(_80.Load(base * 4 + 12)) - 32768.0f; // Numeric uint-to-float conversion (not a bit cast), then the 32768 bias is removed.
+ float bbox_t = float(_80.Load((base + 1u) * 4 + 12)) - 32768.0f;
+ float bbox_r = float(_80.Load((base + 2u) * 4 + 12)) - 32768.0f;
+ float bbox_b = float(_80.Load((base + 3u) * 4 + 12)) - 32768.0f;
+ float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+ return bbox;
+}
+
+void store_clip_el(uint ix, ClipEl el) // Writes el as five consecutive words of _80 for entry ix.
+{
+ uint base = (_64.Load(56) >> uint(2)) + (5u * ix); // Word index: config byte 56 (clip_stack_alloc.offset in the MSL twin) / 4, plus 5 words per entry.
+ _80.Store(base * 4 + 12, el.parent_ix);
+ _80.Store((base + 1u) * 4 + 12, asuint(el.bbox.x)); // asuint keeps the float bit pattern unchanged.
+ _80.Store((base + 2u) * 4 + 12, asuint(el.bbox.y));
+ _80.Store((base + 3u) * 4 + 12, asuint(el.bbox.z));
+ _80.Store((base + 4u) * 4 + 12, asuint(el.bbox.w));
+}
+
+void comp_main() // Per-thread body: combines the workgroup's Bic values, stores the group result, then writes one ClipEl for each thread index below `size`.
+{
+ uint th = gl_LocalInvocationID.x;
+ uint inp = _80.Load(((_64.Load(48) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 12); // Input word at (config byte 48 >> 2) + global index; byte 48 is clip_alloc.offset in the MSL twin.
+ bool is_push = int(inp) >= 0; // True when the sign bit of inp is clear.
+ Bic _208 = { 1u - uint(is_push), uint(is_push) }; // {0, 1} when is_push, {1, 0} otherwise.
+ Bic bic = _208;
+ sh_bic[gl_LocalInvocationID.x] = bic;
+ for (uint i = 0u; i < 8u; i++) // Eight rounds with strides 1, 2, 4, ..., 128.
+ {
+ GroupMemoryBarrierWithGroupSync(); // Previous round's sh_bic writes are visible before anyone reads.
+ if ((th + (1u << i)) < 256u)
+ {
+ Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; // Value currently held by the thread 2^i slots above.
+ Bic param = bic;
+ Bic param_1 = other;
+ bic = bic_combine(param, param_1); // This thread's value is the left operand.
+ }
+ GroupMemoryBarrierWithGroupSync(); // All reads of this round finish before sh_bic is overwritten.
+ sh_bic[th] = bic;
+ }
+ if (th == 0u) // Only thread 0 writes the per-workgroup result, indexed by workgroup id.
+ {
+ uint param_2 = gl_WorkGroupID.x;
+ Bic param_3 = bic;
+ store_bic(param_2, param_3);
+ }
+ GroupMemoryBarrierWithGroupSync();
+ uint size = sh_bic[0].b; // `b` of thread 0's final value; bounds the output section below.
+ bic = _268; // Zero Bic, kept as-is by thread 255, which has no sh_bic[th + 1u] to read.
+ if ((th + 1u) < 256u)
+ {
+ bic = sh_bic[th + 1u]; // Final value of the next thread.
+ }
+ bool _284; // Expanded short-circuit form of: is_push && (bic.a == 0u).
+ if (is_push)
+ {
+ _284 = bic.a == 0u;
+ }
+ else
+ {
+ _284 = is_push;
+ }
+ if (_284)
+ {
+ uint local_ix = (size - bic.b) - 1u; // Output slot derived from the group size and the next thread's `b`.
+ sh_parent[local_ix] = th;
+ sh_path_ix[local_ix] = inp;
+ }
+ GroupMemoryBarrierWithGroupSync(); // sh_parent / sh_path_ix writes are visible before the reads below.
+ float4 bbox; // Assigned only when th < size and read only under the same condition.
+ if (th < size)
+ {
+ uint path_ix = sh_path_ix[th];
+ uint param_4 = path_ix;
+ bbox = load_path_bbox(param_4);
+ }
+ if (th < size)
+ {
+ uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); // Local thread index plus workgroup base (256 threads per group).
+ ClipEl _332 = { parent_ix, bbox };
+ ClipEl el = _332;
+ uint param_5 = gl_GlobalInvocationID.x; // The record is stored at this thread's global index.
+ ClipEl param_6 = el;
+ store_clip_el(param_5, param_6);
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the builtin IDs into file-scope statics, then runs comp_main().
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/clip_reduce.msl b/piet-gpu/shader/gen/clip_reduce.msl
new file mode 100644
index 0000000..c85a51d
--- /dev/null
+++ b/piet-gpu/shader/gen/clip_reduce.msl
@@ -0,0 +1,178 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Bic // Pair of counters that bic_combine() merges.
+{
+ uint a; // main0 seeds this with 1u - uint(is_push).
+ uint b; // main0 seeds this with uint(is_push).
+};
+
+struct ClipEl // Record that store_clip_el() writes as five consecutive words.
+{
+ uint parent_ix; // Written to memory[base].
+ float4 bbox; // Written bit-for-bit (as_type<uint>) to memory[base + 1 .. base + 4].
+};
+
+struct Alloc // Single-word wrapper; the type of the *_alloc members of Config.
+{
+ uint offset; // Callers shift this right by 2 before indexing Memory::memory, i.e. they use it as a byte offset.
+};
+
+struct Config // Configuration block reached through ConfigBuf::conf; this file reads only the four members annotated below.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc; // Read by load_path_bbox().
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc; // Read by main0 to locate the per-thread input word.
+ Alloc clip_bic_alloc; // Read by store_bic().
+ Alloc clip_stack_alloc; // Read by store_clip_el().
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Layout of the const device buffer bound at [[buffer(1)]] in main0.
+{
+ Config conf;
+};
+
+struct Memory // Layout of the device buffer bound at [[buffer(0)]] in main0.
+{
+ uint mem_offset; // Not accessed in this file.
+ uint mem_error; // Not accessed in this file.
+ uint blend_offset; // Not accessed in this file.
+ uint memory[1]; // Declared with length 1 but indexed at computed word offsets by the functions below.
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // Not referenced elsewhere in this file; 256 matches the threadgroup array sizes in main0.
+
+static inline __attribute__((always_inline))
+Bic bic_combine(thread const Bic& x, thread const Bic& y) // Returns {x.a + y.a - m, x.b + y.b - m} with m = min(x.b, y.a).
+{
+ uint m = min(x.b, y.a); // Amount removed from both summed fields.
+ return Bic{ (x.a + y.a) - m, (x.b + y.b) - m };
+}
+
+static inline __attribute__((always_inline))
+void store_bic(thread const uint& ix, thread const Bic& bic, const device ConfigBuf& v_64, device Memory& v_80) // Writes bic.a and bic.b to two consecutive words of v_80.memory for entry ix.
+{
+ uint base = (v_64.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); // Byte offset / 4 gives a word index; 2 words per entry.
+ v_80.memory[base] = bic.a;
+ v_80.memory[base + 1u] = bic.b;
+}
+
+static inline __attribute__((always_inline))
+float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_64, device Memory& v_80) // Reads four consecutive words for path_ix and returns them as floats, each reduced by 32768.
+{
+ uint base = (v_64.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); // 6 words per path; only the first 4 are read here.
+ float bbox_l = float(v_80.memory[base]) - 32768.0; // Numeric uint-to-float conversion (not a bit cast), then the 32768 bias is removed.
+ float bbox_t = float(v_80.memory[base + 1u]) - 32768.0;
+ float bbox_r = float(v_80.memory[base + 2u]) - 32768.0;
+ float bbox_b = float(v_80.memory[base + 3u]) - 32768.0;
+ float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+ return bbox;
+}
+
+static inline __attribute__((always_inline))
+void store_clip_el(thread const uint& ix, thread const ClipEl& el, const device ConfigBuf& v_64, device Memory& v_80) // Writes el as five consecutive words of v_80.memory for entry ix.
+{
+ uint base = (v_64.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); // Byte offset / 4 gives a word index; 5 words per entry.
+ v_80.memory[base] = el.parent_ix;
+ v_80.memory[base + 1u] = as_type<uint>(el.bbox.x); // as_type keeps the float bit pattern unchanged.
+ v_80.memory[base + 2u] = as_type<uint>(el.bbox.y);
+ v_80.memory[base + 3u] = as_type<uint>(el.bbox.z);
+ v_80.memory[base + 4u] = as_type<uint>(el.bbox.w);
+}
+
+kernel void main0(device Memory& v_80 [[buffer(0)]], const device ConfigBuf& v_64 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) // Kernel entry: combines the threadgroup's Bic values, stores the group result, then writes one ClipEl for each thread index below `size`.
+{
+ threadgroup Bic sh_bic[256]; // One slot per thread for the combine loop.
+ threadgroup uint sh_parent[256];
+ threadgroup uint sh_path_ix[256];
+ threadgroup float4 sh_bbox[256]; // Declared but never read or written in this kernel.
+ uint th = gl_LocalInvocationID.x;
+ uint inp = v_80.memory[(v_64.conf.clip_alloc.offset >> uint(2)) + gl_GlobalInvocationID.x]; // Input word for this thread, indexed by global id.
+ bool is_push = int(inp) >= 0; // True when the sign bit of inp is clear.
+ Bic bic = Bic{ 1u - uint(is_push), uint(is_push) }; // {0, 1} when is_push, {1, 0} otherwise.
+ sh_bic[gl_LocalInvocationID.x] = bic;
+ for (uint i = 0u; i < 8u; i++) // Eight rounds with strides 1, 2, 4, ..., 128.
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Previous round's sh_bic writes are visible before anyone reads.
+ if ((th + (1u << i)) < 256u)
+ {
+ Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; // Value currently held by the thread 2^i slots above.
+ Bic param = bic;
+ Bic param_1 = other;
+ bic = bic_combine(param, param_1); // This thread's value is the left operand.
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // All reads of this round finish before sh_bic is overwritten.
+ sh_bic[th] = bic;
+ }
+ if (th == 0u) // Only thread 0 writes the per-threadgroup result, indexed by threadgroup id.
+ {
+ uint param_2 = gl_WorkGroupID.x;
+ Bic param_3 = bic;
+ store_bic(param_2, param_3, v_64, v_80);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint size = sh_bic[0].b; // `b` of thread 0's final value; bounds the output section below.
+ bic = Bic{ 0u, 0u }; // Kept as-is by thread 255, which has no sh_bic[th + 1u] to read.
+ if ((th + 1u) < 256u)
+ {
+ bic = sh_bic[th + 1u]; // Final value of the next thread.
+ }
+ bool _284; // Expanded short-circuit form of: is_push && (bic.a == 0u).
+ if (is_push)
+ {
+ _284 = bic.a == 0u;
+ }
+ else
+ {
+ _284 = is_push;
+ }
+ if (_284)
+ {
+ uint local_ix = (size - bic.b) - 1u; // Output slot derived from the group size and the next thread's `b`.
+ sh_parent[local_ix] = th;
+ sh_path_ix[local_ix] = inp;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // sh_parent / sh_path_ix writes are visible before the reads below.
+ float4 bbox; // Assigned only when th < size and read only under the same condition.
+ if (th < size)
+ {
+ uint path_ix = sh_path_ix[th];
+ uint param_4 = path_ix;
+ bbox = load_path_bbox(param_4, v_64, v_80);
+ }
+ if (th < size)
+ {
+ uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); // Local thread index plus threadgroup base (256 threads per group).
+ ClipEl el = ClipEl{ parent_ix, bbox };
+ uint param_5 = gl_GlobalInvocationID.x; // The record is stored at this thread's global index.
+ ClipEl param_6 = el;
+ store_clip_el(param_5, param_6, v_64, v_80);
+ }
+}
+
diff --git a/piet-gpu/shader/gen/clip_reduce.spv b/piet-gpu/shader/gen/clip_reduce.spv
new file mode 100644
index 0000000..56ddcef
--- /dev/null
+++ b/piet-gpu/shader/gen/clip_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
new file mode 100644
index 0000000..e2a8406
--- /dev/null
+++ b/piet-gpu/shader/gen/coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
new file mode 100644
index 0000000..83a0df7
--- /dev/null
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -0,0 +1,1245 @@
+struct Alloc // Single-word handle passed to read_mem()/write_mem(); built by new_alloc() and slice_mem().
+{
+ uint offset;
+};
+
+struct BinInstanceRef // Reference to a BinInstance; BinInstance_read() converts it to a word index with offset >> 2.
+{
+ uint offset; // BinInstance_index() advances this by 4 per element.
+};
+
+struct BinInstance // One-word record decoded by BinInstance_read().
+{
+ uint element_ix; // The raw word that was read.
+};
+
+struct PathRef // Reference to a Path; Path_read() converts it to a word index with offset >> 2.
+{
+ uint offset;
+};
+
+struct TileRef // Reference to a Tile; Tile_read() converts it to a word index with offset >> 2.
+{
+ uint offset;
+};
+
+struct Path // Record decoded from three words by Path_read().
+{
+ uint4 bbox; // Low and high 16-bit halves of word 0 (x, y) and of word 1 (z, w).
+ TileRef tiles; // Word 2, used unmodified as the TileRef offset.
+};
+
+struct TileSegRef // Type of Tile::tile; Tile_read() fills offset from the first word it reads.
+{
+ uint offset; // write_fill() treats 0 as "no segments" on its fill path (emits a Solid command).
+};
+
+struct Tile // Record decoded from two words by Tile_read().
+{
+ TileSegRef tile; // Word 0.
+ int backdrop; // Word 1, reinterpreted as a signed int.
+};
+
+struct CmdStrokeRef // Destination reference for CmdStroke_write(), which uses offset >> 2 as a word index.
+{
+ uint offset;
+};
+
+struct CmdStroke // Payload of the command that Cmd_Stroke_write() emits with tag 2.
+{
+ uint tile_ref; // Written to payload word 0.
+ float half_width; // Written to payload word 1 as its bit pattern; write_fill() sets it to 0.5f * linewidth.
+};
+
+struct CmdFillRef // Destination reference for CmdFill_write(), which uses offset >> 2 as a word index.
+{
+ uint offset;
+};
+
+struct CmdFill // Payload of the command that Cmd_Fill_write() emits with tag 1.
+{
+ uint tile_ref; // Written to payload word 0.
+ int backdrop; // Written to payload word 1 after conversion to uint.
+};
+
+struct CmdColorRef // Destination reference for CmdColor_write(), which uses offset >> 2 as a word index.
+{
+ uint offset;
+};
+
+struct CmdColor // Payload of the command that Cmd_Color_write() emits with tag 5.
+{
+ uint rgba_color; // Written to payload word 0.
+};
+
+struct CmdLinGradRef // Destination reference for CmdLinGrad_write(), which uses offset >> 2 as a word index.
+{
+ uint offset;
+};
+
+struct CmdLinGrad // Payload of the command that Cmd_LinGrad_write() emits with tag 6; four words.
+{
+ uint index; // Payload word 0.
+ float line_x; // Payload word 1 (bit pattern).
+ float line_y; // Payload word 2 (bit pattern).
+ float line_c; // Payload word 3 (bit pattern).
+};
+
+struct CmdRadGradRef // Destination reference for CmdRadGrad_write(), which uses offset >> 2 as a word index.
+{
+ uint offset;
+};
+
+struct CmdRadGrad // Payload of the command that Cmd_RadGrad_write() emits with tag 7; eleven words.
+{
+ uint index; // Payload word 0.
+ float4 mat; // Payload words 1..4 (x, y, z, w bit patterns).
+ float2 xlat; // Payload words 5..6.
+ float2 c1; // Payload words 7..8.
+ float ra; // Payload word 9.
+ float roff; // Payload word 10.
+};
+
+struct CmdImageRef // Destination reference for CmdImage_write(), which uses offset >> 2 as a word index.
+{
+ uint offset;
+};
+
+struct CmdImage // Payload of the command that Cmd_Image_write() emits with tag 8.
+{
+ uint index; // Payload word 0.
+ int2 offset; // Packed into payload word 1: x in the low 16 bits, y in the high 16 bits.
+};
+
+struct CmdEndClipRef // Destination reference for CmdEndClip_write(), which uses offset >> 2 as a word index.
+{
+ uint offset;
+};
+
+struct CmdEndClip // Payload of the command that Cmd_EndClip_write() emits with tag 10.
+{
+ uint blend; // Payload word 0.
+};
+
+struct CmdJumpRef // Destination reference for CmdJump_write(), which uses offset >> 2 as a word index.
+{
+ uint offset;
+};
+
+struct CmdJump // Payload of the command that Cmd_Jump_write() emits with tag 11.
+{
+ uint new_ref; // Payload word 0; alloc_cmd() sets it to the offset of the newly allocated chunk.
+};
+
+struct CmdRef // Write cursor for commands: Cmd_*_write() put the tag at offset and any payload at offset + 4.
+{
+ uint offset; // Callers shift right by 2 to get a word index, so this is a byte offset.
+};
+
+struct Config // Not referenced by name in the visible code, which reads raw _891.Load(byte) instead; byte offsets below assume _891 follows this layout (one word per member).
+{
+ uint mem_size; // Byte 0; passed to malloc_stage() as mem_size by alloc_cmd().
+ uint n_elements; // Byte 4; comp_main derives n_partitions from it.
+ uint n_pathseg;
+ uint width_in_tiles; // Byte 12; comp_main derives width_in_bins and this_tile_ix from it.
+ uint height_in_tiles;
+ Alloc tile_alloc; // Byte 20; base for PathRef in comp_main.
+ Alloc bin_alloc; // Byte 24; base for the per-partition bin headers read in comp_main.
+ Alloc ptcl_alloc; // Byte 28; base of the per-tile command chunks in comp_main.
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc; // Byte 44; loaded into drawmonoid_start.
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc; // Byte 68; loaded into drawinfo_start.
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset; // Byte 100; loaded into drawtag_start.
+ uint drawdata_offset; // Byte 104; loaded into drawdata_start.
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // 256 matches the length of the groupshared arrays below.
+
+RWByteAddressBuffer _267 : register(u0, space0); // Read-write buffer: atomics at bytes 0 and 4, word data at byte 12 onward (see read_mem/write_mem).
+ByteAddressBuffer _891 : register(t1, space0); // Read-only; loaded at fixed byte offsets whose values land in names such as drawtag_start (config data).
+ByteAddressBuffer _1390 : register(t2, space0); // Read-only; comp_main indexes it with drawtag_start / drawdata_start based word indices.
+
+static uint3 gl_WorkGroupID; // NOTE(review): presumably filled from SPIRV_Cross_Input by the entry point, which is outside this view.
+static uint3 gl_LocalInvocationID; // NOTE(review): same assumption as gl_WorkGroupID.
+struct SPIRV_Cross_Input // Stage-input struct carrying the two builtin IDs this shader uses.
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+};
+
+static bool mem_ok; // Set true at the start of comp_main; alloc_cmd() clears it when malloc_stage() returns 0. Gates the command writes.
+groupshared uint sh_bitmaps[8][256]; // 8 words x 32 bits per thread slot; bit (el_ix & 31) of word el_ix / 32 is set via InterlockedOr in comp_main.
+groupshared Alloc sh_part_elements[256]; // Per-partition Alloc built from the offset word read in comp_main.
+groupshared uint sh_part_count[256]; // Running (prefix-summed) element counts per partition.
+groupshared uint sh_elements[256]; // element_ix values read through BinInstance_read().
+groupshared uint sh_tile_stride[256]; // path.bbox.z - path.bbox.x per element.
+groupshared uint sh_tile_width[256]; // Clamped x1 - x0 per element.
+groupshared uint sh_tile_x0[256]; // Clamped x0 per element.
+groupshared uint sh_tile_y0[256]; // Clamped y0 per element.
+groupshared uint sh_tile_base[256]; // Tile base offset per element (path.tiles.offset adjusted by dx/dy).
+groupshared uint sh_tile_count[256]; // Per-element tile counts, prefix-summed in place by comp_main.
+
+bool check_deps(uint dep_stage) // Returns true when none of the bits in dep_stage are set in the word at byte 4 of _267.
+{
+ uint _273;
+ _267.InterlockedOr(4, 0u, _273); // OR with 0 leaves the word unchanged; used only to fetch its current value atomically.
+ return (_273 & dep_stage) == 0u;
+}
+
+Alloc slice_mem(Alloc a, uint offset, uint size) // Returns an Alloc starting `offset` past a.offset; `size` is unused in this build.
+{
+ Alloc _331 = { a.offset + offset };
+ return _331;
+}
+
+bool touch_mem(Alloc alloc, uint offset) // Always returns true; both parameters are ignored, so no bounds check happens here.
+{
+ return true;
+}
+
+uint read_mem(Alloc alloc, uint offset) // Returns word `offset` of _267's data area; `alloc` is only forwarded to touch_mem().
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // touch_mem() always returns true here, so this early return never fires.
+ {
+ return 0u;
+ }
+ uint v = _267.Load(offset * 4 + 12); // Word index to byte address, plus the 12 bytes that precede the data area.
+ return v;
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok_1) // Builds an Alloc from `offset`; `size` and `mem_ok_1` are unused in this build.
+{
+ Alloc a;
+ a.offset = offset;
+ return a;
+}
+
+BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) // Returns the reference `index` elements after ref (4 bytes per BinInstance).
+{
+ BinInstanceRef _340 = { ref.offset + (index * 4u) };
+ return _340;
+}
+
+BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) // Reads the single word at ref and returns it as BinInstance::element_ix.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ BinInstance s;
+ s.element_ix = raw0;
+ return s;
+}
+
+Path Path_read(Alloc a, PathRef ref) // Reads three consecutive words at ref and unpacks them into a Path.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5);
+ Path s;
+ s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); // Each word holds two 16-bit values: low half first, high half second.
+ TileRef _404 = { raw2 };
+ s.tiles = _404;
+ return s;
+}
+
+void write_tile_alloc(uint el_ix, Alloc a) // Empty body: a no-op in this build; both parameters are ignored.
+{
+}
+
+Alloc read_tile_alloc(uint el_ix, bool mem_ok_1) // Returns an Alloc with offset 0 regardless of el_ix.
+{
+ uint param = 0u;
+ uint param_1 = _891.Load(0); // Passed as the size argument, which new_alloc() ignores.
+ bool param_2 = mem_ok_1;
+ return new_alloc(param, param_1, param_2);
+}
+
+Tile Tile_read(Alloc a, TileRef ref) // Reads two consecutive words at ref and returns them as a Tile.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ TileSegRef _429 = { raw0 };
+ Tile s;
+ s.tile = _429;
+ s.backdrop = int(raw1); // Word 1 reinterpreted as signed.
+ return s;
+}
+
+uint malloc_stage(uint size, uint mem_size, uint stage) // Atomically reserves `size` by bumping the word at byte 0 of _267; returns the old value, or 0 after flagging `stage` when the reservation exceeds mem_size.
+{
+ uint _282;
+ _267.InterlockedAdd(0, size, _282); // _282 receives the value before the add.
+ uint offset = _282;
+ if ((offset + size) > mem_size)
+ {
+ uint _292;
+ _267.InterlockedOr(4, stage, _292); // Records the failing stage bits in the word at byte 4 (the word check_deps() inspects).
+ offset = 0u; // 0 is the failure value; alloc_cmd() tests for it.
+ }
+ return offset;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val) // Stores val at word `offset` of _267's data area; `alloc` is only forwarded to touch_mem().
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // touch_mem() always returns true here, so this early return never fires.
+ {
+ return;
+ }
+ _267.Store(offset * 4 + 12, val); // Word index to byte address, plus the 12 bytes that precede the data area.
+}
+
+void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) // Writes the one-word CmdJump payload (new_ref) at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.new_ref;
+ write_mem(param, param_1, param_2);
+}
+
+void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) // Writes tag 11 at ref, then the CmdJump payload 4 bytes later.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 11u; // Command tag for Jump.
+ write_mem(param, param_1, param_2);
+ CmdJumpRef _880 = { ref.offset + 4u }; // Payload follows the tag word.
+ Alloc param_3 = a;
+ CmdJumpRef param_4 = _880;
+ CmdJump param_5 = s;
+ CmdJump_write(param_3, param_4, param_5);
+}
+
+void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) // When cmd_ref has reached cmd_limit, allocates a new 1024-byte chunk, links it with a Jump command, and retargets the three inout cursors to it.
+{
+ if (cmd_ref.offset < cmd_limit) // Still room in the current chunk: nothing to do.
+ {
+ return;
+ }
+ uint param = 1024u; // Chunk size requested from malloc_stage().
+ uint param_1 = _891.Load(0); // Memory limit from the config buffer (first word).
+ uint param_2 = 8u; // Stage bit recorded by malloc_stage() on failure.
+ uint _915 = malloc_stage(param, param_1, param_2);
+ uint new_cmd = _915;
+ if (new_cmd == 0u) // malloc_stage() returns 0 on failure.
+ {
+ mem_ok = false;
+ }
+ if (mem_ok) // The Jump is written into the old chunk only while allocation is healthy.
+ {
+ CmdJump _926 = { new_cmd };
+ CmdJump jump = _926;
+ Alloc param_3 = cmd_alloc;
+ CmdRef param_4 = cmd_ref;
+ CmdJump param_5 = jump;
+ Cmd_Jump_write(param_3, param_4, param_5);
+ }
+ uint param_6 = new_cmd;
+ uint param_7 = 1024u;
+ bool param_8 = true;
+ cmd_alloc = new_alloc(param_6, param_7, param_8); // Cursors are retargeted even on failure; callers gate their writes on mem_ok.
+ CmdRef _940 = { new_cmd };
+ cmd_ref = _940;
+ cmd_limit = (new_cmd + 1024u) - 144u; // Same 1024 - 144 headroom rule that comp_main applies to the initial chunk.
+}
+
+void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) // Writes the two-word CmdFill payload (tile_ref, backdrop) at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.tile_ref;
+ write_mem(param, param_1, param_2);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = uint(s.backdrop); // Signed backdrop stored as its uint conversion.
+ write_mem(param_3, param_4, param_5);
+}
+
+void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) // Writes tag 1 at ref, then the CmdFill payload 4 bytes later.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 1u; // Command tag for Fill.
+ write_mem(param, param_1, param_2);
+ CmdFillRef _737 = { ref.offset + 4u }; // Payload follows the tag word.
+ Alloc param_3 = a;
+ CmdFillRef param_4 = _737;
+ CmdFill param_5 = s;
+ CmdFill_write(param_3, param_4, param_5);
+}
+
+void Cmd_Solid_write(Alloc a, CmdRef ref) // Writes tag 3 at ref; this command has no payload.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 3u; // Command tag for Solid.
+ write_mem(param, param_1, param_2);
+}
+
+void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) // Writes the two-word CmdStroke payload (tile_ref, half_width) at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.tile_ref;
+ write_mem(param, param_1, param_2);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = asuint(s.half_width); // Float stored as its raw bit pattern.
+ write_mem(param_3, param_4, param_5);
+}
+
+void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) // Writes tag 2 at ref, then the CmdStroke payload 4 bytes later.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 2u; // Command tag for Stroke.
+ write_mem(param, param_1, param_2);
+ CmdStrokeRef _755 = { ref.offset + 4u }; // Payload follows the tag word.
+ Alloc param_3 = a;
+ CmdStrokeRef param_4 = _755;
+ CmdStroke param_5 = s;
+ CmdStroke_write(param_3, param_4, param_5);
+}
+
+void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) // Emits a Fill, Solid or Stroke command for `tile` and advances cmd_ref past it; negative linewidth selects the fill path.
+{
+ if (linewidth < 0.0f) // Negative linewidth: fill rather than stroke.
+ {
+ if (tile.tile.offset != 0u) // Non-zero segment reference: emit Fill with the tile's backdrop.
+ {
+ CmdFill _960 = { tile.tile.offset, tile.backdrop };
+ CmdFill cmd_fill = _960;
+ if (mem_ok)
+ {
+ Alloc param = alloc;
+ CmdRef param_1 = cmd_ref;
+ CmdFill param_2 = cmd_fill;
+ Cmd_Fill_write(param, param_1, param_2);
+ }
+ cmd_ref.offset += 12u; // Tag word + 2 payload words; advanced even when the write was skipped.
+ }
+ else
+ {
+ if (mem_ok)
+ {
+ Alloc param_3 = alloc;
+ CmdRef param_4 = cmd_ref;
+ Cmd_Solid_write(param_3, param_4);
+ }
+ cmd_ref.offset += 4u; // Tag word only.
+ }
+ }
+ else
+ {
+ CmdStroke _996 = { tile.tile.offset, 0.5f * linewidth }; // Stroke stores half of the line width.
+ CmdStroke cmd_stroke = _996;
+ if (mem_ok)
+ {
+ Alloc param_5 = alloc;
+ CmdRef param_6 = cmd_ref;
+ CmdStroke param_7 = cmd_stroke;
+ Cmd_Stroke_write(param_5, param_6, param_7);
+ }
+ cmd_ref.offset += 12u; // Tag word + 2 payload words.
+ }
+}
+
+void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s) // Writes the one-word CmdColor payload (rgba_color) at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.rgba_color;
+ write_mem(param, param_1, param_2);
+}
+
+void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) // Writes tag 5 at ref, then the CmdColor payload 4 bytes later.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 5u; // Command tag for Color.
+ write_mem(param, param_1, param_2);
+ CmdColorRef _781 = { ref.offset + 4u }; // Payload follows the tag word.
+ Alloc param_3 = a;
+ CmdColorRef param_4 = _781;
+ CmdColor param_5 = s;
+ CmdColor_write(param_3, param_4, param_5);
+}
+
+void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s) // Writes the four-word CmdLinGrad payload (index, line_x, line_y, line_c) at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.index;
+ write_mem(param, param_1, param_2);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = asuint(s.line_x); // Floats are stored as raw bit patterns.
+ write_mem(param_3, param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = asuint(s.line_y);
+ write_mem(param_6, param_7, param_8);
+ Alloc param_9 = a;
+ uint param_10 = ix + 3u;
+ uint param_11 = asuint(s.line_c);
+ write_mem(param_9, param_10, param_11);
+}
+
+void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) // Writes tag 6 at ref, then the CmdLinGrad payload 4 bytes later.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 6u; // Command tag for LinGrad.
+ write_mem(param, param_1, param_2);
+ CmdLinGradRef _799 = { ref.offset + 4u }; // Payload follows the tag word.
+ Alloc param_3 = a;
+ CmdLinGradRef param_4 = _799;
+ CmdLinGrad param_5 = s;
+ CmdLinGrad_write(param_3, param_4, param_5);
+}
+
+void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s) // Writes the eleven-word CmdRadGrad payload at ref, in member declaration order.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.index; // Word 0: index.
+ write_mem(param, param_1, param_2);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = asuint(s.mat.x); // Words 1..4: mat, stored as raw float bits.
+ write_mem(param_3, param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = asuint(s.mat.y);
+ write_mem(param_6, param_7, param_8);
+ Alloc param_9 = a;
+ uint param_10 = ix + 3u;
+ uint param_11 = asuint(s.mat.z);
+ write_mem(param_9, param_10, param_11);
+ Alloc param_12 = a;
+ uint param_13 = ix + 4u;
+ uint param_14 = asuint(s.mat.w);
+ write_mem(param_12, param_13, param_14);
+ Alloc param_15 = a;
+ uint param_16 = ix + 5u;
+ uint param_17 = asuint(s.xlat.x); // Words 5..6: xlat.
+ write_mem(param_15, param_16, param_17);
+ Alloc param_18 = a;
+ uint param_19 = ix + 6u;
+ uint param_20 = asuint(s.xlat.y);
+ write_mem(param_18, param_19, param_20);
+ Alloc param_21 = a;
+ uint param_22 = ix + 7u;
+ uint param_23 = asuint(s.c1.x); // Words 7..8: c1.
+ write_mem(param_21, param_22, param_23);
+ Alloc param_24 = a;
+ uint param_25 = ix + 8u;
+ uint param_26 = asuint(s.c1.y);
+ write_mem(param_24, param_25, param_26);
+ Alloc param_27 = a;
+ uint param_28 = ix + 9u;
+ uint param_29 = asuint(s.ra); // Word 9: ra.
+ write_mem(param_27, param_28, param_29);
+ Alloc param_30 = a;
+ uint param_31 = ix + 10u;
+ uint param_32 = asuint(s.roff); // Word 10: roff.
+ write_mem(param_30, param_31, param_32);
+}
+
+void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) // Writes tag 7 at ref, then the CmdRadGrad payload 4 bytes later.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 7u; // Command tag for RadGrad.
+ write_mem(param, param_1, param_2);
+ CmdRadGradRef _817 = { ref.offset + 4u }; // Payload follows the tag word.
+ Alloc param_3 = a;
+ CmdRadGradRef param_4 = _817;
+ CmdRadGrad param_5 = s;
+ CmdRadGrad_write(param_3, param_4, param_5);
+}
+
+void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s) // Writes the two-word CmdImage payload (index, packed offset) at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.index;
+ write_mem(param, param_1, param_2);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); // offset.x truncated to the low 16 bits, offset.y in the high 16 bits.
+ write_mem(param_3, param_4, param_5);
+}
+
+void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) // Writes tag 8 at ref, then the CmdImage payload 4 bytes later.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 8u; // Command tag for Image.
+ write_mem(param, param_1, param_2);
+ CmdImageRef _835 = { ref.offset + 4u }; // Payload follows the tag word.
+ Alloc param_3 = a;
+ CmdImageRef param_4 = _835;
+ CmdImage param_5 = s;
+ CmdImage_write(param_3, param_4, param_5);
+}
+
+void Cmd_BeginClip_write(Alloc a, CmdRef ref) // Writes tag 9 at ref; this command has no payload.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 9u; // Command tag for BeginClip.
+ write_mem(param, param_1, param_2);
+}
+
+void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) // Writes the one-word CmdEndClip payload (blend) at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.blend;
+ write_mem(param, param_1, param_2);
+}
+
+void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) // Writes tag 10 at ref, then the CmdEndClip payload 4 bytes later.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 10u; // Command tag for EndClip.
+ write_mem(param, param_1, param_2);
+ CmdEndClipRef _861 = { ref.offset + 4u }; // Payload follows the tag word.
+ Alloc param_3 = a;
+ CmdEndClipRef param_4 = _861;
+ CmdEndClip param_5 = s;
+ CmdEndClip_write(param_3, param_4, param_5);
+}
+
+void Cmd_End_write(Alloc a, CmdRef ref) // Writes tag 0 at ref; this command has no payload.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 0u; // Command tag for End.
+ write_mem(param, param_1, param_2);
+}
+
+void comp_main()
+{
+ mem_ok = true;
+ uint param = 7u;
+ bool _1012 = check_deps(param);
+ if (!_1012)
+ {
+ return;
+ }
+ uint width_in_bins = ((_891.Load(12) + 16u) - 1u) / 16u;
+ uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
+ uint partition_ix = 0u;
+ uint n_partitions = ((_891.Load(4) + 256u) - 1u) / 256u;
+ uint th_ix = gl_LocalInvocationID.x;
+ uint bin_tile_x = 16u * gl_WorkGroupID.x;
+ uint bin_tile_y = 16u * gl_WorkGroupID.y;
+ uint tile_x = gl_LocalInvocationID.x % 16u;
+ uint tile_y = gl_LocalInvocationID.x / 16u;
+ uint this_tile_ix = (((bin_tile_y + tile_y) * _891.Load(12)) + bin_tile_x) + tile_x;
+ Alloc _1082;
+ _1082.offset = _891.Load(28);
+ Alloc param_1;
+ param_1.offset = _1082.offset;
+ uint param_2 = this_tile_ix * 1024u;
+ uint param_3 = 1024u;
+ Alloc cmd_alloc = slice_mem(param_1, param_2, param_3);
+ CmdRef _1091 = { cmd_alloc.offset };
+ CmdRef cmd_ref = _1091;
+ uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
+ uint clip_depth = 0u;
+ uint clip_zero_depth = 0u;
+ uint rd_ix = 0u;
+ uint wr_ix = 0u;
+ uint part_start_ix = 0u;
+ uint ready_ix = 0u;
+ Alloc param_4 = cmd_alloc;
+ uint param_5 = 0u;
+ uint param_6 = 8u;
+ Alloc scratch_alloc = slice_mem(param_4, param_5, param_6);
+ cmd_ref.offset += 4u;
+ uint render_blend_depth = 0u;
+ uint max_blend_depth = 0u;
+ uint drawmonoid_start = _891.Load(44) >> uint(2);
+ uint drawtag_start = _891.Load(100) >> uint(2);
+ uint drawdata_start = _891.Load(104) >> uint(2);
+ uint drawinfo_start = _891.Load(68) >> uint(2);
+ Alloc param_7;
+ Alloc param_9;
+ uint _1322;
+ uint element_ix;
+ Alloc param_18;
+ uint tile_count;
+ uint _1622;
+ float linewidth;
+ CmdLinGrad cmd_lin;
+ CmdRadGrad cmd_rad;
+ while (true)
+ {
+ for (uint i = 0u; i < 8u; i++)
+ {
+ sh_bitmaps[i][th_ix] = 0u;
+ }
+ bool _1374;
+ for (;;)
+ {
+ if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
+ {
+ part_start_ix = ready_ix;
+ uint count = 0u;
+ bool _1174 = th_ix < 256u;
+ bool _1182;
+ if (_1174)
+ {
+ _1182 = (partition_ix + th_ix) < n_partitions;
+ }
+ else
+ {
+ _1182 = _1174;
+ }
+ if (_1182)
+ {
+ uint in_ix = (_891.Load(24) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+ Alloc _1200;
+ _1200.offset = _891.Load(24);
+ param_7.offset = _1200.offset;
+ uint param_8 = in_ix;
+ count = read_mem(param_7, param_8);
+ Alloc _1211;
+ _1211.offset = _891.Load(24);
+ param_9.offset = _1211.offset;
+ uint param_10 = in_ix + 1u;
+ uint offset = read_mem(param_9, param_10);
+ uint param_11 = offset;
+ uint param_12 = count * 4u;
+ bool param_13 = true;
+ sh_part_elements[th_ix] = new_alloc(param_11, param_12, param_13);
+ }
+ for (uint i_1 = 0u; i_1 < 8u; i_1++)
+ {
+ if (th_ix < 256u)
+ {
+ sh_part_count[th_ix] = count;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ if (th_ix < 256u)
+ {
+ if (th_ix >= (1u << i_1))
+ {
+ count += sh_part_count[th_ix - (1u << i_1)];
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ }
+ if (th_ix < 256u)
+ {
+ sh_part_count[th_ix] = part_start_ix + count;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ ready_ix = sh_part_count[255];
+ partition_ix += 256u;
+ }
+ uint ix = rd_ix + th_ix;
+ if ((ix >= wr_ix) && (ix < ready_ix))
+ {
+ uint part_ix = 0u;
+ for (uint i_2 = 0u; i_2 < 8u; i_2++)
+ {
+ uint probe = part_ix + (128u >> i_2);
+ if (ix >= sh_part_count[probe - 1u])
+ {
+ part_ix = probe;
+ }
+ }
+ if (part_ix > 0u)
+ {
+ _1322 = sh_part_count[part_ix - 1u];
+ }
+ else
+ {
+ _1322 = part_start_ix;
+ }
+ ix -= _1322;
+ Alloc bin_alloc = sh_part_elements[part_ix];
+ BinInstanceRef _1341 = { bin_alloc.offset };
+ BinInstanceRef inst_ref = _1341;
+ BinInstanceRef param_14 = inst_ref;
+ uint param_15 = ix;
+ Alloc param_16 = bin_alloc;
+ BinInstanceRef param_17 = BinInstance_index(param_14, param_15);
+ BinInstance inst = BinInstance_read(param_16, param_17);
+ sh_elements[th_ix] = inst.element_ix;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ wr_ix = min((rd_ix + 256u), ready_ix);
+ bool _1364 = (wr_ix - rd_ix) < 256u;
+ if (_1364)
+ {
+ _1374 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+ }
+ else
+ {
+ _1374 = _1364;
+ }
+ if (_1374)
+ {
+ continue;
+ }
+ else
+ {
+ break;
+ }
+ }
+ uint tag = 0u;
+ if ((th_ix + rd_ix) < wr_ix)
+ {
+ element_ix = sh_elements[th_ix];
+ tag = _1390.Load((drawtag_start + element_ix) * 4 + 0);
+ }
+ switch (tag)
+ {
+ case 68u:
+ case 72u:
+ case 276u:
+ case 732u:
+ case 5u:
+ case 37u:
+ {
+ uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
+ uint path_ix = _267.Load(drawmonoid_base * 4 + 12);
+ PathRef _1415 = { _891.Load(20) + (path_ix * 12u) };
+ Alloc _1418;
+ _1418.offset = _891.Load(20);
+ param_18.offset = _1418.offset;
+ PathRef param_19 = _1415;
+ Path path = Path_read(param_18, param_19);
+ uint stride = path.bbox.z - path.bbox.x;
+ sh_tile_stride[th_ix] = stride;
+ int dx = int(path.bbox.x) - int(bin_tile_x);
+ int dy = int(path.bbox.y) - int(bin_tile_y);
+ int x0 = clamp(dx, 0, 16);
+ int y0 = clamp(dy, 0, 16);
+ int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16);
+ int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16);
+ sh_tile_width[th_ix] = uint(x1 - x0);
+ sh_tile_x0[th_ix] = uint(x0);
+ sh_tile_y0[th_ix] = uint(y0);
+ tile_count = uint(x1 - x0) * uint(y1 - y0);
+ uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
+ sh_tile_base[th_ix] = base;
+ uint param_20 = path.tiles.offset;
+ uint param_21 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+ bool param_22 = true;
+ Alloc path_alloc = new_alloc(param_20, param_21, param_22);
+ uint param_23 = th_ix;
+ Alloc param_24 = path_alloc;
+ write_tile_alloc(param_23, param_24);
+ break;
+ }
+ default:
+ {
+ tile_count = 0u;
+ break;
+ }
+ }
+ sh_tile_count[th_ix] = tile_count;
+ for (uint i_3 = 0u; i_3 < 8u; i_3++)
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if (th_ix >= (1u << i_3))
+ {
+ tile_count += sh_tile_count[th_ix - (1u << i_3)];
+ }
+ GroupMemoryBarrierWithGroupSync();
+ sh_tile_count[th_ix] = tile_count;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ uint total_tile_count = sh_tile_count[255];
+ for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u)
+ {
+ uint el_ix = 0u;
+ for (uint i_4 = 0u; i_4 < 8u; i_4++)
+ {
+ uint probe_1 = el_ix + (128u >> i_4);
+ if (ix_1 >= sh_tile_count[probe_1 - 1u])
+ {
+ el_ix = probe_1;
+ }
+ }
+ uint element_ix_1 = sh_elements[el_ix];
+ uint tag_1 = _1390.Load((drawtag_start + element_ix_1) * 4 + 0);
+ if (el_ix > 0u)
+ {
+ _1622 = sh_tile_count[el_ix - 1u];
+ }
+ else
+ {
+ _1622 = 0u;
+ }
+ uint seq_ix = ix_1 - _1622;
+ uint width = sh_tile_width[el_ix];
+ uint x = sh_tile_x0[el_ix] + (seq_ix % width);
+ uint y = sh_tile_y0[el_ix] + (seq_ix / width);
+ bool include_tile = false;
+ uint param_25 = el_ix;
+ bool param_26 = true;
+ TileRef _1670 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+ Alloc param_27 = read_tile_alloc(param_25, param_26);
+ TileRef param_28 = _1670;
+ Tile tile = Tile_read(param_27, param_28);
+ bool is_clip = (tag_1 & 1u) != 0u;
+ bool is_blend = false;
+ if (is_clip)
+ {
+ uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
+ uint scene_offset = _267.Load((drawmonoid_base_1 + 2u) * 4 + 12);
+ uint dd = drawdata_start + (scene_offset >> uint(2));
+ uint blend = _1390.Load(dd * 4 + 0);
+ is_blend = blend != 32771u;
+ }
+ bool _1706 = tile.tile.offset != 0u;
+ bool _1715;
+ if (!_1706)
+ {
+ _1715 = (tile.backdrop == 0) == is_clip;
+ }
+ else
+ {
+ _1715 = _1706;
+ }
+ include_tile = _1715 || is_blend;
+ if (include_tile)
+ {
+ uint el_slice = el_ix / 32u;
+ uint el_mask = 1u << (el_ix & 31u);
+ uint _1737;
+ InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1737);
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ uint slice_ix = 0u;
+ uint bitmap = sh_bitmaps[0][th_ix];
+ while (true)
+ {
+ if (bitmap == 0u)
+ {
+ slice_ix++;
+ if (slice_ix == 8u)
+ {
+ break;
+ }
+ bitmap = sh_bitmaps[slice_ix][th_ix];
+ if (bitmap == 0u)
+ {
+ continue;
+ }
+ }
+ uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
+ uint element_ix_2 = sh_elements[element_ref_ix];
+ bitmap &= (bitmap - 1u);
+ uint drawtag = _1390.Load((drawtag_start + element_ix_2) * 4 + 0);
+ if (clip_zero_depth == 0u)
+ {
+ uint param_29 = element_ref_ix;
+ bool param_30 = true;
+ TileRef _1812 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+ Alloc param_31 = read_tile_alloc(param_29, param_30);
+ TileRef param_32 = _1812;
+ Tile tile_1 = Tile_read(param_31, param_32);
+ uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
+ uint scene_offset_1 = _267.Load((drawmonoid_base_2 + 2u) * 4 + 12);
+ uint info_offset = _267.Load((drawmonoid_base_2 + 3u) * 4 + 12);
+ uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
+ uint di = drawinfo_start + (info_offset >> uint(2));
+ switch (drawtag)
+ {
+ case 68u:
+ {
+ linewidth = asfloat(_267.Load(di * 4 + 12));
+ Alloc param_33 = cmd_alloc;
+ CmdRef param_34 = cmd_ref;
+ uint param_35 = cmd_limit;
+ alloc_cmd(param_33, param_34, param_35);
+ cmd_alloc = param_33;
+ cmd_ref = param_34;
+ cmd_limit = param_35;
+ Alloc param_36 = cmd_alloc;
+ CmdRef param_37 = cmd_ref;
+ Tile param_38 = tile_1;
+ float param_39 = linewidth;
+ write_fill(param_36, param_37, param_38, param_39);
+ cmd_ref = param_37;
+ uint rgba = _1390.Load(dd_1 * 4 + 0);
+ if (mem_ok)
+ {
+ CmdColor _1882 = { rgba };
+ Alloc param_40 = cmd_alloc;
+ CmdRef param_41 = cmd_ref;
+ CmdColor param_42 = _1882;
+ Cmd_Color_write(param_40, param_41, param_42);
+ }
+ cmd_ref.offset += 8u;
+ break;
+ }
+ case 276u:
+ {
+ Alloc param_43 = cmd_alloc;
+ CmdRef param_44 = cmd_ref;
+ uint param_45 = cmd_limit;
+ alloc_cmd(param_43, param_44, param_45);
+ cmd_alloc = param_43;
+ cmd_ref = param_44;
+ cmd_limit = param_45;
+ linewidth = asfloat(_267.Load(di * 4 + 12));
+ Alloc param_46 = cmd_alloc;
+ CmdRef param_47 = cmd_ref;
+ Tile param_48 = tile_1;
+ float param_49 = linewidth;
+ write_fill(param_46, param_47, param_48, param_49);
+ cmd_ref = param_47;
+ cmd_lin.index = _1390.Load(dd_1 * 4 + 0);
+ cmd_lin.line_x = asfloat(_267.Load((di + 1u) * 4 + 12));
+ cmd_lin.line_y = asfloat(_267.Load((di + 2u) * 4 + 12));
+ cmd_lin.line_c = asfloat(_267.Load((di + 3u) * 4 + 12));
+ if (mem_ok)
+ {
+ Alloc param_50 = cmd_alloc;
+ CmdRef param_51 = cmd_ref;
+ CmdLinGrad param_52 = cmd_lin;
+ Cmd_LinGrad_write(param_50, param_51, param_52);
+ }
+ cmd_ref.offset += 20u;
+ break;
+ }
+ case 732u:
+ {
+ Alloc param_53 = cmd_alloc;
+ CmdRef param_54 = cmd_ref;
+ uint param_55 = cmd_limit;
+ alloc_cmd(param_53, param_54, param_55);
+ cmd_alloc = param_53;
+ cmd_ref = param_54;
+ cmd_limit = param_55;
+ linewidth = asfloat(_267.Load(di * 4 + 12));
+ Alloc param_56 = cmd_alloc;
+ CmdRef param_57 = cmd_ref;
+ Tile param_58 = tile_1;
+ float param_59 = linewidth;
+ write_fill(param_56, param_57, param_58, param_59);
+ cmd_ref = param_57;
+ cmd_rad.index = _1390.Load(dd_1 * 4 + 0);
+ cmd_rad.mat = asfloat(uint4(_267.Load((di + 1u) * 4 + 12), _267.Load((di + 2u) * 4 + 12), _267.Load((di + 3u) * 4 + 12), _267.Load((di + 4u) * 4 + 12)));
+ cmd_rad.xlat = asfloat(uint2(_267.Load((di + 5u) * 4 + 12), _267.Load((di + 6u) * 4 + 12)));
+ cmd_rad.c1 = asfloat(uint2(_267.Load((di + 7u) * 4 + 12), _267.Load((di + 8u) * 4 + 12)));
+ cmd_rad.ra = asfloat(_267.Load((di + 9u) * 4 + 12));
+ cmd_rad.roff = asfloat(_267.Load((di + 10u) * 4 + 12));
+ if (mem_ok)
+ {
+ Alloc param_60 = cmd_alloc;
+ CmdRef param_61 = cmd_ref;
+ CmdRadGrad param_62 = cmd_rad;
+ Cmd_RadGrad_write(param_60, param_61, param_62);
+ }
+ cmd_ref.offset += 48u;
+ break;
+ }
+ case 72u:
+ {
+ Alloc param_63 = cmd_alloc;
+ CmdRef param_64 = cmd_ref;
+ uint param_65 = cmd_limit;
+ alloc_cmd(param_63, param_64, param_65);
+ cmd_alloc = param_63;
+ cmd_ref = param_64;
+ cmd_limit = param_65;
+ linewidth = asfloat(_267.Load(di * 4 + 12));
+ Alloc param_66 = cmd_alloc;
+ CmdRef param_67 = cmd_ref;
+ Tile param_68 = tile_1;
+ float param_69 = linewidth;
+ write_fill(param_66, param_67, param_68, param_69);
+ cmd_ref = param_67;
+ uint index = _1390.Load(dd_1 * 4 + 0);
+ uint raw1 = _1390.Load((dd_1 + 1u) * 4 + 0);
+ int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+ if (mem_ok)
+ {
+ CmdImage _2106 = { index, offset_1 };
+ Alloc param_70 = cmd_alloc;
+ CmdRef param_71 = cmd_ref;
+ CmdImage param_72 = _2106;
+ Cmd_Image_write(param_70, param_71, param_72);
+ }
+ cmd_ref.offset += 12u;
+ break;
+ }
+ case 5u:
+ {
+ bool _2120 = tile_1.tile.offset == 0u;
+ bool _2126;
+ if (_2120)
+ {
+ _2126 = tile_1.backdrop == 0;
+ }
+ else
+ {
+ _2126 = _2120;
+ }
+ if (_2126)
+ {
+ clip_zero_depth = clip_depth + 1u;
+ }
+ else
+ {
+ Alloc param_73 = cmd_alloc;
+ CmdRef param_74 = cmd_ref;
+ uint param_75 = cmd_limit;
+ alloc_cmd(param_73, param_74, param_75);
+ cmd_alloc = param_73;
+ cmd_ref = param_74;
+ cmd_limit = param_75;
+ if (mem_ok)
+ {
+ Alloc param_76 = cmd_alloc;
+ CmdRef param_77 = cmd_ref;
+ Cmd_BeginClip_write(param_76, param_77);
+ }
+ cmd_ref.offset += 4u;
+ render_blend_depth++;
+ max_blend_depth = max(max_blend_depth, render_blend_depth);
+ }
+ clip_depth++;
+ break;
+ }
+ case 37u:
+ {
+ clip_depth--;
+ Alloc param_78 = cmd_alloc;
+ CmdRef param_79 = cmd_ref;
+ Tile param_80 = tile_1;
+ float param_81 = -1.0f;
+ write_fill(param_78, param_79, param_80, param_81);
+ cmd_ref = param_79;
+ uint blend_1 = _1390.Load(dd_1 * 4 + 0);
+ if (mem_ok)
+ {
+ CmdEndClip _2182 = { blend_1 };
+ Alloc param_82 = cmd_alloc;
+ CmdRef param_83 = cmd_ref;
+ CmdEndClip param_84 = _2182;
+ Cmd_EndClip_write(param_82, param_83, param_84);
+ }
+ cmd_ref.offset += 8u;
+ render_blend_depth--;
+ break;
+ }
+ }
+ }
+ else
+ {
+ switch (drawtag)
+ {
+ case 5u:
+ {
+ clip_depth++;
+ break;
+ }
+ case 37u:
+ {
+ if (clip_depth == clip_zero_depth)
+ {
+ clip_zero_depth = 0u;
+ }
+ clip_depth--;
+ break;
+ }
+ }
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ rd_ix += 256u;
+ if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions))
+ {
+ break;
+ }
+ }
+ bool _2231 = (bin_tile_x + tile_x) < _891.Load(12);
+ bool _2240;
+ if (_2231)
+ {
+ _2240 = (bin_tile_y + tile_y) < _891.Load(16);
+ }
+ else
+ {
+ _2240 = _2231;
+ }
+ if (_2240)
+ {
+ if (mem_ok)
+ {
+ Alloc param_85 = cmd_alloc;
+ CmdRef param_86 = cmd_ref;
+ Cmd_End_write(param_85, param_86);
+ }
+ if (max_blend_depth > 4u)
+ {
+ uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u;
+ uint _2264;
+ _267.InterlockedAdd(8, scratch_size, _2264);
+ uint scratch = _2264;
+ Alloc param_87 = scratch_alloc;
+ uint param_88 = scratch_alloc.offset >> uint(2);
+ uint param_89 = scratch;
+ write_mem(param_87, param_88, param_89);
+ }
+ }
+}
+
+[numthreads(256, 1, 1)] // thread-group shape: 256 x 1 x 1
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the stage inputs into file-scope globals, then calls comp_main()
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ comp_main(); // takes no arguments; the IDs are handed over through the globals assigned above
+}
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
new file mode 100644
index 0000000..4577470
--- /dev/null
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -0,0 +1,1260 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+// Implementation of the GLSL findLSB() function
+template<typename T>
+inline T spvFindLSB(T x) // Returns ctz(x) (count of trailing zero bits), or T(-1) when x is zero
+{
+ return select(ctz(x), T(-1), x == T(0)); // select(a, b, c) yields b where c is true, otherwise a
+}
+
+struct Alloc // Allocation handle passed to the read/write helpers; it carries a start offset only (no size field)
+{
+ uint offset; // start of the allocation
+};
+
+struct BinInstanceRef // Reference to a BinInstance record in the Memory buffer
+{
+ uint offset; // byte offset; BinInstance_read indexes memory[] with offset >> 2
+};
+
+struct BinInstance // One-word record decoded by BinInstance_read
+{
+ uint element_ix; // element index; main0 stores it in sh_elements and uses it to index the draw tags
+};
+
+struct PathRef // Reference to a Path record in the Memory buffer
+{
+ uint offset; // byte offset; Path_read reads three words starting at offset >> 2
+};
+
+struct TileRef // Reference to a Tile record in the Memory buffer
+{
+ uint offset; // byte offset; Tile_read reads two words starting at offset >> 2
+};
+
+struct Path // Decoded form of a three-word path record (see Path_read)
+{
+ uint4 bbox; // four 16-bit values unpacked from the first two words; main0 uses z - x as the tile row stride
+ TileRef tiles; // third word of the record, wrapped as a TileRef
+};
+
+struct TileSegRef // Reference held in Tile.tile
+{
+ uint offset; // write_fill treats 0 specially: it emits a solid command instead of a fill command
+};
+
+struct Tile // Decoded form of a two-word tile record (see Tile_read)
+{
+ TileSegRef tile; // first word of the record
+ int backdrop; // second word of the record, converted to a signed int
+};
+
+struct CmdStrokeRef // Location where a CmdStroke payload is written
+{
+ uint offset; // byte offset; CmdStroke_write indexes memory[] with offset >> 2
+};
+
+struct CmdStroke // Payload of a stroke command (two words)
+{
+ uint tile_ref; // write_fill sets this to tile.tile.offset
+ float half_width; // write_fill sets this to 0.5 * linewidth; stored as its raw bit pattern
+};
+
+struct CmdFillRef // Location where a CmdFill payload is written
+{
+ uint offset; // byte offset; CmdFill_write indexes memory[] with offset >> 2
+};
+
+struct CmdFill // Payload of a fill command (two words)
+{
+ uint tile_ref; // write_fill sets this to tile.tile.offset
+ int backdrop; // write_fill sets this to tile.backdrop; stored converted to uint
+};
+
+struct CmdColorRef // Location where a CmdColor payload is written
+{
+ uint offset; // byte offset; CmdColor_write indexes memory[] with offset >> 2
+};
+
+struct CmdColor // Payload of a color command (one word)
+{
+ uint rgba_color; // stored verbatim as a single 32-bit word by CmdColor_write
+};
+
+struct CmdLinGradRef // Location where a CmdLinGrad payload is written
+{
+ uint offset; // byte offset; CmdLinGrad_write indexes memory[] with offset >> 2
+};
+
+struct CmdLinGrad // Payload of a linear-gradient command (four words, written in field order)
+{
+ uint index; // word 0
+ float line_x; // word 1, stored as its raw bit pattern
+ float line_y; // word 2, stored as its raw bit pattern
+ float line_c; // word 3, stored as its raw bit pattern
+};
+
+struct CmdRadGradRef // Location where a CmdRadGrad payload is written
+{
+ uint offset; // byte offset; CmdRadGrad_write indexes memory[] with offset >> 2
+};
+
+struct CmdRadGrad // Payload of a radial-gradient command (eleven words, written in field order)
+{
+ uint index; // word 0
+ float4 mat; // words 1-4 (x, y, z, w), raw bit patterns
+ float2 xlat; // words 5-6 (x, y), raw bit patterns
+ float2 c1; // words 7-8 (x, y), raw bit patterns
+ float ra; // word 9, raw bit pattern
+ float roff; // word 10, raw bit pattern
+};
+
+struct CmdImageRef // Location where a CmdImage payload is written
+{
+ uint offset; // byte offset; CmdImage_write indexes memory[] with offset >> 2
+};
+
+struct CmdImage // Payload of an image command (two words)
+{
+ uint index; // word 0
+ int2 offset; // packed into word 1 by CmdImage_write: x in the low 16 bits, y in the high 16 bits
+};
+
+struct CmdEndClipRef // Location where a CmdEndClip payload is written
+{
+ uint offset; // byte offset; CmdEndClip_write indexes memory[] with offset >> 2
+};
+
+struct CmdEndClip // Payload of an end-clip command (one word)
+{
+ uint blend; // stored verbatim by CmdEndClip_write
+};
+
+struct CmdJumpRef // Location where a CmdJump payload is written
+{
+ uint offset; // byte offset; CmdJump_write indexes memory[] with offset >> 2
+};
+
+struct CmdJump // Payload of a jump command (one word)
+{
+ uint new_ref; // alloc_cmd sets this to the offset of the newly allocated command chunk
+};
+
+struct CmdRef // Write position inside a command list
+{
+ uint offset; // byte offset; the Cmd_*_write helpers store the tag word at memory[offset >> 2]
+};
+
+struct Memory // Layout of the read-write buffer that main0 binds at [[buffer(0)]]
+{
+ uint mem_offset; // allocation cursor: malloc_stage atomically adds the requested size and uses the old value
+ uint mem_error; // bit mask: malloc_stage ORs in a stage bit on failure, check_deps tests bits
+ uint blend_offset; // NOTE(review): presumably a cursor for blend scratch space (the HLSL build does an atomic add at byte offset 8) - confirm in main0
+ uint memory[1]; // declared with length 1 but indexed at arbitrary word offsets by read_mem / write_mem
+};
+
+struct Alloc_1 // Same single-field layout as Alloc; this is the type used for the members of Config
+{
+ uint offset; // main0 copies this into an Alloc or shifts it right by 2 to get a word index
+};
+
+struct Config // Read-only parameters; fields without a comment are not used by the code visible in this part of the file
+{
+ uint mem_size; // limit that alloc_cmd passes to malloc_stage
+ uint n_elements; // main0 derives the partition count as ceil(n_elements / 256)
+ uint n_pathseg;
+ uint width_in_tiles; // main0 uses it as the row stride of the tile grid and for ceil(width_in_tiles / 16) bins per row
+ uint height_in_tiles;
+ Alloc_1 tile_alloc; // main0 reads 12-byte Path records at tile_alloc.offset + path_ix * 12
+ Alloc_1 bin_alloc; // main0 reads two-word (count, offset) entries from here
+ Alloc_1 ptcl_alloc; // base of the per-tile command lists; main0 starts each tile at this_tile_ix * 1024
+ Alloc_1 pathseg_alloc;
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc;
+ Alloc_1 drawmonoid_alloc; // main0 indexes it with a stride of four words per element
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc;
+ Alloc_1 draw_bbox_alloc;
+ Alloc_1 drawinfo_alloc; // main0 shifts the offset right by 2 to get drawinfo_start
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset; // byte offset into the scene buffer; main0 reads scene[(drawtag_offset >> 2) + element_ix]
+ uint drawdata_offset; // byte offset into the scene buffer; main0 shifts it right by 2 to get drawdata_start
+};
+
+struct ConfigBuf // Buffer wrapper; main0 binds it as a const device reference at [[buffer(1)]]
+{
+ Config conf; // the only member
+};
+
+struct SceneBuf // Buffer wrapper; main0 binds it as a const device reference at [[buffer(2)]]
+{
+ uint scene[1]; // declared with length 1 but indexed at arbitrary word offsets by main0
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // constant named after the GLSL built-in; value 256 x 1 x 1, marked maybe_unused
+
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_267) // Returns true when no bit of dep_stage is set in v_267.mem_error
+{
+ uint _273 = atomic_fetch_or_explicit((device atomic_uint*)&v_267.mem_error, 0u, memory_order_relaxed); // OR with 0: an atomic read that leaves the value unchanged
+ return (_273 & dep_stage) == 0u;
+}
+
+static inline __attribute__((always_inline))
+Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) // Returns an Alloc that starts `offset` past a.offset
+{
+ return Alloc{ a.offset + offset }; // `size` is not used: Alloc has no size field
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset) // Access check used by read_mem / write_mem; both parameters are ignored here
+{
+ return true; // every access is reported as permitted
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_267) // Returns v_267.memory[offset]; `offset` is a word index
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken in this file: touch_mem always returns true
+ {
+ return 0u;
+ }
+ uint v = v_267.memory[offset]; // `alloc` is only consulted by touch_mem; it does not rebase the index
+ return v;
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) // Builds an Alloc from `offset`; `size` and `mem_ok` are not used
+{
+ Alloc a;
+ a.offset = offset;
+ return a;
+}
+
+static inline __attribute__((always_inline))
+BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const uint& index) // Returns the reference `index` records past `ref`
+{
+ return BinInstanceRef{ ref.offset + (index * 4u) }; // each record advances the byte offset by 4
+}
+
+static inline __attribute__((always_inline))
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_267) // Reads the one-word BinInstance at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_267);
+ BinInstance s;
+ s.element_ix = raw0; // the word is used as-is
+ return s;
+}
+
+static inline __attribute__((always_inline))
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_267) // Reads and unpacks the three-word Path record at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_267); // word 0: bbox.x (low half) and bbox.y (high half)
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_267); // word 1: bbox.z (low half) and bbox.w (high half)
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5, v_267); // word 2: tiles reference
+ Path s;
+ s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); // split each word into two 16-bit values
+ s.tiles = TileRef{ raw2 };
+ return s;
+}
+
+static inline __attribute__((always_inline))
+void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a) // Empty body: a no-op here; both parameters are ignored
+{
+}
+
+static inline __attribute__((always_inline))
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, const device ConfigBuf& v_891) // Returns an Alloc at offset 0 regardless of el_ix
+{
+ uint param = 0u; // start offset of the returned Alloc
+ uint param_1 = v_891.conf.mem_size; // passed as the size, which new_alloc ignores
+ bool param_2 = mem_ok; // forwarded, but new_alloc ignores it as well
+ return new_alloc(param, param_1, param_2);
+}
+
+static inline __attribute__((always_inline))
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_267) // Reads the two-word Tile record at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_267); // word 0: segment reference
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_267); // word 1: backdrop
+ Tile s;
+ s.tile = TileSegRef{ raw0 };
+ s.backdrop = int(raw1); // converted from the unsigned word to a signed int
+ return s;
+}
+
+static inline __attribute__((always_inline))
+uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_267) // Reserves `size` from mem_offset; returns the start offset, or 0 on failure
+{
+ uint _282 = atomic_fetch_add_explicit((device atomic_uint*)&v_267.mem_offset, size, memory_order_relaxed); // advance the cursor; the old value is the start of the reservation
+ uint offset = _282;
+ if ((offset + size) > mem_size) // the reservation would end past mem_size
+ {
+ uint _292 = atomic_fetch_or_explicit((device atomic_uint*)&v_267.mem_error, stage, memory_order_relaxed); // record the failure by setting the `stage` bits; mem_offset is not rolled back
+ offset = 0u; // 0 is the failure value callers test for
+ }
+ return offset;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_267) // Stores `val` at v_267.memory[offset]; `offset` is a word index
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken in this file: touch_mem always returns true
+ {
+ return;
+ }
+ v_267.memory[offset] = val; // `alloc` is only consulted by touch_mem; it does not rebase the index
+}
+
+static inline __attribute__((always_inline))
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_267) // Writes the one-word CmdJump payload at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.new_ref;
+ write_mem(param, param_1, param_2, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_267) // Writes a jump command: tag word 11 followed by the payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 11u; // command tag for a jump
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u }; // the payload starts 4 bytes after the tag
+ CmdJump param_5 = s;
+ CmdJump_write(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, thread bool& mem_ok, device Memory& v_267, const device ConfigBuf& v_891) // Moves the command list to a new 1024-byte chunk once cmd_ref has reached cmd_limit
+{
+ if (cmd_ref.offset < cmd_limit) // still below the limit of the current chunk: nothing to do
+ {
+ return;
+ }
+ uint param = 1024u; // size of the new chunk
+ uint param_1 = v_891.conf.mem_size;
+ uint param_2 = 8u; // bit ORed into mem_error if the allocation fails
+ uint _915 = malloc_stage(param, param_1, param_2, v_267);
+ uint new_cmd = _915;
+ if (new_cmd == 0u) // malloc_stage returns 0 on failure
+ {
+ mem_ok = false;
+ }
+ if (mem_ok) // the jump is written only while mem_ok is true
+ {
+ CmdJump jump = CmdJump{ new_cmd };
+ Alloc param_3 = cmd_alloc;
+ CmdRef param_4 = cmd_ref;
+ CmdJump param_5 = jump;
+ Cmd_Jump_write(param_3, param_4, param_5, v_267); // written at the old position, pointing at the new chunk
+ }
+ uint param_6 = new_cmd;
+ uint param_7 = 1024u;
+ bool param_8 = true;
+ cmd_alloc = new_alloc(param_6, param_7, param_8); // the three outputs are updated whether or not mem_ok is true
+ cmd_ref = CmdRef{ new_cmd };
+ cmd_limit = (new_cmd + 1024u) - 144u; // the limit sits 144 bytes before the end of the chunk
+}
+
+static inline __attribute__((always_inline))
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_267) // Writes the two-word CmdFill payload at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.tile_ref; // word 0
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = uint(s.backdrop); // word 1: signed backdrop converted to uint
+ write_mem(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_267) // Writes a fill command: tag word 1 followed by the payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 1u; // command tag for a fill
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u }; // the payload starts 4 bytes after the tag
+ CmdFill param_5 = s;
+ CmdFill_write(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267) // Writes a solid command: tag word 3 only, no payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2); // byte offset -> word index
+ uint param_2 = 3u; // command tag for a solid
+ write_mem(param, param_1, param_2, v_267);
+}
+
+static inline __attribute__((always_inline))
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_267) // Writes the two-word CmdStroke payload at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.tile_ref; // word 0
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = as_type<uint>(s.half_width); // word 1: the float's bit pattern, not a numeric conversion
+ write_mem(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_267) // Writes a stroke command: tag word 2 followed by the payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 2u; // command tag for a stroke
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u }; // the payload starts 4 bytes after the tag
+ CmdStroke param_5 = s;
+ CmdStroke_write(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, thread bool& mem_ok, device Memory& v_267) // Emits a fill, solid or stroke command for `tile` and advances cmd_ref past it
+{
+ if (linewidth < 0.0) // negative linewidth selects the fill / solid branch
+ {
+ if (tile.tile.offset != 0u) // non-zero segment reference: emit a fill command
+ {
+ CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop };
+ if (mem_ok) // the write is skipped while mem_ok is false
+ {
+ Alloc param = alloc;
+ CmdRef param_1 = cmd_ref;
+ CmdFill param_2 = cmd_fill;
+ Cmd_Fill_write(param, param_1, param_2, v_267);
+ }
+ cmd_ref.offset += 12u; // tag word + two payload words; advanced even when the write was skipped
+ }
+ else
+ {
+ if (mem_ok)
+ {
+ Alloc param_3 = alloc;
+ CmdRef param_4 = cmd_ref;
+ Cmd_Solid_write(param_3, param_4, v_267); // zero segment reference: emit a solid command
+ }
+ cmd_ref.offset += 4u; // tag word only
+ }
+ }
+ else
+ {
+ CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth }; // the stroke stores half of the line width
+ if (mem_ok)
+ {
+ Alloc param_5 = alloc;
+ CmdRef param_6 = cmd_ref;
+ CmdStroke param_7 = cmd_stroke;
+ Cmd_Stroke_write(param_5, param_6, param_7, v_267);
+ }
+ cmd_ref.offset += 12u; // tag word + two payload words
+ }
+}
+
+static inline __attribute__((always_inline))
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_267) // Writes the one-word CmdColor payload at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.rgba_color;
+ write_mem(param, param_1, param_2, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_267) // Writes a color command: tag word 5 followed by the payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 5u; // command tag for a color
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u }; // the payload starts 4 bytes after the tag
+ CmdColor param_5 = s;
+ CmdColor_write(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_267) // Writes the four-word CmdLinGrad payload at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.index; // word 0
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = as_type<uint>(s.line_x); // word 1: floats are stored as their bit patterns
+ write_mem(param_3, param_4, param_5, v_267);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = as_type<uint>(s.line_y); // word 2
+ write_mem(param_6, param_7, param_8, v_267);
+ Alloc param_9 = a;
+ uint param_10 = ix + 3u;
+ uint param_11 = as_type<uint>(s.line_c); // word 3
+ write_mem(param_9, param_10, param_11, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_267) // Writes a linear-gradient command: tag word 6 followed by the payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 6u; // command tag for a linear gradient
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u }; // the payload starts 4 bytes after the tag
+ CmdLinGrad param_5 = s;
+ CmdLinGrad_write(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_267) // Writes the eleven-word CmdRadGrad payload at `ref`, one component per word
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.index; // word 0
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = as_type<uint>(s.mat.x); // words 1-4: mat; floats are stored as their bit patterns
+ write_mem(param_3, param_4, param_5, v_267);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = as_type<uint>(s.mat.y);
+ write_mem(param_6, param_7, param_8, v_267);
+ Alloc param_9 = a;
+ uint param_10 = ix + 3u;
+ uint param_11 = as_type<uint>(s.mat.z);
+ write_mem(param_9, param_10, param_11, v_267);
+ Alloc param_12 = a;
+ uint param_13 = ix + 4u;
+ uint param_14 = as_type<uint>(s.mat.w);
+ write_mem(param_12, param_13, param_14, v_267);
+ Alloc param_15 = a;
+ uint param_16 = ix + 5u;
+ uint param_17 = as_type<uint>(s.xlat.x); // words 5-6: xlat
+ write_mem(param_15, param_16, param_17, v_267);
+ Alloc param_18 = a;
+ uint param_19 = ix + 6u;
+ uint param_20 = as_type<uint>(s.xlat.y);
+ write_mem(param_18, param_19, param_20, v_267);
+ Alloc param_21 = a;
+ uint param_22 = ix + 7u;
+ uint param_23 = as_type<uint>(s.c1.x); // words 7-8: c1
+ write_mem(param_21, param_22, param_23, v_267);
+ Alloc param_24 = a;
+ uint param_25 = ix + 8u;
+ uint param_26 = as_type<uint>(s.c1.y);
+ write_mem(param_24, param_25, param_26, v_267);
+ Alloc param_27 = a;
+ uint param_28 = ix + 9u;
+ uint param_29 = as_type<uint>(s.ra); // word 9
+ write_mem(param_27, param_28, param_29, v_267);
+ Alloc param_30 = a;
+ uint param_31 = ix + 10u;
+ uint param_32 = as_type<uint>(s.roff); // word 10
+ write_mem(param_30, param_31, param_32, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_267) // Writes a radial-gradient command: tag word 7 followed by the payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 7u; // command tag for a radial gradient
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u }; // the payload starts 4 bytes after the tag
+ CmdRadGrad param_5 = s;
+ CmdRadGrad_write(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_267) // Writes the two-word CmdImage payload at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.index; // word 0
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); // word 1: x in the low 16 bits, y in the high 16 bits
+ write_mem(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_267) // Writes an image command: tag word 8 followed by the payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 8u; // command tag for an image
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u }; // the payload starts 4 bytes after the tag
+ CmdImage param_5 = s;
+ CmdImage_write(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267) // Writes a begin-clip command: tag word 9 only, no payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2); // byte offset -> word index
+ uint param_2 = 9u; // command tag for begin-clip
+ write_mem(param, param_1, param_2, v_267);
+}
+
+static inline __attribute__((always_inline))
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_267) // Writes the one-word CmdEndClip payload at `ref`
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.blend;
+ write_mem(param, param_1, param_2, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_267) // Writes an end-clip command: tag word 10 followed by the payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2);
+ uint param_2 = 10u; // command tag for end-clip
+ write_mem(param, param_1, param_2, v_267);
+ Alloc param_3 = a;
+ CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u }; // the payload starts 4 bytes after the tag
+ CmdEndClip param_5 = s;
+ CmdEndClip_write(param_3, param_4, param_5, v_267);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267) // Writes an end command: tag word 0 only, no payload
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2); // byte offset -> word index
+ uint param_2 = 0u; // command tag for end
+ write_mem(param, param_1, param_2, v_267);
+}
+
+kernel void main0(device Memory& v_267 [[buffer(0)]], const device ConfigBuf& v_891 [[buffer(1)]], const device SceneBuf& _1390 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+ threadgroup uint sh_bitmaps[8][256];
+ threadgroup Alloc sh_part_elements[256];
+ threadgroup uint sh_part_count[256];
+ threadgroup uint sh_elements[256];
+ threadgroup uint sh_tile_stride[256];
+ threadgroup uint sh_tile_width[256];
+ threadgroup uint sh_tile_x0[256];
+ threadgroup uint sh_tile_y0[256];
+ threadgroup uint sh_tile_base[256];
+ threadgroup uint sh_tile_count[256];
+ bool mem_ok = true;
+ uint param = 7u;
+ bool _1012 = check_deps(param, v_267);
+ if (!_1012)
+ {
+ return;
+ }
+ uint width_in_bins = ((v_891.conf.width_in_tiles + 16u) - 1u) / 16u;
+ uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
+ uint partition_ix = 0u;
+ uint n_partitions = ((v_891.conf.n_elements + 256u) - 1u) / 256u;
+ uint th_ix = gl_LocalInvocationID.x;
+ uint bin_tile_x = 16u * gl_WorkGroupID.x;
+ uint bin_tile_y = 16u * gl_WorkGroupID.y;
+ uint tile_x = gl_LocalInvocationID.x % 16u;
+ uint tile_y = gl_LocalInvocationID.x / 16u;
+ uint this_tile_ix = (((bin_tile_y + tile_y) * v_891.conf.width_in_tiles) + bin_tile_x) + tile_x;
+ Alloc param_1;
+ param_1.offset = v_891.conf.ptcl_alloc.offset;
+ uint param_2 = this_tile_ix * 1024u;
+ uint param_3 = 1024u;
+ Alloc cmd_alloc = slice_mem(param_1, param_2, param_3);
+ CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+ uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
+ uint clip_depth = 0u;
+ uint clip_zero_depth = 0u;
+ uint rd_ix = 0u;
+ uint wr_ix = 0u;
+ uint part_start_ix = 0u;
+ uint ready_ix = 0u;
+ Alloc param_4 = cmd_alloc;
+ uint param_5 = 0u;
+ uint param_6 = 8u;
+ Alloc scratch_alloc = slice_mem(param_4, param_5, param_6);
+ cmd_ref.offset += 4u;
+ uint render_blend_depth = 0u;
+ uint max_blend_depth = 0u;
+ uint drawmonoid_start = v_891.conf.drawmonoid_alloc.offset >> uint(2);
+ uint drawtag_start = v_891.conf.drawtag_offset >> uint(2);
+ uint drawdata_start = v_891.conf.drawdata_offset >> uint(2);
+ uint drawinfo_start = v_891.conf.drawinfo_alloc.offset >> uint(2);
+ Alloc param_7;
+ Alloc param_9;
+ uint _1322;
+ uint element_ix;
+ Alloc param_18;
+ uint tile_count;
+ uint _1622;
+ float linewidth;
+ CmdLinGrad cmd_lin;
+ CmdRadGrad cmd_rad;
+ while (true)
+ {
+ for (uint i = 0u; i < 8u; i++)
+ {
+ sh_bitmaps[i][th_ix] = 0u;
+ }
+ bool _1374;
+ for (;;)
+ {
+ if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
+ {
+ part_start_ix = ready_ix;
+ uint count = 0u;
+ bool _1174 = th_ix < 256u;
+ bool _1182;
+ if (_1174)
+ {
+ _1182 = (partition_ix + th_ix) < n_partitions;
+ }
+ else
+ {
+ _1182 = _1174;
+ }
+ if (_1182)
+ {
+ uint in_ix = (v_891.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+ param_7.offset = v_891.conf.bin_alloc.offset;
+ uint param_8 = in_ix;
+ count = read_mem(param_7, param_8, v_267);
+ param_9.offset = v_891.conf.bin_alloc.offset;
+ uint param_10 = in_ix + 1u;
+ uint offset = read_mem(param_9, param_10, v_267);
+ uint param_11 = offset;
+ uint param_12 = count * 4u;
+ bool param_13 = true;
+ sh_part_elements[th_ix] = new_alloc(param_11, param_12, param_13);
+ }
+ for (uint i_1 = 0u; i_1 < 8u; i_1++)
+ {
+ if (th_ix < 256u)
+ {
+ sh_part_count[th_ix] = count;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (th_ix < 256u)
+ {
+ if (th_ix >= (1u << i_1))
+ {
+ count += sh_part_count[th_ix - (1u << i_1)];
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ }
+ if (th_ix < 256u)
+ {
+ sh_part_count[th_ix] = part_start_ix + count;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ ready_ix = sh_part_count[255];
+ partition_ix += 256u;
+ }
+ uint ix = rd_ix + th_ix;
+ if ((ix >= wr_ix) && (ix < ready_ix))
+ {
+ uint part_ix = 0u;
+ for (uint i_2 = 0u; i_2 < 8u; i_2++)
+ {
+ uint probe = part_ix + (128u >> i_2);
+ if (ix >= sh_part_count[probe - 1u])
+ {
+ part_ix = probe;
+ }
+ }
+ if (part_ix > 0u)
+ {
+ _1322 = sh_part_count[part_ix - 1u];
+ }
+ else
+ {
+ _1322 = part_start_ix;
+ }
+ ix -= _1322;
+ Alloc bin_alloc = sh_part_elements[part_ix];
+ BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
+ BinInstanceRef param_14 = inst_ref;
+ uint param_15 = ix;
+ Alloc param_16 = bin_alloc;
+ BinInstanceRef param_17 = BinInstance_index(param_14, param_15);
+ BinInstance inst = BinInstance_read(param_16, param_17, v_267);
+ sh_elements[th_ix] = inst.element_ix;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ wr_ix = min((rd_ix + 256u), ready_ix);
+ bool _1364 = (wr_ix - rd_ix) < 256u;
+ if (_1364)
+ {
+ _1374 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+ }
+ else
+ {
+ _1374 = _1364;
+ }
+ if (_1374)
+ {
+ continue;
+ }
+ else
+ {
+ break;
+ }
+ }
+ uint tag = 0u;
+ if ((th_ix + rd_ix) < wr_ix)
+ {
+ element_ix = sh_elements[th_ix];
+ tag = _1390.scene[drawtag_start + element_ix];
+ }
+ switch (tag)
+ {
+ case 68u:
+ case 72u:
+ case 276u:
+ case 732u:
+ case 5u:
+ case 37u:
+ {
+ uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
+ uint path_ix = v_267.memory[drawmonoid_base];
+ param_18.offset = v_891.conf.tile_alloc.offset;
+ PathRef param_19 = PathRef{ v_891.conf.tile_alloc.offset + (path_ix * 12u) };
+ Path path = Path_read(param_18, param_19, v_267);
+ uint stride = path.bbox.z - path.bbox.x;
+ sh_tile_stride[th_ix] = stride;
+ int dx = int(path.bbox.x) - int(bin_tile_x);
+ int dy = int(path.bbox.y) - int(bin_tile_y);
+ int x0 = clamp(dx, 0, 16);
+ int y0 = clamp(dy, 0, 16);
+ int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16);
+ int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16);
+ sh_tile_width[th_ix] = uint(x1 - x0);
+ sh_tile_x0[th_ix] = uint(x0);
+ sh_tile_y0[th_ix] = uint(y0);
+ tile_count = uint(x1 - x0) * uint(y1 - y0);
+ uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
+ sh_tile_base[th_ix] = base;
+ uint param_20 = path.tiles.offset;
+ uint param_21 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+ bool param_22 = true;
+ Alloc path_alloc = new_alloc(param_20, param_21, param_22);
+ uint param_23 = th_ix;
+ Alloc param_24 = path_alloc;
+ write_tile_alloc(param_23, param_24);
+ break;
+ }
+ default:
+ {
+ tile_count = 0u;
+ break;
+ }
+ }
+ sh_tile_count[th_ix] = tile_count;
+ for (uint i_3 = 0u; i_3 < 8u; i_3++)
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (th_ix >= (1u << i_3))
+ {
+ tile_count += sh_tile_count[th_ix - (1u << i_3)];
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ sh_tile_count[th_ix] = tile_count;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint total_tile_count = sh_tile_count[255];
+ for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u)
+ {
+ uint el_ix = 0u;
+ for (uint i_4 = 0u; i_4 < 8u; i_4++)
+ {
+ uint probe_1 = el_ix + (128u >> i_4);
+ if (ix_1 >= sh_tile_count[probe_1 - 1u])
+ {
+ el_ix = probe_1;
+ }
+ }
+ uint element_ix_1 = sh_elements[el_ix];
+ uint tag_1 = _1390.scene[drawtag_start + element_ix_1];
+ if (el_ix > 0u)
+ {
+ _1622 = sh_tile_count[el_ix - 1u];
+ }
+ else
+ {
+ _1622 = 0u;
+ }
+ uint seq_ix = ix_1 - _1622;
+ uint width = sh_tile_width[el_ix];
+ uint x = sh_tile_x0[el_ix] + (seq_ix % width);
+ uint y = sh_tile_y0[el_ix] + (seq_ix / width);
+ bool include_tile = false;
+ uint param_25 = el_ix;
+ bool param_26 = true;
+ Alloc param_27 = read_tile_alloc(param_25, param_26, v_891);
+ TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+ Tile tile = Tile_read(param_27, param_28, v_267);
+ bool is_clip = (tag_1 & 1u) != 0u;
+ bool is_blend = false;
+ if (is_clip)
+ {
+ uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
+ uint scene_offset = v_267.memory[drawmonoid_base_1 + 2u];
+ uint dd = drawdata_start + (scene_offset >> uint(2));
+ uint blend = _1390.scene[dd];
+ is_blend = blend != 32771u;
+ }
+ bool _1706 = tile.tile.offset != 0u;
+ bool _1715;
+ if (!_1706)
+ {
+ _1715 = (tile.backdrop == 0) == is_clip;
+ }
+ else
+ {
+ _1715 = _1706;
+ }
+ include_tile = _1715 || is_blend;
+ if (include_tile)
+ {
+ uint el_slice = el_ix / 32u;
+ uint el_mask = 1u << (el_ix & 31u);
+ uint _1737 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint slice_ix = 0u;
+ uint bitmap = sh_bitmaps[0][th_ix];
+ while (true)
+ {
+ if (bitmap == 0u)
+ {
+ slice_ix++;
+ if (slice_ix == 8u)
+ {
+ break;
+ }
+ bitmap = sh_bitmaps[slice_ix][th_ix];
+ if (bitmap == 0u)
+ {
+ continue;
+ }
+ }
+ uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
+ uint element_ix_2 = sh_elements[element_ref_ix];
+ bitmap &= (bitmap - 1u);
+ uint drawtag = _1390.scene[drawtag_start + element_ix_2];
+ if (clip_zero_depth == 0u)
+ {
+ uint param_29 = element_ref_ix;
+ bool param_30 = true;
+ Alloc param_31 = read_tile_alloc(param_29, param_30, v_891);
+ TileRef param_32 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+ Tile tile_1 = Tile_read(param_31, param_32, v_267);
+ uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
+ uint scene_offset_1 = v_267.memory[drawmonoid_base_2 + 2u];
+ uint info_offset = v_267.memory[drawmonoid_base_2 + 3u];
+ uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
+ uint di = drawinfo_start + (info_offset >> uint(2));
+ switch (drawtag)
+ {
+ case 68u:
+ {
+ linewidth = as_type<float>(v_267.memory[di]);
+ Alloc param_33 = cmd_alloc;
+ CmdRef param_34 = cmd_ref;
+ uint param_35 = cmd_limit;
+ alloc_cmd(param_33, param_34, param_35, mem_ok, v_267, v_891);
+ cmd_alloc = param_33;
+ cmd_ref = param_34;
+ cmd_limit = param_35;
+ Alloc param_36 = cmd_alloc;
+ CmdRef param_37 = cmd_ref;
+ Tile param_38 = tile_1;
+ float param_39 = linewidth;
+ write_fill(param_36, param_37, param_38, param_39, mem_ok, v_267);
+ cmd_ref = param_37;
+ uint rgba = _1390.scene[dd_1];
+ if (mem_ok)
+ {
+ Alloc param_40 = cmd_alloc;
+ CmdRef param_41 = cmd_ref;
+ CmdColor param_42 = CmdColor{ rgba };
+ Cmd_Color_write(param_40, param_41, param_42, v_267);
+ }
+ cmd_ref.offset += 8u;
+ break;
+ }
+ case 276u:
+ {
+ Alloc param_43 = cmd_alloc;
+ CmdRef param_44 = cmd_ref;
+ uint param_45 = cmd_limit;
+ alloc_cmd(param_43, param_44, param_45, mem_ok, v_267, v_891);
+ cmd_alloc = param_43;
+ cmd_ref = param_44;
+ cmd_limit = param_45;
+ linewidth = as_type<float>(v_267.memory[di]);
+ Alloc param_46 = cmd_alloc;
+ CmdRef param_47 = cmd_ref;
+ Tile param_48 = tile_1;
+ float param_49 = linewidth;
+ write_fill(param_46, param_47, param_48, param_49, mem_ok, v_267);
+ cmd_ref = param_47;
+ cmd_lin.index = _1390.scene[dd_1];
+ cmd_lin.line_x = as_type<float>(v_267.memory[di + 1u]);
+ cmd_lin.line_y = as_type<float>(v_267.memory[di + 2u]);
+ cmd_lin.line_c = as_type<float>(v_267.memory[di + 3u]);
+ if (mem_ok)
+ {
+ Alloc param_50 = cmd_alloc;
+ CmdRef param_51 = cmd_ref;
+ CmdLinGrad param_52 = cmd_lin;
+ Cmd_LinGrad_write(param_50, param_51, param_52, v_267);
+ }
+ cmd_ref.offset += 20u;
+ break;
+ }
+ case 732u:
+ {
+ Alloc param_53 = cmd_alloc;
+ CmdRef param_54 = cmd_ref;
+ uint param_55 = cmd_limit;
+ alloc_cmd(param_53, param_54, param_55, mem_ok, v_267, v_891);
+ cmd_alloc = param_53;
+ cmd_ref = param_54;
+ cmd_limit = param_55;
+ linewidth = as_type<float>(v_267.memory[di]);
+ Alloc param_56 = cmd_alloc;
+ CmdRef param_57 = cmd_ref;
+ Tile param_58 = tile_1;
+ float param_59 = linewidth;
+ write_fill(param_56, param_57, param_58, param_59, mem_ok, v_267);
+ cmd_ref = param_57;
+ cmd_rad.index = _1390.scene[dd_1];
+ cmd_rad.mat = as_type<float4>(uint4(v_267.memory[di + 1u], v_267.memory[di + 2u], v_267.memory[di + 3u], v_267.memory[di + 4u]));
+ cmd_rad.xlat = as_type<float2>(uint2(v_267.memory[di + 5u], v_267.memory[di + 6u]));
+ cmd_rad.c1 = as_type<float2>(uint2(v_267.memory[di + 7u], v_267.memory[di + 8u]));
+ cmd_rad.ra = as_type<float>(v_267.memory[di + 9u]);
+ cmd_rad.roff = as_type<float>(v_267.memory[di + 10u]);
+ if (mem_ok)
+ {
+ Alloc param_60 = cmd_alloc;
+ CmdRef param_61 = cmd_ref;
+ CmdRadGrad param_62 = cmd_rad;
+ Cmd_RadGrad_write(param_60, param_61, param_62, v_267);
+ }
+ cmd_ref.offset += 48u;
+ break;
+ }
+ case 72u:
+ {
+ Alloc param_63 = cmd_alloc;
+ CmdRef param_64 = cmd_ref;
+ uint param_65 = cmd_limit;
+ alloc_cmd(param_63, param_64, param_65, mem_ok, v_267, v_891);
+ cmd_alloc = param_63;
+ cmd_ref = param_64;
+ cmd_limit = param_65;
+ linewidth = as_type<float>(v_267.memory[di]);
+ Alloc param_66 = cmd_alloc;
+ CmdRef param_67 = cmd_ref;
+ Tile param_68 = tile_1;
+ float param_69 = linewidth;
+ write_fill(param_66, param_67, param_68, param_69, mem_ok, v_267);
+ cmd_ref = param_67;
+ uint index = _1390.scene[dd_1];
+ uint raw1 = _1390.scene[dd_1 + 1u];
+ int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+ if (mem_ok)
+ {
+ Alloc param_70 = cmd_alloc;
+ CmdRef param_71 = cmd_ref;
+ CmdImage param_72 = CmdImage{ index, offset_1 };
+ Cmd_Image_write(param_70, param_71, param_72, v_267);
+ }
+ cmd_ref.offset += 12u;
+ break;
+ }
+ case 5u:
+ {
+ bool _2120 = tile_1.tile.offset == 0u;
+ bool _2126;
+ if (_2120)
+ {
+ _2126 = tile_1.backdrop == 0;
+ }
+ else
+ {
+ _2126 = _2120;
+ }
+ if (_2126)
+ {
+ clip_zero_depth = clip_depth + 1u;
+ }
+ else
+ {
+ Alloc param_73 = cmd_alloc;
+ CmdRef param_74 = cmd_ref;
+ uint param_75 = cmd_limit;
+ alloc_cmd(param_73, param_74, param_75, mem_ok, v_267, v_891);
+ cmd_alloc = param_73;
+ cmd_ref = param_74;
+ cmd_limit = param_75;
+ if (mem_ok)
+ {
+ Alloc param_76 = cmd_alloc;
+ CmdRef param_77 = cmd_ref;
+ Cmd_BeginClip_write(param_76, param_77, v_267);
+ }
+ cmd_ref.offset += 4u;
+ render_blend_depth++;
+ max_blend_depth = max(max_blend_depth, render_blend_depth);
+ }
+ clip_depth++;
+ break;
+ }
+ case 37u:
+ {
+ clip_depth--;
+ Alloc param_78 = cmd_alloc;
+ CmdRef param_79 = cmd_ref;
+ Tile param_80 = tile_1;
+ float param_81 = -1.0;
+ write_fill(param_78, param_79, param_80, param_81, mem_ok, v_267);
+ cmd_ref = param_79;
+ uint blend_1 = _1390.scene[dd_1];
+ if (mem_ok)
+ {
+ Alloc param_82 = cmd_alloc;
+ CmdRef param_83 = cmd_ref;
+ CmdEndClip param_84 = CmdEndClip{ blend_1 };
+ Cmd_EndClip_write(param_82, param_83, param_84, v_267);
+ }
+ cmd_ref.offset += 8u;
+ render_blend_depth--;
+ break;
+ }
+ }
+ }
+ else
+ {
+ switch (drawtag)
+ {
+ case 5u:
+ {
+ clip_depth++;
+ break;
+ }
+ case 37u:
+ {
+ if (clip_depth == clip_zero_depth)
+ {
+ clip_zero_depth = 0u;
+ }
+ clip_depth--;
+ break;
+ }
+ }
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ rd_ix += 256u;
+ if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions))
+ {
+ break;
+ }
+ }
+ bool _2231 = (bin_tile_x + tile_x) < v_891.conf.width_in_tiles;
+ bool _2240;
+ if (_2231)
+ {
+ _2240 = (bin_tile_y + tile_y) < v_891.conf.height_in_tiles;
+ }
+ else
+ {
+ _2240 = _2231;
+ }
+ if (_2240)
+ {
+ if (mem_ok)
+ {
+ Alloc param_85 = cmd_alloc;
+ CmdRef param_86 = cmd_ref;
+ Cmd_End_write(param_85, param_86, v_267);
+ }
+ if (max_blend_depth > 4u)
+ {
+ uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u;
+ uint _2264 = atomic_fetch_add_explicit((device atomic_uint*)&v_267.blend_offset, scratch_size, memory_order_relaxed);
+ uint scratch = _2264;
+ Alloc param_87 = scratch_alloc;
+ uint param_88 = scratch_alloc.offset >> uint(2);
+ uint param_89 = scratch;
+ write_mem(param_87, param_88, param_89, v_267);
+ }
+ }
+}
+
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
new file mode 100644
index 0000000..6140907
--- /dev/null
+++ b/piet-gpu/shader/gen/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
new file mode 100644
index 0000000..acf2ed2
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
new file mode 100644
index 0000000..a366c32
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -0,0 +1,268 @@
+struct DrawMonoid // Four running totals carried through the draw-tag scan; combine_draw_monoid adds them field by field.
+{
+ uint path_ix; // map_tag contributes 1 for every non-zero tag word
+ uint clip_ix; // map_tag contributes (tag_word & 1u)
+ uint scene_offset; // map_tag contributes (tag_word & 28u); comp_main shifts the total right by 2 before indexing words
+ uint info_offset; // map_tag contributes ((tag_word >> 4) & 60u); comp_main shifts the total right by 2 before indexing words
+};
+
+struct Alloc // Single-uint wrapper used for the *_alloc members of Config.
+{
+ uint offset; // presumably a byte offset: comp_main shifts the Config values it loads right by 2 before word indexing
+};
+
+struct Config // Every member is 4 bytes (uint or Alloc{uint}), so member k sits at byte 4*k; comp_main reads it with raw _93.Load(byte).
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc; // byte 40, read by comp_main via _93.Load(40)
+ Alloc drawmonoid_alloc; // byte 44, read by comp_main via _93.Load(44)
+ Alloc clip_alloc; // byte 48, read by comp_main via _93.Load(48)
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc; // byte 68, read by comp_main via _93.Load(68)
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset; // byte 84, read by comp_main via _93.Load(84)
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset; // byte 100, read by comp_main via _93.Load(100)
+ uint drawdata_offset; // byte 104, read by comp_main via _93.Load(104)
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same extent as [numthreads(256, 1, 1)] on main
+
+static const DrawMonoid _23 = { 0u, 0u, 0u, 0u }; // all-zero value returned by draw_monoid_identity()
+
+ByteAddressBuffer _93 : register(t1, space0); // read at fixed byte offsets 40/44/48/68/84/100/104, i.e. as a Config
+ByteAddressBuffer _103 : register(t2, space0); // word-indexed source of draw tags, transforms and draw data in comp_main
+ByteAddressBuffer _203 : register(t3, space0); // 16-byte DrawMonoid records; comp_main reads record (gl_WorkGroupID.x - 1)
+RWByteAddressBuffer _285 : register(u0, space0); // read/write buffer; every access in comp_main adds a 12-byte header offset
+
+static uint3 gl_WorkGroupID; // filled from stage_input in main before comp_main runs
+static uint3 gl_LocalInvocationID; // filled from stage_input in main before comp_main runs
+static uint3 gl_GlobalInvocationID; // filled from stage_input in main before comp_main runs
+struct SPIRV_Cross_Input // Compute-stage system values passed to main.
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared DrawMonoid sh_scratch[256]; // one slot per thread of the 256-wide group, used by the scan in comp_main
+
+DrawMonoid map_tag(uint tag_word) // Expands one draw tag word into the DrawMonoid contribution encoded in its bits.
+{
+ uint has_path = uint(tag_word != 0u); // 1 for any non-zero tag, 0 for a zero tag
+ DrawMonoid _76 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; // order: path_ix, clip_ix, scene_offset, info_offset
+ return _76;
+}
+
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) // Returns the field-wise sum of a and b.
+{
+ DrawMonoid c;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.clip_ix = a.clip_ix + b.clip_ix;
+ c.scene_offset = a.scene_offset + b.scene_offset;
+ c.info_offset = a.info_offset + b.info_offset;
+ return c;
+}
+
+DrawMonoid draw_monoid_identity() // Returns the all-zero DrawMonoid, the neutral element for combine_draw_monoid.
+{
+ return _23; // _23 is the { 0u, 0u, 0u, 0u } constant declared at file scope
+}
+
+void comp_main() // Scans 8 draw tags per thread, stores each element's exclusive DrawMonoid prefix and its per-draw info words into _285.
+{
+ uint ix = gl_GlobalInvocationID.x * 8u; // each thread owns 8 consecutive draw tags starting at ix
+ uint drawtag_base = _93.Load(100) >> uint(2); // Config.drawtag_offset (byte 100) converted to a word index
+ uint tag_word = _103.Load((drawtag_base + ix) * 4 + 0);
+ uint param = tag_word;
+ DrawMonoid agg = map_tag(param);
+ DrawMonoid local[8]; // local[i] = combination of this thread's tags 0..i (inclusive)
+ local[0] = agg;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ tag_word = _103.Load(((drawtag_base + ix) + i) * 4 + 0);
+ uint param_1 = tag_word;
+ DrawMonoid param_2 = agg;
+ DrawMonoid param_3 = map_tag(param_1);
+ agg = combine_draw_monoid(param_2, param_3);
+ local[i] = agg;
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg; // publish this thread's 8-tag total
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8 doubling steps (distance 1, 2, ..., 128) across the 256 threads
+ {
+ GroupMemoryBarrierWithGroupSync(); // previous writes to sh_scratch must be visible before reading
+ if (gl_LocalInvocationID.x >= (1u << i_1))
+ {
+ DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ DrawMonoid param_4 = other;
+ DrawMonoid param_5 = agg;
+ agg = combine_draw_monoid(param_4, param_5);
+ }
+ GroupMemoryBarrierWithGroupSync(); // all reads finish before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ GroupMemoryBarrierWithGroupSync(); // after the loop sh_scratch[t] holds the combined totals of threads 0..t
+ DrawMonoid row = draw_monoid_identity();
+ if (gl_WorkGroupID.x > 0u)
+ {
+ DrawMonoid _209; // record (gl_WorkGroupID.x - 1) of _203, 16 bytes per record; presumably the prefix of all earlier workgroups
+ _209.path_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
+ _209.clip_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
+ _209.scene_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
+ _209.info_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
+ row.path_ix = _209.path_ix;
+ row.clip_ix = _209.clip_ix;
+ row.scene_offset = _209.scene_offset;
+ row.info_offset = _209.info_offset;
+ }
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ DrawMonoid param_6 = row;
+ DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u]; // totals of all lower-numbered threads in this group
+ row = combine_draw_monoid(param_6, param_7);
+ }
+ uint drawdata_base = _93.Load(104) >> uint(2); // Config.drawdata_offset (byte 104) as a word index
+ uint drawinfo_base = _93.Load(68) >> uint(2); // Config.drawinfo_alloc.offset (byte 68) as a word index
+ uint out_ix = gl_GlobalInvocationID.x * 8u;
+ uint out_base = (_93.Load(44) >> uint(2)) + (out_ix * 4u); // Config.drawmonoid_alloc.offset (byte 44); 4 output words per element
+ uint clip_out_base = _93.Load(48) >> uint(2); // Config.clip_alloc.offset (byte 48) as a word index
+ float4 mat; // only assigned when the transform is loaded below
+ float2 translate; // only assigned for tags 276u and 732u
+ float2 p0;
+ float2 p1;
+ for (uint i_2 = 0u; i_2 < 8u; i_2++)
+ {
+ DrawMonoid m = row; // m = exclusive prefix for element i_2: row plus this thread's earlier tags
+ if (i_2 > 0u)
+ {
+ DrawMonoid param_8 = m;
+ DrawMonoid param_9 = local[i_2 - 1u];
+ m = combine_draw_monoid(param_8, param_9);
+ }
+ _285.Store((out_base + (i_2 * 4u)) * 4 + 12, m.path_ix); // "* 4 + 12": word index to byte address past a 12-byte header (three uints precede memory[] in draw_leaf.msl's Memory)
+ _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 12, m.clip_ix);
+ _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 12, m.scene_offset);
+ _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 12, m.info_offset);
+ uint dd = drawdata_base + (m.scene_offset >> uint(2)); // word index of this element's draw data in _103
+ uint di = drawinfo_base + (m.info_offset >> uint(2)); // word index of this element's info record in _285
+ tag_word = _103.Load(((drawtag_base + ix) + i_2) * 4 + 0); // tag is re-read rather than kept from the first loop
+ if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u)) // coarse.msl emits Color/LinGrad/RadGrad/Image/BeginClip commands for these tags
+ {
+ uint bbox_offset = (_93.Load(40) >> uint(2)) + (6u * m.path_ix); // Config.path_bbox_alloc.offset (byte 40); 6 words per path
+ float bbox_l = float(_285.Load(bbox_offset * 4 + 12)) - 32768.0f; // stored bbox words carry a +32768 bias
+ float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 12)) - 32768.0f;
+ float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 12)) - 32768.0f;
+ float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 12)) - 32768.0f;
+ float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); // not read again in this function
+ float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 12)); // word 4 of the path record, reinterpreted as float
+ uint fill_mode = uint(linewidth >= 0.0f); // not read again in this function
+ if (((linewidth >= 0.0f) || (tag_word == 276u)) || (tag_word == 732u)) // transform is needed for non-negative linewidth or either gradient tag
+ {
+ uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 12); // word 5 of the path record
+ uint t = (_93.Load(84) >> uint(2)) + (trans_ix * 6u); // Config.trans_offset (byte 84); 6 words per transform
+ mat = asfloat(uint4(_103.Load(t * 4 + 0), _103.Load((t + 1u) * 4 + 0), _103.Load((t + 2u) * 4 + 0), _103.Load((t + 3u) * 4 + 0)));
+ if ((tag_word == 276u) || (tag_word == 732u))
+ {
+ translate = asfloat(uint2(_103.Load((t + 4u) * 4 + 0), _103.Load((t + 5u) * 4 + 0)));
+ }
+ }
+ if (linewidth >= 0.0f)
+ {
+ linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z))); // scale by sqrt(|det(mat)|)
+ }
+ switch (tag_word)
+ {
+ case 68u:
+ case 72u:
+ {
+ _285.Store(di * 4 + 12, asuint(linewidth)); // info record is the linewidth word only
+ break;
+ }
+ case 276u:
+ {
+ _285.Store(di * 4 + 12, asuint(linewidth));
+ p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
+ p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
+ p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate; // apply the 2x2 matrix and translation to both endpoints
+ p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
+ float2 dxy = p1 - p0;
+ float scale = 1.0f / ((dxy.x * dxy.x) + (dxy.y * dxy.y)); // NOTE(review): denominator is 0 when the transformed p0 == p1
+ float line_x = dxy.x * scale;
+ float line_y = dxy.y * scale;
+ float line_c = -((p0.x * line_x) + (p0.y * line_y)); // line_x*x + line_y*y + line_c is 0 at p0 and 1 at p1
+ _285.Store((di + 1u) * 4 + 12, asuint(line_x));
+ _285.Store((di + 2u) * 4 + 12, asuint(line_y));
+ _285.Store((di + 3u) * 4 + 12, asuint(line_c));
+ break;
+ }
+ case 732u:
+ {
+ p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
+ p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
+ float r0 = asfloat(_103.Load((dd + 5u) * 4 + 0));
+ float r1 = asfloat(_103.Load((dd + 6u) * 4 + 0));
+ float inv_det = 1.0f / ((mat.x * mat.w) - (mat.y * mat.z)); // NOTE(review): no guard for a zero determinant
+ float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det; // adjugate of mat times 1/det
+ float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y);
+ inv_tr += p0;
+ float2 center1 = p1 - p0;
+ float rr = r1 / (r1 - r0); // NOTE(review): no guard for r1 == r0
+ float rainv = rr / ((r1 * r1) - dot(center1, center1));
+ float2 c1 = center1 * rainv;
+ float ra = rr * rainv;
+ float roff = rr - 1.0f;
+ _285.Store(di * 4 + 12, asuint(linewidth)); // 11-word info record: linewidth, inv_mat (4), inv_tr (2), c1 (2), ra, roff
+ _285.Store((di + 1u) * 4 + 12, asuint(inv_mat.x));
+ _285.Store((di + 2u) * 4 + 12, asuint(inv_mat.y));
+ _285.Store((di + 3u) * 4 + 12, asuint(inv_mat.z));
+ _285.Store((di + 4u) * 4 + 12, asuint(inv_mat.w));
+ _285.Store((di + 5u) * 4 + 12, asuint(inv_tr.x));
+ _285.Store((di + 6u) * 4 + 12, asuint(inv_tr.y));
+ _285.Store((di + 7u) * 4 + 12, asuint(c1.x));
+ _285.Store((di + 8u) * 4 + 12, asuint(c1.y));
+ _285.Store((di + 9u) * 4 + 12, asuint(ra));
+ _285.Store((di + 10u) * 4 + 12, asuint(roff));
+ break;
+ }
+ case 5u:
+ {
+ break; // nothing is written to the info record for tag 5u
+ }
+ }
+ }
+ if ((tag_word == 5u) || (tag_word == 37u)) // the two tags with bit 0 set, i.e. the ones that advance clip_ix
+ {
+ uint path_ix = ~(out_ix + i_2); // tag 37u stores the bitwise complement of the element index
+ if (tag_word == 5u)
+ {
+ path_ix = m.path_ix; // tag 5u stores the element's path index instead
+ }
+ _285.Store((clip_out_base + m.clip_ix) * 4 + 12, path_ix);
+ }
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system values into the file-scope statics, then runs comp_main.
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
new file mode 100644
index 0000000..bca3ef7
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -0,0 +1,317 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray // Fixed-size array wrapper whose operator[] overloads index elements directly, with no bounds check.
+{
+ T elements[Num ? Num : 1]; // storage is never zero-length: Num == 0 still allocates one element
+
+ thread T& operator [] (size_t pos) thread // thread address space, mutable
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread // thread address space, read-only
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device // device address space, mutable
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device // device address space, read-only
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // constant address space, read-only only
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup // threadgroup address space, mutable
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup // threadgroup address space, read-only
+ {
+ return elements[pos];
+ }
+};
+
+struct DrawMonoid // Four running totals carried through the draw-tag scan; combine_draw_monoid adds them field by field.
+{
+ uint path_ix; // map_tag contributes 1 for every non-zero tag word
+ uint clip_ix; // map_tag contributes (tag_word & 1u)
+ uint scene_offset; // map_tag contributes (tag_word & 28u); main0 shifts the total right by 2 before indexing words
+ uint info_offset; // map_tag contributes ((tag_word >> 4) & 60u); main0 shifts the total right by 2 before indexing words
+};
+
+struct Alloc // Single-uint wrapper used for the *_alloc members of Config.
+{
+ uint offset; // presumably a byte offset: main0 shifts it right by 2 before indexing 32-bit words
+};
+
+struct Config // Configuration record wrapped by ConfigBuf; main0 reads only the members annotated below.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc; // read by main0: base of the 6-word per-path records
+ Alloc drawmonoid_alloc; // read by main0: base of the DrawMonoid output
+ Alloc clip_alloc; // read by main0: base of the clip output
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc; // read by main0: base of the per-draw info records
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset; // read by main0: base of the 6-word transforms in the scene buffer
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset; // read by main0: base of the draw tag words in the scene buffer
+ uint drawdata_offset; // read by main0: base of the draw data in the scene buffer
+};
+
+struct ConfigBuf // Layout of the buffer bound at [[buffer(1)]] of main0.
+{
+ Config conf;
+};
+
+struct SceneBuf // Layout of the buffer bound at [[buffer(2)]] of main0.
+{
+ uint scene[1]; // declared with one element but indexed by arbitrary word offsets in main0; presumably a runtime-sized array
+};
+
+struct DrawMonoid_1 // Same four uint members as DrawMonoid; this is the element type stored in ParentBuf.
+{
+ uint path_ix;
+ uint clip_ix;
+ uint scene_offset;
+ uint info_offset;
+};
+
+struct ParentBuf // Layout of the buffer bound at [[buffer(3)]] of main0.
+{
+ DrawMonoid_1 parent[1]; // main0 reads parent[gl_WorkGroupID.x - 1u]; presumably a runtime-sized array
+};
+
+struct Memory // Layout of the read/write buffer bound at [[buffer(0)]] of main0.
+{
+ uint mem_offset; // not accessed by main0
+ uint mem_error; // not accessed by main0
+ uint blend_offset; // not accessed by main0
+ uint memory[1]; // word array indexed by main0; starts 12 bytes into the buffer after the three uints above
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // 256 matches the sh_scratch[256] threadgroup array in main0
+
+static inline __attribute__((always_inline))
+DrawMonoid map_tag(thread const uint& tag_word) // Expands one draw tag word into the DrawMonoid contribution encoded in its bits.
+{
+ uint has_path = uint(tag_word != 0u); // 1 for any non-zero tag, 0 for a zero tag
+ return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; // order: path_ix, clip_ix, scene_offset, info_offset
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) // Returns the field-wise sum of a and b.
+{
+ DrawMonoid c;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.clip_ix = a.clip_ix + b.clip_ix;
+ c.scene_offset = a.scene_offset + b.scene_offset;
+ c.info_offset = a.info_offset + b.info_offset;
+ return c;
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid draw_monoid_identity() // Returns the all-zero DrawMonoid, the neutral element for combine_draw_monoid.
+{
+ return DrawMonoid{ 0u, 0u, 0u, 0u };
+}
+
+kernel void main0(device Memory& _285 [[buffer(0)]], const device ConfigBuf& _93 [[buffer(1)]], const device SceneBuf& _103 [[buffer(2)]], const device ParentBuf& _203 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) // Scans 8 draw tags per thread, stores each element's exclusive DrawMonoid prefix and its per-draw info words into _285.memory.
+{
+ threadgroup DrawMonoid sh_scratch[256]; // one slot per thread, used by the threadgroup-wide scan below
+ uint ix = gl_GlobalInvocationID.x * 8u; // each thread owns 8 consecutive draw tags starting at ix
+ uint drawtag_base = _93.conf.drawtag_offset >> uint(2); // offset converted to a word index
+ uint tag_word = _103.scene[drawtag_base + ix];
+ uint param = tag_word;
+ DrawMonoid agg = map_tag(param);
+ spvUnsafeArray<DrawMonoid, 8> local; // local[i] = combination of this thread's tags 0..i (inclusive)
+ local[0] = agg;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ tag_word = _103.scene[(drawtag_base + ix) + i];
+ uint param_1 = tag_word;
+ DrawMonoid param_2 = agg;
+ DrawMonoid param_3 = map_tag(param_1);
+ agg = combine_draw_monoid(param_2, param_3);
+ local[i] = agg;
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg; // publish this thread's 8-tag total
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8 doubling steps (distance 1, 2, ..., 128) across the 256 threads
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // previous writes to sh_scratch must be visible before reading
+ if (gl_LocalInvocationID.x >= (1u << i_1))
+ {
+ DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ DrawMonoid param_4 = other;
+ DrawMonoid param_5 = agg;
+ agg = combine_draw_monoid(param_4, param_5);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // all reads finish before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // after the loop sh_scratch[t] holds the combined totals of threads 0..t
+ DrawMonoid row = draw_monoid_identity();
+ if (gl_WorkGroupID.x > 0u)
+ {
+ uint _206 = gl_WorkGroupID.x - 1u; // index of the preceding workgroup's record; presumably the prefix of all earlier workgroups
+ row.path_ix = _203.parent[_206].path_ix;
+ row.clip_ix = _203.parent[_206].clip_ix;
+ row.scene_offset = _203.parent[_206].scene_offset;
+ row.info_offset = _203.parent[_206].info_offset;
+ }
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ DrawMonoid param_6 = row;
+ DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u]; // totals of all lower-numbered threads in this group
+ row = combine_draw_monoid(param_6, param_7);
+ }
+ uint drawdata_base = _93.conf.drawdata_offset >> uint(2);
+ uint drawinfo_base = _93.conf.drawinfo_alloc.offset >> uint(2);
+ uint out_ix = gl_GlobalInvocationID.x * 8u;
+ uint out_base = (_93.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u); // 4 output words per element
+ uint clip_out_base = _93.conf.clip_alloc.offset >> uint(2);
+ float4 mat; // only assigned when the transform is loaded below
+ float2 translate; // only assigned for tags 276u and 732u
+ float2 p0;
+ float2 p1;
+ for (uint i_2 = 0u; i_2 < 8u; i_2++)
+ {
+ DrawMonoid m = row; // m = exclusive prefix for element i_2: row plus this thread's earlier tags
+ if (i_2 > 0u)
+ {
+ DrawMonoid param_8 = m;
+ DrawMonoid param_9 = local[i_2 - 1u];
+ m = combine_draw_monoid(param_8, param_9);
+ }
+ _285.memory[out_base + (i_2 * 4u)] = m.path_ix;
+ _285.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
+ _285.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
+ _285.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
+ uint dd = drawdata_base + (m.scene_offset >> uint(2)); // word index of this element's draw data in the scene buffer
+ uint di = drawinfo_base + (m.info_offset >> uint(2)); // word index of this element's info record in _285.memory
+ tag_word = _103.scene[(drawtag_base + ix) + i_2]; // tag is re-read rather than kept from the first loop
+ if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u)) // coarse.msl emits Color/LinGrad/RadGrad/Image/BeginClip commands for these tags
+ {
+ uint bbox_offset = (_93.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix); // 6 words per path record
+ float bbox_l = float(_285.memory[bbox_offset]) - 32768.0; // stored bbox words carry a +32768 bias
+ float bbox_t = float(_285.memory[bbox_offset + 1u]) - 32768.0;
+ float bbox_r = float(_285.memory[bbox_offset + 2u]) - 32768.0;
+ float bbox_b = float(_285.memory[bbox_offset + 3u]) - 32768.0;
+ float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); // not read again in this kernel
+ float linewidth = as_type<float>(_285.memory[bbox_offset + 4u]); // word 4 of the path record, reinterpreted as float
+ uint fill_mode = uint(linewidth >= 0.0); // not read again in this kernel
+ if (((linewidth >= 0.0) || (tag_word == 276u)) || (tag_word == 732u)) // transform is needed for non-negative linewidth or either gradient tag
+ {
+ uint trans_ix = _285.memory[bbox_offset + 5u]; // word 5 of the path record
+ uint t = (_93.conf.trans_offset >> uint(2)) + (trans_ix * 6u); // 6 words per transform
+ mat = as_type<float4>(uint4(_103.scene[t], _103.scene[t + 1u], _103.scene[t + 2u], _103.scene[t + 3u]));
+ if ((tag_word == 276u) || (tag_word == 732u))
+ {
+ translate = as_type<float2>(uint2(_103.scene[t + 4u], _103.scene[t + 5u]));
+ }
+ }
+ if (linewidth >= 0.0)
+ {
+ linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z))); // scale by sqrt(|det(mat)|)
+ }
+ switch (tag_word)
+ {
+ case 68u:
+ case 72u:
+ {
+ _285.memory[di] = as_type<uint>(linewidth); // info record is the linewidth word only
+ break;
+ }
+ case 276u:
+ {
+ _285.memory[di] = as_type<uint>(linewidth);
+ p0 = as_type<float2>(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u]));
+ p1 = as_type<float2>(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u]));
+ p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate; // apply the 2x2 matrix and translation to both endpoints
+ p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
+ float2 dxy = p1 - p0;
+ float scale = 1.0 / ((dxy.x * dxy.x) + (dxy.y * dxy.y)); // NOTE(review): denominator is 0 when the transformed p0 == p1
+ float line_x = dxy.x * scale;
+ float line_y = dxy.y * scale;
+ float line_c = -((p0.x * line_x) + (p0.y * line_y)); // line_x*x + line_y*y + line_c is 0 at p0 and 1 at p1
+ _285.memory[di + 1u] = as_type<uint>(line_x);
+ _285.memory[di + 2u] = as_type<uint>(line_y);
+ _285.memory[di + 3u] = as_type<uint>(line_c);
+ break;
+ }
+ case 732u:
+ {
+ p0 = as_type<float2>(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u]));
+ p1 = as_type<float2>(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u]));
+ float r0 = as_type<float>(_103.scene[dd + 5u]);
+ float r1 = as_type<float>(_103.scene[dd + 6u]);
+ float inv_det = 1.0 / ((mat.x * mat.w) - (mat.y * mat.z)); // NOTE(review): no guard for a zero determinant
+ float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det; // adjugate of mat times 1/det
+ float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y);
+ inv_tr += p0;
+ float2 center1 = p1 - p0;
+ float rr = r1 / (r1 - r0); // NOTE(review): no guard for r1 == r0
+ float rainv = rr / ((r1 * r1) - dot(center1, center1));
+ float2 c1 = center1 * rainv;
+ float ra = rr * rainv;
+ float roff = rr - 1.0;
+ _285.memory[di] = as_type<uint>(linewidth); // 11-word info record: linewidth, inv_mat (4), inv_tr (2), c1 (2), ra, roff
+ _285.memory[di + 1u] = as_type<uint>(inv_mat.x);
+ _285.memory[di + 2u] = as_type<uint>(inv_mat.y);
+ _285.memory[di + 3u] = as_type<uint>(inv_mat.z);
+ _285.memory[di + 4u] = as_type<uint>(inv_mat.w);
+ _285.memory[di + 5u] = as_type<uint>(inv_tr.x);
+ _285.memory[di + 6u] = as_type<uint>(inv_tr.y);
+ _285.memory[di + 7u] = as_type<uint>(c1.x);
+ _285.memory[di + 8u] = as_type<uint>(c1.y);
+ _285.memory[di + 9u] = as_type<uint>(ra);
+ _285.memory[di + 10u] = as_type<uint>(roff);
+ break;
+ }
+ case 5u:
+ {
+ break; // nothing is written to the info record for tag 5u
+ }
+ }
+ }
+ if ((tag_word == 5u) || (tag_word == 37u)) // the two tags with bit 0 set, i.e. the ones that advance clip_ix
+ {
+ uint path_ix = ~(out_ix + i_2); // tag 37u stores the bitwise complement of the element index
+ if (tag_word == 5u)
+ {
+ path_ix = m.path_ix; // tag 5u stores the element's path index instead
+ }
+ _285.memory[clip_out_base + m.clip_ix] = path_ix;
+ }
+ }
+}
+
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
new file mode 100644
index 0000000..d66f719
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
new file mode 100644
index 0000000..9e8b3ca
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
new file mode 100644
index 0000000..652f594
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -0,0 +1,126 @@
+struct DrawMonoid // Four totals accumulated by the reduction; combine_draw_monoid adds them field by field.
+{
+ uint path_ix; // map_tag contributes 1 for every non-zero tag word
+ uint clip_ix; // map_tag contributes (tag_word & 1u)
+ uint scene_offset; // map_tag contributes (tag_word & 28u)
+ uint info_offset; // map_tag contributes ((tag_word >> 4) & 60u)
+};
+
+struct Alloc // Single-uint wrapper used for the *_alloc members of Config.
+{
+ uint offset; // not read by comp_main in this file
+};
+
+struct Config // Every member is 4 bytes (uint or Alloc{uint}), so member k sits at byte 4*k; comp_main reads only byte 100.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset; // byte 100, read by comp_main via _87.Load(100)
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same extent as [numthreads(256, 1, 1)] on main
+
+ByteAddressBuffer _87 : register(t1, space0); // read at byte 100 (Config.drawtag_offset) in comp_main
+ByteAddressBuffer _97 : register(t2, space0); // word-indexed source of draw tag words in comp_main
+RWByteAddressBuffer _188 : register(u3, space0); // receives one 16-byte DrawMonoid per workgroup from comp_main
+RWByteAddressBuffer _206 : register(u0, space0); // declared but not accessed by comp_main in this file
+
+static uint3 gl_WorkGroupID; // filled from stage_input in main before comp_main runs
+static uint3 gl_LocalInvocationID; // filled from stage_input in main before comp_main runs
+static uint3 gl_GlobalInvocationID; // filled from stage_input in main before comp_main runs
+struct SPIRV_Cross_Input // Compute-stage system values passed to main.
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared DrawMonoid sh_scratch[256]; // one slot per thread of the 256-wide group, used by the reduction in comp_main
+
+DrawMonoid map_tag(uint tag_word) // Expands one draw tag word into the DrawMonoid contribution encoded in its bits.
+{
+ uint has_path = uint(tag_word != 0u); // 1 for any non-zero tag, 0 for a zero tag
+ DrawMonoid _70 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; // order: path_ix, clip_ix, scene_offset, info_offset
+ return _70;
+}
+
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) // Returns the field-wise sum of a and b.
+{
+ DrawMonoid c;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.clip_ix = a.clip_ix + b.clip_ix;
+ c.scene_offset = a.scene_offset + b.scene_offset;
+ c.info_offset = a.info_offset + b.info_offset;
+ return c;
+}
+
+void comp_main() // Reduces the 256 * 8 draw tags covered by this workgroup to one DrawMonoid and stores it in _188.
+{
+ uint ix = gl_GlobalInvocationID.x * 8u; // each thread owns 8 consecutive draw tags starting at ix
+ uint drawtag_base = _87.Load(100) >> uint(2); // Config.drawtag_offset (byte 100) converted to a word index
+ uint tag_word = _97.Load((drawtag_base + ix) * 4 + 0);
+ uint param = tag_word;
+ DrawMonoid agg = map_tag(param);
+ for (uint i = 1u; i < 8u; i++) // fold the remaining 7 tags of this thread into agg
+ {
+ uint tag_word_1 = _97.Load(((drawtag_base + ix) + i) * 4 + 0);
+ uint param_1 = tag_word_1;
+ DrawMonoid param_2 = agg;
+ DrawMonoid param_3 = map_tag(param_1);
+ agg = combine_draw_monoid(param_2, param_3);
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg; // publish this thread's 8-tag total
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8 doubling steps (distance 1, 2, ..., 128), each pulling from a higher-numbered thread
+ {
+ GroupMemoryBarrierWithGroupSync(); // previous writes to sh_scratch must be visible before reading
+ if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) // skip partners that would fall outside the group
+ {
+ DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+ DrawMonoid param_4 = agg;
+ DrawMonoid param_5 = other;
+ agg = combine_draw_monoid(param_4, param_5);
+ }
+ GroupMemoryBarrierWithGroupSync(); // all reads finish before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 0u) // after the loop thread 0 holds the combination of all 256 per-thread totals
+ {
+ _188.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix); // one 16-byte record per workgroup
+ _188.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
+ _188.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
+ _188.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system values into the file-scope statics, then runs comp_main.
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl
new file mode 100644
index 0000000..16df1aa
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@@ -0,0 +1,141 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct DrawMonoid // Four totals accumulated by the reduction; combine_draw_monoid adds them field by field.
+{
+ uint path_ix; // map_tag contributes 1 for every non-zero tag word
+ uint clip_ix; // map_tag contributes (tag_word & 1u)
+ uint scene_offset; // map_tag contributes (tag_word & 28u)
+ uint info_offset; // map_tag contributes ((tag_word >> 4) & 60u)
+};
+
+struct Alloc // Single-uint wrapper used for the *_alloc members of Config.
+{
+ uint offset; // not read by main0 in this file
+};
+
+struct Config // Configuration record wrapped by ConfigBuf; main0 reads only drawtag_offset.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset; // read by main0: base of the draw tag words in the scene buffer
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Layout of the buffer bound at [[buffer(1)]] of main0.
+{
+ Config conf;
+};
+
+struct SceneBuf // Layout of the buffer bound at [[buffer(2)]] of main0.
+{
+ uint scene[1]; // declared with one element but indexed by arbitrary word offsets in main0; presumably a runtime-sized array
+};
+
+struct DrawMonoid_1 // Same four uint members as DrawMonoid; this is the element type stored in OutBuf.
+{
+ uint path_ix;
+ uint clip_ix;
+ uint scene_offset;
+ uint info_offset;
+};
+
+struct OutBuf // Layout of the buffer bound at [[buffer(3)]] of main0.
+{
+ DrawMonoid_1 outbuf[1]; // main0 writes outbuf[gl_WorkGroupID.x]; presumably a runtime-sized array
+};
+
+struct Memory // Declared here but not among main0's parameters in this file.
+{
+ uint mem_offset;
+ uint mem_error;
+ uint blend_offset;
+ uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // 256 matches the sh_scratch[256] threadgroup array in main0
+
+static inline __attribute__((always_inline))
+DrawMonoid map_tag(thread const uint& tag_word) // Expands one draw tag word into the DrawMonoid contribution encoded in its bits.
+{
+ uint has_path = uint(tag_word != 0u); // 1 for any non-zero tag, 0 for a zero tag
+ return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; // order: path_ix, clip_ix, scene_offset, info_offset
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) // Returns the field-wise sum of a and b.
+{
+ DrawMonoid c;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.clip_ix = a.clip_ix + b.clip_ix;
+ c.scene_offset = a.scene_offset + b.scene_offset;
+ c.info_offset = a.info_offset + b.info_offset;
+ return c;
+}
+
+kernel void main0(const device ConfigBuf& _87 [[buffer(1)]], const device SceneBuf& _97 [[buffer(2)]], device OutBuf& _188 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) // Reduces the 256 * 8 draw tags covered by this threadgroup to one DrawMonoid and stores it in _188.outbuf.
+{
+ threadgroup DrawMonoid sh_scratch[256]; // one slot per thread, used by the reduction below
+ uint ix = gl_GlobalInvocationID.x * 8u; // each thread owns 8 consecutive draw tags starting at ix
+ uint drawtag_base = _87.conf.drawtag_offset >> uint(2); // offset converted to a word index
+ uint tag_word = _97.scene[drawtag_base + ix];
+ uint param = tag_word;
+ DrawMonoid agg = map_tag(param);
+ for (uint i = 1u; i < 8u; i++) // fold the remaining 7 tags of this thread into agg
+ {
+ uint tag_word_1 = _97.scene[(drawtag_base + ix) + i];
+ uint param_1 = tag_word_1;
+ DrawMonoid param_2 = agg;
+ DrawMonoid param_3 = map_tag(param_1);
+ agg = combine_draw_monoid(param_2, param_3);
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg; // publish this thread's 8-tag total
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8 doubling steps (distance 1, 2, ..., 128), each pulling from a higher-numbered thread
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // previous writes to sh_scratch must be visible before reading
+ if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) // skip partners that would fall outside the group
+ {
+ DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+ DrawMonoid param_4 = agg;
+ DrawMonoid param_5 = other;
+ agg = combine_draw_monoid(param_4, param_5);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // all reads finish before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 0u) // after the loop thread 0 holds the combination of all 256 per-thread totals
+ {
+ _188.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; // one record per threadgroup
+ _188.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+ _188.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
+ _188.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
+ }
+}
+
diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv
new file mode 100644
index 0000000..db5df5f
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil
new file mode 100644
index 0000000..2fbedfc
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl
new file mode 100644
index 0000000..b4cb7e4
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_root.hlsl
@@ -0,0 +1,108 @@
+struct DrawMonoid
+{ // Four uint counters that combine_draw_monoid adds field by field; stored in _71 as 16-byte records.
+ uint path_ix; // record byte offset 0
+ uint clip_ix; // record byte offset 4
+ uint scene_offset; // record byte offset 8
+ uint info_offset; // record byte offset 12
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same extent as [numthreads(256, 1, 1)] on main
+
+static const DrawMonoid _18 = { 0u, 0u, 0u, 0u }; // all-zero value handed out by draw_monoid_identity()
+
+RWByteAddressBuffer _71 : register(u0, space0); // buffer that comp_main reads and rewrites in place as 16-byte DrawMonoid records
+
+static uint3 gl_LocalInvocationID; // filled from stage_input in main before comp_main runs
+static uint3 gl_GlobalInvocationID; // filled from stage_input in main before comp_main runs
+struct SPIRV_Cross_Input
+{ // entry-point inputs, bound through HLSL system-value semantics
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared DrawMonoid sh_scratch[256]; // one slot per thread of the 256-wide group
+
+DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
+{ // Returns the field-wise sum of a and b (plain uint addition on each of the four members).
+ DrawMonoid c;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.clip_ix = a.clip_ix + b.clip_ix;
+ c.scene_offset = a.scene_offset + b.scene_offset;
+ c.info_offset = a.info_offset + b.info_offset;
+ return c;
+}
+
+DrawMonoid draw_monoid_identity()
+{ // Returns the all-zero DrawMonoid; adding it via combine_draw_monoid leaves the other operand unchanged.
+ return _18;
+}
+
+void comp_main()
+{ // In-place inclusive prefix combination over the 256 * 8 = 2048 records handled by one thread group: every record in _71 is replaced by the combination of the group's records up to and including itself.
+ uint ix = gl_GlobalInvocationID.x * 8u; // first of the 8 records owned by this thread
+ DrawMonoid _75;
+ _75.path_ix = _71.Load(ix * 16 + 0); // records are 16 bytes: four uints at byte offsets 0, 4, 8, 12
+ _75.clip_ix = _71.Load(ix * 16 + 4);
+ _75.scene_offset = _71.Load(ix * 16 + 8);
+ _75.info_offset = _71.Load(ix * 16 + 12);
+ DrawMonoid local[8]; // local[i] = combination of this thread's records 0..i
+ local[0].path_ix = _75.path_ix;
+ local[0].clip_ix = _75.clip_ix;
+ local[0].scene_offset = _75.scene_offset;
+ local[0].info_offset = _75.info_offset;
+ DrawMonoid param_1;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ DrawMonoid param = local[i - 1u];
+ DrawMonoid _106;
+ _106.path_ix = _71.Load((ix + i) * 16 + 0);
+ _106.clip_ix = _71.Load((ix + i) * 16 + 4);
+ _106.scene_offset = _71.Load((ix + i) * 16 + 8);
+ _106.info_offset = _71.Load((ix + i) * 16 + 12);
+ param_1.path_ix = _106.path_ix;
+ param_1.clip_ix = _106.clip_ix;
+ param_1.scene_offset = _106.scene_offset;
+ param_1.info_offset = _106.info_offset;
+ local[i] = combine_draw_monoid(param, param_1); // previous running value on the left, record i on the right
+ }
+ DrawMonoid agg = local[7]; // total of this thread's 8 records
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8 doubling steps: strides 1, 2, 4, ... 128
+ {
+ GroupMemoryBarrierWithGroupSync(); // earlier writes to sh_scratch must be visible before they are read
+ if (gl_LocalInvocationID.x >= (1u << i_1)) // threads without a partner 2^i_1 slots to the left keep their value
+ {
+ DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ DrawMonoid param_2 = other;
+ DrawMonoid param_3 = agg;
+ agg = combine_draw_monoid(param_2, param_3); // lower-indexed aggregate goes on the left
+ }
+ GroupMemoryBarrierWithGroupSync(); // every read of this step finishes before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ GroupMemoryBarrierWithGroupSync(); // final slot values must be visible before the neighbour read below
+ DrawMonoid row = draw_monoid_identity(); // thread 0 has nothing in front of it
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ row = sh_scratch[gl_LocalInvocationID.x - 1u]; // combination of everything owned by lower-indexed threads
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++)
+ {
+ DrawMonoid param_4 = row;
+ DrawMonoid param_5 = local[i_2];
+ DrawMonoid m = combine_draw_monoid(param_4, param_5); // group-wide prefix = earlier threads + this thread's prefix
+ uint _199 = ix + i_2;
+ _71.Store(_199 * 16 + 0, m.path_ix); // overwrite the record that was read at the top
+ _71.Store(_199 * 16 + 4, m.clip_ix);
+ _71.Store(_199 * 16 + 8, m.scene_offset);
+ _71.Store(_199 * 16 + 12, m.info_offset);
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{ // Entry point: copies the system-value inputs into the file-scope statics that comp_main reads, then runs it.
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/draw_root.msl b/piet-gpu/shader/gen/draw_root.msl
new file mode 100644
index 0000000..9ee8cfe
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_root.msl
@@ -0,0 +1,140 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{ // Fixed-size array wrapper; operator[] is overloaded per address space and performs no bounds check.
+ T elements[Num ? Num : 1]; // a Num of 0 still gets one element of storage
+
+ thread T& operator [] (size_t pos) thread
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // constant address space: read-only overload only
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+ {
+ return elements[pos];
+ }
+};
+
+struct DrawMonoid
+{ // Working copy used in thread and threadgroup memory; combine_draw_monoid adds the four fields pairwise.
+ uint path_ix;
+ uint clip_ix;
+ uint scene_offset;
+ uint info_offset;
+};
+
+struct DrawMonoid_1
+{ // Same four fields as DrawMonoid; this is the element type of DataBuf::data, copied member by member in main0.
+ uint path_ix;
+ uint clip_ix;
+ uint scene_offset;
+ uint info_offset;
+};
+
+struct DataBuf
+{ // Layout of the device buffer bound at [[buffer(0)]] in main0.
+ DrawMonoid_1 data[1]; // declared with length 1 but indexed with gl_GlobalInvocationID.x * 8 + i in main0; the real length comes from the bound buffer
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // 256 matches the sh_scratch slot count in main0
+
+static inline __attribute__((always_inline))
+DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+{ // Returns the field-wise sum of a and b (plain uint addition on each of the four members).
+ DrawMonoid c;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.clip_ix = a.clip_ix + b.clip_ix;
+ c.scene_offset = a.scene_offset + b.scene_offset;
+ c.info_offset = a.info_offset + b.info_offset;
+ return c;
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid draw_monoid_identity()
+{ // Returns the all-zero DrawMonoid; adding it via combine_draw_monoid leaves the other operand unchanged.
+ return DrawMonoid{ 0u, 0u, 0u, 0u };
+}
+
+kernel void main0(device DataBuf& _71 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{ // In-place inclusive prefix combination over the 256 * 8 = 2048 records handled by one threadgroup: every element of _71.data is replaced by the combination of the group's elements up to and including itself.
+ threadgroup DrawMonoid sh_scratch[256]; // one slot per thread, indexed by gl_LocalInvocationID.x
+ uint ix = gl_GlobalInvocationID.x * 8u; // first of the 8 records owned by this thread
+ spvUnsafeArray<DrawMonoid, 8> local; // local[i] = combination of this thread's records 0..i
+ local[0].path_ix = _71.data[ix].path_ix;
+ local[0].clip_ix = _71.data[ix].clip_ix;
+ local[0].scene_offset = _71.data[ix].scene_offset;
+ local[0].info_offset = _71.data[ix].info_offset;
+ DrawMonoid param_1;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ uint _100 = ix + i;
+ DrawMonoid param = local[i - 1u];
+ param_1.path_ix = _71.data[_100].path_ix;
+ param_1.clip_ix = _71.data[_100].clip_ix;
+ param_1.scene_offset = _71.data[_100].scene_offset;
+ param_1.info_offset = _71.data[_100].info_offset;
+ local[i] = combine_draw_monoid(param, param_1); // previous running value on the left, record i on the right
+ }
+ DrawMonoid agg = local[7]; // total of this thread's 8 records
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8 doubling steps: strides 1, 2, 4, ... 128
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // earlier writes to sh_scratch must be visible before they are read
+ if (gl_LocalInvocationID.x >= (1u << i_1)) // threads without a partner 2^i_1 slots to the left keep their value
+ {
+ DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ DrawMonoid param_2 = other;
+ DrawMonoid param_3 = agg;
+ agg = combine_draw_monoid(param_2, param_3); // lower-indexed aggregate goes on the left
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // every read of this step finishes before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // final slot values must be visible before the neighbour read below
+ DrawMonoid row = draw_monoid_identity(); // thread 0 has nothing in front of it
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ row = sh_scratch[gl_LocalInvocationID.x - 1u]; // combination of everything owned by lower-indexed threads
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++)
+ {
+ DrawMonoid param_4 = row;
+ DrawMonoid param_5 = local[i_2];
+ DrawMonoid m = combine_draw_monoid(param_4, param_5); // group-wide prefix = earlier threads + this thread's prefix
+ uint _199 = ix + i_2;
+ _71.data[_199].path_ix = m.path_ix; // overwrite the record that was read at the top
+ _71.data[_199].clip_ix = m.clip_ix;
+ _71.data[_199].scene_offset = m.scene_offset;
+ _71.data[_199].info_offset = m.info_offset;
+ }
+}
+
diff --git a/piet-gpu/shader/gen/draw_root.spv b/piet-gpu/shader/gen/draw_root.spv
new file mode 100644
index 0000000..e6a53e5
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_root.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
new file mode 100644
index 0000000..4c259cc
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
new file mode 100644
index 0000000..992a758
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -0,0 +1,1303 @@
+struct Alloc
+{ // Handle for a region of the memory buffer; only the start offset is kept (slice_mem and new_alloc discard their size arguments).
+ uint offset;
+};
+
+struct CmdStrokeRef
+{ // Location of an encoded CmdStroke; CmdStroke_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdStroke
+{ // Decoded stroke command (two words).
+ uint tile_ref; // offset of the first TileSeg walked by comp_main
+ float half_width; // comp_main computes coverage as clamp(half_width + 0.5 - distance, 0, 1)
+};
+
+struct CmdFillRef
+{ // Location of an encoded CmdFill; CmdFill_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdFill
+{ // Decoded fill command (two words).
+ uint tile_ref; // offset of the first TileSeg walked by comp_main
+ int backdrop; // starting value of every area[] entry before segments are accumulated
+};
+
+struct CmdColorRef
+{ // Location of an encoded CmdColor; CmdColor_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdColor
+{ // Decoded solid-color command (one word).
+ uint rgba_color; // packed 4x8-bit color, expanded by unpacksRGB in comp_main
+};
+
+struct CmdLinGradRef
+{ // Location of an encoded CmdLinGrad; CmdLinGrad_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdLinGrad
+{ // Decoded linear-gradient command (four words).
+ uint index; // y coordinate of the texel fetched from the gradients texture
+ float line_x; // comp_main evaluates line_x * x + line_y * y + line_c per pixel
+ float line_y;
+ float line_c;
+};
+
+struct CmdRadGradRef
+{ // Location of an encoded CmdRadGrad; CmdRadGrad_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdRadGrad
+{ // Decoded radial-gradient command (eleven words).
+ uint index; // y coordinate of the texel fetched from the gradients texture
+ float4 mat; // applied in comp_main as mat.xz * x + mat.yw * y
+ float2 xlat; // subtracted after the mat transform
+ float2 c1; // ba = dot(p, c1)
+ float ra; // ca = ra * dot(p, p)
+ float roff; // t = sqrt(ba * ba + ca) - ba - roff
+};
+
+struct CmdImageRef
+{ // Location of an encoded CmdImage; CmdImage_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdImage
+{ // Decoded image command (two words).
+ uint index; // decoded but not read by fillImage in this file
+ int2 offset; // two sign-extended 16-bit halves of one word; added to the pixel position before sampling image_atlas
+};
+
+struct CmdAlphaRef
+{ // Location of an encoded CmdAlpha; CmdAlpha_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdAlpha
+{ // Decoded alpha command (one word).
+ float alpha; // copied into every area[] entry by comp_main
+};
+
+struct CmdEndClipRef
+{ // Location of an encoded CmdEndClip; CmdEndClip_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdEndClip
+{ // Decoded end-clip command (one word).
+ uint blend; // passed as the mode argument of mix_blend_compose
+};
+
+struct CmdJumpRef
+{ // Location of an encoded CmdJump; CmdJump_read uses offset >> 2 as a word index for read_mem.
+ uint offset;
+};
+
+struct CmdJump
+{ // Decoded jump command (one word).
+ uint new_ref; // becomes the next cmd_ref.offset in comp_main
+};
+
+struct CmdRef
+{ // Cursor into the command stream; the tag word sits at offset and the payload at offset + 4.
+ uint offset;
+};
+
+struct CmdTag
+{ // One command word split in two by Cmd_tag.
+ uint tag; // low 16 bits; selects the case in comp_main
+ uint flags; // high 16 bits
+};
+
+struct TileSegRef
+{ // Location of an encoded TileSeg; comp_main stops walking a segment list when offset is 0.
+ uint offset;
+};
+
+struct TileSeg
+{ // Decoded line segment (six words), linked to the next segment of the same list.
+ float2 origin; // start point
+ float2 _vector; // end point = origin + _vector
+ float y_edge; // used by the fill case: sign(_vector.x) * clamp(y - y_edge + 1, 0, 1)
+ TileSegRef next;
+};
+
+struct Config
+{ // Mirror of the configuration block; comp_main reads it through raw byte loads from _1681 rather than through this struct.
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles; // 4th word: matches _1681.Load(12) in comp_main
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc; // 8th word: matches _1681.Load(28) in comp_main (base of the per-tile command lists)
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); // same extent as [numthreads(8, 4, 1)] on main
+
+RWByteAddressBuffer _297 : register(u0, space0); // memory buffer read by read_mem (data words start at byte 12)
+ByteAddressBuffer _1681 : register(t1, space0); // read-only configuration; see the Load(12) / Load(28) calls in comp_main
+RWByteAddressBuffer _2506 : register(u2, space0); // holds saved colors for clip depths of 4 and above
+RWTexture2D<unorm float4> image_atlas : register(u4, space0); // sampled by fillImage
+RWTexture2D<unorm float4> gradients : register(u5, space0); // indexed as (x in 0..511, gradient index) by the gradient cases
+RWTexture2D<unorm float4> image : register(u3, space0); // output written at the end of comp_main
+
+static uint3 gl_WorkGroupID; // filled from stage_input in main before comp_main runs
+static uint3 gl_LocalInvocationID; // filled from stage_input in main before comp_main runs
+struct SPIRV_Cross_Input
+{ // entry-point inputs, bound through HLSL system-value semantics
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+};
+
+uint spvPackUnorm4x8(float4 value)
+{ // Clamps each component to [0, 1], scales to 0..255 with rounding, and packs x into the lowest byte and w into the highest.
+ uint4 Packed = uint4(round(saturate(value) * 255.0));
+ return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24);
+}
+
+float4 spvUnpackUnorm4x8(uint value)
+{ // Splits value into four bytes (x = lowest, w = highest) and maps each from 0..255 to 0..1.
+ uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24);
+ return float4(Packed) / 255.0;
+}
+
+Alloc slice_mem(Alloc a, uint offset, uint size)
+{ // Returns an Alloc that starts offset past a; size is accepted but not used.
+ Alloc _310 = { a.offset + offset };
+ return _310;
+}
+
+bool touch_mem(Alloc alloc, uint offset)
+{ // Always reports the access as allowed in this build; both arguments are ignored.
+ return true;
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{ // Loads the 32-bit word with index offset from _297; alloc is only forwarded to touch_mem.
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken here because touch_mem always returns true
+ {
+ return 0u;
+ }
+ uint v = _297.Load(offset * 4 + 12); // word index -> byte address; +12 skips the first three words of the buffer (presumably a header; its layout is not visible here)
+ return v;
+}
+
+CmdTag Cmd_tag(Alloc a, CmdRef ref)
+{ // Reads the word at ref and splits it into tag (low 16 bits) and flags (high 16 bits).
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ uint tag_and_flags = read_mem(param, param_1);
+ CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+ return _669;
+}
+
+CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
+{ // Decodes two consecutive words at ref into a CmdStroke.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ CmdStroke s;
+ s.tile_ref = raw0;
+ s.half_width = asfloat(raw1); // bit pattern reinterpreted as float, no numeric conversion
+ return s;
+}
+
+CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
+{ // Reads the CmdStroke payload that follows the command word at ref.
+ CmdStrokeRef _685 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdStrokeRef param_1 = _685;
+ return CmdStroke_read(param, param_1);
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{ // Wraps offset in an Alloc; size and mem_ok are accepted but not used.
+ Alloc a;
+ a.offset = offset;
+ return a;
+}
+
+TileSeg TileSeg_read(Alloc a, TileSegRef ref)
+{ // Decodes six consecutive words at ref into a TileSeg.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7);
+ Alloc param_8 = a;
+ uint param_9 = ix + 4u;
+ uint raw4 = read_mem(param_8, param_9);
+ Alloc param_10 = a;
+ uint param_11 = ix + 5u;
+ uint raw5 = read_mem(param_10, param_11);
+ TileSeg s;
+ s.origin = float2(asfloat(raw0), asfloat(raw1)); // words 0-1
+ s._vector = float2(asfloat(raw2), asfloat(raw3)); // words 2-3
+ s.y_edge = asfloat(raw4); // word 4
+ TileSegRef _826 = { raw5 }; // word 5 is kept as a raw offset, not reinterpreted
+ s.next = _826;
+ return s;
+}
+
+uint2 chunk_offset(uint i)
+{ // Pixel offset of chunk i: x alternates 0 / 8, y advances by 4 every two chunks, so i = 0..7 spans a 16 x 16 area in 8 x 4 pieces.
+ return uint2((i % 2u) * 8u, (i / 2u) * 4u);
+}
+
+CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
+{ // Decodes two consecutive words at ref into a CmdFill.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ CmdFill s;
+ s.tile_ref = raw0;
+ s.backdrop = int(raw1); // same bits viewed as a signed integer
+ return s;
+}
+
+CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
+{ // Reads the CmdFill payload that follows the command word at ref.
+ CmdFillRef _675 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdFillRef param_1 = _675;
+ return CmdFill_read(param, param_1);
+}
+
+CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
+{ // Decodes the single word at ref into a CmdAlpha.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ CmdAlpha s;
+ s.alpha = asfloat(raw0); // bit pattern reinterpreted as float
+ return s;
+}
+
+CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
+{ // Reads the CmdAlpha payload that follows the command word at ref.
+ CmdAlphaRef _695 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdAlphaRef param_1 = _695;
+ return CmdAlpha_read(param, param_1);
+}
+
+CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
+{ // Decodes the single word at ref into a CmdColor.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ CmdColor s;
+ s.rgba_color = raw0; // kept packed; unpacked later by unpacksRGB
+ return s;
+}
+
+CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
+{ // Reads the CmdColor payload that follows the command word at ref.
+ CmdColorRef _705 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdColorRef param_1 = _705;
+ return CmdColor_read(param, param_1);
+}
+
+float3 fromsRGB(float3 srgb)
+{ // Pass-through in this build: the input is returned unchanged, no transfer curve is applied.
+ return srgb;
+}
+
+float4 unpacksRGB(uint srgba)
+{ // Expands a packed color to float4; the inverse of packsRGB.
+ float4 color = spvUnpackUnorm4x8(srgba).wzyx; // .wzyx reverses byte order: the highest byte becomes .x, the lowest becomes .w
+ float3 param = color.xyz;
+ return float4(fromsRGB(param), color.w);
+}
+
+CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
+{ // Decodes four consecutive words at ref into a CmdLinGrad.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7);
+ CmdLinGrad s;
+ s.index = raw0; // word 0 stays an integer
+ s.line_x = asfloat(raw1); // words 1-3 are float bit patterns
+ s.line_y = asfloat(raw2);
+ s.line_c = asfloat(raw3);
+ return s;
+}
+
+CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
+{ // Reads the CmdLinGrad payload that follows the command word at ref.
+ CmdLinGradRef _715 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdLinGradRef param_1 = _715;
+ return CmdLinGrad_read(param, param_1);
+}
+
+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
+{ // Decodes eleven consecutive words at ref into a CmdRadGrad.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7);
+ Alloc param_8 = a;
+ uint param_9 = ix + 4u;
+ uint raw4 = read_mem(param_8, param_9);
+ Alloc param_10 = a;
+ uint param_11 = ix + 5u;
+ uint raw5 = read_mem(param_10, param_11);
+ Alloc param_12 = a;
+ uint param_13 = ix + 6u;
+ uint raw6 = read_mem(param_12, param_13);
+ Alloc param_14 = a;
+ uint param_15 = ix + 7u;
+ uint raw7 = read_mem(param_14, param_15);
+ Alloc param_16 = a;
+ uint param_17 = ix + 8u;
+ uint raw8 = read_mem(param_16, param_17);
+ Alloc param_18 = a;
+ uint param_19 = ix + 9u;
+ uint raw9 = read_mem(param_18, param_19);
+ Alloc param_20 = a;
+ uint param_21 = ix + 10u;
+ uint raw10 = read_mem(param_20, param_21);
+ CmdRadGrad s;
+ s.index = raw0; // word 0 stays an integer
+ s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4)); // words 1-4
+ s.xlat = float2(asfloat(raw5), asfloat(raw6)); // words 5-6
+ s.c1 = float2(asfloat(raw7), asfloat(raw8)); // words 7-8
+ s.ra = asfloat(raw9); // word 9
+ s.roff = asfloat(raw10); // word 10
+ return s;
+}
+
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
+{ // Reads the CmdRadGrad payload that follows the command word at ref.
+ CmdRadGradRef _725 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdRadGradRef param_1 = _725;
+ return CmdRadGrad_read(param, param_1);
+}
+
+CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
+{ // Decodes two consecutive words at ref into a CmdImage.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ CmdImage s;
+ s.index = raw0;
+ s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); // x = sign-extended low 16 bits, y = sign-extended high 16 bits
+ return s;
+}
+
+CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
+{ // Reads the CmdImage payload that follows the command word at ref.
+ CmdImageRef _735 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdImageRef param_1 = _735;
+ return CmdImage_read(param, param_1);
+}
+
+void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
+{ // Fetches one image_atlas texel per chunk for this thread and returns the 8 colors through spvReturnValue.
+ float4 rgba[8];
+ for (uint i = 0u; i < 8u; i++)
+ {
+ uint param = i;
+ int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; // pixel position of chunk i shifted by the command's offset
+ float4 fg_rgba = image_atlas[uv];
+ float3 param_1 = fg_rgba.xyz;
+ float3 _1653 = fromsRGB(param_1); // color channels only; alpha is left as fetched
+ fg_rgba.x = _1653.x;
+ fg_rgba.y = _1653.y;
+ fg_rgba.z = _1653.z;
+ rgba[i] = fg_rgba;
+ }
+ spvReturnValue = rgba; // arrays are returned through the out parameter
+}
+
+float3 tosRGB(float3 rgb)
+{ // Pass-through in this build: the input is returned unchanged, no transfer curve is applied.
+ return rgb;
+}
+
+uint packsRGB(inout float4 rgba)
+{ // Packs rgba into one uint; the inverse of unpacksRGB. rgba is also overwritten with the tosRGB-converted value.
+ float3 param = rgba.xyz;
+ rgba = float4(tosRGB(param), rgba.w);
+ return spvPackUnorm4x8(rgba.wzyx); // .wzyx reverses component order so .x lands in the highest byte
+}
+
+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)
+{ // Decodes the single word at ref into a CmdEndClip.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ CmdEndClip s;
+ s.blend = raw0;
+ return s;
+}
+
+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
+{ // Reads the CmdEndClip payload that follows the command word at ref.
+ CmdEndClipRef _745 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdEndClipRef param_1 = _745;
+ return CmdEndClip_read(param, param_1);
+}
+
+float3 screen(float3 cb, float3 cs)
+{ // Per-channel cb + cs - cb * cs.
+ return (cb + cs) - (cb * cs);
+}
+
+float3 hard_light(float3 cb, float3 cs)
+{ // Per channel: 2 * cb * cs where cs <= 0.5, otherwise screen(cb, 2 * cs - 1).
+ float3 param = cb;
+ float3 param_1 = (cs * 2.0f) - 1.0f.xxx;
+ float3 _889 = screen(param, param_1); // result for cs > 0.5
+ float3 _893 = (cb * 2.0f) * cs; // result for cs <= 0.5
+ bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+ return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? _893.z : _889.z); // both candidates are computed, then selected per channel
+}
+
+float color_dodge(float cb, float cs)
+{ // Single-channel color dodge: min(1, cb / (1 - cs)), with the two cases below handled first.
+ if (cb == 0.0f)
+ {
+ return 0.0f;
+ }
+ else
+ {
+ if (cs == 1.0f) // would divide by zero below
+ {
+ return 1.0f;
+ }
+ else
+ {
+ return min(1.0f, cb / (1.0f - cs));
+ }
+ }
+}
+
+float color_burn(float cb, float cs)
+{ // Single-channel color burn: 1 - min(1, (1 - cb) / cs), with the two cases below handled first.
+ if (cb == 1.0f)
+ {
+ return 1.0f;
+ }
+ else
+ {
+ if (cs == 0.0f) // would divide by zero below
+ {
+ return 0.0f;
+ }
+ else
+ {
+ return 1.0f - min(1.0f, (1.0f - cb) / cs);
+ }
+ }
+}
+
+float3 soft_light(float3 cb, float3 cs)
+{ // Per-channel soft light: d is a helper curve of cb, and the final formula is chosen by whether cs <= 0.5.
+ float3 _904 = sqrt(cb); // d for cb > 0.25
+ float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb; // d for cb <= 0.25: ((16 cb - 12) cb + 4) cb
+ bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z);
+ float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z);
+ float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)); // result for cs > 0.5
+ float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)); // result for cs <= 0.5
+ bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z);
+ return float3(_944.x ? _942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? _942.z : _932.z);
+}
+
+float sat(float3 c)
+{ // Largest channel minus smallest channel.
+ return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z));
+}
+
+void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s)
+{ // Rescales three channels (passed by the caller in ascending order) so that max - min equals s and min is 0.
+ if (cmax > cmin)
+ {
+ cmid = ((cmid - cmin) * s) / (cmax - cmin); // keeps cmid's relative position between cmin and cmax
+ cmax = s;
+ }
+ else
+ { // all three channels equal: the result is all zero
+ cmid = 0.0f;
+ cmax = 0.0f;
+ }
+ cmin = 0.0f;
+}
+
+float3 set_sat(inout float3 c, float s)
+{ // Orders the channels of c with comparisons, calls set_sat_inner(min, mid, max, s), writes the results back to the same channels, and returns c.
+ if (c.x <= c.y)
+ {
+ if (c.y <= c.z)
+ { // x <= y <= z
+ float param = c.x;
+ float param_1 = c.y;
+ float param_2 = c.z;
+ float param_3 = s;
+ set_sat_inner(param, param_1, param_2, param_3);
+ c.x = param;
+ c.y = param_1;
+ c.z = param_2;
+ }
+ else
+ {
+ if (c.x <= c.z)
+ { // x <= z < y
+ float param_4 = c.x;
+ float param_5 = c.z;
+ float param_6 = c.y;
+ float param_7 = s;
+ set_sat_inner(param_4, param_5, param_6, param_7);
+ c.x = param_4;
+ c.z = param_5;
+ c.y = param_6;
+ }
+ else
+ { // z < x <= y
+ float param_8 = c.z;
+ float param_9 = c.x;
+ float param_10 = c.y;
+ float param_11 = s;
+ set_sat_inner(param_8, param_9, param_10, param_11);
+ c.z = param_8;
+ c.x = param_9;
+ c.y = param_10;
+ }
+ }
+ }
+ else
+ {
+ if (c.x <= c.z)
+ { // y < x <= z
+ float param_12 = c.y;
+ float param_13 = c.x;
+ float param_14 = c.z;
+ float param_15 = s;
+ set_sat_inner(param_12, param_13, param_14, param_15);
+ c.y = param_12;
+ c.x = param_13;
+ c.z = param_14;
+ }
+ else
+ {
+ if (c.y <= c.z)
+ { // y <= z < x
+ float param_16 = c.y;
+ float param_17 = c.z;
+ float param_18 = c.x;
+ float param_19 = s;
+ set_sat_inner(param_16, param_17, param_18, param_19);
+ c.y = param_16;
+ c.z = param_17;
+ c.x = param_18;
+ }
+ else
+ { // z < y < x
+ float param_20 = c.z;
+ float param_21 = c.y;
+ float param_22 = c.x;
+ float param_23 = s;
+ set_sat_inner(param_20, param_21, param_22, param_23);
+ c.z = param_20;
+ c.y = param_21;
+ c.x = param_22;
+ }
+ }
+ }
+ return c;
+}
+
+float lum(float3 c)
+{ // Weighted channel sum with weights 0.3, 0.59 and 0.11 (written out as their nearest float values).
+ float3 f = float3(0.300000011920928955078125f, 0.589999973773956298828125f, 0.10999999940395355224609375f);
+ return dot(c, f);
+}
+
+float3 clip_color(inout float3 c)
+{ // Scales c towards its own lum value when a channel is below 0 or above 1; c is modified in place and also returned.
+ float3 param = c;
+ float L = lum(param);
+ float n = min(c.x, min(c.y, c.z)); // smallest channel, sampled before any adjustment
+ float x = max(c.x, max(c.y, c.z)); // largest channel, sampled before any adjustment
+ if (n < 0.0f)
+ {
+ c = L.xxx + (((c - L.xxx) * L) / (L - n).xxx);
+ }
+ if (x > 1.0f) // tested against the original maximum, not the value after the branch above
+ {
+ c = L.xxx + (((c - L.xxx) * (1.0f - L)) / (x - L).xxx);
+ }
+ return c;
+}
+
+float3 set_lum(float3 c, float l)
+{ // Shifts every channel of c by (l - lum(c)) and passes the result through clip_color.
+ float3 param = c;
+ float3 param_1 = c + (l - lum(param)).xxx;
+ float3 _1048 = clip_color(param_1);
+ return _1048;
+}
+
+float3 mix_blend(float3 cb, float3 cs, uint mode)
+{ // Blends cb with cs according to mode (1..15); any other mode returns cs unchanged.
+ float3 b = 0.0f.xxx;
+ switch (mode)
+ {
+ case 1u: // per-channel product
+ {
+ b = cb * cs;
+ break;
+ }
+ case 2u: // screen
+ {
+ float3 param = cb;
+ float3 param_1 = cs;
+ b = screen(param, param_1);
+ break;
+ }
+ case 3u: // hard_light with the operands swapped (compare case 8)
+ {
+ float3 param_2 = cs;
+ float3 param_3 = cb;
+ b = hard_light(param_2, param_3);
+ break;
+ }
+ case 4u: // per-channel minimum
+ {
+ b = min(cb, cs);
+ break;
+ }
+ case 5u: // per-channel maximum
+ {
+ b = max(cb, cs);
+ break;
+ }
+ case 6u: // color_dodge applied to each channel separately
+ {
+ float param_4 = cb.x;
+ float param_5 = cs.x;
+ float param_6 = cb.y;
+ float param_7 = cs.y;
+ float param_8 = cb.z;
+ float param_9 = cs.z;
+ b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+ break;
+ }
+ case 7u: // color_burn applied to each channel separately
+ {
+ float param_10 = cb.x;
+ float param_11 = cs.x;
+ float param_12 = cb.y;
+ float param_13 = cs.y;
+ float param_14 = cb.z;
+ float param_15 = cs.z;
+ b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+ break;
+ }
+ case 8u: // hard_light
+ {
+ float3 param_16 = cb;
+ float3 param_17 = cs;
+ b = hard_light(param_16, param_17);
+ break;
+ }
+ case 9u: // soft_light
+ {
+ float3 param_18 = cb;
+ float3 param_19 = cs;
+ b = soft_light(param_18, param_19);
+ break;
+ }
+ case 10u: // absolute difference
+ {
+ b = abs(cb - cs);
+ break;
+ }
+ case 11u: // cb + cs - 2 * cb * cs
+ {
+ b = (cb + cs) - ((cb * 2.0f) * cs);
+ break;
+ }
+ case 12u: // cs rescaled to sat(cb), then given lum(cb)
+ {
+ float3 param_20 = cb;
+ float3 param_21 = cs;
+ float param_22 = sat(param_20);
+ float3 _1340 = set_sat(param_21, param_22);
+ float3 param_23 = cb;
+ float3 param_24 = _1340;
+ float param_25 = lum(param_23);
+ b = set_lum(param_24, param_25);
+ break;
+ }
+ case 13u: // cb rescaled to sat(cs), then given lum(cb)
+ {
+ float3 param_26 = cs;
+ float3 param_27 = cb;
+ float param_28 = sat(param_26);
+ float3 _1354 = set_sat(param_27, param_28);
+ float3 param_29 = cb;
+ float3 param_30 = _1354;
+ float param_31 = lum(param_29);
+ b = set_lum(param_30, param_31);
+ break;
+ }
+ case 14u: // cs given lum(cb)
+ {
+ float3 param_32 = cb;
+ float3 param_33 = cs;
+ float param_34 = lum(param_32);
+ b = set_lum(param_33, param_34);
+ break;
+ }
+ case 15u: // cb given lum(cs)
+ {
+ float3 param_35 = cs;
+ float3 param_36 = cb;
+ float param_37 = lum(param_35);
+ b = set_lum(param_36, param_37);
+ break;
+ }
+ default: // includes mode 0: the source color passes through
+ {
+ b = cs;
+ break;
+ }
+ }
+ return b;
+}
+
+float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode)
+{ // Composites source (cs, as) with backdrop (cb, ab): mode picks factors fa / fb and the result is (cs * as * fa + cb * ab * fb, as * fa + ab * fb). Mode 13 is a clamped sum instead.
+ float fa = 0.0f; // source factor; stays 0 for modes without a case
+ float fb = 0.0f; // backdrop factor; stays 0 for modes without a case
+ switch (mode)
+ {
+ case 1u: // source only
+ {
+ fa = 1.0f;
+ fb = 0.0f;
+ break;
+ }
+ case 2u: // backdrop only
+ {
+ fa = 0.0f;
+ fb = 1.0f;
+ break;
+ }
+ case 3u: // full source, backdrop where the source is transparent
+ {
+ fa = 1.0f;
+ fb = 1.0f - as;
+ break;
+ }
+ case 4u: // full backdrop, source where the backdrop is transparent
+ {
+ fa = 1.0f - ab;
+ fb = 1.0f;
+ break;
+ }
+ case 5u: // source weighted by backdrop alpha
+ {
+ fa = ab;
+ fb = 0.0f;
+ break;
+ }
+ case 6u: // backdrop weighted by source alpha
+ {
+ fa = 0.0f;
+ fb = as;
+ break;
+ }
+ case 7u: // source weighted by inverse backdrop alpha
+ {
+ fa = 1.0f - ab;
+ fb = 0.0f;
+ break;
+ }
+ case 8u: // backdrop weighted by inverse source alpha
+ {
+ fa = 0.0f;
+ fb = 1.0f - as;
+ break;
+ }
+ case 9u:
+ {
+ fa = ab;
+ fb = 1.0f - as;
+ break;
+ }
+ case 10u:
+ {
+ fa = 1.0f - ab;
+ fb = as;
+ break;
+ }
+ case 11u:
+ {
+ fa = 1.0f - ab;
+ fb = 1.0f - as;
+ break;
+ }
+ case 12u: // plain sum, not clamped
+ {
+ fa = 1.0f;
+ fb = 1.0f;
+ break;
+ }
+ case 13u: // sum clamped to 1 per component; returns directly without using fa / fb
+ {
+ return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
+ }
+ default: // fa = fb = 0, so the result below is all zero
+ {
+ break;
+ }
+ }
+ float as_fa = as * fa;
+ float ab_fb = ab * fb;
+ float3 co = (cs * as_fa) + (cb * ab_fb); // color comes out multiplied by its coverage
+ return float4(co, as_fa + ab_fb);
+}
+
+float4 mix_blend_compose(float4 backdrop, float4 src, uint mode)
+{ // Combines src with backdrop: mode >> 8 selects the mix_blend mode and mode & 255 the mix_compose mode. Both inputs carry color already multiplied by alpha (they are divided by .w below).
+ if ((mode & 32767u) == 3u) // low 15 bits equal 3: blend mode 0 with compose mode 3, handled without dividing by alpha
+ {
+ return (backdrop * (1.0f - src.w)) + src;
+ }
+ float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f); // the 1e-15 term keeps the division finite when alpha is 0
+ float3 cs = src.xyz * inv_src_a;
+ float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f);
+ float3 cb = backdrop.xyz * inv_backdrop_a;
+ uint blend_mode = mode >> uint(8);
+ float3 param = cb;
+ float3 param_1 = cs;
+ uint param_2 = blend_mode;
+ float3 blended = mix_blend(param, param_1, param_2);
+ cs = lerp(cs, blended, backdrop.w.xxx); // the blended color only takes effect in proportion to backdrop alpha
+ uint comp_mode = mode & 255u;
+ if (comp_mode == 3u)
+ {
+ float3 co = lerp(backdrop.xyz, cs, src.w.xxx);
+ return float4(co, src.w + (backdrop.w * (1.0f - src.w)));
+ }
+ else
+ {
+ float3 param_3 = cb;
+ float3 param_4 = cs;
+ float param_5 = backdrop.w;
+ float param_6 = src.w;
+ uint param_7 = comp_mode;
+ return mix_compose(param_3, param_4, param_5, param_6, param_7);
+ }
+}
+
+CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
+{ // Decodes the single word at ref into a CmdJump.
+ uint ix = ref.offset >> uint(2); // offset / 4 = word index expected by read_mem
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ CmdJump s;
+ s.new_ref = raw0;
+ return s;
+}
+
+CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
+{ // Reads the CmdJump payload that follows the command word at ref.
+ CmdJumpRef _755 = { ref.offset + 4u }; // skip the 4-byte tag word
+ Alloc param = a;
+ CmdJumpRef param_1 = _755;
+ return CmdJump_read(param, param_1);
+}
+
+void comp_main()
+{ // Per-tile renderer: walks this tile's command list, keeps 8 pixel colors per thread (one per chunk_offset), and writes them to image when the list ends.
+ uint tile_ix = (gl_WorkGroupID.y * _1681.Load(12)) + gl_WorkGroupID.x; // byte 12 of the config = 4th word (width_in_tiles in struct Config)
+ Alloc _1696;
+ _1696.offset = _1681.Load(28); // byte 28 of the config = 8th word (ptcl_alloc in struct Config)
+ Alloc param;
+ param.offset = _1696.offset;
+ uint param_1 = tile_ix * 1024u; // each tile's command list starts 1024 bytes after the previous one
+ uint param_2 = 1024u;
+ Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+ CmdRef _1705 = { cmd_alloc.offset };
+ CmdRef cmd_ref = _1705;
+ uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 12); // first word of the list; used below as the base for colors saved in _2506
+ cmd_ref.offset += 4u; // commands begin after that word
+ uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); // workgroups are 16 pixels apart in x and y
+ float2 xy = float2(xy_uint);
+ float4 rgba[8]; // running color of each of this thread's 8 pixels
+ for (uint i = 0u; i < 8u; i++)
+ {
+ rgba[i] = 0.0f.xxxx;
+ }
+ uint clip_depth = 0u; // count of case-9 pushes not yet matched by a case-10 pop
+ float df[8]; // per-pixel minimum distance to the stroke's segments
+ TileSegRef tile_seg_ref;
+ float area[8]; // per-pixel coverage set by cases 1-4 and consumed by the color cases
+ uint blend_stack[4][8]; // saved packed colors for clip depths 0..3; deeper levels go to _2506
+ uint base_ix_1;
+ uint bg_rgba;
+ while (true)
+ {
+ Alloc param_3 = cmd_alloc;
+ CmdRef param_4 = cmd_ref;
+ uint tag = Cmd_tag(param_3, param_4).tag;
+ if (tag == 0u) // tag 0 ends the command list
+ {
+ break;
+ }
+ switch (tag)
+ {
+ case 2u: // stroke: coverage from the distance to the nearest segment
+ {
+ Alloc param_5 = cmd_alloc;
+ CmdRef param_6 = cmd_ref;
+ CmdStroke stroke = Cmd_Stroke_read(param_5, param_6);
+ for (uint k = 0u; k < 8u; k++)
+ {
+ df[k] = 1000000000.0f; // large starting value so the first min() takes the real distance
+ }
+ TileSegRef _1805 = { stroke.tile_ref };
+ tile_seg_ref = _1805;
+ do
+ {
+ uint param_7 = tile_seg_ref.offset;
+ uint param_8 = 24u; // six 4-byte words per TileSeg
+ bool param_9 = true;
+ Alloc param_10 = new_alloc(param_7, param_8, param_9);
+ TileSegRef param_11 = tile_seg_ref;
+ TileSeg seg = TileSeg_read(param_10, param_11);
+ float2 line_vec = seg._vector;
+ for (uint k_1 = 0u; k_1 < 8u; k_1++)
+ {
+ float2 dpos = (xy + 0.5f.xx) - seg.origin; // +0.5 moves to the pixel centre
+ uint param_12 = k_1;
+ dpos += float2(chunk_offset(param_12));
+ float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f); // position of the closest point along the segment
+ df[k_1] = min(df[k_1], length((line_vec * t) - dpos));
+ }
+ tile_seg_ref = seg.next;
+ } while (tile_seg_ref.offset != 0u); // offset 0 terminates the segment list
+ for (uint k_2 = 0u; k_2 < 8u; k_2++)
+ {
+ area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f);
+ }
+ cmd_ref.offset += 12u; // tag word + 2 payload words
+ break;
+ }
+ case 1u: // fill: coverage accumulated from the backdrop count and each segment
+ {
+ Alloc param_13 = cmd_alloc;
+ CmdRef param_14 = cmd_ref;
+ CmdFill fill = Cmd_Fill_read(param_13, param_14);
+ for (uint k_3 = 0u; k_3 < 8u; k_3++)
+ {
+ area[k_3] = float(fill.backdrop);
+ }
+ TileSegRef _1924 = { fill.tile_ref };
+ tile_seg_ref = _1924;
+ do
+ {
+ uint param_15 = tile_seg_ref.offset;
+ uint param_16 = 24u; // six 4-byte words per TileSeg
+ bool param_17 = true;
+ Alloc param_18 = new_alloc(param_15, param_16, param_17);
+ TileSegRef param_19 = tile_seg_ref;
+ TileSeg seg_1 = TileSeg_read(param_18, param_19);
+ for (uint k_4 = 0u; k_4 < 8u; k_4++)
+ {
+ uint param_20 = k_4;
+ float2 my_xy = xy + float2(chunk_offset(param_20));
+ float2 start = seg_1.origin - my_xy; // segment endpoints relative to this pixel's corner
+ float2 end = start + seg_1._vector;
+ float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx); // segment's y range limited to this pixel row
+ if (window.x != window.y) // segment actually crosses the row
+ {
+ float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx;
+ float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y)); // x where the segment enters and leaves the row
+ float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f; // 1e-6 bias keeps xmax - xmin non-zero
+ float xmax = max(xs.x, xs.y);
+ float b = min(xmax, 1.0f);
+ float c = max(b, 0.0f);
+ float d = max(xmin, 0.0f);
+ float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin);
+ area[k_4] += (a * (window.x - window.y));
+ }
+ area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f));
+ }
+ tile_seg_ref = seg_1.next;
+ } while (tile_seg_ref.offset != 0u); // offset 0 terminates the segment list
+ for (uint k_5 = 0u; k_5 < 8u; k_5++)
+ {
+ area[k_5] = min(abs(area[k_5]), 1.0f); // magnitude of the signed sum, capped at full coverage
+ }
+ cmd_ref.offset += 12u; // tag word + 2 payload words
+ break;
+ }
+ case 3u: // full coverage for all 8 pixels; no payload
+ {
+ for (uint k_6 = 0u; k_6 < 8u; k_6++)
+ {
+ area[k_6] = 1.0f;
+ }
+ cmd_ref.offset += 4u; // tag word only
+ break;
+ }
+ case 4u: // constant coverage taken from the command
+ {
+ Alloc param_21 = cmd_alloc;
+ CmdRef param_22 = cmd_ref;
+ CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22);
+ for (uint k_7 = 0u; k_7 < 8u; k_7++)
+ {
+ area[k_7] = alpha.alpha;
+ }
+ cmd_ref.offset += 8u; // tag word + 1 payload word
+ break;
+ }
+ case 5u: // solid color drawn over rgba, weighted by area
+ {
+ Alloc param_23 = cmd_alloc;
+ CmdRef param_24 = cmd_ref;
+ CmdColor color = Cmd_Color_read(param_23, param_24);
+ uint param_25 = color.rgba_color;
+ float4 fg = unpacksRGB(param_25);
+ for (uint k_8 = 0u; k_8 < 8u; k_8++)
+ {
+ float4 fg_k = fg * area[k_8];
+ rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k; // existing color scaled by (1 - new alpha), new color added
+ }
+ cmd_ref.offset += 8u; // tag word + 1 payload word
+ break;
+ }
+ case 6u: // linear gradient: color looked up by clamped line equation value
+ {
+ Alloc param_26 = cmd_alloc;
+ CmdRef param_27 = cmd_ref;
+ CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27);
+ float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; // value at this thread's base pixel
+ for (uint k_9 = 0u; k_9 < 8u; k_9++)
+ {
+ uint param_28 = k_9;
+ float2 chunk_xy = float2(chunk_offset(param_28));
+ float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); // adjusted for the chunk's pixel offset
+ int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); // texel column 0..511
+ float4 fg_rgba = gradients[int2(x, int(lin.index))];
+ float3 param_29 = fg_rgba.xyz;
+ float3 _2257 = fromsRGB(param_29);
+ fg_rgba.x = _2257.x;
+ fg_rgba.y = _2257.y;
+ fg_rgba.z = _2257.z;
+ float4 fg_k_1 = fg_rgba * area[k_9];
+ rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
+ }
+ cmd_ref.offset += 20u; // tag word + 4 payload words
+ break;
+ }
+ case 7u: // radial gradient: color looked up by t computed from the transformed pixel position
+ {
+ Alloc param_30 = cmd_alloc;
+ CmdRef param_31 = cmd_ref;
+ CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31);
+ for (uint k_10 = 0u; k_10 < 8u; k_10++)
+ {
+ uint param_32 = k_10;
+ float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+ my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; // 2x2 transform followed by translation
+ float ba = dot(my_xy_1, rad.c1);
+ float ca = rad.ra * dot(my_xy_1, my_xy_1);
+ float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+ int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); // texel column 0..511
+ float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
+ float3 param_33 = fg_rgba_1.xyz;
+ float3 _2367 = fromsRGB(param_33);
+ fg_rgba_1.x = _2367.x;
+ fg_rgba_1.y = _2367.y;
+ fg_rgba_1.z = _2367.z;
+ float4 fg_k_2 = fg_rgba_1 * area[k_10];
+ rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
+ }
+ cmd_ref.offset += 48u; // tag word + 11 payload words
+ break;
+ }
+ case 8u: // image: colors fetched from image_atlas by fillImage
+ {
+ Alloc param_34 = cmd_alloc;
+ CmdRef param_35 = cmd_ref;
+ CmdImage fill_img = Cmd_Image_read(param_34, param_35);
+ uint2 param_36 = xy_uint;
+ CmdImage param_37 = fill_img;
+ float4 _2410[8];
+ fillImage(_2410, param_36, param_37);
+ float4 img[8] = _2410;
+ for (uint k_11 = 0u; k_11 < 8u; k_11++)
+ {
+ float4 fg_k_3 = img[k_11] * area[k_11];
+ rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3;
+ }
+ cmd_ref.offset += 12u; // tag word + 2 payload words
+ break;
+ }
+ case 9u: // push: save the current colors, then restart from transparent (popped again by case 10)
+ {
+ if (clip_depth < 4u) // the first four levels fit in blend_stack
+ {
+ for (uint k_12 = 0u; k_12 < 8u; k_12++)
+ {
+ float4 param_38 = float4(rgba[k_12]);
+ uint _2472 = packsRGB(param_38);
+ blend_stack[clip_depth][k_12] = _2472;
+ rgba[k_12] = 0.0f.xxxx;
+ }
+ }
+ else
+ { // deeper levels are written to _2506: 16 * 16 words per level, 8 consecutive words per thread
+ uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+ for (uint k_13 = 0u; k_13 < 8u; k_13++)
+ {
+ float4 param_39 = float4(rgba[k_13]);
+ uint _2519 = packsRGB(param_39);
+ _2506.Store((base_ix + k_13) * 4 + 0, _2519);
+ rgba[k_13] = 0.0f.xxxx;
+ }
+ }
+ clip_depth++;
+ cmd_ref.offset += 4u; // tag word only
+ break;
+ }
+ case 10u: // pop: combine the current colors with the saved ones using the command's blend word
+ {
+ Alloc param_40 = cmd_alloc;
+ CmdRef param_41 = cmd_ref;
+ CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
+ clip_depth--; // no guard against underflow; the command stream must pair every pop with an earlier push
+ if (clip_depth >= 4u)
+ {
+ base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); // same addressing as the push side
+ }
+ for (uint k_14 = 0u; k_14 < 8u; k_14++)
+ {
+ if (clip_depth < 4u)
+ {
+ bg_rgba = blend_stack[clip_depth][k_14];
+ }
+ else
+ {
+ bg_rgba = _2506.Load((base_ix_1 + k_14) * 4 + 0);
+ }
+ uint param_42 = bg_rgba;
+ float4 bg = unpacksRGB(param_42);
+ float4 fg_1 = rgba[k_14] * area[k_14]; // colors drawn since the push, weighted by the current coverage
+ float4 param_43 = bg;
+ float4 param_44 = fg_1;
+ uint param_45 = end_clip.blend;
+ rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
+ }
+ cmd_ref.offset += 8u; // tag word + 1 payload word
+ break;
+ }
+ case 11u: // jump: continue reading commands at a new offset
+ {
+ Alloc param_46 = cmd_alloc;
+ CmdRef param_47 = cmd_ref;
+ CmdRef _2618 = { Cmd_Jump_read(param_46, param_47).new_ref };
+ cmd_ref = _2618;
+ cmd_alloc.offset = cmd_ref.offset;
+ break;
+ }
+ } // tags without a case leave cmd_ref unchanged
+ }
+ for (uint i_1 = 0u; i_1 < 8u; i_1++)
+ {
+ uint param_48 = i_1;
+ float3 param_49 = rgba[i_1].xyz;
+ image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w); // one store per chunk pixel
+ }
+}
+
+[numthreads(8, 4, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{ // Entry point: copies the system-value inputs into the file-scope statics that comp_main reads, then runs it.
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
new file mode 100644
index 0000000..e51183f
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -0,0 +1,1354 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray // By-value array wrapper; every operator[] indexes `elements` directly, with no bounds check.
+{
+ T elements[Num ? Num : 1]; // Num == 0 still gets one element, so the member array is never zero-length.
+
+ thread T& operator [] (size_t pos) thread // The overloads below differ only in address space and constness.
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // constant address space: only a read-only accessor is provided.
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+ {
+ return elements[pos];
+ }
+};
+
+struct Alloc // Handle for a region of the memory buffer; only the start is recorded.
+{
+ uint offset; // In main0, cmd_alloc.offset seeds cmd_ref.offset, which is shifted right by 2 to index Memory::memory.
+};
+
+struct CmdStrokeRef // Typed handle for a CmdStroke stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdStroke_read uses offset >> 2 as the word index.
+};
+
+struct CmdStroke // Payload of command tag 2 in main0.
+{
+ uint tile_ref; // Word 0; main0 wraps it in a TileSegRef to start walking the segment list.
+ float half_width; // Word 1, bits reinterpreted as float; main0 computes coverage as clamp(half_width + 0.5 - distance, 0, 1).
+};
+
+struct CmdFillRef // Typed handle for a CmdFill stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdFill_read uses offset >> 2 as the word index.
+};
+
+struct CmdFill // Payload of command tag 1 in main0.
+{
+ uint tile_ref; // Word 0; main0 wraps it in a TileSegRef to start walking the segment list.
+ int backdrop; // Word 1 converted with int(); main0 uses it as the starting value of every area[] entry.
+};
+
+struct CmdColorRef // Typed handle for a CmdColor stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdColor_read uses offset >> 2 as the word index.
+};
+
+struct CmdColor // Payload of command tag 5 in main0.
+{
+ uint rgba_color; // Word 0; packed colour that main0 expands with unpacksRGB().
+};
+
+struct CmdLinGradRef // Typed handle for a CmdLinGrad stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdLinGrad_read uses offset >> 2 as the word index.
+};
+
+struct CmdLinGrad // Payload of command tag 6 in main0.
+{
+ uint index; // Word 0; used as the y coordinate when reading the `gradients` texture.
+ float line_x; // Words 1-3: main0 evaluates line_x * x + line_y * y + line_c per pixel,
+ float line_y; // clamps the result to [0, 1] and scales it by 511 to get the texture x coordinate.
+ float line_c;
+};
+
+struct CmdRadGradRef // Typed handle for a CmdRadGrad stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdRadGrad_read uses offset >> 2 as the word index.
+};
+
+struct CmdRadGrad // Payload of command tag 7 in main0.
+{
+ uint index; // Word 0; used as the y coordinate when reading the `gradients` texture.
+ float4 mat; // Words 1-4; main0 maps a pixel p to mat.xz * p.x + mat.yw * p.y - xlat.
+ float2 xlat; // Words 5-6; subtracted after the matrix step above.
+ float2 c1; // Words 7-8; dotted with the mapped position (the `ba` term in main0).
+ float ra; // Word 9; scales dot(p, p) of the mapped position (the `ca` term in main0).
+ float roff; // Word 10; subtracted from sqrt(ba * ba + ca) - ba to give the gradient parameter.
+};
+
+struct CmdImageRef // Typed handle for a CmdImage stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdImage_read uses offset >> 2 as the word index.
+};
+
+struct CmdImage // Payload of command tag 8 in main0.
+{
+ uint index; // Word 0; stored by CmdImage_read but not consulted by fillImage.
+ int2 offset; // Word 1 split into two sign-extended 16-bit halves (low -> x, high -> y); added to the pixel position by fillImage.
+};
+
+struct CmdAlphaRef // Typed handle for a CmdAlpha stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdAlpha_read uses offset >> 2 as the word index.
+};
+
+struct CmdAlpha // Payload of command tag 4 in main0.
+{
+ float alpha; // Word 0, bits reinterpreted as float; main0 copies it into every area[] entry.
+};
+
+struct CmdEndClipRef // Typed handle for a CmdEndClip stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdEndClip_read uses offset >> 2 as the word index.
+};
+
+struct CmdEndClip // Payload of command tag 10 in main0.
+{
+ uint blend; // Word 0; passed as `mode` to mix_blend_compose().
+};
+
+struct CmdJumpRef // Typed handle for a CmdJump stored in the memory buffer.
+{
+ uint offset; // Byte offset; CmdJump_read uses offset >> 2 as the word index.
+};
+
+struct CmdJump // Payload of command tag 11 in main0.
+{
+ uint new_ref; // Word 0; main0 continues reading commands from this offset.
+};
+
+struct CmdRef // Cursor into the command list that main0 walks.
+{
+ uint offset; // Byte offset of the current command: Cmd_tag reads the word at offset >> 2, the Cmd_*_read helpers read from offset + 4.
+};
+
+struct CmdTag // Decoded command header word (see Cmd_tag).
+{
+ uint tag; // Low 16 bits of the header word; selects the case in main0's switch, 0 ends the list.
+ uint flags; // High 16 bits of the header word; main0 does not read it.
+};
+
+struct TileSegRef // Link to a TileSeg stored in the memory buffer.
+{
+ uint offset; // Byte offset; TileSeg_read uses offset >> 2 as the word index, and main0 stops walking a list when it is 0.
+};
+
+struct TileSeg // One entry of the linked segment list walked by the stroke and fill cases of main0.
+{
+ float2 origin; // Words 0-1; segment start point.
+ float2 vector; // Words 2-3; added to origin to get the segment end point.
+ float y_edge; // Word 4; used in the fill case together with sign(vector.x).
+ TileSegRef next; // Word 5; offset of the following segment, 0 when this is the last one.
+};
+
+struct Memory // Layout of the buffer bound at [[buffer(0)]] in main0.
+{
+ uint mem_offset; // Not read by any code in this file.
+ uint mem_error; // Not read by any code in this file.
+ uint blend_offset; // Not read by any code in this file (main0's local blend_offset comes from memory[] instead).
+ uint memory[1]; // Declared with one element but indexed with arbitrary word offsets by read_mem and main0.
+};
+
+struct Alloc_1 // Same single-field shape as Alloc; this is the type used for the Alloc members of Config.
+{
+ uint offset; // main0 copies conf.ptcl_alloc.offset from here into a plain Alloc.
+};
+
+struct Config // Configuration block read through ConfigBuf; main0 only uses width_in_tiles and ptcl_alloc.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles; // Row stride of the tile grid: tile_ix = gl_WorkGroupID.y * width_in_tiles + gl_WorkGroupID.x.
+ uint height_in_tiles;
+ Alloc_1 tile_alloc;
+ Alloc_1 bin_alloc;
+ Alloc_1 ptcl_alloc; // Base of the per-tile command lists; tile N starts N * 1024 past this offset.
+ Alloc_1 pathseg_alloc;
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc;
+ Alloc_1 drawmonoid_alloc;
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc;
+ Alloc_1 draw_bbox_alloc;
+ Alloc_1 drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Layout of the buffer bound at [[buffer(1)]] in main0 (const device reference).
+{
+ Config conf; // The only member; main0 reads conf.width_in_tiles and conf.ptcl_alloc.offset.
+};
+
+struct BlendBuf // Layout of the buffer bound at [[buffer(2)]] in main0.
+{
+ uint blend_mem[1]; // Declared with one element; main0 indexes it with base_ix + k once clip_depth reaches 4.
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); // Workgroup size constant (8, 4, 1); no code in this file reads it.
+
+// Returns an Alloc whose offset is a.offset + offset. `size` is accepted but not used.
+static inline __attribute__((always_inline)) Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) {
+    const uint start = a.offset + offset;
+    return Alloc{ start };
+}
+
+// Access check consulted by read_mem(); in this build every access is accepted and both arguments are ignored.
+static inline __attribute__((always_inline)) bool touch_mem(thread const Alloc& alloc, thread const uint& offset) {
+    const bool access_ok = true;
+    return access_ok;
+}
+
+// Loads one 32-bit word from v_297.memory at word index `offset`.
+// The access is first vetted by touch_mem(); a rejected access yields 0
+// and the buffer is not read.
+static inline __attribute__((always_inline)) uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297) {
+    const Alloc checked_alloc = alloc;
+    const uint word_ix = offset;
+    const bool allowed = touch_mem(checked_alloc, word_ix);
+    if (allowed) {
+        return v_297.memory[word_ix];
+    }
+    return 0u;
+}
+
+// Reads the header word of the command at byte offset ref.offset and splits it:
+// low 16 bits -> tag, high 16 bits -> flags.
+static inline __attribute__((always_inline)) CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const uint word_ix = ref.offset >> 2u;
+    const uint header = read_mem(a, word_ix, v_297);
+    const uint tag_bits = header & 65535u;
+    return CmdTag{ tag_bits, header >> 16u };
+}
+
+// Decodes a CmdStroke from the two words starting at byte offset ref.offset.
+//   word 0 -> tile_ref   (raw uint)
+//   word 1 -> half_width (bits reinterpreted as float)
+static inline __attribute__((always_inline)) CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint tile_ref_ix = base + 0u;
+    const uint half_width_ix = base + 1u;
+    const uint tile_ref_bits = read_mem(a, tile_ref_ix, v_297);
+    const uint half_width_bits = read_mem(a, half_width_ix, v_297);
+
+    CmdStroke stroke;
+    stroke.tile_ref = tile_ref_bits;
+    stroke.half_width = as_type<float>(half_width_bits);
+    return stroke;
+}
+
+// Reads the CmdStroke payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdStrokeRef payload = CmdStrokeRef{ ref.offset + 4u };
+    return CmdStroke_read(a, payload, v_297);
+}
+
+// Builds an Alloc that starts at `offset`.
+// `size` and `mem_ok` are part of the signature but are neither stored
+// nor consulted in this build.
+static inline __attribute__((always_inline)) Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) {
+    const Alloc fresh = Alloc{ offset };
+    return fresh;
+}
+
+// Decodes the TileSeg stored at byte offset ref.offset, six consecutive words:
+//   words 0-1 -> origin.x, origin.y (bits reinterpreted as float)
+//   words 2-3 -> vector.x, vector.y (bits reinterpreted as float)
+//   word  4   -> y_edge             (bits reinterpreted as float)
+//   word  5   -> next.offset        (raw uint)
+// Each word is fetched through read_mem() with `a` as the allocation being
+// accessed.
+static inline __attribute__((always_inline)) TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint origin_x_ix = base + 0u;
+    const uint origin_y_ix = base + 1u;
+    const uint vector_x_ix = base + 2u;
+    const uint vector_y_ix = base + 3u;
+    const uint y_edge_ix = base + 4u;
+    const uint next_ix = base + 5u;
+    const uint origin_x_bits = read_mem(a, origin_x_ix, v_297);
+    const uint origin_y_bits = read_mem(a, origin_y_ix, v_297);
+    const uint vector_x_bits = read_mem(a, vector_x_ix, v_297);
+    const uint vector_y_bits = read_mem(a, vector_y_ix, v_297);
+    const uint y_edge_bits = read_mem(a, y_edge_ix, v_297);
+    const uint next_bits = read_mem(a, next_ix, v_297);
+
+    TileSeg seg;
+    seg.origin = float2(as_type<float>(origin_x_bits), as_type<float>(origin_y_bits));
+    seg.vector = float2(as_type<float>(vector_x_bits), as_type<float>(vector_y_bits));
+    seg.y_edge = as_type<float>(y_edge_bits);
+    seg.next = TileSegRef{ next_bits };
+    return seg;
+}
+
+// Pixel offset of chunk i: x alternates between 0 and 8 with the parity of i, y grows by 4 for every two chunks.
+static inline __attribute__((always_inline)) uint2 chunk_offset(thread const uint& i) {
+    const uint dx = (i % 2u) * 8u;
+    return uint2(dx, (i / 2u) * 4u);
+}
+
+// Decodes a CmdFill from the two words starting at byte offset ref.offset.
+//   word 0 -> tile_ref (raw uint)
+//   word 1 -> backdrop (converted with int())
+static inline __attribute__((always_inline)) CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint tile_ref_ix = base + 0u;
+    const uint backdrop_ix = base + 1u;
+    const uint tile_ref_bits = read_mem(a, tile_ref_ix, v_297);
+    const uint backdrop_bits = read_mem(a, backdrop_ix, v_297);
+
+    CmdFill fill;
+    fill.tile_ref = tile_ref_bits;
+    fill.backdrop = int(backdrop_bits);
+    return fill;
+}
+
+// Reads the CmdFill payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdFillRef payload = CmdFillRef{ ref.offset + 4u };
+    return CmdFill_read(a, payload, v_297);
+}
+
+// Decodes a CmdAlpha from the single word at byte offset ref.offset:
+// word 0 -> alpha (bits reinterpreted as float).
+static inline __attribute__((always_inline)) CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint alpha_ix = base + 0u;
+    const uint alpha_bits = read_mem(a, alpha_ix, v_297);
+
+    CmdAlpha cmd;
+    cmd.alpha = as_type<float>(alpha_bits);
+    return cmd;
+}
+
+// Reads the CmdAlpha payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdAlphaRef payload = CmdAlphaRef{ ref.offset + 4u };
+    return CmdAlpha_read(a, payload, v_297);
+}
+
+// Decodes a CmdColor from the single word at byte offset ref.offset:
+// word 0 -> rgba_color (raw uint).
+static inline __attribute__((always_inline)) CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint colour_ix = base + 0u;
+    const uint colour_bits = read_mem(a, colour_ix, v_297);
+
+    CmdColor cmd;
+    cmd.rgba_color = colour_bits;
+    return cmd;
+}
+
+// Reads the CmdColor payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdColorRef payload = CmdColorRef{ ref.offset + 4u };
+    return CmdColor_read(a, payload, v_297);
+}
+
+// Colour conversion hook applied to decoded colours; in this build it is the identity and returns its input unchanged.
+static inline __attribute__((always_inline)) float3 fromsRGB(thread const float3& srgb) {
+    const float3 unchanged = srgb;
+    return unchanged;
+}
+
+// Expands a packed 8-bit-per-channel colour into a float4. The .wzyx swizzle reverses
+// the unpacked component order; xyz then pass through fromsRGB(), w is kept as is.
+static inline __attribute__((always_inline)) float4 unpacksRGB(thread const uint& srgba) {
+    const float4 unpacked = unpack_unorm4x8_to_float(srgba).wzyx;
+    const float3 rgb_in = unpacked.xyz;
+    return float4(fromsRGB(rgb_in), unpacked.w);
+}
+
+// Decodes a CmdLinGrad from the four words starting at byte offset ref.offset.
+//   word 0 -> index  (raw uint)
+//   word 1 -> line_x (bits reinterpreted as float)
+//   word 2 -> line_y (bits reinterpreted as float)
+//   word 3 -> line_c (bits reinterpreted as float)
+static inline __attribute__((always_inline)) CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint index_ix = base + 0u;
+    const uint line_x_ix = base + 1u;
+    const uint line_y_ix = base + 2u;
+    const uint line_c_ix = base + 3u;
+    const uint index_bits = read_mem(a, index_ix, v_297);
+    const uint line_x_bits = read_mem(a, line_x_ix, v_297);
+    const uint line_y_bits = read_mem(a, line_y_ix, v_297);
+    const uint line_c_bits = read_mem(a, line_c_ix, v_297);
+
+    CmdLinGrad grad;
+    grad.index = index_bits;
+    grad.line_x = as_type<float>(line_x_bits);
+    grad.line_y = as_type<float>(line_y_bits);
+    grad.line_c = as_type<float>(line_c_bits);
+    return grad;
+}
+
+// Reads the CmdLinGrad payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdLinGradRef payload = CmdLinGradRef{ ref.offset + 4u };
+    return CmdLinGrad_read(a, payload, v_297);
+}
+
+// Decodes a CmdRadGrad from the eleven words starting at byte offset ref.offset.
+//   word 0     -> index    (raw uint)
+//   words 1-4  -> mat.xyzw (bits reinterpreted as float)
+//   words 5-6  -> xlat.xy  (bits reinterpreted as float)
+//   words 7-8  -> c1.xy    (bits reinterpreted as float)
+//   word 9     -> ra       (bits reinterpreted as float)
+//   word 10    -> roff     (bits reinterpreted as float)
+// Every word is fetched through read_mem() with `a` as the allocation being
+// accessed, in ascending word order.
+// The struct is assembled only after all eleven words have been read.
+static inline __attribute__((always_inline)) CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+
+    const uint index_ix = base + 0u;
+    const uint mat_x_ix = base + 1u;
+    const uint mat_y_ix = base + 2u;
+    const uint mat_z_ix = base + 3u;
+    const uint mat_w_ix = base + 4u;
+    const uint xlat_x_ix = base + 5u;
+    const uint xlat_y_ix = base + 6u;
+    const uint c1_x_ix = base + 7u;
+    const uint c1_y_ix = base + 8u;
+    const uint ra_ix = base + 9u;
+    const uint roff_ix = base + 10u;
+
+    const uint index_bits = read_mem(a, index_ix, v_297);
+    const uint mat_x_bits = read_mem(a, mat_x_ix, v_297);
+    const uint mat_y_bits = read_mem(a, mat_y_ix, v_297);
+    const uint mat_z_bits = read_mem(a, mat_z_ix, v_297);
+    const uint mat_w_bits = read_mem(a, mat_w_ix, v_297);
+    const uint xlat_x_bits = read_mem(a, xlat_x_ix, v_297);
+    const uint xlat_y_bits = read_mem(a, xlat_y_ix, v_297);
+    const uint c1_x_bits = read_mem(a, c1_x_ix, v_297);
+    const uint c1_y_bits = read_mem(a, c1_y_ix, v_297);
+    const uint ra_bits = read_mem(a, ra_ix, v_297);
+    const uint roff_bits = read_mem(a, roff_ix, v_297);
+
+    CmdRadGrad grad;
+    grad.index = index_bits;
+    grad.mat = float4(as_type<float>(mat_x_bits), as_type<float>(mat_y_bits), as_type<float>(mat_z_bits), as_type<float>(mat_w_bits));
+    grad.xlat = float2(as_type<float>(xlat_x_bits), as_type<float>(xlat_y_bits));
+    grad.c1 = float2(as_type<float>(c1_x_bits), as_type<float>(c1_y_bits));
+    grad.ra = as_type<float>(ra_bits);
+    grad.roff = as_type<float>(roff_bits);
+    return grad;
+}
+
+// Reads the CmdRadGrad payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdRadGradRef payload = CmdRadGradRef{ ref.offset + 4u };
+    return CmdRadGrad_read(a, payload, v_297);
+}
+
+// Decodes a CmdImage: word 0 -> index; word 1 packs two signed 16-bit values,
+// low half -> offset.x, high half -> offset.y (sign-extended by the int right shifts).
+static inline __attribute__((always_inline)) CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint index_ix = base + 0u;
+    const uint packed_ix = base + 1u;
+    const uint index_bits = read_mem(a, index_ix, v_297);
+    const uint packed_xy = read_mem(a, packed_ix, v_297);
+    const int offset_x = int(packed_xy << 16u) >> 16;
+    const int offset_y = int(packed_xy) >> 16;
+    CmdImage img;
+    img.index = index_bits;
+    img.offset = int2(offset_x, offset_y);
+    return img;
+}
+
+// Reads the CmdImage payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdImageRef payload = CmdImageRef{ ref.offset + 4u };
+    return CmdImage_read(a, payload, v_297);
+}
+
+// Reads one texel of image_atlas for each of the 8 chunks handled by this thread.
+// The texel for chunk c is read at xy + chunk_offset(c) + cmd_img.offset.
+// Its xyz pass through fromsRGB(); w is copied unchanged.
+// cmd_img.index is not consulted here.
+// Returns the 8 colours in chunk order.
+static inline __attribute__((always_inline)) spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, texture2d<float> image_atlas) {
+    spvUnsafeArray<float4, 8> texels;
+    for (uint chunk = 0u; chunk < 8u; ++chunk) {
+        const uint chunk_id = chunk;
+        const uint2 chunk_xy = xy + chunk_offset(chunk_id);
+        const int2 atlas_pos = int2(chunk_xy) + cmd_img.offset;
+        const float4 texel = image_atlas.read(uint2(atlas_pos));
+        const float3 texel_rgb = texel.xyz;
+        const float3 converted = fromsRGB(texel_rgb);
+        texels[chunk] = float4(converted.x, converted.y, converted.z, texel.w);
+    }
+    return texels;
+}
+
+// Colour conversion hook applied before colours are packed or written; in this build it is the identity.
+static inline __attribute__((always_inline)) float3 tosRGB(thread const float3& rgb) {
+    const float3 unchanged = rgb;
+    return unchanged;
+}
+
+// Packs `rgba` into 8 bits per channel. Side effect: `rgba` is first overwritten with
+// float4(tosRGB(rgba.xyz), rgba.w); the .wzyx swizzle reverses the component order for packing.
+static inline __attribute__((always_inline)) uint packsRGB(thread float4& rgba) {
+    const float3 rgb_in = rgba.xyz;
+    rgba = float4(tosRGB(rgb_in), rgba.w);
+    return pack_float_to_unorm4x8(rgba.wzyx);
+}
+
+// Decodes a CmdEndClip from the single word at byte offset ref.offset:
+// word 0 -> blend (raw uint).
+static inline __attribute__((always_inline)) CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint blend_ix = base + 0u;
+    const uint blend_bits = read_mem(a, blend_ix, v_297);
+
+    CmdEndClip cmd;
+    cmd.blend = blend_bits;
+    return cmd;
+}
+
+// Reads the CmdEndClip payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdEndClipRef payload = CmdEndClipRef{ ref.offset + 4u };
+    return CmdEndClip_read(a, payload, v_297);
+}
+
+// Screen blend of backdrop cb and source cs, per component: (cb + cs) - (cb * cs).
+static inline __attribute__((always_inline)) float3 screen(thread const float3& cb, thread const float3& cs) {
+    const float3 product = cb * cs;
+    return (cb + cs) - product;
+}
+
+// Hard-light blend, per component: where cs <= 0.5 the result is (cb * 2) * cs,
+// elsewhere it is screen(cb, cs * 2 - 1).
+static inline __attribute__((always_inline)) float3 hard_light(thread const float3& cb, thread const float3& cs) {
+    const float3 shifted_cs = (cs * 2.0) - float3(1.0);
+    const float3 low_half = (cb * 2.0) * cs;
+    return select(screen(cb, shifted_cs), low_half, cs <= float3(0.5));
+}
+
+// Colour-dodge blend for a single channel (cb = backdrop, cs = source).
+//   cb == 0   -> 0
+//   cs == 1   -> 1   (only reached when cb != 0)
+//   otherwise -> min(1, cb / (1 - cs))
+// The cs == 1 case is handled before the division, so the quotient is
+// never formed with a zero divisor.
+static inline __attribute__((always_inline)) float color_dodge(thread const float& cb, thread const float& cs) {
+    if (cb == 0.0) {
+        return 0.0;
+    }
+
+    if (cs == 1.0) {
+        return 1.0;
+    }
+
+    const float headroom = 1.0 - cs;
+    const float ratio = cb / headroom;
+    return fast::min(1.0, ratio);
+}
+
+// Colour-burn blend for a single channel (cb = backdrop, cs = source).
+//   cb == 1   -> 1
+//   cs == 0   -> 0   (only reached when cb != 1)
+//   otherwise -> 1 - min(1, (1 - cb) / cs)
+// The cs == 0 case is handled before the division, so the quotient is
+// never formed with a zero divisor.
+static inline __attribute__((always_inline)) float color_burn(thread const float& cb, thread const float& cs) {
+    if (cb == 1.0) {
+        return 1.0;
+    }
+
+    if (cs == 0.0) {
+        return 0.0;
+    }
+
+    const float deficit = 1.0 - cb;
+    const float ratio = deficit / cs;
+    return 1.0 - fast::min(1.0, ratio);
+}
+
+// Soft-light blend per component. d = ((16cb - 12)cb + 4)cb where cb <= 0.25, sqrt(cb) elsewhere; cs <= 0.5 picks the cb - ... form.
+static inline __attribute__((always_inline)) float3 soft_light(thread const float3& cb, thread const float3& cs) {
+    const float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25));
+    const float3 upper = cb + (((cs * 2.0) - float3(1.0)) * (d - cb));
+    return select(upper, cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5));
+}
+
+// Saturation of c: its largest component minus its smallest component.
+static inline __attribute__((always_inline)) float sat(thread const float3& c) {
+    const float largest = fast::max(c.x, fast::max(c.y, c.z));
+    return largest - fast::min(c.x, fast::min(c.y, c.z));
+}
+
+// Rescales a triple to saturation s, in place (set_sat passes the channels in ascending order):
+//   cmax > cmin : cmid = (cmid - cmin) * s / (cmax - cmin), then cmax = s
+//   otherwise   : cmid = 0 and cmax = 0
+// cmin always ends up 0.
+static inline __attribute__((always_inline)) void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s) {
+    const bool has_range = cmax > cmin;
+    if (!has_range) {
+        cmid = 0.0;
+        cmax = 0.0;
+    } else {
+        cmid = ((cmid - cmin) * s) / (cmax - cmin);
+        cmax = s;
+    }
+    cmin = 0.0;
+}
+
+static inline __attribute__((always_inline))
+float3 set_sat(thread float3& c, thread const float& s) // Orders c's channels by comparison, rescales them with set_sat_inner(min, mid, max, s), writes them back into c and returns c.
+{
+ if (c.x <= c.y)
+ {
+ if (c.y <= c.z) // x <= y <= z
+ {
+ float param = c.x;
+ float param_1 = c.y;
+ float param_2 = c.z;
+ float param_3 = s;
+ set_sat_inner(param, param_1, param_2, param_3);
+ c.x = param;
+ c.y = param_1;
+ c.z = param_2;
+ }
+ else
+ {
+ if (c.x <= c.z) // x <= z < y
+ {
+ float param_4 = c.x;
+ float param_5 = c.z;
+ float param_6 = c.y;
+ float param_7 = s;
+ set_sat_inner(param_4, param_5, param_6, param_7);
+ c.x = param_4;
+ c.z = param_5;
+ c.y = param_6;
+ }
+ else // z < x <= y
+ {
+ float param_8 = c.z;
+ float param_9 = c.x;
+ float param_10 = c.y;
+ float param_11 = s;
+ set_sat_inner(param_8, param_9, param_10, param_11);
+ c.z = param_8;
+ c.x = param_9;
+ c.y = param_10;
+ }
+ }
+ }
+ else
+ {
+ if (c.x <= c.z) // y < x <= z
+ {
+ float param_12 = c.y;
+ float param_13 = c.x;
+ float param_14 = c.z;
+ float param_15 = s;
+ set_sat_inner(param_12, param_13, param_14, param_15);
+ c.y = param_12;
+ c.x = param_13;
+ c.z = param_14;
+ }
+ else
+ {
+ if (c.y <= c.z) // y <= z < x
+ {
+ float param_16 = c.y;
+ float param_17 = c.z;
+ float param_18 = c.x;
+ float param_19 = s;
+ set_sat_inner(param_16, param_17, param_18, param_19);
+ c.y = param_16;
+ c.z = param_17;
+ c.x = param_18;
+ }
+ else // z < y < x
+ {
+ float param_20 = c.z;
+ float param_21 = c.y;
+ float param_22 = c.x;
+ float param_23 = s;
+ set_sat_inner(param_20, param_21, param_22, param_23);
+ c.z = param_20;
+ c.y = param_21;
+ c.x = param_22;
+ }
+ }
+ }
+ return c;
+}
+
+// Luminosity of c: dot product with the weights (0.30, 0.59, 0.11),
+// written below with the full decimal expansions used by the generated source.
+static inline __attribute__((always_inline)) float lum(thread const float3& c) {
+    const float3 weights = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375);
+    return dot(c, weights);
+}
+
+// Rescales c around its luminosity L = lum(c) when a component is below 0 or above 1; c is modified in place and returned.
+// L, the minimum n and the maximum x are all measured once, before any adjustment,
+// so the x > 1 step still uses the original x even if the n < 0 step changed c.
+static inline __attribute__((always_inline)) float3 clip_color(thread float3& c) {
+    const float3 original = c;
+    const float L = lum(original);
+    const float n = fast::min(c.x, fast::min(c.y, c.z));
+    const float x = fast::max(c.x, fast::max(c.y, c.z));
+    const float3 grey = float3(L);
+    if (n < 0.0) {
+        c = grey + (((c - grey) * L) / float3(L - n));
+    }
+    if (x > 1.0) {
+        c = grey + (((c - grey) * (1.0 - L)) / float3(x - L));
+    }
+    return c;
+}
+
+// Adds (l - lum(c)) to every component of c, then passes the shifted colour
+// through clip_color() and returns the result.
+static inline __attribute__((always_inline)) float3 set_lum(thread const float3& c, thread const float& l) {
+    const float3 colour_in = c;
+    const float delta = l - lum(colour_in);
+    float3 shifted = c + float3(delta);
+    return clip_color(shifted);
+}
+
+static inline __attribute__((always_inline))
+float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode) // Blends backdrop cb with source cs using the formula selected by `mode`; unlisted modes return cs.
+{
+ float3 b = float3(0.0);
+ switch (mode)
+ {
+ case 1u: // per-component product
+ {
+ b = cb * cs;
+ break;
+ }
+ case 2u: // screen(cb, cs)
+ {
+ float3 param = cb;
+ float3 param_1 = cs;
+ b = screen(param, param_1);
+ break;
+ }
+ case 3u: // hard_light with the arguments swapped: hard_light(cs, cb)
+ {
+ float3 param_2 = cs;
+ float3 param_3 = cb;
+ b = hard_light(param_2, param_3);
+ break;
+ }
+ case 4u: // per-component minimum
+ {
+ b = fast::min(cb, cs);
+ break;
+ }
+ case 5u: // per-component maximum
+ {
+ b = fast::max(cb, cs);
+ break;
+ }
+ case 6u: // color_dodge applied to each channel separately
+ {
+ float param_4 = cb.x;
+ float param_5 = cs.x;
+ float param_6 = cb.y;
+ float param_7 = cs.y;
+ float param_8 = cb.z;
+ float param_9 = cs.z;
+ b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+ break;
+ }
+ case 7u: // color_burn applied to each channel separately
+ {
+ float param_10 = cb.x;
+ float param_11 = cs.x;
+ float param_12 = cb.y;
+ float param_13 = cs.y;
+ float param_14 = cb.z;
+ float param_15 = cs.z;
+ b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+ break;
+ }
+ case 8u: // hard_light(cb, cs)
+ {
+ float3 param_16 = cb;
+ float3 param_17 = cs;
+ b = hard_light(param_16, param_17);
+ break;
+ }
+ case 9u: // soft_light(cb, cs)
+ {
+ float3 param_18 = cb;
+ float3 param_19 = cs;
+ b = soft_light(param_18, param_19);
+ break;
+ }
+ case 10u: // absolute difference
+ {
+ b = abs(cb - cs);
+ break;
+ }
+ case 11u: // cb + cs - 2 * cb * cs
+ {
+ b = (cb + cs) - ((cb * 2.0) * cs);
+ break;
+ }
+ case 12u: // source colour given the backdrop's saturation, then the backdrop's luminosity
+ {
+ float3 param_20 = cb;
+ float3 param_21 = cs;
+ float param_22 = sat(param_20);
+ float3 _1340 = set_sat(param_21, param_22); // set_sat mutates its first argument, hence the local copy
+ float3 param_23 = cb;
+ float3 param_24 = _1340;
+ float param_25 = lum(param_23);
+ b = set_lum(param_24, param_25);
+ break;
+ }
+ case 13u: // backdrop colour given the source's saturation, then the backdrop's luminosity
+ {
+ float3 param_26 = cs;
+ float3 param_27 = cb;
+ float param_28 = sat(param_26);
+ float3 _1354 = set_sat(param_27, param_28);
+ float3 param_29 = cb;
+ float3 param_30 = _1354;
+ float param_31 = lum(param_29);
+ b = set_lum(param_30, param_31);
+ break;
+ }
+ case 14u: // source colour given the backdrop's luminosity
+ {
+ float3 param_32 = cb;
+ float3 param_33 = cs;
+ float param_34 = lum(param_32);
+ b = set_lum(param_33, param_34);
+ break;
+ }
+ case 15u: // backdrop colour given the source's luminosity
+ {
+ float3 param_35 = cs;
+ float3 param_36 = cb;
+ float param_37 = lum(param_35);
+ b = set_lum(param_36, param_37);
+ break;
+ }
+ default: // any other mode (including 0): the source colour unchanged
+ {
+ b = cs;
+ break;
+ }
+ }
+ return b;
+}
+
+static inline __attribute__((always_inline))
+float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode) // Combines source (cs, as) and backdrop (cb, ab) with weights chosen by `mode`.
+{
+ float fa = 0.0; // weight applied to the source term
+ float fb = 0.0; // weight applied to the backdrop term
+ switch (mode) // every case except 13 only selects fa and fb; the combination happens after the switch
+ {
+ case 1u:
+ {
+ fa = 1.0;
+ fb = 0.0;
+ break;
+ }
+ case 2u:
+ {
+ fa = 0.0;
+ fb = 1.0;
+ break;
+ }
+ case 3u:
+ {
+ fa = 1.0;
+ fb = 1.0 - as;
+ break;
+ }
+ case 4u:
+ {
+ fa = 1.0 - ab;
+ fb = 1.0;
+ break;
+ }
+ case 5u:
+ {
+ fa = ab;
+ fb = 0.0;
+ break;
+ }
+ case 6u:
+ {
+ fa = 0.0;
+ fb = as;
+ break;
+ }
+ case 7u:
+ {
+ fa = 1.0 - ab;
+ fb = 0.0;
+ break;
+ }
+ case 8u:
+ {
+ fa = 0.0;
+ fb = 1.0 - as;
+ break;
+ }
+ case 9u:
+ {
+ fa = ab;
+ fb = 1.0 - as;
+ break;
+ }
+ case 10u:
+ {
+ fa = 1.0 - ab;
+ fb = as;
+ break;
+ }
+ case 11u:
+ {
+ fa = 1.0 - ab;
+ fb = 1.0 - as;
+ break;
+ }
+ case 12u:
+ {
+ fa = 1.0;
+ fb = 1.0;
+ break;
+ }
+ case 13u: // the only early return: alpha-weighted sum of both inputs, each component capped at 1
+ {
+ return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
+ }
+ default: // any other mode leaves fa = fb = 0, so the result below is all zeros
+ {
+ break;
+ }
+ }
+ float as_fa = as * fa;
+ float ab_fb = ab * fb;
+ float3 co = (cs * as_fa) + (cb * ab_fb); // colour = cs * as * fa + cb * ab * fb
+ return float4(co, as_fa + ab_fb); // alpha = as * fa + ab * fb
+}
+
+// Blends `src` over `backdrop` according to `mode`.
+//   mode >> 8   -> blend mode handed to mix_blend()
+//   mode & 255  -> compose mode (3 is handled here, anything else by mix_compose())
+// Fast path: when the low 15 bits of mode equal 3 the result is simply
+// backdrop * (1 - src.w) + src.
+// Otherwise both colours are divided by their alpha (plus 1e-15, which keeps the divisor
+// non-zero when alpha is 0), blended, and the blend is mixed in by backdrop.w.
+static inline __attribute__((always_inline)) float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode) {
+    if ((mode & 32767u) == 3u) {
+        return (backdrop * (1.0 - src.w)) + src;
+    }
+
+    // Divide each colour by its own alpha.
+    const float src_a_recip = 1.0 / (src.w + 1.0000000036274937255387218471014e-15);
+    const float backdrop_a_recip = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15);
+    const float3 src_colour = src.xyz * src_a_recip;
+    const float3 cb = backdrop.xyz * backdrop_a_recip;
+
+    // Blend, then weight the blended colour by the backdrop alpha.
+    const uint blend_mode = mode >> 8u;
+    const float3 blended = mix_blend(cb, src_colour, blend_mode);
+    const float3 cs = mix(src_colour, blended, float3(backdrop.w));
+
+    // Compose: mode 3 inline, every other mode through mix_compose().
+    const uint comp_mode = mode & 255u;
+    if (comp_mode != 3u) {
+        const float backdrop_alpha = backdrop.w;
+        const float src_alpha = src.w;
+        return mix_compose(cb, cs, backdrop_alpha, src_alpha, comp_mode);
+    }
+    const float3 co = mix(backdrop.xyz, cs, float3(src.w));
+    return float4(co, src.w + (backdrop.w * (1.0 - src.w)));
+}
+
+// Decodes a CmdJump from the single word at byte offset ref.offset:
+// word 0 -> new_ref (raw uint).
+static inline __attribute__((always_inline)) CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297) {
+    const uint base = ref.offset >> 2u;
+    const uint target_ix = base + 0u;
+    const uint target_bits = read_mem(a, target_ix, v_297);
+
+    CmdJump cmd;
+    cmd.new_ref = target_bits;
+    return cmd;
+}
+
+// Reads the CmdJump payload of the command at `ref`. The payload begins
+// 4 bytes past ref.offset, i.e. right after the header word that Cmd_tag()
+// decodes.
+static inline __attribute__((always_inline)) CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) {
+    const CmdJumpRef payload = CmdJumpRef{ ref.offset + 4u };
+    return CmdJump_read(a, payload, v_297);
+}
+
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], device BlendBuf& _2506 [[buffer(2)]], texture2d<float, access::write> image [[texture(3)]], texture2d<float> image_atlas [[texture(4)]], texture2d<float> gradients [[texture(5)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{ // Per-tile command interpreter: each thread walks its tile's command list and shades 8 pixels (one per chunk).
+ uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x; // row-major tile index
+ Alloc param;
+ param.offset = _1681.conf.ptcl_alloc.offset;
+ uint param_1 = tile_ix * 1024u; // each tile's command list starts 1024 bytes after the previous tile's
+ uint param_2 = 1024u;
+ Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+ CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+ uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; // first word of the list; used below to locate spilled clip layers in _2506
+ cmd_ref.offset += 4u; // commands start right after that word
+ uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); // workgroups are 16 pixels apart on both axes
+ float2 xy = float2(xy_uint);
+ spvUnsafeArray<float4, 8> rgba; // accumulated colour, one entry per chunk
+ for (uint i = 0u; i < 8u; i++)
+ {
+ rgba[i] = float4(0.0);
+ }
+ uint clip_depth = 0u; // incremented by tag 9, decremented by tag 10
+ spvUnsafeArray<float, 8> df; // per-chunk distance to the nearest stroke segment (tag 2 only)
+ TileSegRef tile_seg_ref;
+ spvUnsafeArray<float, 8> area; // per-chunk coverage set by tags 1-4 and consumed by tags 5-8 and 10
+ spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack; // packed colours for the first 4 clip levels; deeper levels go to _2506.blend_mem
+ uint base_ix_1;
+ uint bg_rgba;
+ while (true)
+ {
+ Alloc param_3 = cmd_alloc;
+ CmdRef param_4 = cmd_ref;
+ uint tag = Cmd_tag(param_3, param_4, v_297).tag;
+ if (tag == 0u) // tag 0 terminates the command list
+ {
+ break;
+ }
+ switch (tag) // tags with no case here fall through the switch without advancing cmd_ref
+ {
+ case 2u: // stroke: coverage from the distance to the nearest segment
+ {
+ Alloc param_5 = cmd_alloc;
+ CmdRef param_6 = cmd_ref;
+ CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297);
+ for (uint k = 0u; k < 8u; k++)
+ {
+ df[k] = 1000000000.0; // large starting distance, replaced by the first real minimum
+ }
+ tile_seg_ref = TileSegRef{ stroke.tile_ref };
+ do
+ {
+ uint param_7 = tile_seg_ref.offset;
+ uint param_8 = 24u; // six 4-byte words, matching what TileSeg_read fetches
+ bool param_9 = true;
+ Alloc param_10 = new_alloc(param_7, param_8, param_9);
+ TileSegRef param_11 = tile_seg_ref;
+ TileSeg seg = TileSeg_read(param_10, param_11, v_297);
+ float2 line_vec = seg.vector;
+ for (uint k_1 = 0u; k_1 < 8u; k_1++)
+ {
+ float2 dpos = (xy + float2(0.5)) - seg.origin; // position of the pixel centre relative to the segment start
+ uint param_12 = k_1;
+ dpos += float2(chunk_offset(param_12));
+ float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); // projection onto the segment, clamped to its ends
+ df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos));
+ }
+ tile_seg_ref = seg.next;
+ } while (tile_seg_ref.offset != 0u); // a next offset of 0 ends the segment list
+ for (uint k_2 = 0u; k_2 < 8u; k_2++)
+ {
+ area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0);
+ }
+ cmd_ref.offset += 12u;
+ break;
+ }
+ case 1u: // fill: coverage accumulated from the segment list, starting at the backdrop value
+ {
+ Alloc param_13 = cmd_alloc;
+ CmdRef param_14 = cmd_ref;
+ CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297);
+ for (uint k_3 = 0u; k_3 < 8u; k_3++)
+ {
+ area[k_3] = float(fill.backdrop);
+ }
+ tile_seg_ref = TileSegRef{ fill.tile_ref };
+ do
+ {
+ uint param_15 = tile_seg_ref.offset;
+ uint param_16 = 24u;
+ bool param_17 = true;
+ Alloc param_18 = new_alloc(param_15, param_16, param_17);
+ TileSegRef param_19 = tile_seg_ref;
+ TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297);
+ for (uint k_4 = 0u; k_4 < 8u; k_4++)
+ {
+ uint param_20 = k_4;
+ float2 my_xy = xy + float2(chunk_offset(param_20));
+ float2 start = seg_1.origin - my_xy;
+ float2 end = start + seg_1.vector;
+ float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0)); // the segment's y extent clipped to this pixel row
+ if ((isunordered(window.x, window.y) || window.x != window.y)) // skip segments with no vertical extent inside the pixel (NaN counts as "different")
+ {
+ float2 t_1 = (window - float2(start.y)) / float2(seg_1.vector.y);
+ float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y));
+ float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07; // the small bias keeps xmax - xmin below from being zero
+ float xmax = fast::max(xs.x, xs.y);
+ float b = fast::min(xmax, 1.0);
+ float c = fast::max(b, 0.0);
+ float d = fast::max(xmin, 0.0);
+ float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin);
+ area[k_4] += (a * (window.x - window.y));
+ }
+ area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0));
+ }
+ tile_seg_ref = seg_1.next;
+ } while (tile_seg_ref.offset != 0u);
+ for (uint k_5 = 0u; k_5 < 8u; k_5++)
+ {
+ area[k_5] = fast::min(abs(area[k_5]), 1.0); // final coverage is |area| capped at 1
+ }
+ cmd_ref.offset += 12u;
+ break;
+ }
+ case 3u: // full coverage for every chunk
+ {
+ for (uint k_6 = 0u; k_6 < 8u; k_6++)
+ {
+ area[k_6] = 1.0;
+ }
+ cmd_ref.offset += 4u;
+ break;
+ }
+ case 4u: // constant coverage taken from the command
+ {
+ Alloc param_21 = cmd_alloc;
+ CmdRef param_22 = cmd_ref;
+ CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297);
+ for (uint k_7 = 0u; k_7 < 8u; k_7++)
+ {
+ area[k_7] = alpha.alpha;
+ }
+ cmd_ref.offset += 8u;
+ break;
+ }
+ case 5u: // solid colour, scaled by coverage and composited over rgba
+ {
+ Alloc param_23 = cmd_alloc;
+ CmdRef param_24 = cmd_ref;
+ CmdColor color = Cmd_Color_read(param_23, param_24, v_297);
+ uint param_25 = color.rgba_color;
+ float4 fg = unpacksRGB(param_25);
+ for (uint k_8 = 0u; k_8 < 8u; k_8++)
+ {
+ float4 fg_k = fg * area[k_8];
+ rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k; // existing colour scaled by (1 - fg alpha), plus fg
+ }
+ cmd_ref.offset += 8u;
+ break;
+ }
+ case 6u: // linear gradient looked up in the `gradients` texture
+ {
+ Alloc param_26 = cmd_alloc;
+ CmdRef param_27 = cmd_ref;
+ CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297);
+ float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
+ for (uint k_9 = 0u; k_9 < 8u; k_9++)
+ {
+ uint param_28 = k_9;
+ float2 chunk_xy = float2(chunk_offset(param_28));
+ float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y);
+ int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); // gradient parameter mapped to texel columns 0..511
+ float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); // row = gradient index
+ float3 param_29 = fg_rgba.xyz;
+ float3 _2257 = fromsRGB(param_29);
+ fg_rgba.x = _2257.x;
+ fg_rgba.y = _2257.y;
+ fg_rgba.z = _2257.z;
+ float4 fg_k_1 = fg_rgba * area[k_9];
+ rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
+ }
+ cmd_ref.offset += 20u;
+ break;
+ }
+ case 7u: // radial gradient looked up in the `gradients` texture
+ {
+ Alloc param_30 = cmd_alloc;
+ CmdRef param_31 = cmd_ref;
+ CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297);
+ for (uint k_10 = 0u; k_10 < 8u; k_10++)
+ {
+ uint param_32 = k_10;
+ float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+ my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; // 2x2 transform followed by a translation
+ float ba = dot(my_xy_1, rad.c1);
+ float ca = rad.ra * dot(my_xy_1, my_xy_1);
+ float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+ int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
+ float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
+ float3 param_33 = fg_rgba_1.xyz;
+ float3 _2367 = fromsRGB(param_33);
+ fg_rgba_1.x = _2367.x;
+ fg_rgba_1.y = _2367.y;
+ fg_rgba_1.z = _2367.z;
+ float4 fg_k_2 = fg_rgba_1 * area[k_10];
+ rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
+ }
+ cmd_ref.offset += 48u;
+ break;
+ }
+ case 8u: // image: texels fetched by fillImage, scaled by coverage and composited
+ {
+ Alloc param_34 = cmd_alloc;
+ CmdRef param_35 = cmd_ref;
+ CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297);
+ uint2 param_36 = xy_uint;
+ CmdImage param_37 = fill_img;
+ spvUnsafeArray<float4, 8> img;
+ img = fillImage(param_36, param_37, image_atlas);
+ for (uint k_11 = 0u; k_11 < 8u; k_11++)
+ {
+ float4 fg_k_3 = img[k_11] * area[k_11];
+ rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3;
+ }
+ cmd_ref.offset += 12u;
+ break;
+ }
+ case 9u: // push: save the current colours as a packed layer and restart from transparent black
+ {
+ if (clip_depth < 4u) // the first four levels live in the thread-local blend_stack
+ {
+ for (uint k_12 = 0u; k_12 < 8u; k_12++)
+ {
+ float4 param_38 = float4(rgba[k_12]);
+ uint _2472 = packsRGB(param_38);
+ blend_stack[clip_depth][k_12] = _2472;
+ rgba[k_12] = float4(0.0);
+ }
+ }
+ else // deeper levels spill to _2506.blend_mem
+ {
+ uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); // 256 words per spilled level, 8 consecutive words per thread
+ for (uint k_13 = 0u; k_13 < 8u; k_13++)
+ {
+ float4 param_39 = float4(rgba[k_13]);
+ uint _2519 = packsRGB(param_39);
+ _2506.blend_mem[base_ix + k_13] = _2519;
+ rgba[k_13] = float4(0.0);
+ }
+ }
+ clip_depth++;
+ cmd_ref.offset += 4u;
+ break;
+ }
+ case 10u: // pop: reload the saved layer and blend the current colours over it
+ {
+ Alloc param_40 = cmd_alloc;
+ CmdRef param_41 = cmd_ref;
+ CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
+ clip_depth--; // unsigned: NOTE(review) presumably the command stream always pairs this with an earlier tag 9 - confirm
+ if (clip_depth >= 4u)
+ {
+ base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); // same address computation as the push side
+ }
+ for (uint k_14 = 0u; k_14 < 8u; k_14++)
+ {
+ if (clip_depth < 4u)
+ {
+ bg_rgba = blend_stack[clip_depth][k_14];
+ }
+ else
+ {
+ bg_rgba = _2506.blend_mem[base_ix_1 + k_14];
+ }
+ uint param_42 = bg_rgba;
+ float4 bg = unpacksRGB(param_42);
+ float4 fg_1 = rgba[k_14] * area[k_14]; // the layer drawn since the push, scaled by coverage
+ float4 param_43 = bg;
+ float4 param_44 = fg_1;
+ uint param_45 = end_clip.blend;
+ rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
+ }
+ cmd_ref.offset += 8u;
+ break;
+ }
+ case 11u: // jump: continue reading commands at new_ref
+ {
+ Alloc param_46 = cmd_alloc;
+ CmdRef param_47 = cmd_ref;
+ cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
+ cmd_alloc.offset = cmd_ref.offset;
+ break;
+ }
+ }
+ }
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // write the 8 accumulated colours to the output image
+ {
+ uint param_48 = i_1;
+ float3 param_49 = rgba[i_1].xyz;
+ image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
+ }
+}
+
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
new file mode 100644
index 0000000..3bd9d6e
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
new file mode 100644
index 0000000..098e317
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
new file mode 100644
index 0000000..500c527
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -0,0 +1,1302 @@
+struct Alloc // handle to a region of the shared memory buffer
+{
+ uint offset; // start of the region; comp_main copies it into CmdRef.offset, which is treated as a byte offset
+};
+
+struct CmdStrokeRef // typed reference to a serialized CmdStroke
+{
+ uint offset; // byte offset; CmdStroke_read shifts it right by 2 to get a word index
+};
+
+struct CmdStroke // decoded stroke command (two 32-bit words)
+{
+ uint tile_ref; // offset of the first TileSeg; comp_main wraps it in a TileSegRef
+ float half_width; // comp_main computes coverage as clamp(half_width + 0.5 - distance, 0, 1)
+};
+
+struct CmdFillRef // typed reference to a serialized CmdFill
+{
+ uint offset; // byte offset; CmdFill_read shifts it right by 2 to get a word index
+};
+
+struct CmdFill // decoded fill command (two 32-bit words)
+{
+ uint tile_ref; // offset of the first TileSeg; comp_main wraps it in a TileSegRef
+ int backdrop; // initial value of every area[] entry before segments are accumulated
+};
+
+struct CmdColorRef // typed reference to a serialized CmdColor
+{
+ uint offset; // byte offset; CmdColor_read shifts it right by 2 to get a word index
+};
+
+struct CmdColor // decoded solid-colour command (one 32-bit word)
+{
+ uint rgba_color; // packed 8-bit-per-channel colour; comp_main expands it with unpacksRGB
+};
+
+struct CmdLinGradRef // typed reference to a serialized CmdLinGrad
+{
+ uint offset; // byte offset; CmdLinGrad_read shifts it right by 2 to get a word index
+};
+
+struct CmdLinGrad // decoded linear-gradient command (four 32-bit words)
+{
+ uint index; // second coordinate used when comp_main indexes the `gradients` texture
+ float line_x; // comp_main evaluates t = line_x * x + line_y * y + line_c per pixel
+ float line_y; // coefficient of the pixel y coordinate in that expression
+ float line_c; // constant term of that expression
+};
+
+struct CmdRadGradRef // typed reference to a serialized CmdRadGrad
+{
+ uint offset; // byte offset; CmdRadGrad_read shifts it right by 2 to get a word index
+};
+
+struct CmdRadGrad // decoded radial-gradient command (eleven 32-bit words)
+{
+ uint index; // second coordinate used when comp_main indexes the `gradients` texture
+ float4 mat; // comp_main maps a pixel p to mat.xz * p.x + mat.yw * p.y
+ float2 xlat; // subtracted from the mapped pixel position
+ float2 c1; // dotted with the mapped position to form `ba` in comp_main
+ float ra; // scales dot(p, p) to form `ca` in comp_main
+ float roff; // subtracted from the computed gradient parameter
+};
+
+struct CmdImageRef // typed reference to a serialized CmdImage
+{
+ uint offset; // byte offset; CmdImage_read shifts it right by 2 to get a word index
+};
+
+struct CmdImage // decoded image command (two 32-bit words)
+{
+ uint index; // decoded by CmdImage_read but not read by fillImage in this file
+ int2 offset; // signed displacement added to the pixel position before the atlas lookup
+};
+
+struct CmdAlphaRef // typed reference to a serialized CmdAlpha
+{
+ uint offset; // byte offset; CmdAlpha_read shifts it right by 2 to get a word index
+};
+
+struct CmdAlpha // decoded constant-alpha command (one 32-bit word)
+{
+ float alpha; // comp_main copies this value into every area[] entry
+};
+
+struct CmdEndClipRef // typed reference to a serialized CmdEndClip
+{
+ uint offset; // byte offset; CmdEndClip_read shifts it right by 2 to get a word index
+};
+
+struct CmdEndClip // decoded end-clip command (one 32-bit word)
+{
+ uint blend; // passed as `mode` to mix_blend_compose (low 8 bits: compose mode, bits 8+: blend mode)
+};
+
+struct CmdJumpRef // typed reference to a serialized CmdJump
+{
+ uint offset; // byte offset; CmdJump_read shifts it right by 2 to get a word index
+};
+
+struct CmdJump // decoded jump command (one 32-bit word)
+{
+ uint new_ref; // becomes the new cmd_ref.offset in comp_main
+};
+
+struct CmdRef // cursor into a tile's command list
+{
+ uint offset; // byte offset of the current command's tag word; payload wrappers read from offset + 4
+};
+
+struct CmdTag // a command's leading word, split in two by Cmd_tag
+{
+ uint tag; // low 16 bits; selects the case in comp_main's switch (0 ends the list)
+ uint flags; // high 16 bits; not read by comp_main in this file
+};
+
+struct TileSegRef // typed reference to a serialized TileSeg
+{
+ uint offset; // byte offset; comp_main stops walking a segment list when this is 0
+};
+
+struct TileSeg // one line segment in a tile's linked list (six 32-bit words)
+{
+ float2 origin; // segment start point
+ float2 _vector; // segment direction and length; the end point is origin + _vector
+ float y_edge; // used by the fill case for the sign(_vector.x)-weighted edge term
+ TileSegRef next; // following segment in the list; offset 0 terminates it
+};
+
+struct Config // scene configuration layout; NOTE(review): presumably the contents of buffer _1681 - confirm
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles; // 4th uint (byte 12); matches the _1681.Load(12) that comp_main uses as the tile-row stride
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc; // 8th uint (byte 28); matches the _1681.Load(28) that comp_main uses as the command-list base
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); // same dimensions as [numthreads(8, 4, 1)] on main
+
+RWByteAddressBuffer _297 : register(u0, space0); // shared memory buffer; read_mem loads words from it at byte offset*4 + 12
+ByteAddressBuffer _1681 : register(t1, space0); // read-only; comp_main loads bytes 12 and 28 from it
+RWByteAddressBuffer _2506 : register(u2, space0); // spill storage used by comp_main for clip depths 4 and above
+RWTexture2D<unorm float4> image_atlas : register(u4, space0); // read by fillImage
+RWTexture2D<unorm float4> gradients : register(u5, space0); // indexed as [x, gradient index] by the gradient cases in comp_main
+RWTexture2D<unorm float> image : register(u3, space0); // single-channel output written at the end of comp_main
+
+static uint3 gl_WorkGroupID; // filled from SV_GroupID in main
+static uint3 gl_LocalInvocationID; // filled from SV_GroupThreadID in main
+struct SPIRV_Cross_Input // system-value inputs of the entry point; main copies them into the static globals
+{
+ uint3 gl_WorkGroupID : SV_GroupID; // index of this thread group
+ uint3 gl_LocalInvocationID : SV_GroupThreadID; // index of this thread inside its group
+};
+
+uint spvPackUnorm4x8(float4 value)
+{
+    uint4 q = uint4(round(saturate(value) * 255.0)); // clamp to [0,1], scale to 0..255, round
+    return (q.w << 24) | (q.z << 16) | (q.y << 8) | q.x; // x lands in the lowest byte, w in the highest
+}
+
+float4 spvUnpackUnorm4x8(uint value)
+{
+    uint4 bytes = uint4(value, value >> 8, value >> 16, value >> 24) & 0xff; // x = lowest byte, w = highest
+    return float4(bytes) / 255.0; // map 0..255 onto 0..1
+}
+
+Alloc slice_mem(Alloc a, uint offset, uint size)
+{
+    a.offset += offset; // `a` is a by-value copy, so the caller's Alloc is untouched; `size` is unused here
+    return a;
+}
+
+bool touch_mem(Alloc alloc, uint offset) // access-check hook called by read_mem; both arguments are ignored here
+{
+ return true; // every access is reported as valid
+}
+
+uint read_mem(Alloc alloc, uint offset)
+{
+    // touch_mem always succeeds in this file, so the early-out below never
+    // fires; the guard is kept so behaviour follows touch_mem if it changes.
+    if (!touch_mem(alloc, offset))
+    {
+        return 0u;
+    }
+    // `offset` is a word index: scale to bytes and skip the first 12 bytes.
+    return _297.Load(offset * 4 + 12);
+}
+
+CmdTag Cmd_tag(Alloc a, CmdRef ref)
+{
+    uint word = read_mem(a, ref.offset >> 2u); // byte offset -> word index
+    CmdTag result;
+    result.tag = word & 65535u;   // low 16 bits
+    result.flags = word >> 16u;   // high 16 bits
+    return result;
+}
+
+CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
+{
+    // Decode a CmdStroke from two consecutive 32-bit words:
+    //   word 0 -> tile_ref   (raw uint)
+    //   word 1 -> half_width (float bit pattern)
+    // `ref.offset` is a byte offset, so shift by two for the word index.
+    uint base = ref.offset >> 2u;
+    uint word0 = read_mem(a, base);
+    uint word1 = read_mem(a, base + 1u);
+    CmdStroke cmd;
+    cmd.tile_ref = word0;
+    cmd.half_width = asfloat(word1);
+    return cmd;
+}
+
+CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdStrokeRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdStroke_read(a, payload);
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok)
+{
+    // `size` and `mem_ok` are accepted but not used here; only the offset is kept.
+    Alloc result = { offset };
+    return result;
+}
+
+TileSeg TileSeg_read(Alloc a, TileSegRef ref)
+{
+    // A TileSeg is six consecutive 32-bit words:
+    //   words 0-1 -> origin.xy   (float bit patterns)
+    //   words 2-3 -> _vector.xy  (float bit patterns)
+    //   word 4    -> y_edge      (float bit pattern)
+    //   word 5    -> next.offset (raw uint)
+    // `ref.offset` is a byte offset, so shift by two for the word index.
+    uint base = ref.offset >> 2u;
+    uint words[6];
+    for (uint w = 0u; w < 6u; w++)
+    {
+        words[w] = read_mem(a, base + w);
+    }
+
+    TileSeg seg;
+    seg.origin = float2(
+        asfloat(words[0]),
+        asfloat(words[1]));
+    seg._vector = float2(
+        asfloat(words[2]),
+        asfloat(words[3]));
+    seg.y_edge = asfloat(words[4]);
+
+    TileSegRef next_ref;
+    next_ref.offset = words[5];
+    seg.next = next_ref;
+    return seg;
+}
+
+uint2 chunk_offset(uint i)
+{
+    return uint2(i & 1u, i >> 1u) * uint2(8u, 4u); // column = i mod 2 (8 px apart), row = i div 2 (4 px apart)
+}
+
+CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
+{
+    // Decode a CmdFill from two consecutive 32-bit words:
+    //   word 0 -> tile_ref (raw uint)
+    //   word 1 -> backdrop (converted to a signed int)
+    // `ref.offset` is a byte offset, so shift by two for the word index.
+    uint base = ref.offset >> 2u;
+    uint word0 = read_mem(a, base);
+    uint word1 = read_mem(a, base + 1u);
+    CmdFill cmd;
+    cmd.tile_ref = word0;
+    cmd.backdrop = int(word1);
+    return cmd;
+}
+
+CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdFillRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdFill_read(a, payload);
+}
+
+CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
+{
+    // A CmdAlpha is a single 32-bit word holding the float bit pattern of
+    // the alpha value. `ref.offset` is a byte offset, hence the shift by
+    // two to obtain the word index that read_mem expects.
+    uint base = ref.offset >> 2u;
+    CmdAlpha cmd;
+    cmd.alpha = asfloat(read_mem(a, base));
+    return cmd;
+}
+
+CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdAlphaRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdAlpha_read(a, payload);
+}
+
+CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
+{
+    // A CmdColor is a single 32-bit word that is stored unmodified in
+    // rgba_color. `ref.offset` is a byte offset, hence the shift by two
+    // to obtain the word index that read_mem expects.
+    uint base = ref.offset >> 2u;
+    CmdColor cmd;
+    cmd.rgba_color = read_mem(a, base);
+    return cmd;
+}
+
+CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdColorRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdColor_read(a, payload);
+}
+
+float3 fromsRGB(float3 srgb) // colour hook applied to colours after they are loaded (unpacksRGB, fillImage, gradients)
+{
+ return srgb; // identity here: the input is returned unchanged
+}
+
+float4 unpacksRGB(uint srgba)
+{
+    // Unpack four bytes, reverse the channel order, run the first three channels through fromsRGB.
+    float4 c = spvUnpackUnorm4x8(srgba).wzyx;
+    return float4(fromsRGB(c.xyz), c.w);
+}
+
+CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
+{
+    // A CmdLinGrad is four consecutive 32-bit words:
+    //   word 0 -> index  (raw uint)
+    //   word 1 -> line_x (float bit pattern)
+    //   word 2 -> line_y (float bit pattern)
+    //   word 3 -> line_c (float bit pattern)
+    // `ref.offset` is a byte offset, so shift by two for the word index.
+    uint base = ref.offset >> 2u;
+    uint words[4];
+    for (uint w = 0u; w < 4u; w++)
+    {
+        words[w] = read_mem(a, base + w);
+    }
+
+    CmdLinGrad grad;
+    grad.index = words[0];
+    grad.line_x = asfloat(words[1]);
+    grad.line_y = asfloat(words[2]);
+    grad.line_c = asfloat(words[3]);
+    return grad;
+}
+
+CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdLinGradRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdLinGrad_read(a, payload);
+}
+
+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) // decodes a CmdRadGrad from eleven consecutive 32-bit words
+{
+ uint ix = ref.offset >> uint(2); // byte offset -> word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1); // word 0: index
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3); // words 1-4: mat.xyzw
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7);
+ Alloc param_8 = a;
+ uint param_9 = ix + 4u;
+ uint raw4 = read_mem(param_8, param_9);
+ Alloc param_10 = a;
+ uint param_11 = ix + 5u;
+ uint raw5 = read_mem(param_10, param_11); // words 5-6: xlat.xy
+ Alloc param_12 = a;
+ uint param_13 = ix + 6u;
+ uint raw6 = read_mem(param_12, param_13);
+ Alloc param_14 = a;
+ uint param_15 = ix + 7u;
+ uint raw7 = read_mem(param_14, param_15); // words 7-8: c1.xy
+ Alloc param_16 = a;
+ uint param_17 = ix + 8u;
+ uint raw8 = read_mem(param_16, param_17);
+ Alloc param_18 = a;
+ uint param_19 = ix + 9u;
+ uint raw9 = read_mem(param_18, param_19); // word 9: ra
+ Alloc param_20 = a;
+ uint param_21 = ix + 10u;
+ uint raw10 = read_mem(param_20, param_21); // word 10: roff
+ CmdRadGrad s;
+ s.index = raw0; // kept as a raw uint
+ s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4)); // all remaining fields are float bit patterns
+ s.xlat = float2(asfloat(raw5), asfloat(raw6));
+ s.c1 = float2(asfloat(raw7), asfloat(raw8));
+ s.ra = asfloat(raw9);
+ s.roff = asfloat(raw10);
+ return s;
+}
+
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdRadGradRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdRadGrad_read(a, payload);
+}
+
+CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
+{
+    // Two consecutive 32-bit words: the image index, then a packed pair of
+    // signed 16-bit offsets (x in the low half, y in the high half).
+    uint base = ref.offset >> 2u; // byte offset -> word index
+    uint raw_index = read_mem(a, base);
+    uint packed_xy = read_mem(a, base + 1u);
+    CmdImage img;
+    img.index = raw_index;
+    int off_x = int(packed_xy << 16u) >> 16; // shift up, then signed shift down, to sign-extend the low half
+    int off_y = int(packed_xy) >> 16;        // signed shift keeps the sign of the high half
+    img.offset = int2(off_x, off_y);
+    return img;
+}
+
+CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdImageRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdImage_read(a, payload);
+}
+
+void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
+{
+    // Fetch one atlas texel for each of this thread's eight pixels and hand
+    // the colours back through the `out` array. Only cmd_img.offset is
+    // consulted; cmd_img.index is not read here.
+    float4 texels[8];
+    for (uint chunk = 0u; chunk < 8u; chunk++)
+    {
+        // Pixel position of this chunk, displaced by the command's offset.
+        int2 uv = int2(xy + chunk_offset(chunk)) + cmd_img.offset;
+        float4 texel = image_atlas[uv];
+        // Colour channels pass through fromsRGB; alpha is left untouched.
+        texel.xyz = fromsRGB(texel.xyz);
+        texels[chunk] = texel;
+    }
+    spvReturnValue = texels;
+}
+
+float3 tosRGB(float3 rgb) // colour hook applied by packsRGB before a colour is packed
+{
+ return rgb; // identity here: the input is returned unchanged
+}
+
+uint packsRGB(inout float4 rgba)
+{
+    // `rgba` is inout: the tosRGB-converted colour is also written back to the caller's variable.
+    rgba.xyz = tosRGB(rgba.xyz);
+    return spvPackUnorm4x8(rgba.wzyx); // channels reversed before packing, the inverse of unpacksRGB's swizzle
+}
+
+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)
+{
+    // A CmdEndClip is a single 32-bit word that is stored unmodified in
+    // `blend`. `ref.offset` is a byte offset, hence the shift by two to
+    // obtain the word index that read_mem expects.
+    uint base = ref.offset >> 2u;
+    CmdEndClip cmd;
+    cmd.blend = read_mem(a, base);
+    return cmd;
+}
+
+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdEndClipRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdEndClip_read(a, payload);
+}
+
+float3 screen(float3 cb, float3 cs) // per-channel blend of backdrop colour cb with source colour cs
+{
+ return (cb + cs) - (cb * cs); // algebraically 1 - (1 - cb) * (1 - cs)
+}
+
+float3 hard_light(float3 cb, float3 cs)
+{
+    // Per channel: where cs <= 0.5 use 2*cb*cs, otherwise screen cb with
+    // (2*cs - 1). Both candidates are computed, then one is selected.
+    float3 screened = screen(cb, (cs * 2.0f) - 1.0f.xxx);
+    float3 multiplied = (cb * 2.0f) * cs;
+    bool3 use_mul = bool3(cs.x <= 0.5f, cs.y <= 0.5f, cs.z <= 0.5f);
+    return float3(use_mul.x ? multiplied.x : screened.x, use_mul.y ? multiplied.y : screened.y, use_mul.z ? multiplied.z : screened.z);
+}
+
+float color_dodge(float cb, float cs)
+{
+    // Colour dodge for one channel, written as guard clauses:
+    //   cb == 0 -> 0
+    //   cs == 1 -> 1   (checked second, so cb == 0 wins when both hold)
+    //   else    -> min(1, cb / (1 - cs))
+    if (cb == 0.0f)
+    {
+        return 0.0f;
+    }
+    if (cs == 1.0f)
+    {
+        return 1.0f;
+    }
+    // cs is not exactly 1 here, so the divisor is not exactly zero.
+    float ratio = cb / (1.0f - cs);
+    return min(1.0f, ratio);
+}
+
+float color_burn(float cb, float cs)
+{
+    // Colour burn for one channel, written as guard clauses:
+    //   cb == 1 -> 1
+    //   cs == 0 -> 0   (checked second, so cb == 1 wins when both hold)
+    //   else    -> 1 - min(1, (1 - cb) / cs)
+    if (cb == 1.0f)
+    {
+        return 1.0f;
+    }
+    if (cs == 0.0f)
+    {
+        return 0.0f;
+    }
+    // cs is not exactly 0 here, so the divisor is not exactly zero.
+    float ratio = (1.0f - cb) / cs;
+    return 1.0f - min(1.0f, ratio);
+}
+
+float3 soft_light(float3 cb, float3 cs)
+{
+    float3 root = sqrt(cb); // used for channels with cb > 0.25
+    float3 poly = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb; // ((16cb - 12)cb + 4)cb, used for cb <= 0.25
+    bool3 low_cb = bool3(cb.x <= 0.25f, cb.y <= 0.25f, cb.z <= 0.25f);
+    float3 d = float3(low_cb.x ? poly.x : root.x, low_cb.y ? poly.y : root.y, low_cb.z ? poly.z : root.z);
+    float3 upper = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)); // result for channels with cs > 0.5
+    float3 lower = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)); // result for channels with cs <= 0.5
+    bool3 low_cs = bool3(cs.x <= 0.5f, cs.y <= 0.5f, cs.z <= 0.5f);
+    return float3(low_cs.x ? lower.x : upper.x, low_cs.y ? lower.y : upper.y, low_cs.z ? lower.z : upper.z);
+}
+
+float sat(float3 c) // spread between the largest and smallest channel of c
+{
+ return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z)); // max channel minus min channel
+}
+
+void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s)
+{
+    if (!(cmax > cmin)) // no spread between the extremes: middle and max collapse to zero
+    {
+        cmid = 0.0f;
+        cmax = 0.0f;
+    }
+    else
+    {
+        cmid = ((cmid - cmin) * s) / (cmax - cmin); // keep cmid's relative position inside the new range [0, s]
+        cmax = s;
+    }
+    cmin = 0.0f; // the minimum becomes zero in both cases
+}
+
+float3 set_sat(inout float3 c, float s) // rescales c via set_sat_inner; c is inout, so the caller's value is modified too
+{
+ if (c.x <= c.y)
+ {
+ if (c.y <= c.z) // ordering x <= y <= z: min = x, mid = y, max = z
+ {
+ float param = c.x;
+ float param_1 = c.y;
+ float param_2 = c.z;
+ float param_3 = s;
+ set_sat_inner(param, param_1, param_2, param_3);
+ c.x = param; // results are written back to the channels they were taken from
+ c.y = param_1;
+ c.z = param_2;
+ }
+ else
+ {
+ if (c.x <= c.z) // ordering x <= z < y: min = x, mid = z, max = y
+ {
+ float param_4 = c.x;
+ float param_5 = c.z;
+ float param_6 = c.y;
+ float param_7 = s;
+ set_sat_inner(param_4, param_5, param_6, param_7);
+ c.x = param_4;
+ c.z = param_5;
+ c.y = param_6;
+ }
+ else // ordering z < x <= y: min = z, mid = x, max = y
+ {
+ float param_8 = c.z;
+ float param_9 = c.x;
+ float param_10 = c.y;
+ float param_11 = s;
+ set_sat_inner(param_8, param_9, param_10, param_11);
+ c.z = param_8;
+ c.x = param_9;
+ c.y = param_10;
+ }
+ }
+ }
+ else
+ {
+ if (c.x <= c.z) // ordering y < x <= z: min = y, mid = x, max = z
+ {
+ float param_12 = c.y;
+ float param_13 = c.x;
+ float param_14 = c.z;
+ float param_15 = s;
+ set_sat_inner(param_12, param_13, param_14, param_15);
+ c.y = param_12;
+ c.x = param_13;
+ c.z = param_14;
+ }
+ else
+ {
+ if (c.y <= c.z) // ordering y <= z < x: min = y, mid = z, max = x
+ {
+ float param_16 = c.y;
+ float param_17 = c.z;
+ float param_18 = c.x;
+ float param_19 = s;
+ set_sat_inner(param_16, param_17, param_18, param_19);
+ c.y = param_16;
+ c.z = param_17;
+ c.x = param_18;
+ }
+ else // ordering z < y < x: min = z, mid = y, max = x
+ {
+ float param_20 = c.z;
+ float param_21 = c.y;
+ float param_22 = c.x;
+ float param_23 = s;
+ set_sat_inner(param_20, param_21, param_22, param_23);
+ c.z = param_20;
+ c.y = param_21;
+ c.x = param_22;
+ }
+ }
+ }
+ return c; // the rescaled colour is also returned by value
+}
+
+float lum(float3 c)
+{
+    // Weighted channel sum; the long literals are the float values nearest 0.3, 0.59 and 0.11.
+    return dot(c, float3(0.300000011920928955078125f, 0.589999973773956298828125f, 0.10999999940395355224609375f));
+}
+
+float3 clip_color(inout float3 c)
+{
+    // lightness, lo and hi are all taken from the incoming c, before either correction is applied.
+    float lightness = lum(c);
+    float lo = min(c.x, min(c.y, c.z));
+    float hi = max(c.x, max(c.y, c.z));
+    if (lo < 0.0f)
+    {
+        c = lightness.xxx + (((c - lightness.xxx) * lightness) / (lightness - lo).xxx);
+    }
+    if (hi > 1.0f)
+    {
+        c = lightness.xxx + (((c - lightness.xxx) * (1.0f - lightness)) / (hi - lightness).xxx);
+    }
+    return c;
+}
+
+float3 set_lum(float3 c, float l)
+{
+    // Shift every channel by the difference between the requested and the
+    // current lum value, then pass the result through clip_color.
+    float3 shifted = c + (l - lum(c)).xxx;
+    return clip_color(shifted);
+}
+
+float3 mix_blend(float3 cb, float3 cs, uint mode) // blends backdrop colour cb with source colour cs according to `mode`
+{
+ float3 b = 0.0f.xxx;
+ switch (mode)
+ {
+ case 1u: // product of the two colours (the W3C "multiply" formula)
+ {
+ b = cb * cs;
+ break;
+ }
+ case 2u: // screen
+ {
+ float3 param = cb;
+ float3 param_1 = cs;
+ b = screen(param, param_1);
+ break;
+ }
+ case 3u: // hard_light with the arguments swapped (the W3C "overlay" formula)
+ {
+ float3 param_2 = cs;
+ float3 param_3 = cb;
+ b = hard_light(param_2, param_3);
+ break;
+ }
+ case 4u: // per-channel minimum (the W3C "darken" formula)
+ {
+ b = min(cb, cs);
+ break;
+ }
+ case 5u: // per-channel maximum (the W3C "lighten" formula)
+ {
+ b = max(cb, cs);
+ break;
+ }
+ case 6u: // color_dodge applied to each channel separately
+ {
+ float param_4 = cb.x;
+ float param_5 = cs.x;
+ float param_6 = cb.y;
+ float param_7 = cs.y;
+ float param_8 = cb.z;
+ float param_9 = cs.z;
+ b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+ break;
+ }
+ case 7u: // color_burn applied to each channel separately
+ {
+ float param_10 = cb.x;
+ float param_11 = cs.x;
+ float param_12 = cb.y;
+ float param_13 = cs.y;
+ float param_14 = cb.z;
+ float param_15 = cs.z;
+ b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+ break;
+ }
+ case 8u: // hard_light in its natural argument order
+ {
+ float3 param_16 = cb;
+ float3 param_17 = cs;
+ b = hard_light(param_16, param_17);
+ break;
+ }
+ case 9u: // soft_light
+ {
+ float3 param_18 = cb;
+ float3 param_19 = cs;
+ b = soft_light(param_18, param_19);
+ break;
+ }
+ case 10u: // absolute difference
+ {
+ b = abs(cb - cs);
+ break;
+ }
+ case 11u: // cb + cs - 2*cb*cs (the W3C "exclusion" formula)
+ {
+ b = (cb + cs) - ((cb * 2.0f) * cs);
+ break;
+ }
+ case 12u: // cs rescaled to cb's sat and cb's lum (the W3C "hue" formula)
+ {
+ float3 param_20 = cb;
+ float3 param_21 = cs;
+ float param_22 = sat(param_20);
+ float3 _1340 = set_sat(param_21, param_22);
+ float3 param_23 = cb;
+ float3 param_24 = _1340;
+ float param_25 = lum(param_23);
+ b = set_lum(param_24, param_25);
+ break;
+ }
+ case 13u: // cb rescaled to cs's sat, keeping cb's lum (the W3C "saturation" formula)
+ {
+ float3 param_26 = cs;
+ float3 param_27 = cb;
+ float param_28 = sat(param_26);
+ float3 _1354 = set_sat(param_27, param_28);
+ float3 param_29 = cb;
+ float3 param_30 = _1354;
+ float param_31 = lum(param_29);
+ b = set_lum(param_30, param_31);
+ break;
+ }
+ case 14u: // cs shifted to cb's lum (the W3C "color" formula)
+ {
+ float3 param_32 = cb;
+ float3 param_33 = cs;
+ float param_34 = lum(param_32);
+ b = set_lum(param_33, param_34);
+ break;
+ }
+ case 15u: // cb shifted to cs's lum (the W3C "luminosity" formula)
+ {
+ float3 param_35 = cs;
+ float3 param_36 = cb;
+ float param_37 = lum(param_35);
+ b = set_lum(param_36, param_37);
+ break;
+ }
+ default: // mode 0 and any unlisted value: the source colour is used unchanged
+ {
+ b = cs;
+ break;
+ }
+ }
+ return b;
+}
+
+float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode) // result = (cs*as*fa + cb*ab*fb, as*fa + ab*fb), with fa/fb chosen by `mode`
+{
+ float fa = 0.0f; // weight applied to the source term
+ float fb = 0.0f; // weight applied to the backdrop term
+ switch (mode)
+ {
+ case 1u: // source only
+ {
+ fa = 1.0f;
+ fb = 0.0f;
+ break;
+ }
+ case 2u: // backdrop only
+ {
+ fa = 0.0f;
+ fb = 1.0f;
+ break;
+ }
+ case 3u: // source over backdrop
+ {
+ fa = 1.0f;
+ fb = 1.0f - as;
+ break;
+ }
+ case 4u: // backdrop over source
+ {
+ fa = 1.0f - ab;
+ fb = 1.0f;
+ break;
+ }
+ case 5u: // source where the backdrop has coverage
+ {
+ fa = ab;
+ fb = 0.0f;
+ break;
+ }
+ case 6u: // backdrop where the source has coverage
+ {
+ fa = 0.0f;
+ fb = as;
+ break;
+ }
+ case 7u: // source where the backdrop has no coverage
+ {
+ fa = 1.0f - ab;
+ fb = 0.0f;
+ break;
+ }
+ case 8u: // backdrop where the source has no coverage
+ {
+ fa = 0.0f;
+ fb = 1.0f - as;
+ break;
+ }
+ case 9u: // source atop backdrop
+ {
+ fa = ab;
+ fb = 1.0f - as;
+ break;
+ }
+ case 10u: // backdrop atop source
+ {
+ fa = 1.0f - ab;
+ fb = as;
+ break;
+ }
+ case 11u: // each side only where the other has no coverage (xor)
+ {
+ fa = 1.0f - ab;
+ fb = 1.0f - as;
+ break;
+ }
+ case 12u: // plain sum of both sides, not clamped
+ {
+ fa = 1.0f;
+ fb = 1.0f;
+ break;
+ }
+ case 13u: // sum of both sides with every component clamped to 1; returns directly
+ {
+ return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
+ }
+ default: // mode 0 and any unlisted value: fa = fb = 0, so the result is all zeros
+ {
+ break;
+ }
+ }
+ float as_fa = as * fa;
+ float ab_fb = ab * fb;
+ float3 co = (cs * as_fa) + (cb * ab_fb); // colour is multiplied by the weighted alphas
+ return float4(co, as_fa + ab_fb);
+}
+
+float4 mix_blend_compose(float4 backdrop, float4 src, uint mode)
+{
+    // Fast path: the low 15 bits of `mode` equal 3, i.e. compose mode 3
+    // with blend-mode bits 8-14 all zero. Plain "source over" formula.
+    if ((mode & 32767u) == 3u)
+    {
+        return (backdrop * (1.0f - src.w)) + src;
+    }
+
+    // Divide colour by alpha; the tiny epsilon keeps the reciprocal finite
+    // when alpha is zero.
+    float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f);
+    float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f);
+    float3 cs = src.xyz * inv_src_a;
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+
+    // Bits 8 and up select the blend function; the blended colour is then
+    // mixed in proportionally to the backdrop's alpha.
+    float3 blended = mix_blend(cb, cs, mode >> 8u);
+    cs = lerp(cs, blended, backdrop.w.xxx);
+
+    // The low 8 bits select the compositing operator.
+    uint comp_mode = mode & 255u;
+    if (comp_mode != 3u)
+    {
+        return mix_compose(cb, cs, backdrop.w, src.w, comp_mode);
+    }
+
+    // Compose mode 3 is handled inline rather than through mix_compose.
+    float3 co = lerp(backdrop.xyz, cs, src.w.xxx);
+    return float4(co, src.w + (backdrop.w * (1.0f - src.w)));
+}
+
+CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
+{
+    // A CmdJump is a single 32-bit word that is stored unmodified in
+    // `new_ref`. `ref.offset` is a byte offset, hence the shift by two to
+    // obtain the word index that read_mem expects.
+    uint base = ref.offset >> 2u;
+    CmdJump cmd;
+    cmd.new_ref = read_mem(a, base);
+    return cmd;
+}
+
+CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
+{
+    // The payload starts 4 bytes (one word) after the command's tag word.
+    CmdJumpRef payload;
+    payload.offset = ref.offset + 4u;
+    return CmdJump_read(a, payload);
+}
+
+void comp_main() // interprets one tile's command list; each thread accumulates colour for eight pixels
+{
+ uint tile_ix = (gl_WorkGroupID.y * _1681.Load(12)) + gl_WorkGroupID.x; // row-major tile index; byte 12 presumably Config.width_in_tiles
+ Alloc _1696;
+ _1696.offset = _1681.Load(28); // byte 28 presumably Config.ptcl_alloc.offset - confirm
+ Alloc param;
+ param.offset = _1696.offset;
+ uint param_1 = tile_ix * 1024u; // each tile's command list starts 1024 bytes after the previous one
+ uint param_2 = 1024u;
+ Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+ CmdRef _1705 = { cmd_alloc.offset };
+ CmdRef cmd_ref = _1705;
+ uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 12); // first word of the tile's list, read the same way read_mem does
+ cmd_ref.offset += 4u; // step past that word to the first command
+ uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); // tiles are 16 pixels apart in x and y
+ float2 xy = float2(xy_uint);
+ float4 rgba[8]; // running colour for this thread's eight pixels (see chunk_offset)
+ for (uint i = 0u; i < 8u; i++)
+ {
+ rgba[i] = 0.0f.xxxx;
+ }
+ uint clip_depth = 0u; // number of currently open clips
+ float df[8]; // per-pixel minimum distance to the stroke segments
+ TileSegRef tile_seg_ref;
+ float area[8]; // per-pixel coverage set by the most recent fill/stroke/solid/alpha command
+ uint blend_stack[4][8]; // packed colours saved for the first four clip levels
+ uint base_ix_1;
+ uint bg_rgba;
+ while (true)
+ {
+ Alloc param_3 = cmd_alloc;
+ CmdRef param_4 = cmd_ref;
+ uint tag = Cmd_tag(param_3, param_4).tag;
+ if (tag == 0u) // tag 0 ends the command list
+ {
+ break;
+ }
+ switch (tag)
+ {
+ case 2u: // stroke: coverage from the distance to the nearest segment
+ {
+ Alloc param_5 = cmd_alloc;
+ CmdRef param_6 = cmd_ref;
+ CmdStroke stroke = Cmd_Stroke_read(param_5, param_6);
+ for (uint k = 0u; k < 8u; k++)
+ {
+ df[k] = 1000000000.0f; // large starting value so the first segment always wins the min
+ }
+ TileSegRef _1805 = { stroke.tile_ref };
+ tile_seg_ref = _1805;
+ do
+ {
+ uint param_7 = tile_seg_ref.offset;
+ uint param_8 = 24u; // six words per TileSeg
+ bool param_9 = true;
+ Alloc param_10 = new_alloc(param_7, param_8, param_9);
+ TileSegRef param_11 = tile_seg_ref;
+ TileSeg seg = TileSeg_read(param_10, param_11);
+ float2 line_vec = seg._vector;
+ for (uint k_1 = 0u; k_1 < 8u; k_1++)
+ {
+ float2 dpos = (xy + 0.5f.xx) - seg.origin; // pixel centre relative to the segment start
+ uint param_12 = k_1;
+ dpos += float2(chunk_offset(param_12));
+ float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f); // projection parameter, limited to the segment
+ df[k_1] = min(df[k_1], length((line_vec * t) - dpos));
+ }
+ tile_seg_ref = seg.next;
+ } while (tile_seg_ref.offset != 0u); // offset 0 terminates the segment list
+ for (uint k_2 = 0u; k_2 < 8u; k_2++)
+ {
+ area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f);
+ }
+ cmd_ref.offset += 12u; // tag word + two payload words
+ break;
+ }
+ case 1u: // fill: coverage accumulated per segment on top of the backdrop value
+ {
+ Alloc param_13 = cmd_alloc;
+ CmdRef param_14 = cmd_ref;
+ CmdFill fill = Cmd_Fill_read(param_13, param_14);
+ for (uint k_3 = 0u; k_3 < 8u; k_3++)
+ {
+ area[k_3] = float(fill.backdrop);
+ }
+ TileSegRef _1924 = { fill.tile_ref };
+ tile_seg_ref = _1924;
+ do
+ {
+ uint param_15 = tile_seg_ref.offset;
+ uint param_16 = 24u; // six words per TileSeg
+ bool param_17 = true;
+ Alloc param_18 = new_alloc(param_15, param_16, param_17);
+ TileSegRef param_19 = tile_seg_ref;
+ TileSeg seg_1 = TileSeg_read(param_18, param_19);
+ for (uint k_4 = 0u; k_4 < 8u; k_4++)
+ {
+ uint param_20 = k_4;
+ float2 my_xy = xy + float2(chunk_offset(param_20));
+ float2 start = seg_1.origin - my_xy; // segment end points relative to this pixel
+ float2 end = start + seg_1._vector;
+ float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx); // part of the segment's y range inside the pixel row
+ if (window.x != window.y)
+ {
+ float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx;
+ float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y));
+ float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f; // the small offset keeps (xmax - xmin) away from zero
+ float xmax = max(xs.x, xs.y);
+ float b = min(xmax, 1.0f);
+ float c = max(b, 0.0f);
+ float d = max(xmin, 0.0f);
+ float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin);
+ area[k_4] += (a * (window.x - window.y));
+ }
+ area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f));
+ }
+ tile_seg_ref = seg_1.next;
+ } while (tile_seg_ref.offset != 0u); // offset 0 terminates the segment list
+ for (uint k_5 = 0u; k_5 < 8u; k_5++)
+ {
+ area[k_5] = min(abs(area[k_5]), 1.0f); // magnitude of the accumulated value, capped at 1
+ }
+ cmd_ref.offset += 12u; // tag word + two payload words
+ break;
+ }
+ case 3u: // full coverage for every pixel
+ {
+ for (uint k_6 = 0u; k_6 < 8u; k_6++)
+ {
+ area[k_6] = 1.0f;
+ }
+ cmd_ref.offset += 4u; // tag word only
+ break;
+ }
+ case 4u: // constant coverage taken from the command
+ {
+ Alloc param_21 = cmd_alloc;
+ CmdRef param_22 = cmd_ref;
+ CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22);
+ for (uint k_7 = 0u; k_7 < 8u; k_7++)
+ {
+ area[k_7] = alpha.alpha;
+ }
+ cmd_ref.offset += 8u; // tag word + one payload word
+ break;
+ }
+ case 5u: // solid colour composited over the running colour
+ {
+ Alloc param_23 = cmd_alloc;
+ CmdRef param_24 = cmd_ref;
+ CmdColor color = Cmd_Color_read(param_23, param_24);
+ uint param_25 = color.rgba_color;
+ float4 fg = unpacksRGB(param_25);
+ for (uint k_8 = 0u; k_8 < 8u; k_8++)
+ {
+ float4 fg_k = fg * area[k_8]; // scale the colour by coverage
+ rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k; // new = old * (1 - a) + fg
+ }
+ cmd_ref.offset += 8u; // tag word + one payload word
+ break;
+ }
+ case 6u: // linear gradient
+ {
+ Alloc param_26 = cmd_alloc;
+ CmdRef param_27 = cmd_ref;
+ CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27);
+ float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; // gradient parameter at this thread's base pixel
+ for (uint k_9 = 0u; k_9 < 8u; k_9++)
+ {
+ uint param_28 = k_9;
+ float2 chunk_xy = float2(chunk_offset(param_28));
+ float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y);
+ int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); // map [0,1] onto texel columns 0..511
+ float4 fg_rgba = gradients[int2(x, int(lin.index))];
+ float3 param_29 = fg_rgba.xyz;
+ float3 _2257 = fromsRGB(param_29);
+ fg_rgba.x = _2257.x;
+ fg_rgba.y = _2257.y;
+ fg_rgba.z = _2257.z;
+ float4 fg_k_1 = fg_rgba * area[k_9];
+ rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
+ }
+ cmd_ref.offset += 20u; // tag word + four payload words
+ break;
+ }
+ case 7u: // radial gradient
+ {
+ Alloc param_30 = cmd_alloc;
+ CmdRef param_31 = cmd_ref;
+ CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31);
+ for (uint k_10 = 0u; k_10 < 8u; k_10++)
+ {
+ uint param_32 = k_10;
+ float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+ my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; // 2x2 transform followed by a translation
+ float ba = dot(my_xy_1, rad.c1);
+ float ca = rad.ra * dot(my_xy_1, my_xy_1);
+ float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+ int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); // map [0,1] onto texel columns 0..511
+ float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
+ float3 param_33 = fg_rgba_1.xyz;
+ float3 _2367 = fromsRGB(param_33);
+ fg_rgba_1.x = _2367.x;
+ fg_rgba_1.y = _2367.y;
+ fg_rgba_1.z = _2367.z;
+ float4 fg_k_2 = fg_rgba_1 * area[k_10];
+ rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
+ }
+ cmd_ref.offset += 48u; // tag word + eleven payload words
+ break;
+ }
+ case 8u: // image
+ {
+ Alloc param_34 = cmd_alloc;
+ CmdRef param_35 = cmd_ref;
+ CmdImage fill_img = Cmd_Image_read(param_34, param_35);
+ uint2 param_36 = xy_uint;
+ CmdImage param_37 = fill_img;
+ float4 _2410[8];
+ fillImage(_2410, param_36, param_37);
+ float4 img[8] = _2410;
+ for (uint k_11 = 0u; k_11 < 8u; k_11++)
+ {
+ float4 fg_k_3 = img[k_11] * area[k_11];
+ rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3;
+ }
+ cmd_ref.offset += 12u; // tag word + two payload words
+ break;
+ }
+ case 9u: // open a clip: save the running colour and start again from zero
+ {
+ if (clip_depth < 4u) // the first four levels are kept in the local blend_stack
+ {
+ for (uint k_12 = 0u; k_12 < 8u; k_12++)
+ {
+ float4 param_38 = float4(rgba[k_12]);
+ uint _2472 = packsRGB(param_38);
+ blend_stack[clip_depth][k_12] = _2472;
+ rgba[k_12] = 0.0f.xxxx;
+ }
+ }
+ else // deeper levels go to buffer _2506
+ {
+ uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); // 256 words per level, 8 per thread
+ for (uint k_13 = 0u; k_13 < 8u; k_13++)
+ {
+ float4 param_39 = float4(rgba[k_13]);
+ uint _2519 = packsRGB(param_39);
+ _2506.Store((base_ix + k_13) * 4 + 0, _2519);
+ rgba[k_13] = 0.0f.xxxx;
+ }
+ }
+ clip_depth++;
+ cmd_ref.offset += 4u; // tag word only
+ break;
+ }
+ case 10u: // close a clip: blend the clip's colour onto the saved one
+ {
+ Alloc param_40 = cmd_alloc;
+ CmdRef param_41 = cmd_ref;
+ CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
+ clip_depth--; // NOTE(review): wraps around if no clip is open - assumes command lists are balanced
+ if (clip_depth >= 4u)
+ {
+ base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); // same addressing as the store in case 9
+ }
+ for (uint k_14 = 0u; k_14 < 8u; k_14++)
+ {
+ if (clip_depth < 4u)
+ {
+ bg_rgba = blend_stack[clip_depth][k_14];
+ }
+ else
+ {
+ bg_rgba = _2506.Load((base_ix_1 + k_14) * 4 + 0);
+ }
+ uint param_42 = bg_rgba;
+ float4 bg = unpacksRGB(param_42);
+ float4 fg_1 = rgba[k_14] * area[k_14]; // clip contents weighted by the current coverage
+ float4 param_43 = bg;
+ float4 param_44 = fg_1;
+ uint param_45 = end_clip.blend;
+ rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
+ }
+ cmd_ref.offset += 8u; // tag word + one payload word
+ break;
+ }
+ case 11u: // jump: continue reading commands at another offset
+ {
+ Alloc param_46 = cmd_alloc;
+ CmdRef param_47 = cmd_ref;
+ CmdRef _2618 = { Cmd_Jump_read(param_46, param_47).new_ref };
+ cmd_ref = _2618;
+ cmd_alloc.offset = cmd_ref.offset;
+ break;
+ }
+ }
+ }
+ for (uint i_1 = 0u; i_1 < 8u; i_1++)
+ {
+ uint param_48 = i_1;
+ image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x; // only the alpha channel is written to the single-channel output
+ }
+}
+
+[numthreads(8, 4, 1)] // 32 threads per group, same dimensions as gl_WorkGroupSize
+void main(SPIRV_Cross_Input stage_input) // entry point: publish the system values, then run comp_main
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID; // copied into statics so helper functions can read them
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
new file mode 100644
index 0000000..40a03e5
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -0,0 +1,1353 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num> // fixed-size array wrapper with unchecked indexing
+struct spvUnsafeArray
+{
+ T elements[Num ? Num : 1]; // at least one element is declared even when Num is 0
+
+ thread T& operator [] (size_t pos) thread // one overload per address space the array object may live in
+ {
+ return elements[pos]; // no bounds check
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // constant address space: read-only overload only
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+ {
+ return elements[pos];
+ }
+};
+
+struct Alloc // handle to a region of the shared memory buffer
+{
+ uint offset; // start of the region; slice_mem adds a further offset to it
+};
+
+struct CmdStrokeRef // typed reference to a serialized CmdStroke
+{
+ uint offset; // byte offset; CmdStroke_read shifts it right by 2 to get a word index
+};
+
+struct CmdStroke // decoded stroke command (two 32-bit words, see CmdStroke_read)
+{
+ uint tile_ref; // word 0, kept as a raw uint
+ float half_width; // word 1, reinterpreted as a float
+};
+
+struct CmdFillRef // typed reference to a serialized CmdFill
+{
+ uint offset; // byte offset of the record in the shared memory buffer
+};
+
+struct CmdFill // decoded fill command; same fields as the HLSL struct of the same name in this patch
+{
+ uint tile_ref; // NOTE(review): presumably the offset of the first TileSeg, as in the HLSL variant - confirm
+ int backdrop; // NOTE(review): presumably the initial coverage value, as in the HLSL variant - confirm
+};
+
+struct CmdColorRef // typed reference to a serialized CmdColor
+{
+ uint offset; // byte offset of the record in the shared memory buffer
+};
+
+struct CmdColor // decoded solid-colour command
+{
+ uint rgba_color; // colour packed into one 32-bit word
+};
+
+struct CmdLinGradRef // typed reference to a serialized CmdLinGrad
+{
+ uint offset; // byte offset of the record in the shared memory buffer
+};
+
+struct CmdLinGrad // decoded linear-gradient command; same fields as the HLSL struct of the same name in this patch
+{
+ uint index; // NOTE(review): presumably selects the gradient ramp, as in the HLSL variant - confirm
+ float line_x; // the HLSL variant evaluates line_x * x + line_y * y + line_c per pixel
+ float line_y;
+ float line_c;
+};
+
+struct CmdRadGradRef // typed reference to a serialized CmdRadGrad
+{
+ uint offset; // byte offset of the record in the shared memory buffer
+};
+
+struct CmdRadGrad // decoded radial-gradient command; same fields as the HLSL struct of the same name in this patch
+{
+ uint index; // NOTE(review): presumably selects the gradient ramp, as in the HLSL variant - confirm
+ float4 mat; // the HLSL variant uses mat.xz and mat.yw as the two columns of a 2x2 transform
+ float2 xlat; // the HLSL variant subtracts this after the transform
+ float2 c1;
+ float ra;
+ float roff;
+};
+
+struct CmdImageRef // typed reference to a serialized CmdImage
+{
+ uint offset; // byte offset of the record in the shared memory buffer
+};
+
+struct CmdImage // decoded image command; same fields as the HLSL struct of the same name in this patch
+{
+ uint index;
+ int2 offset; // signed pixel displacement (the HLSL variant adds it to the pixel position before the atlas lookup)
+};
+
+struct CmdAlphaRef // typed reference to a serialized CmdAlpha
+{
+ uint offset; // byte offset of the record in the shared memory buffer
+};
+
+struct CmdAlpha // decoded constant-alpha command
+{
+ float alpha; // the HLSL variant copies this into every per-pixel coverage entry
+};
+
+struct CmdEndClipRef // typed reference to a serialized CmdEndClip
+{
+ uint offset; // byte offset of the record in the shared memory buffer
+};
+
+struct CmdEndClip // decoded end-clip command
+{
+ uint blend; // blend/compose selector (the HLSL variant passes it to mix_blend_compose as `mode`)
+};
+
+struct CmdJumpRef // typed reference to a serialized CmdJump
+{
+ uint offset; // byte offset of the record in the shared memory buffer
+};
+
+struct CmdJump // decoded jump command
+{
+ uint new_ref; // offset at which command reading continues (it becomes cmd_ref.offset in the kernels)
+};
+
+struct CmdRef // cursor into a tile's command list
+{
+ uint offset; // byte offset of the current command's tag word; Cmd_Stroke_read reads its payload from offset + 4
+};
+
+struct CmdTag // a command's leading word, split in two by Cmd_tag
+{
+ uint tag; // low 16 bits of the word
+ uint flags; // high 16 bits of the word
+};
+
+struct TileSegRef // typed reference to a serialized TileSeg
+{
+ uint offset; // byte offset; TileSeg_read shifts it right by 2 to get a word index
+};
+
+struct TileSeg // one line segment in a tile's linked list (six 32-bit words, see TileSeg_read)
+{
+ float2 origin; // words 0-1
+ float2 vector; // words 2-3
+ float y_edge; // word 4
+ TileSegRef next; // word 5: reference to the following segment
+};
+
+struct Memory // layout of the shared memory buffer that read_mem indexes
+{
+ uint mem_offset; // three uint header fields (12 bytes) precede the data,
+ uint mem_error; // which matches the "+ 12" byte offset in the HLSL read_mem
+ uint blend_offset;
+ uint memory[1]; // indexed well past one element by read_mem; declared with size 1 as a placeholder
+};
+
+struct Alloc_1 // same single-field layout as Alloc; this is the type used for Config's members
+{
+ uint offset;
+};
+
+struct Config // scene configuration; wrapped by ConfigBuf below
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles; // 4th uint (byte 12), the position the HLSL kernel loads as its tile-row stride
+ uint height_in_tiles;
+ Alloc_1 tile_alloc;
+ Alloc_1 bin_alloc;
+ Alloc_1 ptcl_alloc; // 8th uint (byte 28), the position the HLSL kernel loads as its command-list base
+ Alloc_1 pathseg_alloc;
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc;
+ Alloc_1 drawmonoid_alloc;
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc;
+ Alloc_1 draw_bbox_alloc;
+ Alloc_1 drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // buffer wrapper whose only member is the Config
+{
+ Config conf;
+};
+
+struct BlendBuf // buffer of packed colours; the kernel4.msl clip cases index it as blend_mem[base_ix + k]
+{
+ uint blend_mem[1]; // declared with size 1 as a placeholder; indexed beyond it at run time
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); // 8 x 4 x 1 threads per group, same as the HLSL variant
+
+static inline __attribute__((always_inline)) // returns a new Alloc that starts `offset` past `a`
+Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
+{
+ return Alloc{ a.offset + offset }; // `size` is not used here
+}
+
+static inline __attribute__((always_inline)) // access-check hook called by read_mem; both arguments are ignored here
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+ return true; // every access is reported as valid
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297)
+{
+    // touch_mem always succeeds in this file, so the early-out below never
+    // fires; the guard is kept so behaviour follows touch_mem if it changes.
+    if (!touch_mem(alloc, offset))
+    {
+        return 0u;
+    }
+    // `offset` is a word index into the data array that follows the header.
+    return v_297.memory[offset];
+}
+
+static inline __attribute__((always_inline))
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+    uint word_ix = ref.offset >> 2u; // byte offset -> word index
+    uint word = read_mem(a, word_ix, v_297);
+    CmdTag result{ word & 65535u, word >> 16u }; // tag = low 16 bits, flags = high 16 bits
+    return result;
+}
+
+static inline __attribute__((always_inline))
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297)
+{
+    // Decode a CmdStroke from two consecutive 32-bit words:
+    //   word 0 -> tile_ref (raw uint), word 1 -> half_width (float bits).
+    // `ref.offset` is a byte offset, so shift by two for the word index.
+    uint ix0 = ref.offset >> 2u;
+    uint ix1 = ix0 + 1u;
+    uint word0 = read_mem(a, ix0, v_297);
+    uint word1 = read_mem(a, ix1, v_297);
+    CmdStroke cmd;
+    cmd.tile_ref = word0;
+    cmd.half_width = as_type<float>(word1);
+    return cmd;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdStroke payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
+ return CmdStroke_read(param, param_1, v_297);
+}
+
+static inline __attribute__((always_inline)) // Builds an Alloc with the given offset; size and mem_ok are not used by this body.
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+{
+ Alloc a;
+ a.offset = offset;
+ return a;
+}
+
+static inline __attribute__((always_inline)) // Decodes a TileSeg from six consecutive uint words starting at index ref.offset >> 2.
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_297);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5, v_297);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7, v_297);
+ Alloc param_8 = a;
+ uint param_9 = ix + 4u;
+ uint raw4 = read_mem(param_8, param_9, v_297);
+ Alloc param_10 = a;
+ uint param_11 = ix + 5u;
+ uint raw5 = read_mem(param_10, param_11, v_297);
+ TileSeg s;
+ s.origin = float2(as_type<float>(raw0), as_type<float>(raw1)); // words 0-1 reinterpreted as floats
+ s.vector = float2(as_type<float>(raw2), as_type<float>(raw3)); // words 2-3 reinterpreted as floats
+ s.y_edge = as_type<float>(raw4); // word 4 reinterpreted as float
+ s.next = TileSegRef{ raw5 }; // word 5: offset of the next segment; main0 stops walking when it is 0
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Maps chunk index i to a pixel offset; for i in 0..7 (the range callers here use) x is 0 or 8 and y is 0, 4, 8 or 12.
+uint2 chunk_offset(thread const uint& i)
+{
+ return uint2((i % 2u) * 8u, (i / 2u) * 4u);
+}
+
+static inline __attribute__((always_inline)) // Decodes a CmdFill from two consecutive uint words starting at index ref.offset >> 2.
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_297);
+ CmdFill s;
+ s.tile_ref = raw0; // word 0: stored as-is
+ s.backdrop = int(raw1); // word 1: converted to signed int
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdFill payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
+ return CmdFill_read(param, param_1, v_297);
+}
+
+static inline __attribute__((always_inline)) // Decodes a CmdAlpha from the single uint word at index ref.offset >> 2.
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ CmdAlpha s;
+ s.alpha = as_type<float>(raw0); // bits reinterpreted as float
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdAlpha payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
+ return CmdAlpha_read(param, param_1, v_297);
+}
+
+static inline __attribute__((always_inline)) // Decodes a CmdColor from the single uint word at index ref.offset >> 2.
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ CmdColor s;
+ s.rgba_color = raw0; // kept packed; main0 expands it with unpacksRGB
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdColor payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
+ return CmdColor_read(param, param_1, v_297);
+}
+
+static inline __attribute__((always_inline)) // Returns its argument unchanged: no color conversion is performed in this generated variant.
+float3 fromsRGB(thread const float3& srgb)
+{
+ return srgb;
+}
+
+static inline __attribute__((always_inline)) // Expands a packed 4x8-bit unorm color into a float4 with the component order reversed.
+float4 unpacksRGB(thread const uint& srgba)
+{
+ float4 color = unpack_unorm4x8_to_float(srgba).wzyx; // .wzyx reverses the unpacked component order
+ float3 param = color.xyz;
+ return float4(fromsRGB(param), color.w); // fromsRGB is an identity here, so xyz pass through unchanged
+}
+
+static inline __attribute__((always_inline)) // Decodes a CmdLinGrad from four consecutive uint words starting at index ref.offset >> 2.
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_297);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5, v_297);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7, v_297);
+ CmdLinGrad s;
+ s.index = raw0; // word 0: main0 uses it as the y coordinate when reading the gradients texture
+ s.line_x = as_type<float>(raw1); // words 1-3 are reinterpreted as floats
+ s.line_y = as_type<float>(raw2);
+ s.line_c = as_type<float>(raw3);
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdLinGrad payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
+ return CmdLinGrad_read(param, param_1, v_297);
+}
+
+static inline __attribute__((always_inline)) // Decodes a CmdRadGrad from eleven consecutive uint words starting at index ref.offset >> 2.
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_297);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5, v_297);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7, v_297);
+ Alloc param_8 = a;
+ uint param_9 = ix + 4u;
+ uint raw4 = read_mem(param_8, param_9, v_297);
+ Alloc param_10 = a;
+ uint param_11 = ix + 5u;
+ uint raw5 = read_mem(param_10, param_11, v_297);
+ Alloc param_12 = a;
+ uint param_13 = ix + 6u;
+ uint raw6 = read_mem(param_12, param_13, v_297);
+ Alloc param_14 = a;
+ uint param_15 = ix + 7u;
+ uint raw7 = read_mem(param_14, param_15, v_297);
+ Alloc param_16 = a;
+ uint param_17 = ix + 8u;
+ uint raw8 = read_mem(param_16, param_17, v_297);
+ Alloc param_18 = a;
+ uint param_19 = ix + 9u;
+ uint raw9 = read_mem(param_18, param_19, v_297);
+ Alloc param_20 = a;
+ uint param_21 = ix + 10u;
+ uint raw10 = read_mem(param_20, param_21, v_297);
+ CmdRadGrad s;
+ s.index = raw0; // word 0: main0 uses it as the y coordinate when reading the gradients texture
+ s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4)); // words 1-4; main0 applies it as mat.xz * x + mat.yw * y
+ s.xlat = float2(as_type<float>(raw5), as_type<float>(raw6)); // words 5-6; subtracted after the mat transform in main0
+ s.c1 = float2(as_type<float>(raw7), as_type<float>(raw8)); // words 7-8
+ s.ra = as_type<float>(raw9); // word 9
+ s.roff = as_type<float>(raw10); // word 10
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdRadGrad payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
+ return CmdRadGrad_read(param, param_1, v_297);
+}
+
+static inline __attribute__((always_inline)) // Decodes a CmdImage from two consecutive uint words starting at index ref.offset >> 2.
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_297);
+ CmdImage s;
+ s.index = raw0;
+ s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); // x from the low 16 bits of raw1, y from the high 16 bits; each goes through a signed >> 16, which sign-extends it
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdImage payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
+ return CmdImage_read(param, param_1, v_297);
+}
+
+static inline __attribute__((always_inline)) // Reads eight texels from image_atlas, one per chunk offset, at xy + chunk_offset(i) + cmd_img.offset.
+spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, texture2d<float> image_atlas)
+{
+ spvUnsafeArray<float4, 8> rgba;
+ for (uint i = 0u; i < 8u; i++)
+ {
+ uint param = i;
+ int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
+ float4 fg_rgba = image_atlas.read(uint2(uv)); // NOTE(review): a negative uv becomes a large unsigned coordinate here — confirm callers keep it inside the atlas
+ float3 param_1 = fg_rgba.xyz;
+ float3 _1653 = fromsRGB(param_1); // identity in this variant; xyz are written back unchanged
+ fg_rgba.x = _1653.x;
+ fg_rgba.y = _1653.y;
+ fg_rgba.z = _1653.z;
+ rgba[i] = fg_rgba;
+ }
+ return rgba;
+}
+
+static inline __attribute__((always_inline)) // Returns its argument unchanged: no color conversion is performed in this generated variant.
+float3 tosRGB(thread const float3& rgb)
+{
+ return rgb;
+}
+
+static inline __attribute__((always_inline)) // Packs a float4 color into one uint as 4x8-bit unorm with the component order reversed (.wzyx), matching the .wzyx in unpacksRGB.
+uint packsRGB(thread float4& rgba)
+{
+ float3 param = rgba.xyz;
+ rgba = float4(tosRGB(param), rgba.w); // writes back through the non-const reference; tosRGB is an identity here, so the value is unchanged
+ return pack_float_to_unorm4x8(rgba.wzyx);
+}
+
+static inline __attribute__((always_inline)) // Decodes a CmdEndClip from the single uint word at index ref.offset >> 2.
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ CmdEndClip s;
+ s.blend = raw0; // main0 passes this value as the mode argument of mix_blend_compose
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdEndClip payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
+ return CmdEndClip_read(param, param_1, v_297);
+}
+
+static inline __attribute__((always_inline)) // Component-wise cb + cs - cb * cs.
+float3 screen(thread const float3& cb, thread const float3& cs)
+{
+ return (cb + cs) - (cb * cs);
+}
+
+static inline __attribute__((always_inline)) // Per component: 2 * cb * cs where cs <= 0.5, otherwise screen(cb, 2 * cs - 1).
+float3 hard_light(thread const float3& cb, thread const float3& cs)
+{
+ float3 param = cb;
+ float3 param_1 = (cs * 2.0) - float3(1.0);
+ return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5)); // select() takes the second argument where the condition is true
+}
+
+static inline __attribute__((always_inline)) // Scalar blend: 0 when cb == 0, else 1 when cs == 1, else min(1, cb / (1 - cs)).
+float color_dodge(thread const float& cb, thread const float& cs)
+{
+ if (cb == 0.0)
+ {
+ return 0.0;
+ }
+ else
+ {
+ if (cs == 1.0) // handled separately so the division below never sees 1 - cs == 0
+ {
+ return 1.0;
+ }
+ else
+ {
+ return fast::min(1.0, cb / (1.0 - cs));
+ }
+ }
+}
+
+static inline __attribute__((always_inline)) // Scalar blend: 1 when cb == 1, else 0 when cs == 0, else 1 - min(1, (1 - cb) / cs).
+float color_burn(thread const float& cb, thread const float& cs)
+{
+ if (cb == 1.0)
+ {
+ return 1.0;
+ }
+ else
+ {
+ if (cs == 0.0) // handled separately so the division below never sees cs == 0
+ {
+ return 0.0;
+ }
+ else
+ {
+ return 1.0 - fast::min(1.0, (1.0 - cb) / cs);
+ }
+ }
+}
+
+static inline __attribute__((always_inline)) // Per-component blend of cb and cs using the helper term d computed from cb.
+float3 soft_light(thread const float3& cb, thread const float3& cs)
+{
+ float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25)); // d = ((16cb - 12)cb + 4)cb where cb <= 0.25, else sqrt(cb)
+ return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5)); // cs <= 0.5: cb - (1 - 2cs)cb(1 - cb); otherwise cb + (2cs - 1)(d - cb)
+}
+
+static inline __attribute__((always_inline)) // Returns the largest component of c minus the smallest component.
+float sat(thread const float3& c)
+{
+ return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z));
+}
+
+static inline __attribute__((always_inline)) // Rewrites (cmin, cmid, cmax) in place so that cmin = 0 and cmax = s (or all three are 0 when cmax <= cmin).
+void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s)
+{
+ if (cmax > cmin)
+ {
+ cmid = ((cmid - cmin) * s) / (cmax - cmin); // rescale the middle value; the guard above keeps the divisor positive
+ cmax = s;
+ }
+ else
+ {
+ cmid = 0.0;
+ cmax = 0.0;
+ }
+ cmin = 0.0; // executed on both paths
+}
+
+static inline __attribute__((always_inline)) // Orders the components of c by comparison, applies set_sat_inner to (min, mid, max) via temporaries, writes them back, and returns c (c is modified in place).
+float3 set_sat(thread float3& c, thread const float& s)
+{
+ if (c.x <= c.y)
+ {
+ if (c.y <= c.z) // x <= y <= z
+ {
+ float param = c.x;
+ float param_1 = c.y;
+ float param_2 = c.z;
+ float param_3 = s;
+ set_sat_inner(param, param_1, param_2, param_3);
+ c.x = param;
+ c.y = param_1;
+ c.z = param_2;
+ }
+ else
+ {
+ if (c.x <= c.z) // x <= z < y
+ {
+ float param_4 = c.x;
+ float param_5 = c.z;
+ float param_6 = c.y;
+ float param_7 = s;
+ set_sat_inner(param_4, param_5, param_6, param_7);
+ c.x = param_4;
+ c.z = param_5;
+ c.y = param_6;
+ }
+ else // z < x <= y
+ {
+ float param_8 = c.z;
+ float param_9 = c.x;
+ float param_10 = c.y;
+ float param_11 = s;
+ set_sat_inner(param_8, param_9, param_10, param_11);
+ c.z = param_8;
+ c.x = param_9;
+ c.y = param_10;
+ }
+ }
+ }
+ else
+ {
+ if (c.x <= c.z) // y < x <= z
+ {
+ float param_12 = c.y;
+ float param_13 = c.x;
+ float param_14 = c.z;
+ float param_15 = s;
+ set_sat_inner(param_12, param_13, param_14, param_15);
+ c.y = param_12;
+ c.x = param_13;
+ c.z = param_14;
+ }
+ else
+ {
+ if (c.y <= c.z) // y <= z < x
+ {
+ float param_16 = c.y;
+ float param_17 = c.z;
+ float param_18 = c.x;
+ float param_19 = s;
+ set_sat_inner(param_16, param_17, param_18, param_19);
+ c.y = param_16;
+ c.z = param_17;
+ c.x = param_18;
+ }
+ else // z < y < x
+ {
+ float param_20 = c.z;
+ float param_21 = c.y;
+ float param_22 = c.x;
+ float param_23 = s;
+ set_sat_inner(param_20, param_21, param_22, param_23);
+ c.z = param_20;
+ c.y = param_21;
+ c.x = param_22;
+ }
+ }
+ }
+ return c;
+}
+
+static inline __attribute__((always_inline)) // Weighted sum of the components of c with weights (0.3, 0.59, 0.11).
+float lum(thread const float3& c)
+{
+ float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375); // float renderings of 0.3, 0.59, 0.11
+ return dot(c, f);
+}
+
+static inline __attribute__((always_inline)) // Rescales c around its lum() value when a component is below 0 or above 1; modifies c in place and returns it.
+float3 clip_color(thread float3& c)
+{
+ float3 param = c;
+ float L = lum(param); // L, n and x are all computed once from the incoming c and are not refreshed after the first adjustment
+ float n = fast::min(c.x, fast::min(c.y, c.z));
+ float x = fast::max(c.x, fast::max(c.y, c.z));
+ if (n < 0.0)
+ {
+ c = float3(L) + (((c - float3(L)) * L) / float3(L - n));
+ }
+ if (x > 1.0)
+ {
+ c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L));
+ }
+ return c;
+}
+
+static inline __attribute__((always_inline)) // Adds (l - lum(c)) to every component of c, then passes the result through clip_color.
+float3 set_lum(thread const float3& c, thread const float& l)
+{
+ float3 param = c;
+ float3 param_1 = c + float3(l - lum(param));
+ float3 _1048 = clip_color(param_1); // clip_color mutates the local copy param_1, not the caller's c
+ return _1048;
+}
+
+static inline __attribute__((always_inline)) // Returns the blend of cb and cs chosen by mode (1..15); any other mode returns cs.
+float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode)
+{
+ float3 b = float3(0.0);
+ switch (mode)
+ {
+ case 1u: // component-wise product
+ {
+ b = cb * cs;
+ break;
+ }
+ case 2u: // screen(cb, cs)
+ {
+ float3 param = cb;
+ float3 param_1 = cs;
+ b = screen(param, param_1);
+ break;
+ }
+ case 3u: // hard_light with the arguments swapped: hard_light(cs, cb)
+ {
+ float3 param_2 = cs;
+ float3 param_3 = cb;
+ b = hard_light(param_2, param_3);
+ break;
+ }
+ case 4u: // component-wise minimum
+ {
+ b = fast::min(cb, cs);
+ break;
+ }
+ case 5u: // component-wise maximum
+ {
+ b = fast::max(cb, cs);
+ break;
+ }
+ case 6u: // color_dodge applied to each component
+ {
+ float param_4 = cb.x;
+ float param_5 = cs.x;
+ float param_6 = cb.y;
+ float param_7 = cs.y;
+ float param_8 = cb.z;
+ float param_9 = cs.z;
+ b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+ break;
+ }
+ case 7u: // color_burn applied to each component
+ {
+ float param_10 = cb.x;
+ float param_11 = cs.x;
+ float param_12 = cb.y;
+ float param_13 = cs.y;
+ float param_14 = cb.z;
+ float param_15 = cs.z;
+ b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+ break;
+ }
+ case 8u: // hard_light(cb, cs)
+ {
+ float3 param_16 = cb;
+ float3 param_17 = cs;
+ b = hard_light(param_16, param_17);
+ break;
+ }
+ case 9u: // soft_light(cb, cs)
+ {
+ float3 param_18 = cb;
+ float3 param_19 = cs;
+ b = soft_light(param_18, param_19);
+ break;
+ }
+ case 10u: // absolute difference
+ {
+ b = abs(cb - cs);
+ break;
+ }
+ case 11u: // cb + cs - 2 * cb * cs
+ {
+ b = (cb + cs) - ((cb * 2.0) * cs);
+ break;
+ }
+ case 12u: // set_lum(set_sat(cs, sat(cb)), lum(cb))
+ {
+ float3 param_20 = cb;
+ float3 param_21 = cs;
+ float param_22 = sat(param_20);
+ float3 _1340 = set_sat(param_21, param_22);
+ float3 param_23 = cb;
+ float3 param_24 = _1340;
+ float param_25 = lum(param_23);
+ b = set_lum(param_24, param_25);
+ break;
+ }
+ case 13u: // set_lum(set_sat(cb, sat(cs)), lum(cb))
+ {
+ float3 param_26 = cs;
+ float3 param_27 = cb;
+ float param_28 = sat(param_26);
+ float3 _1354 = set_sat(param_27, param_28);
+ float3 param_29 = cb;
+ float3 param_30 = _1354;
+ float param_31 = lum(param_29);
+ b = set_lum(param_30, param_31);
+ break;
+ }
+ case 14u: // set_lum(cs, lum(cb))
+ {
+ float3 param_32 = cb;
+ float3 param_33 = cs;
+ float param_34 = lum(param_32);
+ b = set_lum(param_33, param_34);
+ break;
+ }
+ case 15u: // set_lum(cb, lum(cs))
+ {
+ float3 param_35 = cs;
+ float3 param_36 = cb;
+ float param_37 = lum(param_35);
+ b = set_lum(param_36, param_37);
+ break;
+ }
+ default: // mode 0 and any value above 15: the result is cs unchanged
+ {
+ b = cs;
+ break;
+ }
+ }
+ return b;
+}
+
+static inline __attribute__((always_inline)) // Combines cs/as and cb/ab as (cs*as*fa + cb*ab*fb, as*fa + ab*fb), with the coefficients fa and fb selected by mode.
+float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode)
+{
+ float fa = 0.0;
+ float fb = 0.0;
+ switch (mode) // each case only selects (fa, fb), except 13 which returns directly
+ {
+ case 1u:
+ {
+ fa = 1.0;
+ fb = 0.0;
+ break;
+ }
+ case 2u:
+ {
+ fa = 0.0;
+ fb = 1.0;
+ break;
+ }
+ case 3u:
+ {
+ fa = 1.0;
+ fb = 1.0 - as;
+ break;
+ }
+ case 4u:
+ {
+ fa = 1.0 - ab;
+ fb = 1.0;
+ break;
+ }
+ case 5u:
+ {
+ fa = ab;
+ fb = 0.0;
+ break;
+ }
+ case 6u:
+ {
+ fa = 0.0;
+ fb = as;
+ break;
+ }
+ case 7u:
+ {
+ fa = 1.0 - ab;
+ fb = 0.0;
+ break;
+ }
+ case 8u:
+ {
+ fa = 0.0;
+ fb = 1.0 - as;
+ break;
+ }
+ case 9u:
+ {
+ fa = ab;
+ fb = 1.0 - as;
+ break;
+ }
+ case 10u:
+ {
+ fa = 1.0 - ab;
+ fb = as;
+ break;
+ }
+ case 11u:
+ {
+ fa = 1.0 - ab;
+ fb = 1.0 - as;
+ break;
+ }
+ case 12u:
+ {
+ fa = 1.0;
+ fb = 1.0;
+ break;
+ }
+ case 13u: // early return: alpha-weighted sum of colors and sum of alphas, each component clamped to at most 1
+ {
+ return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
+ }
+ default: // fa and fb stay 0, so the result below is all zeros
+ {
+ break;
+ }
+ }
+ float as_fa = as * fa;
+ float ab_fb = ab * fb;
+ float3 co = (cs * as_fa) + (cb * ab_fb);
+ return float4(co, as_fa + ab_fb);
+}
+
+static inline __attribute__((always_inline)) // Blends src over backdrop: mode >> 8 selects the mix_blend mode and mode & 255 selects the compose mode.
+float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode)
+{
+ if ((mode & 32767u) == 3u) // fast path when the low 15 bits of mode equal 3: backdrop * (1 - src.w) + src
+ {
+ return (backdrop * (1.0 - src.w)) + src;
+ }
+ float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15); // the ~1e-15 term keeps the divisor non-zero when alpha is 0
+ float3 cs = src.xyz * inv_src_a; // color divided by its alpha
+ float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15);
+ float3 cb = backdrop.xyz * inv_backdrop_a;
+ uint blend_mode = mode >> uint(8);
+ float3 param = cb;
+ float3 param_1 = cs;
+ uint param_2 = blend_mode;
+ float3 blended = mix_blend(param, param_1, param_2);
+ cs = mix(cs, blended, float3(backdrop.w)); // interpolate from cs toward the blended color by the backdrop alpha
+ uint comp_mode = mode & 255u;
+ if (comp_mode == 3u) // compose mode 3 is handled inline instead of through mix_compose
+ {
+ float3 co = mix(backdrop.xyz, cs, float3(src.w));
+ return float4(co, src.w + (backdrop.w * (1.0 - src.w)));
+ }
+ else
+ {
+ float3 param_3 = cb;
+ float3 param_4 = cs;
+ float param_5 = backdrop.w;
+ float param_6 = src.w;
+ uint param_7 = comp_mode;
+ return mix_compose(param_3, param_4, param_5, param_6, param_7);
+ }
+}
+
+static inline __attribute__((always_inline)) // Decodes a CmdJump from the single uint word at index ref.offset >> 2.
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297)
+{
+ uint ix = ref.offset >> uint(2);
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_297);
+ CmdJump s;
+ s.new_ref = raw0; // main0 uses this as the next cmd_ref offset
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdJump payload that starts at ref.offset + 4u, i.e. one uint word after the word Cmd_tag reads.
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
+{
+ Alloc param = a;
+ CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
+ return CmdJump_read(param, param_1, v_297);
+}
+
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], device BlendBuf& _2506 [[buffer(2)]], texture2d<float, access::write> image [[texture(3)]], texture2d<float> image_atlas [[texture(4)]], texture2d<float> gradients [[texture(5)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) // Kernel entry point: interprets one tile's command list from v_297 and writes 8 pixels per thread to image.
+{
+ uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x; // row-major tile index for this workgroup
+ Alloc param;
+ param.offset = _1681.conf.ptcl_alloc.offset;
+ uint param_1 = tile_ix * 1024u; // each tile's initial command slice is 1024 wide (same unit as the offsets)
+ uint param_2 = 1024u;
+ Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+ CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+ uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; // first word of the list is the base offset used when spilling to blend_mem
+ cmd_ref.offset += 4u; // commands start one word later
+ uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); // workgroups are spaced 16 pixels apart on each axis
+ float2 xy = float2(xy_uint);
+ spvUnsafeArray<float4, 8> rgba; // accumulated color for this thread's 8 pixels (one per chunk_offset)
+ for (uint i = 0u; i < 8u; i++)
+ {
+ rgba[i] = float4(0.0);
+ }
+ uint clip_depth = 0u;
+ spvUnsafeArray<float, 8> df;
+ TileSegRef tile_seg_ref;
+ spvUnsafeArray<float, 8> area; // set by tags 1-4 and consumed by the draw tags 5-8 and by end-clip (10)
+ spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack; // per-thread storage for the first 4 clip levels; deeper levels go to _2506.blend_mem
+ uint base_ix_1;
+ uint bg_rgba;
+ while (true)
+ {
+ Alloc param_3 = cmd_alloc;
+ CmdRef param_4 = cmd_ref;
+ uint tag = Cmd_tag(param_3, param_4, v_297).tag;
+ if (tag == 0u) // tag 0 terminates the command list
+ {
+ break;
+ }
+ switch (tag) // tags without a case fall through the switch without advancing cmd_ref
+ {
+ case 2u: // stroke: area from distance to the nearest segment
+ {
+ Alloc param_5 = cmd_alloc;
+ CmdRef param_6 = cmd_ref;
+ CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297);
+ for (uint k = 0u; k < 8u; k++)
+ {
+ df[k] = 1000000000.0; // large initial distance so the first min() always replaces it
+ }
+ tile_seg_ref = TileSegRef{ stroke.tile_ref };
+ do // walk the segment list until a next offset of 0
+ {
+ uint param_7 = tile_seg_ref.offset;
+ uint param_8 = 24u;
+ bool param_9 = true;
+ Alloc param_10 = new_alloc(param_7, param_8, param_9);
+ TileSegRef param_11 = tile_seg_ref;
+ TileSeg seg = TileSeg_read(param_10, param_11, v_297);
+ float2 line_vec = seg.vector;
+ for (uint k_1 = 0u; k_1 < 8u; k_1++)
+ {
+ float2 dpos = (xy + float2(0.5)) - seg.origin; // offset by 0.5 on both axes before measuring
+ uint param_12 = k_1;
+ dpos += float2(chunk_offset(param_12));
+ float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); // projection parameter clamped to the segment
+ df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos));
+ }
+ tile_seg_ref = seg.next;
+ } while (tile_seg_ref.offset != 0u);
+ for (uint k_2 = 0u; k_2 < 8u; k_2++)
+ {
+ area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0);
+ }
+ cmd_ref.offset += 12u;
+ break;
+ }
+ case 1u: // fill: area accumulated from the backdrop value and each segment
+ {
+ Alloc param_13 = cmd_alloc;
+ CmdRef param_14 = cmd_ref;
+ CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297);
+ for (uint k_3 = 0u; k_3 < 8u; k_3++)
+ {
+ area[k_3] = float(fill.backdrop);
+ }
+ tile_seg_ref = TileSegRef{ fill.tile_ref };
+ do // walk the segment list until a next offset of 0
+ {
+ uint param_15 = tile_seg_ref.offset;
+ uint param_16 = 24u;
+ bool param_17 = true;
+ Alloc param_18 = new_alloc(param_15, param_16, param_17);
+ TileSegRef param_19 = tile_seg_ref;
+ TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297);
+ for (uint k_4 = 0u; k_4 < 8u; k_4++)
+ {
+ uint param_20 = k_4;
+ float2 my_xy = xy + float2(chunk_offset(param_20));
+ float2 start = seg_1.origin - my_xy;
+ float2 end = start + seg_1.vector;
+ float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0)); // segment's y extent clamped to [0, 1] relative to this pixel
+ if ((isunordered(window.x, window.y) || window.x != window.y)) // true when the clamped extents differ or either is NaN
+ {
+ float2 t_1 = (window - float2(start.y)) / float2(seg_1.vector.y);
+ float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y));
+ float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07; // the ~1e-6 bias keeps xmax - xmin below from being zero
+ float xmax = fast::max(xs.x, xs.y);
+ float b = fast::min(xmax, 1.0);
+ float c = fast::max(b, 0.0);
+ float d = fast::max(xmin, 0.0);
+ float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin);
+ area[k_4] += (a * (window.x - window.y));
+ }
+ area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0));
+ }
+ tile_seg_ref = seg_1.next;
+ } while (tile_seg_ref.offset != 0u);
+ for (uint k_5 = 0u; k_5 < 8u; k_5++)
+ {
+ area[k_5] = fast::min(abs(area[k_5]), 1.0); // final area is min(|accumulated|, 1)
+ }
+ cmd_ref.offset += 12u;
+ break;
+ }
+ case 3u: // area = 1 for all 8 pixels
+ {
+ for (uint k_6 = 0u; k_6 < 8u; k_6++)
+ {
+ area[k_6] = 1.0;
+ }
+ cmd_ref.offset += 4u;
+ break;
+ }
+ case 4u: // area = the command's alpha for all 8 pixels
+ {
+ Alloc param_21 = cmd_alloc;
+ CmdRef param_22 = cmd_ref;
+ CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297);
+ for (uint k_7 = 0u; k_7 < 8u; k_7++)
+ {
+ area[k_7] = alpha.alpha;
+ }
+ cmd_ref.offset += 8u;
+ break;
+ }
+ case 5u: // solid color scaled by area, composited over rgba
+ {
+ Alloc param_23 = cmd_alloc;
+ CmdRef param_24 = cmd_ref;
+ CmdColor color = Cmd_Color_read(param_23, param_24, v_297);
+ uint param_25 = color.rgba_color;
+ float4 fg = unpacksRGB(param_25);
+ for (uint k_8 = 0u; k_8 < 8u; k_8++)
+ {
+ float4 fg_k = fg * area[k_8];
+ rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k; // rgba * (1 - fg alpha) + fg
+ }
+ cmd_ref.offset += 8u;
+ break;
+ }
+ case 6u: // linear gradient: color looked up from the gradients texture
+ {
+ Alloc param_26 = cmd_alloc;
+ CmdRef param_27 = cmd_ref;
+ CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297);
+ float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
+ for (uint k_9 = 0u; k_9 < 8u; k_9++)
+ {
+ uint param_28 = k_9;
+ float2 chunk_xy = float2(chunk_offset(param_28));
+ float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y);
+ int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); // clamped parameter mapped to a column in 0..511
+ float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); // row = lin.index
+ float3 param_29 = fg_rgba.xyz;
+ float3 _2257 = fromsRGB(param_29);
+ fg_rgba.x = _2257.x;
+ fg_rgba.y = _2257.y;
+ fg_rgba.z = _2257.z;
+ float4 fg_k_1 = fg_rgba * area[k_9];
+ rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
+ }
+ cmd_ref.offset += 20u;
+ break;
+ }
+ case 7u: // radial gradient: color looked up from the gradients texture
+ {
+ Alloc param_30 = cmd_alloc;
+ CmdRef param_31 = cmd_ref;
+ CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297);
+ for (uint k_10 = 0u; k_10 < 8u; k_10++)
+ {
+ uint param_32 = k_10;
+ float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+ my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; // 2x2 transform (columns mat.xz and mat.yw), then subtract xlat
+ float ba = dot(my_xy_1, rad.c1);
+ float ca = rad.ra * dot(my_xy_1, my_xy_1);
+ float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+ int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); // clamped parameter mapped to a column in 0..511
+ float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); // row = rad.index
+ float3 param_33 = fg_rgba_1.xyz;
+ float3 _2367 = fromsRGB(param_33);
+ fg_rgba_1.x = _2367.x;
+ fg_rgba_1.y = _2367.y;
+ fg_rgba_1.z = _2367.z;
+ float4 fg_k_2 = fg_rgba_1 * area[k_10];
+ rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
+ }
+ cmd_ref.offset += 48u;
+ break;
+ }
+ case 8u: // image: texels fetched by fillImage, scaled by area, composited over rgba
+ {
+ Alloc param_34 = cmd_alloc;
+ CmdRef param_35 = cmd_ref;
+ CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297);
+ uint2 param_36 = xy_uint;
+ CmdImage param_37 = fill_img;
+ spvUnsafeArray<float4, 8> img;
+ img = fillImage(param_36, param_37, image_atlas);
+ for (uint k_11 = 0u; k_11 < 8u; k_11++)
+ {
+ float4 fg_k_3 = img[k_11] * area[k_11];
+ rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3;
+ }
+ cmd_ref.offset += 12u;
+ break;
+ }
+ case 9u: // begin clip: save the current rgba (packed) and restart from transparent
+ {
+ if (clip_depth < 4u) // the first 4 levels are kept in the per-thread blend_stack
+ {
+ for (uint k_12 = 0u; k_12 < 8u; k_12++)
+ {
+ float4 param_38 = float4(rgba[k_12]);
+ uint _2472 = packsRGB(param_38);
+ blend_stack[clip_depth][k_12] = _2472;
+ rgba[k_12] = float4(0.0);
+ }
+ }
+ else // deeper levels spill to the blend buffer
+ {
+ uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); // 256 words per spilled level, 8 consecutive words per thread
+ for (uint k_13 = 0u; k_13 < 8u; k_13++)
+ {
+ float4 param_39 = float4(rgba[k_13]);
+ uint _2519 = packsRGB(param_39);
+ _2506.blend_mem[base_ix + k_13] = _2519;
+ rgba[k_13] = float4(0.0);
+ }
+ }
+ clip_depth++;
+ cmd_ref.offset += 4u;
+ break;
+ }
+ case 10u: // end clip: reload the saved backdrop and blend the current rgba * area onto it
+ {
+ Alloc param_40 = cmd_alloc;
+ CmdRef param_41 = cmd_ref;
+ CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
+ clip_depth--; // NOTE(review): unsigned; would wrap if an end-clip arrived at depth 0 — presumably the command stream always pairs begin/end, confirm
+ if (clip_depth >= 4u)
+ {
+ base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); // same index formula as the begin-clip spill
+ }
+ for (uint k_14 = 0u; k_14 < 8u; k_14++)
+ {
+ if (clip_depth < 4u)
+ {
+ bg_rgba = blend_stack[clip_depth][k_14];
+ }
+ else
+ {
+ bg_rgba = _2506.blend_mem[base_ix_1 + k_14];
+ }
+ uint param_42 = bg_rgba;
+ float4 bg = unpacksRGB(param_42);
+ float4 fg_1 = rgba[k_14] * area[k_14];
+ float4 param_43 = bg;
+ float4 param_44 = fg_1;
+ uint param_45 = end_clip.blend;
+ rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
+ }
+ cmd_ref.offset += 8u;
+ break;
+ }
+ case 11u: // jump: continue reading commands at new_ref
+ {
+ Alloc param_46 = cmd_alloc;
+ CmdRef param_47 = cmd_ref;
+ cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
+ cmd_alloc.offset = cmd_ref.offset;
+ break;
+ }
+ }
+ }
+ for (uint i_1 = 0u; i_1 < 8u; i_1++)
+ {
+ uint param_48 = i_1;
+ image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); // only the w component is written, replicated into all four channels
+ }
+}
+
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
new file mode 100644
index 0000000..f5526be
--- /dev/null
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil
new file mode 100644
index 0000000..81e44ef
--- /dev/null
+++ b/piet-gpu/shader/gen/path_coarse.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl
new file mode 100644
index 0000000..0e5b5db
--- /dev/null
+++ b/piet-gpu/shader/gen/path_coarse.hlsl
@@ -0,0 +1,672 @@
+struct Alloc // Handle describing a region of the shared memory buffer; carries only its start.
+{
+ uint offset; // Start of the region (size is not stored in this variant).
+};
+
+struct PathCubicRef // Typed reference to a PathCubic record in the memory buffer.
+{
+ uint offset; // Byte offset; PathCubic_read shifts it right by 2 to get a word index.
+};
+
+struct PathCubic // Decoded cubic path segment (12 words in memory, see PathCubic_read).
+{
+ float2 p0; // First control point (curve start).
+ float2 p1; // Second control point.
+ float2 p2; // Third control point.
+ float2 p3; // Fourth control point (curve end).
+ uint path_ix; // Index of the owning Path record (multiplied by 12 bytes in comp_main).
+ uint trans_ix; // Decoded but not read anywhere in this shader.
+ float2 stroke; // Per-axis margin added around each flattened line's bounding box.
+};
+
+struct PathSegRef // Typed reference to a tagged path segment record.
+{
+ uint offset; // Byte offset of the tag word; the payload follows 4 bytes later.
+};
+
+struct PathSegTag // Tag word of a path segment split into its two 16-bit halves.
+{
+ uint tag; // Low 16 bits of the tag word; comp_main only handles the value 1.
+ uint flags; // High 16 bits of the tag word; bit 0 is read by fill_mode_from_flags.
+};
+
+struct TileRef // Typed reference to a tile record (8 bytes each, see Tile_index).
+{
+ uint offset; // Byte offset of the tile record.
+};
+
+struct PathRef // Typed reference to a Path record (3 words, see Path_read).
+{
+ uint offset; // Byte offset of the Path record.
+};
+
+struct Path // Decoded per-path record: bounding box plus the start of its tile array.
+{
+ uint4 bbox; // Four 16-bit values unpacked by Path_read; x/z bound columns and y/w bound rows in comp_main.
+ TileRef tiles; // Reference to the first tile record of this path.
+};
+
+struct TileSegRef // Typed reference to a TileSeg record (6 words, see TileSeg_write).
+{
+ uint offset; // Byte offset of the TileSeg record.
+};
+
+struct TileSeg // One line segment recorded against a single tile.
+{
+ float2 origin; // Start point of the (possibly clipped) segment.
+ float2 _vector; // Displacement from origin to the segment end.
+ float y_edge; // Y of the crossing with the tile's left edge, 0 for strokes, or 1e9 as a sentinel (see comp_main).
+ TileSegRef next; // Previous head of the tile's segment list.
+};
+
+struct SubdivResult // Output of estimate_subdiv for one quadratic.
+{
+ float val; // Subdivision estimate; stays 0 when the scale test in estimate_subdiv fails.
+ float a0; // approx_parabola_integral evaluated at x0.
+ float a2; // approx_parabola_integral evaluated at x2.
+};
+
+struct Config // Layout of the configuration buffer _711; this shader reads it via raw byte offsets (assuming packed 4-byte fields).
+{
+ uint mem_size; // Byte 0: read with _711.Load(0) as the allocation limit.
+ uint n_elements;
+ uint n_pathseg; // Byte 8: read with _711.Load(8) as the number of path segments.
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc; // Byte 20: read with _711.Load(20) as the base of the Path records.
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc; // Byte 32: read with _711.Load(32) as the base of the path segments.
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u); // Matches the numthreads(32, 1, 1) attribute on main.
+
+static const PathSegTag _722 = { 0u, 0u }; // Default tag used when the invocation index is past the segment count.
+
+RWByteAddressBuffer _143 : register(u0, space0); // Shared memory buffer: words at bytes 0 and 4 are updated atomically, data words start at byte 12.
+ByteAddressBuffer _711 : register(t1, space0); // Read-only configuration buffer (see struct Config).
+
+static uint3 gl_GlobalInvocationID; // Copied from the stage input in main so comp_main can read it.
+struct SPIRV_Cross_Input // Stage input wrapper for the dispatch thread id.
+{
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+static bool mem_ok; // Set false in comp_main once an allocation returns offset 0; gates TileSeg writes.
+
+bool check_deps(uint dep_stage) // True when none of the bits in dep_stage are set in the word at byte 4 of _143.
+{
+ uint _149;
+ _143.InterlockedOr(4, 0u, _149); // OR with 0 leaves the word unchanged; used only to fetch it atomically.
+ return (_149 & dep_stage) == 0u;
+}
+
+bool touch_mem(Alloc alloc, uint offset) // Access guard; this variant ignores both arguments and always allows the access.
+{
+ return true;
+}
+
+uint read_mem(Alloc alloc, uint offset) // Returns data word number `offset` of _143, or 0 if touch_mem rejects the access.
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // Never taken here because touch_mem always returns true.
+ {
+ return 0u;
+ }
+ uint v = _143.Load(offset * 4 + 12); // Word index to byte address; data words begin 12 bytes into the buffer.
+ return v;
+}
+
+PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) // Reads the tag word at ref and splits it into tag (low 16 bits) and flags (high 16 bits).
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2); // Byte offset to word index.
+ uint tag_and_flags = read_mem(param, param_1);
+ PathSegTag _362 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+ return _362;
+}
+
+PathCubic PathCubic_read(Alloc a, PathCubicRef ref) // Reads 12 consecutive words starting at ref and decodes them into a PathCubic.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7);
+ Alloc param_8 = a;
+ uint param_9 = ix + 4u;
+ uint raw4 = read_mem(param_8, param_9);
+ Alloc param_10 = a;
+ uint param_11 = ix + 5u;
+ uint raw5 = read_mem(param_10, param_11);
+ Alloc param_12 = a;
+ uint param_13 = ix + 6u;
+ uint raw6 = read_mem(param_12, param_13);
+ Alloc param_14 = a;
+ uint param_15 = ix + 7u;
+ uint raw7 = read_mem(param_14, param_15);
+ Alloc param_16 = a;
+ uint param_17 = ix + 8u;
+ uint raw8 = read_mem(param_16, param_17);
+ Alloc param_18 = a;
+ uint param_19 = ix + 9u;
+ uint raw9 = read_mem(param_18, param_19);
+ Alloc param_20 = a;
+ uint param_21 = ix + 10u;
+ uint raw10 = read_mem(param_20, param_21);
+ Alloc param_22 = a;
+ uint param_23 = ix + 11u;
+ uint raw11 = read_mem(param_22, param_23);
+ PathCubic s;
+ s.p0 = float2(asfloat(raw0), asfloat(raw1)); // Words 0-7 are reinterpreted as floats: four (x, y) control points.
+ s.p1 = float2(asfloat(raw2), asfloat(raw3));
+ s.p2 = float2(asfloat(raw4), asfloat(raw5));
+ s.p3 = float2(asfloat(raw6), asfloat(raw7));
+ s.path_ix = raw8; // Words 8 and 9 stay as unsigned integers.
+ s.trans_ix = raw9;
+ s.stroke = float2(asfloat(raw10), asfloat(raw11)); // Words 10-11 are reinterpreted as floats.
+ return s;
+}
+
+PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) // Reads the PathCubic payload stored right after the segment's tag word.
+{
+ PathCubicRef _368 = { ref.offset + 4u }; // Skip the 4-byte tag word.
+ Alloc param = a;
+ PathCubicRef param_1 = _368;
+ return PathCubic_read(param, param_1);
+}
+
+float2 eval_cubic(float2 p0, float2 p1, float2 p2, float2 p3, float t) // Evaluates the cubic Bezier with control points p0..p3 at parameter t.
+{
+ float mt = 1.0f - t;
+ return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0f)) + (((p2 * (mt * 3.0f)) + (p3 * t)) * t)) * t); // Expands to p0*mt^3 + 3*p1*mt^2*t + 3*p2*mt*t^2 + p3*t^3.
+}
+
+float approx_parabola_integral(float x) // Closed-form approximation (per its name, of a parabola integral): x * (c0 + c1 + x^2/4)^(-1/4).
+{
+ return x * rsqrt(sqrt(0.3300000131130218505859375f + (0.201511204242706298828125f + ((0.25f * x) * x)))); // rsqrt(sqrt(v)) is v to the power -1/4.
+}
+
+SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol) // Computes the subdivision estimate val and the integral endpoints a0/a2 for the quadratic p0, p1, p2.
+{
+ float2 d01 = p1 - p0; // First differences of the control points.
+ float2 d12 = p2 - p1;
+ float2 dd = d01 - d12; // Negated second difference.
+ float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); // 2D cross product of the chord (p2 - p0) with dd.
+ float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; // dot(d01, dd) / cross; no guard against cross == 0.
+ float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; // dot(d12, dd) / cross.
+ float scale = abs(_cross / (length(dd) * (x2 - x0)));
+ float param = x0;
+ float a0 = approx_parabola_integral(param);
+ float param_1 = x2;
+ float a2 = approx_parabola_integral(param_1);
+ float val = 0.0f; // Stays 0 unless the scale test below passes.
+ if (scale < 1000000000.0f) // Also false when scale is NaN, so degenerate inputs keep val == 0.
+ {
+ float da = abs(a2 - a0);
+ float sqrt_scale = sqrt(scale);
+ if (sign(x0) == sign(x2)) // Both endpoints on the same side of x == 0.
+ {
+ val = da * sqrt_scale;
+ }
+ else
+ {
+ float xmin = sqrt_tol / sqrt_scale; // Alternative formula when the range straddles x == 0.
+ float param_2 = xmin;
+ val = (sqrt_tol * da) / approx_parabola_integral(param_2);
+ }
+ }
+ SubdivResult _690 = { val, a0, a2 };
+ return _690;
+}
+
+uint fill_mode_from_flags(uint flags) // Extracts bit 0 of flags; comp_main treats the value 1 as a stroke.
+{
+ return flags & 1u;
+}
+
+Path Path_read(Alloc a, PathRef ref) // Reads 3 consecutive words at ref and decodes them into a Path.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5);
+ Path s;
+ s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); // Each word packs two 16-bit values: low half first, high half second.
+ TileRef _422 = { raw2 }; // Third word is the tile array reference.
+ s.tiles = _422;
+ return s;
+}
+
+Alloc new_alloc(uint offset, uint size, bool mem_ok_1) // Builds an Alloc from offset; size and mem_ok_1 are ignored in this variant.
+{
+ Alloc a;
+ a.offset = offset;
+ return a;
+}
+
+float approx_parabola_inv_integral(float x) // Closed-form approximation (per its name, the inverse of the parabola integral): x * sqrt(c0 + c1 + x^2/4).
+{
+ return x * sqrt(0.61000001430511474609375f + (0.1520999968051910400390625f + ((0.25f * x) * x)));
+}
+
+float2 eval_quad(float2 p0, float2 p1, float2 p2, float t) // Evaluates the quadratic Bezier with control points p0..p2 at parameter t.
+{
+ float mt = 1.0f - t;
+ return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t); // Expands to p0*mt^2 + 2*p1*mt*t + p2*t^2.
+}
+
+uint malloc_stage(uint size, uint mem_size, uint stage) // Bump-allocates size bytes; returns the old counter value, or 0 after flagging `stage` on overflow.
+{
+ uint _158;
+ _143.InterlockedAdd(0, size, _158); // Atomically advance the counter at byte 0 and fetch its previous value.
+ uint offset = _158;
+ if ((offset + size) > mem_size) // Request would run past the limit supplied by the caller.
+ {
+ uint _168;
+ _143.InterlockedOr(4, stage, _168); // Record the failing stage bits in the word at byte 4 (the word check_deps reads).
+ offset = 0u; // 0 is the failure value callers test for.
+ }
+ return offset;
+}
+
+TileRef Tile_index(TileRef ref, uint index) // Returns a reference to the index-th tile after ref, with 8 bytes per tile.
+{
+ TileRef _380 = { ref.offset + (index * 8u) };
+ return _380;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val) // Stores val into data word number `offset` of _143 unless touch_mem rejects the access.
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // Never taken here because touch_mem always returns true.
+ {
+ return;
+ }
+ _143.Store(offset * 4 + 12, val); // Word index to byte address; data words begin 12 bytes into the buffer.
+}
+
+void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) // Serialises s into 6 consecutive words starting at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = asuint(s.origin.x); // Floats are stored as their raw bit patterns.
+ write_mem(param, param_1, param_2);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = asuint(s.origin.y);
+ write_mem(param_3, param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = asuint(s._vector.x);
+ write_mem(param_6, param_7, param_8);
+ Alloc param_9 = a;
+ uint param_10 = ix + 3u;
+ uint param_11 = asuint(s._vector.y);
+ write_mem(param_9, param_10, param_11);
+ Alloc param_12 = a;
+ uint param_13 = ix + 4u;
+ uint param_14 = asuint(s.y_edge);
+ write_mem(param_12, param_13, param_14);
+ Alloc param_15 = a;
+ uint param_16 = ix + 5u;
+ uint param_17 = s.next.offset; // Last word is the link to the next segment in the tile's list.
+ write_mem(param_15, param_16, param_17);
+}
+
+void comp_main() // Per-invocation body: flattens one cubic path segment into lines and records each line in the tiles it touches.
+{
+ mem_ok = true;
+ uint param = 7u; // Bit mask of stages whose failure makes this pass bail out.
+ bool _694 = check_deps(param);
+ if (!_694)
+ {
+ return;
+ }
+ uint element_ix = gl_GlobalInvocationID.x; // One path segment per invocation.
+ PathSegRef _719 = { _711.Load(32) + (element_ix * 52u) }; // Byte 32 of Config is pathseg_alloc; 52 bytes = 1 tag word + 12 PathCubic words.
+ PathSegRef ref = _719;
+ PathSegTag tag = _722; // Default tag 0 when the index is out of range.
+ if (element_ix < _711.Load(8)) // Byte 8 of Config is n_pathseg.
+ {
+ Alloc _732;
+ _732.offset = _711.Load(32);
+ Alloc param_1;
+ param_1.offset = _732.offset;
+ PathSegRef param_2 = ref;
+ tag = PathSeg_tag(param_1, param_2);
+ }
+ switch (tag.tag) // Only tag 1 has a case; every other value does no work.
+ {
+ case 1u:
+ {
+ Alloc _745;
+ _745.offset = _711.Load(32);
+ Alloc param_3;
+ param_3.offset = _745.offset;
+ PathSegRef param_4 = ref;
+ PathCubic cubic = PathSeg_Cubic_read(param_3, param_4);
+ float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3; // 3*(p2 - p1) + p0 - p3.
+ float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); // Squared length of err_v.
+ uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u); // ceil of the sixth root of the scaled error, at least 1.
+ n_quads = min(n_quads, 16u); // Capped at the capacity of keep_params.
+ float val = 0.0f; // Running total of the per-quadratic subdivision estimates.
+ float2 qp0 = cubic.p0;
+ float _step = 1.0f / float(n_quads); // Parameter width of each quadratic piece.
+ SubdivResult keep_params[16];
+ for (uint i = 0u; i < n_quads; i++) // Pass 1: estimate the subdivision count for each quadratic piece.
+ {
+ float t = float(i + 1u) * _step;
+ float2 param_5 = cubic.p0;
+ float2 param_6 = cubic.p1;
+ float2 param_7 = cubic.p2;
+ float2 param_8 = cubic.p3;
+ float param_9 = t;
+ float2 qp2 = eval_cubic(param_5, param_6, param_7, param_8, param_9); // End of this piece on the cubic.
+ float2 param_10 = cubic.p0;
+ float2 param_11 = cubic.p1;
+ float2 param_12 = cubic.p2;
+ float2 param_13 = cubic.p3;
+ float param_14 = t - (0.5f * _step);
+ float2 qp1 = eval_cubic(param_10, param_11, param_12, param_13, param_14); // Cubic sampled at the middle of this piece.
+ qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f); // Control point that makes the quadratic pass through that midpoint sample at t = 0.5.
+ float2 param_15 = qp0;
+ float2 param_16 = qp1;
+ float2 param_17 = qp2;
+ float param_18 = 0.4743416607379913330078125f; // sqrt_tol argument.
+ SubdivResult params = estimate_subdiv(param_15, param_16, param_17, param_18);
+ keep_params[i] = params; // Saved for reuse in pass 2.
+ val += params.val;
+ qp0 = qp2;
+ }
+ uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u); // Total number of output line segments, at least 1.
+ uint param_19 = tag.flags;
+ bool is_stroke = fill_mode_from_flags(param_19) == 1u;
+ uint path_ix = cubic.path_ix;
+ PathRef _901 = { _711.Load(20) + (path_ix * 12u) }; // Byte 20 of Config is tile_alloc; 12 bytes per Path record.
+ Alloc _904;
+ _904.offset = _711.Load(20);
+ Alloc param_20;
+ param_20.offset = _904.offset;
+ PathRef param_21 = _901;
+ Path path = Path_read(param_20, param_21);
+ uint param_22 = path.tiles.offset;
+ uint param_23 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+ bool param_24 = true;
+ Alloc path_alloc = new_alloc(param_22, param_23, param_24); // Not referenced again in this function.
+ int4 bbox = int4(path.bbox);
+ float2 p0 = cubic.p0; // Start of the next output line.
+ qp0 = cubic.p0;
+ float v_step = val / float(n); // Spacing of output points in estimate units.
+ int n_out = 1; // 1-based index of the next output point.
+ float val_sum = 0.0f; // Estimate total of the quadratics already consumed.
+ float2 p1;
+ float _1143;
+ TileSeg tile_seg;
+ for (uint i_1 = 0u; i_1 < n_quads; i_1++) // Pass 2: rebuild each quadratic and emit the lines that fall inside it.
+ {
+ float t_1 = float(i_1 + 1u) * _step;
+ float2 param_25 = cubic.p0;
+ float2 param_26 = cubic.p1;
+ float2 param_27 = cubic.p2;
+ float2 param_28 = cubic.p3;
+ float param_29 = t_1;
+ float2 qp2_1 = eval_cubic(param_25, param_26, param_27, param_28, param_29);
+ float2 param_30 = cubic.p0;
+ float2 param_31 = cubic.p1;
+ float2 param_32 = cubic.p2;
+ float2 param_33 = cubic.p3;
+ float param_34 = t_1 - (0.5f * _step);
+ float2 qp1_1 = eval_cubic(param_30, param_31, param_32, param_33, param_34);
+ qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f); // Same quadratic construction as in pass 1.
+ SubdivResult params_1 = keep_params[i_1];
+ float param_35 = params_1.a0;
+ float u0 = approx_parabola_inv_integral(param_35);
+ float param_36 = params_1.a2;
+ float u2 = approx_parabola_inv_integral(param_36);
+ float uscale = 1.0f / (u2 - u0); // Maps the range [u0, u2] onto t in [0, 1].
+ float target = float(n_out) * v_step;
+ for (;;) // Emit output points until the next target lies beyond this quadratic.
+ {
+ bool _1036 = uint(n_out) == n;
+ bool _1046; // Expanded short-circuit: (n_out == n) || (target < val_sum + params_1.val).
+ if (!_1036)
+ {
+ _1046 = target < (val_sum + params_1.val);
+ }
+ else
+ {
+ _1046 = _1036;
+ }
+ if (_1046)
+ {
+ if (uint(n_out) == n)
+ {
+ p1 = cubic.p3; // The final point is pinned to the cubic's end point.
+ }
+ else
+ {
+ float u = (target - val_sum) / params_1.val; // Fraction of this quadratic's estimate at the target.
+ float a = lerp(params_1.a0, params_1.a2, u);
+ float param_37 = a;
+ float au = approx_parabola_inv_integral(param_37);
+ float t_2 = (au - u0) * uscale; // Parameter on the quadratic for this output point.
+ float2 param_38 = qp0;
+ float2 param_39 = qp1_1;
+ float2 param_40 = qp2_1;
+ float param_41 = t_2;
+ p1 = eval_quad(param_38, param_39, param_40, param_41);
+ }
+ float xmin = min(p0.x, p1.x) - cubic.stroke.x; // Bounding box of line p0 -> p1, grown by the stroke margin.
+ float xmax = max(p0.x, p1.x) + cubic.stroke.x;
+ float ymin = min(p0.y, p1.y) - cubic.stroke.y;
+ float ymax = max(p0.y, p1.y) + cubic.stroke.y;
+ float dx = p1.x - p0.x;
+ float dy = p1.y - p0.y;
+ if (abs(dy) < 9.999999717180685365747194737196e-10f) // Near-horizontal line: substitute 1e9 instead of dividing by dy.
+ {
+ _1143 = 1000000000.0f;
+ }
+ else
+ {
+ _1143 = dx / dy;
+ }
+ float invslope = _1143;
+ float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f; // Half-width of the column range per row; 0.0625 = 1/16 converts to tile units.
+ float b = invslope; // Change in xc per tile row.
+ float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f; // Intercept such that xc = a_1 + b * row.
+ int x0 = int(floor(xmin * 0.0625f)); // Bounding box in tile coordinates; upper bounds are exclusive.
+ int x1 = int(floor(xmax * 0.0625f) + 1.0f);
+ int y0 = int(floor(ymin * 0.0625f));
+ int y1 = int(floor(ymax * 0.0625f) + 1.0f);
+ x0 = clamp(x0, bbox.x, bbox.z); // Restrict to the path's tile bounding box.
+ y0 = clamp(y0, bbox.y, bbox.w);
+ x1 = clamp(x1, bbox.x, bbox.z);
+ y1 = clamp(y1, bbox.y, bbox.w);
+ float xc = a_1 + (b * float(y0));
+ int stride = bbox.z - bbox.x; // Tiles per row of the path's tile array.
+ int base = ((y0 - bbox.y) * stride) - bbox.x; // base + x is the tile index for column x in the current row.
+ uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); // Worst case: one segment per tile in the clamped box.
+ uint malloc_size = n_tile_alloc * 24u; // 24 bytes = 6 words per TileSeg.
+ uint param_42 = malloc_size;
+ uint param_43 = _711.Load(0); // Byte 0 of Config is mem_size.
+ uint param_44 = 4u; // Stage bit reported on allocation failure.
+ uint _1265 = malloc_stage(param_42, param_43, param_44);
+ uint tile_offset = _1265;
+ if (tile_offset == 0u) // malloc_stage returns 0 on failure.
+ {
+ mem_ok = false;
+ }
+ uint param_45 = tile_offset;
+ uint param_46 = malloc_size;
+ bool param_47 = true;
+ Alloc tile_alloc = new_alloc(param_45, param_46, param_47);
+ int xray = int(floor(p0.x * 0.0625f)); // Tile column of p0.
+ int last_xray = int(floor(p1.x * 0.0625f)); // Tile column of p1.
+ if (p0.y > p1.y) // Swap so xray belongs to the endpoint with the smaller y.
+ {
+ int tmp = xray;
+ xray = last_xray;
+ last_xray = tmp;
+ }
+ for (int y = y0; y < y1; y++) // Walk the tile rows covered by the line.
+ {
+ float tile_y0 = float(y * 16); // Top edge of this tile row; tiles are 16 units tall.
+ int xbackdrop = max((xray + 1), bbox.x);
+ bool _1322 = !is_stroke;
+ bool _1332; // Expanded short-circuit: !is_stroke && min(p0.y, p1.y) < tile_y0.
+ if (_1322)
+ {
+ _1332 = min(p0.y, p1.y) < tile_y0;
+ }
+ else
+ {
+ _1332 = _1322;
+ }
+ bool _1339; // ... && xbackdrop < bbox.z.
+ if (_1332)
+ {
+ _1339 = xbackdrop < bbox.z;
+ }
+ else
+ {
+ _1339 = _1332;
+ }
+ if (_1339)
+ {
+ int backdrop = (p1.y < p0.y) ? 1 : (-1); // +1 when the line runs towards smaller y, otherwise -1.
+ TileRef param_48 = path.tiles;
+ uint param_49 = uint(base + xbackdrop);
+ TileRef tile_ref = Tile_index(param_48, param_49);
+ uint tile_el = tile_ref.offset >> uint(2);
+ uint _1369;
+ _143.InterlockedAdd((tile_el + 1u) * 4 + 12, uint(backdrop), _1369); // Atomically adjust the second word of that tile record.
+ }
+ int next_xray = last_xray;
+ if (y < (y1 - 1)) // Not the last row: find where the line leaves this row.
+ {
+ float tile_y1 = float((y + 1) * 16);
+ float x_edge = lerp(p0.x, p1.x, (tile_y1 - p0.y) / dy); // X where the line crosses the row's bottom edge.
+ next_xray = int(floor(x_edge * 0.0625f));
+ }
+ int min_xray = min(xray, next_xray);
+ int max_xray = max(xray, next_xray);
+ int xx0 = min(int(floor(xc - c)), min_xray); // Column range for this row, widened to include the crossing columns.
+ int xx1 = max(int(ceil(xc + c)), (max_xray + 1));
+ xx0 = clamp(xx0, x0, x1);
+ xx1 = clamp(xx1, x0, x1);
+ for (int x = xx0; x < xx1; x++) // Walk the tile columns in this row.
+ {
+ float tile_x0 = float(x * 16); // Left edge of this tile.
+ TileRef _1449 = { path.tiles.offset };
+ TileRef param_50 = _1449;
+ uint param_51 = uint(base + x);
+ TileRef tile_ref_1 = Tile_index(param_50, param_51);
+ uint tile_el_1 = tile_ref_1.offset >> uint(2);
+ uint old = 0u;
+ uint _1465;
+ _143.InterlockedExchange(tile_el_1 * 4 + 12, tile_offset, _1465); // Atomically install this segment as the tile's list head (first word of the tile record).
+ old = _1465; // Previous head becomes this segment's next link.
+ tile_seg.origin = p0;
+ tile_seg._vector = p1 - p0;
+ float y_edge = 0.0f;
+ if (!is_stroke)
+ {
+ y_edge = lerp(p0.y, p1.y, (tile_x0 - p0.x) / dx); // Y where the line crosses the tile's left edge.
+ if (min(p0.x, p1.x) < tile_x0) // Part of the line lies left of this tile: clip it at the left edge.
+ {
+ float2 p = float2(tile_x0, y_edge);
+ if (p0.x > p1.x)
+ {
+ tile_seg._vector = p - p0; // Keep the origin, end at the edge crossing.
+ }
+ else
+ {
+ tile_seg.origin = p; // Start at the edge crossing.
+ tile_seg._vector = p1 - p;
+ }
+ if (tile_seg._vector.x == 0.0f)
+ {
+ tile_seg._vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10f; // Replace an exactly-zero x component with a tiny signed value.
+ }
+ }
+ if ((x <= min_xray) || (max_xray < x))
+ {
+ y_edge = 1000000000.0f; // Sentinel for tiles outside the columns (min_xray, max_xray].
+ }
+ }
+ tile_seg.y_edge = y_edge;
+ tile_seg.next.offset = old;
+ if (mem_ok) // Skip the write when the allocation failed.
+ {
+ TileSegRef _1550 = { tile_offset };
+ Alloc param_52 = tile_alloc;
+ TileSegRef param_53 = _1550;
+ TileSeg param_54 = tile_seg;
+ TileSeg_write(param_52, param_53, param_54);
+ }
+ tile_offset += 24u; // Advance to the next TileSeg slot.
+ }
+ xc += b;
+ base += stride;
+ xray = next_xray;
+ }
+ n_out++;
+ target += v_step;
+ p0 = p1; // The next line starts where this one ended.
+ continue;
+ }
+ else
+ {
+ break;
+ }
+ }
+ val_sum += params_1.val;
+ qp0 = qp2_1;
+ }
+ break;
+ }
+ }
+}
+
+[numthreads(32, 1, 1)] // 32 threads per group along x, matching gl_WorkGroupSize.
+void main(SPIRV_Cross_Input stage_input) // Compute entry point: publishes the thread id and runs comp_main.
+{
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl
new file mode 100644
index 0000000..1af03a8
--- /dev/null
+++ b/piet-gpu/shader/gen/path_coarse.msl
@@ -0,0 +1,717 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray // Fixed-size array wrapper with unchecked indexing, overloaded per address space.
+{
+ T elements[Num ? Num : 1]; // Always at least one element, even when Num is 0.
+
+ thread T& operator [] (size_t pos) thread // No bounds check on pos in any overload.
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // Only a const overload is provided for the constant address space.
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+ {
+ return elements[pos];
+ }
+};
+
+struct Alloc // Handle describing a region of the shared memory buffer; carries only its start.
+{
+ uint offset; // Start of the region (size is not stored in this variant).
+};
+
+struct PathCubicRef // Typed reference to a PathCubic record in the memory buffer.
+{
+ uint offset; // Byte offset; PathCubic_read shifts it right by 2 to get a word index.
+};
+
+struct PathCubic // Decoded cubic path segment (12 words in memory, see PathCubic_read).
+{
+ float2 p0; // First control point (curve start).
+ float2 p1; // Second control point.
+ float2 p2; // Third control point.
+ float2 p3; // Fourth control point (curve end).
+ uint path_ix; // Index of the owning Path record (multiplied by 12 bytes in main0).
+ uint trans_ix; // Decoded from word 9 by PathCubic_read.
+ float2 stroke; // Per-axis margin added around each flattened line's bounding box.
+};
+
+struct PathSegRef // Typed reference to a tagged path segment record.
+{
+ uint offset; // Byte offset of the tag word; the payload follows 4 bytes later.
+};
+
+struct PathSegTag // Tag word of a path segment split into its two 16-bit halves.
+{
+ uint tag; // Low 16 bits of the tag word; main0 only handles the value 1.
+ uint flags; // High 16 bits of the tag word; bit 0 is read by fill_mode_from_flags.
+};
+
+struct TileRef // Typed reference to a tile record (8 bytes each, see Tile_index).
+{
+ uint offset; // Byte offset of the tile record.
+};
+
+struct PathRef // Typed reference to a Path record (3 words, see Path_read).
+{
+ uint offset; // Byte offset of the Path record.
+};
+
+struct Path // Decoded per-path record: bounding box plus the start of its tile array.
+{
+ uint4 bbox; // Four 16-bit values unpacked by Path_read; x/z bound columns and y/w bound rows in main0.
+ TileRef tiles; // Reference to the first tile record of this path.
+};
+
+struct TileSegRef // Typed reference to a TileSeg record (6 words, see TileSeg_write).
+{
+ uint offset; // Byte offset of the TileSeg record.
+};
+
+struct TileSeg // One line segment recorded against a single tile; serialised by TileSeg_write.
+{
+ float2 origin; // Written as words 0 and 1.
+ float2 vector; // Written as words 2 and 3.
+ float y_edge; // Written as word 4.
+ TileSegRef next; // Written as word 5: link to another TileSeg.
+};
+
+struct SubdivResult // Output of estimate_subdiv for one quadratic.
+{
+ float val; // Subdivision estimate; stays 0 when the scale test in estimate_subdiv fails.
+ float a0; // approx_parabola_integral evaluated at x0.
+ float a2; // approx_parabola_integral evaluated at x2.
+};
+
+struct Memory // Layout of the shared read/write buffer bound at buffer(0).
+{
+ uint mem_offset; // Allocation counter advanced atomically by malloc_stage.
+ uint mem_error; // Bit mask of failed stages: set by malloc_stage, tested by check_deps.
+ uint blend_offset; // Not accessed by the helper functions in this file.
+ uint memory[1]; // Declared with length 1 but indexed by arbitrary word offsets in read_mem/write_mem.
+};
+
+struct Alloc_1 // Same single-field layout as Alloc; used for the Alloc members of Config.
+{
+ uint offset;
+};
+
+struct Config // Layout of the configuration data wrapped by ConfigBuf.
+{
+ uint mem_size; // Passed to malloc_stage as the allocation limit.
+ uint n_elements;
+ uint n_pathseg; // Number of path segments; bounds the invocation index in main0.
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc_1 tile_alloc; // Base of the Path records read in main0.
+ Alloc_1 bin_alloc;
+ Alloc_1 ptcl_alloc;
+ Alloc_1 pathseg_alloc; // Base of the path segment records read in main0.
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc;
+ Alloc_1 drawmonoid_alloc;
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc;
+ Alloc_1 draw_bbox_alloc;
+ Alloc_1 drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Buffer wrapper bound at buffer(1) in main0.
+{
+ Config conf;
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u); // Threadgroup size constant (32 x 1 x 1); marked maybe_unused.
+
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_143) // True when none of the bits in dep_stage are set in v_143.mem_error.
+{
+ uint _149 = atomic_fetch_or_explicit((device atomic_uint*)&v_143.mem_error, 0u, memory_order_relaxed); // OR with 0 leaves the word unchanged; used only to fetch it atomically.
+ return (_149 & dep_stage) == 0u;
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset) // Access guard; this variant ignores both arguments and always allows the access.
+{
+ return true;
+}
+
+static inline __attribute__((always_inline))
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_143) // Returns v_143.memory[offset], or 0 if touch_mem rejects the access.
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // Never taken here because touch_mem always returns true.
+ {
+ return 0u;
+ }
+ uint v = v_143.memory[offset]; // offset is a word index into the data area.
+ return v;
+}
+
+static inline __attribute__((always_inline))
+PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_143) // Reads the tag word at ref and splits it into tag (low 16 bits) and flags (high 16 bits).
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2); // Byte offset to word index.
+ uint tag_and_flags = read_mem(param, param_1, v_143);
+ return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_143) // Reads 12 consecutive words starting at ref and decodes them into a PathCubic.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_143);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_143);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5, v_143);
+ Alloc param_6 = a;
+ uint param_7 = ix + 3u;
+ uint raw3 = read_mem(param_6, param_7, v_143);
+ Alloc param_8 = a;
+ uint param_9 = ix + 4u;
+ uint raw4 = read_mem(param_8, param_9, v_143);
+ Alloc param_10 = a;
+ uint param_11 = ix + 5u;
+ uint raw5 = read_mem(param_10, param_11, v_143);
+ Alloc param_12 = a;
+ uint param_13 = ix + 6u;
+ uint raw6 = read_mem(param_12, param_13, v_143);
+ Alloc param_14 = a;
+ uint param_15 = ix + 7u;
+ uint raw7 = read_mem(param_14, param_15, v_143);
+ Alloc param_16 = a;
+ uint param_17 = ix + 8u;
+ uint raw8 = read_mem(param_16, param_17, v_143);
+ Alloc param_18 = a;
+ uint param_19 = ix + 9u;
+ uint raw9 = read_mem(param_18, param_19, v_143);
+ Alloc param_20 = a;
+ uint param_21 = ix + 10u;
+ uint raw10 = read_mem(param_20, param_21, v_143);
+ Alloc param_22 = a;
+ uint param_23 = ix + 11u;
+ uint raw11 = read_mem(param_22, param_23, v_143);
+ PathCubic s;
+ s.p0 = float2(as_type<float>(raw0), as_type<float>(raw1)); // Words 0-7 are reinterpreted as floats: four (x, y) control points.
+ s.p1 = float2(as_type<float>(raw2), as_type<float>(raw3));
+ s.p2 = float2(as_type<float>(raw4), as_type<float>(raw5));
+ s.p3 = float2(as_type<float>(raw6), as_type<float>(raw7));
+ s.path_ix = raw8; // Words 8 and 9 stay as unsigned integers.
+ s.trans_ix = raw9;
+ s.stroke = float2(as_type<float>(raw10), as_type<float>(raw11)); // Words 10-11 are reinterpreted as floats.
+ return s;
+}
+
+static inline __attribute__((always_inline))
+PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_143) // Reads the PathCubic payload stored right after the segment's tag word.
+{
+ Alloc param = a;
+ PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u }; // Skip the 4-byte tag word.
+ return PathCubic_read(param, param_1, v_143);
+}
+
+static inline __attribute__((always_inline))
+float2 eval_cubic(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float2& p3, thread const float& t) // Evaluates the cubic Bezier with control points p0..p3 at parameter t.
+{
+ float mt = 1.0 - t;
+ return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0)) + (((p2 * (mt * 3.0)) + (p3 * t)) * t)) * t); // Expands to p0*mt^3 + 3*p1*mt^2*t + 3*p2*mt*t^2 + p3*t^3.
+}
+
+static inline __attribute__((always_inline))
+float approx_parabola_integral(thread const float& x) // Closed-form approximation (per its name, of a parabola integral): x * (c0 + c1 + x^2/4)^(-1/4).
+{
+ return x * rsqrt(sqrt(0.3300000131130218505859375 + (0.201511204242706298828125 + ((0.25 * x) * x)))); // rsqrt(sqrt(v)) is v to the power -1/4.
+}
+
+static inline __attribute__((always_inline))
+SubdivResult estimate_subdiv(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& sqrt_tol) // Computes the subdivision estimate val and the integral endpoints a0/a2 for the quadratic p0, p1, p2.
+{
+ float2 d01 = p1 - p0; // First differences of the control points.
+ float2 d12 = p2 - p1;
+ float2 dd = d01 - d12; // Negated second difference.
+ float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); // 2D cross product of the chord (p2 - p0) with dd.
+ float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; // dot(d01, dd) / cross; no guard against cross == 0.
+ float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; // dot(d12, dd) / cross.
+ float scale = abs(_cross / (length(dd) * (x2 - x0)));
+ float param = x0;
+ float a0 = approx_parabola_integral(param);
+ float param_1 = x2;
+ float a2 = approx_parabola_integral(param_1);
+ float val = 0.0; // Stays 0 unless the scale test below passes.
+ if (scale < 1000000000.0) // Also false when scale is NaN, so degenerate inputs keep val == 0.
+ {
+ float da = abs(a2 - a0);
+ float sqrt_scale = sqrt(scale);
+ if (sign(x0) == sign(x2)) // Both endpoints on the same side of x == 0.
+ {
+ val = da * sqrt_scale;
+ }
+ else
+ {
+ float xmin = sqrt_tol / sqrt_scale; // Alternative formula when the range straddles x == 0.
+ float param_2 = xmin;
+ val = (sqrt_tol * da) / approx_parabola_integral(param_2);
+ }
+ }
+ return SubdivResult{ val, a0, a2 };
+}
+
+static inline __attribute__((always_inline))
+uint fill_mode_from_flags(thread const uint& flags) // Extracts bit 0 of flags; main0 treats the value 1 as a stroke.
+{
+ return flags & 1u;
+}
+
+static inline __attribute__((always_inline))
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_143) // Reads 3 consecutive words at ref and decodes them into a Path.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint raw0 = read_mem(param, param_1, v_143);
+ Alloc param_2 = a;
+ uint param_3 = ix + 1u;
+ uint raw1 = read_mem(param_2, param_3, v_143);
+ Alloc param_4 = a;
+ uint param_5 = ix + 2u;
+ uint raw2 = read_mem(param_4, param_5, v_143);
+ Path s;
+ s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); // Each word packs two 16-bit values: low half first, high half second.
+ s.tiles = TileRef{ raw2 }; // Third word is the tile array reference.
+ return s;
+}
+
+static inline __attribute__((always_inline))
+Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) // Builds an Alloc from offset; size and mem_ok are ignored in this variant.
+{
+ Alloc a;
+ a.offset = offset;
+ return a;
+}
+
+static inline __attribute__((always_inline))
+float approx_parabola_inv_integral(thread const float& x) // Closed-form approximation (per its name, the inverse of the parabola integral): x * sqrt(c0 + c1 + x^2/4).
+{
+ return x * sqrt(0.61000001430511474609375 + (0.1520999968051910400390625 + ((0.25 * x) * x)));
+}
+
+static inline __attribute__((always_inline))
+float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& t) // Evaluates the quadratic Bezier with control points p0..p2 at parameter t.
+{
+ float mt = 1.0 - t;
+ return (p0 * (mt * mt)) + (((p1 * (mt * 2.0)) + (p2 * t)) * t); // Expands to p0*mt^2 + 2*p1*mt*t + p2*t^2.
+}
+
+static inline __attribute__((always_inline))
+uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_143) // Bump-allocates size bytes; returns the old counter value, or 0 after flagging `stage` on overflow.
+{
+ uint _158 = atomic_fetch_add_explicit((device atomic_uint*)&v_143.mem_offset, size, memory_order_relaxed); // Atomically advance mem_offset and fetch its previous value.
+ uint offset = _158;
+ if ((offset + size) > mem_size) // Request would run past the limit supplied by the caller.
+ {
+ uint _168 = atomic_fetch_or_explicit((device atomic_uint*)&v_143.mem_error, stage, memory_order_relaxed); // Record the failing stage bits in mem_error (the word check_deps reads).
+ offset = 0u; // 0 is the failure value callers test for.
+ }
+ return offset;
+}
+
+static inline __attribute__((always_inline))
+TileRef Tile_index(thread const TileRef& ref, thread const uint& index) // Returns a reference to the index-th tile after ref, with 8 bytes per tile.
+{
+ return TileRef{ ref.offset + (index * 8u) };
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_143) // Stores val into v_143.memory[offset] unless touch_mem rejects the access.
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // Never taken here because touch_mem always returns true.
+ {
+ return;
+ }
+ v_143.memory[offset] = val; // offset is a word index into the data area.
+}
+
+static inline __attribute__((always_inline))
+void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_143) // Serialises s into 6 consecutive words starting at ref.
+{
+ uint ix = ref.offset >> uint(2); // Byte offset to word index.
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = as_type<uint>(s.origin.x); // Floats are stored as their raw bit patterns.
+ write_mem(param, param_1, param_2, v_143);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = as_type<uint>(s.origin.y);
+ write_mem(param_3, param_4, param_5, v_143);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = as_type<uint>(s.vector.x);
+ write_mem(param_6, param_7, param_8, v_143);
+ Alloc param_9 = a;
+ uint param_10 = ix + 3u;
+ uint param_11 = as_type<uint>(s.vector.y);
+ write_mem(param_9, param_10, param_11, v_143);
+ Alloc param_12 = a;
+ uint param_13 = ix + 4u;
+ uint param_14 = as_type<uint>(s.y_edge);
+ write_mem(param_12, param_13, param_14, v_143);
+ Alloc param_15 = a;
+ uint param_16 = ix + 5u;
+ uint param_17 = s.next.offset; // Last word is the link to the next segment in the tile's list.
+ write_mem(param_15, param_16, param_17, v_143);
+}
+
+kernel void main0(device Memory& v_143 [[buffer(0)]], const device ConfigBuf& _711 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+ bool mem_ok = true;
+ uint param = 7u;
+ bool _694 = check_deps(param, v_143);
+ if (!_694)
+ {
+ return;
+ }
+ uint element_ix = gl_GlobalInvocationID.x;
+ PathSegRef ref = PathSegRef{ _711.conf.pathseg_alloc.offset + (element_ix * 52u) };
+ PathSegTag tag = PathSegTag{ 0u, 0u };
+ if (element_ix < _711.conf.n_pathseg)
+ {
+ Alloc param_1;
+ param_1.offset = _711.conf.pathseg_alloc.offset;
+ PathSegRef param_2 = ref;
+ tag = PathSeg_tag(param_1, param_2, v_143);
+ }
+ switch (tag.tag)
+ {
+ case 1u:
+ {
+ Alloc param_3;
+ param_3.offset = _711.conf.pathseg_alloc.offset;
+ PathSegRef param_4 = ref;
+ PathCubic cubic = PathSeg_Cubic_read(param_3, param_4, v_143);
+ float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3;
+ float err = (err_v.x * err_v.x) + (err_v.y * err_v.y);
+ uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u);
+ n_quads = min(n_quads, 16u);
+ float val = 0.0;
+ float2 qp0 = cubic.p0;
+ float _step = 1.0 / float(n_quads);
+ spvUnsafeArray<SubdivResult, 16> keep_params;
+ for (uint i = 0u; i < n_quads; i++)
+ {
+ float t = float(i + 1u) * _step;
+ float2 param_5 = cubic.p0;
+ float2 param_6 = cubic.p1;
+ float2 param_7 = cubic.p2;
+ float2 param_8 = cubic.p3;
+ float param_9 = t;
+ float2 qp2 = eval_cubic(param_5, param_6, param_7, param_8, param_9);
+ float2 param_10 = cubic.p0;
+ float2 param_11 = cubic.p1;
+ float2 param_12 = cubic.p2;
+ float2 param_13 = cubic.p3;
+ float param_14 = t - (0.5 * _step);
+ float2 qp1 = eval_cubic(param_10, param_11, param_12, param_13, param_14);
+ qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5);
+ float2 param_15 = qp0;
+ float2 param_16 = qp1;
+ float2 param_17 = qp2;
+ float param_18 = 0.4743416607379913330078125;
+ SubdivResult params = estimate_subdiv(param_15, param_16, param_17, param_18);
+ keep_params[i] = params;
+ val += params.val;
+ qp0 = qp2;
+ }
+ uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u);
+ uint param_19 = tag.flags;
+ bool is_stroke = fill_mode_from_flags(param_19) == 1u;
+ uint path_ix = cubic.path_ix;
+ Alloc param_20;
+ param_20.offset = _711.conf.tile_alloc.offset;
+ PathRef param_21 = PathRef{ _711.conf.tile_alloc.offset + (path_ix * 12u) };
+ Path path = Path_read(param_20, param_21, v_143);
+ uint param_22 = path.tiles.offset;
+ uint param_23 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+ bool param_24 = true;
+ Alloc path_alloc = new_alloc(param_22, param_23, param_24);
+ int4 bbox = int4(path.bbox);
+ float2 p0 = cubic.p0;
+ qp0 = cubic.p0;
+ float v_step = val / float(n);
+ int n_out = 1;
+ float val_sum = 0.0;
+ float2 p1;
+ float _1143;
+ TileSeg tile_seg;
+ for (uint i_1 = 0u; i_1 < n_quads; i_1++)
+ {
+ float t_1 = float(i_1 + 1u) * _step;
+ float2 param_25 = cubic.p0;
+ float2 param_26 = cubic.p1;
+ float2 param_27 = cubic.p2;
+ float2 param_28 = cubic.p3;
+ float param_29 = t_1;
+ float2 qp2_1 = eval_cubic(param_25, param_26, param_27, param_28, param_29);
+ float2 param_30 = cubic.p0;
+ float2 param_31 = cubic.p1;
+ float2 param_32 = cubic.p2;
+ float2 param_33 = cubic.p3;
+ float param_34 = t_1 - (0.5 * _step);
+ float2 qp1_1 = eval_cubic(param_30, param_31, param_32, param_33, param_34);
+ qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5);
+ SubdivResult params_1 = keep_params[i_1];
+ float param_35 = params_1.a0;
+ float u0 = approx_parabola_inv_integral(param_35);
+ float param_36 = params_1.a2;
+ float u2 = approx_parabola_inv_integral(param_36);
+ float uscale = 1.0 / (u2 - u0);
+ float target = float(n_out) * v_step;
+ for (;;)
+ {
+ bool _1036 = uint(n_out) == n;
+ bool _1046;
+ if (!_1036)
+ {
+ _1046 = target < (val_sum + params_1.val);
+ }
+ else
+ {
+ _1046 = _1036;
+ }
+ if (_1046)
+ {
+ if (uint(n_out) == n)
+ {
+ p1 = cubic.p3;
+ }
+ else
+ {
+ float u = (target - val_sum) / params_1.val;
+ float a = mix(params_1.a0, params_1.a2, u);
+ float param_37 = a;
+ float au = approx_parabola_inv_integral(param_37);
+ float t_2 = (au - u0) * uscale;
+ float2 param_38 = qp0;
+ float2 param_39 = qp1_1;
+ float2 param_40 = qp2_1;
+ float param_41 = t_2;
+ p1 = eval_quad(param_38, param_39, param_40, param_41);
+ }
+ float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x;
+ float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x;
+ float ymin = fast::min(p0.y, p1.y) - cubic.stroke.y;
+ float ymax = fast::max(p0.y, p1.y) + cubic.stroke.y;
+ float dx = p1.x - p0.x;
+ float dy = p1.y - p0.y;
+ if (abs(dy) < 9.999999717180685365747194737196e-10)
+ {
+ _1143 = 1000000000.0;
+ }
+ else
+ {
+ _1143 = dx / dy;
+ }
+ float invslope = _1143;
+ float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625;
+ float b = invslope;
+ float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625;
+ int x0 = int(floor(xmin * 0.0625));
+ int x1 = int(floor(xmax * 0.0625) + 1.0);
+ int y0 = int(floor(ymin * 0.0625));
+ int y1 = int(floor(ymax * 0.0625) + 1.0);
+ x0 = clamp(x0, bbox.x, bbox.z);
+ y0 = clamp(y0, bbox.y, bbox.w);
+ x1 = clamp(x1, bbox.x, bbox.z);
+ y1 = clamp(y1, bbox.y, bbox.w);
+ float xc = a_1 + (b * float(y0));
+ int stride = bbox.z - bbox.x;
+ int base = ((y0 - bbox.y) * stride) - bbox.x;
+ uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
+ uint malloc_size = n_tile_alloc * 24u;
+ uint param_42 = malloc_size;
+ uint param_43 = _711.conf.mem_size;
+ uint param_44 = 4u;
+ uint _1265 = malloc_stage(param_42, param_43, param_44, v_143);
+ uint tile_offset = _1265;
+ if (tile_offset == 0u)
+ {
+ mem_ok = false;
+ }
+ uint param_45 = tile_offset;
+ uint param_46 = malloc_size;
+ bool param_47 = true;
+ Alloc tile_alloc = new_alloc(param_45, param_46, param_47);
+ int xray = int(floor(p0.x * 0.0625));
+ int last_xray = int(floor(p1.x * 0.0625));
+ if (p0.y > p1.y)
+ {
+ int tmp = xray;
+ xray = last_xray;
+ last_xray = tmp;
+ }
+ for (int y = y0; y < y1; y++)
+ {
+ float tile_y0 = float(y * 16);
+ int xbackdrop = max((xray + 1), bbox.x);
+ bool _1322 = !is_stroke;
+ bool _1332;
+ if (_1322)
+ {
+ _1332 = fast::min(p0.y, p1.y) < tile_y0;
+ }
+ else
+ {
+ _1332 = _1322;
+ }
+ bool _1339;
+ if (_1332)
+ {
+ _1339 = xbackdrop < bbox.z;
+ }
+ else
+ {
+ _1339 = _1332;
+ }
+ if (_1339)
+ {
+ int backdrop = (p1.y < p0.y) ? 1 : (-1);
+ TileRef param_48 = path.tiles;
+ uint param_49 = uint(base + xbackdrop);
+ TileRef tile_ref = Tile_index(param_48, param_49);
+ uint tile_el = tile_ref.offset >> uint(2);
+ uint _1369 = atomic_fetch_add_explicit((device atomic_uint*)&v_143.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed);
+ }
+ int next_xray = last_xray;
+ if (y < (y1 - 1))
+ {
+ float tile_y1 = float((y + 1) * 16);
+ float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
+ next_xray = int(floor(x_edge * 0.0625));
+ }
+ int min_xray = min(xray, next_xray);
+ int max_xray = max(xray, next_xray);
+ int xx0 = min(int(floor(xc - c)), min_xray);
+ int xx1 = max(int(ceil(xc + c)), (max_xray + 1));
+ xx0 = clamp(xx0, x0, x1);
+ xx1 = clamp(xx1, x0, x1);
+ for (int x = xx0; x < xx1; x++)
+ {
+ float tile_x0 = float(x * 16);
+ TileRef param_50 = TileRef{ path.tiles.offset };
+ uint param_51 = uint(base + x);
+ TileRef tile_ref_1 = Tile_index(param_50, param_51);
+ uint tile_el_1 = tile_ref_1.offset >> uint(2);
+ uint old = 0u;
+ uint _1465 = atomic_exchange_explicit((device atomic_uint*)&v_143.memory[tile_el_1], tile_offset, memory_order_relaxed);
+ old = _1465;
+ tile_seg.origin = p0;
+ tile_seg.vector = p1 - p0;
+ float y_edge = 0.0;
+ if (!is_stroke)
+ {
+ y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
+ if (fast::min(p0.x, p1.x) < tile_x0)
+ {
+ float2 p = float2(tile_x0, y_edge);
+ if (p0.x > p1.x)
+ {
+ tile_seg.vector = p - p0;
+ }
+ else
+ {
+ tile_seg.origin = p;
+ tile_seg.vector = p1 - p;
+ }
+ if (tile_seg.vector.x == 0.0)
+ {
+ tile_seg.vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10;
+ }
+ }
+ if ((x <= min_xray) || (max_xray < x))
+ {
+ y_edge = 1000000000.0;
+ }
+ }
+ tile_seg.y_edge = y_edge;
+ tile_seg.next.offset = old;
+ if (mem_ok)
+ {
+ Alloc param_52 = tile_alloc;
+ TileSegRef param_53 = TileSegRef{ tile_offset };
+ TileSeg param_54 = tile_seg;
+ TileSeg_write(param_52, param_53, param_54, v_143);
+ }
+ tile_offset += 24u;
+ }
+ xc += b;
+ base += stride;
+ xray = next_xray;
+ }
+ n_out++;
+ target += v_step;
+ p0 = p1;
+ continue;
+ }
+ else
+ {
+ break;
+ }
+ }
+ val_sum += params_1.val;
+ qp0 = qp2_1;
+ }
+ break;
+ }
+ }
+}
+
diff --git a/piet-gpu/shader/gen/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv
new file mode 100644
index 0000000..a7e0ef3
--- /dev/null
+++ b/piet-gpu/shader/gen/path_coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
new file mode 100644
index 0000000..11c96a6
--- /dev/null
+++ b/piet-gpu/shader/gen/pathseg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl
new file mode 100644
index 0000000..89872f5
--- /dev/null
+++ b/piet-gpu/shader/gen/pathseg.hlsl
@@ -0,0 +1,633 @@
+struct Alloc // Allocation handle; in this shader it is only passed along (touch_mem ignores it).
+{
+ uint offset; // comp_main fills this from the config buffer via _605.Load(32)
+};
+
+struct TagMonoid // Running counts derived from tag bytes; combine_tag_monoid adds them field by field.
+{
+ uint trans_ix; // count of tag bytes with bit 5 (0x20) set
+ uint linewidth_ix; // count of tag bytes with bit 6 (0x40) set
+ uint pathseg_ix; // count of tag bytes whose low 2 bits (segment type) are nonzero
+ uint path_ix; // count of tag bytes with bit 4 (0x10) set
+ uint pathseg_offset; // summed per-byte point-data word counts (see reduce_tag)
+};
+
+struct PathCubicRef // Reference to a PathCubic record in the read-write buffer.
+{
+ uint offset; // PathCubic_write shifts this right by 2 to obtain a word index
+};
+
+struct PathCubic // Record written by PathCubic_write as 12 consecutive words.
+{
+ float2 p0; // words 0-1
+ float2 p1; // words 2-3
+ float2 p2; // words 4-5
+ float2 p3; // words 6-7
+ uint path_ix; // word 8
+ uint trans_ix; // word 9
+ float2 stroke; // words 10-11
+};
+
+struct PathSegRef // Reference to a path segment record; comp_main advances it by 52 per written segment.
+{
+ uint offset; // PathSeg_Cubic_write shifts this right by 2 to obtain a word index
+};
+
+struct TransformRef // Reference to a Transform in the scene buffer; comp_main steps it by 24.
+{
+ uint offset; // Transform_read shifts this right by 2 to obtain a word index
+};
+
+struct Transform // Six floats read by Transform_read.
+{
+ float4 mat; // comp_main applies it to a point p as mat.xy * p.x + mat.zw * p.y
+ float2 translate; // added after the mat terms
+};
+
+struct Monoid // Bounding-box accumulator merged by combine_monoid.
+{
+ float4 bbox; // xy = minimum corner, zw = maximum corner (built from min/max of the points in comp_main)
+ uint flags; // bit 0: taken from tag bit 4 in comp_main; bit 1: set by combine_monoid when the left operand had bit 0
+};
+
+struct Config // Layout of the config buffer (_605); every member is one 32-bit word, so field N sits at byte N * 4.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc; // byte 32: read in comp_main with _605.Load(32)
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc; // byte 40: read in comp_main with _605.Load(40)
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset; // byte 84: read in comp_main with _605.Load(84)
+ uint linewidth_offset; // byte 88: read in comp_main with _605.Load(88)
+ uint pathtag_offset; // byte 92: read in comp_main with _605.Load(92)
+ uint pathseg_offset; // byte 96: read in comp_main with _605.Load(96)
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same size as [numthreads(256, 1, 1)] on main
+
+static const TagMonoid _113 = { 0u, 0u, 0u, 0u, 0u }; // all-zero value returned by tag_monoid_identity()
+static const Monoid _537 = { 0.0f.xxxx, 0u }; // zero bbox and zero flags, returned by monoid_identity()
+
+RWByteAddressBuffer _105 : register(u0, space0); // read-write buffer: receives path segments and path bounding boxes
+ByteAddressBuffer _382 : register(t2, space0); // read-only scene data: tag words, points, transforms, line widths
+ByteAddressBuffer _605 : register(t1, space0); // read-only config, loaded at the byte offsets of Config's fields
+ByteAddressBuffer _676 : register(t3, space0); // read-only TagMonoid per preceding workgroup (20 bytes each)
+
+static uint3 gl_WorkGroupID; // set by main() from the stage input before comp_main runs
+static uint3 gl_LocalInvocationID; // set by main() from the stage input before comp_main runs
+static uint3 gl_GlobalInvocationID; // set by main() from the stage input before comp_main runs
+struct SPIRV_Cross_Input // Compute-stage system values received by main().
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared TagMonoid sh_tag[256]; // one slot per thread of the 256-wide group; used for the TagMonoid scan in comp_main
+groupshared Monoid sh_scratch[256]; // one slot per thread; used for the bounding-box Monoid scan in comp_main
+
+TagMonoid reduce_tag(uint tag_word) // Summarises one tag word (four tag bytes) into a TagMonoid.
+{
+ uint point_count = tag_word & 50529027u; // 0x03030303: low 2 bits (segment type) of each byte
+ TagMonoid c;
+ c.pathseg_ix = uint(int(countbits((point_count * 7u) & 67372036u))); // x * 7 has bit 2 set exactly for x = 1..3; mask 0x04040404 counts those bytes
+ c.linewidth_ix = uint(int(countbits(tag_word & 1077952576u))); // 0x40404040: bit 6 of each byte
+ c.path_ix = uint(int(countbits(tag_word & 269488144u))); // 0x10101010: bit 4 of each byte
+ c.trans_ix = uint(int(countbits(tag_word & 538976288u))); // 0x20202020: bit 5 of each byte
+ uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); // per byte: segment type plus bit 2
+ uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); // per byte: doubled when bit 3 is set
+ a += (a >> uint(8));
+ a += (a >> uint(16)); // the low byte now holds the sum of the four per-byte counts
+ c.pathseg_offset = a & 255u;
+ return c;
+}
+
+TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) // Returns the field-wise sum of a and b.
+{
+ TagMonoid c;
+ c.trans_ix = a.trans_ix + b.trans_ix;
+ c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
+ c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
+ return c;
+}
+
+TagMonoid tag_monoid_identity() // Returns the all-zero TagMonoid, the neutral element for combine_tag_monoid.
+{
+ return _113;
+}
+
+float2 read_f32_point(uint ix) // Reads a point stored as two consecutive float words at scene word index ix.
+{
+ float x = asfloat(_382.Load(ix * 4 + 0)); // word index -> byte address
+ float y = asfloat(_382.Load((ix + 1u) * 4 + 0));
+ return float2(x, y);
+}
+
+float2 read_i16_point(uint ix) // Reads a point packed as two signed 16-bit values in one scene word.
+{
+ uint raw = _382.Load(ix * 4 + 0);
+ float x = float(int(raw << uint(16)) >> 16); // low half: shift up, then signed shift down to sign-extend
+ float y = float(int(raw) >> 16); // high half: signed shift keeps the sign
+ return float2(x, y);
+}
+
+Transform Transform_read(TransformRef ref) // Loads six consecutive float words from the scene buffer into a Transform.
+{
+ uint ix = ref.offset >> uint(2); // convert the reference offset to a word index
+ uint raw0 = _382.Load((ix + 0u) * 4 + 0);
+ uint raw1 = _382.Load((ix + 1u) * 4 + 0);
+ uint raw2 = _382.Load((ix + 2u) * 4 + 0);
+ uint raw3 = _382.Load((ix + 3u) * 4 + 0);
+ uint raw4 = _382.Load((ix + 4u) * 4 + 0);
+ uint raw5 = _382.Load((ix + 5u) * 4 + 0);
+ Transform s;
+ s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); // words 0-3
+ s.translate = float2(asfloat(raw4), asfloat(raw5)); // words 4-5
+ return s;
+}
+
+bool touch_mem(Alloc alloc, uint offset) // Access check hook; in this generated variant it accepts every access.
+{
+ return true; // neither argument is inspected
+}
+
+void write_mem(Alloc alloc, uint offset, uint val) // Stores val at word index `offset` of the read-write buffer.
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken here: touch_mem always returns true
+ {
+ return;
+ }
+ _105.Store(offset * 4 + 12, val); // word index -> byte address; +12 skips three leading words (cf. struct Memory in pathseg.msl)
+}
+
+void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) // Writes the 12 words of s starting at ref via write_mem.
+{
+ uint ix = ref.offset >> uint(2); // convert the reference offset to a word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = asuint(s.p0.x); // floats are stored as their raw bit patterns
+ write_mem(param, param_1, param_2);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = asuint(s.p0.y);
+ write_mem(param_3, param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = asuint(s.p1.x);
+ write_mem(param_6, param_7, param_8);
+ Alloc param_9 = a;
+ uint param_10 = ix + 3u;
+ uint param_11 = asuint(s.p1.y);
+ write_mem(param_9, param_10, param_11);
+ Alloc param_12 = a;
+ uint param_13 = ix + 4u;
+ uint param_14 = asuint(s.p2.x);
+ write_mem(param_12, param_13, param_14);
+ Alloc param_15 = a;
+ uint param_16 = ix + 5u;
+ uint param_17 = asuint(s.p2.y);
+ write_mem(param_15, param_16, param_17);
+ Alloc param_18 = a;
+ uint param_19 = ix + 6u;
+ uint param_20 = asuint(s.p3.x);
+ write_mem(param_18, param_19, param_20);
+ Alloc param_21 = a;
+ uint param_22 = ix + 7u;
+ uint param_23 = asuint(s.p3.y);
+ write_mem(param_21, param_22, param_23);
+ Alloc param_24 = a;
+ uint param_25 = ix + 8u;
+ uint param_26 = s.path_ix; // integer fields are stored unchanged
+ write_mem(param_24, param_25, param_26);
+ Alloc param_27 = a;
+ uint param_28 = ix + 9u;
+ uint param_29 = s.trans_ix;
+ write_mem(param_27, param_28, param_29);
+ Alloc param_30 = a;
+ uint param_31 = ix + 10u;
+ uint param_32 = asuint(s.stroke.x);
+ write_mem(param_30, param_31, param_32);
+ Alloc param_33 = a;
+ uint param_34 = ix + 11u;
+ uint param_35 = asuint(s.stroke.y);
+ write_mem(param_33, param_34, param_35);
+}
+
+void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) // Writes a header word followed by the PathCubic payload.
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2); // header goes to the word at ref
+ uint param_2 = (flags << uint(16)) | 1u; // header: flags in the upper 16 bits, constant 1 in the low bits
+ write_mem(param, param_1, param_2);
+ PathCubicRef _367 = { ref.offset + 4u }; // payload starts one word (4 bytes) after the header
+ Alloc param_3 = a;
+ PathCubicRef param_4 = _367;
+ PathCubic param_5 = s;
+ PathCubic_write(param_3, param_4, param_5);
+}
+
+Monoid combine_monoid(Monoid a, Monoid b) // Merges two bbox monoids, a on the left and b on the right; order matters.
+{
+ Monoid c;
+ c.bbox = b.bbox; // default result: b's box
+ bool _442 = (a.flags & 1u) == 0u;
+ bool _450;
+ if (_442) // the if/else chains below are the expanded form of short-circuit && / ||
+ {
+ _450 = b.bbox.z <= b.bbox.x;
+ }
+ else
+ {
+ _450 = _442;
+ }
+ bool _458;
+ if (_450)
+ {
+ _458 = b.bbox.w <= b.bbox.y;
+ }
+ else
+ {
+ _458 = _450;
+ }
+ if (_458) // a's bit 0 clear AND b has neither positive width nor positive height
+ {
+ c.bbox = a.bbox; // b contributes nothing: keep a's box
+ }
+ else
+ {
+ bool _468 = (a.flags & 1u) == 0u;
+ bool _475;
+ if (_468)
+ {
+ _475 = (b.flags & 2u) == 0u;
+ }
+ else
+ {
+ _475 = _468;
+ }
+ bool _492;
+ if (_475)
+ {
+ bool _482 = a.bbox.z > a.bbox.x;
+ bool _491;
+ if (!_482)
+ {
+ _491 = a.bbox.w > a.bbox.y;
+ }
+ else
+ {
+ _491 = _482;
+ }
+ _492 = _491;
+ }
+ else
+ {
+ _492 = _475;
+ }
+ if (_492) // a's bit 0 clear AND b's bit 1 clear AND a has positive width or height
+ {
+ float4 _499 = c.bbox;
+ float2 _501 = min(a.bbox.xy, _499.xy); // union: component-wise min of the minimum corners
+ c.bbox.x = _501.x;
+ c.bbox.y = _501.y;
+ float4 _510 = c.bbox;
+ float2 _512 = max(a.bbox.zw, _510.zw); // union: component-wise max of the maximum corners
+ c.bbox.z = _512.x;
+ c.bbox.w = _512.y;
+ }
+ }
+ c.flags = (a.flags & 2u) | b.flags; // keep a's bit 1 and all of b's bits
+ c.flags |= ((a.flags & 1u) << uint(1)); // a's bit 0 is promoted into bit 1 of the result
+ return c;
+}
+
+Monoid monoid_identity() // Returns the Monoid with a zero bbox and no flags set.
+{
+ return _537;
+}
+
+uint round_down(float x) // Floors x, adds a 32768 bias and clamps the result to be non-negative before converting to uint.
+{
+ return uint(max(0.0f, floor(x) + 32768.0f));
+}
+
+uint round_up(float x) // Ceils x, adds a 32768 bias and clamps the result to at most 65535 before converting to uint.
+{
+ return uint(min(65535.0f, ceil(x) + 32768.0f));
+}
+
+void comp_main() // Per thread: decodes one tag word (4 tag bytes), writes its segments as cubics and emits path bounding boxes.
+{
+ uint ix = gl_GlobalInvocationID.x * 4u; // index of this thread's first tag byte
+ uint tag_word = _382.Load(((_605.Load(92) >> uint(2)) + (ix >> uint(2))) * 4 + 0); // one 32-bit tag word per thread, located via the config word at byte 92 (pathtag_offset)
+ uint param = tag_word;
+ TagMonoid local_tm = reduce_tag(param);
+ sh_tag[gl_LocalInvocationID.x] = local_tm;
+ for (uint i = 0u; i < 8u; i++) // 8 doubling steps (strides 1..128) accumulate over the 256 threads
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if (gl_LocalInvocationID.x >= (1u << i))
+ {
+ TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)];
+ TagMonoid param_1 = other;
+ TagMonoid param_2 = local_tm;
+ local_tm = combine_tag_monoid(param_1, param_2);
+ }
+ GroupMemoryBarrierWithGroupSync(); // all reads of this step finish before any slot is overwritten
+ sh_tag[gl_LocalInvocationID.x] = local_tm;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ TagMonoid tm = tag_monoid_identity();
+ if (gl_WorkGroupID.x > 0u) // start from the entry for the previous workgroup in _676 (20-byte records)
+ {
+ TagMonoid _682;
+ _682.trans_ix = _676.Load((gl_WorkGroupID.x - 1u) * 20 + 0);
+ _682.linewidth_ix = _676.Load((gl_WorkGroupID.x - 1u) * 20 + 4);
+ _682.pathseg_ix = _676.Load((gl_WorkGroupID.x - 1u) * 20 + 8);
+ _682.path_ix = _676.Load((gl_WorkGroupID.x - 1u) * 20 + 12);
+ _682.pathseg_offset = _676.Load((gl_WorkGroupID.x - 1u) * 20 + 16);
+ tm.trans_ix = _682.trans_ix;
+ tm.linewidth_ix = _682.linewidth_ix;
+ tm.pathseg_ix = _682.pathseg_ix;
+ tm.path_ix = _682.path_ix;
+ tm.pathseg_offset = _682.pathseg_offset;
+ }
+ if (gl_LocalInvocationID.x > 0u) // add the scanned value of the previous thread, so tm excludes this thread's own tags
+ {
+ TagMonoid param_3 = tm;
+ TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u];
+ tm = combine_tag_monoid(param_3, param_4);
+ }
+ uint ps_ix = (_605.Load(96) >> uint(2)) + tm.pathseg_offset; // scene word index of this thread's first point data
+ uint lw_ix = (_605.Load(88) >> uint(2)) + tm.linewidth_ix; // scene word index of the current line width
+ uint save_path_ix = tm.path_ix; // kept because tm.path_ix is advanced inside the loop below
+ uint trans_ix = tm.trans_ix;
+ TransformRef _737 = { _605.Load(84) + (trans_ix * 24u) }; // 24 = size in bytes of the six-float Transform
+ TransformRef trans_ref = _737;
+ PathSegRef _746 = { _605.Load(32) + (tm.pathseg_ix * 52u) }; // 52 = header word + 12 payload words, in bytes
+ PathSegRef ps_ref = _746;
+ float linewidth[4];
+ uint save_trans_ix[4];
+ float2 p0;
+ float2 p1;
+ float2 p2;
+ float2 p3;
+ Monoid local[4];
+ PathCubic cubic;
+ Alloc param_14;
+ for (uint i_1 = 0u; i_1 < 4u; i_1++) // one iteration per tag byte of tag_word
+ {
+ linewidth[i_1] = asfloat(_382.Load(lw_ix * 4 + 0));
+ save_trans_ix[i_1] = trans_ix;
+ uint tag_byte = tag_word >> (i_1 * 8u); // only the low 8 bits are examined below
+ uint seg_type = tag_byte & 3u; // 0 = no segment; 1, 2, 3 select how many points are read
+ if (seg_type != 0u)
+ {
+ if ((tag_byte & 8u) != 0u) // bit 3 set: points are float pairs, two words each
+ {
+ uint param_5 = ps_ix;
+ p0 = read_f32_point(param_5);
+ uint param_6 = ps_ix + 2u;
+ p1 = read_f32_point(param_6);
+ if (seg_type >= 2u)
+ {
+ uint param_7 = ps_ix + 4u;
+ p2 = read_f32_point(param_7);
+ if (seg_type == 3u)
+ {
+ uint param_8 = ps_ix + 6u;
+ p3 = read_f32_point(param_8);
+ }
+ }
+ }
+ else // bit 3 clear: points are packed 16-bit pairs, one word each
+ {
+ uint param_9 = ps_ix;
+ p0 = read_i16_point(param_9);
+ uint param_10 = ps_ix + 1u;
+ p1 = read_i16_point(param_10);
+ if (seg_type >= 2u)
+ {
+ uint param_11 = ps_ix + 2u;
+ p2 = read_i16_point(param_11);
+ if (seg_type == 3u)
+ {
+ uint param_12 = ps_ix + 3u;
+ p3 = read_i16_point(param_12);
+ }
+ }
+ }
+ TransformRef param_13 = trans_ref;
+ Transform transform = Transform_read(param_13);
+ p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate;
+ p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate;
+ float4 bbox = float4(min(p0, p1), max(p0, p1)); // bbox of the transformed points, grown below
+ if (seg_type >= 2u)
+ {
+ p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
+ float4 _906 = bbox;
+ float2 _909 = min(_906.xy, p2);
+ bbox.x = _909.x;
+ bbox.y = _909.y;
+ float4 _914 = bbox;
+ float2 _917 = max(_914.zw, p2);
+ bbox.z = _917.x;
+ bbox.w = _917.y;
+ if (seg_type == 3u)
+ {
+ p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
+ float4 _942 = bbox;
+ float2 _945 = min(_942.xy, p3);
+ bbox.x = _945.x;
+ bbox.y = _945.y;
+ float4 _950 = bbox;
+ float2 _953 = max(_950.zw, p3);
+ bbox.z = _953.x;
+ bbox.w = _953.y;
+ }
+ else // three points: re-express as four control points
+ {
+ p3 = p2;
+ p2 = lerp(p1, p2, 0.3333333432674407958984375f.xx); // one third of the way from the middle point to the end
+ p1 = lerp(p1, p0, 0.3333333432674407958984375f.xx); // one third of the way from the middle point to the start
+ }
+ }
+ else // two points: re-express as four control points on the same line
+ {
+ p3 = p1;
+ p2 = lerp(p3, p0, 0.3333333432674407958984375f.xx);
+ p1 = lerp(p0, p3, 0.3333333432674407958984375f.xx);
+ }
+ float2 stroke = 0.0f.xx;
+ if (linewidth[i_1] >= 0.0f) // a negative line width leaves stroke at zero and the bbox unexpanded
+ {
+ stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5f * linewidth[i_1]);
+ bbox += float4(-stroke, stroke); // grow the box by the stroke extent on every side
+ }
+ local[i_1].bbox = bbox;
+ local[i_1].flags = 0u;
+ cubic.p0 = p0;
+ cubic.p1 = p1;
+ cubic.p2 = p2;
+ cubic.p3 = p3;
+ cubic.path_ix = tm.path_ix;
+ cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; // note: stores the global tag-byte index, not the local trans_ix variable
+ cubic.stroke = stroke;
+ uint fill_mode = uint(linewidth[i_1] >= 0.0f); // 1 when the line width is non-negative, otherwise 0
+ Alloc _1049;
+ _1049.offset = _605.Load(32);
+ param_14.offset = _1049.offset;
+ PathSegRef param_15 = ps_ref;
+ uint param_16 = fill_mode;
+ PathCubic param_17 = cubic;
+ PathSeg_Cubic_write(param_14, param_15, param_16, param_17);
+ ps_ref.offset += 52u; // advance to the next 13-word segment record
+ uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); // same per-byte formula as reduce_tag
+ uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); // doubled when bit 3 (float points) is set
+ ps_ix += n_words;
+ }
+ else // no segment in this tag byte: only counters and flags are updated
+ {
+ local[i_1].bbox = 0.0f.xxxx;
+ uint is_path = (tag_byte >> uint(4)) & 1u;
+ local[i_1].flags = is_path;
+ tm.path_ix += is_path;
+ trans_ix += ((tag_byte >> uint(5)) & 1u);
+ trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u);
+ lw_ix += ((tag_byte >> uint(6)) & 1u);
+ }
+ }
+ Monoid agg = local[0];
+ for (uint i_2 = 1u; i_2 < 4u; i_2++) // running combination over this thread's four entries
+ {
+ Monoid param_18 = agg;
+ Monoid param_19 = local[i_2];
+ agg = combine_monoid(param_18, param_19);
+ local[i_2] = agg; // local[] now holds the running results
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_3 = 0u; i_3 < 8u; i_3++) // same doubling-step pattern as the sh_tag loop above
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if (gl_LocalInvocationID.x >= (1u << i_3))
+ {
+ Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)];
+ Monoid param_20 = other_1;
+ Monoid param_21 = agg;
+ agg = combine_monoid(param_20, param_21);
+ }
+ GroupMemoryBarrierWithGroupSync();
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ uint path_ix = save_path_ix;
+ uint bbox_out_ix = (_605.Load(40) >> uint(2)) + (path_ix * 6u); // six words per path in the bbox output area
+ Monoid row = monoid_identity();
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ row = sh_scratch[gl_LocalInvocationID.x - 1u]; // scanned result of the previous thread
+ }
+ for (uint i_4 = 0u; i_4 < 4u; i_4++)
+ {
+ Monoid param_22 = row;
+ Monoid param_23 = local[i_4];
+ Monoid m = combine_monoid(param_22, param_23);
+ bool do_atomic = false;
+ bool _1224 = i_4 == 3u;
+ bool _1230;
+ if (_1224)
+ {
+ _1230 = gl_LocalInvocationID.x == 255u;
+ }
+ else
+ {
+ _1230 = _1224;
+ }
+ if (_1230) // last entry of the last thread in the workgroup
+ {
+ do_atomic = true;
+ }
+ if ((m.flags & 1u) != 0u) // this entry carries the path bit
+ {
+ _105.Store((bbox_out_ix + 4u) * 4 + 12, asuint(linewidth[i_4])); // word 4 of the path record: line width
+ _105.Store((bbox_out_ix + 5u) * 4 + 12, save_trans_ix[i_4]); // word 5 of the path record: transform index
+ if ((m.flags & 2u) == 0u) // NOTE(review): presumably the path began before this workgroup's scanned range; confirm
+ {
+ do_atomic = true;
+ }
+ else
+ {
+ float param_24 = m.bbox.x;
+ _105.Store(bbox_out_ix * 4 + 12, round_down(param_24)); // plain stores of the four biased bbox words
+ float param_25 = m.bbox.y;
+ _105.Store((bbox_out_ix + 1u) * 4 + 12, round_down(param_25));
+ float param_26 = m.bbox.z;
+ _105.Store((bbox_out_ix + 2u) * 4 + 12, round_up(param_26));
+ float param_27 = m.bbox.w;
+ _105.Store((bbox_out_ix + 3u) * 4 + 12, round_up(param_27));
+ bbox_out_ix += 6u;
+ do_atomic = false; // overrides the last-entry case set above
+ }
+ }
+ if (do_atomic)
+ {
+ bool _1295 = m.bbox.z > m.bbox.x;
+ bool _1304;
+ if (!_1295)
+ {
+ _1304 = m.bbox.w > m.bbox.y;
+ }
+ else
+ {
+ _1304 = _1295;
+ }
+ if (_1304) // merge only when the box has positive width or height
+ {
+ float param_28 = m.bbox.x;
+ uint _1313;
+ _105.InterlockedMin(bbox_out_ix * 4 + 12, round_down(param_28), _1313); // atomic min on the minimum corner
+ float param_29 = m.bbox.y;
+ uint _1321;
+ _105.InterlockedMin((bbox_out_ix + 1u) * 4 + 12, round_down(param_29), _1321);
+ float param_30 = m.bbox.z;
+ uint _1329;
+ _105.InterlockedMax((bbox_out_ix + 2u) * 4 + 12, round_up(param_30), _1329); // atomic max on the maximum corner
+ float param_31 = m.bbox.w;
+ uint _1337;
+ _105.InterlockedMax((bbox_out_ix + 3u) * 4 + 12, round_up(param_31), _1337);
+ }
+ bbox_out_ix += 6u;
+ }
+ }
+}
+
+[numthreads(256, 1, 1)] // 256 threads per group, matching the groupshared array sizes
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the stage inputs into the file-level statics, then runs comp_main.
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl
new file mode 100644
index 0000000..9687453
--- /dev/null
+++ b/piet-gpu/shader/gen/pathseg.msl
@@ -0,0 +1,691 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+template<typename T, size_t Num> // Fixed-size array wrapper with unchecked operator[] overloads per address space.
+struct spvUnsafeArray
+{
+ T elements[Num ? Num : 1]; // at least one element is declared even when Num is 0
+
+ thread T& operator [] (size_t pos) thread // no bounds check on pos in any overload
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // constant address space: read-only overload only
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+ {
+ return elements[pos];
+ }
+};
+
+struct Alloc // Allocation handle passed to the write helpers; touch_mem ignores it.
+{
+ uint offset;
+};
+
+struct TagMonoid // Running counts derived from tag bytes; combine_tag_monoid adds them field by field.
+{
+ uint trans_ix; // count of tag bytes with bit 5 (0x20) set
+ uint linewidth_ix; // count of tag bytes with bit 6 (0x40) set
+ uint pathseg_ix; // count of tag bytes whose low 2 bits (segment type) are nonzero
+ uint path_ix; // count of tag bytes with bit 4 (0x10) set
+ uint pathseg_offset; // summed per-byte point-data word counts (see reduce_tag)
+};
+
+struct PathCubicRef // Reference to a PathCubic record inside Memory::memory.
+{
+ uint offset; // PathCubic_write shifts this right by 2 to obtain a word index
+};
+
+struct PathCubic // Record written by PathCubic_write as 12 consecutive words.
+{
+ float2 p0; // words 0-1
+ float2 p1; // words 2-3
+ float2 p2; // words 4-5
+ float2 p3; // words 6-7
+ uint path_ix; // word 8
+ uint trans_ix; // word 9
+ float2 stroke; // words 10-11
+};
+
+struct PathSegRef // Reference to a path segment record inside Memory::memory.
+{
+ uint offset; // PathSeg_Cubic_write shifts this right by 2 to obtain a word index
+};
+
+struct TransformRef // Reference to a Transform inside SceneBuf::scene.
+{
+ uint offset; // Transform_read shifts this right by 2 to obtain a word index
+};
+
+struct Transform // Six floats read by Transform_read.
+{
+ float4 mat; // the kernel applies it to a point p as mat.xy * p.x + mat.zw * p.y
+ float2 translate; // added after the mat terms
+};
+
+struct Monoid // Bounding-box accumulator merged by combine_monoid.
+{
+ float4 bbox; // xy = minimum corner, zw = maximum corner
+ uint flags; // bit 0 and bit 1 are tested and propagated by combine_monoid
+};
+
+struct Memory // Layout of the read-write buffer bound at buffer(0): three header words, then the word array.
+{
+ uint mem_offset;
+ uint mem_error;
+ uint blend_offset;
+ uint memory[1]; // declared with one element but indexed by arbitrary word offsets (see write_mem)
+};
+
+struct SceneBuf // Layout of the read-only scene buffer: a plain array of 32-bit words.
+{
+ uint scene[1]; // declared with one element but indexed by arbitrary word offsets
+};
+
+struct Alloc_1 // Same single-field shape as Alloc; this is the type used for the members of Config.
+{
+ uint offset;
+};
+
+struct Config // Configuration block; every member is a single 32-bit word.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc_1 tile_alloc;
+ Alloc_1 bin_alloc;
+ Alloc_1 ptcl_alloc;
+ Alloc_1 pathseg_alloc; // base used for the PathSegRef built in main0
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc;
+ Alloc_1 drawmonoid_alloc;
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc;
+ Alloc_1 draw_bbox_alloc;
+ Alloc_1 drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset; // base used for the TransformRef built in main0
+ uint linewidth_offset; // shifted right by 2 in main0 to index scene[]
+ uint pathtag_offset; // shifted right by 2 in main0 to index scene[]
+ uint pathseg_offset; // shifted right by 2 in main0 to index scene[]
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Layout of the buffer bound at buffer(1): a single Config.
+{
+ Config conf;
+};
+
+struct TagMonoid_1 // Same fields as TagMonoid; this is the element type of ParentBuf::parent.
+{
+ uint trans_ix;
+ uint linewidth_ix;
+ uint pathseg_ix;
+ uint path_ix;
+ uint pathseg_offset;
+};
+
+struct ParentBuf // Layout of the buffer bound at buffer(3).
+{
+ TagMonoid_1 parent[1]; // main0 reads the element at index gl_WorkGroupID.x - 1
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // 256 threads per group, matching the threadgroup array sizes in main0
+
+static inline __attribute__((always_inline)) // Summarises one tag word (four tag bytes) into a TagMonoid.
+TagMonoid reduce_tag(thread const uint& tag_word)
+{
+ uint point_count = tag_word & 50529027u; // 0x03030303: low 2 bits (segment type) of each byte
+ TagMonoid c;
+ c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); // x * 7 has bit 2 set exactly for x = 1..3; mask 0x04040404 counts those bytes
+ c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); // 0x40404040: bit 6 of each byte
+ c.path_ix = uint(int(popcount(tag_word & 269488144u))); // 0x10101010: bit 4 of each byte
+ c.trans_ix = uint(int(popcount(tag_word & 538976288u))); // 0x20202020: bit 5 of each byte
+ uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); // per byte: segment type plus bit 2
+ uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); // per byte: doubled when bit 3 is set
+ a += (a >> uint(8));
+ a += (a >> uint(16)); // the low byte now holds the sum of the four per-byte counts
+ c.pathseg_offset = a & 255u;
+ return c;
+}
+
+static inline __attribute__((always_inline)) // Returns the field-wise sum of a and b.
+TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b)
+{
+ TagMonoid c;
+ c.trans_ix = a.trans_ix + b.trans_ix;
+ c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
+ c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
+ return c;
+}
+
+static inline __attribute__((always_inline)) // Returns the all-zero TagMonoid, the neutral element for combine_tag_monoid.
+TagMonoid tag_monoid_identity()
+{
+ return TagMonoid{ 0u, 0u, 0u, 0u, 0u };
+}
+
+static inline __attribute__((always_inline)) // Reads a point stored as two consecutive float words at scene[ix].
+float2 read_f32_point(thread const uint& ix, const device SceneBuf& v_382)
+{
+ float x = as_type<float>(v_382.scene[ix]); // reinterpret the raw word as a float
+ float y = as_type<float>(v_382.scene[ix + 1u]);
+ return float2(x, y);
+}
+
+static inline __attribute__((always_inline)) // Reads a point packed as two signed 16-bit values in scene[ix].
+float2 read_i16_point(thread const uint& ix, const device SceneBuf& v_382)
+{
+ uint raw = v_382.scene[ix];
+ float x = float(int(raw << uint(16)) >> 16); // low half: shift up, then signed shift down to sign-extend
+ float y = float(int(raw) >> 16); // high half: signed shift keeps the sign
+ return float2(x, y);
+}
+
+static inline __attribute__((always_inline)) // Loads six consecutive float words from the scene buffer into a Transform.
+Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_382)
+{
+ uint ix = ref.offset >> uint(2); // convert the reference offset to a word index
+ uint raw0 = v_382.scene[ix + 0u];
+ uint raw1 = v_382.scene[ix + 1u];
+ uint raw2 = v_382.scene[ix + 2u];
+ uint raw3 = v_382.scene[ix + 3u];
+ uint raw4 = v_382.scene[ix + 4u];
+ uint raw5 = v_382.scene[ix + 5u];
+ Transform s;
+ s.mat = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3)); // words 0-3
+ s.translate = float2(as_type<float>(raw4), as_type<float>(raw5)); // words 4-5
+ return s;
+}
+
+static inline __attribute__((always_inline)) // Access check hook; in this generated variant it accepts every access.
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+ return true; // neither argument is inspected
+}
+
+static inline __attribute__((always_inline)) // Stores val at word index `offset` of v_105.memory.
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_105)
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken here: touch_mem always returns true
+ {
+ return;
+ }
+ v_105.memory[offset] = val; // no range check is performed on offset
+}
+
+static inline __attribute__((always_inline)) // Writes the 12 words of s starting at ref via write_mem.
+void PathCubic_write(thread const Alloc& a, thread const PathCubicRef& ref, thread const PathCubic& s, device Memory& v_105)
+{
+ uint ix = ref.offset >> uint(2); // convert the reference offset to a word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = as_type<uint>(s.p0.x); // floats are stored as their raw bit patterns
+ write_mem(param, param_1, param_2, v_105);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = as_type<uint>(s.p0.y);
+ write_mem(param_3, param_4, param_5, v_105);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = as_type<uint>(s.p1.x);
+ write_mem(param_6, param_7, param_8, v_105);
+ Alloc param_9 = a;
+ uint param_10 = ix + 3u;
+ uint param_11 = as_type<uint>(s.p1.y);
+ write_mem(param_9, param_10, param_11, v_105);
+ Alloc param_12 = a;
+ uint param_13 = ix + 4u;
+ uint param_14 = as_type<uint>(s.p2.x);
+ write_mem(param_12, param_13, param_14, v_105);
+ Alloc param_15 = a;
+ uint param_16 = ix + 5u;
+ uint param_17 = as_type<uint>(s.p2.y);
+ write_mem(param_15, param_16, param_17, v_105);
+ Alloc param_18 = a;
+ uint param_19 = ix + 6u;
+ uint param_20 = as_type<uint>(s.p3.x);
+ write_mem(param_18, param_19, param_20, v_105);
+ Alloc param_21 = a;
+ uint param_22 = ix + 7u;
+ uint param_23 = as_type<uint>(s.p3.y);
+ write_mem(param_21, param_22, param_23, v_105);
+ Alloc param_24 = a;
+ uint param_25 = ix + 8u;
+ uint param_26 = s.path_ix; // integer fields are stored unchanged
+ write_mem(param_24, param_25, param_26, v_105);
+ Alloc param_27 = a;
+ uint param_28 = ix + 9u;
+ uint param_29 = s.trans_ix;
+ write_mem(param_27, param_28, param_29, v_105);
+ Alloc param_30 = a;
+ uint param_31 = ix + 10u;
+ uint param_32 = as_type<uint>(s.stroke.x);
+ write_mem(param_30, param_31, param_32, v_105);
+ Alloc param_33 = a;
+ uint param_34 = ix + 11u;
+ uint param_35 = as_type<uint>(s.stroke.y);
+ write_mem(param_33, param_34, param_35, v_105);
+}
+
+static inline __attribute__((always_inline)) // Writes a header word followed by the PathCubic payload.
+void PathSeg_Cubic_write(thread const Alloc& a, thread const PathSegRef& ref, thread const uint& flags, thread const PathCubic& s, device Memory& v_105)
+{
+ Alloc param = a;
+ uint param_1 = ref.offset >> uint(2); // header goes to the word at ref
+ uint param_2 = (flags << uint(16)) | 1u; // header: flags in the upper 16 bits, constant 1 in the low bits
+ write_mem(param, param_1, param_2, v_105);
+ Alloc param_3 = a;
+ PathCubicRef param_4 = PathCubicRef{ ref.offset + 4u }; // payload starts one word (4 bytes) after the header
+ PathCubic param_5 = s;
+ PathCubic_write(param_3, param_4, param_5, v_105);
+}
+
+static inline __attribute__((always_inline)) // Merges two bbox monoids, a on the left and b on the right; order matters.
+Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b)
+{
+ Monoid c;
+ c.bbox = b.bbox; // default result: b's box
+ bool _442 = (a.flags & 1u) == 0u;
+ bool _450;
+ if (_442) // the if/else chains below are the expanded form of short-circuit && / ||
+ {
+ _450 = b.bbox.z <= b.bbox.x;
+ }
+ else
+ {
+ _450 = _442;
+ }
+ bool _458;
+ if (_450)
+ {
+ _458 = b.bbox.w <= b.bbox.y;
+ }
+ else
+ {
+ _458 = _450;
+ }
+ if (_458) // a's bit 0 clear AND b has neither positive width nor positive height
+ {
+ c.bbox = a.bbox; // b contributes nothing: keep a's box
+ }
+ else
+ {
+ bool _468 = (a.flags & 1u) == 0u;
+ bool _475;
+ if (_468)
+ {
+ _475 = (b.flags & 2u) == 0u;
+ }
+ else
+ {
+ _475 = _468;
+ }
+ bool _492;
+ if (_475)
+ {
+ bool _482 = a.bbox.z > a.bbox.x;
+ bool _491;
+ if (!_482)
+ {
+ _491 = a.bbox.w > a.bbox.y;
+ }
+ else
+ {
+ _491 = _482;
+ }
+ _492 = _491;
+ }
+ else
+ {
+ _492 = _475;
+ }
+ if (_492) // a's bit 0 clear AND b's bit 1 clear AND a has positive width or height
+ {
+ float4 _499 = c.bbox;
+ float2 _501 = fast::min(a.bbox.xy, _499.xy); // union: component-wise min of the minimum corners
+ c.bbox.x = _501.x;
+ c.bbox.y = _501.y;
+ float4 _510 = c.bbox;
+ float2 _512 = fast::max(a.bbox.zw, _510.zw); // union: component-wise max of the maximum corners
+ c.bbox.z = _512.x;
+ c.bbox.w = _512.y;
+ }
+ }
+ c.flags = (a.flags & 2u) | b.flags; // keep a's bit 1 and all of b's bits
+ c.flags |= ((a.flags & 1u) << uint(1)); // a's bit 0 is promoted into bit 1 of the result
+ return c;
+}
+
+static inline __attribute__((always_inline)) // Returns the Monoid with a zero bbox and no flags set.
+Monoid monoid_identity()
+{
+ return Monoid{ float4(0.0), 0u };
+}
+
+static inline __attribute__((always_inline)) // Floors x, adds a 32768 bias and clamps to be non-negative before converting to uint.
+uint round_down(thread const float& x)
+{
+ return uint(fast::max(0.0, floor(x) + 32768.0));
+}
+
+static inline __attribute__((always_inline)) // Ceils x, adds a 32768 bias and clamps to at most 65535 before converting to uint.
+uint round_up(thread const float& x)
+{
+ return uint(fast::min(65535.0, ceil(x) + 32768.0));
+}
+
+kernel void main0(device Memory& v_105 [[buffer(0)]], const device ConfigBuf& _605 [[buffer(1)]], const device SceneBuf& v_382 [[buffer(2)]], const device ParentBuf& _676 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+{
+ threadgroup TagMonoid sh_tag[256];
+ threadgroup Monoid sh_scratch[256];
+ uint ix = gl_GlobalInvocationID.x * 4u;
+ uint tag_word = v_382.scene[(_605.conf.pathtag_offset >> uint(2)) + (ix >> uint(2))];
+ uint param = tag_word;
+ TagMonoid local_tm = reduce_tag(param);
+ sh_tag[gl_LocalInvocationID.x] = local_tm;
+ for (uint i = 0u; i < 8u; i++)
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (gl_LocalInvocationID.x >= (1u << i))
+ {
+ TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)];
+ TagMonoid param_1 = other;
+ TagMonoid param_2 = local_tm;
+ local_tm = combine_tag_monoid(param_1, param_2);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ sh_tag[gl_LocalInvocationID.x] = local_tm;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ TagMonoid tm = tag_monoid_identity();
+ if (gl_WorkGroupID.x > 0u)
+ {
+ uint _679 = gl_WorkGroupID.x - 1u;
+ tm.trans_ix = _676.parent[_679].trans_ix;
+ tm.linewidth_ix = _676.parent[_679].linewidth_ix;
+ tm.pathseg_ix = _676.parent[_679].pathseg_ix;
+ tm.path_ix = _676.parent[_679].path_ix;
+ tm.pathseg_offset = _676.parent[_679].pathseg_offset;
+ }
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ TagMonoid param_3 = tm;
+ TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u];
+ tm = combine_tag_monoid(param_3, param_4);
+ }
+ uint ps_ix = (_605.conf.pathseg_offset >> uint(2)) + tm.pathseg_offset;
+ uint lw_ix = (_605.conf.linewidth_offset >> uint(2)) + tm.linewidth_ix;
+ uint save_path_ix = tm.path_ix;
+ uint trans_ix = tm.trans_ix;
+ TransformRef trans_ref = TransformRef{ _605.conf.trans_offset + (trans_ix * 24u) };
+ PathSegRef ps_ref = PathSegRef{ _605.conf.pathseg_alloc.offset + (tm.pathseg_ix * 52u) };
+ spvUnsafeArray<float, 4> linewidth;
+ spvUnsafeArray<uint, 4> save_trans_ix;
+ float2 p0;
+ float2 p1;
+ float2 p2;
+ float2 p3;
+ spvUnsafeArray<Monoid, 4> local;
+ PathCubic cubic;
+ Alloc param_14;
+ for (uint i_1 = 0u; i_1 < 4u; i_1++)
+ {
+ linewidth[i_1] = as_type<float>(v_382.scene[lw_ix]);
+ save_trans_ix[i_1] = trans_ix;
+ uint tag_byte = tag_word >> (i_1 * 8u);
+ uint seg_type = tag_byte & 3u;
+ if (seg_type != 0u)
+ {
+ if ((tag_byte & 8u) != 0u)
+ {
+ uint param_5 = ps_ix;
+ p0 = read_f32_point(param_5, v_382);
+ uint param_6 = ps_ix + 2u;
+ p1 = read_f32_point(param_6, v_382);
+ if (seg_type >= 2u)
+ {
+ uint param_7 = ps_ix + 4u;
+ p2 = read_f32_point(param_7, v_382);
+ if (seg_type == 3u)
+ {
+ uint param_8 = ps_ix + 6u;
+ p3 = read_f32_point(param_8, v_382);
+ }
+ }
+ }
+ else
+ {
+ uint param_9 = ps_ix;
+ p0 = read_i16_point(param_9, v_382);
+ uint param_10 = ps_ix + 1u;
+ p1 = read_i16_point(param_10, v_382);
+ if (seg_type >= 2u)
+ {
+ uint param_11 = ps_ix + 2u;
+ p2 = read_i16_point(param_11, v_382);
+ if (seg_type == 3u)
+ {
+ uint param_12 = ps_ix + 3u;
+ p3 = read_i16_point(param_12, v_382);
+ }
+ }
+ }
+ TransformRef param_13 = trans_ref;
+ Transform transform = Transform_read(param_13, v_382);
+ p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate;
+ p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate;
+ float4 bbox = float4(fast::min(p0, p1), fast::max(p0, p1));
+ if (seg_type >= 2u)
+ {
+ p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
+ float4 _906 = bbox;
+ float2 _909 = fast::min(_906.xy, p2);
+ bbox.x = _909.x;
+ bbox.y = _909.y;
+ float4 _914 = bbox;
+ float2 _917 = fast::max(_914.zw, p2);
+ bbox.z = _917.x;
+ bbox.w = _917.y;
+ if (seg_type == 3u)
+ {
+ p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
+ float4 _942 = bbox;
+ float2 _945 = fast::min(_942.xy, p3);
+ bbox.x = _945.x;
+ bbox.y = _945.y;
+ float4 _950 = bbox;
+ float2 _953 = fast::max(_950.zw, p3);
+ bbox.z = _953.x;
+ bbox.w = _953.y;
+ }
+ else
+ {
+ p3 = p2;
+ p2 = mix(p1, p2, float2(0.3333333432674407958984375));
+ p1 = mix(p1, p0, float2(0.3333333432674407958984375));
+ }
+ }
+ else
+ {
+ p3 = p1;
+ p2 = mix(p3, p0, float2(0.3333333432674407958984375));
+ p1 = mix(p0, p3, float2(0.3333333432674407958984375));
+ }
+ float2 stroke = float2(0.0);
+ if (linewidth[i_1] >= 0.0)
+ {
+ stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5 * linewidth[i_1]);
+ bbox += float4(-stroke, stroke);
+ }
+ local[i_1].bbox = bbox;
+ local[i_1].flags = 0u;
+ cubic.p0 = p0;
+ cubic.p1 = p1;
+ cubic.p2 = p2;
+ cubic.p3 = p3;
+ cubic.path_ix = tm.path_ix;
+ cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1;
+ cubic.stroke = stroke;
+ uint fill_mode = uint(linewidth[i_1] >= 0.0);
+ param_14.offset = _605.conf.pathseg_alloc.offset;
+ PathSegRef param_15 = ps_ref;
+ uint param_16 = fill_mode;
+ PathCubic param_17 = cubic;
+ PathSeg_Cubic_write(param_14, param_15, param_16, param_17, v_105);
+ ps_ref.offset += 52u;
+ uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u);
+ uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u));
+ ps_ix += n_words;
+ }
+ else
+ {
+ local[i_1].bbox = float4(0.0);
+ uint is_path = (tag_byte >> uint(4)) & 1u;
+ local[i_1].flags = is_path;
+ tm.path_ix += is_path;
+ trans_ix += ((tag_byte >> uint(5)) & 1u);
+ trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u);
+ lw_ix += ((tag_byte >> uint(6)) & 1u);
+ }
+ }
+ Monoid agg = local[0];
+ for (uint i_2 = 1u; i_2 < 4u; i_2++)
+ {
+ Monoid param_18 = agg;
+ Monoid param_19 = local[i_2];
+ agg = combine_monoid(param_18, param_19);
+ local[i_2] = agg;
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_3 = 0u; i_3 < 8u; i_3++)
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (gl_LocalInvocationID.x >= (1u << i_3))
+ {
+ Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)];
+ Monoid param_20 = other_1;
+ Monoid param_21 = agg;
+ agg = combine_monoid(param_20, param_21);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint path_ix = save_path_ix;
+ uint bbox_out_ix = (_605.conf.path_bbox_alloc.offset >> uint(2)) + (path_ix * 6u);
+ Monoid row = monoid_identity();
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ row = sh_scratch[gl_LocalInvocationID.x - 1u];
+ }
+ for (uint i_4 = 0u; i_4 < 4u; i_4++)
+ {
+ Monoid param_22 = row;
+ Monoid param_23 = local[i_4];
+ Monoid m = combine_monoid(param_22, param_23);
+ bool do_atomic = false;
+ bool _1224 = i_4 == 3u;
+ bool _1230;
+ if (_1224)
+ {
+ _1230 = gl_LocalInvocationID.x == 255u;
+ }
+ else
+ {
+ _1230 = _1224;
+ }
+ if (_1230)
+ {
+ do_atomic = true;
+ }
+ if ((m.flags & 1u) != 0u)
+ {
+ v_105.memory[bbox_out_ix + 4u] = as_type<uint>(linewidth[i_4]);
+ v_105.memory[bbox_out_ix + 5u] = save_trans_ix[i_4];
+ if ((m.flags & 2u) == 0u)
+ {
+ do_atomic = true;
+ }
+ else
+ {
+ float param_24 = m.bbox.x;
+ v_105.memory[bbox_out_ix] = round_down(param_24);
+ float param_25 = m.bbox.y;
+ v_105.memory[bbox_out_ix + 1u] = round_down(param_25);
+ float param_26 = m.bbox.z;
+ v_105.memory[bbox_out_ix + 2u] = round_up(param_26);
+ float param_27 = m.bbox.w;
+ v_105.memory[bbox_out_ix + 3u] = round_up(param_27);
+ bbox_out_ix += 6u;
+ do_atomic = false;
+ }
+ }
+ if (do_atomic)
+ {
+ bool _1295 = m.bbox.z > m.bbox.x;
+ bool _1304;
+ if (!_1295)
+ {
+ _1304 = m.bbox.w > m.bbox.y;
+ }
+ else
+ {
+ _1304 = _1295;
+ }
+ if (_1304)
+ {
+ float param_28 = m.bbox.x;
+ uint _1313 = atomic_fetch_min_explicit((device atomic_uint*)&v_105.memory[bbox_out_ix], round_down(param_28), memory_order_relaxed);
+ float param_29 = m.bbox.y;
+ uint _1321 = atomic_fetch_min_explicit((device atomic_uint*)&v_105.memory[bbox_out_ix + 1u], round_down(param_29), memory_order_relaxed);
+ float param_30 = m.bbox.z;
+ uint _1329 = atomic_fetch_max_explicit((device atomic_uint*)&v_105.memory[bbox_out_ix + 2u], round_up(param_30), memory_order_relaxed);
+ float param_31 = m.bbox.w;
+ uint _1337 = atomic_fetch_max_explicit((device atomic_uint*)&v_105.memory[bbox_out_ix + 3u], round_up(param_31), memory_order_relaxed);
+ }
+ bbox_out_ix += 6u;
+ }
+ }
+}
+
diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv
new file mode 100644
index 0000000..ffdd7f1
--- /dev/null
+++ b/piet-gpu/shader/gen/pathseg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
new file mode 100644
index 0000000..0298c43
--- /dev/null
+++ b/piet-gpu/shader/gen/pathtag_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl
new file mode 100644
index 0000000..a5b5fe2
--- /dev/null
+++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl
@@ -0,0 +1,138 @@
+struct TagMonoid // Five counters produced by reduce_tag and summed field by field in combine_tag_monoid.
+{
+ uint trans_ix; // number of tag bytes with bit 5 set (see reduce_tag)
+ uint linewidth_ix; // number of tag bytes with bit 6 set (see reduce_tag)
+ uint pathseg_ix; // number of tag bytes whose low two bits are non-zero (see reduce_tag)
+ uint path_ix; // number of tag bytes with bit 4 set (see reduce_tag)
+ uint pathseg_offset; // sum of the per-byte word counts computed in reduce_tag
+};
+
+struct Alloc // Single-field wrapper; only used here as a member type of Config.
+{
+ uint offset; // NOTE(review): presumably a byte offset into the shared memory buffer - confirm
+};
+
+struct Config // 27 four-byte fields; this shader never names them and instead reads raw byte offsets from _139.
+{
+ uint mem_size; // field 0 -> byte 0
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc; // field 5 -> byte 20
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset; // field 23 -> byte 92, the offset comp_main loads with _139.Load(92)
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u); // same shape as [numthreads(128, 1, 1)] on main
+
+ByteAddressBuffer _139 : register(t1, space0); // read-only; comp_main reads byte 92 from it
+ByteAddressBuffer _151 : register(t2, space0); // read-only; comp_main reads the tag words from it
+RWByteAddressBuffer _238 : register(u3, space0); // comp_main stores one 20-byte result per workgroup here
+RWByteAddressBuffer _258 : register(u0, space0); // bound but not referenced anywhere else in this file
+
+static uint3 gl_WorkGroupID; // the three statics below are filled in by main from stage_input
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input // Entry-point parameter block carrying the system-value semantics.
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared TagMonoid sh_scratch[128]; // one slot per thread of the 128-wide workgroup
+
+TagMonoid reduce_tag(uint tag_word) // Builds a TagMonoid from one 32-bit word handled as four independent tag bytes.
+{
+ uint point_count = tag_word & 50529027u; // 0x03030303: low two bits of every byte
+ TagMonoid c;
+ c.pathseg_ix = uint(int(countbits((point_count * 7u) & 67372036u))); // v*7 has bit 2 (mask 0x04040404) set for v = 1..3, so this counts bytes with a non-zero 2-bit field
+ c.linewidth_ix = uint(int(countbits(tag_word & 1077952576u))); // 0x40404040: bit 6 of every byte
+ c.path_ix = uint(int(countbits(tag_word & 269488144u))); // 0x10101010: bit 4 of every byte
+ c.trans_ix = uint(int(countbits(tag_word & 538976288u))); // 0x20202020: bit 5 of every byte
+ uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); // per byte: 2-bit field plus bit 2 (0x01010101 lane mask)
+ uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); // per byte: value is doubled when bit 3 is set (lane mask becomes 0x0F, else 0)
+ a += (a >> uint(8)); // fold the four byte lanes together ...
+ a += (a >> uint(16));
+ c.pathseg_offset = a & 255u; // ... so the low byte now holds the sum over all four lanes
+ return c;
+}
+
+TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) // Returns the field-wise sum of a and b.
+{
+ TagMonoid c;
+ c.trans_ix = a.trans_ix + b.trans_ix;
+ c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
+ c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
+ return c;
+}
+
+void comp_main() // Reduces two tag words per thread, then all 128 threads, to one TagMonoid per workgroup stored in _238.
+{
+ uint ix = gl_GlobalInvocationID.x * 2u; // each thread owns two consecutive words
+ uint scene_ix = (_139.Load(92) >> uint(2)) + ix; // value at byte 92 of _139 is shifted from bytes to a word index
+ uint tag_word = _151.Load(scene_ix * 4 + 0);
+ uint param = tag_word;
+ TagMonoid agg = reduce_tag(param);
+ for (uint i = 1u; i < 2u; i++) // executes once: folds in the thread's second word
+ {
+ tag_word = _151.Load((scene_ix + i) * 4 + 0);
+ uint param_1 = tag_word;
+ TagMonoid param_2 = agg;
+ TagMonoid param_3 = reduce_tag(param_1);
+ agg = combine_tag_monoid(param_2, param_3);
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 7u; i_1++) // 7 doubling steps span 2^7 = 128 slots
+ {
+ GroupMemoryBarrierWithGroupSync(); // slot writes from the previous step must land before they are read
+ if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u)
+ {
+ TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; // partner sits 2^i_1 slots higher
+ TagMonoid param_4 = agg;
+ TagMonoid param_5 = other;
+ agg = combine_tag_monoid(param_4, param_5);
+ }
+ GroupMemoryBarrierWithGroupSync(); // every read completes before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 0u) // after the loop thread 0 holds the sum over all 128 slots
+ {
+ _238.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); // 20-byte stride: five uints per workgroup
+ _238.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix);
+ _238.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix);
+ _238.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix);
+ _238.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset);
+ }
+}
+
+[numthreads(128, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system values into file-scope statics, then runs comp_main.
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl
new file mode 100644
index 0000000..e347b71
--- /dev/null
+++ b/piet-gpu/shader/gen/pathtag_reduce.msl
@@ -0,0 +1,155 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TagMonoid // Five counters produced by reduce_tag and summed field by field in combine_tag_monoid.
+{
+ uint trans_ix; // number of tag bytes with bit 5 set (see reduce_tag)
+ uint linewidth_ix; // number of tag bytes with bit 6 set (see reduce_tag)
+ uint pathseg_ix; // number of tag bytes whose low two bits are non-zero (see reduce_tag)
+ uint path_ix; // number of tag bytes with bit 4 set (see reduce_tag)
+ uint pathseg_offset; // sum of the per-byte word counts computed in reduce_tag
+};
+
+struct Alloc // Single-field wrapper; only used here as a member type of Config.
+{
+ uint offset; // NOTE(review): presumably a byte offset into the shared memory buffer - confirm
+};
+
+struct Config // Configuration record behind ConfigBuf; main0 reads only pathtag_offset from it.
+{
+ uint mem_size;
+ uint n_elements;
+ uint n_pathseg;
+ uint width_in_tiles;
+ uint height_in_tiles;
+ Alloc tile_alloc;
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc;
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset; // shifted right by 2 in main0 to get the first tag-word index
+ uint pathseg_offset;
+ uint drawtag_offset;
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Layout of the buffer bound at [[buffer(1)]] in main0.
+{
+ Config conf;
+};
+
+struct SceneBuf // Layout of the buffer bound at [[buffer(2)]] in main0.
+{
+ uint scene[1]; // declared with one element but indexed by main0 at arbitrary word offsets
+};
+
+struct TagMonoid_1 // Same five fields as TagMonoid; this is the element type of OutBuf.
+{
+ uint trans_ix;
+ uint linewidth_ix;
+ uint pathseg_ix;
+ uint path_ix;
+ uint pathseg_offset;
+};
+
+struct OutBuf // Layout of the buffer bound at [[buffer(3)]] in main0.
+{
+ TagMonoid_1 outbuf[1]; // declared with one element but indexed by workgroup id in main0
+};
+
+struct Memory // Declared here but not used by any function in this file.
+{
+ uint mem_offset;
+ uint mem_error;
+ uint blend_offset;
+ uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(128u, 1u, 1u); // 128 matches the sh_scratch length and the bound checked in main0
+
+static inline __attribute__((always_inline))
+TagMonoid reduce_tag(thread const uint& tag_word) // Builds a TagMonoid from one 32-bit word handled as four independent tag bytes.
+{
+ uint point_count = tag_word & 50529027u; // 0x03030303: low two bits of every byte
+ TagMonoid c;
+ c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); // v*7 has bit 2 (mask 0x04040404) set for v = 1..3, so this counts bytes with a non-zero 2-bit field
+ c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); // 0x40404040: bit 6 of every byte
+ c.path_ix = uint(int(popcount(tag_word & 269488144u))); // 0x10101010: bit 4 of every byte
+ c.trans_ix = uint(int(popcount(tag_word & 538976288u))); // 0x20202020: bit 5 of every byte
+ uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); // per byte: 2-bit field plus bit 2 (0x01010101 lane mask)
+ uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); // per byte: value is doubled when bit 3 is set (lane mask becomes 0x0F, else 0)
+ a += (a >> uint(8)); // fold the four byte lanes together ...
+ a += (a >> uint(16));
+ c.pathseg_offset = a & 255u; // ... so the low byte now holds the sum over all four lanes
+ return c;
+}
+
+static inline __attribute__((always_inline))
+TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) // Returns the field-wise sum of a and b.
+{
+ TagMonoid c;
+ c.trans_ix = a.trans_ix + b.trans_ix;
+ c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
+ c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
+ return c;
+}
+
+kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _238 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) // Reduces two tag words per thread, then all 128 threads, to one TagMonoid per threadgroup.
+{
+ threadgroup TagMonoid sh_scratch[128]; // one slot per thread
+ uint ix = gl_GlobalInvocationID.x * 2u; // each thread owns two consecutive words
+ uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; // pathtag_offset is shifted right by 2 to index uint elements
+ uint tag_word = _151.scene[scene_ix];
+ uint param = tag_word;
+ TagMonoid agg = reduce_tag(param);
+ for (uint i = 1u; i < 2u; i++) // executes once: folds in the thread's second word
+ {
+ tag_word = _151.scene[scene_ix + i];
+ uint param_1 = tag_word;
+ TagMonoid param_2 = agg;
+ TagMonoid param_3 = reduce_tag(param_1);
+ agg = combine_tag_monoid(param_2, param_3);
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 7u; i_1++) // 7 doubling steps span 2^7 = 128 slots
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // slot writes from the previous step must land before they are read
+ if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u)
+ {
+ TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; // partner sits 2^i_1 slots higher
+ TagMonoid param_4 = agg;
+ TagMonoid param_5 = other;
+ agg = combine_tag_monoid(param_4, param_5);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // every read completes before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 0u) // after the loop thread 0 holds the sum over all 128 slots
+ {
+ _238.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; // one output record per threadgroup
+ _238.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix;
+ _238.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix;
+ _238.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+ _238.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset;
+ }
+}
+
diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv
new file mode 100644
index 0000000..78f1fd6
--- /dev/null
+++ b/piet-gpu/shader/gen/pathtag_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil
new file mode 100644
index 0000000..4badf4f
--- /dev/null
+++ b/piet-gpu/shader/gen/pathtag_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl
new file mode 100644
index 0000000..7ad806c
--- /dev/null
+++ b/piet-gpu/shader/gen/pathtag_root.hlsl
@@ -0,0 +1,115 @@
+struct TagMonoid // Five uint counters; combine_tag_monoid adds two of these field by field.
+{
+ uint trans_ix; // stored at byte 0 of each 20-byte record in _78
+ uint linewidth_ix; // byte 4
+ uint pathseg_ix; // byte 8
+ uint path_ix; // byte 12
+ uint pathseg_offset; // byte 16
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same shape as [numthreads(256, 1, 1)] on main
+
+static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u }; // all-zero value returned by tag_monoid_identity
+
+RWByteAddressBuffer _78 : register(u0, space0); // comp_main both reads and writes this buffer
+
+static uint3 gl_LocalInvocationID; // the two statics below are filled in by main from stage_input
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input // Entry-point parameter block carrying the system-value semantics.
+{
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared TagMonoid sh_scratch[256]; // one slot per thread of the 256-wide workgroup
+
+TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) // Returns the field-wise sum of a and b.
+{
+ TagMonoid c;
+ c.trans_ix = a.trans_ix + b.trans_ix;
+ c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
+ c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
+ return c;
+}
+
+TagMonoid tag_monoid_identity() // Returns the all-zero TagMonoid, which leaves the other operand of combine_tag_monoid unchanged.
+{
+ return _18;
+}
+
+void comp_main() // Replaces the 8 records owned by each thread in _78 with running sums taken across the whole workgroup.
+{
+ uint ix = gl_GlobalInvocationID.x * 8u; // each thread owns 8 consecutive 20-byte records
+ TagMonoid _82;
+ _82.trans_ix = _78.Load(ix * 20 + 0);
+ _82.linewidth_ix = _78.Load(ix * 20 + 4);
+ _82.pathseg_ix = _78.Load(ix * 20 + 8);
+ _82.path_ix = _78.Load(ix * 20 + 12);
+ _82.pathseg_offset = _78.Load(ix * 20 + 16);
+ TagMonoid local[8]; // local[i] becomes the sum of this thread's records 0..i
+ local[0].trans_ix = _82.trans_ix;
+ local[0].linewidth_ix = _82.linewidth_ix;
+ local[0].pathseg_ix = _82.pathseg_ix;
+ local[0].path_ix = _82.path_ix;
+ local[0].pathseg_offset = _82.pathseg_offset;
+ TagMonoid param_1;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ TagMonoid param = local[i - 1u];
+ TagMonoid _115;
+ _115.trans_ix = _78.Load((ix + i) * 20 + 0);
+ _115.linewidth_ix = _78.Load((ix + i) * 20 + 4);
+ _115.pathseg_ix = _78.Load((ix + i) * 20 + 8);
+ _115.path_ix = _78.Load((ix + i) * 20 + 12);
+ _115.pathseg_offset = _78.Load((ix + i) * 20 + 16);
+ param_1.trans_ix = _115.trans_ix;
+ param_1.linewidth_ix = _115.linewidth_ix;
+ param_1.pathseg_ix = _115.pathseg_ix;
+ param_1.path_ix = _115.path_ix;
+ param_1.pathseg_offset = _115.pathseg_offset;
+ local[i] = combine_tag_monoid(param, param_1);
+ }
+ TagMonoid agg = local[7]; // total of this thread's 8 records
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8 doubling steps span 2^8 = 256 slots
+ {
+ GroupMemoryBarrierWithGroupSync(); // slot writes from the previous step must land before they are read
+ if (gl_LocalInvocationID.x >= (1u << i_1))
+ {
+ TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; // partner sits 2^i_1 slots lower
+ TagMonoid param_2 = other;
+ TagMonoid param_3 = agg;
+ agg = combine_tag_monoid(param_2, param_3);
+ }
+ GroupMemoryBarrierWithGroupSync(); // every read completes before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ TagMonoid row = tag_monoid_identity(); // thread 0 has nothing before it
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ row = sh_scratch[gl_LocalInvocationID.x - 1u]; // sum over all lower-numbered threads in the workgroup
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++)
+ {
+ TagMonoid param_4 = row;
+ TagMonoid param_5 = local[i_2];
+ TagMonoid m = combine_tag_monoid(param_4, param_5);
+ uint _210 = ix + i_2;
+ _78.Store(_210 * 20 + 0, m.trans_ix); // written back over the record that was read
+ _78.Store(_210 * 20 + 4, m.linewidth_ix);
+ _78.Store(_210 * 20 + 8, m.pathseg_ix);
+ _78.Store(_210 * 20 + 12, m.path_ix);
+ _78.Store(_210 * 20 + 16, m.pathseg_offset);
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system values into file-scope statics, then runs comp_main.
+{
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/pathtag_root.msl b/piet-gpu/shader/gen/pathtag_root.msl
new file mode 100644
index 0000000..65e3741
--- /dev/null
+++ b/piet-gpu/shader/gen/pathtag_root.msl
@@ -0,0 +1,146 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray // Fixed-size array wrapper whose operator[] is overloaded per address space; no overload checks pos.
+{
+ T elements[Num ? Num : 1]; // a Num of 0 still gets one element
+
+ thread T& operator [] (size_t pos) thread // thread address space, mutable
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread // thread address space, read-only
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device // device address space, mutable
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device // device address space, read-only
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // constant address space has a read-only overload only
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup // threadgroup address space, mutable
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup // threadgroup address space, read-only
+ {
+ return elements[pos];
+ }
+};
+
+struct TagMonoid // Five uint counters; combine_tag_monoid adds two of these field by field.
+{
+ uint trans_ix;
+ uint linewidth_ix;
+ uint pathseg_ix;
+ uint path_ix;
+ uint pathseg_offset;
+};
+
+struct TagMonoid_1 // Same five fields as TagMonoid; this is the element type of DataBuf.
+{
+ uint trans_ix;
+ uint linewidth_ix;
+ uint pathseg_ix;
+ uint path_ix;
+ uint pathseg_offset;
+};
+
+struct DataBuf // Layout of the buffer bound at [[buffer(0)]] in main0.
+{
+ TagMonoid_1 data[1]; // declared with one element but indexed by main0 at ix .. ix + 7
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // 256 matches the sh_scratch length in main0
+
+static inline __attribute__((always_inline))
+TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) // Returns the field-wise sum of a and b.
+{
+ TagMonoid c;
+ c.trans_ix = a.trans_ix + b.trans_ix;
+ c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
+ c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
+ c.path_ix = a.path_ix + b.path_ix;
+ c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
+ return c;
+}
+
+static inline __attribute__((always_inline))
+TagMonoid tag_monoid_identity() // Returns the all-zero TagMonoid, which leaves the other operand of combine_tag_monoid unchanged.
+{
+ return TagMonoid{ 0u, 0u, 0u, 0u, 0u };
+}
+
+kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) // Replaces the 8 records owned by each thread in _78.data with running sums taken across the whole threadgroup.
+{
+ threadgroup TagMonoid sh_scratch[256]; // one slot per thread
+ uint ix = gl_GlobalInvocationID.x * 8u; // each thread owns 8 consecutive records
+ spvUnsafeArray<TagMonoid, 8> local; // local[i] becomes the sum of this thread's records 0..i
+ local[0].trans_ix = _78.data[ix].trans_ix;
+ local[0].linewidth_ix = _78.data[ix].linewidth_ix;
+ local[0].pathseg_ix = _78.data[ix].pathseg_ix;
+ local[0].path_ix = _78.data[ix].path_ix;
+ local[0].pathseg_offset = _78.data[ix].pathseg_offset;
+ TagMonoid param_1;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ uint _109 = ix + i;
+ TagMonoid param = local[i - 1u];
+ param_1.trans_ix = _78.data[_109].trans_ix;
+ param_1.linewidth_ix = _78.data[_109].linewidth_ix;
+ param_1.pathseg_ix = _78.data[_109].pathseg_ix;
+ param_1.path_ix = _78.data[_109].path_ix;
+ param_1.pathseg_offset = _78.data[_109].pathseg_offset;
+ local[i] = combine_tag_monoid(param, param_1);
+ }
+ TagMonoid agg = local[7]; // total of this thread's 8 records
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 8u; i_1++) // 8 doubling steps span 2^8 = 256 slots
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // slot writes from the previous step must land before they are read
+ if (gl_LocalInvocationID.x >= (1u << i_1))
+ {
+ TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; // partner sits 2^i_1 slots lower
+ TagMonoid param_2 = other;
+ TagMonoid param_3 = agg;
+ agg = combine_tag_monoid(param_2, param_3);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // every read completes before any slot is overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ TagMonoid row = tag_monoid_identity(); // thread 0 has nothing before it
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ row = sh_scratch[gl_LocalInvocationID.x - 1u]; // sum over all lower-numbered threads in the threadgroup
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++)
+ {
+ TagMonoid param_4 = row;
+ TagMonoid param_5 = local[i_2];
+ TagMonoid m = combine_tag_monoid(param_4, param_5);
+ uint _210 = ix + i_2;
+ _78.data[_210].trans_ix = m.trans_ix; // written back over the record that was read
+ _78.data[_210].linewidth_ix = m.linewidth_ix;
+ _78.data[_210].pathseg_ix = m.pathseg_ix;
+ _78.data[_210].path_ix = m.path_ix;
+ _78.data[_210].pathseg_offset = m.pathseg_offset;
+ }
+}
+
diff --git a/piet-gpu/shader/gen/pathtag_root.spv b/piet-gpu/shader/gen/pathtag_root.spv
new file mode 100644
index 0000000..3783b49
--- /dev/null
+++ b/piet-gpu/shader/gen/pathtag_root.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
new file mode 100644
index 0000000..ecf865c
--- /dev/null
+++ b/piet-gpu/shader/gen/tile_alloc.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl
new file mode 100644
index 0000000..e36370c
--- /dev/null
+++ b/piet-gpu/shader/gen/tile_alloc.hlsl
@@ -0,0 +1,235 @@
+struct Alloc // Single-offset handle passed to touch_mem, write_mem and Path_write.
+{
+ uint offset; // comp_main fills this from byte 20 of _148
+};
+
+struct PathRef // Locates one Path record for Path_write.
+{
+ uint offset; // Path_write shifts this right by 2 to get a word index, so it is a byte offset
+};
+
+struct TileRef // Locates the first tile belonging to a path.
+{
+ uint offset; // comp_main sets this to offset_start + 8 * tile_subix
+};
+
+struct Path // Record that Path_write packs into three consecutive words.
+{
+ uint4 bbox; // x0, y0, x1, y1 after clamping in comp_main; packed as two 16-bit pairs on write
+ TileRef tiles;
+};
+
+struct Config // 27 four-byte fields; this shader never names them and instead reads raw byte offsets from _148.
+{
+ uint mem_size; // byte 0, loaded as the malloc_stage limit
+ uint n_elements; // byte 4, loaded as the element_ix bound
+ uint n_pathseg;
+ uint width_in_tiles; // byte 12, loaded as the clamp limit for x0 and x1
+ uint height_in_tiles; // byte 16, loaded as the clamp limit for y0 and y1
+ Alloc tile_alloc; // byte 20, loaded as the base for path_ref
+ Alloc bin_alloc;
+ Alloc ptcl_alloc;
+ Alloc pathseg_alloc;
+ Alloc anno_alloc;
+ Alloc path_bbox_alloc;
+ Alloc drawmonoid_alloc;
+ Alloc clip_alloc;
+ Alloc clip_bic_alloc;
+ Alloc clip_stack_alloc;
+ Alloc clip_bbox_alloc;
+ Alloc draw_bbox_alloc; // byte 64, loaded by load_draw_bbox
+ Alloc drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset; // byte 100, loaded as drawtag_base
+ uint drawdata_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same shape as [numthreads(256, 1, 1)] on main
+
+RWByteAddressBuffer _53 : register(u0, space0); // read/write; atomics hit bytes 0 and 4, data accesses add 12 bytes
+ByteAddressBuffer _148 : register(t1, space0); // read-only; read at fixed byte offsets matching the Config field order
+ByteAddressBuffer _231 : register(t2, space0); // read-only; comp_main reads the draw tag from it
+
+static uint3 gl_LocalInvocationID; // the two statics below are filled in by main from stage_input
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input // Entry-point parameter block carrying the system-value semantics.
+{
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared uint sh_tile_count[256]; // per-thread tile counts, turned into running totals by comp_main
+groupshared uint sh_tile_offset; // written by thread 255 only, then read by every thread
+
+bool check_deps(uint dep_stage) // Returns true when none of the dep_stage bits are set in the word at byte 4 of _53.
+{
+ uint _60;
+ _53.InterlockedOr(4, 0u, _60); // OR with 0 changes nothing; used to fetch the current value atomically
+ return (_60 & dep_stage) == 0u;
+}
+
+float4 load_draw_bbox(uint draw_ix) // Reads four consecutive words for draw_ix from _53 and returns them as (x0, y0, x1, y1).
+{
+ uint base = (_148.Load(64) >> uint(2)) + (4u * draw_ix); // byte 64 of _148 gives the table start; four words per entry
+ float x0 = asfloat(_53.Load(base * 4 + 12)); // word index to byte address, plus 12 bytes
+ float y0 = asfloat(_53.Load((base + 1u) * 4 + 12));
+ float x1 = asfloat(_53.Load((base + 2u) * 4 + 12));
+ float y1 = asfloat(_53.Load((base + 3u) * 4 + 12));
+ float4 bbox = float4(x0, y0, x1, y1);
+ return bbox;
+}
+
+uint malloc_stage(uint size, uint mem_size, uint stage) // Atomically reserves size at the counter in byte 0 of _53; returns the old counter, or 0 after flagging stage on overflow.
+{
+ uint _70;
+ _53.InterlockedAdd(0, size, _70); // _70 receives the counter value from before the add
+ uint offset = _70;
+ if ((offset + size) > mem_size)
+ {
+ uint _80;
+ _53.InterlockedOr(4, stage, _80); // record the failing stage in the word that check_deps inspects
+ offset = 0u; // comp_main treats 0 as failure
+ }
+ return offset;
+}
+
+bool touch_mem(Alloc alloc, uint offset) // Always returns true; both arguments are ignored in this build.
+{
+ return true;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val) // Stores val at word index offset of _53 (byte offset * 4 + 12).
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken here because touch_mem always returns true
+ {
+ return;
+ }
+ _53.Store(offset * 4 + 12, val);
+}
+
+void Path_write(Alloc a, PathRef ref, Path s) // Writes s as three consecutive words starting at word index ref.offset / 4.
+{
+ uint ix = ref.offset >> uint(2); // byte offset to word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); // word 0: bbox.x in the low half, bbox.y in the high half
+ write_mem(param, param_1, param_2);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); // word 1: bbox.z in the low half, bbox.w in the high half
+ write_mem(param_3, param_4, param_5);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = s.tiles.offset; // word 2: tile offset
+ write_mem(param_6, param_7, param_8);
+}
+
+void comp_main() // Per element: computes a clamped tile bbox, reserves tile storage for the whole workgroup, writes the Path record and zeroes the tiles.
+{
+ uint param = 1u;
+ bool _191 = check_deps(param); // stop if bit 0 of the status word is already set
+ if (!_191)
+ {
+ return;
+ }
+ uint th_ix = gl_LocalInvocationID.x;
+ uint element_ix = gl_GlobalInvocationID.x;
+ PathRef _215 = { _148.Load(20) + (element_ix * 12u) }; // 12 bytes per element = the three words Path_write emits
+ PathRef path_ref = _215;
+ uint drawtag_base = _148.Load(100) >> uint(2);
+ uint drawtag = 0u; // threads at or past the count at byte 4 of _148 keep tag 0
+ if (element_ix < _148.Load(4))
+ {
+ drawtag = _231.Load((drawtag_base + element_ix) * 4 + 0);
+ }
+ int x0 = 0;
+ int y0 = 0;
+ int x1 = 0;
+ int y1 = 0;
+ if ((drawtag != 0u) && (drawtag != 37u)) // NOTE(review): 0 and 37 are presumably tags that own no tiles - confirm against the draw tag definitions
+ {
+ uint param_1 = element_ix;
+ float4 bbox = load_draw_bbox(param_1);
+ x0 = int(floor(bbox.x * 0.0625f)); // 0.0625 = 1/16; NOTE(review): presumably 16-unit tiles - confirm
+ y0 = int(floor(bbox.y * 0.0625f));
+ x1 = int(ceil(bbox.z * 0.0625f));
+ y1 = int(ceil(bbox.w * 0.0625f));
+ }
+ x0 = clamp(x0, 0, int(_148.Load(12))); // limits come from bytes 12 and 16 of _148
+ y0 = clamp(y0, 0, int(_148.Load(16)));
+ x1 = clamp(x1, 0, int(_148.Load(12)));
+ y1 = clamp(y1, 0, int(_148.Load(16)));
+ Path path;
+ path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
+ uint tile_count = uint((x1 - x0) * (y1 - y0));
+ sh_tile_count[th_ix] = tile_count;
+ uint total_tile_count = tile_count;
+ for (uint i = 0u; i < 8u; i++) // 8 doubling steps span 2^8 = 256 slots
+ {
+ GroupMemoryBarrierWithGroupSync(); // slot writes from the previous step must land before they are read
+ if (th_ix >= (1u << i))
+ {
+ total_tile_count += sh_tile_count[th_ix - (1u << i)];
+ }
+ GroupMemoryBarrierWithGroupSync(); // every read completes before any slot is overwritten
+ sh_tile_count[th_ix] = total_tile_count;
+ }
+ if (th_ix == 255u) // the last thread's running total covers the whole workgroup
+ {
+ uint param_2 = total_tile_count * 8u; // 8 bytes reserved per tile
+ uint param_3 = _148.Load(0);
+ uint param_4 = 2u; // bit ORed into the status word if the reservation does not fit
+ uint _369 = malloc_stage(param_2, param_3, param_4);
+ sh_tile_offset = _369;
+ }
+ GroupMemoryBarrierWithGroupSync(); // publish sh_tile_offset to every thread
+ uint offset_start = sh_tile_offset;
+ if (offset_start == 0u) // malloc_stage returns 0 when the reservation failed
+ {
+ return;
+ }
+ if (element_ix < _148.Load(4))
+ {
+ uint _386;
+ if (th_ix > 0u)
+ {
+ _386 = sh_tile_count[th_ix - 1u]; // tiles claimed by lower-numbered threads
+ }
+ else
+ {
+ _386 = 0u;
+ }
+ uint tile_subix = _386;
+ TileRef _399 = { offset_start + (8u * tile_subix) };
+ path.tiles = _399;
+ Alloc _405;
+ _405.offset = _148.Load(20);
+ Alloc param_5;
+ param_5.offset = _405.offset;
+ PathRef param_6 = path_ref;
+ Path param_7 = path;
+ Path_write(param_5, param_6, param_7);
+ }
+ uint total_count = sh_tile_count[255] * 2u; // two words per tile
+ uint start_ix = offset_start >> uint(2);
+ for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) // the 256 threads share the zero-fill, striding by the workgroup size
+ {
+ _53.Store((start_ix + i_1) * 4 + 12, 0u);
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system values into file-scope statics, then runs comp_main.
+{
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl
new file mode 100644
index 0000000..85fecdc
--- /dev/null
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@@ -0,0 +1,246 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct Alloc // Single-offset handle passed to touch_mem, write_mem and Path_write.
+{
+ uint offset; // main0 fills this from conf.tile_alloc.offset
+};
+
+struct PathRef // Locates one Path record for Path_write.
+{
+ uint offset; // Path_write shifts this right by 2 to index memory[], so it is a byte offset
+};
+
+struct TileRef // Locates the first tile belonging to a path.
+{
+ uint offset; // main0 sets this to offset_start + 8 * tile_subix
+};
+
+struct Path // Record that Path_write packs into three consecutive words.
+{
+ uint4 bbox; // x0, y0, x1, y1 after clamping in main0; packed as two 16-bit pairs on write
+ TileRef tiles;
+};
+
+struct Memory // Layout of the read/write buffer bound at [[buffer(0)]] in main0.
+{
+ uint mem_offset; // bumped atomically by malloc_stage
+ uint mem_error; // stage bits ORed in by malloc_stage and read by check_deps
+ uint blend_offset; // not referenced in this file
+ uint memory[1]; // declared with one element but indexed at arbitrary word offsets
+};
+
+struct Alloc_1 // Same single field as Alloc; this is the member type used inside Config.
+{
+ uint offset;
+};
+
+struct Config // Configuration record behind ConfigBuf; comments mark the fields this file reads.
+{
+ uint mem_size; // limit passed to malloc_stage
+ uint n_elements; // bound checked against element_ix
+ uint n_pathseg;
+ uint width_in_tiles; // clamp limit for x0 and x1
+ uint height_in_tiles; // clamp limit for y0 and y1
+ Alloc_1 tile_alloc; // base for path_ref and for the Alloc handed to Path_write
+ Alloc_1 bin_alloc;
+ Alloc_1 ptcl_alloc;
+ Alloc_1 pathseg_alloc;
+ Alloc_1 anno_alloc;
+ Alloc_1 path_bbox_alloc;
+ Alloc_1 drawmonoid_alloc;
+ Alloc_1 clip_alloc;
+ Alloc_1 clip_bic_alloc;
+ Alloc_1 clip_stack_alloc;
+ Alloc_1 clip_bbox_alloc;
+ Alloc_1 draw_bbox_alloc; // table start used by load_draw_bbox
+ Alloc_1 drawinfo_alloc;
+ uint n_trans;
+ uint n_path;
+ uint n_clip;
+ uint trans_offset;
+ uint linewidth_offset;
+ uint pathtag_offset;
+ uint pathseg_offset;
+ uint drawtag_offset; // shifted right by 2 to form drawtag_base
+ uint drawdata_offset;
+};
+
+struct ConfigBuf // Layout of the buffer bound at [[buffer(1)]] in main0.
+{
+ Config conf;
+};
+
+struct SceneBuf // Layout of the buffer bound at [[buffer(2)]] in main0.
+{
+ uint scene[1]; // declared with one element but indexed at drawtag_base + element_ix
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // 256 matches the sh_tile_count length and the zero-fill stride in main0
+
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_53) // Returns true when none of the dep_stage bits are set in v_53.mem_error.
+{
+ uint _60 = atomic_fetch_or_explicit((device atomic_uint*)&v_53.mem_error, 0u, memory_order_relaxed); // OR with 0 changes nothing; used to fetch the current value atomically
+ return (_60 & dep_stage) == 0u;
+}
+
+static inline __attribute__((always_inline))
+float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_53, const device ConfigBuf& v_148) // Reads four consecutive words for draw_ix and returns them as (x0, y0, x1, y1).
+{
+ uint base = (v_148.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); // offset is shifted to a word index; four words per entry
+ float x0 = as_type<float>(v_53.memory[base]); // bit-cast, not a numeric conversion
+ float y0 = as_type<float>(v_53.memory[base + 1u]);
+ float x1 = as_type<float>(v_53.memory[base + 2u]);
+ float y1 = as_type<float>(v_53.memory[base + 3u]);
+ float4 bbox = float4(x0, y0, x1, y1);
+ return bbox;
+}
+
+static inline __attribute__((always_inline))
+uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_53) // Atomically reserves size at v_53.mem_offset; returns the old offset, or 0 after flagging stage on overflow.
+{
+ uint _70 = atomic_fetch_add_explicit((device atomic_uint*)&v_53.mem_offset, size, memory_order_relaxed); // _70 is the offset from before the add
+ uint offset = _70;
+ if ((offset + size) > mem_size)
+ {
+ uint _80 = atomic_fetch_or_explicit((device atomic_uint*)&v_53.mem_error, stage, memory_order_relaxed); // record the failing stage where check_deps will see it
+ offset = 0u; // main0 treats 0 as failure
+ }
+ return offset;
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset) // Always returns true; both arguments are ignored in this build.
+{
+ return true;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_53) // Stores val at v_53.memory[offset].
+{
+ Alloc param = alloc;
+ uint param_1 = offset;
+ if (!touch_mem(param, param_1)) // never taken here because touch_mem always returns true
+ {
+ return;
+ }
+ v_53.memory[offset] = val;
+}
+
+static inline __attribute__((always_inline))
+void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_53) // Writes s as three consecutive words starting at word index ref.offset / 4.
+{
+ uint ix = ref.offset >> uint(2); // byte offset to word index
+ Alloc param = a;
+ uint param_1 = ix + 0u;
+ uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); // word 0: bbox.x in the low half, bbox.y in the high half
+ write_mem(param, param_1, param_2, v_53);
+ Alloc param_3 = a;
+ uint param_4 = ix + 1u;
+ uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); // word 1: bbox.z in the low half, bbox.w in the high half
+ write_mem(param_3, param_4, param_5, v_53);
+ Alloc param_6 = a;
+ uint param_7 = ix + 2u;
+ uint param_8 = s.tiles.offset; // word 2: tile offset
+ write_mem(param_6, param_7, param_8, v_53);
+}
+
+kernel void main0(device Memory& v_53 [[buffer(0)]], const device ConfigBuf& v_148 [[buffer(1)]], const device SceneBuf& _231 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) // Per element: computes a clamped tile bbox, reserves tile storage for the whole threadgroup, writes the Path record and zeroes the tiles.
+{
+ threadgroup uint sh_tile_count[256]; // per-thread tile counts, turned into running totals below
+ threadgroup uint sh_tile_offset; // written by thread 255 only, then read by every thread
+ uint param = 1u;
+ bool _191 = check_deps(param, v_53); // stop if bit 0 of mem_error is already set
+ if (!_191)
+ {
+ return;
+ }
+ uint th_ix = gl_LocalInvocationID.x;
+ uint element_ix = gl_GlobalInvocationID.x;
+ PathRef path_ref = PathRef{ v_148.conf.tile_alloc.offset + (element_ix * 12u) }; // 12 bytes per element = the three words Path_write emits
+ uint drawtag_base = v_148.conf.drawtag_offset >> uint(2);
+ uint drawtag = 0u; // threads at or past n_elements keep tag 0
+ if (element_ix < v_148.conf.n_elements)
+ {
+ drawtag = _231.scene[drawtag_base + element_ix];
+ }
+ int x0 = 0;
+ int y0 = 0;
+ int x1 = 0;
+ int y1 = 0;
+ if ((drawtag != 0u) && (drawtag != 37u)) // NOTE(review): 0 and 37 are presumably tags that own no tiles - confirm against the draw tag definitions
+ {
+ uint param_1 = element_ix;
+ float4 bbox = load_draw_bbox(param_1, v_53, v_148);
+ x0 = int(floor(bbox.x * 0.0625)); // 0.0625 = 1/16; NOTE(review): presumably 16-unit tiles - confirm
+ y0 = int(floor(bbox.y * 0.0625));
+ x1 = int(ceil(bbox.z * 0.0625));
+ y1 = int(ceil(bbox.w * 0.0625));
+ }
+ x0 = clamp(x0, 0, int(v_148.conf.width_in_tiles));
+ y0 = clamp(y0, 0, int(v_148.conf.height_in_tiles));
+ x1 = clamp(x1, 0, int(v_148.conf.width_in_tiles));
+ y1 = clamp(y1, 0, int(v_148.conf.height_in_tiles));
+ Path path;
+ path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
+ uint tile_count = uint((x1 - x0) * (y1 - y0));
+ sh_tile_count[th_ix] = tile_count;
+ uint total_tile_count = tile_count;
+ for (uint i = 0u; i < 8u; i++) // 8 doubling steps span 2^8 = 256 slots
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // slot writes from the previous step must land before they are read
+ if (th_ix >= (1u << i))
+ {
+ total_tile_count += sh_tile_count[th_ix - (1u << i)];
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // every read completes before any slot is overwritten
+ sh_tile_count[th_ix] = total_tile_count;
+ }
+ if (th_ix == 255u) // the last thread's running total covers the whole threadgroup
+ {
+ uint param_2 = total_tile_count * 8u; // 8 bytes reserved per tile
+ uint param_3 = v_148.conf.mem_size;
+ uint param_4 = 2u; // bit ORed into mem_error if the reservation does not fit
+ uint _369 = malloc_stage(param_2, param_3, param_4, v_53);
+ sh_tile_offset = _369;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // publish sh_tile_offset to every thread
+ uint offset_start = sh_tile_offset;
+ if (offset_start == 0u) // malloc_stage returns 0 when the reservation failed
+ {
+ return;
+ }
+ if (element_ix < v_148.conf.n_elements)
+ {
+ uint _386;
+ if (th_ix > 0u)
+ {
+ _386 = sh_tile_count[th_ix - 1u]; // tiles claimed by lower-numbered threads
+ }
+ else
+ {
+ _386 = 0u;
+ }
+ uint tile_subix = _386;
+ path.tiles = TileRef{ offset_start + (8u * tile_subix) };
+ Alloc param_5;
+ param_5.offset = v_148.conf.tile_alloc.offset;
+ PathRef param_6 = path_ref;
+ Path param_7 = path;
+ Path_write(param_5, param_6, param_7, v_53);
+ }
+ uint total_count = sh_tile_count[255] * 2u; // two words per tile
+ uint start_ix = offset_start >> uint(2);
+ for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) // the 256 threads share the zero-fill, striding by the threadgroup size
+ {
+ v_53.memory[start_ix + i_1] = 0u;
+ }
+}
+
diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv
new file mode 100644
index 0000000..c6457a2
--- /dev/null
+++ b/piet-gpu/shader/gen/tile_alloc.spv
Binary files differ
diff --git a/tests/shader/gen/clear.dxil b/tests/shader/gen/clear.dxil
new file mode 100644
index 0000000..75422dd
--- /dev/null
+++ b/tests/shader/gen/clear.dxil
Binary files differ
diff --git a/tests/shader/gen/clear.hlsl b/tests/shader/gen/clear.hlsl
new file mode 100644
index 0000000..f6a576c
--- /dev/null
+++ b/tests/shader/gen/clear.hlsl
@@ -0,0 +1,26 @@
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same dimensions as [numthreads(256, 1, 1)] on main()
+
+ByteAddressBuffer _19 : register(t0); // read-only config: comp_main reads a count at byte 0 and a fill value at byte 4
+RWByteAddressBuffer _32 : register(u1); // writable target buffer filled by comp_main
+
+static uint3 gl_GlobalInvocationID; // file-static copy of SV_DispatchThreadID, assigned in main()
+struct SPIRV_Cross_Input // entry-point input carrying the dispatch thread id
+{
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+void comp_main() // Each thread writes the configured fill value into one 32-bit word of _32.
+{
+ uint ix = gl_GlobalInvocationID.x; // one word per dispatch thread
+ if (ix < _19.Load(0)) // skip threads at or beyond the count stored at byte 0 of _19
+ {
+ _32.Store(ix * 4 + 0, _19.Load(4)); // byte address ix*4 receives the value stored at byte 4 of _19
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: stores the thread id in the file-static, then runs comp_main().
+{
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/tests/shader/gen/clear.msl b/tests/shader/gen/clear.msl
new file mode 100644
index 0000000..d89853b
--- /dev/null
+++ b/tests/shader/gen/clear.msl
@@ -0,0 +1,27 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct ConfigBuf // bound read-only at buffer(0) in main0
+{
+ uint size; // main0 only writes indices below this value
+ uint value; // value main0 writes into each target element
+};
+
+struct TargetBuf // bound at buffer(1) in main0
+{
+ uint data[1]; // declared with length 1 but indexed by thread id in main0
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // not referenced by main0 in this file
+
+kernel void main0(const device ConfigBuf& _19 [[buffer(0)]], device TargetBuf& _32 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) // Each thread writes _19.value into one element of _32.data.
+{
+ uint ix = gl_GlobalInvocationID.x; // one element per grid thread
+ if (ix < _19.size) // threads at or beyond the configured size do nothing
+ {
+ _32.data[ix] = _19.value;
+ }
+}
+
diff --git a/tests/shader/gen/clear.spv b/tests/shader/gen/clear.spv
new file mode 100644
index 0000000..0e8d1d7
--- /dev/null
+++ b/tests/shader/gen/clear.spv
Binary files differ
diff --git a/tests/shader/gen/linkedlist.dxil b/tests/shader/gen/linkedlist.dxil
new file mode 100644
index 0000000..18491fa
--- /dev/null
+++ b/tests/shader/gen/linkedlist.dxil
Binary files differ
diff --git a/tests/shader/gen/linkedlist.hlsl b/tests/shader/gen/linkedlist.hlsl
new file mode 100644
index 0000000..614791a
--- /dev/null
+++ b/tests/shader/gen/linkedlist.hlsl
@@ -0,0 +1,39 @@
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same dimensions as [numthreads(256, 1, 1)] on main()
+
+RWByteAddressBuffer _56 : register(u0); // single buffer: comp_main uses word 0 as a counter, words 1..65535 as bucket heads, words from 65536 as nodes
+
+static uint3 gl_GlobalInvocationID; // file-static copy of SV_DispatchThreadID, assigned in main()
+struct SPIRV_Cross_Input // entry-point input carrying the dispatch thread id
+{
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+void comp_main() // Each thread makes 100 pseudo-random insertions of a two-word node at the head of a bucket's list in _56.
+{
+ uint rng = gl_GlobalInvocationID.x + 1u; // seed from the thread id plus one; a zero state would stay zero under the xor/shift steps below
+ for (uint i = 0u; i < 100u; i++)
+ {
+ rng ^= (rng << uint(13)); // three xor/shift steps (13, 17, 5) advance the generator state
+ rng ^= (rng >> uint(17));
+ rng ^= (rng << uint(5));
+ uint bucket = rng % 65536u; // bucket index in [0, 65535]
+ if (bucket != 0u) // word 0 is the allocation counter, so bucket 0 is never used as a list head
+ {
+ uint _61;
+ _56.InterlockedAdd(0, 2u, _61); // reserve two words; _61 receives the counter value before the add
+ uint alloc = _61 + 65536u; // node word index, placed after the 65536 words of counter and bucket heads
+ uint _67;
+ _56.InterlockedExchange(bucket * 4 + 0, alloc, _67); // swap the new node in as the bucket head; _67 receives the previous head
+ uint old = _67;
+ _56.Store(alloc * 4 + 0, old); // node word 0: previous head, i.e. the next link
+ _56.Store((alloc + 1u) * 4 + 0, gl_GlobalInvocationID.x); // node word 1: id of the inserting thread
+ }
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: stores the thread id in the file-static, then runs comp_main().
+{
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/tests/shader/gen/linkedlist.msl b/tests/shader/gen/linkedlist.msl
new file mode 100644
index 0000000..0461d79
--- /dev/null
+++ b/tests/shader/gen/linkedlist.msl
@@ -0,0 +1,36 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct MemBuf // bound at buffer(0) in main0
+{
+ uint mem[1]; // declared with length 1; main0 indexes it as counter (0), bucket heads (1..65535) and nodes (65536 onward)
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // not referenced by main0 in this file
+
+kernel void main0(device MemBuf& _56 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) // Each thread makes 100 pseudo-random insertions of a two-word node at the head of a bucket's list.
+{
+ uint rng = gl_GlobalInvocationID.x + 1u; // seed from the thread id plus one; a zero state would stay zero under the xor/shift steps below
+ for (uint i = 0u; i < 100u; i++)
+ {
+ rng ^= (rng << uint(13)); // three xor/shift steps (13, 17, 5) advance the generator state
+ rng ^= (rng >> uint(17));
+ rng ^= (rng << uint(5));
+ uint bucket = rng % 65536u; // bucket index in [0, 65535]
+ if (bucket != 0u) // mem[0] is the allocation counter, so bucket 0 is never used as a list head
+ {
+ uint _61 = atomic_fetch_add_explicit((device atomic_uint*)&_56.mem[0], 2u, memory_order_relaxed); // reserve two words; returns the counter before the add
+ uint alloc = _61 + 65536u; // node index, placed after the 65536 words of counter and bucket heads
+ uint _67 = atomic_exchange_explicit((device atomic_uint*)&_56.mem[bucket], alloc, memory_order_relaxed); // swap the new node in as the bucket head; returns the previous head
+ uint old = _67;
+ _56.mem[alloc] = old; // node word 0: previous head, i.e. the next link
+ _56.mem[alloc + 1u] = gl_GlobalInvocationID.x; // node word 1: id of the inserting thread
+ }
+ }
+}
+
diff --git a/tests/shader/gen/linkedlist.spv b/tests/shader/gen/linkedlist.spv
new file mode 100644
index 0000000..a723283
--- /dev/null
+++ b/tests/shader/gen/linkedlist.spv
Binary files differ
diff --git a/tests/shader/gen/message_passing.dxil b/tests/shader/gen/message_passing.dxil
new file mode 100644
index 0000000..fffb34b
--- /dev/null
+++ b/tests/shader/gen/message_passing.dxil
Binary files differ
diff --git a/tests/shader/gen/message_passing.hlsl b/tests/shader/gen/message_passing.hlsl
new file mode 100644
index 0000000..ba8ce5f
--- /dev/null
+++ b/tests/shader/gen/message_passing.hlsl
@@ -0,0 +1,54 @@
+struct Element // 8-byte record; comp_main addresses data_buf with an 8-byte stride, +0 and +4 (presumably data and flag; confirm against the GLSL source)
+{
+ uint data;
+ uint flag;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // same dimensions as [numthreads(256, 1, 1)] on main()
+
+RWByteAddressBuffer data_buf : register(u0); // element records written and read by every thread
+RWByteAddressBuffer control_buf : register(u1); // word 0 is incremented by comp_main when flag > data is observed
+
+static uint3 gl_GlobalInvocationID; // file-static copy of SV_DispatchThreadID, assigned in main()
+struct SPIRV_Cross_Input // entry-point input carrying the dispatch thread id
+{
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+uint permute_flag_ix(uint data_ix) // Maps a data index to the index of the element whose flag word is used for it.
+{
+ return (data_ix * 419u) & 65535u; // multiply by 419 and keep the low 16 bits, so the result is in [0, 65535]
+}
+
+void comp_main() // Writes this thread's data then its flag, reads another element's flag then data, and counts the case flag > data.
+{
+ uint _76;
+ data_buf.InterlockedExchange(gl_GlobalInvocationID.x * 8 + 0, 1u, _76); // atomically set the word at +0 of this thread's element to 1
+ DeviceMemoryBarrier(); // sits between the data write above and the flag write below
+ uint param = gl_GlobalInvocationID.x;
+ uint write_flag_ix = permute_flag_ix(param); // the flag for this thread's data lives in a different, permuted element
+ uint _77;
+ data_buf.InterlockedExchange(write_flag_ix * 8 + 4, 1u, _77); // atomically set the word at +4 of the permuted element to 1
+ uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u; // element this thread reads back, in [0, 65535]
+ uint param_1 = read_ix;
+ uint read_flag_ix = permute_flag_ix(param_1); // where the flag for read_ix is stored
+ uint _58;
+ data_buf.InterlockedAdd(read_flag_ix * 8 + 4, 0, _58); // add of 0: used only to fetch the current flag value atomically
+ uint flag = _58;
+ DeviceMemoryBarrier(); // sits between the flag read above and the data read below
+ uint _62;
+ data_buf.InterlockedAdd(read_ix * 8 + 0, 0, _62); // add of 0: fetch the current data value atomically
+ uint data = _62;
+ if (flag > data) // flag was seen as written while the matching data was not
+ {
+ uint _73;
+ control_buf.InterlockedAdd(0, 1u, _73); // count the occurrence in word 0 of control_buf
+ }
+}
+
+[numthreads(256, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: stores the thread id in the file-static, then runs comp_main().
+{
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/tests/shader/gen/message_passing.msl b/tests/shader/gen/message_passing.msl
new file mode 100644
index 0000000..e48f48a
--- /dev/null
+++ b/tests/shader/gen/message_passing.msl
@@ -0,0 +1,54 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct Element // one record of DataBuf
+{
+ uint data; // set to 1 by the thread whose grid index equals this element's index
+ uint flag; // set to 1 by the thread whose permute_flag_ix() result equals this element's index
+};
+
+struct DataBuf // bound at buffer(0) in main0
+{
+ Element data[1]; // declared with length 1; main0 indexes it by thread id and by values masked to [0, 65535]
+};
+
+struct ControlBuf // bound at buffer(1) in main0
+{
+ uint failures; // incremented by main0 each time flag > data is observed
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // not referenced by main0 in this file
+
+static inline __attribute__((always_inline))
+uint permute_flag_ix(thread const uint& data_ix) // Maps a data index to the index of the element whose flag field is used for it.
+{
+ return (data_ix * 419u) & 65535u; // multiply by 419 and keep the low 16 bits, so the result is in [0, 65535]
+}
+
+kernel void main0(device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) // Writes data then flag, reads another element's flag then data, and counts the case flag > data.
+{
+ atomic_store_explicit((device atomic_uint*)&data_buf.data[gl_GlobalInvocationID.x].data, 1u, memory_order_relaxed); // relaxed atomic write of this thread's data field
+ threadgroup_barrier(mem_flags::mem_device); // sits between the data write above and the flag write below; NOTE(review): confirm this gives the intended ordering across threadgroups
+ uint param = gl_GlobalInvocationID.x;
+ uint write_flag_ix = permute_flag_ix(param); // the flag for this thread's data lives in a different, permuted element
+ atomic_store_explicit((device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed); // relaxed atomic write of the flag field
+ uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u; // element this thread reads back, in [0, 65535]
+ uint param_1 = read_ix;
+ uint read_flag_ix = permute_flag_ix(param_1); // where the flag for read_ix is stored
+ uint _58 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed); // read the flag first
+ uint flag = _58;
+ threadgroup_barrier(mem_flags::mem_device); // sits between the flag read above and the data read below
+ uint _62 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_ix].data, memory_order_relaxed); // then read the matching data
+ uint data = _62;
+ if (flag > data) // flag was seen as written while the matching data was not
+ {
+ uint _73 = atomic_fetch_add_explicit((device atomic_uint*)&control_buf.failures, 1u, memory_order_relaxed); // count the occurrence; the returned value is unused
+ }
+}
+
diff --git a/tests/shader/gen/message_passing.spv b/tests/shader/gen/message_passing.spv
new file mode 100644
index 0000000..e5f56d6
--- /dev/null
+++ b/tests/shader/gen/message_passing.spv
Binary files differ
diff --git a/tests/shader/gen/message_passing_vkmm.spv b/tests/shader/gen/message_passing_vkmm.spv
new file mode 100644
index 0000000..8527c2b
--- /dev/null
+++ b/tests/shader/gen/message_passing_vkmm.spv
Binary files differ
diff --git a/tests/shader/gen/prefix.dxil b/tests/shader/gen/prefix.dxil
new file mode 100644
index 0000000..a6c4945
--- /dev/null
+++ b/tests/shader/gen/prefix.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix.hlsl b/tests/shader/gen/prefix.hlsl
new file mode 100644
index 0000000..72cfa90
--- /dev/null
+++ b/tests/shader/gen/prefix.hlsl
@@ -0,0 +1,225 @@
+struct Monoid // value type combined by combine_monoid(), which adds the element fields
+{
+ uint element;
+};
+
+struct State // three 32-bit words; comp_main addresses _43 with a 12-byte stride at +4, +8, +12 (presumably flag, aggregate, prefix after a counter at byte 0; confirm against the GLSL source)
+{
+ uint flag;
+ Monoid aggregate;
+ Monoid prefix;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); // same dimensions as [numthreads(512, 1, 1)] on main()
+
+static const Monoid _185 = { 0u }; // zero value used to initialise the exclusive prefix in comp_main
+
+globallycoherent RWByteAddressBuffer _43 : register(u2); // partition counter at byte 0 plus per-partition state words
+ByteAddressBuffer _67 : register(t0); // read-only input elements
+RWByteAddressBuffer _372 : register(u1); // output elements
+
+static uint3 gl_LocalInvocationID; // file-static copy of SV_GroupThreadID, assigned in main()
+struct SPIRV_Cross_Input // entry-point input carrying the thread id within the group
+{
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+};
+
+groupshared uint sh_part_ix; // partition index taken by thread 0 and read by the whole group
+groupshared Monoid sh_scratch[512]; // one slot per thread for the in-group scan
+groupshared uint sh_flag; // predecessor flag fetched by thread 511 and read by the whole group
+groupshared Monoid sh_prefix; // exclusive prefix of this partition, written by thread 511
+
+Monoid combine_monoid(Monoid a, Monoid b) // Returns a Monoid whose element is a.element + b.element.
+{
+ Monoid _22 = { a.element + b.element }; // uint addition of the two element fields
+ return _22;
+}
+
+void comp_main() // Inclusive prefix sum of _67 into _372: one 8192-element partition per workgroup, with cross-partition prefixes exchanged through _43.
+{
+ if (gl_LocalInvocationID.x == 0u)
+ {
+ uint _47;
+ _43.InterlockedAdd(0, 1u, _47); // take the next partition index from the counter at byte 0 of _43
+ sh_part_ix = _47;
+ }
+ GroupMemoryBarrierWithGroupSync(); // make sh_part_ix visible to every thread in the group
+ uint part_ix = sh_part_ix;
+ uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); // first element of this thread: 512 threads x 16 elements = 8192 per partition
+ Monoid _71;
+ _71.element = _67.Load(ix * 4 + 0);
+ Monoid local[16]; // local[i] holds the running sum of this thread's elements 0..i
+ local[0].element = _71.element;
+ Monoid param_1;
+ for (uint i = 1u; i < 16u; i++)
+ {
+ Monoid param = local[i - 1u];
+ Monoid _94;
+ _94.element = _67.Load((ix + i) * 4 + 0);
+ param_1.element = _94.element;
+ local[i] = combine_monoid(param, param_1);
+ }
+ Monoid agg = local[15]; // total of this thread's 16 elements
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // nine steps with strides 1, 2, ..., 256: agg becomes the sum of the totals of threads 0..x
+ {
+ GroupMemoryBarrierWithGroupSync(); // previous step's writes must be visible before reading
+ if (gl_LocalInvocationID.x >= (1u << i_1))
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ Monoid param_2 = other;
+ Monoid param_3 = agg;
+ agg = combine_monoid(param_2, param_3);
+ }
+ GroupMemoryBarrierWithGroupSync(); // all reads of this step finish before slots are overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 511u) // the last thread's agg is the total of the whole partition
+ {
+ _43.Store(part_ix * 12 + 8, agg.element); // write the partition total at +8 of this partition's state
+ if (part_ix == 0u)
+ {
+ _43.Store(12, agg.element); // partition 0 has no predecessor, so its total is also written at +12 (0 * 12 + 12)
+ }
+ }
+ DeviceMemoryBarrier(); // sits between the value stores above and the flag store below
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ uint flag = 1u; // 1: only the +8 value is written
+ if (part_ix == 0u)
+ {
+ flag = 2u; // 2: the +12 value is written as well
+ }
+ _43.Store(part_ix * 12 + 4, flag);
+ }
+ Monoid exclusive = _185; // sum of all partitions before this one; starts at zero
+ if (part_ix != 0u)
+ {
+ uint look_back_ix = part_ix - 1u; // start with the immediate predecessor partition
+ uint their_ix = 0u; // progress of the element-by-element fallback over look_back_ix
+ Monoid their_prefix;
+ Monoid their_agg;
+ Monoid m;
+ while (true)
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ sh_flag = _43.Load(look_back_ix * 12 + 4); // thread 511 polls the flag of the partition being inspected
+ }
+ GroupMemoryBarrierWithGroupSync();
+ DeviceMemoryBarrier();
+ uint flag_1 = sh_flag; // every thread reads the same flag, so the branches below are taken group-wide
+ GroupMemoryBarrierWithGroupSync();
+ if (flag_1 == 2u) // the +12 value of that partition is available: add it and stop
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ Monoid _223;
+ _223.element = _43.Load(look_back_ix * 12 + 12);
+ their_prefix.element = _223.element;
+ Monoid param_4 = their_prefix;
+ Monoid param_5 = exclusive;
+ exclusive = combine_monoid(param_4, param_5);
+ }
+ break;
+ }
+ else
+ {
+ if (flag_1 == 1u) // only the +8 total is available: add it and move one partition back
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ Monoid _245;
+ _245.element = _43.Load(look_back_ix * 12 + 8);
+ their_agg.element = _245.element;
+ Monoid param_6 = their_agg;
+ Monoid param_7 = exclusive;
+ exclusive = combine_monoid(param_6, param_7);
+ }
+ look_back_ix--;
+ their_ix = 0u; // discard any partial fallback sum for the partition just consumed
+ continue;
+ }
+ }
+ if (gl_LocalInvocationID.x == 511u) // flag is neither 1 nor 2: thread 511 sums that partition's input itself, one element per loop pass
+ {
+ Monoid _267;
+ _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
+ m.element = _267.element;
+ if (their_ix == 0u)
+ {
+ their_agg = m; // first element starts a fresh sum
+ }
+ else
+ {
+ Monoid param_8 = their_agg;
+ Monoid param_9 = m;
+ their_agg = combine_monoid(param_8, param_9);
+ }
+ their_ix++;
+ if (their_ix == 8192u) // all elements of that partition have been summed
+ {
+ Monoid param_10 = their_agg;
+ Monoid param_11 = exclusive;
+ exclusive = combine_monoid(param_10, param_11);
+ if (look_back_ix == 0u)
+ {
+ sh_flag = 2u; // nothing earlier to inspect: tell the group to leave the loop
+ }
+ else
+ {
+ look_back_ix--;
+ their_ix = 0u;
+ }
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ flag_1 = sh_flag; // re-read in case thread 511 set the termination value above
+ GroupMemoryBarrierWithGroupSync();
+ if (flag_1 == 2u)
+ {
+ break;
+ }
+ }
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ Monoid param_12 = exclusive;
+ Monoid param_13 = agg;
+ Monoid inclusive_prefix = combine_monoid(param_12, param_13); // sum through the end of this partition
+ sh_prefix = exclusive; // shared with the group after the barrier below
+ _43.Store(part_ix * 12 + 12, inclusive_prefix.element); // write it at +12 of this partition's state
+ }
+ DeviceMemoryBarrier(); // sits between the +12 store above and the flag store below
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ _43.Store(part_ix * 12 + 4, 2u); // flag 2: the +12 value is now written
+ }
+ }
+ GroupMemoryBarrierWithGroupSync(); // sh_prefix written by thread 511 becomes visible to the group
+ if (part_ix != 0u)
+ {
+ exclusive = sh_prefix;
+ }
+ Monoid row = exclusive; // offset for this thread: earlier partitions plus earlier threads of this group
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; // scanned total of threads 0..x-1
+ Monoid param_14 = row;
+ Monoid param_15 = other_1;
+ row = combine_monoid(param_14, param_15);
+ }
+ for (uint i_2 = 0u; i_2 < 16u; i_2++)
+ {
+ Monoid param_16 = row;
+ Monoid param_17 = local[i_2];
+ Monoid m_1 = combine_monoid(param_16, param_17); // offset plus this thread's running sum
+ _372.Store((ix + i_2) * 4 + 0, m_1.element);
+ }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: stores the in-group thread id in the file-static, then runs comp_main().
+{
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ comp_main();
+}
diff --git a/tests/shader/gen/prefix.msl b/tests/shader/gen/prefix.msl
new file mode 100644
index 0000000..24bee60
--- /dev/null
+++ b/tests/shader/gen/prefix.msl
@@ -0,0 +1,264 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray // Fixed-size array wrapper with operator[] overloads per address space; indexing is not bounds-checked.
+{
+ T elements[Num ? Num : 1]; // at least one element is declared even when Num is 0
+
+ thread T& operator [] (size_t pos) thread // mutable access, thread address space
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread // read-only access, thread address space
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device // mutable access, device address space
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device // read-only access, device address space
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // read-only access, constant address space
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup // mutable access, threadgroup address space
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup // read-only access, threadgroup address space
+ {
+ return elements[pos];
+ }
+};
+
+struct Monoid // value type used for locals and threadgroup data in main0
+{
+ uint element;
+};
+
+struct Monoid_1 // same single-uint layout as Monoid; used inside the buffer structs below
+{
+ uint element;
+};
+
+struct State // per-partition record in StateBuf
+{
+ uint flag; // main0 writes 1 after storing aggregate and 2 after storing prefix
+ Monoid_1 aggregate; // total of the partition's elements
+ Monoid_1 prefix; // sum through the end of the partition
+};
+
+struct StateBuf // bound at buffer(2) in main0
+{
+ uint part_counter; // atomically incremented by thread 0 of each group to obtain a partition index
+ State state[1]; // declared with length 1; indexed by partition index in main0
+};
+
+struct InBuf // bound read-only at buffer(0) in main0
+{
+ Monoid_1 inbuf[1]; // declared with length 1; indexed by element index in main0
+};
+
+struct OutBuf // bound at buffer(1) in main0
+{
+ Monoid_1 outbuf[1]; // declared with length 1; indexed by element index in main0
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); // not referenced by main0 in this file
+
+static inline __attribute__((always_inline))
+Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) // Returns a Monoid whose element is a.element + b.element.
+{
+ return Monoid{ a.element + b.element }; // uint addition of the two element fields
+}
+
+kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) // Inclusive prefix sum of inbuf into outbuf: one 8192-element partition per threadgroup, prefixes exchanged through _43.state.
+{
+ threadgroup uint sh_part_ix; // partition index taken by thread 0 and read by the whole group
+ threadgroup Monoid sh_scratch[512]; // one slot per thread for the in-group scan
+ threadgroup uint sh_flag; // predecessor flag fetched by thread 511 and read by the whole group
+ threadgroup Monoid sh_prefix; // exclusive prefix of this partition, written by thread 511
+ if (gl_LocalInvocationID.x == 0u)
+ {
+ uint _47 = atomic_fetch_add_explicit((volatile device atomic_uint*)&_43.part_counter, 1u, memory_order_relaxed); // take the next partition index
+ sh_part_ix = _47;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // make sh_part_ix visible to every thread in the group
+ uint part_ix = sh_part_ix;
+ uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); // first element of this thread: 512 threads x 16 elements = 8192 per partition
+ spvUnsafeArray<Monoid, 16> local; // local[i] holds the running sum of this thread's elements 0..i
+ local[0].element = _67.inbuf[ix].element;
+ Monoid param_1;
+ for (uint i = 1u; i < 16u; i++)
+ {
+ Monoid param = local[i - 1u];
+ param_1.element = _67.inbuf[ix + i].element;
+ local[i] = combine_monoid(param, param_1);
+ }
+ Monoid agg = local[15]; // total of this thread's 16 elements
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // nine steps with strides 1, 2, ..., 256: agg becomes the sum of the totals of threads 0..x
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // previous step's writes must be visible before reading
+ if (gl_LocalInvocationID.x >= (1u << i_1))
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ Monoid param_2 = other;
+ Monoid param_3 = agg;
+ agg = combine_monoid(param_2, param_3);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // all reads of this step finish before slots are overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 511u) // the last thread's agg is the total of the whole partition
+ {
+ _43.state[part_ix].aggregate.element = agg.element;
+ if (part_ix == 0u)
+ {
+ _43.state[0].prefix.element = agg.element; // partition 0 has no predecessor, so its total is also its prefix
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_device); // sits between the value stores above and the flag store below
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ uint flag = 1u; // 1: only aggregate is written
+ if (part_ix == 0u)
+ {
+ flag = 2u; // 2: prefix is written as well
+ }
+ _43.state[part_ix].flag = flag; // plain store through the volatile device reference
+ }
+ Monoid exclusive = Monoid{ 0u }; // sum of all partitions before this one; starts at zero
+ if (part_ix != 0u)
+ {
+ uint look_back_ix = part_ix - 1u; // start with the immediate predecessor partition
+ uint their_ix = 0u; // progress of the element-by-element fallback over look_back_ix
+ Monoid their_prefix;
+ Monoid their_agg;
+ Monoid m;
+ while (true)
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ sh_flag = _43.state[look_back_ix].flag; // thread 511 polls the flag of the partition being inspected
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_device);
+ uint flag_1 = sh_flag; // every thread reads the same flag, so the branches below are taken group-wide
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (flag_1 == 2u) // that partition's prefix is available: add it and stop
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ their_prefix.element = _43.state[look_back_ix].prefix.element;
+ Monoid param_4 = their_prefix;
+ Monoid param_5 = exclusive;
+ exclusive = combine_monoid(param_4, param_5);
+ }
+ break;
+ }
+ else
+ {
+ if (flag_1 == 1u) // only the aggregate is available: add it and move one partition back
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ their_agg.element = _43.state[look_back_ix].aggregate.element;
+ Monoid param_6 = their_agg;
+ Monoid param_7 = exclusive;
+ exclusive = combine_monoid(param_6, param_7);
+ }
+ look_back_ix--;
+ their_ix = 0u; // discard any partial fallback sum for the partition just consumed
+ continue;
+ }
+ }
+ if (gl_LocalInvocationID.x == 511u) // flag is neither 1 nor 2: thread 511 sums that partition's input itself, one element per loop pass
+ {
+ m.element = _67.inbuf[(look_back_ix * 8192u) + their_ix].element;
+ if (their_ix == 0u)
+ {
+ their_agg = m; // first element starts a fresh sum
+ }
+ else
+ {
+ Monoid param_8 = their_agg;
+ Monoid param_9 = m;
+ their_agg = combine_monoid(param_8, param_9);
+ }
+ their_ix++;
+ if (their_ix == 8192u) // all elements of that partition have been summed
+ {
+ Monoid param_10 = their_agg;
+ Monoid param_11 = exclusive;
+ exclusive = combine_monoid(param_10, param_11);
+ if (look_back_ix == 0u)
+ {
+ sh_flag = 2u; // nothing earlier to inspect: tell the group to leave the loop
+ }
+ else
+ {
+ look_back_ix--;
+ their_ix = 0u;
+ }
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ flag_1 = sh_flag; // re-read in case thread 511 set the termination value above
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (flag_1 == 2u)
+ {
+ break;
+ }
+ }
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ Monoid param_12 = exclusive;
+ Monoid param_13 = agg;
+ Monoid inclusive_prefix = combine_monoid(param_12, param_13); // sum through the end of this partition
+ sh_prefix = exclusive; // shared with the group after the barrier below
+ _43.state[part_ix].prefix.element = inclusive_prefix.element;
+ }
+ threadgroup_barrier(mem_flags::mem_device); // sits between the prefix store above and the flag store below
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ _43.state[part_ix].flag = 2u; // flag 2: prefix is now written
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // sh_prefix written by thread 511 becomes visible to the group
+ if (part_ix != 0u)
+ {
+ exclusive = sh_prefix;
+ }
+ Monoid row = exclusive; // offset for this thread: earlier partitions plus earlier threads of this group
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; // scanned total of threads 0..x-1
+ Monoid param_14 = row;
+ Monoid param_15 = other_1;
+ row = combine_monoid(param_14, param_15);
+ }
+ for (uint i_2 = 0u; i_2 < 16u; i_2++)
+ {
+ Monoid param_16 = row;
+ Monoid param_17 = local[i_2];
+ Monoid m_1 = combine_monoid(param_16, param_17); // offset plus this thread's running sum
+ _372.outbuf[ix + i_2].element = m_1.element;
+ }
+}
+
diff --git a/tests/shader/gen/prefix.spv b/tests/shader/gen/prefix.spv
new file mode 100644
index 0000000..8e7db4a
--- /dev/null
+++ b/tests/shader/gen/prefix.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_atomic.dxil b/tests/shader/gen/prefix_atomic.dxil
new file mode 100644
index 0000000..c1b3207
--- /dev/null
+++ b/tests/shader/gen/prefix_atomic.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix_atomic.hlsl b/tests/shader/gen/prefix_atomic.hlsl
new file mode 100644
index 0000000..a75448f
--- /dev/null
+++ b/tests/shader/gen/prefix_atomic.hlsl
@@ -0,0 +1,229 @@
+struct Monoid // value type combined by combine_monoid(), which adds the element fields
+{
+ uint element;
+};
+
+struct State // three 32-bit words; comp_main addresses _43 with a 12-byte stride at +4, +8, +12 (presumably flag, aggregate, prefix after a counter at byte 0; confirm against the GLSL source)
+{
+ uint flag;
+ Monoid aggregate;
+ Monoid prefix;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); // same dimensions as [numthreads(512, 1, 1)] on main()
+
+static const Monoid _185 = { 0u }; // zero value used to initialise the exclusive prefix in comp_main
+
+globallycoherent RWByteAddressBuffer _43 : register(u2); // partition counter at byte 0 plus per-partition state words
+ByteAddressBuffer _67 : register(t0); // read-only input elements
+RWByteAddressBuffer _372 : register(u1); // output elements
+
+static uint3 gl_LocalInvocationID; // file-static copy of SV_GroupThreadID, assigned in main()
+struct SPIRV_Cross_Input // entry-point input carrying the thread id within the group
+{
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+};
+
+groupshared uint sh_part_ix; // partition index taken by thread 0 and read by the whole group
+groupshared Monoid sh_scratch[512]; // one slot per thread for the in-group scan
+groupshared uint sh_flag; // predecessor flag fetched by thread 511 and read by the whole group
+groupshared Monoid sh_prefix; // exclusive prefix of this partition, written by thread 511
+
+Monoid combine_monoid(Monoid a, Monoid b) // Returns a Monoid whose element is a.element + b.element.
+{
+ Monoid _22 = { a.element + b.element }; // uint addition of the two element fields
+ return _22;
+}
+
+void comp_main() // Inclusive prefix sum of _67 into _372, one 8192-element partition per workgroup; the flag word in _43 is accessed with Interlocked operations.
+{
+ if (gl_LocalInvocationID.x == 0u)
+ {
+ uint _47;
+ _43.InterlockedAdd(0, 1u, _47); // take the next partition index from the counter at byte 0 of _43
+ sh_part_ix = _47;
+ }
+ GroupMemoryBarrierWithGroupSync(); // make sh_part_ix visible to every thread in the group
+ uint part_ix = sh_part_ix;
+ uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); // first element of this thread: 512 threads x 16 elements = 8192 per partition
+ Monoid _71;
+ _71.element = _67.Load(ix * 4 + 0);
+ Monoid local[16]; // local[i] holds the running sum of this thread's elements 0..i
+ local[0].element = _71.element;
+ Monoid param_1;
+ for (uint i = 1u; i < 16u; i++)
+ {
+ Monoid param = local[i - 1u];
+ Monoid _94;
+ _94.element = _67.Load((ix + i) * 4 + 0);
+ param_1.element = _94.element;
+ local[i] = combine_monoid(param, param_1);
+ }
+ Monoid agg = local[15]; // total of this thread's 16 elements
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // nine steps with strides 1, 2, ..., 256: agg becomes the sum of the totals of threads 0..x
+ {
+ GroupMemoryBarrierWithGroupSync(); // previous step's writes must be visible before reading
+ if (gl_LocalInvocationID.x >= (1u << i_1))
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ Monoid param_2 = other;
+ Monoid param_3 = agg;
+ agg = combine_monoid(param_2, param_3);
+ }
+ GroupMemoryBarrierWithGroupSync(); // all reads of this step finish before slots are overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 511u) // the last thread's agg is the total of the whole partition
+ {
+ _43.Store(part_ix * 12 + 8, agg.element); // write the partition total at +8 of this partition's state
+ if (part_ix == 0u)
+ {
+ _43.Store(12, agg.element); // partition 0 has no predecessor, so its total is also written at +12 (0 * 12 + 12)
+ }
+ }
+ DeviceMemoryBarrier(); // sits between the value stores above and the flag exchange below
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ uint flag = 1u; // 1: only the +8 value is written
+ if (part_ix == 0u)
+ {
+ flag = 2u; // 2: the +12 value is written as well
+ }
+ uint _383;
+ _43.InterlockedExchange(part_ix * 12 + 4, flag, _383); // atomic write of the flag; the previous value in _383 is unused
+ }
+ Monoid exclusive = _185; // sum of all partitions before this one; starts at zero
+ if (part_ix != 0u)
+ {
+ uint look_back_ix = part_ix - 1u; // start with the immediate predecessor partition
+ uint their_ix = 0u; // progress of the element-by-element fallback over look_back_ix
+ Monoid their_prefix;
+ Monoid their_agg;
+ Monoid m;
+ while (true)
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ uint _208;
+ _43.InterlockedAdd(look_back_ix * 12 + 4, 0, _208); // add of 0: used only to fetch the inspected partition's flag atomically
+ sh_flag = _208;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ DeviceMemoryBarrier();
+ uint flag_1 = sh_flag; // every thread reads the same flag, so the branches below are taken group-wide
+ GroupMemoryBarrierWithGroupSync();
+ if (flag_1 == 2u) // the +12 value of that partition is available: add it and stop
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ Monoid _223;
+ _223.element = _43.Load(look_back_ix * 12 + 12);
+ their_prefix.element = _223.element;
+ Monoid param_4 = their_prefix;
+ Monoid param_5 = exclusive;
+ exclusive = combine_monoid(param_4, param_5);
+ }
+ break;
+ }
+ else
+ {
+ if (flag_1 == 1u) // only the +8 total is available: add it and move one partition back
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ Monoid _245;
+ _245.element = _43.Load(look_back_ix * 12 + 8);
+ their_agg.element = _245.element;
+ Monoid param_6 = their_agg;
+ Monoid param_7 = exclusive;
+ exclusive = combine_monoid(param_6, param_7);
+ }
+ look_back_ix--;
+ their_ix = 0u; // discard any partial fallback sum for the partition just consumed
+ continue;
+ }
+ }
+ if (gl_LocalInvocationID.x == 511u) // flag is neither 1 nor 2: thread 511 sums that partition's input itself, one element per loop pass
+ {
+ Monoid _267;
+ _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
+ m.element = _267.element;
+ if (their_ix == 0u)
+ {
+ their_agg = m; // first element starts a fresh sum
+ }
+ else
+ {
+ Monoid param_8 = their_agg;
+ Monoid param_9 = m;
+ their_agg = combine_monoid(param_8, param_9);
+ }
+ their_ix++;
+ if (their_ix == 8192u) // all elements of that partition have been summed
+ {
+ Monoid param_10 = their_agg;
+ Monoid param_11 = exclusive;
+ exclusive = combine_monoid(param_10, param_11);
+ if (look_back_ix == 0u)
+ {
+ sh_flag = 2u; // nothing earlier to inspect: tell the group to leave the loop
+ }
+ else
+ {
+ look_back_ix--;
+ their_ix = 0u;
+ }
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ flag_1 = sh_flag; // re-read in case thread 511 set the termination value above
+ GroupMemoryBarrierWithGroupSync();
+ if (flag_1 == 2u)
+ {
+ break;
+ }
+ }
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ Monoid param_12 = exclusive;
+ Monoid param_13 = agg;
+ Monoid inclusive_prefix = combine_monoid(param_12, param_13); // sum through the end of this partition
+ sh_prefix = exclusive; // shared with the group after the barrier below
+ _43.Store(part_ix * 12 + 12, inclusive_prefix.element); // write it at +12 of this partition's state
+ }
+ DeviceMemoryBarrier(); // sits between the +12 store above and the flag exchange below
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ uint _384;
+ _43.InterlockedExchange(part_ix * 12 + 4, 2u, _384); // atomic write of flag 2: the +12 value is now written
+ }
+ }
+ GroupMemoryBarrierWithGroupSync(); // sh_prefix written by thread 511 becomes visible to the group
+ if (part_ix != 0u)
+ {
+ exclusive = sh_prefix;
+ }
+ Monoid row = exclusive; // offset for this thread: earlier partitions plus earlier threads of this group
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; // scanned total of threads 0..x-1
+ Monoid param_14 = row;
+ Monoid param_15 = other_1;
+ row = combine_monoid(param_14, param_15);
+ }
+ for (uint i_2 = 0u; i_2 < 16u; i_2++)
+ {
+ Monoid param_16 = row;
+ Monoid param_17 = local[i_2];
+ Monoid m_1 = combine_monoid(param_16, param_17); // offset plus this thread's running sum
+ _372.Store((ix + i_2) * 4 + 0, m_1.element);
+ }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: stores the in-group thread id in the file-static, then runs comp_main().
+{
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ comp_main();
+}
diff --git a/tests/shader/gen/prefix_atomic.msl b/tests/shader/gen/prefix_atomic.msl
new file mode 100644
index 0000000..910e842
--- /dev/null
+++ b/tests/shader/gen/prefix_atomic.msl
@@ -0,0 +1,265 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray // Fixed-size array wrapper with operator[] overloads per address space; indexing is not bounds-checked.
+{
+ T elements[Num ? Num : 1]; // at least one element is declared even when Num is 0
+
+ thread T& operator [] (size_t pos) thread // mutable access, thread address space
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread // read-only access, thread address space
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device // mutable access, device address space
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device // read-only access, device address space
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // read-only access, constant address space
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup // mutable access, threadgroup address space
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup // read-only access, threadgroup address space
+ {
+ return elements[pos];
+ }
+};
+
+struct Monoid // value type used for locals and threadgroup data in main0
+{
+ uint element;
+};
+
+struct Monoid_1 // same single-uint layout as Monoid; used inside the buffer structs below
+{
+ uint element;
+};
+
+struct State // per-partition record in StateBuf
+{
+ uint flag; // accessed atomically in main0: 1 after aggregate is stored, 2 after prefix is stored
+ Monoid_1 aggregate; // total of the partition's elements
+ Monoid_1 prefix; // sum through the end of the partition
+};
+
+struct StateBuf // bound at buffer(2) in main0
+{
+ uint part_counter; // atomically incremented by thread 0 of each group to obtain a partition index
+ State state[1]; // declared with length 1; indexed by partition index in main0
+};
+
+struct InBuf // bound read-only at buffer(0) in main0
+{
+ Monoid_1 inbuf[1]; // declared with length 1; indexed by element index in main0
+};
+
+struct OutBuf // bound at buffer(1) in main0
+{
+ Monoid_1 outbuf[1]; // declared with length 1; indexed by element index in main0
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); // not referenced by main0 in this file
+
+static inline __attribute__((always_inline))
+Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) // Returns a Monoid whose element is a.element + b.element.
+{
+ return Monoid{ a.element + b.element }; // uint addition of the two element fields
+}
+
+kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) // Inclusive prefix sum of inbuf into outbuf, one 8192-element partition per threadgroup; state flags are accessed with relaxed atomics.
+{
+ threadgroup uint sh_part_ix; // partition index taken by thread 0 and read by the whole group
+ threadgroup Monoid sh_scratch[512]; // one slot per thread for the in-group scan
+ threadgroup uint sh_flag; // predecessor flag fetched by thread 511 and read by the whole group
+ threadgroup Monoid sh_prefix; // exclusive prefix of this partition, written by thread 511
+ if (gl_LocalInvocationID.x == 0u)
+ {
+ uint _47 = atomic_fetch_add_explicit((volatile device atomic_uint*)&_43.part_counter, 1u, memory_order_relaxed); // take the next partition index
+ sh_part_ix = _47;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // make sh_part_ix visible to every thread in the group
+ uint part_ix = sh_part_ix;
+ uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); // first element of this thread: 512 threads x 16 elements = 8192 per partition
+ spvUnsafeArray<Monoid, 16> local; // local[i] holds the running sum of this thread's elements 0..i
+ local[0].element = _67.inbuf[ix].element;
+ Monoid param_1;
+ for (uint i = 1u; i < 16u; i++)
+ {
+ Monoid param = local[i - 1u];
+ param_1.element = _67.inbuf[ix + i].element;
+ local[i] = combine_monoid(param, param_1);
+ }
+ Monoid agg = local[15]; // total of this thread's 16 elements
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // nine steps with strides 1, 2, ..., 256: agg becomes the sum of the totals of threads 0..x
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // previous step's writes must be visible before reading
+ if (gl_LocalInvocationID.x >= (1u << i_1))
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ Monoid param_2 = other;
+ Monoid param_3 = agg;
+ agg = combine_monoid(param_2, param_3);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // all reads of this step finish before slots are overwritten
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 511u) // the last thread's agg is the total of the whole partition
+ {
+ _43.state[part_ix].aggregate.element = agg.element;
+ if (part_ix == 0u)
+ {
+ _43.state[0].prefix.element = agg.element; // partition 0 has no predecessor, so its total is also its prefix
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_device); // sits between the value stores above and the flag store below
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ uint flag = 1u; // 1: only aggregate is written
+ if (part_ix == 0u)
+ {
+ flag = 2u; // 2: prefix is written as well
+ }
+ atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].flag, flag, memory_order_relaxed); // relaxed atomic write of the flag
+ }
+ Monoid exclusive = Monoid{ 0u }; // sum of all partitions before this one; starts at zero
+ if (part_ix != 0u)
+ {
+ uint look_back_ix = part_ix - 1u; // start with the immediate predecessor partition
+ uint their_ix = 0u; // progress of the element-by-element fallback over look_back_ix
+ Monoid their_prefix;
+ Monoid their_agg;
+ Monoid m;
+ while (true)
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ uint _208 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].flag, memory_order_relaxed); // thread 511 polls the inspected partition's flag
+ sh_flag = _208;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_device);
+ uint flag_1 = sh_flag; // every thread reads the same flag, so the branches below are taken group-wide
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (flag_1 == 2u) // that partition's prefix is available: add it and stop
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ their_prefix.element = _43.state[look_back_ix].prefix.element;
+ Monoid param_4 = their_prefix;
+ Monoid param_5 = exclusive;
+ exclusive = combine_monoid(param_4, param_5);
+ }
+ break;
+ }
+ else
+ {
+ if (flag_1 == 1u) // only the aggregate is available: add it and move one partition back
+ {
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ their_agg.element = _43.state[look_back_ix].aggregate.element;
+ Monoid param_6 = their_agg;
+ Monoid param_7 = exclusive;
+ exclusive = combine_monoid(param_6, param_7);
+ }
+ look_back_ix--;
+ their_ix = 0u; // discard any partial fallback sum for the partition just consumed
+ continue;
+ }
+ }
+ if (gl_LocalInvocationID.x == 511u) // flag is neither 1 nor 2: thread 511 sums that partition's input itself, one element per loop pass
+ {
+ m.element = _67.inbuf[(look_back_ix * 8192u) + their_ix].element;
+ if (their_ix == 0u)
+ {
+ their_agg = m; // first element starts a fresh sum
+ }
+ else
+ {
+ Monoid param_8 = their_agg;
+ Monoid param_9 = m;
+ their_agg = combine_monoid(param_8, param_9);
+ }
+ their_ix++;
+ if (their_ix == 8192u) // all elements of that partition have been summed
+ {
+ Monoid param_10 = their_agg;
+ Monoid param_11 = exclusive;
+ exclusive = combine_monoid(param_10, param_11);
+ if (look_back_ix == 0u)
+ {
+ sh_flag = 2u; // nothing earlier to inspect: tell the group to leave the loop
+ }
+ else
+ {
+ look_back_ix--;
+ their_ix = 0u;
+ }
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ flag_1 = sh_flag; // re-read in case thread 511 set the termination value above
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (flag_1 == 2u)
+ {
+ break;
+ }
+ }
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ Monoid param_12 = exclusive;
+ Monoid param_13 = agg;
+ Monoid inclusive_prefix = combine_monoid(param_12, param_13); // sum through the end of this partition
+ sh_prefix = exclusive; // shared with the group after the barrier below
+ _43.state[part_ix].prefix.element = inclusive_prefix.element;
+ }
+ threadgroup_barrier(mem_flags::mem_device); // sits between the prefix store above and the flag store below
+ if (gl_LocalInvocationID.x == 511u)
+ {
+ atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].flag, 2u, memory_order_relaxed); // flag 2: prefix is now written
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // sh_prefix written by thread 511 becomes visible to the group
+ if (part_ix != 0u)
+ {
+ exclusive = sh_prefix;
+ }
+ Monoid row = exclusive; // offset for this thread: earlier partitions plus earlier threads of this group
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; // scanned total of threads 0..x-1
+ Monoid param_14 = row;
+ Monoid param_15 = other_1;
+ row = combine_monoid(param_14, param_15);
+ }
+ for (uint i_2 = 0u; i_2 < 16u; i_2++)
+ {
+ Monoid param_16 = row;
+ Monoid param_17 = local[i_2];
+ Monoid m_1 = combine_monoid(param_16, param_17); // offset plus this thread's running sum
+ _372.outbuf[ix + i_2].element = m_1.element;
+ }
+}
+
diff --git a/tests/shader/gen/prefix_atomic.spv b/tests/shader/gen/prefix_atomic.spv
new file mode 100644
index 0000000..d7dac5b
--- /dev/null
+++ b/tests/shader/gen/prefix_atomic.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_reduce.dxil b/tests/shader/gen/prefix_reduce.dxil
new file mode 100644
index 0000000..9b11457
--- /dev/null
+++ b/tests/shader/gen/prefix_reduce.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix_reduce.hlsl b/tests/shader/gen/prefix_reduce.hlsl
new file mode 100644
index 0000000..f2de539
--- /dev/null
+++ b/tests/shader/gen/prefix_reduce.hlsl
@@ -0,0 +1,72 @@
+struct Monoid // Value type folded by combine_monoid(); wraps a single uint.
+{
+ uint element;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); // Same extent as [numthreads(512, 1, 1)] on main().
+
+ByteAddressBuffer _40 : register(t0); // Read-only input; comp_main() loads one uint per element from it.
+RWByteAddressBuffer _127 : register(u1); // Output; comp_main() stores one uint per workgroup into it.
+
+static uint3 gl_WorkGroupID; // These three globals are filled from stage_input by main() before comp_main() runs.
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input // Entry-point inputs bound to HLSL system-value semantics.
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Monoid sh_scratch[512]; // One slot per thread of the 512-wide group.
+
+Monoid combine_monoid(Monoid a, Monoid b) // Returns a Monoid whose element is a.element + b.element.
+{
+ Monoid _22 = { a.element + b.element };
+ return _22;
+}
+
+void comp_main() // Sums 8 consecutive inputs per thread, reduces the 512 per-thread sums through sh_scratch, and has thread 0 store the group total.
+{
+ uint ix = gl_GlobalInvocationID.x * 8u; // Element index of this thread's first input (8 elements per thread).
+ Monoid _44;
+ _44.element = _40.Load(ix * 4 + 0); // Byte offset: 4 bytes per element.
+ Monoid agg;
+ agg.element = _44.element;
+ Monoid param_1;
+ for (uint i = 1u; i < 8u; i++) // Fold the remaining 7 elements into agg.
+ {
+ Monoid param = agg;
+ Monoid _64;
+ _64.element = _40.Load((ix + i) * 4 + 0);
+ param_1.element = _64.element;
+ agg = combine_monoid(param, param_1);
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg; // Publish this thread's partial sum.
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // 9 rounds with strides 1, 2, ..., 256.
+ {
+ GroupMemoryBarrierWithGroupSync(); // Every slot must be written before any thread reads a partner slot.
+ if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) // Skip when the partner index would fall outside sh_scratch.
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+ Monoid param_2 = agg;
+ Monoid param_3 = other;
+ agg = combine_monoid(param_2, param_3);
+ }
+ GroupMemoryBarrierWithGroupSync(); // Every read of this round must finish before slots are overwritten.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 0u) // After the last round thread 0's agg covers all 512 slots.
+ {
+ _127.Store(gl_WorkGroupID.x * 4 + 0, agg.element); // One uint per workgroup, indexed by workgroup id.
+ }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system values into the file-level globals, then runs comp_main().
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/tests/shader/gen/prefix_reduce.msl b/tests/shader/gen/prefix_reduce.msl
new file mode 100644
index 0000000..3a3125d
--- /dev/null
+++ b/tests/shader/gen/prefix_reduce.msl
@@ -0,0 +1,68 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Monoid // Type used for thread-local values and the threadgroup scratch array in main0.
+{
+ uint element;
+};
+
+struct Monoid_1 // Same layout as Monoid; used as the element type of the device buffers below.
+{
+ uint element;
+};
+
+struct InBuf // Layout of the read-only buffer bound at [[buffer(0)]].
+{
+ Monoid_1 inbuf[1]; // Declared with length 1 but indexed by element index in main0; presumably stands in for a runtime-sized array - confirm against the GLSL source.
+};
+
+struct OutBuf // Layout of the writable buffer bound at [[buffer(1)]].
+{
+ Monoid_1 outbuf[1]; // Indexed by workgroup id in main0; same length-1 caveat as InBuf.
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); // Not referenced by the code in this file.
+
+static inline __attribute__((always_inline))
+Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) // Returns a Monoid whose element is a.element + b.element.
+{
+ return Monoid{ a.element + b.element };
+}
+
+kernel void main0(const device InBuf& _40 [[buffer(0)]], device OutBuf& _127 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) // Sums 8 inputs per thread, reduces the 512 per-thread sums through sh_scratch, and has thread 0 write the threadgroup total.
+{
+ threadgroup Monoid sh_scratch[512]; // One slot per thread of the 512-wide threadgroup.
+ uint ix = gl_GlobalInvocationID.x * 8u; // Element index of this thread's first input (8 elements per thread).
+ Monoid agg;
+ agg.element = _40.inbuf[ix].element;
+ Monoid param_1;
+ for (uint i = 1u; i < 8u; i++) // Fold the remaining 7 elements into agg.
+ {
+ Monoid param = agg;
+ param_1.element = _40.inbuf[ix + i].element;
+ agg = combine_monoid(param, param_1);
+ }
+ sh_scratch[gl_LocalInvocationID.x] = agg; // Publish this thread's partial sum.
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // 9 rounds with strides 1, 2, ..., 256.
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Every slot must be written before any thread reads a partner slot.
+ if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) // Skip when the partner index would fall outside sh_scratch.
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+ Monoid param_2 = agg;
+ Monoid param_3 = other;
+ agg = combine_monoid(param_2, param_3);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Every read of this round must finish before slots are overwritten.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ if (gl_LocalInvocationID.x == 0u) // After the last round thread 0's agg covers all 512 slots.
+ {
+ _127.outbuf[gl_WorkGroupID.x].element = agg.element; // One value per threadgroup, indexed by threadgroup id.
+ }
+}
+
diff --git a/tests/shader/gen/prefix_reduce.spv b/tests/shader/gen/prefix_reduce.spv
new file mode 100644
index 0000000..b2e35fc
--- /dev/null
+++ b/tests/shader/gen/prefix_reduce.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_root.dxil b/tests/shader/gen/prefix_root.dxil
new file mode 100644
index 0000000..056b18c
--- /dev/null
+++ b/tests/shader/gen/prefix_root.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix_root.hlsl b/tests/shader/gen/prefix_root.hlsl
new file mode 100644
index 0000000..adf6bf8
--- /dev/null
+++ b/tests/shader/gen/prefix_root.hlsl
@@ -0,0 +1,80 @@
+struct Monoid // Value type folded by combine_monoid(); wraps a single uint.
+{
+ uint element;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); // Same extent as [numthreads(512, 1, 1)] on main().
+
+static const Monoid _131 = { 0u }; // Zero-valued Monoid; comp_main() uses it as the starting prefix.
+
+RWByteAddressBuffer _42 : register(u0); // Data buffer that comp_main() both loads from and stores to.
+
+static uint3 gl_LocalInvocationID; // These globals are filled from stage_input by main() before comp_main() runs.
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input // Entry-point inputs bound to HLSL system-value semantics.
+{
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Monoid sh_scratch[512]; // One slot per thread of the 512-wide group.
+
+Monoid combine_monoid(Monoid a, Monoid b) // Returns a Monoid whose element is a.element + b.element.
+{
+ Monoid _22 = { a.element + b.element };
+ return _22;
+}
+
+void comp_main() // Overwrites the buffer with its running (inclusive) sum: 8 elements per thread, combined across the group via sh_scratch.
+{
+ uint ix = gl_GlobalInvocationID.x * 8u; // Element index of this thread's first value (8 elements per thread).
+ Monoid _46;
+ _46.element = _42.Load(ix * 4 + 0); // Byte offset: 4 bytes per element.
+ Monoid local[8]; // local[i] holds the sum of this thread's elements 0..i.
+ local[0].element = _46.element;
+ Monoid param_1;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ Monoid param = local[i - 1u];
+ Monoid _71;
+ _71.element = _42.Load((ix + i) * 4 + 0);
+ param_1.element = _71.element;
+ local[i] = combine_monoid(param, param_1);
+ }
+ Monoid agg = local[7]; // Total of this thread's 8 elements.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // 9 rounds with strides 1, 2, ..., 256; each adds the slot that many threads below.
+ {
+ GroupMemoryBarrierWithGroupSync(); // Every slot must be written before any thread reads a partner slot.
+ if (gl_LocalInvocationID.x >= (1u << i_1)) // Threads with no partner at this stride keep their value.
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ Monoid param_2 = other;
+ Monoid param_3 = agg;
+ agg = combine_monoid(param_2, param_3); // Lower-indexed operand goes first.
+ }
+ GroupMemoryBarrierWithGroupSync(); // Every read of this round must finish before slots are overwritten.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ GroupMemoryBarrierWithGroupSync(); // Last round's writes must be visible before the neighbour read below.
+ Monoid row = _131; // Prefix contributed by lower-indexed threads; zero for thread 0.
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ row = sh_scratch[gl_LocalInvocationID.x - 1u]; // Inclusive total of all threads below this one in the group.
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++) // Add the prefix to each local partial sum and write it back in place.
+ {
+ Monoid param_4 = row;
+ Monoid param_5 = local[i_2];
+ Monoid m = combine_monoid(param_4, param_5);
+ _42.Store((ix + i_2) * 4 + 0, m.element);
+ }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system values into the file-level globals, then runs comp_main().
+{
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/tests/shader/gen/prefix_root.msl b/tests/shader/gen/prefix_root.msl
new file mode 100644
index 0000000..897a6a4
--- /dev/null
+++ b/tests/shader/gen/prefix_root.msl
@@ -0,0 +1,112 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray // Fixed-size array wrapper exposing operator[] for each address space; indexing is not bounds-checked.
+{
+ T elements[Num ? Num : 1]; // Num == 0 still yields a one-element array.
+
+ thread T& operator [] (size_t pos) thread // Mutable access, thread address space.
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread // Read-only access, thread address space.
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device // Mutable access, device address space.
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device // Read-only access, device address space.
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // Read-only access, constant address space (no mutable overload).
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup // Mutable access, threadgroup address space.
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup // Read-only access, threadgroup address space.
+ {
+ return elements[pos];
+ }
+};
+
+struct Monoid // Type used for thread-local values and the threadgroup scratch array in main0.
+{
+ uint element;
+};
+
+struct Monoid_1 // Same layout as Monoid; used as the element type of the device buffer below.
+{
+ uint element;
+};
+
+struct DataBuf // Layout of the read/write buffer bound at [[buffer(0)]].
+{
+ Monoid_1 data[1]; // Declared with length 1 but indexed by element index in main0; presumably stands in for a runtime-sized array - confirm against the GLSL source.
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); // Not referenced by the code in this file.
+
+static inline __attribute__((always_inline))
+Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) // Returns a Monoid whose element is a.element + b.element.
+{
+ return Monoid{ a.element + b.element };
+}
+
+kernel void main0(device DataBuf& _42 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) // Overwrites the buffer with its running (inclusive) sum: 8 elements per thread, combined across the threadgroup via sh_scratch.
+{
+ threadgroup Monoid sh_scratch[512]; // One slot per thread of the 512-wide threadgroup.
+ uint ix = gl_GlobalInvocationID.x * 8u; // Element index of this thread's first value (8 elements per thread).
+ spvUnsafeArray<Monoid, 8> local; // local[i] holds the sum of this thread's elements 0..i.
+ local[0].element = _42.data[ix].element;
+ Monoid param_1;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ Monoid param = local[i - 1u];
+ param_1.element = _42.data[ix + i].element;
+ local[i] = combine_monoid(param, param_1);
+ }
+ Monoid agg = local[7]; // Total of this thread's 8 elements.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // 9 rounds with strides 1, 2, ..., 256; each adds the slot that many threads below.
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Every slot must be written before any thread reads a partner slot.
+ if (gl_LocalInvocationID.x >= (1u << i_1)) // Threads with no partner at this stride keep their value.
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ Monoid param_2 = other;
+ Monoid param_3 = agg;
+ agg = combine_monoid(param_2, param_3); // Lower-indexed operand goes first.
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Every read of this round must finish before slots are overwritten.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Last round's writes must be visible before the neighbour read below.
+ Monoid row = Monoid{ 0u }; // Prefix contributed by lower-indexed threads; zero for thread 0.
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ row = sh_scratch[gl_LocalInvocationID.x - 1u]; // Inclusive total of all threads below this one in the threadgroup.
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++) // Add the prefix to each local partial sum and write it back in place.
+ {
+ Monoid param_4 = row;
+ Monoid param_5 = local[i_2];
+ Monoid m = combine_monoid(param_4, param_5);
+ _42.data[ix + i_2].element = m.element;
+ }
+}
+
diff --git a/tests/shader/gen/prefix_root.spv b/tests/shader/gen/prefix_root.spv
new file mode 100644
index 0000000..3e04224
--- /dev/null
+++ b/tests/shader/gen/prefix_root.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_scan.dxil b/tests/shader/gen/prefix_scan.dxil
new file mode 100644
index 0000000..8a808f1
--- /dev/null
+++ b/tests/shader/gen/prefix_scan.dxil
Binary files differ
diff --git a/tests/shader/gen/prefix_scan.hlsl b/tests/shader/gen/prefix_scan.hlsl
new file mode 100644
index 0000000..d9e74ea
--- /dev/null
+++ b/tests/shader/gen/prefix_scan.hlsl
@@ -0,0 +1,92 @@
+struct Monoid // Value type folded by combine_monoid(); wraps a single uint.
+{
+ uint element;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); // Same extent as [numthreads(512, 1, 1)] on main().
+
+static const Monoid _131 = { 0u }; // Zero-valued Monoid; comp_main() uses it as the starting prefix.
+
+RWByteAddressBuffer _42 : register(u0); // Data buffer that comp_main() both loads from and stores to.
+ByteAddressBuffer _141 : register(t1); // Read-only buffer; comp_main() loads the entry at (workgroup id - 1) from it.
+
+static uint3 gl_WorkGroupID; // These three globals are filled from stage_input by main() before comp_main() runs.
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input // Entry-point inputs bound to HLSL system-value semantics.
+{
+ uint3 gl_WorkGroupID : SV_GroupID;
+ uint3 gl_LocalInvocationID : SV_GroupThreadID;
+ uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Monoid sh_scratch[512]; // One slot per thread of the 512-wide group.
+
+Monoid combine_monoid(Monoid a, Monoid b) // Returns a Monoid whose element is a.element + b.element.
+{
+ Monoid _22 = { a.element + b.element };
+ return _22;
+}
+
+void comp_main() // Overwrites the data buffer with running sums: per-thread and per-group partial sums plus a per-workgroup offset read from _141.
+{
+ uint ix = gl_GlobalInvocationID.x * 8u; // Element index of this thread's first value (8 elements per thread).
+ Monoid _46;
+ _46.element = _42.Load(ix * 4 + 0); // Byte offset: 4 bytes per element.
+ Monoid local[8]; // local[i] holds the sum of this thread's elements 0..i.
+ local[0].element = _46.element;
+ Monoid param_1;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ Monoid param = local[i - 1u];
+ Monoid _71;
+ _71.element = _42.Load((ix + i) * 4 + 0);
+ param_1.element = _71.element;
+ local[i] = combine_monoid(param, param_1);
+ }
+ Monoid agg = local[7]; // Total of this thread's 8 elements.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // 9 rounds with strides 1, 2, ..., 256; each adds the slot that many threads below.
+ {
+ GroupMemoryBarrierWithGroupSync(); // Every slot must be written before any thread reads a partner slot.
+ if (gl_LocalInvocationID.x >= (1u << i_1)) // Threads with no partner at this stride keep their value.
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ Monoid param_2 = other;
+ Monoid param_3 = agg;
+ agg = combine_monoid(param_2, param_3); // Lower-indexed operand goes first.
+ }
+ GroupMemoryBarrierWithGroupSync(); // Every read of this round must finish before slots are overwritten.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ GroupMemoryBarrierWithGroupSync(); // Last round's writes must be visible before the neighbour read below.
+ Monoid row = _131; // Starts at zero; stays zero for thread 0 of workgroup 0.
+ if (gl_WorkGroupID.x > 0u)
+ {
+ Monoid _146;
+ _146.element = _141.Load((gl_WorkGroupID.x - 1u) * 4 + 0); // Presumably the running total of all preceding workgroups - confirm against the pass that fills this buffer.
+ row.element = _146.element;
+ }
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ Monoid param_4 = row;
+ Monoid param_5 = sh_scratch[gl_LocalInvocationID.x - 1u]; // Inclusive total of all threads below this one in the group.
+ row = combine_monoid(param_4, param_5);
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++) // Add the prefix to each local partial sum and write it back in place.
+ {
+ Monoid param_6 = row;
+ Monoid param_7 = local[i_2];
+ Monoid m = combine_monoid(param_6, param_7);
+ _42.Store((ix + i_2) * 4 + 0, m.element);
+ }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system values into the file-level globals, then runs comp_main().
+{
+ gl_WorkGroupID = stage_input.gl_WorkGroupID;
+ gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+ gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+ comp_main();
+}
diff --git a/tests/shader/gen/prefix_scan.msl b/tests/shader/gen/prefix_scan.msl
new file mode 100644
index 0000000..5be4e65
--- /dev/null
+++ b/tests/shader/gen/prefix_scan.msl
@@ -0,0 +1,123 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray // Fixed-size array wrapper exposing operator[] for each address space; indexing is not bounds-checked.
+{
+ T elements[Num ? Num : 1]; // Num == 0 still yields a one-element array.
+
+ thread T& operator [] (size_t pos) thread // Mutable access, thread address space.
+ {
+ return elements[pos];
+ }
+ constexpr const thread T& operator [] (size_t pos) const thread // Read-only access, thread address space.
+ {
+ return elements[pos];
+ }
+
+ device T& operator [] (size_t pos) device // Mutable access, device address space.
+ {
+ return elements[pos];
+ }
+ constexpr const device T& operator [] (size_t pos) const device // Read-only access, device address space.
+ {
+ return elements[pos];
+ }
+
+ constexpr const constant T& operator [] (size_t pos) const constant // Read-only access, constant address space (no mutable overload).
+ {
+ return elements[pos];
+ }
+
+ threadgroup T& operator [] (size_t pos) threadgroup // Mutable access, threadgroup address space.
+ {
+ return elements[pos];
+ }
+ constexpr const threadgroup T& operator [] (size_t pos) const threadgroup // Read-only access, threadgroup address space.
+ {
+ return elements[pos];
+ }
+};
+
+struct Monoid // Type used for thread-local values and the threadgroup scratch array in main0.
+{
+ uint element;
+};
+
+struct Monoid_1 // Same layout as Monoid; used as the element type of the device buffers below.
+{
+ uint element;
+};
+
+struct DataBuf // Layout of the read/write buffer bound at [[buffer(0)]].
+{
+ Monoid_1 data[1]; // Declared with length 1 but indexed by element index in main0; presumably stands in for a runtime-sized array - confirm against the GLSL source.
+};
+
+struct ParentBuf // Layout of the read-only buffer bound at [[buffer(1)]].
+{
+ Monoid_1 parent[1]; // Indexed by (threadgroup id - 1) in main0; same length-1 caveat as DataBuf.
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); // Not referenced by the code in this file.
+
+static inline __attribute__((always_inline))
+Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) // Returns a Monoid whose element is a.element + b.element.
+{
+ return Monoid{ a.element + b.element };
+}
+
+kernel void main0(device DataBuf& _42 [[buffer(0)]], const device ParentBuf& _141 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) // Overwrites the data buffer with running sums: per-thread and per-threadgroup partial sums plus a per-threadgroup offset read from _141.
+{
+ threadgroup Monoid sh_scratch[512]; // One slot per thread of the 512-wide threadgroup.
+ uint ix = gl_GlobalInvocationID.x * 8u; // Element index of this thread's first value (8 elements per thread).
+ spvUnsafeArray<Monoid, 8> local; // local[i] holds the sum of this thread's elements 0..i.
+ local[0].element = _42.data[ix].element;
+ Monoid param_1;
+ for (uint i = 1u; i < 8u; i++)
+ {
+ Monoid param = local[i - 1u];
+ param_1.element = _42.data[ix + i].element;
+ local[i] = combine_monoid(param, param_1);
+ }
+ Monoid agg = local[7]; // Total of this thread's 8 elements.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ for (uint i_1 = 0u; i_1 < 9u; i_1++) // 9 rounds with strides 1, 2, ..., 256; each adds the slot that many threads below.
+ {
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Every slot must be written before any thread reads a partner slot.
+ if (gl_LocalInvocationID.x >= (1u << i_1)) // Threads with no partner at this stride keep their value.
+ {
+ Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+ Monoid param_2 = other;
+ Monoid param_3 = agg;
+ agg = combine_monoid(param_2, param_3); // Lower-indexed operand goes first.
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Every read of this round must finish before slots are overwritten.
+ sh_scratch[gl_LocalInvocationID.x] = agg;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup); // Last round's writes must be visible before the neighbour read below.
+ Monoid row = Monoid{ 0u }; // Starts at zero; stays zero for thread 0 of threadgroup 0.
+ if (gl_WorkGroupID.x > 0u)
+ {
+ row.element = _141.parent[gl_WorkGroupID.x - 1u].element; // Presumably the running total of all preceding threadgroups - confirm against the pass that fills this buffer.
+ }
+ if (gl_LocalInvocationID.x > 0u)
+ {
+ Monoid param_4 = row;
+ Monoid param_5 = sh_scratch[gl_LocalInvocationID.x - 1u]; // Inclusive total of all threads below this one in the threadgroup.
+ row = combine_monoid(param_4, param_5);
+ }
+ for (uint i_2 = 0u; i_2 < 8u; i_2++) // Add the prefix to each local partial sum and write it back in place.
+ {
+ Monoid param_6 = row;
+ Monoid param_7 = local[i_2];
+ Monoid m = combine_monoid(param_6, param_7);
+ _42.data[ix + i_2].element = m.element;
+ }
+}
+
diff --git a/tests/shader/gen/prefix_scan.spv b/tests/shader/gen/prefix_scan.spv
new file mode 100644
index 0000000..6d8fe0a
--- /dev/null
+++ b/tests/shader/gen/prefix_scan.spv
Binary files differ
diff --git a/tests/shader/gen/prefix_vkmm.spv b/tests/shader/gen/prefix_vkmm.spv
new file mode 100644
index 0000000..cef3965
--- /dev/null
+++ b/tests/shader/gen/prefix_vkmm.spv
Binary files differ