blob: d42b1eb7db600f9671509edee6342df995a531a8 [file] [log] [blame]
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct Bic
{
uint a;
uint b;
};
struct Bic_1
{
uint a;
uint b;
};
struct BicBbox
{
Bic_1 bic;
uint pad2;
uint pad3;
float4 bbox;
};
struct BicBuf
{
BicBbox bicbuf[1];
};
struct StackBuf
{
float4 stack[1];
};
struct Node
{
uint node_type;
uint pad1;
uint pad2;
uint pad3;
float4 bbox;
};
struct Node_1
{
uint node_type;
uint pad1;
uint pad2;
uint pad3;
float4 bbox;
};
struct InBuf
{
Node_1 inbuf[1];
};
struct OutBuf
{
float4 outbuf[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
static inline __attribute__((always_inline))
Bic bic_combine(thread const Bic& x, thread const Bic& y)
{
uint m = min(x.b, y.a);
return Bic{ (x.a + y.a) - m, (x.b + y.b) - m };
}
static inline __attribute__((always_inline))
float4 bbox_union(thread const float4& a, thread const float4& b)
{
return float4(fast::min(a.xy, b.xy), fast::max(a.zw, b.zw));
}
kernel void main0(const device InBuf& _249 [[buffer(0)]], const device BicBuf& _94 [[buffer(1)]], const device StackBuf& _213 [[buffer(2)]], device OutBuf& _492 [[buffer(3)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
threadgroup Bic sh_bic[1022];
threadgroup float4 sh_bbox[1022];
threadgroup float4 sh_stack[512];
uint th = gl_LocalInvocationID.x;
Bic bic = Bic{ 0u, 0u };
float4 bbox = float4(1000000000.0, 1000000000.0, -1000000000.0, -1000000000.0);
if (th < gl_WorkGroupID.x)
{
bic.a = _94.bicbuf[th].bic.a;
bic.b = _94.bicbuf[th].bic.b;
bbox = _94.bicbuf[th].bbox;
}
sh_bic[th] = bic;
sh_bbox[th] = bbox;
for (uint i = 0u; i < 9u; i++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
uint other_ix = th + (1u << i);
if (other_ix < 512u)
{
Bic param = bic;
Bic param_1 = sh_bic[other_ix];
bic = bic_combine(param, param_1);
float4 param_2 = bbox;
float4 param_3 = sh_bbox[other_ix];
bbox = bbox_union(param_2, param_3);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_bic[th] = bic;
sh_bbox[th] = bbox;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint size = sh_bic[0].b;
uint sp = 511u - th;
uint ix = 0u;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
uint probe = ix + (256u >> i_1);
if (sp < sh_bic[probe].b)
{
ix = probe;
}
}
uint b = sh_bic[ix].b;
if (sp < b)
{
float4 bbox_1 = _213.stack[(((ix * 512u) + b) - sp) - 1u];
if ((ix + 1u) < 512u)
{
float4 param_4 = bbox_1;
float4 param_5 = sh_bbox[ix + 1u];
bbox_1 = bbox_union(param_4, param_5);
}
sh_stack[th] = bbox_1;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
Node inp;
inp.node_type = _249.inbuf[gl_GlobalInvocationID.x].node_type;
inp.pad1 = _249.inbuf[gl_GlobalInvocationID.x].pad1;
inp.pad2 = _249.inbuf[gl_GlobalInvocationID.x].pad2;
inp.pad3 = _249.inbuf[gl_GlobalInvocationID.x].pad3;
inp.bbox = _249.inbuf[gl_GlobalInvocationID.x].bbox;
uint node_type = inp.node_type;
bic = Bic{ uint(node_type == 1u), uint(node_type == 0u) };
sh_bic[th] = bic;
sh_bbox[th] = inp.bbox;
uint inbase = 0u;
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
uint outbase = 1024u - (1u << (9u - i_2));
threadgroup_barrier(mem_flags::mem_threadgroup);
if (th < (1u << (8u - i_2)))
{
Bic param_6 = sh_bic[inbase + (th * 2u)];
Bic param_7 = sh_bic[(inbase + (th * 2u)) + 1u];
sh_bic[outbase + th] = bic_combine(param_6, param_7);
float4 param_8 = sh_bbox[inbase + (th * 2u)];
float4 param_9 = sh_bbox[(inbase + (th * 2u)) + 1u];
sh_bbox[outbase + th] = bbox_union(param_8, param_9);
}
inbase = outbase;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
ix = th;
bbox = inp.bbox;
bic = Bic{ 0u, 0u };
if (node_type == 1u)
{
uint j = 0u;
while (j < 9u)
{
uint base = 1024u - (2u << (9u - j));
if (((ix >> j) & 1u) != 0u)
{
Bic param_10 = sh_bic[(base + (ix >> j)) - 1u];
Bic param_11 = bic;
Bic test = bic_combine(param_10, param_11);
if (test.b > 0u)
{
break;
}
bic = test;
float4 param_12 = sh_bbox[(base + (ix >> j)) - 1u];
float4 param_13 = bbox;
bbox = bbox_union(param_12, param_13);
ix -= (1u << j);
}
j++;
}
if (ix > 0u)
{
while (j > 0u)
{
j--;
uint base_1 = 1024u - (2u << (9u - j));
Bic param_14 = sh_bic[(base_1 + (ix >> j)) - 1u];
Bic param_15 = bic;
Bic test_1 = bic_combine(param_14, param_15);
if (test_1.b == 0u)
{
bic = test_1;
float4 param_16 = sh_bbox[(base_1 + (ix >> j)) - 1u];
float4 param_17 = bbox;
bbox = bbox_union(param_16, param_17);
ix -= (1u << j);
}
}
}
bool _470 = ix == 0u;
bool _477;
if (_470)
{
_477 = bic.a < size;
}
else
{
_477 = _470;
}
if (_477)
{
float4 param_18 = sh_stack[511u - bic.a];
float4 param_19 = bbox;
bbox = bbox_union(param_18, param_19);
}
}
_492.outbuf[gl_GlobalInvocationID.x] = bbox;
}