| // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense |
| |
| // The second dispatch of clip stack processing. |
| |
| #version 450 |
| #extension GL_GOOGLE_include_directive : enable |
| |
| #include "mem.h" |
| #include "setup.h" |
| |
| #define LG_WG_SIZE (7 + LG_WG_FACTOR) |
| #define WG_SIZE (1 << LG_WG_SIZE) |
| #define PARTITION_SIZE WG_SIZE |
| |
| layout(local_size_x = WG_SIZE) in; |
| |
| layout(binding = 1) readonly buffer ConfigBuf { |
| Config conf; |
| }; |
| |
| // Some of this is cut'n'paste duplication with the reduce pass, and |
| // arguably should be moved to a common .h file. |
| // The bicyclic monoid |
| |
| struct ClipEl { |
| // index of parent node |
| uint parent_ix; |
| // bounding box |
| vec4 bbox; |
| }; |
| |
| struct Bic { |
| uint a; |
| uint b; |
| }; |
| |
| Bic bic_combine(Bic x, Bic y) { |
| uint m = min(x.b, y.a); |
| return Bic(x.a + y.a - m, x.b + y.b - m); |
| } |
| |
| // Load path's bbox from bbox (as written by pathseg). |
| vec4 load_path_bbox(uint path_ix) { |
| uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix; |
| float bbox_l = float(memory[base]) - 32768.0; |
| float bbox_t = float(memory[base + 1]) - 32768.0; |
| float bbox_r = float(memory[base + 2]) - 32768.0; |
| float bbox_b = float(memory[base + 3]) - 32768.0; |
| vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b); |
| return bbox; |
| } |
| |
| vec4 bbox_intersect(vec4 a, vec4 b) { |
| return vec4(max(a.xy, b.xy), min(a.zw, b.zw)); |
| } |
| |
| shared Bic sh_bic[WG_SIZE * 2 - 2]; |
| shared uint sh_stack[PARTITION_SIZE]; |
| shared vec4 sh_stack_bbox[PARTITION_SIZE]; |
| shared uint sh_link[PARTITION_SIZE]; |
| shared vec4 sh_bbox[PARTITION_SIZE]; |
| |
| // This is adapted directly from the stack monoid impl. |
| // Return value is reference within partition if >= 0, |
| // otherwise reference to stack. |
| uint search_link(inout Bic bic) { |
| uint ix = gl_LocalInvocationID.x; |
| uint j = 0; |
| while (j < LG_WG_SIZE) { |
| uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j)); |
| if (((ix >> j) & 1) != 0) { |
| Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic); |
| if (test.b > 0) { |
| break; |
| } |
| bic = test; |
| ix -= 1u << j; |
| } |
| j++; |
| } |
| if (ix > 0) { |
| while (j > 0) { |
| j--; |
| uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j)); |
| Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic); |
| if (test.b == 0) { |
| bic = test; |
| ix -= 1u << j; |
| } |
| } |
| } |
| // ix is the smallest value such that reduce(ix..th).b == 0 |
| if (ix > 0) { |
| return ix - 1; |
| } else { |
| return ~0u - bic.a; |
| } |
| } |
| |
| Bic load_bic(uint ix) { |
| uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix; |
| return Bic(memory[base], memory[base + 1]); |
| } |
| |
| ClipEl load_clip_el(uint ix) { |
| uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix; |
| uint parent_ix = memory[base]; |
| float x0 = uintBitsToFloat(memory[base + 1]); |
| float y0 = uintBitsToFloat(memory[base + 2]); |
| float x1 = uintBitsToFloat(memory[base + 3]); |
| float y1 = uintBitsToFloat(memory[base + 4]); |
| vec4 bbox = vec4(x0, y0, x1, y1); |
| return ClipEl(parent_ix, bbox); |
| } |
| |
| uint load_path_ix(uint ix) { |
| // This is one approach to a partial final block. Another would be |
| // to do a memset to the padding in the command queue. |
| if (ix < conf.n_clip) { |
| return memory[(conf.clip_alloc.offset >> 2) + ix]; |
| } else { |
| // EndClip tags don't implicate further loads. |
| return 0x80000000; |
| } |
| } |
| |
| void store_clip_bbox(uint ix, vec4 bbox) { |
| uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix; |
| memory[base] = floatBitsToUint(bbox.x); |
| memory[base + 1] = floatBitsToUint(bbox.y); |
| memory[base + 2] = floatBitsToUint(bbox.z); |
| memory[base + 3] = floatBitsToUint(bbox.w); |
| } |
| |
| void main() { |
| // materialize stack up to the start of this partition. This |
| // is based on the pure stack monoid, but with two additions. |
| |
| // First, (this only matters if the stack goes deeper than the |
| // partition size, which might be unlikely in practice), the |
| // topmost stack element from each partition is picked, then an |
| // exclusive scan of those. Also note that if this is skipped, |
| // a scan is not needed in the reduce stage. |
| |
| // Second, after the stream compaction, do a scan of the retrieved |
| // bbox values. |
| uint th = gl_LocalInvocationID.x; |
| Bic bic = Bic(0, 0); |
| if (th < gl_WorkGroupID.x) { |
| bic = load_bic(th); |
| } |
| sh_bic[th] = bic; |
| for (uint i = 0; i < LG_WG_SIZE; i++) { |
| barrier(); |
| if (th + (1u << i) < WG_SIZE) { |
| Bic other = sh_bic[th + (1u << i)]; |
| bic = bic_combine(bic, other); |
| } |
| barrier(); |
| sh_bic[th] = bic; |
| } |
| barrier(); |
| uint stack_size = sh_bic[0].b; |
| |
| // TODO: do bbox scan here (to unlock greater stack depth) |
| |
| // binary search in stack |
| uint sp = PARTITION_SIZE - 1 - th; |
| uint ix = 0; |
| for (uint i = 0; i < LG_WG_SIZE; i++) { |
| uint probe = ix + (uint(PARTITION_SIZE / 2) >> i); |
| if (sp < sh_bic[probe].b) { |
| ix = probe; |
| } |
| } |
| // ix is largest value such that sp < sh_bic[ix].b (if any) |
| uint b = sh_bic[ix].b; |
| vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9); |
| if (sp < b) { |
| // maybe store the index here for future use? |
| ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1); |
| sh_stack[th] = el.parent_ix; |
| bbox = el.bbox; |
| // other element values here? |
| } |
| |
| // forward scan of bbox values of prefix stack |
| for (uint i = 0; i < LG_WG_SIZE; i++) { |
| sh_stack_bbox[th] = bbox; |
| barrier(); |
| if (th >= (1u << i)) { |
| bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox); |
| } |
| barrier(); |
| } |
| sh_stack_bbox[th] = bbox; |
| |
| // Read input and compute bicyclic semigroup binary tree |
| uint inp = load_path_ix(gl_GlobalInvocationID.x); |
| bool is_push = int(inp) >= 0; |
| bic = Bic(1 - uint(is_push), uint(is_push)); |
| sh_bic[th] = bic; |
| if (is_push) { |
| bbox = load_path_bbox(inp); |
| } else { |
| bbox = vec4(-1e9, -1e9, 1e9, 1e9); |
| } |
| uint inbase = 0; |
| for (uint i = 0; i < LG_WG_SIZE - 1; i++) { |
| uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i)); |
| barrier(); |
| if (th < (1u << (LG_WG_SIZE - 1 - i))) { |
| sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]); |
| } |
| inbase = outbase; |
| } |
| barrier(); |
| // Search for predecessor node |
| bic = Bic(0, 0); |
| uint link = search_link(bic); |
| // we use N_SEQ > 1 convention here: |
| // link >= 0 is index within partition |
| // link < 0 is reference to stack |
| |
| // We want grandparent bbox for pop nodes, so follow those links. |
| sh_link[th] = link; |
| barrier(); |
| uint grandparent; |
| if (int(link) >= 0) { |
| grandparent = sh_link[link]; |
| } else { |
| grandparent = link - 1; |
| } |
| |
| // Resolve parent |
| uint parent; |
| if (int(link) >= 0) { |
| parent = gl_WorkGroupID.x * PARTITION_SIZE + link; |
| } else if (int(link + stack_size) >= 0) { |
| parent = sh_stack[PARTITION_SIZE + link]; |
| } else { |
| parent = ~0u; |
| } |
| |
| // bbox scan along parent links |
| for (uint i = 0; i < LG_WG_SIZE; i++) { |
| // sh_link was already stored for first iteration |
| if (i != 0) { |
| sh_link[th] = link; |
| } |
| sh_bbox[th] = bbox; |
| barrier(); |
| if (int(link) >= 0) { |
| bbox = bbox_intersect(sh_bbox[link], bbox); |
| link = sh_link[link]; |
| } |
| barrier(); |
| } |
| if (int(link + stack_size) >= 0) { |
| bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox); |
| } |
| // At this point, bbox is the reduction of bounding boxes along the tree. |
| sh_bbox[th] = bbox; |
| barrier(); |
| |
| uint path_ix = inp; |
| if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) { |
| // Is this load expensive? If so, it's loaded earlier for in-partition |
| // and is in the ClipEl for cross-partition. |
| // If not, can probably get rid of it in the stack intermediate buf. |
| path_ix = load_path_ix(parent); |
| uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp; |
| // Fix up drawmonoid so path_ix at EndClip matches BeginClip |
| memory[drawmonoid_out_base] = path_ix; |
| |
| if (int(grandparent) >= 0) { |
| bbox = sh_bbox[grandparent]; |
| } else if (int(grandparent + stack_size) >= 0) { |
| bbox = sh_stack_bbox[PARTITION_SIZE + grandparent]; |
| } else { |
| bbox = vec4(-1e9, -1e9, 1e9, 1e9); |
| } |
| } |
| store_clip_bbox(gl_GlobalInvocationID.x, bbox); |
| } |