// piet-gpu/shader/clip_leaf.comp - external/github.com/linebender/vello - Git at Google

 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

 // The second dispatch of clip stack processing.

 #version 450
 #extension GL_GOOGLE_include_directive : enable

 #include "mem.h"
 #include "setup.h"

 // Base-2 logarithm of the workgroup size. LG_WG_FACTOR is not defined in
 // this file (presumably it comes from one of the includes - confirm).
 #define LG_WG_SIZE (7 + LG_WG_FACTOR)
 // Number of invocations per workgroup (2^LG_WG_SIZE).
 #define WG_SIZE (1 << LG_WG_SIZE)
 // Number of clip elements per partition; one element per invocation.
 #define PARTITION_SIZE WG_SIZE

 // One-dimensional workgroup of WG_SIZE invocations.
 layout(local_size_x = WG_SIZE) in;

 // Read-only configuration buffer at binding 1. This shader reads the
 // fields path_bbox_alloc, clip_bic_alloc, clip_stack_alloc, clip_alloc,
 // clip_bbox_alloc, drawmonoid_alloc and n_clip from it. The Config type is
 // declared outside this file (presumably in setup.h - confirm).
 layout(binding = 1) readonly buffer ConfigBuf {
     Config conf;
 };

 // Some of this is cut'n'paste duplication with the reduce pass, and
 // arguably should be moved to a common .h file.
 // The bicyclic monoid

 // One entry of the clip stack buffer, as decoded by load_clip_el(): five
 // consecutive words holding the parent index followed by four floats.
 struct ClipEl {
     // index of parent node
     uint parent_ix;
     // bounding box, stored as (x0, y0, x1, y1)
     vec4 bbox;
 };

 // Element of the bicyclic monoid. main() encodes a push as Bic(0, 1) and
 // anything else as Bic(1, 0), so after reduction a is the number of
 // unmatched pops and b the number of unmatched pushes.
 struct Bic {
     // unmatched pops
     uint a;
     // unmatched pushes
     uint b;
 };

 // Combine two bicyclic monoid elements, with x ordered before y. Pushes
 // left open by x (x.b) are cancelled against pops requested by y (y.a);
 // whatever remains on either side is carried into the result.
 Bic bic_combine(Bic x, Bic y) {
     uint cancelled = min(x.b, y.a);
     uint open_pops = x.a + y.a - cancelled;
     uint open_pushes = x.b + y.b - cancelled;
     return Bic(open_pops, open_pushes);
 }

 // Load the bounding box of path path_ix from the path bbox buffer (as
 // written by pathseg). Each path occupies 6 words there; the first four are
 // read as left, top, right, bottom. The stored unsigned values carry an
 // offset of 32768, which is removed after conversion to float.
 vec4 load_path_bbox(uint path_ix) {
     // offset >> 2 turns what is presumably a byte offset into a word index.
     uint word = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
     vec4 biased = vec4(
         float(memory[word]),
         float(memory[word + 1]),
         float(memory[word + 2]),
         float(memory[word + 3]));
     return biased - 32768.0;
 }

 // Intersect two boxes stored as (x0, y0, x1, y1): take the larger of the
 // two minimum corners and the smaller of the two maximum corners. The
 // result is not checked for emptiness, so x0 > x1 or y0 > y1 is possible.
 vec4 bbox_intersect(vec4 a, vec4 b) {
     vec2 lo = max(a.xy, b.xy);
     vec2 hi = min(a.zw, b.zw);
     return vec4(lo, hi);
 }

 // Workgroup-shared scratch buffers.
 // sh_bic is used twice in main(): first for the scan over the Bic values of
 // the preceding partitions (indices 0..WG_SIZE-1), then for the binary tree
 // of reductions over this partition's elements. The tree has WG_SIZE leaves
 // followed by levels of WG_SIZE/2, ..., 2 nodes: 2 * WG_SIZE - 2 in total.
 shared Bic sh_bic[WG_SIZE * 2 - 2];
 // parent_ix of each materialized prefix-stack entry (from load_clip_el).
 shared uint sh_stack[PARTITION_SIZE];
 // Running intersection of the prefix-stack bounding boxes.
 shared vec4 sh_stack_bbox[PARTITION_SIZE];
 // Predecessor link of each element, as returned by search_link().
 shared uint sh_link[PARTITION_SIZE];
 // Per-element bounding box, reduced along the links during main().
 shared vec4 sh_bbox[PARTITION_SIZE];

 // Adapted directly from the stack monoid implementation.
 //
 // Searches the binary tree of Bic reductions in sh_bic for the predecessor
 // of the calling invocation. Level j of the tree starts at index
 // 2 * WG_SIZE - (2u << (LG_WG_SIZE - j)) and each of its nodes covers 2^j
 // consecutive elements. On return, bic has been combined with the reduction
 // of every subtree that was skipped over.
 //
 // The result is a reference within the partition when it is >= 0 as a
 // signed int; otherwise it is a reference into the prefix stack, encoded as
 // ~0u - bic.a.
 uint search_link(inout Bic bic) {
     uint pos = gl_LocalInvocationID.x;
     uint level = 0;
     // Climb the tree, absorbing left neighbours for as long as the combined
     // range still contains no unmatched push.
     for (; level < LG_WG_SIZE; level++) {
         if (((pos >> level) & 1) == 0) {
             continue;
         }
         uint level_base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - level));
         Bic candidate = bic_combine(sh_bic[level_base + (pos >> level) - 1], bic);
         if (candidate.b > 0) {
             // The subtree to the left holds the push we are looking for.
             break;
         }
         bic = candidate;
         pos -= 1u << level;
     }
     // Descend into that subtree, narrowing down to the exact position.
     if (pos > 0) {
         while (level > 0) {
             level--;
             uint level_base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - level));
             Bic candidate = bic_combine(sh_bic[level_base + (pos >> level) - 1], bic);
             if (candidate.b == 0) {
                 bic = candidate;
                 pos -= 1u << level;
             }
         }
     }
     // pos is the smallest value such that reduce(pos..th).b == 0
     return (pos > 0) ? (pos - 1) : (~0u - bic.a);
 }

 // Read the Bic stored for partition ix from the clip bic buffer; each
 // entry is two consecutive words (a, then b).
 Bic load_bic(uint ix) {
     uint word = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
     Bic result;
     result.a = memory[word];
     result.b = memory[word + 1];
     return result;
 }

 // Read entry ix of the clip stack buffer. Each entry is five words: the
 // parent index, then the bounding box as four floats stored bit-for-bit.
 ClipEl load_clip_el(uint ix) {
     uint word = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
     ClipEl el;
     el.parent_ix = memory[word];
     uvec4 bits = uvec4(
         memory[word + 1],
         memory[word + 2],
         memory[word + 3],
         memory[word + 4]);
     el.bbox = uintBitsToFloat(bits);
     return el;
 }

 // Read the clip input word for element ix. Indices at or beyond
 // conf.n_clip (the padding of a partial final block) yield 0x80000000,
 // which main() treats as a non-push element.
 uint load_path_ix(uint ix) {
     // This is one approach to a partial final block. Another would be
     // to do a memset to the padding in the command queue.
     if (ix >= conf.n_clip) {
         // EndClip tags don't implicate further loads.
         return 0x80000000;
     }
     return memory[(conf.clip_alloc.offset >> 2) + ix];
 }

 // Write bbox as entry ix of the clip bbox buffer: four consecutive words
 // holding the raw bit patterns of x, y, z and w.
 void store_clip_bbox(uint ix, vec4 bbox) {
     uint word = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
     uvec4 bits = floatBitsToUint(bbox);
     memory[word] = bits.x;
     memory[word + 1] = bits.y;
     memory[word + 2] = bits.z;
     memory[word + 3] = bits.w;
 }

 // Entry point. Each invocation handles the clip element selected by
 // gl_GlobalInvocationID.x; each workgroup handles one partition of
 // PARTITION_SIZE elements. For every element a bounding box is written with
 // store_clip_bbox(). For non-push elements inside conf.n_clip, the
 // drawmonoid entry is also patched with the value loaded for the matching
 // parent element.
 void main() {
     // materialize stack up to the start of this partition. This
     // is based on the pure stack monoid, but with two additions.

     // First, (this only matters if the stack goes deeper than the
     // partition size, which might be unlikely in practice), the
     // topmost stack element from each partition is picked, then an
     // exclusive scan of those. Also note that if this is skipped,
     // a scan is not needed in the reduce stage.

     // Second, after the stream compaction, do a scan of the retrieved
     // bbox values.
     uint th = gl_LocalInvocationID.x;
     // Thread th loads the Bic of partition th, but only for partitions that
     // precede this workgroup's own; the rest contribute the identity.
     Bic bic = Bic(0, 0);
     if (th < gl_WorkGroupID.x) {
         bic = load_bic(th);
     }
     sh_bic[th] = bic;
     // Log-step scan towards lower indices: on step i every thread combines
     // with the value 2^i slots to its right, so afterwards sh_bic[th] is the
     // reduction of partitions th..WG_SIZE-1. The first barrier makes the
     // previous writes visible; the second keeps any thread from overwriting
     // a slot that another thread is still reading.
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         barrier();
         if (th + (1u << i) < WG_SIZE) {
             Bic other = sh_bic[th + (1u << i)];
             bic = bic_combine(bic, other);
         }
         barrier();
         sh_bic[th] = bic;
     }
     barrier();
     // Number of pushes still open at the start of this partition.
     uint stack_size = sh_bic[0].b;

     // TODO: do bbox scan here (to unlock greater stack depth)

     // binary search in stack
     // Thread th is responsible for stack slot sp; the last thread of the
     // workgroup gets sp == 0.
     uint sp = PARTITION_SIZE - 1 - th;
     uint ix = 0;
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
         if (sp < sh_bic[probe].b) {
             ix = probe;
         }
     }
     // ix is largest value such that sp < sh_bic[ix].b (if any)
     uint b = sh_bic[ix].b;
     // Default box acts as the identity for bbox_intersect as long as real
     // coordinates stay within +/-1e9.
     vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);
     if (sp < b) {
         // maybe store the index here for future use?
         ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
         sh_stack[th] = el.parent_ix;
         bbox = el.bbox;
         // other element values here?
     }
     // NOTE(review): sh_stack[th] stays unwritten when sp >= b; the later
     // reads of sh_stack are guarded by the stack_size checks below.

     // forward scan of bbox values of prefix stack
     // After the loop, bbox is the intersection of the boxes held by threads
     // 0..th.
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         sh_stack_bbox[th] = bbox;
         barrier();
         if (th >= (1u << i)) {
             bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
         }
         barrier();
     }
     sh_stack_bbox[th] = bbox;

     // Read input and compute bicyclic semigroup binary tree
     uint inp = load_path_ix(gl_GlobalInvocationID.x);
     // A clear high bit marks a push, in which case inp is used as a path
     // index. Otherwise ~inp is used further down as a drawmonoid index.
     bool is_push = int(inp) >= 0;
     // Push -> Bic(0, 1); anything else -> Bic(1, 0).
     bic = Bic(1 - uint(is_push), uint(is_push));
     sh_bic[th] = bic;
     if (is_push) {
         bbox = load_path_bbox(inp);
     } else {
         bbox = vec4(-1e9, -1e9, 1e9, 1e9);
     }
     // Build the reduction tree level by level: level i reads pairs starting
     // at inbase and writes their combination at outbase. The barrier at the
     // top of each iteration makes the previous level visible.
     uint inbase = 0;
     for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
         uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
         barrier();
         if (th < (1u << (LG_WG_SIZE - 1 - i))) {
             sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
         }
         inbase = outbase;
     }
     barrier();
     // Search for predecessor node
     bic = Bic(0, 0);
     uint link = search_link(bic);
     // we use N_SEQ > 1 convention here:
     // link >= 0 is index within partition
     // link < 0 is reference to stack

     // We want grandparent bbox for pop nodes, so follow those links.
     sh_link[th] = link;
     barrier();
     uint grandparent;
     if (int(link) >= 0) {
         grandparent = sh_link[link];
     } else {
         // One entry further down the prefix stack.
         grandparent = link - 1;
     }

     // Resolve parent
     // In-partition links become global element indices; stack references
     // that fall inside the materialized stack are looked up in sh_stack;
     // anything deeper yields ~0u.
     uint parent;
     if (int(link) >= 0) {
         parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
     } else if (int(link + stack_size) >= 0) {
         parent = sh_stack[PARTITION_SIZE + link];
     } else {
         parent = ~0u;
     }

     // bbox scan along parent links
     // Each step intersects with the linked element's box and then adopts
     // that element's own link, so the chain covered doubles per iteration.
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         // sh_link was already stored for first iteration
         if (i != 0) {
             sh_link[th] = link;
         }
         sh_bbox[th] = bbox;
         barrier();
         if (int(link) >= 0) {
             bbox = bbox_intersect(sh_bbox[link], bbox);
             link = sh_link[link];
         }
         barrier();
     }
     // The chain has now left the partition; fold in the prefix-stack box if
     // the reference lies within the materialized stack.
     if (int(link + stack_size) >= 0) {
         bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
     }
     // At this point, bbox is the reduction of bounding boxes along the tree.
     sh_bbox[th] = bbox;
     barrier();

     uint path_ix = inp;
     if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
         // Is this load expensive? If so, it's loaded earlier for in-partition
         // and is in the ClipEl for cross-partition.
         // If not, can probably get rid of it in the stack intermediate buf.
         path_ix = load_path_ix(parent);
         uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
         // Fix up drawmonoid so path_ix at EndClip matches BeginClip
         memory[drawmonoid_out_base] = path_ix;

         // A non-push element takes the box of its grandparent: from this
         // partition, from the prefix stack, or the default box if neither.
         if (int(grandparent) >= 0) {
             bbox = sh_bbox[grandparent];
         } else if (int(grandparent + stack_size) >= 0) {
             bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
         } else {
             bbox = vec4(-1e9, -1e9, 1e9, 1e9);
         }
     }
     store_clip_bbox(gl_GlobalInvocationID.x, bbox);
 }
	// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

	// The second dispatch of clip stack processing.

	#version 450
	#extension GL_GOOGLE_include_directive : enable

	#include "mem.h"
	#include "setup.h"

	// Base-2 logarithm of the workgroup size. LG_WG_FACTOR is not defined in
	// this file (presumably it comes from one of the includes - confirm).
	#define LG_WG_SIZE (7 + LG_WG_FACTOR)
	// Number of invocations per workgroup (2^LG_WG_SIZE).
	#define WG_SIZE (1 << LG_WG_SIZE)
	// Number of clip elements per partition; one element per invocation.
	#define PARTITION_SIZE WG_SIZE

	// One-dimensional workgroup of WG_SIZE invocations.
	layout(local_size_x = WG_SIZE) in;

	// Read-only configuration buffer at binding 1. This shader reads the
	// fields path_bbox_alloc, clip_bic_alloc, clip_stack_alloc, clip_alloc,
	// clip_bbox_alloc, drawmonoid_alloc and n_clip from it. The Config type is
	// declared outside this file (presumably in setup.h - confirm).
	layout(binding = 1) readonly buffer ConfigBuf {
	Config conf;
	};

	// Some of this is cut'n'paste duplication with the reduce pass, and
	// arguably should be moved to a common .h file.
	// The bicyclic monoid

	// One entry of the clip stack buffer, as decoded by load_clip_el(): five
	// consecutive words holding the parent index followed by four floats.
	struct ClipEl {
	// index of parent node
	uint parent_ix;
	// bounding box, stored as (x0, y0, x1, y1)
	vec4 bbox;
	};

	// Element of the bicyclic monoid. main() encodes a push as Bic(0, 1) and
	// anything else as Bic(1, 0), so after reduction a is the number of
	// unmatched pops and b the number of unmatched pushes.
	struct Bic {
	// unmatched pops
	uint a;
	// unmatched pushes
	uint b;
	};

	// Combine two bicyclic monoid elements, with x ordered before y. Pushes
	// left open by x (x.b) are cancelled against pops requested by y (y.a);
	// whatever remains on either side is carried into the result.
	Bic bic_combine(Bic x, Bic y) {
		uint cancelled = min(x.b, y.a);
		uint open_pops = x.a + y.a - cancelled;
		uint open_pushes = x.b + y.b - cancelled;
		return Bic(open_pops, open_pushes);
	}

	// Load the bounding box of path path_ix from the path bbox buffer (as
	// written by pathseg). Each path occupies 6 words there; the first four are
	// read as left, top, right, bottom. The stored unsigned values carry an
	// offset of 32768, which is removed after conversion to float.
	vec4 load_path_bbox(uint path_ix) {
		// offset >> 2 turns what is presumably a byte offset into a word index.
		uint word = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
		vec4 biased = vec4(
			float(memory[word]),
			float(memory[word + 1]),
			float(memory[word + 2]),
			float(memory[word + 3]));
		return biased - 32768.0;
	}

	// Intersect two boxes stored as (x0, y0, x1, y1): take the larger of the
	// two minimum corners and the smaller of the two maximum corners. The
	// result is not checked for emptiness, so x0 > x1 or y0 > y1 is possible.
	vec4 bbox_intersect(vec4 a, vec4 b) {
		vec2 lo = max(a.xy, b.xy);
		vec2 hi = min(a.zw, b.zw);
		return vec4(lo, hi);
	}

	// Workgroup-shared scratch buffers.
	// sh_bic is used twice in main(): first for the scan over the Bic values of
	// the preceding partitions (indices 0..WG_SIZE-1), then for the binary tree
	// of reductions over this partition's elements. The tree has WG_SIZE leaves
	// followed by levels of WG_SIZE/2, ..., 2 nodes: 2 * WG_SIZE - 2 in total.
	shared Bic sh_bic[WG_SIZE * 2 - 2];
	// parent_ix of each materialized prefix-stack entry (from load_clip_el).
	shared uint sh_stack[PARTITION_SIZE];
	// Running intersection of the prefix-stack bounding boxes.
	shared vec4 sh_stack_bbox[PARTITION_SIZE];
	// Predecessor link of each element, as returned by search_link().
	shared uint sh_link[PARTITION_SIZE];
	// Per-element bounding box, reduced along the links during main().
	shared vec4 sh_bbox[PARTITION_SIZE];

	// Adapted directly from the stack monoid implementation.
	//
	// Searches the binary tree of Bic reductions in sh_bic for the predecessor
	// of the calling invocation. Level j of the tree starts at index
	// 2 * WG_SIZE - (2u << (LG_WG_SIZE - j)) and each of its nodes covers 2^j
	// consecutive elements. On return, bic has been combined with the reduction
	// of every subtree that was skipped over.
	//
	// The result is a reference within the partition when it is >= 0 as a
	// signed int; otherwise it is a reference into the prefix stack, encoded as
	// ~0u - bic.a.
	uint search_link(inout Bic bic) {
		uint pos = gl_LocalInvocationID.x;
		uint level = 0;
		// Climb the tree, absorbing left neighbours for as long as the combined
		// range still contains no unmatched push.
		for (; level < LG_WG_SIZE; level++) {
			if (((pos >> level) & 1) == 0) {
				continue;
			}
			uint level_base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - level));
			Bic candidate = bic_combine(sh_bic[level_base + (pos >> level) - 1], bic);
			if (candidate.b > 0) {
				// The subtree to the left holds the push we are looking for.
				break;
			}
			bic = candidate;
			pos -= 1u << level;
		}
		// Descend into that subtree, narrowing down to the exact position.
		if (pos > 0) {
			while (level > 0) {
				level--;
				uint level_base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - level));
				Bic candidate = bic_combine(sh_bic[level_base + (pos >> level) - 1], bic);
				if (candidate.b == 0) {
					bic = candidate;
					pos -= 1u << level;
				}
			}
		}
		// pos is the smallest value such that reduce(pos..th).b == 0
		return (pos > 0) ? (pos - 1) : (~0u - bic.a);
	}

	// Read the Bic stored for partition ix from the clip bic buffer; each
	// entry is two consecutive words (a, then b).
	Bic load_bic(uint ix) {
		uint word = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
		Bic result;
		result.a = memory[word];
		result.b = memory[word + 1];
		return result;
	}

	// Read entry ix of the clip stack buffer. Each entry is five words: the
	// parent index, then the bounding box as four floats stored bit-for-bit.
	ClipEl load_clip_el(uint ix) {
		uint word = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
		ClipEl el;
		el.parent_ix = memory[word];
		uvec4 bits = uvec4(
			memory[word + 1],
			memory[word + 2],
			memory[word + 3],
			memory[word + 4]);
		el.bbox = uintBitsToFloat(bits);
		return el;
	}

	// Read the clip input word for element ix. Indices at or beyond
	// conf.n_clip (the padding of a partial final block) yield 0x80000000,
	// which main() treats as a non-push element.
	uint load_path_ix(uint ix) {
		// This is one approach to a partial final block. Another would be
		// to do a memset to the padding in the command queue.
		if (ix >= conf.n_clip) {
			// EndClip tags don't implicate further loads.
			return 0x80000000;
		}
		return memory[(conf.clip_alloc.offset >> 2) + ix];
	}

	// Write bbox as entry ix of the clip bbox buffer: four consecutive words
	// holding the raw bit patterns of x, y, z and w.
	void store_clip_bbox(uint ix, vec4 bbox) {
		uint word = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
		uvec4 bits = floatBitsToUint(bbox);
		memory[word] = bits.x;
		memory[word + 1] = bits.y;
		memory[word + 2] = bits.z;
		memory[word + 3] = bits.w;
	}

	// Entry point. Each invocation handles the clip element selected by
	// gl_GlobalInvocationID.x; each workgroup handles one partition of
	// PARTITION_SIZE elements. For every element a bounding box is written with
	// store_clip_bbox(). For non-push elements inside conf.n_clip, the
	// drawmonoid entry is also patched with the value loaded for the matching
	// parent element.
	void main() {
	// materialize stack up to the start of this partition. This
	// is based on the pure stack monoid, but with two additions.

	// First, (this only matters if the stack goes deeper than the
	// partition size, which might be unlikely in practice), the
	// topmost stack element from each partition is picked, then an
	// exclusive scan of those. Also note that if this is skipped,
	// a scan is not needed in the reduce stage.

	// Second, after the stream compaction, do a scan of the retrieved
	// bbox values.
	uint th = gl_LocalInvocationID.x;
	// Thread th loads the Bic of partition th, but only for partitions that
	// precede this workgroup's own; the rest contribute the identity.
	Bic bic = Bic(0, 0);
	if (th < gl_WorkGroupID.x) {
	bic = load_bic(th);
	}
	sh_bic[th] = bic;
	// Log-step scan towards lower indices: on step i every thread combines
	// with the value 2^i slots to its right, so afterwards sh_bic[th] is the
	// reduction of partitions th..WG_SIZE-1. The first barrier makes the
	// previous writes visible; the second keeps any thread from overwriting
	// a slot that another thread is still reading.
	for (uint i = 0; i < LG_WG_SIZE; i++) {
	barrier();
	if (th + (1u << i) < WG_SIZE) {
	Bic other = sh_bic[th + (1u << i)];
	bic = bic_combine(bic, other);
	}
	barrier();
	sh_bic[th] = bic;
	}
	barrier();
	// Number of pushes still open at the start of this partition.
	uint stack_size = sh_bic[0].b;

	// TODO: do bbox scan here (to unlock greater stack depth)

	// binary search in stack
	// Thread th is responsible for stack slot sp; the last thread of the
	// workgroup gets sp == 0.
	uint sp = PARTITION_SIZE - 1 - th;
	uint ix = 0;
	for (uint i = 0; i < LG_WG_SIZE; i++) {
	uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
	if (sp < sh_bic[probe].b) {
	ix = probe;
	}
	}
	// ix is largest value such that sp < sh_bic[ix].b (if any)
	uint b = sh_bic[ix].b;
	// Default box acts as the identity for bbox_intersect as long as real
	// coordinates stay within +/-1e9.
	vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);
	if (sp < b) {
	// maybe store the index here for future use?
	ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
	sh_stack[th] = el.parent_ix;
	bbox = el.bbox;
	// other element values here?
	}
	// NOTE(review): sh_stack[th] stays unwritten when sp >= b; the later
	// reads of sh_stack are guarded by the stack_size checks below.

	// forward scan of bbox values of prefix stack
	// After the loop, bbox is the intersection of the boxes held by threads
	// 0..th.
	for (uint i = 0; i < LG_WG_SIZE; i++) {
	sh_stack_bbox[th] = bbox;
	barrier();
	if (th >= (1u << i)) {
	bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
	}
	barrier();
	}
	sh_stack_bbox[th] = bbox;

	// Read input and compute bicyclic semigroup binary tree
	uint inp = load_path_ix(gl_GlobalInvocationID.x);
	// A clear high bit marks a push, in which case inp is used as a path
	// index. Otherwise ~inp is used further down as a drawmonoid index.
	bool is_push = int(inp) >= 0;
	// Push -> Bic(0, 1); anything else -> Bic(1, 0).
	bic = Bic(1 - uint(is_push), uint(is_push));
	sh_bic[th] = bic;
	if (is_push) {
	bbox = load_path_bbox(inp);
	} else {
	bbox = vec4(-1e9, -1e9, 1e9, 1e9);
	}
	// Build the reduction tree level by level: level i reads pairs starting
	// at inbase and writes their combination at outbase. The barrier at the
	// top of each iteration makes the previous level visible.
	uint inbase = 0;
	for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
	uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
	barrier();
	if (th < (1u << (LG_WG_SIZE - 1 - i))) {
	sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
	}
	inbase = outbase;
	}
	barrier();
	// Search for predecessor node
	bic = Bic(0, 0);
	uint link = search_link(bic);
	// we use N_SEQ > 1 convention here:
	// link >= 0 is index within partition
	// link < 0 is reference to stack

	// We want grandparent bbox for pop nodes, so follow those links.
	sh_link[th] = link;
	barrier();
	uint grandparent;
	if (int(link) >= 0) {
	grandparent = sh_link[link];
	} else {
	// One entry further down the prefix stack.
	grandparent = link - 1;
	}

	// Resolve parent
	// In-partition links become global element indices; stack references
	// that fall inside the materialized stack are looked up in sh_stack;
	// anything deeper yields ~0u.
	uint parent;
	if (int(link) >= 0) {
	parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
	} else if (int(link + stack_size) >= 0) {
	parent = sh_stack[PARTITION_SIZE + link];
	} else {
	parent = ~0u;
	}

	// bbox scan along parent links
	// Each step intersects with the linked element's box and then adopts
	// that element's own link, so the chain covered doubles per iteration.
	for (uint i = 0; i < LG_WG_SIZE; i++) {
	// sh_link was already stored for first iteration
	if (i != 0) {
	sh_link[th] = link;
	}
	sh_bbox[th] = bbox;
	barrier();
	if (int(link) >= 0) {
	bbox = bbox_intersect(sh_bbox[link], bbox);
	link = sh_link[link];
	}
	barrier();
	}
	// The chain has now left the partition; fold in the prefix-stack box if
	// the reference lies within the materialized stack.
	if (int(link + stack_size) >= 0) {
	bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
	}
	// At this point, bbox is the reduction of bounding boxes along the tree.
	sh_bbox[th] = bbox;
	barrier();

	uint path_ix = inp;
	if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
	// Is this load expensive? If so, it's loaded earlier for in-partition
	// and is in the ClipEl for cross-partition.
	// If not, can probably get rid of it in the stack intermediate buf.
	path_ix = load_path_ix(parent);
	uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
	// Fix up drawmonoid so path_ix at EndClip matches BeginClip
	memory[drawmonoid_out_base] = path_ix;

	// A non-push element takes the box of its grandparent: from this
	// partition, from the prefix stack, or the default box if neither.
	if (int(grandparent) >= 0) {
	bbox = sh_bbox[grandparent];
	} else if (int(grandparent + stack_size) >= 0) {
	bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
	} else {
	bbox = vec4(-1e9, -1e9, 1e9, 1e9);
	}
	}
	store_clip_bbox(gl_GlobalInvocationID.x, bbox);
	}