// piet-gpu/shader/pathseg.comp - external/github.com/linebender/vello - Git at Google

 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

 // Processing of the path stream, after the tag scan.

 #version 450
 #extension GL_GOOGLE_include_directive : enable

 #include "mem.h"
 #include "setup.h"
 #include "pathtag.h"

 // Number of consecutive tag bytes each invocation processes (see the loops
 // over N_SEQ in main()).
 #define N_SEQ 4
 // log2 of the workgroup size; LG_WG_FACTOR is not defined in this file
 // (presumably it comes from one of the included headers).
 #define LG_WG_SIZE (7 + LG_WG_FACTOR)
 // Number of invocations per workgroup.
 #define WG_SIZE (1 << LG_WG_SIZE)
 // Number of tag bytes covered by one workgroup.
 #define PARTITION_SIZE (WG_SIZE * N_SEQ)

 layout(local_size_x = WG_SIZE, local_size_y = 1) in;

 // Read-only configuration: offsets into the scene buffer and allocations
 // used for the outputs written by main().
 layout(binding = 1) readonly buffer ConfigBuf {
     Config conf;
 };

 // Read-only encoded scene, addressed in u32 words.
 layout(binding = 2) readonly buffer SceneBuf {
     uint[] scene;
 };

 #include "tile.h"
 #include "pathseg.h"

 // Read-only tag monoids indexed by workgroup; main() reads entry
 // gl_WorkGroupID.x - 1 as the prefix contributed by all earlier workgroups.
 layout(binding = 3) readonly buffer ParentBuf {
     TagMonoid[] parent;
 };

 // Element type of the bounding-box scan performed in main().
 struct Monoid {
     // Bounding box: xy is the minimum corner, zw is the maximum corner.
     vec4 bbox;
     // Bitwise OR of the FLAG_* values defined below.
     uint flags;
 };

 // Set by main() on an element whose tag byte has the path bit (bit 4) set.
 // combine_monoid() does not merge the left operand's box into the result
 // when the left operand carries this flag.
 #define FLAG_RESET_BBOX 1
 // Produced by combine_monoid() when the left operand carried
 // FLAG_RESET_BBOX (the reset bit shifted left by one), and propagated from
 // either operand afterwards.
 #define FLAG_SET_BBOX 2

 // Combines two bbox monoid values, where `a` comes before `b` in the stream.
 //
 // Resulting box:
 //   - b's box by default;
 //   - a's box when a has no RESET flag and b's box is empty
 //     (z <= x and w <= y);
 //   - the union of both boxes when a has no RESET flag, b has no SET flag
 //     and a's box is non-empty (z > x or w > y).
 // Resulting flags: every flag of b, plus a's SET flag, plus a's RESET flag
 // moved into the SET position (RESET is bit 0, SET is bit 1).
 Monoid combine_monoid(Monoid a, Monoid b) {
     bool a_not_reset = (a.flags & FLAG_RESET_BBOX) == 0;
     bool b_empty = b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y;
     bool b_not_set = (b.flags & FLAG_SET_BBOX) == 0;
     bool a_nonempty = a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y;

     vec4 merged = b.bbox;
     if (a_not_reset) {
         if (b_empty) {
             // TODO: I think this should be gated on b & SET_BBOX == false also.
             merged = a.bbox;
         } else if (b_not_set && a_nonempty) {
             merged = vec4(min(a.bbox.xy, b.bbox.xy), max(a.bbox.zw, b.bbox.zw));
         }
     }

     Monoid result;
     result.bbox = merged;
     result.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
     // A reset seen on the left becomes "set" on the combined value.
     result.flags |= ((a.flags & FLAG_RESET_BBOX) << 1);
     return result;
 }

 // Starting value for the bbox scan: an all-zero box with no flags. main()
 // uses it as the left operand when no earlier invocation precedes this one.
 Monoid monoid_identity() {
     Monoid zero;
     zero.bbox = vec4(0.0, 0.0, 0.0, 0.0);
     zero.flags = 0;
     return zero;
 }

 // These are not both live at the same time. A very smart shader compiler
 // would be able to figure that out, but I suspect many won't.
 // sh_tag holds the workgroup scan of the tag monoid; sh_scratch holds the
 // workgroup scan of the bbox monoid. Both have one slot per invocation.
 shared TagMonoid sh_tag[WG_SIZE];
 shared Monoid sh_scratch[WG_SIZE];

 // Reads a point stored as two consecutive scene words at `ix` (x first,
 // then y), reinterpreting each word's bits as a 32-bit float.
 vec2 read_f32_point(uint ix) {
     return vec2(uintBitsToFloat(scene[ix]), uintBitsToFloat(scene[ix + 1]));
 }

 // Reads a point stored in a single scene word at `ix` as two signed 16-bit
 // integers: x in the low half, y in the high half.
 vec2 read_i16_point(uint ix) {
     uint word = scene[ix];
     // Move the low half to the top, then shift back down as a signed value
     // so that its sign bit is extended.
     int lo = int(word << 16) >> 16;
     // The signed shift sign-extends the high half directly.
     int hi = int(word) >> 16;
     return vec2(float(lo), float(hi));
 }

 // Note: these are 16 bit, which is adequate, but we could use 32 bits.

 // Rounds `x` down to an integer, adds a bias of 32768 and clamps the result
 // below at 0. No upper clamp is applied here.
 uint round_down(float x) {
     float biased = floor(x) + 32768.0;
     return uint(max(0.0, biased));
 }

 // Rounds `x` up to an integer, adds a bias of 32768 and clamps the result
 // above at 65535. No lower clamp is applied here.
 uint round_up(float x) {
     float biased = ceil(x) + 32768.0;
     return uint(min(65535.0, biased));
 }

 // Entry point. Each invocation handles N_SEQ consecutive tag bytes (one tag
 // word) of the path tag stream, in three phases:
 //   1. a workgroup-wide scan of the tag monoid, to find where this
 //      invocation's inputs and outputs start;
 //   2. decoding and transforming each path segment, writing it out as a
 //      cubic and recording its bounding box;
 //   3. a workgroup-wide scan of the bbox monoid, followed by writing the
 //      per-path bounding boxes to `memory`.
 void main() {
     // Per-element values kept for the bbox write-out at the end.
     Monoid local[N_SEQ];
     float linewidth[N_SEQ];
     uint save_trans_ix[N_SEQ];

     // Index of this invocation's first tag byte.
     uint ix = gl_GlobalInvocationID.x * N_SEQ;

     // Four tag bytes are packed per u32, hence ix >> 2 for the word index.
     uint tag_word = scene[(conf.pathtag_offset >> 2) + (ix >> 2)];

     // Scan the tag monoid
     TagMonoid local_tm = reduce_tag(tag_word);
     sh_tag[gl_LocalInvocationID.x] = local_tm;
     // Log-step inclusive scan: in step i every invocation folds in the value
     // held by the invocation 2^i slots to its left. The first barrier makes
     // the previous writes visible; the second keeps anyone from overwriting
     // a slot before all reads of this step are done.
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         barrier();
         if (gl_LocalInvocationID.x >= (1u << i)) {
             TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)];
             local_tm = combine_tag_monoid(other, local_tm);
         }
         barrier();
         sh_tag[gl_LocalInvocationID.x] = local_tm;
     }
     barrier();
     // sh_tag is now the partition-wide inclusive scan of the tag monoid.
     TagMonoid tm = tag_monoid_identity();
     // Prefix from all earlier workgroups.
     if (gl_WorkGroupID.x > 0) {
         tm = parent[gl_WorkGroupID.x - 1];
     }
     // Prefix from earlier invocations in this workgroup.
     if (gl_LocalInvocationID.x > 0) {
         tm = combine_tag_monoid(tm, sh_tag[gl_LocalInvocationID.x - 1]);
     }
     // tm is now the full exclusive scan of the tag monoid.

     // Indices to scene buffer in u32 units.
     uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset;
     uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix;
     // Path index at the start of this invocation; tm.path_ix is advanced in
     // the loop below, the saved value is used for the bbox output position.
     uint save_path_ix = tm.path_ix;
     uint trans_ix = tm.trans_ix;
     TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + trans_ix * TransformSeg_size);
     PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size);
     for (uint i = 0; i < N_SEQ; i++) {
         // Record the line width and transform index in effect at element i.
         linewidth[i] = uintBitsToFloat(scene[lw_ix]);
         save_trans_ix[i] = trans_ix;
         // if N_SEQ > 4, need to load tag_word from local if N_SEQ % 4 == 0
         uint tag_byte = tag_word >> (i * 8);
         // The low two bits select the segment type; 0 means "not a segment".
         uint seg_type = tag_byte & 3;
         if (seg_type != 0) {
             // 1 = line, 2 = quad, 3 = cubic
             // Unpack path segment from input
             vec2 p0;
             vec2 p1;
             vec2 p2;
             vec2 p3;
             // Bit 3 selects the coordinate encoding.
             if ((tag_byte & 8) != 0) {
                 // 32 bit encoding
                 // Two words per point.
                 p0 = read_f32_point(ps_ix);
                 p1 = read_f32_point(ps_ix + 2);
                 if (seg_type >= 2) {
                     p2 = read_f32_point(ps_ix + 4);
                     if (seg_type == 3) {
                         p3 = read_f32_point(ps_ix + 6);
                     }
                 }
             } else {
                 // 16 bit encoding
                 // One word per point.
                 p0 = read_i16_point(ps_ix);
                 p1 = read_i16_point(ps_ix + 1);
                 if (seg_type >= 2) {
                     p2 = read_i16_point(ps_ix + 2);
                     if (seg_type == 3) {
                         p3 = read_i16_point(ps_ix + 3);
                     }
                 }
             }
             // Apply the current transform: mat.xy * x + mat.zw * y + translate.
             TransformSeg transform = TransformSeg_read(conf.trans_alloc, trans_ref);
             p0 = transform.mat.xy * p0.x + transform.mat.zw * p0.y + transform.translate;
             p1 = transform.mat.xy * p1.x + transform.mat.zw * p1.y + transform.translate;
             vec4 bbox = vec4(min(p0, p1), max(p0, p1));
             // Degree-raise and compute bbox
             if (seg_type >= 2) {
                 p2 = transform.mat.xy * p2.x + transform.mat.zw * p2.y + transform.translate;
                 bbox.xy = min(bbox.xy, p2);
                 bbox.zw = max(bbox.zw, p2);
                 if (seg_type == 3) {
                     p3 = transform.mat.xy * p3.x + transform.mat.zw * p3.y + transform.translate;
                     bbox.xy = min(bbox.xy, p3);
                     bbox.zw = max(bbox.zw, p3);
                 } else {
                     // Quad to cubic. Order matters: the new p2 must be
                     // computed from the old p1 before p1 is overwritten.
                     p3 = p2;
                     p2 = mix(p1, p2, 1.0 / 3.0);
                     p1 = mix(p1, p0, 1.0 / 3.0);
                 }
             } else {
                 // Line to cubic: control points at one third and two thirds
                 // of the way from p0 to the end point.
                 p3 = p1;
                 p2 = mix(p3, p0, 1.0 / 3.0);
                 p1 = mix(p0, p3, 1.0 / 3.0);
             }
             vec2 stroke = vec2(0.0, 0.0);
             // A non-negative line width takes the stroke path: the box is
             // grown by the stroke extent on every side.
             if (linewidth[i] >= 0.0) {
                 // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
                 stroke = 0.5 * linewidth[i] * vec2(length(transform.mat.xz), length(transform.mat.yw));
                 bbox += vec4(-stroke, stroke);
             }
             local[i].bbox = bbox;
             local[i].flags = 0;

             // Write path segment to output
             PathCubic cubic;
             cubic.p0 = p0;
             cubic.p1 = p1;
             cubic.p2 = p2;
             cubic.p3 = p3;
             cubic.path_ix = tm.path_ix;
             // Not needed, TODO remove from struct
             cubic.trans_ix = gl_GlobalInvocationID.x * 4 + i;
             cubic.stroke = stroke;
             // 1 when the line width is non-negative, 0 otherwise.
             uint fill_mode = uint(linewidth[i] >= 0.0);
             PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic);

             ps_ref.offset += PathSeg_size;
             // Advance the input cursor past this segment's points. Bit 2 of
             // the tag adds one point to the count (presumably a segment that
             // carries its own end point; confirm against the encoder).
             uint n_points = (tag_byte & 3) + ((tag_byte >> 2) & 1);
             // The mask is 15 for the 32 bit encoding and 0 otherwise, so the
             // word count is 2 * n_points or n_points respectively.
             uint n_words = n_points + (n_points & (((tag_byte >> 3) & 1) * 15));
             ps_ix += n_words;
         } else {
             // Not a segment: the element contributes an empty box, and the
             // tag bits advance the path, transform and line width cursors.
             local[i].bbox = vec4(0.0, 0.0, 0.0, 0.0);
             // These shifts need to be kept in sync with setup.h
             uint is_path = (tag_byte >> 4) & 1;
             // Relies on the fact that RESET_BBOX == 1
             local[i].flags = is_path;
             tm.path_ix += is_path;
             trans_ix += (tag_byte >> 5) & 1;
             trans_ref.offset += ((tag_byte >> 5) & 1) * TransformSeg_size;
             lw_ix += (tag_byte >> 6) & 1;
         }
     }

     // Partition-wide monoid scan for bbox monoid
     Monoid agg = local[0];
     for (uint i = 1; i < N_SEQ; i++) {
         // Note: this could be fused with the map above, but probably
         // a thin performance gain not worth the complexity.
         agg = combine_monoid(agg, local[i]);
         local[i] = agg;
     }
     // local is N_SEQ sub-partition inclusive scan of bbox monoid.
     sh_scratch[gl_LocalInvocationID.x] = agg;
     // Same log-step scan as for the tag monoid above, over per-invocation
     // aggregates.
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         barrier();
         if (gl_LocalInvocationID.x >= (1u << i)) {
             Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
             agg = combine_monoid(other, agg);
         }
         barrier();
         sh_scratch[gl_LocalInvocationID.x] = agg;
     }
     // sh_scratch is the partition-wide inclusive scan of the bbox monoid,
     // sampled at the end of the N_SEQ sub-partition.

     barrier();
     uint path_ix = save_path_ix;
     // Each path owns six words in the bbox output: the four box coordinates,
     // then the line width bits, then the transform index.
     uint bbox_out_ix = (conf.path_bbox_alloc.offset >> 2) + path_ix * 6;
     // Write bboxes to paths; do atomic min/max if partial
     // `row` is the scan value of everything before this invocation within
     // the workgroup.
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0) {
         row = sh_scratch[gl_LocalInvocationID.x - 1];
     }
     for (uint i = 0; i < N_SEQ; i++) {
         Monoid m = combine_monoid(row, local[i]);
         // m is partition-wide inclusive scan of bbox monoid.
         bool do_atomic = false;
         if (i == N_SEQ - 1 && gl_LocalInvocationID.x == WG_SIZE - 1) {
             // last element
             // The box accumulated so far may belong to a path that
             // continues in the next workgroup, so it is merged atomically.
             do_atomic = true;
         }
         // RESET is only set on m when element i itself is a path marker.
         if ((m.flags & FLAG_RESET_BBOX) != 0) {
             memory[bbox_out_ix + 4] = floatBitsToUint(linewidth[i]);
             memory[bbox_out_ix + 5] = save_trans_ix[i];
             if ((m.flags & FLAG_SET_BBOX) == 0) {
                 // No earlier path marker in this workgroup: the box here is
                 // only part of the path's box, so merge it atomically.
                 do_atomic = true;
             } else {
                 // The whole path lies inside this workgroup; store directly.
                 memory[bbox_out_ix] = round_down(m.bbox.x);
                 memory[bbox_out_ix + 1] = round_down(m.bbox.y);
                 memory[bbox_out_ix + 2] = round_up(m.bbox.z);
                 memory[bbox_out_ix + 3] = round_up(m.bbox.w);
                 bbox_out_ix += 6;
                 do_atomic = false;
             }
         }
         if (do_atomic) {
             // Skip empty boxes. NOTE(review): the atomics assume the target
             // words were initialised elsewhere; not visible in this file.
             if (m.bbox.z > m.bbox.x || m.bbox.w > m.bbox.y) {
                 // atomic min/max
                 atomicMin(memory[bbox_out_ix], round_down(m.bbox.x));
                 atomicMin(memory[bbox_out_ix + 1], round_down(m.bbox.y));
                 atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z));
                 atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w));
             }
             bbox_out_ix += 6;
         }
     }
 }
	// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

	// Processing of the path stream, after the tag scan.

	#version 450
	#extension GL_GOOGLE_include_directive : enable

	#include "mem.h"
	#include "setup.h"
	#include "pathtag.h"

	// Number of consecutive tag bytes each invocation processes (see the loops
	// over N_SEQ in main()).
	#define N_SEQ 4
	// log2 of the workgroup size; LG_WG_FACTOR is not defined in this file
	// (presumably it comes from one of the included headers).
	#define LG_WG_SIZE (7 + LG_WG_FACTOR)
	// Number of invocations per workgroup.
	#define WG_SIZE (1 << LG_WG_SIZE)
	// Number of tag bytes covered by one workgroup.
	#define PARTITION_SIZE (WG_SIZE * N_SEQ)

	layout(local_size_x = WG_SIZE, local_size_y = 1) in;

	// Read-only configuration: offsets into the scene buffer and allocations
	// used for the outputs written by main().
	layout(binding = 1) readonly buffer ConfigBuf {
	Config conf;
	};

	// Read-only encoded scene, addressed in u32 words.
	layout(binding = 2) readonly buffer SceneBuf {
	uint[] scene;
	};

	#include "tile.h"
	#include "pathseg.h"

	// Read-only tag monoids indexed by workgroup; main() reads entry
	// gl_WorkGroupID.x - 1 as the prefix contributed by all earlier workgroups.
	layout(binding = 3) readonly buffer ParentBuf {
	TagMonoid[] parent;
	};

	// Element type of the bounding-box scan performed in main().
	struct Monoid {
	// Bounding box: xy is the minimum corner, zw is the maximum corner.
	vec4 bbox;
	// Bitwise OR of the FLAG_* values defined below.
	uint flags;
	};

	// Set by main() on an element whose tag byte has the path bit (bit 4) set.
	// combine_monoid() does not merge the left operand's box into the result
	// when the left operand carries this flag.
	#define FLAG_RESET_BBOX 1
	// Produced by combine_monoid() when the left operand carried
	// FLAG_RESET_BBOX (the reset bit shifted left by one), and propagated from
	// either operand afterwards.
	#define FLAG_SET_BBOX 2

	// Combines two bbox monoid values, where `a` comes before `b` in the stream.
	//
	// Resulting box:
	//   - b's box by default;
	//   - a's box when a has no RESET flag and b's box is empty
	//     (z <= x and w <= y);
	//   - the union of both boxes when a has no RESET flag, b has no SET flag
	//     and a's box is non-empty (z > x or w > y).
	// Resulting flags: every flag of b, plus a's SET flag, plus a's RESET flag
	// moved into the SET position (RESET is bit 0, SET is bit 1).
	Monoid combine_monoid(Monoid a, Monoid b) {
		Monoid c;
		c.bbox = b.bbox;
		// TODO: I think this should be gated on b & SET_BBOX == false also.
		if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
			c.bbox = a.bbox;
		} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
				(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
			c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
			c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
		}
		c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
		// A reset seen on the left becomes "set" on the combined value.
		c.flags |= ((a.flags & FLAG_RESET_BBOX) << 1);
		return c;
	}

	// Starting value for the bbox scan: an all-zero box with no flags. main()
	// uses it as the left operand when no earlier invocation precedes this one.
	Monoid monoid_identity() {
		Monoid zero;
		zero.bbox = vec4(0.0, 0.0, 0.0, 0.0);
		zero.flags = 0;
		return zero;
	}

	// These are not both live at the same time. A very smart shader compiler
	// would be able to figure that out, but I suspect many won't.
	// sh_tag holds the workgroup scan of the tag monoid; sh_scratch holds the
	// workgroup scan of the bbox monoid. Both have one slot per invocation.
	shared TagMonoid sh_tag[WG_SIZE];
	shared Monoid sh_scratch[WG_SIZE];

	// Reads a point stored as two consecutive scene words at `ix` (x first,
	// then y), reinterpreting each word's bits as a 32-bit float.
	vec2 read_f32_point(uint ix) {
		return vec2(uintBitsToFloat(scene[ix]), uintBitsToFloat(scene[ix + 1]));
	}

	// Reads a point stored in a single scene word at `ix` as two signed 16-bit
	// integers: x in the low half, y in the high half.
	vec2 read_i16_point(uint ix) {
		uint word = scene[ix];
		// Move the low half to the top, then shift back down as a signed value
		// so that its sign bit is extended.
		int lo = int(word << 16) >> 16;
		// The signed shift sign-extends the high half directly.
		int hi = int(word) >> 16;
		return vec2(float(lo), float(hi));
	}

	// Note: these are 16 bit, which is adequate, but we could use 32 bits.

	// Rounds `x` down to an integer, adds a bias of 32768 and clamps the result
	// below at 0. No upper clamp is applied here.
	uint round_down(float x) {
		float biased = floor(x) + 32768.0;
		return uint(max(0.0, biased));
	}

	// Rounds `x` up to an integer, adds a bias of 32768 and clamps the result
	// above at 65535. No lower clamp is applied here.
	uint round_up(float x) {
		float biased = ceil(x) + 32768.0;
		return uint(min(65535.0, biased));
	}

	// Entry point. Each invocation handles N_SEQ consecutive tag bytes (one tag
	// word) of the path tag stream, in three phases:
	//   1. a workgroup-wide scan of the tag monoid, to find where this
	//      invocation's inputs and outputs start;
	//   2. decoding and transforming each path segment, writing it out as a
	//      cubic and recording its bounding box;
	//   3. a workgroup-wide scan of the bbox monoid, followed by writing the
	//      per-path bounding boxes to `memory`.
	void main() {
		// Per-element values kept for the bbox write-out at the end.
		Monoid local[N_SEQ];
		float linewidth[N_SEQ];
		uint save_trans_ix[N_SEQ];

		// Index of this invocation's first tag byte.
		uint ix = gl_GlobalInvocationID.x * N_SEQ;

		// Four tag bytes are packed per u32, hence ix >> 2 for the word index.
		uint tag_word = scene[(conf.pathtag_offset >> 2) + (ix >> 2)];

		// Scan the tag monoid
		TagMonoid local_tm = reduce_tag(tag_word);
		sh_tag[gl_LocalInvocationID.x] = local_tm;
		// Log-step inclusive scan: in step i every invocation folds in the value
		// held by the invocation 2^i slots to its left. The first barrier makes
		// the previous writes visible; the second keeps anyone from overwriting
		// a slot before all reads of this step are done.
		for (uint i = 0; i < LG_WG_SIZE; i++) {
			barrier();
			if (gl_LocalInvocationID.x >= (1u << i)) {
				TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)];
				local_tm = combine_tag_monoid(other, local_tm);
			}
			barrier();
			sh_tag[gl_LocalInvocationID.x] = local_tm;
		}
		barrier();
		// sh_tag is now the partition-wide inclusive scan of the tag monoid.
		TagMonoid tm = tag_monoid_identity();
		// Prefix from all earlier workgroups.
		if (gl_WorkGroupID.x > 0) {
			tm = parent[gl_WorkGroupID.x - 1];
		}
		// Prefix from earlier invocations in this workgroup.
		if (gl_LocalInvocationID.x > 0) {
			tm = combine_tag_monoid(tm, sh_tag[gl_LocalInvocationID.x - 1]);
		}
		// tm is now the full exclusive scan of the tag monoid.

		// Indices to scene buffer in u32 units.
		uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset;
		uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix;
		// Path index at the start of this invocation; tm.path_ix is advanced in
		// the loop below, the saved value is used for the bbox output position.
		uint save_path_ix = tm.path_ix;
		uint trans_ix = tm.trans_ix;
		TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + trans_ix * TransformSeg_size);
		PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size);
		for (uint i = 0; i < N_SEQ; i++) {
			// Record the line width and transform index in effect at element i.
			linewidth[i] = uintBitsToFloat(scene[lw_ix]);
			save_trans_ix[i] = trans_ix;
			// if N_SEQ > 4, need to load tag_word from local if N_SEQ % 4 == 0
			uint tag_byte = tag_word >> (i * 8);
			// The low two bits select the segment type; 0 means "not a segment".
			uint seg_type = tag_byte & 3;
			if (seg_type != 0) {
				// 1 = line, 2 = quad, 3 = cubic
				// Unpack path segment from input
				vec2 p0;
				vec2 p1;
				vec2 p2;
				vec2 p3;
				// Bit 3 selects the coordinate encoding.
				if ((tag_byte & 8) != 0) {
					// 32 bit encoding
					// Two words per point.
					p0 = read_f32_point(ps_ix);
					p1 = read_f32_point(ps_ix + 2);
					if (seg_type >= 2) {
						p2 = read_f32_point(ps_ix + 4);
						if (seg_type == 3) {
							p3 = read_f32_point(ps_ix + 6);
						}
					}
				} else {
					// 16 bit encoding
					// One word per point.
					p0 = read_i16_point(ps_ix);
					p1 = read_i16_point(ps_ix + 1);
					if (seg_type >= 2) {
						p2 = read_i16_point(ps_ix + 2);
						if (seg_type == 3) {
							p3 = read_i16_point(ps_ix + 3);
						}
					}
				}
				// Apply the current transform: mat.xy * x + mat.zw * y + translate.
				TransformSeg transform = TransformSeg_read(conf.trans_alloc, trans_ref);
				p0 = transform.mat.xy * p0.x + transform.mat.zw * p0.y + transform.translate;
				p1 = transform.mat.xy * p1.x + transform.mat.zw * p1.y + transform.translate;
				vec4 bbox = vec4(min(p0, p1), max(p0, p1));
				// Degree-raise and compute bbox
				if (seg_type >= 2) {
					p2 = transform.mat.xy * p2.x + transform.mat.zw * p2.y + transform.translate;
					bbox.xy = min(bbox.xy, p2);
					bbox.zw = max(bbox.zw, p2);
					if (seg_type == 3) {
						p3 = transform.mat.xy * p3.x + transform.mat.zw * p3.y + transform.translate;
						bbox.xy = min(bbox.xy, p3);
						bbox.zw = max(bbox.zw, p3);
					} else {
						// Quad to cubic. Order matters: the new p2 must be
						// computed from the old p1 before p1 is overwritten.
						p3 = p2;
						p2 = mix(p1, p2, 1.0 / 3.0);
						p1 = mix(p1, p0, 1.0 / 3.0);
					}
				} else {
					// Line to cubic: control points at one third and two thirds
					// of the way from p0 to the end point.
					p3 = p1;
					p2 = mix(p3, p0, 1.0 / 3.0);
					p1 = mix(p0, p3, 1.0 / 3.0);
				}
				vec2 stroke = vec2(0.0, 0.0);
				// A non-negative line width takes the stroke path: the box is
				// grown by the stroke extent on every side.
				if (linewidth[i] >= 0.0) {
					// See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
					stroke = 0.5 * linewidth[i] * vec2(length(transform.mat.xz), length(transform.mat.yw));
					bbox += vec4(-stroke, stroke);
				}
				local[i].bbox = bbox;
				local[i].flags = 0;

				// Write path segment to output
				PathCubic cubic;
				cubic.p0 = p0;
				cubic.p1 = p1;
				cubic.p2 = p2;
				cubic.p3 = p3;
				cubic.path_ix = tm.path_ix;
				// Not needed, TODO remove from struct
				cubic.trans_ix = gl_GlobalInvocationID.x * 4 + i;
				cubic.stroke = stroke;
				// 1 when the line width is non-negative, 0 otherwise.
				uint fill_mode = uint(linewidth[i] >= 0.0);
				PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic);

				ps_ref.offset += PathSeg_size;
				// Advance the input cursor past this segment's points. Bit 2 of
				// the tag adds one point to the count (presumably a segment that
				// carries its own end point; confirm against the encoder).
				uint n_points = (tag_byte & 3) + ((tag_byte >> 2) & 1);
				// The mask is 15 for the 32 bit encoding and 0 otherwise, so the
				// word count is 2 * n_points or n_points respectively.
				uint n_words = n_points + (n_points & (((tag_byte >> 3) & 1) * 15));
				ps_ix += n_words;
			} else {
				// Not a segment: the element contributes an empty box, and the
				// tag bits advance the path, transform and line width cursors.
				local[i].bbox = vec4(0.0, 0.0, 0.0, 0.0);
				// These shifts need to be kept in sync with setup.h
				uint is_path = (tag_byte >> 4) & 1;
				// Relies on the fact that RESET_BBOX == 1
				local[i].flags = is_path;
				tm.path_ix += is_path;
				trans_ix += (tag_byte >> 5) & 1;
				trans_ref.offset += ((tag_byte >> 5) & 1) * TransformSeg_size;
				lw_ix += (tag_byte >> 6) & 1;
			}
		}

		// Partition-wide monoid scan for bbox monoid
		Monoid agg = local[0];
		for (uint i = 1; i < N_SEQ; i++) {
			// Note: this could be fused with the map above, but probably
			// a thin performance gain not worth the complexity.
			agg = combine_monoid(agg, local[i]);
			local[i] = agg;
		}
		// local is N_SEQ sub-partition inclusive scan of bbox monoid.
		sh_scratch[gl_LocalInvocationID.x] = agg;
		// Same log-step scan as for the tag monoid above, over per-invocation
		// aggregates.
		for (uint i = 0; i < LG_WG_SIZE; i++) {
			barrier();
			if (gl_LocalInvocationID.x >= (1u << i)) {
				Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
				agg = combine_monoid(other, agg);
			}
			barrier();
			sh_scratch[gl_LocalInvocationID.x] = agg;
		}
		// sh_scratch is the partition-wide inclusive scan of the bbox monoid,
		// sampled at the end of the N_SEQ sub-partition.

		barrier();
		uint path_ix = save_path_ix;
		// Each path owns six words in the bbox output: the four box coordinates,
		// then the line width bits, then the transform index.
		uint bbox_out_ix = (conf.path_bbox_alloc.offset >> 2) + path_ix * 6;
		// Write bboxes to paths; do atomic min/max if partial
		// `row` is the scan value of everything before this invocation within
		// the workgroup.
		Monoid row = monoid_identity();
		if (gl_LocalInvocationID.x > 0) {
			row = sh_scratch[gl_LocalInvocationID.x - 1];
		}
		for (uint i = 0; i < N_SEQ; i++) {
			Monoid m = combine_monoid(row, local[i]);
			// m is partition-wide inclusive scan of bbox monoid.
			bool do_atomic = false;
			if (i == N_SEQ - 1 && gl_LocalInvocationID.x == WG_SIZE - 1) {
				// last element
				// The box accumulated so far may belong to a path that
				// continues in the next workgroup, so it is merged atomically.
				do_atomic = true;
			}
			// RESET is only set on m when element i itself is a path marker.
			if ((m.flags & FLAG_RESET_BBOX) != 0) {
				memory[bbox_out_ix + 4] = floatBitsToUint(linewidth[i]);
				memory[bbox_out_ix + 5] = save_trans_ix[i];
				if ((m.flags & FLAG_SET_BBOX) == 0) {
					// No earlier path marker in this workgroup: the box here is
					// only part of the path's box, so merge it atomically.
					do_atomic = true;
				} else {
					// The whole path lies inside this workgroup; store directly.
					memory[bbox_out_ix] = round_down(m.bbox.x);
					memory[bbox_out_ix + 1] = round_down(m.bbox.y);
					memory[bbox_out_ix + 2] = round_up(m.bbox.z);
					memory[bbox_out_ix + 3] = round_up(m.bbox.w);
					bbox_out_ix += 6;
					do_atomic = false;
				}
			}
			if (do_atomic) {
				// Skip empty boxes. NOTE(review): the atomics assume the target
				// words were initialised elsewhere; not visible in this file.
				if (m.bbox.z > m.bbox.x || m.bbox.w > m.bbox.y) {
					// atomic min/max
					atomicMin(memory[bbox_out_ix], round_down(m.bbox.x));
					atomicMin(memory[bbox_out_ix + 1], round_down(m.bbox.y));
					atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z));
					atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w));
				}
				bbox_out_ix += 6;
			}
		}
	}