tests/shader/prefix_reduce.comp - external/github.com/linebender/vello - Git at Google

 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

 // The reduction phase for prefix sum implemented as a tree reduction.

 #version 450

 // Number of consecutive input elements each invocation folds sequentially
 // before the workgroup-level reduction starts.
 #define N_ROWS 8
 // log2 of the workgroup size; main() runs this many reduction rounds.
 #define LG_WG_SIZE 9
 // Invocations per workgroup (1 << 9 = 512).
 #define WG_SIZE (1 << LG_WG_SIZE)
 // Input elements consumed by one workgroup (512 * 8 = 4096). Not referenced
 // in the visible part of this shader.
 #define PARTITION_SIZE (WG_SIZE * N_ROWS)

 // One-dimensional workgroup of WG_SIZE invocations.
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;

 // Element type of the reduction. It wraps a single uint, which
 // combine_monoid() merges by addition.
 struct Monoid {
     uint element;
 };

 // Input elements (read-only). The invocation with global index g reads
 // inbuf[g * N_ROWS] through inbuf[g * N_ROWS + N_ROWS - 1]; main() performs
 // no bounds check on these reads.
 layout(set = 0, binding = 0) readonly buffer InBuf {
     Monoid[] inbuf;
 };

 // Output: one aggregate per workgroup, stored at index gl_WorkGroupID.x.
 layout(set = 0, binding = 1) buffer OutBuf {
     Monoid[] outbuf;
 };

 // Workgroup-shared scratch holding one partial aggregate per invocation.
 shared Monoid sh_scratch[WG_SIZE];

 // Merges two monoid values. The operation is plain uint addition of the
 // wrapped elements.
 Monoid combine_monoid(Monoid a, Monoid b) {
     uint total = a.element + b.element;
     return Monoid(total);
 }

 // Entry point. Each invocation first combines N_ROWS consecutive input
 // elements, then the workgroup merges those partial results through shared
 // memory in LG_WG_SIZE rounds. After the last round invocation 0 holds the
 // aggregate of the whole workgroup and stores it in outbuf[gl_WorkGroupID.x].
 void main() {
     uint local_ix = gl_LocalInvocationID.x;
     uint base = gl_GlobalInvocationID.x * N_ROWS;

     // Sequential per-invocation reduction over N_ROWS elements.
     // TODO: gate buffer read
     Monoid acc = inbuf[base];
     for (uint k = 1; k < N_ROWS; k++) {
         acc = combine_monoid(acc, inbuf[base + k]);
     }
     sh_scratch[local_ix] = acc;

     for (uint level = 0; level < LG_WG_SIZE; level++) {
         // Slot that this invocation folds into its own value in this round.
         uint partner = local_ix + (1u << level);

         // Wait until every invocation has stored its value from the
         // previous round.
         barrier();
         // We could make this predicate tighter, but would it help?
         if (partner < WG_SIZE) {
             acc = combine_monoid(acc, sh_scratch[partner]);
         }
         // Wait until every invocation has finished reading before any slot
         // is overwritten.
         barrier();
         sh_scratch[local_ix] = acc;
     }

     if (local_ix == 0) {
         outbuf[gl_WorkGroupID.x] = acc;
     }
 }
	// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

	// The reduction phase for prefix sum implemented as a tree reduction.

	#version 450

	// Number of consecutive input elements each invocation folds sequentially
	// before the workgroup-level reduction starts.
	#define N_ROWS 8
	// log2 of the workgroup size; main() runs this many reduction rounds.
	#define LG_WG_SIZE 9
	// Invocations per workgroup (1 << 9 = 512).
	#define WG_SIZE (1 << LG_WG_SIZE)
	// Input elements consumed by one workgroup (512 * 8 = 4096). Not referenced
	// in the visible part of this shader.
	#define PARTITION_SIZE (WG_SIZE * N_ROWS)

	// One-dimensional workgroup of WG_SIZE invocations.
	layout(local_size_x = WG_SIZE, local_size_y = 1) in;

	// Element type of the reduction. It wraps a single uint, which
	// combine_monoid() merges by addition.
	struct Monoid {
	uint element;
	};

	// Input elements (read-only). The invocation with global index g reads
	// inbuf[g * N_ROWS] through inbuf[g * N_ROWS + N_ROWS - 1]; main() performs
	// no bounds check on these reads.
	layout(set = 0, binding = 0) readonly buffer InBuf {
	Monoid[] inbuf;
	};

	// Output: one aggregate per workgroup, stored at index gl_WorkGroupID.x.
	layout(set = 0, binding = 1) buffer OutBuf {
	Monoid[] outbuf;
	};

	// Workgroup-shared scratch holding one partial aggregate per invocation.
	shared Monoid sh_scratch[WG_SIZE];

	// Merges two monoid values. The operation is plain uint addition of the
	// wrapped elements.
	Monoid combine_monoid(Monoid a, Monoid b) {
	    uint total = a.element + b.element;
	    return Monoid(total);
	}

	// Entry point. Each invocation first combines N_ROWS consecutive input
	// elements, then the workgroup merges those partial results through shared
	// memory in LG_WG_SIZE rounds. After the last round invocation 0 holds the
	// aggregate of the whole workgroup and stores it in outbuf[gl_WorkGroupID.x].
	void main() {
	    uint local_ix = gl_LocalInvocationID.x;
	    uint base = gl_GlobalInvocationID.x * N_ROWS;

	    // Sequential per-invocation reduction over N_ROWS elements.
	    // TODO: gate buffer read
	    Monoid acc = inbuf[base];
	    for (uint k = 1; k < N_ROWS; k++) {
	        acc = combine_monoid(acc, inbuf[base + k]);
	    }
	    sh_scratch[local_ix] = acc;

	    for (uint level = 0; level < LG_WG_SIZE; level++) {
	        // Slot that this invocation folds into its own value in this round.
	        uint partner = local_ix + (1u << level);

	        // Wait until every invocation has stored its value from the
	        // previous round.
	        barrier();
	        // We could make this predicate tighter, but would it help?
	        if (partner < WG_SIZE) {
	            acc = combine_monoid(acc, sh_scratch[partner]);
	        }
	        // Wait until every invocation has finished reading before any slot
	        // is overwritten.
	        barrier();
	        sh_scratch[local_ix] = acc;
	    }

	    if (local_ix == 0) {
	        outbuf[gl_WorkGroupID.x] = acc;
	    }
	}