piet-gpu/shader/coarse.comp - external/github.com/linebender/vello - Git at Google

 // The coarse rasterizer stage of the pipeline.

 #version 450
 #extension GL_GOOGLE_include_directive : enable

 #include "setup.h"

 layout(local_size_x = N_TILE, local_size_y = 1) in;

 layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };

 layout(set = 0, binding = 1) buffer BinsBuf {
     uint[] bins;
 };

 layout(set = 0, binding = 2) buffer AllocBuf {
     uint alloc;
 };

 layout(set = 0, binding = 3) buffer PtclBuf {
     uint[] ptcl;
 };

 #include "annotated.h"
 #include "bins.h"
 #include "ptcl.h"

 #define N_RINGBUF 512

 #define TRANSPOSE_BACKDROP

 // Ring buffers (indexed modulo N_RINGBUF) holding, for each merged bin
 // instance, its element index and its right edge. Filled in main() from
 // BinInstance records and consumed N_TILE entries at a time.
 shared uint sh_elements[N_RINGBUF];
 shared float sh_right_edge[N_RINGBUF];
 // Per input stream (one per th_ix < N_WG): offset of the current chunk,
 // offset of the chunk after it (0 is treated as "none"), and the current
 // chunk's instance count.
 shared uint sh_chunk[N_WG];
 shared uint sh_chunk_next[N_WG];
 shared uint sh_chunk_n[N_WG];
 // Scratch for the atomicMin reduction over sh_first_el.
 shared uint sh_min_buf;
 // Some of these are kept in shared memory to ease register
 // pressure, but it could go either way.
 // First element index of each stream's current chunk; ~0 marks a stream
 // with nothing left to merge.
 shared uint sh_first_el[N_WG];
 // Instance count of the chunk selected in the current merge step
 // (0 when no chunk was selected) and the offset of its first instance.
 shared uint sh_selected_n;
 shared uint sh_elements_ref;

 // Coverage and backdrop bitmaps. main() ORs bits in while rasterizing each
 // element and then passes every word through transpose() before the output
 // stage reads sh_bitmaps[slice][th_ix] / sh_backdrop[slice][th_ix].
 shared uint sh_bitmaps[N_SLICE][N_TILE];
 shared uint sh_backdrop[N_SLICE][N_TILE];
 // One bit per element of the current batch (word th_ix / 32, bit th_ix & 31):
 // backdrop sign (1 is +1, 0 is -1) and "element is a line segment".
 shared uint sh_bd_sign[N_SLICE];
 shared uint sh_is_segment[N_SLICE];

 // Shared state for parallel segment output stage

 // Count of total number of segments in each tile, then
 // inclusive prefix sum of same.
 shared uint sh_seg_count[N_TILE];
 // Base offset of the segment storage claimed from `alloc` for the current
 // batch (written by the last invocation, read by all after a barrier).
 shared uint sh_seg_alloc;

 // scale factors useful for converting coordinates to tiles
 #define SX (1.0 / float(TILE_WIDTH_PX))
 #define SY (1.0 / float(TILE_HEIGHT_PX))

 // Make sure the command stream at cmd_ref can take another command.
 //
 // While cmd_ref.offset has not passed cmd_limit nothing happens. Once it
 // has, a fresh region of PTCL_INITIAL_ALLOC is claimed by atomically bumping
 // `alloc`, a jump command pointing at that region is written at the current
 // position, and both cmd_ref and cmd_limit are updated to describe the new
 // region (the limit leaves room for two commands at its end).
 //
 // Perhaps cmd_limit should be a global? This is a style question.
 void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
     if (cmd_ref.offset <= cmd_limit) {
         return;
     }
     uint fresh = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
     Cmd_Jump_write(cmd_ref, CmdJump(fresh));
     cmd_limit = fresh + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
     cmd_ref = CmdRef(fresh);
 }

 #define CHUNK_ALLOC_SLAB 16

 uint alloc_chunk_remaining;
 uint alloc_chunk_offset;

 // Return a reference to one unused SegChunk slot.
 //
 // Slots are handed out sequentially from a slab of CHUNK_ALLOC_SLAB chunks
 // tracked by alloc_chunk_offset / alloc_chunk_remaining. When the slab is
 // used up, a new one is claimed from `alloc` with a single atomicAdd.
 SegChunkRef alloc_seg_chunk() {
     if (alloc_chunk_remaining == 0) {
         alloc_chunk_remaining = CHUNK_ALLOC_SLAB;
         alloc_chunk_offset = atomicAdd(alloc, CHUNK_ALLOC_SLAB * SegChunk_size);
     }
     SegChunkRef slot = SegChunkRef(alloc_chunk_offset);
     alloc_chunk_remaining -= 1;
     alloc_chunk_offset += SegChunk_size;
     return slot;
 }

 // Compute the signed backdrop contribution of a set of elements.
 //
 // bd_bitmap selects the participating bits; bd_sign gives each one's sign.
 // A selected bit with sign 1 contributes +1, a selected bit with sign 0
 // contributes -1. Returns the sum of those contributions.
 int count_backdrop(uint bd_bitmap, uint bd_sign) {
     uint plus_bits = bd_bitmap & bd_sign;
     uint minus_bits = bd_bitmap & ~bd_sign;
     return bitCount(plus_bits) - bitCount(minus_bits);
 }

 // Implementation of 16 x 16 boolean matrix transpose, using threadgroup shared memory.
 // One exchange step used by transpose().
 //
 // a: this invocation's word; b: the partner invocation's word;
 // m: bit mask for this step; s: shift distance (also the invocation-index
 // bit that decides which side of the exchange this invocation is on).
 //
 // Invocations whose index has bit s clear keep the bits of a under m and
 // take the remaining bits from b shifted left by s; the others keep the bits
 // of a under ~m and take the remaining bits from b shifted right by s.
 uint block_swap(uint a, uint b, uint m, uint s) {
     bool bit_clear = (gl_LocalInvocationID.x & s) == 0;
     uint keep = bit_clear ? m : ~m;
     uint incoming = bit_clear ? (b << s) : (b >> s);
     return (a & keep) | (incoming & ~keep);
 }

 const uint masks[4] = uint[4](0x55555555, 0x33333333, 0xf0f0f0f, 0xff00ff);

 shared uint tg_bms[N_TILE];

 // Transpose a bit matrix whose rows are spread across invocations.
 //
 // Runs four exchange stages with strides 1, 2, 4 and 8. In each stage every
 // invocation publishes its word in tg_bms, waits at a barrier, combines it
 // with the word of the invocation whose index differs in the stride bit
 // (via block_swap), and waits again so tg_bms can be overwritten safely.
 // Must be called by all invocations of the workgroup because of the barriers.
 uint transpose(uint bitmask) {
     uint self = gl_LocalInvocationID.x;
     uint stage = 0;
     while (stage < 4) {
         uint stride = 1u << stage;
         tg_bms[self] = bitmask;
         barrier();
         uint partner_word = tg_bms[self ^ stride];
         bitmask = block_swap(bitmask, partner_word, masks[stage], stride);
         barrier();
         stage++;
     }
     return bitmask;
 }

 // Coarse rasterizer entry point.
 //
 // One workgroup processes one bin. The work proceeds in batches of up to
 // N_TILE elements:
 //   1. Merge: the N_WG per-bin input chunk lists are merged by repeatedly
 //      picking the list whose next chunk starts with the smallest element
 //      index and copying that chunk's instances into the ring buffers.
 //   2. Coverage: each invocation takes one element of the batch and ORs the
 //      tiles it touches (and, for fill lines, the backdrop span) into the
 //      shared bitmaps, which are then transposed so that each invocation
 //      can read the bitmap for "its" tile.
 //   3. Segment output: segment counts per tile are prefix-summed, storage is
 //      claimed from `alloc`, and all invocations cooperatively write the
 //      segments.
 //   4. Command output: each invocation walks the non-segment elements
 //      covering its tile and appends fill / solid / stroke commands to that
 //      tile's command list.
 // All barrier() calls sit in control flow that depends only on values shared
 // by the whole workgroup (wr_ix, rd_ix, sh_selected_n).
 void main() {
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
     uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
     // Top left coordinates of this bin.
     vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
     uint th_ix = gl_LocalInvocationID.x;

     // The tile this invocation writes commands for, and the start of its
     // initial command list region.
     uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
     uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
     uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
     CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
     uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;

     // Allocation and management of segment output
     SegChunkRef first_seg_chunk = SegChunkRef(0);
     SegChunkRef last_chunk_ref = SegChunkRef(0);
     uint last_chunk_n = 0;
     SegmentRef last_chunk_segs = SegmentRef(0);
     alloc_chunk_remaining = 0;

     // Write and read cursors into the element ring buffers.
     uint wr_ix = 0;
     uint rd_ix = 0;
     if (th_ix < N_WG) {
         // Load the head chunk of input stream th_ix for this bin.
         uint start_chunk = (bin_ix * N_WG + th_ix) * BIN_INITIAL_ALLOC;
         sh_chunk[th_ix] = start_chunk;
         BinChunk chunk = BinChunk_read(BinChunkRef(start_chunk));
         sh_chunk_n[th_ix] = chunk.n;
         sh_chunk_next[th_ix] = chunk.next.offset;
         sh_first_el[th_ix] = chunk.n > 0 ?
             BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
     }
     if (th_ix < N_SLICE) {
         sh_bd_sign[th_ix] = 0;
     }
     // Backdrop accumulated for this tile; carried across batches until a
     // fill element consumes it.
     int backdrop = 0;
     while (true) {
         for (uint i = 0; i < N_SLICE; i++) {
             sh_bitmaps[i][th_ix] = 0;
             sh_backdrop[i][th_ix] = 0;
         }
         // sh_is_segment has only N_SLICE entries, so it must be cleared by
         // the first N_SLICE invocations only; indexing it with an unguarded
         // th_ix would write past the end of the array.
         if (th_ix < N_SLICE) {
             sh_is_segment[th_ix] = 0;
         }

         while (wr_ix - rd_ix <= N_TILE) {
             // Choose segment with least element.
             uint my_min;
             if (th_ix < N_WG) {
                 if (th_ix == 0) {
                     sh_selected_n = 0;
                     sh_min_buf = ~0;
                 }
             }
             barrier();
             // Tempting to do this with subgroups, but atomic should be good enough.
             if (th_ix < N_WG) {
                 my_min = sh_first_el[th_ix];
                 atomicMin(sh_min_buf, my_min);
             }
             barrier();
             if (th_ix < N_WG) {
                 if (my_min == sh_min_buf && my_min != ~0) {
                     // This stream holds the minimum: publish its chunk and
                     // advance the stream to its next chunk.
                     sh_elements_ref = sh_chunk[th_ix] + BinChunk_size;
                     uint selected_n = sh_chunk_n[th_ix];
                     sh_selected_n = selected_n;
                     uint next_chunk = sh_chunk_next[th_ix];
                     if (next_chunk == 0) {
                         sh_first_el[th_ix] = ~0;
                     } else {
                         sh_chunk[th_ix] = next_chunk;
                         BinChunk chunk = BinChunk_read(BinChunkRef(next_chunk));
                         sh_chunk_n[th_ix] = chunk.n;
                         sh_chunk_next[th_ix] = chunk.next.offset;
                         sh_first_el[th_ix] = BinInstance_read(
                             BinInstanceRef(next_chunk + BinChunk_size)).element_ix;
                     }
                 }
             }
             barrier();
             uint chunk_n = sh_selected_n;
             if (chunk_n == 0) {
                 // All chunks consumed
                 break;
             }
             // Copy the selected chunk's instances into the ring buffers,
             // one instance per invocation.
             BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
             if (th_ix < chunk_n) {
                 BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix));
                 uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF;
                 sh_elements[wr_el_ix] = inst.element_ix;
                 sh_right_edge[wr_el_ix] = inst.right_edge;
             }
             wr_ix += chunk_n;
         }
         barrier();

         // We've done the merge and filled the buffer.

         // Read one element, compute coverage.
         uint tag = Annotated_Nop;
         AnnotatedRef ref;
         float right_edge = 0.0;
         if (th_ix + rd_ix < wr_ix) {
             uint rd_el_ix = (rd_ix + th_ix) % N_RINGBUF;
             uint element_ix = sh_elements[rd_el_ix];
             right_edge = sh_right_edge[rd_el_ix];
             ref = AnnotatedRef(element_ix * Annotated_size);
             tag = Annotated_tag(ref);
         }

         // Setup for coverage algorithm. Everything is zero-initialized so
         // that invocations without an element (default case below) never
         // read undefined values in the bounding-box arithmetic.
         float a = 0.0, b = 0.0, c = 0.0;
         // Bounding box of element in pixel coordinates.
         float xmin = 0.0, xmax = 0.0, ymin = 0.0, ymax = 0.0;
         // Word and bit that represent this invocation's element in the
         // per-element bit sets.
         uint my_slice = th_ix / 32;
         uint my_mask = 1 << (th_ix & 31);
         switch (tag) {
         case Annotated_FillLine:
         case Annotated_StrokeLine:
             AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
             xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
             xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
             ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
             ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
             float dx = line.p1.x - line.p0.x;
             float dy = line.p1.y - line.p0.y;
             if (tag == Annotated_FillLine) {
                 // Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
                 if (dy < 0) {
                     atomicOr(sh_bd_sign[my_slice], my_mask);
                 } else {
                     atomicAnd(sh_bd_sign[my_slice], ~my_mask);
                 }
             }
             atomicOr(sh_is_segment[my_slice], my_mask);
             // Set up for per-scanline coverage formula, below.
             float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
             c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
             b = invslope; // Note: assumes square tiles, otherwise scale.
             a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
             break;
         case Annotated_Fill:
         case Annotated_Stroke:
             // Note: we take advantage of the fact that fills and strokes
             // have compatible layout.
             AnnoFill fill = Annotated_Fill_read(ref);
             xmin = fill.bbox.x;
             xmax = fill.bbox.z;
             ymin = fill.bbox.y;
             ymax = fill.bbox.w;
             // Just let the clamping to xmin and xmax determine the bounds.
             a = 0.0;
             b = 0.0;
             c = 1e9;
             break;
         default:
             // Empty vertical extent: the scanline loop below does nothing.
             ymin = 0;
             ymax = 0;
             break;
         }

         // Draw the coverage area into the bitmasks. This uses an algorithm
         // that computes the coverage of a span for given scanline.

         // Compute bounding box in tiles and clip to this bin.
         int x0 = int(floor((xmin - xy0.x) * SX));
         int x1 = int(ceil((xmax - xy0.x) * SX));
         int xr = int(ceil((right_edge - xy0.x) * SX));
         int y0 = int(floor((ymin - xy0.y) * SY));
         int y1 = int(ceil((ymax - xy0.y) * SY));
         x0 = clamp(x0, 0, N_TILE_X);
         x1 = clamp(x1, x0, N_TILE_X);
         xr = clamp(xr, 0, N_TILE_X);
         y0 = clamp(y0, 0, N_TILE_Y);
         y1 = clamp(y1, y0, N_TILE_Y);
         // t is the span center (in tiles) for the current tile row.
         float t = a + b * float(y0);
         for (uint y = y0; y < y1; y++) {
             uint xx0 = clamp(int(floor(t - c)), x0, x1);
             uint xx1 = clamp(int(ceil(t + c)), x0, x1);
             // Bits xx0..xx1-1, placed in the low or high half-word depending
             // on bit 4 of th_ix; transpose() later rearranges these words.
             uint mask = ((1 << xx1) - (1 << xx0)) << (th_ix & 16);
             atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + (th_ix & 0xf)], mask);
             if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
                 // Assign backdrop to all tiles to the right of the ray crossing the
                 // top edge of this tile, up to the right edge of the fill bbox.
                 float xray = t - 0.5 * b;
 #ifdef TRANSPOSE_BACKDROP
                 xx0 = clamp(int(ceil(xray)), 0, xr);
                 uint mask = ((1 << xr) - (1 << xx0)) << (th_ix & 16);
                 atomicOr(sh_backdrop[my_slice][y * N_TILE_X + (th_ix & 0xf)], mask);
 #else
                 xx0 = max(int(ceil(xray)), 0);
                 for (uint x = xx0; x < xr; x++) {
                     atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
                 }
 #endif
             }
             t += b;
         }
         barrier();
         for (uint i = 0; i < N_SLICE; i++) {
             sh_bitmaps[i][th_ix] = transpose(sh_bitmaps[i][th_ix]);
 #ifdef TRANSPOSE_BACKDROP
             sh_backdrop[i][th_ix] = transpose(sh_backdrop[i][th_ix]);
 #endif
         }
         barrier();

         // We've computed coverage and other info for each element in the input, now for
         // the output stage. We'll do segments first using a more parallel algorithm.

         uint seg_count = 0;
         for (uint i = 0; i < N_SLICE; i++) {
             seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
         }
         sh_seg_count[th_ix] = seg_count;
         // Prefix sum of sh_seg_count
         for (uint i = 0; i < LG_N_TILE; i++) {
             barrier();
             if (th_ix >= (1 << i)) {
                 seg_count += sh_seg_count[th_ix - (1 << i)];
             }
             barrier();
             sh_seg_count[th_ix] = seg_count;
         }
         // The last invocation holds the grand total and claims storage for
         // all segments of this batch in one atomicAdd.
         if (th_ix == N_TILE - 1) {
             sh_seg_alloc = atomicAdd(alloc, seg_count * Segment_size);
         }
         barrier();
         uint total_seg_count = sh_seg_count[N_TILE - 1];
         uint seg_alloc = sh_seg_alloc;

         // Output buffer is allocated as segments for each tile laid end-to-end.

         for (uint ix = th_ix; ix < total_seg_count; ix += N_TILE) {
             // Find the work item; this thread is now not bound to an element or tile.
             // First find the tile (by binary search)
             // (This tile_ix is local to the loop and shadows the outer one.)
             uint tile_ix = 0;
             for (uint i = 0; i < LG_N_TILE; i++) {
                 uint probe = tile_ix + ((N_TILE / 2) >> i);
                 if (ix >= sh_seg_count[probe - 1]) {
                     tile_ix = probe;
                 }
             }
             // Now, sh_seg_count[tile_ix - 1] <= ix < sh_seg_count[tile_ix].
             // (considering sh_seg_count[-1] == 0)

             // Index of segment within tile's segments
             uint seq_ix = ix;
             // Maybe consider a sentinel value to avoid the conditional?
             if (tile_ix > 0) {
                 seq_ix -= sh_seg_count[tile_ix - 1];
             }
             // Find the segment. This is done by linear scan through the bitmaps of the
             // tile, accelerated by bit counting. Binary search might help, maybe not.
             uint slice_ix = 0;
             uint seq_bits;

             while (true) {
                 seq_bits = sh_bitmaps[slice_ix][tile_ix] & sh_is_segment[slice_ix];
                 uint this_count = bitCount(seq_bits);
                 if (this_count > seq_ix) {
                     break;
                 }
                 seq_ix -= this_count;
                 slice_ix++;
             }
             // Now find position of nth bit set (n = seq_ix) in seq_bits; binary search
             uint bit_ix = 0;
             for (int i = 0; i < 5; i++) {
                 uint probe = bit_ix + (16 >> i);
                 if (seq_ix >= bitCount(seq_bits & ((1 << probe) - 1))) {
                     bit_ix = probe;
                 }
             }
             uint rd_el_ix = (rd_ix + slice_ix * 32 + bit_ix) % N_RINGBUF;
             uint element_ix = sh_elements[rd_el_ix];
             ref = AnnotatedRef(element_ix * Annotated_size);
             AnnoFillLineSeg line = Annotated_FillLine_read(ref);
             float y_edge = 0.0;
             // This is basically the same logic as piet-metal, but should be made numerically robust.
             if (Annotated_tag(ref) == Annotated_FillLine) {
                 vec2 tile_xy = xy0 + vec2((tile_ix % N_TILE_X) * TILE_WIDTH_PX, (tile_ix / N_TILE_X) * TILE_HEIGHT_PX);
                 y_edge = mix(line.p0.y, line.p1.y, (tile_xy.x - line.p0.x) / (line.p1.x - line.p0.x));
                 if (min(line.p0.x, line.p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) {
                     // The line crosses this tile's left edge: clip the
                     // endpoint lying left of the tile to the crossing point.
                     if (line.p0.x > line.p1.x) {
                         line.p1 = vec2(tile_xy.x, y_edge);
                     } else {
                         line.p0 = vec2(tile_xy.x, y_edge);
                     }
                 } else {
                     y_edge = 1e9;
                 }
             }
             Segment seg = Segment(line.p0, line.p1, y_edge);
             Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
         }

         // Output non-segment elements for this tile. The thread does a sequential walk
         // through the non-segment elements, and for segments, count and backdrop are
         // aggregated using bit counting.
         uint slice_ix = 0;
         uint bitmap = sh_bitmaps[0][th_ix];
         uint bd_bitmap = sh_backdrop[0][th_ix];
         uint bd_sign = sh_bd_sign[0];
         uint is_segment = sh_is_segment[0];
         // Index (within this batch's segment storage) of this tile's first segment.
         uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
         seg_count = 0;
         while (true) {
             uint nonseg_bitmap = bitmap & ~is_segment;
             if (nonseg_bitmap == 0) {
                 // Only segments remain in this slice: fold them in and move on.
                 backdrop += count_backdrop(bd_bitmap, bd_sign);
                 seg_count += bitCount(bitmap & is_segment);
                 slice_ix++;
                 if (slice_ix == N_SLICE) {
                     break;
                 }
                 bitmap = sh_bitmaps[slice_ix][th_ix];
                 bd_bitmap = sh_backdrop[slice_ix][th_ix];
                 bd_sign = sh_bd_sign[slice_ix];
                 is_segment = sh_is_segment[slice_ix];
                 nonseg_bitmap = bitmap & ~is_segment;
                 if (nonseg_bitmap == 0) {
                     continue;
                 }
             }
             uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
             uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];

             // Bits up to and including the lsb
             uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
             backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
             seg_count += bitCount(bitmap & bd_mask & is_segment);
             // Clear bits that have been consumed.
             bd_bitmap &= ~bd_mask;
             bitmap &= ~bd_mask;

             // At this point, we read the element again from global memory.
             // If that turns out to be expensive, maybe we can pack it into
             // shared memory (or perhaps just the tag).
             ref = AnnotatedRef(element_ix * Annotated_size);
             tag = Annotated_tag(ref);

             switch (tag) {
             case Annotated_Fill:
                 if (last_chunk_n > 0 || seg_count > 0) {
                     SegChunkRef chunk_ref = SegChunkRef(0);
                     if (seg_count > 0) {
                         // Segments from this batch: write their chunk now.
                         chunk_ref = alloc_seg_chunk();
                         SegChunk chunk;
                         chunk.n = seg_count;
                         chunk.next = SegChunkRef(0);
                         uint seg_offset = seg_alloc + seg_start * Segment_size;
                         chunk.segs = SegmentRef(seg_offset);
                         SegChunk_write(chunk_ref, chunk);
                     }
                     if (last_chunk_n > 0) {
                         // Finish the chunk deferred from an earlier batch,
                         // now that its successor is known.
                         SegChunk chunk;
                         chunk.n = last_chunk_n;
                         chunk.next = chunk_ref;
                         chunk.segs = last_chunk_segs;
                         SegChunk_write(last_chunk_ref, chunk);
                     } else {
                         first_seg_chunk = chunk_ref;
                     }

                     AnnoFill fill = Annotated_Fill_read(ref);
                     CmdFill cmd_fill;
                     cmd_fill.seg_ref = first_seg_chunk;
                     cmd_fill.backdrop = backdrop;
                     cmd_fill.rgba_color = fill.rgba_color;
                     alloc_cmd(cmd_ref, cmd_limit);
                     Cmd_Fill_write(cmd_ref, cmd_fill);
                     cmd_ref.offset += Cmd_size;
                     last_chunk_n = 0;
                 } else if (backdrop != 0) {
                     // No segments in this tile but a nonzero backdrop:
                     // emit a solid-color command.
                     AnnoFill fill = Annotated_Fill_read(ref);
                     alloc_cmd(cmd_ref, cmd_limit);
                     Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
                     cmd_ref.offset += Cmd_size;
                 }
                 seg_start += seg_count;
                 seg_count = 0;
                 backdrop = 0;
                 break;
             case Annotated_Stroke:
                 // TODO: reduce divergence & code duplication? Much of the
                 // fill and stroke processing is in common.
                 if (last_chunk_n > 0 || seg_count > 0) {
                     SegChunkRef chunk_ref = SegChunkRef(0);
                     if (seg_count > 0) {
                         chunk_ref = alloc_seg_chunk();
                         SegChunk chunk;
                         chunk.n = seg_count;
                         chunk.next = SegChunkRef(0);
                         uint seg_offset = seg_alloc + seg_start * Segment_size;
                         chunk.segs = SegmentRef(seg_offset);
                         SegChunk_write(chunk_ref, chunk);
                     }
                     if (last_chunk_n > 0) {
                         SegChunk chunk;
                         chunk.n = last_chunk_n;
                         chunk.next = chunk_ref;
                         chunk.segs = last_chunk_segs;
                         SegChunk_write(last_chunk_ref, chunk);
                     } else {
                         first_seg_chunk = chunk_ref;
                     }

                     AnnoStroke stroke = Annotated_Stroke_read(ref);
                     CmdStroke cmd_stroke;
                     cmd_stroke.seg_ref = first_seg_chunk;
                     cmd_stroke.half_width = 0.5 * stroke.linewidth;
                     cmd_stroke.rgba_color = stroke.rgba_color;
                     alloc_cmd(cmd_ref, cmd_limit);
                     Cmd_Stroke_write(cmd_ref, cmd_stroke);
                     cmd_ref.offset += Cmd_size;
                     last_chunk_n = 0;
                 }
                 seg_start += seg_count;
                 seg_count = 0;
                 break;
             default:
                 // This shouldn't happen, but just in case.
                 seg_start++;
                 break;
             }
         }
         if (seg_count > 0) {
             // Segments left over with no fill/stroke element yet in this
             // batch: reserve a chunk and defer writing it until the next
             // chunk in the list (or the consuming command) is known.
             SegChunkRef chunk_ref = alloc_seg_chunk();
             if (last_chunk_n > 0) {
                 SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
             } else {
                 first_seg_chunk = chunk_ref;
             }
             // TODO: free two registers by writing count and segments ref now,
             // as opposed to deferring SegChunk write until all fields are known.
             last_chunk_ref = chunk_ref;
             last_chunk_n = seg_count;
             uint seg_offset = seg_alloc + seg_start * Segment_size;
             last_chunk_segs = SegmentRef(seg_offset);
         }
         barrier();

         rd_ix += N_TILE;
         // The second disjunct is there as a strange workaround on Nvidia. If it is
         // removed, then the kernel fails with ERROR_DEVICE_LOST.
         if (rd_ix >= wr_ix || bin_ix == ~0) break;
     }
     Cmd_End_write(cmd_ref);
 }
	// The coarse rasterizer stage of the pipeline.

	#version 450
	#extension GL_GOOGLE_include_directive : enable

	#include "setup.h"

	layout(local_size_x = N_TILE, local_size_y = 1) in;

	layout(set = 0, binding = 0) buffer AnnotatedBuf {
	uint[] annotated;
	};

	layout(set = 0, binding = 1) buffer BinsBuf {
	uint[] bins;
	};

	layout(set = 0, binding = 2) buffer AllocBuf {
	uint alloc;
	};

	layout(set = 0, binding = 3) buffer PtclBuf {
	uint[] ptcl;
	};

	#include "annotated.h"
	#include "bins.h"
	#include "ptcl.h"

	#define N_RINGBUF 512

	#define TRANSPOSE_BACKDROP

	// Ring buffers (indexed modulo N_RINGBUF) holding, for each merged bin
	// instance, its element index and its right edge.
	shared uint sh_elements[N_RINGBUF];
	shared float sh_right_edge[N_RINGBUF];
	// Per input stream (one per th_ix < N_WG): offset of the current chunk,
	// offset of the chunk after it (0 is treated as "none"), and the current
	// chunk's instance count.
	shared uint sh_chunk[N_WG];
	shared uint sh_chunk_next[N_WG];
	shared uint sh_chunk_n[N_WG];
	// Scratch for the atomicMin reduction over sh_first_el.
	shared uint sh_min_buf;
	// Some of these are kept in shared memory to ease register
	// pressure, but it could go either way.
	// First element index of each stream's current chunk; ~0 marks a stream
	// with nothing left to merge.
	shared uint sh_first_el[N_WG];
	// Instance count of the chunk selected in the current merge step
	// (0 when no chunk was selected) and the offset of its first instance.
	shared uint sh_selected_n;
	shared uint sh_elements_ref;

	// Coverage and backdrop bitmaps; filled with atomicOr during
	// rasterization and then passed word by word through transpose().
	shared uint sh_bitmaps[N_SLICE][N_TILE];
	shared uint sh_backdrop[N_SLICE][N_TILE];
	// One bit per element of the current batch (word th_ix / 32, bit th_ix & 31):
	// backdrop sign (1 is +1, 0 is -1) and "element is a line segment".
	shared uint sh_bd_sign[N_SLICE];
	shared uint sh_is_segment[N_SLICE];

	// Shared state for parallel segment output stage

	// Count of total number of segments in each tile, then
	// inclusive prefix sum of same.
	shared uint sh_seg_count[N_TILE];
	// Base offset of the segment storage claimed from `alloc` for the
	// current batch.
	shared uint sh_seg_alloc;

	// scale factors useful for converting coordinates to tiles
	#define SX (1.0 / float(TILE_WIDTH_PX))
	#define SY (1.0 / float(TILE_HEIGHT_PX))

	// Make sure the command stream at cmd_ref can take another command.
	//
	// While cmd_ref.offset has not passed cmd_limit nothing happens. Once it
	// has, a fresh region of PTCL_INITIAL_ALLOC is claimed by atomically
	// bumping `alloc`, a jump command pointing at that region is written at
	// the current position, and cmd_ref and cmd_limit are updated to describe
	// the new region.
	//
	// Perhaps cmd_limit should be a global? This is a style question.
	void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
		if (cmd_ref.offset <= cmd_limit) {
			return;
		}
		uint fresh = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
		Cmd_Jump_write(cmd_ref, CmdJump(fresh));
		cmd_limit = fresh + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
		cmd_ref = CmdRef(fresh);
	}

	#define CHUNK_ALLOC_SLAB 16

	uint alloc_chunk_remaining;
	uint alloc_chunk_offset;

	// Return a reference to one unused SegChunk slot.
	//
	// Slots are handed out sequentially from a slab of CHUNK_ALLOC_SLAB chunks
	// tracked by alloc_chunk_offset / alloc_chunk_remaining. When the slab is
	// used up, a new one is claimed from `alloc` with a single atomicAdd.
	SegChunkRef alloc_seg_chunk() {
		if (alloc_chunk_remaining == 0) {
			alloc_chunk_remaining = CHUNK_ALLOC_SLAB;
			alloc_chunk_offset = atomicAdd(alloc, CHUNK_ALLOC_SLAB * SegChunk_size);
		}
		SegChunkRef slot = SegChunkRef(alloc_chunk_offset);
		alloc_chunk_remaining -= 1;
		alloc_chunk_offset += SegChunk_size;
		return slot;
	}

	// Compute the signed backdrop contribution of a set of elements.
	//
	// bd_bitmap selects the participating bits; bd_sign gives each one's sign.
	// A selected bit with sign 1 contributes +1, a selected bit with sign 0
	// contributes -1. Returns the sum of those contributions.
	int count_backdrop(uint bd_bitmap, uint bd_sign) {
		uint plus_bits = bd_bitmap & bd_sign;
		uint minus_bits = bd_bitmap & ~bd_sign;
		return bitCount(plus_bits) - bitCount(minus_bits);
	}

	// Implementation of 16 x 16 boolean matrix transpose, using threadgroup shared memory.
	// One exchange step used by transpose().
	//
	// a: this invocation's word; b: the partner invocation's word;
	// m: bit mask for this step; s: shift distance (also the invocation-index
	// bit that decides which side of the exchange this invocation is on).
	//
	// Invocations whose index has bit s clear keep the bits of a under m and
	// take the remaining bits from b shifted left by s; the others keep the
	// bits of a under ~m and take the remaining bits from b shifted right by s.
	uint block_swap(uint a, uint b, uint m, uint s) {
		uint c;
		if ((gl_LocalInvocationID.x & s) == 0) {
			c = b << s;
		} else {
			m = ~m;
			c = b >> s;
		}
		// Bitwise OR of the kept and incoming bits (the stray backslash
		// before '|' was an escaping artifact and is not valid GLSL).
		return (a & m) | (c & ~m);
	}

	const uint masks[4] = uint[4](0x55555555, 0x33333333, 0xf0f0f0f, 0xff00ff);

	shared uint tg_bms[N_TILE];

	// Transpose a bit matrix whose rows are spread across invocations.
	//
	// Runs four exchange stages with strides 1, 2, 4 and 8. In each stage every
	// invocation publishes its word in tg_bms, waits at a barrier, combines it
	// with the word of the invocation whose index differs in the stride bit
	// (via block_swap), and waits again so tg_bms can be overwritten safely.
	// Must be called by all invocations of the workgroup because of the barriers.
	uint transpose(uint bitmask) {
		uint self = gl_LocalInvocationID.x;
		uint stage = 0;
		while (stage < 4) {
			uint stride = 1u << stage;
			tg_bms[self] = bitmask;
			barrier();
			uint partner_word = tg_bms[self ^ stride];
			bitmask = block_swap(bitmask, partner_word, masks[stage], stride);
			barrier();
			stage++;
		}
		return bitmask;
	}

	void main() {
	// Could use either linear or 2d layouts for both dispatch and
	// invocations within the workgroup. We'll use variables to abstract.
	uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
	// Top left coordinates of this bin.
	vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
	uint th_ix = gl_LocalInvocationID.x;

	uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
	uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
	uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
	CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
	uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;

	// Allocation and management of segment output
	SegChunkRef first_seg_chunk = SegChunkRef(0);
	SegChunkRef last_chunk_ref = SegChunkRef(0);
	uint last_chunk_n = 0;
	SegmentRef last_chunk_segs = SegmentRef(0);
	alloc_chunk_remaining = 0;

	uint wr_ix = 0;
	uint rd_ix = 0;
	uint first_el;
	if (th_ix < N_WG) {
	uint start_chunk = (bin_ix * N_WG + th_ix) * BIN_INITIAL_ALLOC;
	sh_chunk[th_ix] = start_chunk;
	BinChunk chunk = BinChunk_read(BinChunkRef(start_chunk));
	sh_chunk_n[th_ix] = chunk.n;
	sh_chunk_next[th_ix] = chunk.next.offset;
	sh_first_el[th_ix] = chunk.n > 0 ?
	BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
	}
	if (th_ix < N_SLICE) {
	sh_bd_sign[th_ix] = 0;
	}
	int backdrop = 0;
	while (true) {
	for (uint i = 0; i < N_SLICE; i++) {
	sh_bitmaps[i][th_ix] = 0;
	sh_backdrop[i][th_ix] = 0;
	sh_is_segment[th_ix] = 0;
	}

	while (wr_ix - rd_ix <= N_TILE) {
	// Choose segment with least element.
	uint my_min;
	if (th_ix < N_WG) {
	if (th_ix == 0) {
	sh_selected_n = 0;
	sh_min_buf = ~0;
	}
	}
	barrier();
	// Tempting to do this with subgroups, but atomic should be good enough.
	if (th_ix < N_WG) {
	my_min = sh_first_el[th_ix];
	atomicMin(sh_min_buf, my_min);
	}
	barrier();
	if (th_ix < N_WG) {
	if (my_min == sh_min_buf && my_min != ~0) {
	sh_elements_ref = sh_chunk[th_ix] + BinChunk_size;
	uint selected_n = sh_chunk_n[th_ix];
	sh_selected_n = selected_n;
	uint next_chunk = sh_chunk_next[th_ix];
	if (next_chunk == 0) {
	sh_first_el[th_ix] = ~0;
	} else {
	sh_chunk[th_ix] = next_chunk;
	BinChunk chunk = BinChunk_read(BinChunkRef(next_chunk));
	sh_chunk_n[th_ix] = chunk.n;
	sh_chunk_next[th_ix] = chunk.next.offset;
	sh_first_el[th_ix] = BinInstance_read(
	BinInstanceRef(next_chunk + BinChunk_size)).element_ix;
	}
	}
	}
	barrier();
	uint chunk_n = sh_selected_n;
	if (chunk_n == 0) {
	// All chunks consumed
	break;
	}
	BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
	if (th_ix < chunk_n) {
	BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix));
	uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF;
	sh_elements[wr_el_ix] = inst.element_ix;
	sh_right_edge[wr_el_ix] = inst.right_edge;
	}
	wr_ix += chunk_n;
	}
	barrier();

	// We've done the merge and filled the buffer.

	// Read one element, compute coverage.
	uint tag = Annotated_Nop;
	AnnotatedRef ref;
	float right_edge = 0.0;
	if (th_ix + rd_ix < wr_ix) {
	uint rd_el_ix = (rd_ix + th_ix) % N_RINGBUF;
	uint element_ix = sh_elements[rd_el_ix];
	right_edge = sh_right_edge[rd_el_ix];
	ref = AnnotatedRef(element_ix * Annotated_size);
	tag = Annotated_tag(ref);
	}

	// Setup for coverage algorithm.
	float a, b, c;
	// Bounding box of element in pixel coordinates.
	float xmin, xmax, ymin, ymax;
	uint my_slice = th_ix / 32;
	uint my_mask = 1 << (th_ix & 31);
	switch (tag) {
	case Annotated_FillLine:
	case Annotated_StrokeLine:
	AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
	xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
	xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
	ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
	ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
	float dx = line.p1.x - line.p0.x;
	float dy = line.p1.y - line.p0.y;
	if (tag == Annotated_FillLine) {
	// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
	if (dy < 0) {
	atomicOr(sh_bd_sign[my_slice], my_mask);
	} else {
	atomicAnd(sh_bd_sign[my_slice], ~my_mask);
	}
	}
	atomicOr(sh_is_segment[my_slice], my_mask);
	// Set up for per-scanline coverage formula, below.
	float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
	c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
	b = invslope; // Note: assumes square tiles, otherwise scale.
	a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
	break;
	case Annotated_Fill:
	case Annotated_Stroke:
	// Note: we take advantage of the fact that fills and strokes
	// have compatible layout.
	AnnoFill fill = Annotated_Fill_read(ref);
	xmin = fill.bbox.x;
	xmax = fill.bbox.z;
	ymin = fill.bbox.y;
	ymax = fill.bbox.w;
	// Just let the clamping to xmin and xmax determine the bounds.
	a = 0.0;
	b = 0.0;
	c = 1e9;
	break;
	default:
	ymin = 0;
	ymax = 0;
	break;
	}

	// Draw the coverage area into the bitmasks. This uses an algorithm
	// that computes the coverage of a span for given scanline.

	// Compute bounding box in tiles and clip to this bin.
	int x0 = int(floor((xmin - xy0.x) * SX));
	int x1 = int(ceil((xmax - xy0.x) * SX));
	int xr = int(ceil((right_edge - xy0.x) * SX));
	int y0 = int(floor((ymin - xy0.y) * SY));
	int y1 = int(ceil((ymax - xy0.y) * SY));
	x0 = clamp(x0, 0, N_TILE_X);
	x1 = clamp(x1, x0, N_TILE_X);
	xr = clamp(xr, 0, N_TILE_X);
	y0 = clamp(y0, 0, N_TILE_Y);
	y1 = clamp(y1, y0, N_TILE_Y);
	float t = a + b * float(y0);
	for (uint y = y0; y < y1; y++) {
	uint xx0 = clamp(int(floor(t - c)), x0, x1);
	uint xx1 = clamp(int(ceil(t + c)), x0, x1);
	uint mask = ((1 << xx1) - (1 << xx0)) << (th_ix & 16);
	atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + (th_ix & 0xf)], mask);
	if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
	// Assign backdrop to all tiles to the right of the ray crossing the
	// top edge of this tile, up to the right edge of the fill bbox.
	float xray = t - 0.5 * b;
	#ifdef TRANSPOSE_BACKDROP
	xx0 = clamp(int(ceil(xray)), 0, xr);
	uint mask = ((1 << xr) - (1 << xx0)) << (th_ix & 16);
	atomicOr(sh_backdrop[my_slice][y * N_TILE_X + (th_ix & 0xf)], mask);
	#else
	xx0 = max(int(ceil(xray)), 0);
	for (uint x = xx0; x < xr; x++) {
	atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
	}
	#endif
	}
	t += b;
	}
	barrier();
	for (uint i = 0; i < N_SLICE; i++) {
	sh_bitmaps[i][th_ix] = transpose(sh_bitmaps[i][th_ix]);
	#ifdef TRANSPOSE_BACKDROP
	sh_backdrop[i][th_ix] = transpose(sh_backdrop[i][th_ix]);
	#endif
	}
	barrier();

	// We've computed coverage and other info for each element in the input, now for
	// the output stage. We'll do segments first using a more parallel algorithm.

	uint seg_count = 0;
	for (uint i = 0; i < N_SLICE; i++) {
	seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
	}
	sh_seg_count[th_ix] = seg_count;
	// Prefix sum of sh_seg_count
	for (uint i = 0; i < LG_N_TILE; i++) {
	barrier();
	if (th_ix >= (1 << i)) {
	seg_count += sh_seg_count[th_ix - (1 << i)];
	}
	barrier();
	sh_seg_count[th_ix] = seg_count;
	}
	if (th_ix == N_TILE - 1) {
	sh_seg_alloc = atomicAdd(alloc, seg_count * Segment_size);
	}
	barrier();
	uint total_seg_count = sh_seg_count[N_TILE - 1];
	uint seg_alloc = sh_seg_alloc;

	// Output buffer is allocated as segments for each tile laid end-to-end.

	for (uint ix = th_ix; ix < total_seg_count; ix += N_TILE) {
	// Find the work item; this thread is now not bound to an element or tile.
	// First find the tile (by binary search)
	uint tile_ix = 0;
	for (uint i = 0; i < LG_N_TILE; i++) {
	uint probe = tile_ix + ((N_TILE / 2) >> i);
	if (ix >= sh_seg_count[probe - 1]) {
	tile_ix = probe;
	}
	}
	// Now, sh_seg_count[tile_ix - 1] <= ix < sh_seg_count[tile_ix].
	// (considering sh_seg_count[-1] == 0)

	// Index of segment within tile's segments
	uint seq_ix = ix;
	// Maybe consider a sentinel value to avoid the conditional?
	if (tile_ix > 0) {
	seq_ix -= sh_seg_count[tile_ix - 1];
	}
	// Find the segment. This is done by linear scan through the bitmaps of the
	// tile, accelerated by bit counting. Binary search might help, maybe not.
	uint slice_ix = 0;
	uint seq_bits;

	while (true) {
	seq_bits = sh_bitmaps[slice_ix][tile_ix] & sh_is_segment[slice_ix];
	uint this_count = bitCount(seq_bits);
	if (this_count > seq_ix) {
	break;
	}
	seq_ix -= this_count;
	slice_ix++;
	}
	// Now find position of nth bit set (n = seq_ix) in seq_bits; binary search
	uint bit_ix = 0;
	for (int i = 0; i < 5; i++) {
	uint probe = bit_ix + (16 >> i);
	if (seq_ix >= bitCount(seq_bits & ((1 << probe) - 1))) {
	bit_ix = probe;
	}
	}
	uint out_offset = seg_alloc + Segment_size * ix + SegChunk_size;
	uint rd_el_ix = (rd_ix + slice_ix * 32 + bit_ix) % N_RINGBUF;
	uint element_ix = sh_elements[rd_el_ix];
	ref = AnnotatedRef(element_ix * Annotated_size);
	AnnoFillLineSeg line = Annotated_FillLine_read(ref);
	float y_edge = 0.0;
	// This is basically the same logic as piet-metal, but should be made numerically robust.
	if (Annotated_tag(ref) == Annotated_FillLine) {
	vec2 tile_xy = xy0 + vec2((tile_ix % N_TILE_X) * TILE_WIDTH_PX, (tile_ix / N_TILE_X) * TILE_HEIGHT_PX);
	y_edge = mix(line.p0.y, line.p1.y, (tile_xy.x - line.p0.x) / (line.p1.x - line.p0.x));
	if (min(line.p0.x, line.p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) {
	if (line.p0.x > line.p1.x) {
	line.p1 = vec2(tile_xy.x, y_edge);
	} else {
	line.p0 = vec2(tile_xy.x, y_edge);
	}
	} else {
	y_edge = 1e9;
	}
	}
	Segment seg = Segment(line.p0, line.p1, y_edge);
	Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
	}

	// Output non-segment elements for this tile. The thread does a sequential walk
	// through the non-segment elements, and for segments, count and backdrop are
	// aggregated using bit counting.
	uint slice_ix = 0;
	uint bitmap = sh_bitmaps[0][th_ix];
	uint bd_bitmap = sh_backdrop[0][th_ix];
	uint bd_sign = sh_bd_sign[0];
	uint is_segment = sh_is_segment[0];
	uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
	seg_count = 0;
	while (true) {
	uint nonseg_bitmap = bitmap & ~is_segment;
	if (nonseg_bitmap == 0) {
	backdrop += count_backdrop(bd_bitmap, bd_sign);
	seg_count += bitCount(bitmap & is_segment);
	slice_ix++;
	if (slice_ix == N_SLICE) {
	break;
	}
	bitmap = sh_bitmaps[slice_ix][th_ix];
	bd_bitmap = sh_backdrop[slice_ix][th_ix];
	bd_sign = sh_bd_sign[slice_ix];
	is_segment = sh_is_segment[slice_ix];
	nonseg_bitmap = bitmap & ~is_segment;
	if (nonseg_bitmap == 0) {
	continue;
	}
	}
	uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
	uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];

	// Bits up to and including the lsb
	uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
	backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
	seg_count += bitCount(bitmap & bd_mask & is_segment);
	// Clear bits that have been consumed.
	bd_bitmap &= ~bd_mask;
	bitmap &= ~bd_mask;

	// At this point, we read the element again from global memory.
	// If that turns out to be expensive, maybe we can pack it into
	// shared memory (or perhaps just the tag).
	ref = AnnotatedRef(element_ix * Annotated_size);
	tag = Annotated_tag(ref);

	switch (tag) {
	case Annotated_Fill:
	if (last_chunk_n > 0 \|\| seg_count > 0) {
	SegChunkRef chunk_ref = SegChunkRef(0);
	if (seg_count > 0) {
	chunk_ref = alloc_seg_chunk();
	SegChunk chunk;
	chunk.n = seg_count;
	chunk.next = SegChunkRef(0);
	uint seg_offset = seg_alloc + seg_start * Segment_size;
	chunk.segs = SegmentRef(seg_offset);
	SegChunk_write(chunk_ref, chunk);
	}
	if (last_chunk_n > 0) {
	SegChunk chunk;
	chunk.n = last_chunk_n;
	chunk.next = chunk_ref;
	chunk.segs = last_chunk_segs;
	SegChunk_write(last_chunk_ref, chunk);
	} else {
	first_seg_chunk = chunk_ref;
	}

	AnnoFill fill = Annotated_Fill_read(ref);
	CmdFill cmd_fill;
	cmd_fill.seg_ref = first_seg_chunk;
	cmd_fill.backdrop = backdrop;
	cmd_fill.rgba_color = fill.rgba_color;
	alloc_cmd(cmd_ref, cmd_limit);
	Cmd_Fill_write(cmd_ref, cmd_fill);
	cmd_ref.offset += Cmd_size;
	last_chunk_n = 0;
	} else if (backdrop != 0) {
	AnnoFill fill = Annotated_Fill_read(ref);
	alloc_cmd(cmd_ref, cmd_limit);
	Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
	cmd_ref.offset += Cmd_size;
	}
	seg_start += seg_count;
	seg_count = 0;
	backdrop = 0;
	break;
	case Annotated_Stroke:
	// TODO: reduce divergence & code duplication? Much of the
	// fill and stroke processing is in common.
	if (last_chunk_n > 0 \|\| seg_count > 0) {
	SegChunkRef chunk_ref = SegChunkRef(0);
	if (seg_count > 0) {
	chunk_ref = alloc_seg_chunk();
	SegChunk chunk;
	chunk.n = seg_count;
	chunk.next = SegChunkRef(0);
	uint seg_offset = seg_alloc + seg_start * Segment_size;
	chunk.segs = SegmentRef(seg_offset);
	SegChunk_write(chunk_ref, chunk);
	}
	if (last_chunk_n > 0) {
	SegChunk chunk;
	chunk.n = last_chunk_n;
	chunk.next = chunk_ref;
	chunk.segs = last_chunk_segs;
	SegChunk_write(last_chunk_ref, chunk);
	} else {
	first_seg_chunk = chunk_ref;
	}

	AnnoStroke stroke = Annotated_Stroke_read(ref);
	CmdStroke cmd_stroke;
	cmd_stroke.seg_ref = first_seg_chunk;
	cmd_stroke.half_width = 0.5 * stroke.linewidth;
	cmd_stroke.rgba_color = stroke.rgba_color;
	alloc_cmd(cmd_ref, cmd_limit);
	Cmd_Stroke_write(cmd_ref, cmd_stroke);
	cmd_ref.offset += Cmd_size;
	last_chunk_n = 0;
	}
	seg_start += seg_count;
	seg_count = 0;
	break;
	default:
	// This shouldn't happen, but just in case.
	seg_start++;
	break;
	}
	}
	if (seg_count > 0) {
	SegChunkRef chunk_ref = alloc_seg_chunk();
	if (last_chunk_n > 0) {
	SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
	} else {
	first_seg_chunk = chunk_ref;
	}
	// TODO: free two registers by writing count and segments ref now,
	// as opposed to deferring SegChunk write until all fields are known.
	last_chunk_ref = chunk_ref;
	last_chunk_n = seg_count;
	uint seg_offset = seg_alloc + seg_start * Segment_size;
	last_chunk_segs = SegmentRef(seg_offset);
	}
	barrier();

	rd_ix += N_TILE;
	// The second disjunct is there as a strange workaround on Nvidia. If it is
	// removed, then the kernel fails with ERROR_DEVICE_LOST.
	if (rd_ix >= wr_ix \|\| bin_ix == ~0) break;
	}
	Cmd_End_write(cmd_ref);
	}