// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates a coverage mask over the grid
// of bins (each bin covering N_TILE_X x N_TILE_Y tiles), based on the path
// bounding box, to bin the paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "annotated.h"
#include "bins.h"

// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
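// Worked example (assuming the usual N_TILE_X = 16 and TILE_WIDTH_PX = 16 from
// setup.h): a bin then spans 256 px horizontally, SX = 1/256, and a bbox edge
// at x = 600 px falls in bin column floor(600 * SX) = 2.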

// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)

// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
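// bitmaps[i][b]: bit j is set iff element i * 32 + j of this partition covers
// bin b. count[i][b] holds the inclusive prefix sum, over slices 0..i, of the
// number of elements covering bin b (filled in after the coverage pass below).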
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared Alloc sh_chunk_alloc[N_TILE];
shared bool sh_alloc_failed;

void main() {
    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;

    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
    if (gl_LocalInvocationID.x == 0) {
        sh_alloc_failed = false;
    }
    barrier();

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref).tag;
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Color:
    case Annotated_LinGradient:
    case Annotated_Image:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that these drawing elements
        // have the bbox at the same place in their layout.
        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
        x0 = int(floor(clip.bbox.x * SX));
        y0 = int(floor(clip.bbox.y * SY));
        x1 = int(ceil(clip.bbox.z * SX));
        y1 = int(ceil(clip.bbox.w * SY));
        break;
    }

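    // The annotated bbox is in pixels; floor/ceil with the SX/SY scale factors
    // convert it to a bin range with an inclusive lower bound (x0, y0) and an
    // exclusive upper bound (x1, y1).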
    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1) / N_TILE_Y;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
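    // Degenerate case: if the x range is empty, empty the y range too so that
    // the flattened loop below terminates (it only resets x to x0 after an
    // increment, so an empty x range would never hit x1).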
    if (x0 == x1)
        y1 = y0;
    int x = x0, y = y0;
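    // Each thread owns one bit of the 256-bit per-bin coverage bitmap:
    // bit (local id mod 32) of slice (local id / 32).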
    uint my_slice = gl_LocalInvocationID.x / 32;
    uint my_mask = 1u << (gl_LocalInvocationID.x & 31);
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }

    barrier();
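    // The coverage bitmaps are now complete: bit j of bitmaps[s][b] is set iff
    // element s * 32 + j of this partition touches bin b.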
    // Count the elements covering this invocation's bin and allocate space for
    // their BinInstance records.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
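    // Here each thread stands in for one bin: thread j fills count[i][j], the
    // inclusive prefix sum of coverage counts over slices 0..i for bin j.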
    // element_count is the total number of elements covering this invocation's bin.
    Alloc chunk_alloc = new_alloc(0, 0, true);
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
        MallocResult chunk = malloc(element_count * BinInstance_size);
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed) {
            sh_alloc_failed = true;
        }
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
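    // Each (partition, bin) pair gets two u32 words in bin_alloc: the element
    // count and the byte offset of that bin's BinInstance chunk. out_ix is a
    // word index, hence the >> 2 on the byte offset.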
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);

    barrier();
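    // Bail out if any allocation in this workgroup failed or if mem.h's global
    // error flag was already set; the failure is left for later stages and the
    // host to observe.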
    if (sh_alloc_failed || mem_error != NO_ERROR) {
        return;
    }

    // Use a strategy similar to the Laine & Karras paper: loop over the bbox of
    // bins touched by this element.
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
            // The output index is this element's rank among the elements
            // covering the bin: set bits below ours in the same slice, plus the
            // totals of all earlier slices (the prefix sums computed above).
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + idx * BinInstance_size;
            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}