| // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense |
| |
| // The binning stage of the pipeline. |
| // |
| // Each workgroup processes N_TILE paths. |
| // Each thread processes one path and calculates an N_TILE_X x N_TILE_Y coverage mask |
| // based on the path bounding box to bin the paths. |
| |
| #version 450 |
| #extension GL_GOOGLE_include_directive : enable |
| |
| #include "mem.h" |
| #include "setup.h" |
| |
| // One invocation per element: a workgroup of N_TILE invocations handles N_TILE elements. |
| layout(local_size_x = N_TILE, local_size_y = 1) in; |
| |
| // Read-only configuration. This shader reads allocation offsets, the element count |
| // and the image size in tiles from it. |
| layout(set = 0, binding = 1) readonly buffer ConfigBuf { |
| Config conf; |
| }; |
| |
| #include "bins.h" |
| #include "drawtag.h" |
| |
| // Scale factors for converting coordinates to bins: multiplying an x (resp. y) |
| // coordinate by SX (resp. SY) gives a fractional bin index. |
| #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX)) |
| #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX)) |
| |
| // Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000), |
| // the IEEE 754 single-precision bit pattern for +infinity. |
| // NOTE(review): no use of INFINITY is visible in this file; confirm it is still needed. |
| #define INFINITY (1.0 / 0.0) |
| |
| // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. |
| // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps) |
| // bitmaps[slice][bin]: bit k is set when element 32 * slice + k of this workgroup covers the bin. |
| shared uint bitmaps[N_SLICE][N_TILE]; |
| // count[slice][bin]: number of covering elements in slices 0..slice (inclusive running total). |
| shared uint count[N_SLICE][N_TILE]; |
| // Output chunk allocated for each bin; only written for bins covered by at least one element. |
| shared Alloc sh_chunk_alloc[N_TILE]; |
| // Set when any invocation's chunk allocation fails; read by every invocation after a barrier. |
| shared bool sh_alloc_failed; |
| |
| // Read the draw monoid of element `element_ix`. |
| // Records are 4 words each, located via conf.drawmonoid_alloc; the words are |
| // passed to the DrawMonoid constructor in stored order (the first two are |
| // path_ix and clip_ix, followed by the scene and info offsets). |
| DrawMonoid load_draw_monoid(uint element_ix) { |
| // offset >> 2 (i.e. offset / 4) turns the allocation offset into an index into `memory`. |
| uint word_ix = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix; |
| return DrawMonoid(memory[word_ix], memory[word_ix + 1], memory[word_ix + 2], memory[word_ix + 3]); |
| } |
| |
| // Fetch the bounding box that clip processing computed for clip `clip_ix`. |
| // Each entry is 4 words holding the raw bits of x0, y0, x1, y1 as floats. |
| vec4 load_clip_bbox(uint clip_ix) { |
| uint word_ix = (conf.clip_bbox_alloc.offset >> 2) + 4 * clip_ix; |
| uvec4 raw = uvec4(memory[word_ix], memory[word_ix + 1], memory[word_ix + 2], memory[word_ix + 3]); |
| // Reinterpret the stored bit patterns as floats (no numeric conversion). |
| return uintBitsToFloat(raw); |
| } |
| |
| // Intersect two boxes stored as (x0, y0, x1, y1): take the larger of the two |
| // minimum corners and the smaller of the two maximum corners. The result can |
| // have its maximum corner below its minimum corner when the boxes do not overlap. |
| vec4 bbox_intersect(vec4 a, vec4 b) { |
| vec2 min_corner = max(a.xy, b.xy); |
| vec2 max_corner = min(a.zw, b.zw); |
| return vec4(min_corner, max_corner); |
| } |
| |
| // Load the bounding box of path `path_ix` from the path bbox buffer (as written by pathseg). |
| // Entries are 6 words apart; the first four hold left, top, right and bottom as |
| // unsigned integers biased by 32768. The remaining two words are not read here. |
| vec4 load_path_bbox(uint path_ix) { |
| uint word_ix = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix; |
| uvec4 biased = uvec4(memory[word_ix], memory[word_ix + 1], memory[word_ix + 2], memory[word_ix + 3]); |
| // Convert to float, then remove the bias to recover signed coordinates. |
| return vec4(biased) - 32768.0; |
| } |
| |
| // Write `bbox` as the bounding box of draw object `draw_ix`. |
| // The four components are stored as raw float bits in 4 consecutive words |
| // located via conf.draw_bbox_alloc. |
| void store_draw_bbox(uint draw_ix, vec4 bbox) { |
| uint word_ix = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix; |
| uvec4 bits = floatBitsToUint(bbox); |
| memory[word_ix] = bits.x; |
| memory[word_ix + 1] = bits.y; |
| memory[word_ix + 2] = bits.z; |
| memory[word_ix + 3] = bits.w; |
| } |
| |
| // Entry point: bins the N_TILE draw elements handled by this workgroup. |
| // Phase 1: each invocation loads one element's clipped bbox and marks, in the |
| // shared bitmaps, every bin that the bbox touches. |
| // Phase 2: each invocation acts on behalf of one bin: it counts the elements |
| // covering that bin, allocates a chunk for them and writes the (count, offset) |
| // pair to conf.bin_alloc. |
| // Phase 3: each invocation returns to its own element and writes a BinInstance |
| // into the chunk of every bin the element covers. |
| void main() { |
| uint my_partition = gl_WorkGroupID.x; |
| |
| // Clear this invocation's column of the coverage bitmaps. |
| for (uint i = 0; i < N_SLICE; i++) { |
| bitmaps[i][gl_LocalInvocationID.x] = 0; |
| } |
| // A single invocation resets the shared failure flag. |
| if (gl_LocalInvocationID.x == 0) { |
| sh_alloc_failed = false; |
| } |
| // Every invocation must finish clearing before any of them starts setting bits. |
| barrier(); |
| |
| // Read inputs and determine coverage of bins |
| uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; |
| // Invocations past the end of the element list keep this empty range and mark no bins. |
| int x0 = 0, y0 = 0, x1 = 0, y1 = 0; |
| if (element_ix < conf.n_elements) { |
| DrawMonoid draw_monoid = load_draw_monoid(element_ix); |
| uint path_ix = draw_monoid.path_ix; |
| // Default to a very large box, used when clip_ix is 0 and no clip bbox is loaded. |
| vec4 clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9); |
| uint clip_ix = draw_monoid.clip_ix; |
| // A non-zero clip_ix refers to the clip bbox stored at index clip_ix - 1. |
| if (clip_ix > 0) { |
| clip_bbox = load_clip_bbox(clip_ix - 1); |
| } |
| // For clip elements, clip_bbox is the bbox of the clip path, intersected |
| // with enclosing clips. |
| // For other elements, it is the bbox of the enclosing clips. |
| |
| vec4 path_bbox = load_path_bbox(path_ix); |
| vec4 bbox = bbox_intersect(path_bbox, clip_bbox); |
| // Avoid negative-size bbox (is this necessary)? |
| // If the intersection is empty, this collapses it to zero size at its minimum corner. |
| bbox.zw = max(bbox.xy, bbox.zw); |
| // Store clip-intersected bbox for tile_alloc. |
| store_draw_bbox(element_ix, bbox); |
| // Convert to bin coordinates: floor the minimum and ceil the maximum so that |
| // [x0, x1) x [y0, y1) contains every bin the bbox overlaps. |
| x0 = int(floor(bbox.x * SX)); |
| y0 = int(floor(bbox.y * SY)); |
| x1 = int(ceil(bbox.z * SX)); |
| y1 = int(ceil(bbox.w * SY)); |
| } |
| |
| // At this point, we run an iterator over the coverage area, |
| // trying to keep divergence low. |
| // Right now, it's just a bbox, but we'll get finer with |
| // segments. |
| // Image size in bins, rounding up partially filled bins. |
| uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X; |
| uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1) / N_TILE_Y; |
| // Clamp the range to the bin grid; the upper bounds never fall below the lower bounds. |
| x0 = clamp(x0, 0, int(width_in_bins)); |
| x1 = clamp(x1, x0, int(width_in_bins)); |
| y0 = clamp(y0, 0, int(height_in_bins)); |
| y1 = clamp(y1, y0, int(height_in_bins)); |
| // The loops below only test y, so a zero-width range is turned into a zero-height one. |
| if (x0 == x1) |
| y1 = y0; |
| int x = x0, y = y0; |
| // Each bitmap word holds 32 elements: my_slice selects the word, my_mask the bit. |
| uint my_slice = gl_LocalInvocationID.x / 32; |
| uint my_mask = 1u << (gl_LocalInvocationID.x & 31); |
| // Single flattened loop over the 2D range, row by row. |
| while (y < y1) { |
| // Atomic because other invocations of the same slice may update the same word. |
| atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask); |
| x++; |
| if (x == x1) { |
| x = x0; |
| y++; |
| } |
| } |
| |
| // All coverage bits must be set before they are counted. |
| barrier(); |
| // Allocate output segments. |
| // From here until the next barrier, gl_LocalInvocationID.x is used as a bin index. |
| uint element_count = 0; |
| for (uint i = 0; i < N_SLICE; i++) { |
| element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); |
| // Inclusive running total over slices, used later to place elements within the chunk. |
| count[i][gl_LocalInvocationID.x] = element_count; |
| } |
| // element_count is number of elements covering bin for this invocation. |
| // Placeholder used when the bin is empty; new_alloc is declared outside this file. |
| Alloc chunk_alloc = new_alloc(0, 0, true); |
| if (element_count != 0) { |
| // TODO: aggregate atomic adds (subgroup is probably fastest) |
| MallocResult chunk = malloc(element_count * BinInstance_size); |
| chunk_alloc = chunk.alloc; |
| sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; |
| if (chunk.failed) { |
| sh_alloc_failed = true; |
| } |
| } |
| // Note: it might be more efficient for reading to do this in the |
| // other order (each bin is a contiguous sequence of partitions) |
| // Two words per (partition, bin) pair: the element count and the chunk offset. |
| uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; |
| write_mem(conf.bin_alloc, out_ix, element_count); |
| write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset); |
| |
| // Wait until every bin's chunk and the failure flag have been written. |
| barrier(); |
| // Skip the writes if any allocation failed or an error was already recorded |
| // (mem_error is declared outside this file). |
| if (sh_alloc_failed || mem_error != NO_ERROR) { |
| return; |
| } |
| |
| // Use similar strategy as Laine & Karras paper; loop over bbox of bins |
| // touched by this element |
| x = x0; |
| y = y0; |
| while (y < y1) { |
| uint bin_ix = y * width_in_bins + x; |
| uint out_mask = bitmaps[my_slice][bin_ix]; |
| if ((out_mask & my_mask) != 0) { |
| // Position within the chunk: covering elements with a lower bit in this slice... |
| uint idx = bitCount(out_mask & (my_mask - 1)); |
| // ...plus all covering elements in the preceding slices. |
| if (my_slice > 0) { |
| idx += count[my_slice - 1][bin_ix]; |
| } |
| Alloc out_alloc = sh_chunk_alloc[bin_ix]; |
| uint out_offset = out_alloc.offset + idx * BinInstance_size; |
| BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix)); |
| } |
| x++; |
| if (x == x1) { |
| x = x0; |
| y++; |
| } |
| } |
| } |