| // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense |
| |
| // Propagation of tile backdrop for filling. |
| // |
| // Each thread reads one path element and calculates the row and column counts of spanned tiles |
| // based on the bounding box. |
| // The row count then goes through a prefix sum to redistribute and load-balance the work across the workgroup. |
| // In the following step, the workgroup loops over the corresponding tile rows per element in parallel. |
| // For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel, |
| // and propagated from the left to the right (prefix summed). |
| // |
| // Output state: |
| // - Each path element has an array of tiles covering the whole path based on bounding box |
| // - Each tile per path element contains the 'backdrop' and a list of subdivided path segments |
| |
| #version 450 |
| #extension GL_GOOGLE_include_directive : enable |
| |
| #include "mem.h" |
| #include "setup.h" |
| |
| // log2 of the workgroup size along x. LG_WG_FACTOR is not defined in this file (presumably it comes from setup.h). |
| #define LG_BACKDROP_WG (7 + LG_WG_FACTOR) |
| // Workgroup size along x; main() assigns one path element to each x invocation. |
| #define BACKDROP_WG (1 << LG_BACKDROP_WG) |
| #ifndef BACKDROP_DIST_FACTOR |
| // Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR defines how many |
| // additional threads we should spawn for parallel row processing. The additional threads do not participate in the |
| // earlier stages (calculating the tile counts) but do work in the final prefix sum stage, which has a lot more |
| // parallelism. |
| |
| // This feature is opt-in: one variant is compiled with the following default, while the other variant is compiled with |
| // a larger BACKDROP_DIST_FACTOR, which is used on GPUs supporting a larger workgroup size to improve performance. |
| #define BACKDROP_DIST_FACTOR 1 |
| #endif |
| |
| // BACKDROP_WG invocations along x, replicated BACKDROP_DIST_FACTOR times along y. |
| layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in; |
| |
| // Read-only configuration buffer at set 0, binding 1. main() reads conf.n_elements, conf.anno_alloc and |
| // conf.tile_alloc from it. The Config type is declared outside this file (presumably in setup.h). |
| layout(set = 0, binding = 1) readonly buffer ConfigBuf { |
| Config conf; |
| }; |
| |
| #include "annotated.h" |
| #include "tile.h" |
| |
| // Workgroup-shared scratch, one slot per path element handled by the workgroup. |
| // Tile-row count per element; main() overwrites it in place with the inclusive prefix sum of those counts. |
| shared uint sh_row_count[BACKDROP_WG]; |
| // Allocation spanning the element's tile array. Only written for elements that take the backdrop path in main(). |
| shared Alloc sh_row_alloc[BACKDROP_WG]; |
| // Width of the element's bounding box (bbox.z - bbox.x). Only written for elements that take the backdrop path. |
| shared uint sh_row_width[BACKDROP_WG]; |
| |
| // Entry point. Runs in three phases: |
| //   1. each y == 0 invocation loads one path element and records its tile-row count, width and tile allocation; |
| //   2. the row counts are turned into an inclusive prefix sum in shared memory; |
| //   3. all invocations (every y layer) stride over the combined rows and prefix-sum the per-tile backdrop |
| //      values from left to right within each row. |
| // Every barrier() sits outside the y == 0 conditionals, so all invocations of the workgroup execute it. |
| void main() { |
|     uint local_ix = gl_LocalInvocationIndex; |
|     uint elem = gl_GlobalInvocationID.x; |
|     AnnotatedRef anno_ref = AnnotatedRef(conf.anno_alloc.offset + elem * Annotated_size); |
| |
|     // Phase 1 -- one invocation per path element. |
|     uint rows = 0; |
|     bool mem_ok = (mem_error == NO_ERROR); |
|     bool first_layer = (gl_LocalInvocationID.y == 0); |
|     if (first_layer) { |
|         if (elem < conf.n_elements) { |
|             AnnotatedTag tag = Annotated_tag(conf.anno_alloc, anno_ref); |
|             bool has_path = tag.tag == Annotated_Image |
|                 || tag.tag == Annotated_LinGradient |
|                 || tag.tag == Annotated_BeginClip |
|                 || tag.tag == Annotated_Color; |
|             // Only elements filled with the non-zero rule get backdrop propagation. |
|             if (has_path && fill_mode_from_flags(tag.flags) == MODE_NONZERO) { |
|                 Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + elem * Path_size)); |
|                 uint bbox_w = path.bbox.z - path.bbox.x; |
|                 uint bbox_h = path.bbox.w - path.bbox.y; |
|                 sh_row_width[local_ix] = bbox_w; |
|                 // A single-row path that starts below y = 0 crosses no tile top edge, so it has no backdrop |
|                 // to propagate. Paths starting at y = 0 are kept, since they may have been clipped to one row. |
|                 // (The same shortcut could probably cover width = 2 paths that don't cross the left edge.) |
|                 rows = (bbox_h == 1 && path.bbox.y > 0) ? 0u : bbox_h; |
|                 sh_row_alloc[local_ix] = new_alloc(path.tiles.offset, bbox_w * bbox_h * Tile_size, mem_ok); |
|             } |
|         } |
|         sh_row_count[local_ix] = rows; |
|     } |
| |
|     // Phase 2 -- inclusive prefix sum over sh_row_count, doubling the look-back distance each round. |
|     for (uint round = 0; round < LG_BACKDROP_WG; round++) { |
|         uint look_back = 1u << round; |
|         barrier(); |
|         if (first_layer && local_ix >= look_back) { |
|             rows += sh_row_count[local_ix - look_back]; |
|         } |
|         barrier(); |
|         if (first_layer) { |
|             sh_row_count[local_ix] = rows; |
|         } |
|     } |
|     barrier(); |
| |
|     // Phase 3 -- one invocation per tile row, across all y layers. |
|     uint total_rows = sh_row_count[BACKDROP_WG - 1]; |
|     uint row_stride = BACKDROP_WG * BACKDROP_DIST_FACTOR; |
|     for (uint row = local_ix; row < total_rows; row += row_stride) { |
|         // Binary search for the first element whose inclusive row sum exceeds `row`. |
|         uint owner = 0; |
|         for (uint step = uint(BACKDROP_WG / 2); step > 0; step >>= 1) { |
|             uint candidate = owner + step; |
|             if (row >= sh_row_count[candidate - 1]) { |
|                 owner = candidate; |
|             } |
|         } |
| |
|         uint width = sh_row_width[owner]; |
|         if (width == 0 || !mem_ok) { |
|             continue; |
|         } |
| |
|         // Walk the row sequentially, replacing each tile's backdrop with the running sum from the left. |
|         Alloc tiles_alloc = sh_row_alloc[owner]; |
|         uint rows_before = owner > 0 ? sh_row_count[owner - 1] : 0; |
|         uint row_in_path = row - rows_before; |
|         // Word index of the first tile's backdrop in this row. The stride of 2 words per tile and the +1 |
|         // offset presumably mirror the Tile layout declared in tile.h. |
|         uint cursor = (tiles_alloc.offset >> 2) + 1 + row_in_path * 2 * width; |
|         uint running = read_mem(tiles_alloc, cursor); |
|         for (uint x = 1; x < width; x++) { |
|             cursor += 2; |
|             running += read_mem(tiles_alloc, cursor); |
|             write_mem(tiles_alloc, cursor, running); |
|         } |
|     } |
| } |