| // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense |
| |
| // Tile allocation (and zeroing of tiles) |
| |
| #import config |
| #import bump |
| #import drawtag |
| #import tile |
| |
| @group(0) @binding(0) |
| var<uniform> config: Config; |
| |
| @group(0) @binding(1) |
| var<storage> scene: array<u32>; |
| |
| @group(0) @binding(2) |
| var<storage> draw_bboxes: array<vec4<f32>>; |
| |
| @group(0) @binding(3) |
| var<storage, read_write> bump: BumpAllocators; |
| |
| @group(0) @binding(4) |
| var<storage, read_write> paths: array<Path>; |
| |
| @group(0) @binding(5) |
| var<storage, read_write> tiles: array<Tile>; |
| |
| let WG_SIZE = 256u; |
| |
| var<workgroup> sh_tile_count: array<u32, WG_SIZE>; |
| var<workgroup> sh_tile_offset: u32; |
| var<workgroup> sh_atomic_failed: u32; |
| |
| @compute @workgroup_size(256) |
| fn main( |
| @builtin(global_invocation_id) global_id: vec3<u32>, |
| @builtin(local_invocation_id) local_id: vec3<u32>, |
| ) { |
| // Exit early if prior stages failed, as we can't run this stage. |
| // We need to check only prior stages, as if this stage has failed in another workgroup, |
| // we still want to know this workgroup's memory requirement. |
| if local_id.x == 0u { |
| sh_atomic_failed = atomicLoad(&bump.failed); |
| } |
| let failed = workgroupUniformLoad(&sh_atomic_failed); |
| if (failed & STAGE_BINNING) != 0u { |
| return; |
| } |
| // scale factors useful for converting coordinates to tiles |
| // TODO: make into constants |
| let SX = 1.0 / f32(TILE_WIDTH); |
| let SY = 1.0 / f32(TILE_HEIGHT); |
| |
| let drawobj_ix = global_id.x; |
| var drawtag = DRAWTAG_NOP; |
| if drawobj_ix < config.n_drawobj { |
| drawtag = scene[config.drawtag_base + drawobj_ix]; |
| } |
| var x0 = 0; |
| var y0 = 0; |
| var x1 = 0; |
| var y1 = 0; |
| if drawtag != DRAWTAG_NOP && drawtag != DRAWTAG_END_CLIP { |
| let bbox = draw_bboxes[drawobj_ix]; |
| |
| // Don't round up the bottom-right corner of the bbox if the area is zero and leave the |
| // coordinates at 0. This will make `tile_count` zero as the shape is clipped out. |
| if bbox.x < bbox.z && bbox.y < bbox.w { |
| x0 = i32(floor(bbox.x * SX)); |
| y0 = i32(floor(bbox.y * SY)); |
| x1 = i32(ceil(bbox.z * SX)); |
| y1 = i32(ceil(bbox.w * SY)); |
| } |
| } |
| let ux0 = u32(clamp(x0, 0, i32(config.width_in_tiles))); |
| let uy0 = u32(clamp(y0, 0, i32(config.height_in_tiles))); |
| let ux1 = u32(clamp(x1, 0, i32(config.width_in_tiles))); |
| let uy1 = u32(clamp(y1, 0, i32(config.height_in_tiles))); |
| let tile_count = (ux1 - ux0) * (uy1 - uy0); |
| var total_tile_count = tile_count; |
| sh_tile_count[local_id.x] = tile_count; |
| for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { |
| workgroupBarrier(); |
| if local_id.x >= (1u << i) { |
| total_tile_count += sh_tile_count[local_id.x - (1u << i)]; |
| } |
| workgroupBarrier(); |
| sh_tile_count[local_id.x] = total_tile_count; |
| } |
| if local_id.x == WG_SIZE - 1u { |
| let count = sh_tile_count[WG_SIZE - 1u]; |
| var offset = atomicAdd(&bump.tile, count); |
| if offset + count > config.tiles_size { |
| offset = 0u; |
| atomicOr(&bump.failed, STAGE_TILE_ALLOC); |
| } |
| paths[drawobj_ix].tiles = offset; |
| } |
| // Using storage barriers is a workaround for what appears to be a miscompilation |
| // when a normal workgroup-shared variable is used to broadcast the value. |
| storageBarrier(); |
| let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles; |
| storageBarrier(); |
| if drawobj_ix < config.n_drawobj { |
| let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u); |
| let bbox = vec4(ux0, uy0, ux1, uy1); |
| let path = Path(bbox, tile_offset + tile_subix); |
| paths[drawobj_ix] = path; |
| } |
| |
| // zero allocated memory |
| // Note: if the number of draw objects is small, utilization will be poor. |
| // There are two things that can be done to improve that. One would be a |
| // separate (indirect) dispatch. Another would be to have each workgroup |
| // process fewer draw objects than the number of threads in the wg. |
| let total_count = sh_tile_count[WG_SIZE - 1u]; |
| for (var i = local_id.x; i < total_count; i += WG_SIZE) { |
| // Note: could format output buffer as u32 for even better load balancing. |
| tiles[tile_offset + i] = Tile(0, 0u); |
| } |
| } |