| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include "tile.h" |
| #include "common.h" |
| #include "atomic_cl.h" |
| #include "block_pool_cl.h" |
| #include "raster_builder_cl_12.h" |
| #include "kernel_cl_12.h" |
| |
| // #define SKC_ARCH_AVX2 -- left disabled; the AVX2 settings block in this file is also compiled out with "#if 0" |
| // #define SKC_RASTERIZE_SIMD_USES_SMEM -- when defined, struct skc_subgroup_smem keeps its subgroup scratchpad even if the subgroup size is 1 |
| |
| #define PRINTF_ENABLE 0 |
| #define PRINTF_BLOCK_COUNT 0 |
| |
| // |
| // NOTE: |
| // |
| // ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT |
| // AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE |
| // |
| // NOTE: |
| // |
| // IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP. |
| // |
| // |
| |
| #if 0 // SKC_ARCH_AVX2 |
| |
| // #define SKC_RASTERIZE_SUBGROUP_SIZE 1 |
| // #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3 |
| // #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1 |
| |
| // #define SKC_TTXB_WORDS 8 |
| |
| // #define SKC_RASTERIZE_FLOAT float8 |
| // #define SKC_RASTERIZE_UINT uint8 |
| // #define SKC_RASTERIZE_INT int8 |
| // #define SKC_RASTERIZE_PREDICATE int8 |
| |
| // #define SKC_RASTERIZE_BIN_BLOCK uint16 |
| // #define SKC_RASTERIZE_BIN uint8 |
| |
| // #define SKC_RASTERIZE_POOL uint8 |
| // #define SKC_RASTERIZE_POOL_SCALE 6 |
| |
| // #define SKC_RASTERIZE_TILE_HASH_X_BITS 1 |
| // #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 |
| |
| // #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8() |
| |
| #endif |
| |
| // SIMT: the block id vector and the TTSK key vector are both sized to |
| // the subgroup width. The TTSK mask is (size - 1), which only acts as |
| // a modulo mask when the subgroup size is a power of two. |
| |
| #define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE |
| #define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE |
| #define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1) |
| |
| // Element counts: one vector holds (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2) |
| // elements and a whole subgroup holds (subgroup size * vector size). |
| // |
| |
| #define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2) |
| #define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE) |
| |
| // Sentinel values for a packed YX key: two signed 16-bit fields in one |
| // 32-bit word (see the per-line notes). NOTE(review): which half is Y |
| // and which is X is not shown here -- confirm against the key layout. |
| |
| #define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 } |
| #define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 } |
| |
| // Tile hash: the X and Y hash bit counts are summed to give the hash |
| // width, and there are (1 << width) bins. The bin mask is built from |
| // one extra bit -- see the FIXME on SKC_RASTERIZE_TILE_HASH_BIN_BITS. |
| |
| #define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS) |
| #define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS) |
| #define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS) |
| #define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS) |
| #define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT) |
| #define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS) |
| |
| // |
| // Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" |
| // |
| // https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ |
| // |
| // Lerp in two fma/mad ops: |
| // |
| // t * b + ((-t) * a + a) |
| // |
| // Note: OpenCL documents mix() as being implemented as: |
| // |
| // a + (b - a) * t |
| // |
| // But this may be a native instruction on some devices. For example, |
| // on GEN9 there is an LRP "linear interpolation" opcode but it |
| // doesn't appear to support half floats. |
| // |
| // Feel free to toggle this option and then benchmark and inspect the |
| // generated code. We really want the double FMA to be generated when |
| // there isn't support for a LERP/MIX operation. |
| // The "#if 1" below currently selects the two-mad form; the "#else" branch is the mix() form. |
| |
| #if 1 |
| #define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) |
| #else |
| #define SKC_LERP(a,b,t) mix(a,b,t) |
| #endif |
| |
| // |
| // There is no integer MAD in OpenCL with "don't care" overflow |
| // semantics. |
| // |
| // FIXME -- verify if the platform needs explicit MAD operations even |
| // if a "--fastmath" option is available at compile time. It might |
| // make sense to explicitly use MAD calls if the platform requires it. |
| // The "#if 1" below selects the plain (a * b + c) expression; the disabled alternative is mad_sat(). |
| |
| #if 1 |
| #define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c)) |
| #else |
| #define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c) |
| #endif |
| |
| // |
| // Index of the calling lane's word inside the subblock whose id is 'id': |
| // |
| //   id * SKC_DEVICE_SUBBLOCK_WORDS + lane |
| // |
| // The argument is parenthesized so that an expression argument such as |
| // (a + b) is multiplied as a whole instead of only its last term. |
| // |
| |
| #define SKC_RASTERIZE_SEGMENT(id) ((id) * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()) |
| |
| // One block pool element, viewable as a raw uint (u32), a tagged block |
| // id (tag_id) or a float coordinate (coord). All three members share |
| // the same storage. |
| |
| union skc_bp_elem |
| { |
| skc_uint u32; |
| skc_tagged_block_id_t tag_id; |
| skc_float coord; |
| }; |
| |
| // Scratch state used by the rasterizer helpers: an optional scan/winner |
| // scratchpad plus the work-in-progress TTSB bins. The aN members are array |
| // views; the vN members, where present, overlay vector-typed views of the same storage. |
| |
| struct skc_subgroup_smem |
| { |
| // |
| // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member |
| // |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM ) |
| struct { |
| union { |
| |
| skc_uint winner; |
| |
| struct { |
| skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; |
| } aN; |
| |
| struct { |
| SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; |
| } vN; |
| }; |
| } subgroup; |
| #endif |
| |
| // |
| // work-in-progress TTSB blocks and associated YX keys |
| // |
| union { |
| struct { |
| // FIXME -- some typedefs are valid here |
| skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; |
| skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; |
| skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; |
| skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; |
| } aN; |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| struct { |
| SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; |
| SKC_RASTERIZE_BIN yx; |
| SKC_RASTERIZE_BIN id; |
| SKC_RASTERIZE_BIN count; |
| } vN; |
| #endif |
| } bin; |
| }; |
| |
| // Lane index within the subgroup: the constant 0 when the subgroup |
| // size is 1, otherwise get_sub_group_local_id(). |
| // |
| |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| #define skc_subgroup_lane() 0 |
| #else |
| #define skc_subgroup_lane() get_sub_group_local_id() |
| #endif |
| |
| // |
| // Perspective divide: scales xp and yp in place by |
| // |
| //   1 / (x * tv->w0 + y * tv->w1 + 1) |
| // |
| // tv is a pointer expression; xp and yp must be modifiable lvalues. |
| // Every argument is parenthesized so that an expression argument such |
| // as &transform expands with the intended precedence. The expansion is |
| // still a plain braced block, exactly as before. |
| // |
| |
| #define SKC_PROJECT(tv,x,y,xp,yp) \ |
| { \ |
|   float const d = native_recip(fma((x),(tv)->w0,fma((y),(tv)->w1,1.0f))); \ |
|   (xp) *= d; \ |
|   (yp) *= d; \ |
| } |
| |
| // |
| // Refill the subgroup's vector of block ids from the block pool ring. |
| // |
| // Lane 0 performs one atomic add of SKC_RASTERIZE_BLOCK_ID_V_SIZE on the |
| // pool's "reads" counter. The value it gets back is broadcast from lane |
| // 0, each lane adds its own lane index, wraps the sum with bp_mask and |
| // loads its block id from bp_ids. The next-block cursor is reset to 0. |
| // |
| // Note that you can't overrun the block id pool since it's a ring. |
| // |
| // This is kind of a narrow allocation but subblocks help stretch out |
| // the pool. |
| // |
| // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids |
| // |
| |
| static |
| void |
| skc_blocks_replenish(skc_uint * const blocks_next, |
|                      skc_block_id_v_t * const blocks, |
|                      __global SKC_ATOMIC_UINT volatile * const bp_atomics, |
|                      skc_uint const bp_mask, // pow2 modulo mask for block pool ring |
|                      __global skc_block_id_t const * const bp_ids) |
| { |
|   // only lane 0 touches the atomic; every other lane keeps 0 here |
|   skc_uint ring_base = 0; |
| |
|   if (skc_subgroup_lane() == 0) |
|     { |
|       ring_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS, |
|                                                        SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads |
| #if 0 |
|       printf("r+: %8u + %u\n",ring_base,SKC_RASTERIZE_BLOCK_ID_V_SIZE); |
| #endif |
|     } |
| |
|   // lane 0's value plus this lane's index, wrapped into the ring |
|   skc_uint const ring_slot = (sub_group_broadcast(ring_base,0) + skc_subgroup_lane()) & bp_mask; |
| |
|   *blocks      = bp_ids[ring_slot]; |
|   *blocks_next = 0; |
| } |
| |
| // |
| // Hand out the next block id from the subgroup's block id vector. |
| // |
| // When the cursor has reached SKC_RASTERIZE_BLOCK_ID_V_SIZE the vector |
| // is refilled first. The id at the cursor is then returned and the |
| // cursor is advanced by one. |
| // |
| |
| static |
| skc_block_id_t |
| skc_blocks_get_next(skc_uint * const blocks_next, |
|                     skc_block_id_v_t * const blocks, |
|                     __global SKC_ATOMIC_UINT volatile * const bp_atomics, |
|                     skc_uint const bp_mask, // pow2 modulo mask for block pool ring |
|                     __global skc_block_id_t const * const bp_ids) |
| { |
|   // every id in the vector has been consumed -- fetch a new vector |
|   if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE) |
|     { |
|       skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); |
|     } |
| |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) |
|   // SIMT: the id lives in the lane selected by the cursor |
|   skc_block_id_t const next_id = sub_group_broadcast(*blocks,*blocks_next); |
| #else |
|   // SIMD: take the first component and shift the rest down by one |
|   skc_block_id_t const next_id = blocks->s0; |
| |
|   skc_shuffle_down_1(*blocks); |
| #endif |
| |
|   *blocks_next += 1; |
| |
|   return next_id; |
| } |
| |
| // |
| // Subblock allocator |
| // |
| // Only compiled when a block is larger than a subblock. The |
| // SKC_SUBBLOCKS_BLOCKS_PROTO()/ARGS() macros let callers declare and |
| // forward either (subblocks, blocks) or just (blocks) depending on that. |
| // |
| |
| #if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 |
| |
| // |
| // Return the current subblock id and advance the subblock cursor. When |
| // the cursor's low "subblocks per block" bits are zero the cursor is |
| // first replaced with a newly obtained block id. |
| // |
| static |
| skc_block_id_t |
| skc_subblocks_get_next(skc_block_id_t * const subblocks, |
|                        skc_uint * const blocks_next, |
|                        skc_block_id_v_t * const blocks, |
|                        __global SKC_ATOMIC_UINT volatile * const bp_atomics, |
|                        skc_uint const bp_mask, // pow2 modulo mask for block pool ring |
|                        __global skc_block_id_t const * const bp_ids) |
| { |
|   // cursor sits on a block boundary -- pull a new block id |
|   if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) |
|     { |
|       *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); |
|     } |
| |
|   // hand out the current id, then step the cursor |
|   skc_block_id_t const current = (*subblocks)++; |
| |
| #if 0 |
|   if (get_sub_group_local_id() == 0) |
|     printf("= %u\n",current); |
| #endif |
| |
|   return current; |
| } |
| |
| |
| #define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks |
| #define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks |
| |
| #else |
| |
| #define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks |
| #define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks |
| |
| #endif |
| |
| // |
| // Allocate a new (sub)block id, record a TTSK key for it and return the |
| // id. |
| // |
| // The key is written by the lane whose index equals the key cursor |
| // (masked with SKC_RASTERIZE_TTSK_V_MASK): its 'lo' word becomes the new |
| // id and its 'hi' word keeps the cohort bits and takes new_yx. |
| // |
| // Once the cursor reaches SKC_RASTERIZE_TTSK_V_SIZE the whole key vector |
| // is written to sk_extent at an offset obtained by lane 0 with an atomic |
| // add on the cohort's key counter, and the cursor restarts at 0. |
| // |
| |
| static |
| skc_block_id_t |
| skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(), |
|                   skc_uint * const blocks_next, |
|                   __global SKC_ATOMIC_UINT volatile * const bp_atomics, |
|                   skc_uint const bp_mask, // pow2 modulo mask for block pool ring |
|                   __global skc_block_id_t const * const bp_ids, |
|                   __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
|                   skc_ttsk_v_t * const sk_v, |
|                   skc_uint * const sk_v_next, |
|                   __global skc_ttsk_s_t * const sk_extent, |
|                   skc_uint const new_yx) |
| { |
|   // take a subblock when blocks are subdivided, otherwise a whole block |
| #if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 |
|   skc_block_id_t const new_id = skc_subblocks_get_next(subblocks, |
|                                                        blocks_next, |
|                                                        blocks, |
|                                                        bp_atomics, |
|                                                        bp_mask, |
|                                                        bp_ids); |
| #else |
|   skc_block_id_t const new_id = skc_blocks_get_next(blocks_next, |
|                                                     blocks, |
|                                                     bp_atomics, |
|                                                     bp_mask, // pow2 modulo mask for block pool ring |
|                                                     bp_ids); |
| #endif |
| |
|   // the lane matching the key cursor records the new key |
|   skc_uint const key_lane = *sk_v_next & SKC_RASTERIZE_TTSK_V_MASK; |
| |
|   if (get_sub_group_local_id() == key_lane) |
|     { |
|       sk_v->lo = new_id; |
|       sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx; |
| #if 0 |
|       printf("@ ( %3u, %3u ) %u\n", |
|              (new_yx >> 12) & 0xFFF, |
|              (new_yx ) & 0xFFF, |
|              new_id); |
| #endif |
|     } |
| |
|   *sk_v_next += 1; |
| |
|   // keep accumulating until the key vector is full |
|   if (*sk_v_next != SKC_RASTERIZE_TTSK_V_SIZE) |
|     return new_id; |
| |
|   // |
|   // the key vector is full: reserve space in the extent and store it |
|   // |
|   *sk_v_next = 0; |
| |
|   skc_uint extent_base = 0; |
| |
|   if (skc_subgroup_lane() == 0) |
|     { |
|       extent_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE |
|         (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE); |
| #if 0 |
|       printf("+ %u\n",extent_base); |
| #endif |
|     } |
| |
|   skc_uint const extent_idx = sub_group_broadcast(extent_base,0) + skc_subgroup_lane(); |
| |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE ) |
|   if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE) |
| #endif |
|     { |
|       sk_extent[extent_idx] = *sk_v; |
| #if 0 |
|       printf("> %u : %v2u\n",extent_idx,*sk_v); |
| #endif |
|     } |
| |
|   return new_id; |
| } |
| |
| // Inclusive prefix sum of floats: across the subgroup lanes (SIMT) or across the components of one vector (SIMD). |
| // Each SIMD step rotates the vector, zeroes the wrapped components with a 0/1 mask and mad()s it back in (strides 1, 2, 4, 8). |
| // NOTE(review): the SIMD path has no branch (and so no return) for SKC_RASTERIZE_VECTOR_SIZE_LOG2 > 4. |
| |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v) |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| // |
| // SIMD |
| // |
| // Note that there isn't a built-in horizontal scan for vectors so |
| // we'll define some here for various widths. |
| // |
| // FIXME -- a scalar version might be faster so put in a |
| // compile-time switch to selection between implementations |
| // |
| |
| #if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
| return v; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) |
| // 01 |
| // 0 + |
| // -- |
| // 01 |
| SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v); |
| return w; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) |
| // 0123 |
| // 012 + |
| // ---- |
| // 0123 |
| // 01 + |
| // ---- |
| // 0123 |
| // |
| SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v); |
| SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w); |
| return x; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) |
| // 01234567 |
| // 0123456 + |
| // -------- |
| // 01234567 |
| // 012345 + |
| // -------- |
| // 01234567 |
| // 0123 + |
| // -------- |
| // 01234567 |
| // |
| SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v); |
| SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w); |
| SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x); |
| return y; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) |
| // 0123456789abcdef |
| // 0123456789abcde + |
| // ---------------- |
| // 0123456789abcdef |
| // 0123456789abcd + |
| // ---------------- |
| // 0123456789abcdef |
| // 0123456789ab + |
| // ---------------- |
| // 0123456789abcdef |
| // 01234567 + |
| // ---------------- |
| // 0123456789abcdef |
| // |
| SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); |
| SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); |
| SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); |
| SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); |
| return z; |
| |
| #endif |
| |
| #else |
| // |
| // SIMT |
| // |
| |
| return sub_group_scan_inclusive_add(v); |
| |
| #endif |
| } |
| |
| // Inclusive prefix sum of uints: across the subgroup lanes (SIMT) or across the components of one vector (SIMD). |
| // Each SIMD step rotates the vector, zeroes the wrapped components with a 0/1 mask and combines via SKC_MAD_UINT (strides 1, 2, 4, 8). |
| // NOTE(review): the SIMD path has no branch (and so no return) for SKC_RASTERIZE_VECTOR_SIZE_LOG2 > 4. |
| |
| static |
| SKC_RASTERIZE_UINT |
| skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v) |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| // |
| // SIMD |
| // |
| // Note that there isn't a built-in horizontal scan for vectors so |
| // we'll define some here for various widths. |
| // |
| // FIXME -- a scalar version might be faster so put in a |
| // compile-time switch to selection between implementations |
| // |
| |
| #if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
| return v; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) |
| // 01 |
| // 0 + |
| // -- |
| // 01 |
| SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v); |
| return w; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) |
| // 0123 |
| // 012 + |
| // ---- |
| // 0123 |
| // 01 + |
| // ---- |
| // 0123 |
| // |
| SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v); |
| SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w); |
| return x; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) |
| // 01234567 |
| // 0123456 + |
| // -------- |
| // 01234567 |
| // 012345 + |
| // -------- |
| // 01234567 |
| // 0123 + |
| // -------- |
| // 01234567 |
| // |
| SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v); |
| SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w); |
| SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x); |
| return y; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) |
| // 0123456789abcdef |
| // 0123456789abcde + |
| // ---------------- |
| // 0123456789abcdef |
| // 0123456789abcd + |
| // ---------------- |
| // 0123456789abcdef |
| // 0123456789ab + |
| // ---------------- |
| // 0123456789abcdef |
| // 01234567 + |
| // ---------------- |
| // 0123456789abcdef |
| // |
| SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); |
| SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); |
| SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); |
| SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); |
| return z; |
| |
| #endif |
| |
| #else |
| // |
| // SIMT |
| // |
| |
| return sub_group_scan_inclusive_add(v); |
| |
| #endif |
| } |
| |
| // Inclusive running maximum of uints: across the subgroup lanes (SIMT) or across the components of one vector (SIMD). |
| // Each SIMD step shifts the vector up by 1, 2, 4, 8 while repeating the low components, then takes max() with the unshifted vector. |
| // NOTE(review): the SIMD path has no branch (and so no return) for SKC_RASTERIZE_VECTOR_SIZE_LOG2 > 4. |
| |
| static |
| SKC_RASTERIZE_UINT |
| skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v) |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| // |
| // SIMD |
| // |
| // Note that there isn't a built-in horizontal scan for vectors so |
| // we'll define some here for various widths. |
| // |
| // FIXME -- a scalar version might be faster so put in a |
| // compile-time switch to selection between implementations |
| // |
| |
| #if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
| return v; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) |
| // 01 |
| // 00 max |
| // -- |
| // 01 |
| SKC_RASTERIZE_UINT const w = max(v.s00,v); |
| return w; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) |
| // 0123 |
| // 0012 max |
| // ---- |
| // 0123 |
| // 0101 max |
| // ---- |
| // 0123 |
| // |
| SKC_RASTERIZE_UINT const w = max(v.s0012,v); |
| SKC_RASTERIZE_UINT const x = max(w.s0101,w); |
| return x; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) |
| // 01234567 |
| // 00123456 max |
| // -------- |
| // 01234567 |
| // 01012345 max |
| // -------- |
| // 01234567 |
| // 01230123 max |
| // -------- |
| // 01234567 |
| // |
| SKC_RASTERIZE_UINT const w = max(v.s00123456,v); |
| SKC_RASTERIZE_UINT const x = max(w.s01012345,w); |
| SKC_RASTERIZE_UINT const y = max(x.s01230123,x); |
| return y; |
| |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) |
| // 0123456789abcdef |
| // 00123456789abcde max |
| // ---------------- |
| // 0123456789abcdef |
| // 010123456789abcd max |
| // ---------------- |
| // 0123456789abcdef |
| // 01230123456789ab max |
| // ---------------- |
| // 0123456789abcdef |
| // 0123456701234567 max |
| // ---------------- |
| // 0123456789abcdef |
| // |
| SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v); |
| SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w); |
| SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x); |
| SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y); |
| return z; |
| |
| #endif |
| |
| #else |
| // |
| // SIMT |
| // |
| |
| return sub_group_scan_inclusive_max(v); |
| |
| #endif |
| } |
| |
| // |
| // Value of the last element: the highest vector component when the |
| // subgroup size is 1 (SIMD), otherwise the value broadcast from the |
| // highest lane of the subgroup (SIMT). |
| // |
| |
| static |
| float |
| skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v) |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1 ) |
|   // SIMT -- read the highest lane |
|   return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
|   // SIMD -- a scalar is its own last element |
|   return v; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) |
|   return v.s1; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) |
|   return v.s3; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) |
|   return v.s7; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) |
|   return v.sf; |
| #endif |
| } |
| |
| // |
| // Unsigned counterpart of skc_subgroup_last_float(): the highest vector |
| // component when the subgroup size is 1 (SIMD), otherwise the value |
| // broadcast from the highest lane of the subgroup (SIMT). |
| // |
| |
| static |
| SKC_RASTERIZE_UINT |
| skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v) |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1 ) |
|   // SIMT -- read the highest lane |
|   return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
|   // SIMD -- a scalar is its own last element |
|   return v; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) |
|   return v.s1; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) |
|   return v.s3; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) |
|   return v.s7; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) |
|   return v.sf; |
| #endif |
| } |
| |
| // |
| // Value of the first element: component 0 of the vector (or the scalar |
| // itself) when the subgroup size is 1, otherwise the value broadcast |
| // from lane 0. |
| // |
| |
| static |
| float |
| skc_subgroup_first(SKC_RASTERIZE_FLOAT const v) |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1 ) |
|   // SIMT |
|   return sub_group_broadcast(v,0); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
|   // SIMD, scalar |
|   return v; |
| #else |
|   // SIMD, vector |
|   return v.s0; |
| #endif |
| } |
| |
| // |
| // Gather: each element of the result takes the element of v selected |
| // by the matching element of i. A scalar SIMD build has nothing to |
| // select from and returns v unchanged. |
| // |
| |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v, |
|                      SKC_RASTERIZE_UINT const i) |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1 ) |
|   // SIMT |
|   return intel_sub_group_shuffle(v,i); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
|   // SIMD, scalar |
|   return v; |
| #else |
|   // SIMD, vector |
|   return shuffle(v,i); |
| #endif |
| } |
| |
| // |
| // Shift the current value up by one element, pulling the element that |
| // enters at the bottom from the previous value. |
| // |
| // SIMD vectors: the result is the last component of p followed by all |
| // but the last component of c. SIMD scalar: the result is p. |
| // SIMT: delegates to intel_sub_group_shuffle_up(p,c,1). |
| // |
| // FIXME -- there are alternative SIMD formulations here: |
| // |
| // Option 1: |
| // |
| // select(c.rotate(+1),p.rotate(-1),(1,0,0,...)) |
| // |
| // Option 2: |
| // |
| // p is a scalar |
| // t = c.rotate(+1) |
| // t.s0 = p; |
| // |
| // Option 3: ... |
| // |
| |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous |
|                           SKC_RASTERIZE_FLOAT const c) // current |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1 ) |
|   // SIMT |
|   return intel_sub_group_shuffle_up(p,c,1); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
|   // SIMD, scalar |
|   return p; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) |
|   return shuffle2(p,c,(uint2)(1,2)); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) |
|   return shuffle2(p,c,(uint4)(3,4,5,6)); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) |
|   return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14)); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) |
|   return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)); |
| #endif |
| } |
| |
| // |
| // True for the first lane of the subgroup. With a subgroup size of 1 |
| // (SIMD) the only lane is always the first one. |
| // |
| |
| static |
| bool |
| skc_is_lane_first() |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1) |
|   // SIMT |
|   return get_sub_group_local_id() == 0; |
| #else |
|   // SIMD |
|   return true; |
| #endif |
| } |
| |
| // |
| // One-based element position as a float: (1, 2, ..., N) across the |
| // components of a SIMD vector, or (1.0f + lane id) on SIMT. |
| // |
| |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_delta_offset() |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1 ) |
|   // SIMT |
|   return 1.0f + get_sub_group_local_id(); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
|   // SIMD, scalar |
|   return 1; |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) |
|   return (SKC_RASTERIZE_FLOAT)( 1, 2 ); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) |
|   return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 ); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) |
|   return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 ); |
| #elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) |
|   return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ); |
| #endif |
| } |
| |
| // |
| // Non-zero when the predicate holds for any element: any component of |
| // the vector on SIMD, any lane of the subgroup on SIMT. |
| // |
| |
| static |
| int |
| skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p) |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1 ) |
|   // SIMT |
|   return sub_group_any(p); |
| #else |
|   // SIMD |
|   return any(p); |
| #endif |
| } |
| |
| // |
| // Advance to the next subblock of path segment data. |
| // |
| // 'id' is stepped by one. While it stays inside the current block that |
| // is all. When it crosses a block boundary, 'nodeword' is stepped to |
| // the next node word and the block id stored there becomes the new |
| // 'id'. If the stepped 'nodeword' is the last word of its block, the |
| // tagged id stored in that word is followed first: 'nodeword' becomes |
| // that id times SKC_DEVICE_SUBBLOCK_WORDS. |
| // |
| |
| #define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK) |
| |
| void |
| skc_segment_next(__global union skc_bp_elem * const bp_elems, |
|                  skc_uint * const nodeword, |
|                  skc_block_id_t * const id) |
| { |
|   // step to the next subblock -- done if still inside the same block |
|   *id += 1; |
| |
|   if ((*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) != 0) |
|     return; |
| |
|   // block boundary crossed: move on to the next node word |
|   *nodeword += 1; |
| |
|   // the last word of a block redirects to another block |
|   if (SKC_PATH_NODEWORD_IS_LAST(*nodeword)) |
|     { |
|       *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS; |
|     } |
| |
|   // pick up the block id stored at the node word |
|   skc_tagged_block_id_t const next_tag = bp_elems[*nodeword].tag_id; |
| |
|   *id = SKC_TAGGED_BLOCK_ID_GET_ID(next_tag); |
| } |
| |
| // |
| // Euclidean length of (x,y) computed with native_sqrt(). |
| // |
| |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y) |
| { |
|   SKC_RASTERIZE_FLOAT const length_squared = x * x + y * y; |
| |
|   return native_sqrt(length_squared); |
| } |
| |
| // Wang's Formula (1985) -- constants and helpers used to estimate how many line segments a curve needs. |
| // Epsilon is the pixel resolution scaled by the X subpixel resolution; the cubic and quadratic factors are |
| // (3*2)/(8*epsilon) and 2/(8*epsilon). Length and square root map to the native_sqrt() based helpers. |
| |
| #define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned |
| |
| #define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32) |
| |
| #define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON)) |
| #define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON)) |
| |
| #define SKC_WANG_LENGTH(x,y) skc_native_length(x,y) |
| #define SKC_WANG_SQRT(x) native_sqrt(x) |
| |
| // |
| // Wang's formula for a cubic with control points t0..t3. |
| // |
| // Returns the number of evenly spaced (in the parametric sense) line |
| // segments that are guaranteed to be within "epsilon" error of the |
| // curve. Callers take multiples of the reciprocal of this number so |
| // that the segmentation can be distributed across the subgroup. |
| // |
| // The result is a whole number, never less than 1.0. Even when all of |
| // the control points are coincident the max(1.0f) turns the curve into |
| // a line of 1 segment. That is intentional: cubics are converted to |
| // lines as easily as possible and *then* lines that are horizontal or |
| // zero length are culled. |
| // |
| // This can probably be slightly optimized per architecture but it is |
| // straight-line unpredicated code and unlikely to be a hotspot. |
| // |
| |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, |
|                         SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, |
|                         SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y, |
|                         SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y) |
| { |
|   // larger magnitude of the two second differences, per axis |
|   SKC_RASTERIZE_FLOAT const ddx = max(fabs(t2x - 2.0f * t1x + t0x), |
|                                       fabs(t3x - 2.0f * t2x + t1x)); |
|   SKC_RASTERIZE_FLOAT const ddy = max(fabs(t2y - 2.0f * t1y + t0y), |
|                                       fabs(t3y - 2.0f * t2y + t1y)); |
| |
|   // segment estimate, rounded up and clamped to at least one segment |
|   return max(1.0f, |
|              ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC * SKC_WANG_LENGTH(ddx,ddy)))); |
| } |
| |
| // |
| // Wang's formula for a quadratic with control points t0..t2: the |
| // segment estimate is derived from the length of the single second |
| // difference, rounded up and clamped to at least one segment. |
| // |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, |
|                             SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, |
|                             SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y) |
| { |
|   // second difference of the control points, per axis |
|   SKC_RASTERIZE_FLOAT const ddx = t2x - 2.0f * t1x + t0x; |
|   SKC_RASTERIZE_FLOAT const ddy = t2y - 2.0f * t1y + t0y; |
| |
|   return max(1.0f, |
|              ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * SKC_WANG_LENGTH(ddx,ddy)))); |
| } |
| |
| // |
| // rational curves |
| // |
| // Placeholder for rational cubics: takes no control points and always |
| // yields 0.0f. |
| // |
| |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_wangs_formula_cubic_rat() |
| { |
|   SKC_RASTERIZE_FLOAT const segments = 0.0f; |
| |
|   return segments; |
| } |
| |
| // |
| // Placeholder for rational quadratics: takes no control points and |
| // always yields 0.0f. |
| // |
| static |
| SKC_RASTERIZE_FLOAT |
| skc_wangs_formula_quad_rat() |
| { |
|   SKC_RASTERIZE_FLOAT const segments = 0.0f; |
| |
|   return segments; |
| } |
| |
| // |
| // Flush any work-in-progress blocks and return unused block ids. |
| // |
| // Three steps, in order: |
| // |
| //   1. every bin with a non-zero count has its TTSB words written to the |
| //      block pool subblock named by the bin's id (one word per lane) |
| //   2. block ids at or beyond the blocks_next cursor are written back |
| //      to bp_ids at an offset taken from the pool's "writes" counter |
| //   3. the first sk_v_next keys of the key vector are written to |
| //      sk_extent at an offset taken from the cohort's key counter |
| // |
| // In steps 2 and 3 only lane 0 performs the atomic add; the result is |
| // broadcast to the other lanes. |
| // |
| |
| static |
| void |
| skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
|              __global union skc_bp_elem * const bp_elems, |
|              __global uint * const bp_ids, |
|              skc_uint const bp_mask, |
|              __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
|              skc_block_id_v_t * const blocks, |
|              skc_uint const blocks_next, |
|              skc_ttsk_v_t * const sk_v, |
|              skc_uint const sk_v_next, |
|              __global skc_ttsk_s_t * const sk_extent, |
|              __local struct skc_subgroup_smem volatile * const smem) |
| { |
|   // |
|   // 1. flush non-empty bins |
|   // |
|   // FIXME -- accelerate this iteration/search with a subgroup operation |
|   // FIXME -- vectorize with vstoreN() |
|   // |
|   for (skc_uint bin=0; bin<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; bin++) |
|     { |
|       // nothing accumulated in this bin |
|       if (smem->bin.aN.count[bin] == 0) |
|         continue; |
| |
|       skc_block_id_v_t const bin_id = smem->bin.aN.id[bin]; |
|       skc_uint const dst = bin_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); |
|       skc_uint const word = smem->bin.aN.ttsb[bin][skc_subgroup_lane()]; |
| #if 0 |
|       printf("???????? : [ %10u = %10u : %08X ]\n",bin_id,dst,word); |
| #endif |
|       bp_elems[dst].u32 = word; |
|     } |
| |
|   // |
|   // 2. return remaining block ids back to the pool |
|   // |
|   skc_uint const unused = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next; |
| |
|   if (unused > 0) |
|     { |
|       skc_uint ring_base = 0; |
| |
|       if (skc_subgroup_lane() == 0) |
|         { |
|           ring_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,unused); |
| |
| #if 0 |
|           printf("r-: %8u + %u\n",ring_base,unused); |
| #endif |
|         } |
| |
|       // lanes at or past the cursor map onto consecutive ring slots |
|       skc_uint const ring_slot = (sub_group_broadcast(ring_base,0) + skc_subgroup_lane() - blocks_next) & bp_mask; |
| |
|       if (skc_subgroup_lane() >= blocks_next) |
|         { |
|           bp_ids[ring_slot] = *blocks; |
|         } |
|     } |
| |
|   // |
|   // 3. flush work-in-progress ryx keys |
|   // |
|   if (sk_v_next > 0) |
|     { |
|       skc_uint extent_base = 0; |
| |
|       if (skc_subgroup_lane() == 0) |
|         { |
|           extent_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE |
|             (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next); |
| #if 0 |
|           printf("* %u\n",extent_base); |
| #endif |
|         } |
| |
|       skc_uint const extent_idx = sub_group_broadcast(extent_base,0) + skc_subgroup_lane(); |
| |
|       // only the lanes holding a recorded key store it |
|       if (skc_subgroup_lane() < sk_v_next) |
|         { |
|           sk_extent[extent_idx] = *sk_v; |
|         } |
|     } |
| } |
| |
| // skc_flush: eject the bins that colliding lanes need and re-seed them. |
| // If there are lanes that were unable to append to a bin because |
| // their hashes collided with a bin's current ryx key then those bins |
| // must be ejected. |
| // |
| // Note that we do not eject "full" bins because lazily waiting for a |
| // collision results in simpler code. |
| // "hash" selects the bin, "yx" is the lane's new key and "is_collision" marks the lanes that still need a bin. |
| |
| static |
| void |
| skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| skc_block_id_t * const subblocks, |
| skc_block_id_v_t * const blocks, |
| skc_uint * const blocks_next, |
| skc_ttsk_v_t * const sk_v, |
| skc_uint * const sk_v_next, |
| __global skc_ttsk_s_t * const sk_extent, |
| __local struct skc_subgroup_smem volatile * const smem, |
| SKC_RASTERIZE_UINT const hash, |
| SKC_RASTERIZE_UINT const yx, |
| SKC_RASTERIZE_PREDICATE is_collision) // pass by value |
| { |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| // |
| // SIMD |
| // |
| |
| // |
| // FIXME -- this code is now stale with the changes to the |
| // subblock/block allocation strategy |
| // NOTE(review): it uses names that are not declared in this function or in struct skc_subgroup_smem (smem->pool, ttsk_ryx, cmd) |
| |
| // |
| // get local TTSB ID queue count |
| // |
| skc_uint ttsb_id_count = smem->pool.count; // scalar |
| |
| // init hash bit mask |
| skc_uint component_mask = 0; |
| |
| for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++) |
| { |
| // if no collision continue |
| if (((int*)&is_collision)[cc] == 0) |
| continue; |
| |
| uint const winner = ((uint*)&hash)[cc]; |
| uint const component_bit = 1u << winner; |
| |
| // if already processed this hash then continue |
| if (component_mask & component_bit) |
| continue; |
| |
| // update component mask |
| component_mask |= component_bit; |
| |
| // |
| // new winner requires ejecting the old TTSB |
| // |
| if (smem->bin.aN.count[winner] > 0) |
| { |
| skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); |
| |
| bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; |
| } |
| |
| // |
| // ensure there is at least one TTSK and TTSB ID |
| // |
| if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE) |
| { |
| // |
| // update remaining count |
| // |
| ttsb_id_count = 0; |
| |
| // |
| // flush accumulated ttsk_ryx keys |
| // |
| uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE |
| (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count |
| |
| #if 0 |
| printf("# %u\n",idx); |
| #endif |
| |
| for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE) |
| { |
| ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii); |
| } |
| |
| // |
| // allocate more ttsb ids from pool |
| // |
| uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads |
| |
| for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE) |
| smem->pool.aN.id[ii] = bp_ids[id + ii]; |
| } |
| |
| // |
| // invalidate the winning block |
| // |
| |
| // |
| // update bin with winning yx, new ttsb id and zero count |
| // |
| // all lanes are loading/storing from/to the same index |
| // |
| smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID ); |
| smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count]; |
| smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc]; |
| smem->bin.aN.count[winner] = 0; |
| |
| // |
| // update count |
| // |
| ttsb_id_count += 1; |
| } |
| |
| // |
| // save count |
| // |
| smem->pool.count = ttsb_id_count; |
| |
| #else |
| // |
| // SIMT: each loop iteration ejects one bin and retires every colliding lane whose hash matches it |
| // |
| |
| do { |
| // every colliding lane writes its hash to the same shared word, so |
| // only one lane will win! |
| // |
| if (is_collision) |
| smem->subgroup.winner = hash; |
| |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| // |
| // which bin is being ejected? |
| // |
| skc_uint const winner = smem->subgroup.winner; |
| |
| // |
| // which colliding hash is taking over the bin? |
| // |
| SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner); |
| |
| // |
| // all lanes with the same hash will try to store but only one |
| // lane will win |
| // (the shared word now carries the surviving lane's yx key instead of a hash) |
| if (is_winner) |
| smem->subgroup.winner = yx; |
| |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| // |
| // flush this block to the pool |
| // (only when the ejected bin has a non-zero count; one word per lane) |
| if (smem->bin.aN.count[winner] > 0) |
| { |
| skc_block_id_v_t const id = smem->bin.aN.id[winner]; |
| skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); |
| skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; |
| #if 0 |
| printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts); |
| #endif |
| bp_elems[idx].u32 = tts; |
| } |
| |
| // |
| // append new ttsk |
| // (reads the winning yx key back from the shared word and allocates a new block id for it) |
| skc_uint const new_yx = smem->subgroup.winner; |
| skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(), |
| blocks_next, |
| bp_atomics, |
| bp_mask, // pow2 modulo mask for block pool ring |
| bp_ids, |
| cohort_atomics, |
| sk_v, |
| sk_v_next, |
| sk_extent, |
| new_yx); |
| |
| #if 0 |
| if (get_sub_group_local_id() == 0) { |
| printf(">>> %9u\n",new_id); |
| } |
| #endif |
| |
| // |
| // update bin with winning yx, new ttsb id and zero count |
| // (each lane invalidates its own word of the bin's TTSB) |
| smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID; |
| smem->bin.aN.yx [winner] = new_yx; |
| smem->bin.aN.id [winner] = new_id; |
| smem->bin.aN.count[winner] = 0; |
| |
| // |
| // remove all lanes matching this hash |
| // |
| is_collision = is_collision && !is_winner; |
| |
| // |
| // exit if nothing left to do |
| // |
| } while (sub_group_any(is_collision)); |
| |
| #endif |
| } |
| |
| // skc_scatter_scan_max: a lane (SIMT) or vector component (SIMD) with iss > 0 and ess < SKC_RASTERIZE_ELEMS_PER_SUBGROUP stores its own index at scratch slot max(ess,0); |
| // an inclusive max scan over the zero-initialized scratch values then gives every slot the largest index stored at or before it. |
| // Returns that scanned value, which callers in this file use as the source lane/component for their shuffles. |
static |
| SKC_RASTERIZE_UINT |
| skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem, |
| SKC_RASTERIZE_FLOAT const iss, |
| SKC_RASTERIZE_FLOAT const ess) |
| { |
| // iss/ess are the caller's inclusive/exclusive prefix sums of per-lane work counts: |
| // prefix sums determine which lanes we're going to work on next |
| // (a negative ess is clamped to slot 0 when computing scratch_idx) |
| SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP); |
| SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f)); |
| |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| // |
| // SIMD |
| // |
| #ifdef SKC_RASTERIZE_SIMD_USES_SMEM |
| // |
| // SIMD APPROACH 1: SIMT'ISH |
| // |
| |
| // zero the volatile smem scratchpad using vector syntax |
| smem->subgroup.vN.scratch[0] = ( 0 ); |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) \ |
| if (is_scratch_store C) \ |
| smem->subgroup.aN.scratch[scratch_idx C] = I; |
| |
| SKC_RASTERIZE_VECTOR_EXPAND(); |
| |
| // propagate lanes to right using max scan |
| SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0]; |
| SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); |
| |
| #else |
| // |
| // SIMD APPROACH 2: SCALAR'ISH -- scatter into the components of "source", then run a sequential running max |
| // |
| |
| SKC_RASTERIZE_UINT source = ( 0 ); |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) \ |
| if (is_scratch_store C) \ |
| ((uint *)&source)[scratch_idx C] = I; |
| |
| SKC_RASTERIZE_VECTOR_EXPAND(); |
| |
| for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++) |
| ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]); |
| #endif |
| |
| #else |
| // |
| // SIMT |
| // |
| |
| // |
| // each lane zeroes its own slot of the volatile smem scratchpad |
| // NOTE(review): no barrier separates the zero, scatter and read-back below -- presumably relies on subgroup lockstep; confirm |
| smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 ); |
| |
| // |
| // store source lane at starting lane |
| // |
| if (is_scratch_store) |
| smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane(); |
| |
| // |
| // propagate lanes to right using max scan |
| // |
| SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()]; |
| SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); |
| #endif |
| |
| return source; |
| } |
| |
| // skc_sliver: cut each lane's line (l0x,l0y)-(l1x,l1y) into 1-pixel high slivers (Y-SLIVERING) and then cut those into vertical or 1-pixel wide slivers (X-SLIVERING), |
| // pack every non-horizontal sliver into a "tts" word and append it to the smem bin selected by a hash of its tile coordinates. |
| // While any lane is still active after an append attempt, skc_flush() is called with the lane's hash/yx and the append is retried. |
| |
| static |
| void |
| skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| skc_block_id_t * const subblocks, |
| skc_block_id_v_t * const blocks, |
| skc_uint * const blocks_next, |
| skc_ttsk_v_t * const sk_v, |
| skc_uint * const sk_v_next, |
| __global skc_ttsk_s_t * const sk_extent, |
| __local struct skc_subgroup_smem volatile * const smem, |
| SKC_RASTERIZE_FLOAT const l0x, |
| SKC_RASTERIZE_FLOAT const l0y, |
| SKC_RASTERIZE_FLOAT const l1x, |
| SKC_RASTERIZE_FLOAT const l1y) |
| { |
| // |
| // Y-SLIVERING |
| // ----------- |
| // |
| // immediately sliver all multi-pixel lines into 1-pixel high |
| // lines |
| // |
| // note this implicitly squelches horizontal lines |
| // |
| // there is another test for horizontal lines after x-slivering |
| // is complete |
| // |
| |
| // |
| // will we need to flip the sign of y_delta ? |
| // (dy_xor is the float sign bit when the line runs in decreasing y) |
| SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y); |
| SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000; |
| |
| // |
| // save 1/dy |
| // |
| SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y); |
| |
| // |
| // how many non-horizontal subpixel y-axis slivers are there? |
| // |
| SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); |
| SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); |
| SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max; |
| SKC_RASTERIZE_FLOAT y_segs = y_max - y_min; |
| |
| // |
| // inclusive subgroup scan of y_segs |
| // |
| SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs); |
| SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs; |
| float y_rem = skc_subgroup_last_float(y_iss); |
| |
| // |
| // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails |
| // |
| if (y_segs == 0.0f) |
| y_iss = 0.0f; |
| |
| #if 0 |
| printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem); |
| #endif |
| |
| // |
| // these values don't matter on first iteration |
| // (they only seed the "previous" argument of skc_subgroup_shuffle_up_1 below) |
| SKC_RASTERIZE_FLOAT n1x_prev = 0; |
| SKC_RASTERIZE_FLOAT n1y_prev = 0; |
| |
| // |
| // loop until done |
| // |
| while (y_rem > 0.0f) |
| { |
| // |
| // distribute work across lanes |
| // |
| SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess); |
| |
| // |
| // get the endpoints of the line held by lane y_source |
| // |
| SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source); |
| SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source); |
| SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source); |
| SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source); |
| |
| // |
| // every lane will create a 1 pixel tall line "sliver" |
| // |
| // FIXME -- this gets expanded on SIMD |
| // |
| // if y_delta == 1 then this is the first sliver of the source line |
| // if y_delta >= y_count then this is the last sliver of the source line |
| // |
| SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source); |
| SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source); |
| |
| SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f); |
| SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count); |
| |
| // toggle y_delta sign -- NOTE(review): calls intel_sub_group_shuffle directly rather than the skc_subgroup_shuffle wrapper used elsewhere; confirm this is intended |
| SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source))); |
| |
| // |
| // calculate "right" line segment endpoint |
| // |
| SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP; |
| SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source); |
| SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t)); |
| |
| // |
| // override n1 with the exact endpoint m1 if this is the last sliver |
| // |
| n1y = select(n1y,m1y,is_y_last); |
| n1x = select(n1x,m1x,is_y_last); |
| |
| // |
| // shuffle up "left" line segment endpoint |
| // |
| // NOTE: Intel's shuffle_up is unique with its elegant |
| // "previous" argument so don't get used to it |
| // |
| SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y); |
| SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x); |
| |
| // |
| // override shuffle up if this is the first line segment |
| // |
| n0y = select(n0y,m0y,is_y_first); |
| n0x = select(n0x,m0x,is_y_first); |
| |
| // |
| // save previous right endpoint |
| // |
| n1x_prev = n1x; |
| n1y_prev = n1y; |
| |
| // |
| // decrement by the number of elements handled per pass |
| // |
| y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| |
| #if 0 |
| // |
| // debug |
| // |
| if (n0y != n1y) { |
| printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y); |
| } |
| #endif |
| |
| // |
| // X-SLIVERING |
| // ----------- |
| // |
| // now sliver 1-pixel high lines into either vertical or |
| // 1-pixel wide lines |
| // |
| // save original direction and work with increasing x |
| // |
| SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x); |
| SKC_RASTERIZE_UINT const dx_xor = x_lt ? 0 : 0x80000000; |
| |
| // |
| // save 1/dx |
| // |
| SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x); |
| |
| // |
| // how many subpixel x-axis slivers are there? (clamped to at least one) |
| // |
| SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); |
| SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); |
| SKC_RASTERIZE_FLOAT const x_base = x_lt ? x_min : x_max; |
| SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f); |
| |
| // |
| // inclusive subgroup scan of x_segs |
| // |
| SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs); |
| SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs; |
| float x_rem = skc_subgroup_last_float(x_iss); |
| |
| // |
| // disabled: x_segs is clamped to >= 1.0f above so it is never zero here |
| // |
| //if (x_segs == 0.0f) |
| // x_iss = 0.0f; |
| |
| // |
| // these values don't matter on first iteration |
| // |
| SKC_RASTERIZE_FLOAT p1x_prev = 0; |
| SKC_RASTERIZE_FLOAT p1y_prev = 0; |
| |
| // |
| // loop until done |
| // |
| while (x_rem > 0) |
| { |
| // |
| // distribute work across lanes |
| // |
| SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess); |
| |
| // |
| // get the endpoints of the sliver held by lane x_source |
| // |
| SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source); |
| SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source); |
| SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source); |
| SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source); |
| |
| // |
| // every lane will create a 1 pixel wide line "sliver" |
| // |
| // FIXME -- this gets expanded on SIMD |
| // |
| // if x_delta == 1 then this is the first sliver of the source line |
| // if x_delta >= x_count then this is the last sliver of the source line |
| // |
| SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source); |
| SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source); |
| |
| SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f); |
| SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count); |
| |
| // toggle x_delta sign when the source sliver runs in decreasing x (dx_xor is the float sign bit) |
| SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source))); |
| |
| // |
| // calculate "right" line segment endpoint |
| // |
| SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP; |
| SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source); |
| SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t)); |
| |
| // |
| // override p1 with the exact endpoint o1 if this is the last sliver |
| // |
| p1x = select(p1x,o1x,is_x_last); |
| p1y = select(p1y,o1y,is_x_last); |
| |
| // |
| // shuffle up "left" line segment endpoint |
| // |
| // NOTE: Intel's shuffle_up is unique with its elegant |
| // "previous" argument so don't get used to it |
| // |
| SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x); |
| SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y); |
| |
| // |
| // override shuffle up if this is the first line segment |
| // |
| p0x = select(p0x,o0x,is_x_first); |
| p0y = select(p0y,o0y,is_x_first); |
| |
| // |
| // save previous right endpoint |
| // |
| p1x_prev = p1x; |
| p1y_prev = p1y; |
| |
| // |
| // decrement by the number of elements handled per pass |
| // |
| x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| |
| // |
| // only non-horizontal subpixel lines are valid |
| // |
| SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y); |
| |
| // |
| // if no lanes are active then continue |
| // |
| // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY |
| // IMPACTS PERFORMANCE (+12% ?) |
| // |
| // IT SHOULDN'T !!! |
| // |
| #if 0 |
| if (!skc_subgroup_any(is_active)) |
| continue; |
| #endif |
| |
| // |
| // Option 1: use SLM for explicitly managed coalesced stores |
| // |
| // 1. which tile does this line belong? |
| // 2. hash tile coordinates |
| // 3. lookup hash |
| // 4. if tile matches then SLM append keys |
| // 5. if tile doesn't match |
| // a. flush |
| // b. create new TTSK_RYX |
| // c. obtain TTSB block from pool |
| // d. goto 3. |
| // |
| |
| // |
| // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores |
| // |
| // 1. which tile does this line belong? |
| // 2. hash tile coordinates |
| // 3. lookup hash |
| // 4. if tile matches then GMEM append keys |
| // 5. if tile doesn't match |
| // a. flush (and invalidate empty elems) |
| // b. create new TTSK_RYX |
| // c. obtain TTSB block from pool |
| // d. goto 3. |
| // |
| |
| // |
| // The virtual rasterization surface is very large and |
| // signed: +/- ~64K-256K, depending on the architecture. |
| // |
| // Rasters must be clipped to the virtual surface and, |
| // optionally, clipped even further on a per raster |
| // basis. |
| // |
| |
| // |
| // Clip to the per-raster clip |
| // |
| |
| /* |
| |
| CLIP HERE |
| |
| */ |
| |
| // |
| // Hash the tile coordinates |
| // |
| // This table lists nominal values for each architecture. |
| // We want to choose values that naturally fit the |
| // "width" of the architecture. |
| // |
| // SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS |
| // ---- ------- ---- --------- -------- --------- |
| // 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ? |
| // 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX* |
| // 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,? |
| // 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN* |
| // 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon |
| // |
| // NOTE: When possible, bias the hash toward using more y |
| // bits because of: |
| // |
| // 1. the 90 degree counter-clockwise rotation that we put |
| // in place to offset the render-time clockwise |
| // rotation |
| // |
| // 2. the likely presence of left-to-right or |
| // right-to-left glyphs. |
| // |
| // For power-of-two bins, the hash is easy. |
| // |
| // For non-power-of-two, we may want to either implement a |
| // fast mod (compiler should do this for us... hahahaha) or |
| // drop down to the next power-of-two. |
| // |
| |
| // |
| // FIXME -- this snarl is not good -- can probably reduce |
| // some of the sign casting but some is there to vectorize a |
| // scalar |
| // |
| SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y); |
| SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y); |
| |
| SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x); |
| SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x); |
| |
| SKC_RASTERIZE_INT const min_y = min(z0y,z1y); |
| SKC_RASTERIZE_INT const max_y = max(z0y,z1y); |
| |
| SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2; |
| |
| SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y; |
| SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y); |
| |
| // subtract 1 from non-negative dy, leave negative dy unchanged, then shift into the top 6 bits: |
| // map [+1,+32] to [ 0,+31] |
| // map [-1,-32] to [-1,-32] |
| // |
| SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26; |
| |
| SKC_RASTERIZE_INT const min_x = min(z0x,z1x); |
| SKC_RASTERIZE_INT const max_x = max(z0x,z1x); |
| SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2; |
| |
| SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X; |
| SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x); |
| |
| SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx; |
| |
| SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) | |
| (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK)); |
| |
| SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF)); |
| |
| #if 0 |
| printf("(%3u, %3u)\n",tile_y,tile_x); |
| #endif |
| |
| #if 0 |
| if (is_active) |
| printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx); |
| #endif |
| |
| // |
| // debug |
| // |
| #if 0 // PRINTF_ENABLE |
| |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) \ |
| if (is_active C) \ |
| printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C); |
| |
| SKC_RASTERIZE_VECTOR_EXPAND(); |
| #else |
| if (is_active) |
| printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash); |
| #endif |
| |
| #endif |
| // |
| // append all active lanes, flushing colliding bins, until none remain active |
| // |
| while (true) |
| { |
| // |
| // either gather load or vector load+shuffle the yx keys |
| // |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx; |
| SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash); |
| #else |
| SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash]; |
| #endif |
| |
| // |
| // does yx for lane match yx for hash? |
| // (inactive lanes compare against SKC_RASTERIZE_YX_INVALID instead of their yx) |
| SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID; |
| SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx); |
| |
| // |
| // OpenCL spec: "When casting a bool to a vector integer |
| // data type, the vector components will be set to -1 |
| // (i.e. all bits set) if the vector bool value is true |
| // and 0 otherwise." |
| // |
| #if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) |
| SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match; |
| #else |
| SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0} |
| #endif |
| // |
| // how many new elements for each matching hash bin? |
| // (each bin owns a SKC_RASTERIZE_TILE_HASH_BIN_BITS-wide count field inside "h") |
| SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS; |
| SKC_RASTERIZE_UINT const h = h_match << h_shl; |
| |
| // |
| // prefix sum all of the bins in parallel |
| // |
| SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h); |
| SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss); |
| |
| // |
| // current bin counts |
| // |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count; |
| SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash); |
| #else |
| SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash]; |
| #endif |
| |
| // |
| // calculate where each cache-hit and in-bounds tts should be stored |
| // |
| SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1; |
| SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur; |
| |
| // |
| // which lanes can append to a matching bin? |
| // |
| SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS); |
| |
| // |
| // scatter append tts elements to bin blocks |
| // |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) |
| // |
| // SIMD |
| // |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) \ |
| if (is_append C) \ |
| { \ |
| smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \ |
| smem->bin.aN.count[hash C] = count_new C; \ |
| } |
| |
| SKC_RASTERIZE_VECTOR_EXPAND(); |
| #else |
| // |
| // SIMT |
| // |
| if (is_append) |
| { |
| smem->bin.aN.ttsb [hash][ttsb_index] = tts; |
| smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS |
| } |
| #endif |
| // |
| // try to keep predicate updates SIMD-friendly and |
| // outside of predicated code paths -- this is not |
| // always how we would normally do things on SIMT but |
| // either approach is acceptable |
| // |
| |
| // |
| // mask off lanes/components that successfully appended |
| // |
| is_active = is_active && !is_append; |
| |
| // |
| // are there any active lanes left? |
| // |
| if (!skc_subgroup_any(is_active)) |
| break; |
| |
| // |
| // If there are active lanes that couldn't be appended to a |
| // bin because their hashes collided with the bin's |
| // current ryx key then those bins must be ejected. |
| // |
| // Note that we do not eject "full" bins because lazily |
| // waiting for a collision results in simpler code. |
| // |
| skc_flush(bp_atomics, |
| bp_elems, |
| bp_ids, |
| bp_mask, |
| cohort_atomics, |
| subblocks, |
| blocks, |
| blocks_next, |
| sk_v, |
| sk_v_next, |
| sk_extent, |
| smem, |
| hash, |
| yx, |
| is_active); |
| } |
| } |
| } |
| } |
| |
| // |
| // INITIALIZE SMEM |
| // |
| // Resets every tile hash bin: the yx key becomes SKC_RASTERIZE_YX_INIT |
| // and the count becomes zero. |
| // |
| // SIMD: each field of the bin table is written with a single assignment. |
| // |
| // SIMT: lane "idx" writes bin "idx". The bin count is compared with |
| // SKC_RASTERIZE_SUBGROUP_SIZE -- the range and stride of "idx" -- to |
| // pick a guard (fewer bins than lanes), a strided loop (more bins than |
| // lanes) or an unconditional store (equal). Comparing with |
| // SKC_RASTERIZE_ELEMS_PER_SUBGROUP instead would leave bins unwritten |
| // whenever SKC_RASTERIZE_VECTOR_SIZE > 1. |
| // |
| // Note that SIMD/SIMT have nearly the same syntax. |
| // |
| static |
| void |
| skc_smem_init(__local struct skc_subgroup_smem volatile * const smem) |
| { |
| // |
| // initialize smem bins |
| // |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| // |
| // SIMD |
| // |
| smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT ); |
| smem->bin.vN.count = ( 0 ); |
| #else |
| // |
| // SIMT -- one bin per lane, striding by the subgroup size if needed |
| // |
| int idx = skc_subgroup_lane(); |
| |
| #if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_SUBGROUP_SIZE ) |
| if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT) |
| #elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_SUBGROUP_SIZE ) |
| for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE) |
| #endif |
| { |
| smem->bin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT ); |
| smem->bin.aN.count[idx] = ( 0 ); |
| } |
| #endif |
| } |
| |
| // |
| // RASTERIZE CUBIC KERNEL |
| // |
| |
| static |
| void |
| skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __local struct skc_subgroup_smem volatile * const smem, |
| |
| skc_uint * const nodeword, |
| skc_block_id_t * const id, |
| |
| union skc_transform const * const tv, |
| union skc_path_clip const * const cv, |
| skc_uint const cohort) |
| { |
| // |
| // the initial segment idx and segments-per-block constant determine |
| // how many block ids will need to be loaded |
| // |
| SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| // |
| // apply transform |
| // |
| // note that we only care if the end points are rounded to subpixel precision |
| // |
| // FIXME -- transformation is currently affine-only; support perspective later |
| // |
| // the affine transformation requires 8 FMA + 2 ROUND operations |
| // |
| |
| SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx + c0y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy + tv->ty; |
| |
| SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; |
| |
| SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx + c2y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy + tv->ty; |
| |
| SKC_RASTERIZE_FLOAT t3x = c3x * tv->sx + c3y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT t3y = c3x * tv->shy + c3y * tv->sy + tv->ty; |
| |
| // |
| // FIXME -- this is temporary support for projection |
| // |
| bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f); |
| |
| if (!is_affine) |
| { |
| SKC_PROJECT(tv,c0x,c0y,b0x,b0y); |
| SKC_PROJECT(tv,c1x,c1y,t1x,t1y); |
| SKC_PROJECT(tv,c2x,c2y,t2x,t2y); |
| SKC_PROJECT(tv,c3x,c3y,t3x,t3y); |
| } |
| |
| b0x = round(b0x); |
| b0y = round(b0y); |
| |
| t3x = round(t3x); |
| t3y = round(t3y); |
| |
| // |
| // |
| // |
| #if PRINTF_ENABLE |
| |
| #if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) \ |
| printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \ |
| " { %.02f, %.02f }, { %.02f, %.02f } },\n", \ |
| b0x C,b0y C,t1x C,t1y C, \ |
| t2x C,t2y C,t3x C,t3y C); |
| |
| SKC_RASTERIZE_VECTOR_EXPAND(); |
| |
| #else |
| |
| printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n", |
| b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); |
| |
| #endif |
| |
| #endif |
| |
| // |
| // OLD APPROACH |
| // ------------ |
| // |
| // The Spinel CUDA rasterizer was significantly more complex and |
| // performed a few different tasks that are probably best kept |
| // separate. |
| // |
| // The Spinel rasterizer Bezier held 4-element x and y coordinates |
| // in adjacent lanes. This simplified intermingling of single lane |
| // 4-coordinate line segments with two-lane cubic Beziers. |
| // |
| // After transformation of the input segments, the Spinel rasterizer |
| // would test cubics for flatness and, if flat, collapse the |
| // adjacent lanes into a single line lane and an empty lane. |
| // |
| // Any lines would then be appended to a line queue. |
| // |
| // Any cubics would then be subdivided. |
| // |
| // The reclassification process would be repeated. |
| // |
| // NEW APPROACH |
| // ------------ |
| // |
| // Assume we're only working with cubics in this kernel. |
| // |
| // Optimization: if the line segment is a special case -- a cusp, |
| // has 1+ inflections, or a loop -- it might be beneficial to |
| // subdivide the control cage 1+ times in order to separate the |
| // flatter segments from the high-velocity region(s). |
| // |
| // This means we want to split using [a,b] formulation to _directly_ |
| // subdivide producing a new control cage. |
| // |
| // Wang's Formula is still useful even if we subdivide once or twice |
| // as it's so cheap that it might give some useful hints about where |
| // the high-velocity sections of curve reside. |
| // |
| // But it seems like using Wang's and directly flattening to line |
| // segments without any subdivision is good enough for the limited |
| // set of test cases that I've tried. |
| // |
| // So... use Wang's Formula to estimate how many line segment are |
| // required to properly flatten the cubics. |
| // |
| // Then use inclusive/exclusive scans to put all the lanes to work: |
| // |
| // 1. segmenting cubics to line segments |
| // |
| // 2. slivering line segments into 1-pixel high line segments |
| // |
| // 3. slivering 1-pixel high line segments into 1-pixel wide line |
| // segments |
| // |
| // MORE BACKGROUND ON NEW APPROACH |
| // ------------------------------- |
| // |
| // Two options for handling line segments: |
| // |
| // 1. append the line segments onto an SLM array until enough |
| // work has been accrued (Spinel does this) |
| // |
| // 2. immediately sliver the potentially multi-pixel line |
| // segments into subpixel lines |
| // |
| // The advantage of (1) is that it guarantees the slivering |
| // process will, on average, always be emitting a full subgroup |
| // of subpixel lines. |
| // |
| // The advantage of (2) is that it reduces code complexity and |
| // leaves more room for SLM tile bins. The difference between Spinel |
| // and Skia Compute is that Wang's Formula guarantees there will be |
| // a full subgroup of multi-pixel lines unless this is the final |
| // iteration of the warp of multi-pixel lines. |
| // |
| // Note that wider GPU architectures might benefit from (1) and |
| // other work accumulation strategies because it will minimize |
| // partial warp workloads in the final iteration of each stage. It |
| // also minimizes the sunk cost of the uniform control logic steps. |
| // |
| // So let's implement (2) for now... |
| // |
| |
| // |
| // And... begin! |
| // |
| // Estimate how many line segments are in quad/cubic curve. |
| // |
| // Wang's Formula will return zero if the control points are |
| // collinear but we bump it up to 1.0f. |
| // |
| SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); |
| |
| // |
| // if there are free registers then precalculate the reciprocal for |
| // each estimated segments since it will never change |
| // |
| SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); |
| |
| |
| // |
| // inclusive add scan of estimated line segments |
| // exclusive add scan of estimated line segments |
| // total number of estimated line segments |
| // |
| SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); |
| SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; |
| float s_rem = skc_subgroup_last_float(s_iss); // scalar |
| |
| // |
| // Precompute cubic polynomial coefficients from transformed control |
| // cage so we can shuffle them in on each iteration of the outer |
| // loop and then evaluate the polynomial in Horner form. |
| // |
| // | 1 0 0 0 | | c0 | |
| // | | | | |
| // | -3 3 0 0 | | c1 | |
| // B(t) = [ 1 t^1 t^2 t^3 ] | | | | |
| // | 3 -6 3 0 | | c2 | |
| // | | | | |
| // | -1 3 -3 1 | | c3 | |
| // |
| // |
| SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL |
| SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL |
| |
| SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL |
| SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL |
| |
| SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB |
| SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB |
| |
| // |
| // these values don't matter on the first iteration |
| // |
| SKC_RASTERIZE_FLOAT l1x_prev = 0; |
| SKC_RASTERIZE_FLOAT l1y_prev = 0; |
| |
| // |
| // allocate and init in-register TTSK keys |
| // |
| skc_uint sk_v_next = 0; |
| skc_ttsk_v_t sk_v; |
| |
| sk_v.hi = cohort; |
| |
| // |
| // initialize smem |
| // |
| skc_smem_init(smem); |
| |
| // |
| // initialize blocks / subblocks |
| // |
| skc_block_id_v_t blocks; |
| skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; |
| |
| #if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 |
| skc_block_id_t subblocks = 0; |
| #endif |
| |
| // |
| // loop until done |
| // |
| while (s_rem > 0) |
| { |
| // |
| // distribute work across lanes |
| // |
| SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); |
| |
| // |
| // every lane has a fraction to work off of |
| // |
| // FIXME -- this gets expanded on SIMD |
| // |
| // if delta == 1 then this is the first lane |
| // if delta >= count then this is the last lane |
| // |
| SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); |
| SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); |
| |
| SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); |
| SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); |
| |
| // |
| // init parametric t |
| // |
| SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? |
| |
| // |
| // if last then override to a hard 1.0f |
| // |
| s_t = is_s_last ? 1.0f : s_t; |
| |
| // |
| // decrement by subgroup size |
| // |
| s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| |
| // |
| // now every lane knows what to do and the following lines will |
| // pump out up to SUBGROUP_SIZE line segments |
| // |
| // obtain the src vertices through shared or via a shuffle |
| // |
| |
| // |
| // shuffle in the polynomial coefficients from their source lane |
| // |
| SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); |
| SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); |
| |
| SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); |
| SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); |
| |
| SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); |
| SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); |
| |
| SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source); |
| SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source); |
| |
| // |
| // calculate "right" line segment endpoint using Horner form |
| // |
| SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND |
| SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND |
| |
| // |
| // shuffle up "left" line segment endpoint |
| // |
| // NOTE: Intel's shuffle_up is unique with its elegant |
| // "previous" argument so don't get used to it |
| // |
| SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); |
| SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); |
| |
| // |
| // save previous right endpoint |
| // |
| l1x_prev = l1x; |
| l1y_prev = l1y; |
| |
| // |
| // override shuffle up if this is the first line segment |
| // |
| l0x = select(l0x,s0x,is_s_first); |
| l0y = select(l0y,s0y,is_s_first); |
| |
| // |
| // sliver lines |
| // |
| skc_sliver(bp_atomics, |
| bp_elems, |
| bp_ids, |
| bp_mask, |
| cohort_atomics, |
| &subblocks, |
| &blocks, |
| &blocks_next, |
| &sk_v, |
| &sk_v_next, |
| sk_extent, |
| smem, |
| l0x,l0y,l1x,l1y); |
| } |
| |
| // |
| // - flush work-in-progress blocks |
| // - return unused block ids |
| // |
| skc_finalize(bp_atomics, |
| bp_elems, |
| bp_ids, |
| bp_mask, |
| cohort_atomics, |
| &blocks, |
| blocks_next, |
| &sk_v, |
| sk_v_next, |
| sk_extent, |
| smem); |
| } |
| |
| // |
| // RASTERIZE QUAD KERNEL |
| // Reads a quadratic's control points from bp_elems, transforms them, flattens the curve into line segments and hands those to skc_sliver(). |
| |
| static |
| void |
| skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __local struct skc_subgroup_smem volatile * const smem, |
| |
| skc_uint * const nodeword, |
| skc_block_id_t * const id, |
| |
| union skc_transform const * const tv, |
| union skc_path_clip const * const cv, |
| skc_uint const cohort) |
| { |
| // read the six coordinates c0x,c0y,c1x,c1y,c2x,c2y of the quadratic's control points |
| // the initial segment idx and segments-per-block constant determine |
| // how many block ids will need to be loaded |
| // skc_segment_next() receives nodeword/id between reads -- presumably it advances them to the next coordinate (helper not visible here) |
| SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| // |
| // apply transform to all three control points |
| // |
| // note that only the end points (b0, t2) are rounded below; the middle control point t1 is left unrounded |
| // |
| // FIXME -- the affine result is always computed first; projection is patched in afterwards via SKC_PROJECT |
| // |
| // as written: 3 points x 2 coords, each 2 MUL + 2 ADD (which a compiler may contract to FMAs), plus 4 ROUND on the end points |
| // NOTE: the clip argument cv is not referenced anywhere in this function |
| SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx + c0y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy + tv->ty; |
| |
| SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; |
| |
| SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx + c2y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy + tv->ty; |
| |
| // |
| // FIXME -- this is temporary support for projection |
| // the transform is treated as affine only when w0 and w1 are both exactly zero; otherwise SKC_PROJECT is applied to each point |
| bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f); |
| |
| if (!is_affine) |
| { |
| SKC_PROJECT(tv,c0x,c0y,b0x,b0y); |
| SKC_PROJECT(tv,c1x,c1y,t1x,t1y); |
| SKC_PROJECT(tv,c2x,c2y,t2x,t2y); |
| } |
| |
| b0x = round(b0x); |
| b0y = round(b0y); |
| |
| t2x = round(t2x); |
| t2y = round(t2y); |
| |
| // |
| // Estimate how many line segments are needed for the quadratic. |
| // |
| // Wang's Formula will return zero if the control points are |
| // collinear but we bump it up to 1.0f (presumably inside the helper -- no clamp is visible here). |
| // |
| SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y); |
| |
| // |
| // if there are free registers then precalculate the reciprocal of |
| // each segment estimate since it will never change |
| // NOTE(review): this relies on s_segs being nonzero, i.e. on the bump to 1.0f described above |
| SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); |
| |
| |
| // |
| // s_iss: inclusive add scan of estimated line segments |
| // s_ess: exclusive add scan of estimated line segments (inclusive scan minus this lane's own estimate) |
| // s_rem: total number of estimated line segments that remain to be emitted |
| // |
| SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); |
| SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; |
| float s_rem = skc_subgroup_last_float(s_iss); // scalar |
| |
| // |
| // Precompute quadratic polynomial coefficients from control cage so |
| // we can shuffle them in on each iteration of the outer loop and |
| // then evaluate the polynomial in Horner form. |
| // With the transformed points: b1 = 2*(t1 - b0) and b2 = b0 - 2*t1 + t2, so B(t) = b0 + b1*t + b2*t^2. |
| |
| // | 1 0 0 | | c0 | |
| // | | | | |
| // B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 | |
| // | | | | |
| // | 1 -2 1 | | c2 | |
| // |
| // |
| SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL |
| SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL |
| |
| SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD |
| SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD |
| |
| // |
| // these values don't matter on the first iteration |
| // (presumably because the first emitted segment starts a curve and has its left endpoint overridden via is_s_first) |
| SKC_RASTERIZE_FLOAT l1x_prev = 0; |
| SKC_RASTERIZE_FLOAT l1y_prev = 0; |
| |
| // |
| // allocate and init in-register TTSK keys |
| // only sk_v.hi is assigned here (to cohort); the rest of sk_v has no initializer before it is passed to skc_sliver() |
| skc_uint sk_v_next = 0; |
| skc_ttsk_v_t sk_v; |
| |
| sk_v.hi = cohort; |
| |
| // |
| // initialize smem |
| // |
| skc_smem_init(smem); |
| |
| // |
| // initialize blocks / subblocks |
| // NOTE(review): subblocks is declared only under the #if below, yet &subblocks is passed to skc_sliver() unconditionally |
| skc_block_id_v_t blocks; |
| skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; |
| |
| #if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 |
| skc_block_id_t subblocks = 0; |
| #endif |
| |
| // |
| // loop until done |
| // each pass lowers s_rem (and both scans) by SKC_RASTERIZE_ELEMS_PER_SUBGROUP |
| while (s_rem > 0) |
| { |
| // |
| // distribute work across lanes |
| // s_source is used below as the shuffle index that selects which lane's curve data this lane works on |
| SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); |
| |
| // |
| // every lane has a fraction to work off of |
| // |
| // FIXME -- this gets expanded on SIMD |
| // |
| // if delta == 1 then this is the first segment of its source curve |
| // if delta >= count then this is the last segment of its source curve |
| // |
| SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); |
| SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); |
| |
| SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); |
| SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); |
| |
| // |
| // init parametric t |
| // t = delta * (1 / count), using the reciprocal precomputed by the source lane |
| SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? |
| |
| // |
| // if last then override to a hard 1.0f |
| // |
| s_t = is_s_last ? 1.0f : s_t; |
| |
| // |
| // decrement by the number of elements handled per subgroup pass |
| // |
| s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; |
| |
| // |
| // now every lane knows what to do and the following lines will |
| // pump out up to SUBGROUP_SIZE line segments |
| // |
| // obtain the src vertices through shared or via a shuffle |
| // |
| |
| // |
| // shuffle in the polynomial coefficients from their source lane |
| // |
| SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); |
| SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); |
| |
| SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); |
| SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); |
| |
| SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); |
| SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); |
| |
| // |
| // calculate "right" line segment endpoint using Horner form: (s2*t + s1)*t + s0, rounded |
| // |
| SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND |
| SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND |
| |
| // |
| // shuffle up "left" line segment endpoint |
| // |
| // NOTE: Intel's shuffle_up is unique with its elegant |
| // "previous" argument so don't get used to it |
| // |
| SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); |
| SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); |
| |
| // |
| // save previous right endpoint for the next pass of the loop |
| // |
| l1x_prev = l1x; |
| l1y_prev = l1y; |
| |
| // |
| // override shuffle up if this is the first line segment |
| // the first segment starts at s0, i.e. the source lane's rounded start point b0 |
| l0x = select(l0x,s0x,is_s_first); |
| l0y = select(l0y,s0y,is_s_first); |
| |
| // |
| // sliver lines |
| // |
| skc_sliver(bp_atomics, |
| bp_elems, |
| bp_ids, |
| bp_mask, |
| cohort_atomics, |
| &subblocks, |
| &blocks, |
| &blocks_next, |
| &sk_v, |
| &sk_v_next, |
| sk_extent, |
| smem, |
| l0x,l0y,l1x,l1y); |
| } |
| |
| // |
| // - flush work-in-progress blocks |
| // - return unused block ids |
| // |
| skc_finalize(bp_atomics, |
| bp_elems, |
| bp_ids, |
| bp_mask, |
| cohort_atomics, |
| &blocks, |
| blocks_next, |
| &sk_v, |
| sk_v_next, |
| sk_extent, |
| smem); |
| } |
| |
| // |
| // RASTERIZE LINE KERNEL |
| // Reads a line's two end points from bp_elems, transforms and rounds them, then hands the segment to skc_sliver() and flushes with skc_finalize(). |
| |
| static |
| void |
| skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __local struct skc_subgroup_smem volatile * const smem, |
| |
| skc_uint * const nodeword, |
| skc_block_id_t * const id, |
| |
| union skc_transform const * const tv, |
| union skc_path_clip const * const cv, |
| skc_uint const cohort) |
| { |
| // read the four coordinates c0x,c0y,c1x,c1y of the line's end points |
| // the initial segment idx and segments-per-block constant determine |
| // how many block ids will need to be loaded |
| // skc_segment_next() receives nodeword/id between reads -- presumably it advances them to the next coordinate (helper not visible here) |
| SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| skc_segment_next(bp_elems,nodeword,id); |
| |
| SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; |
| |
| #if 0 |
| printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y); |
| #endif |
| |
| // |
| // apply transform to both end points |
| // |
| // note that we only care if the end points are rounded to subpixel precision |
| // |
| // FIXME -- the affine result is always computed first; |
| // FIXME -- projection is patched in afterwards via SKC_PROJECT |
| // |
| // the affine transformation requires 8 FMA + 4 ROUND operations |
| // NOTE: the clip argument cv is not referenced anywhere in this function |
| SKC_RASTERIZE_FLOAT l0x = c0x * tv->sx + c0y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT l0y = c0x * tv->shy + c0y * tv->sy + tv->ty; |
| |
| SKC_RASTERIZE_FLOAT l1x = c1x * tv->sx + c1y * tv->shx + tv->tx; |
| SKC_RASTERIZE_FLOAT l1y = c1x * tv->shy + c1y * tv->sy + tv->ty; |
| |
| // |
| // FIXME -- this is temporary support for projection |
| // the transform is treated as affine only when w0 and w1 are both exactly zero; otherwise SKC_PROJECT is applied to each point |
| bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f); |
| |
| if (!is_affine) { |
| SKC_PROJECT(tv,c0x,c0y,l0x,l0y); |
| SKC_PROJECT(tv,c1x,c1y,l1x,l1y); |
| } |
| |
| l0x = round(l0x); |
| l0y = round(l0y); |
| |
| l1x = round(l1x); |
| l1y = round(l1y); |
| |
| #if 0 |
| printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y); |
| #endif |
| |
| // |
| // allocate and init in-register TTSK keys |
| // only sk_v.hi is assigned here (to cohort); the rest of sk_v has no initializer before it is passed to skc_sliver() |
| skc_uint sk_v_next = 0; |
| skc_ttsk_v_t sk_v; |
| |
| sk_v.hi = cohort; |
| |
| // |
| // initialize smem |
| // |
| skc_smem_init(smem); |
| |
| // |
| // initialize blocks / subblocks |
| // NOTE(review): subblocks is declared only under the #if below, yet &subblocks is passed to skc_sliver() unconditionally |
| skc_block_id_v_t blocks; |
| skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; |
| |
| #if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 |
| skc_block_id_t subblocks = 0; |
| #endif |
| |
| // |
| // sliver lines -- unlike the curve rasterizers there is no flattening loop, skc_sliver() is called exactly once |
| // |
| skc_sliver(bp_atomics, |
| bp_elems, |
| bp_ids, |
| bp_mask, |
| cohort_atomics, |
| &subblocks, |
| &blocks, |
| &blocks_next, |
| &sk_v, |
| &sk_v_next, |
| sk_extent, |
| smem, |
| l0x,l0y,l1x,l1y); |
| |
| // |
| // - flush work-in-progress blocks |
| // - return unused block ids |
| // |
| skc_finalize(bp_atomics, |
| bp_elems, |
| bp_ids, |
| bp_mask, |
| cohort_atomics, |
| &blocks, |
| blocks_next, |
| &sk_v, |
| sk_v_next, |
| sk_extent, |
| smem); |
| } |
| |
| // |
| // |
| // |
| |
| // |
| // Generic rasterization entry point. |
| // |
| // Each subgroup loads one rasterize command, reads the tagged block id |
| // stored at the command's nodeword and dispatches on its tag to the |
| // line, quad or cubic rasterizer. Rational quad and rational cubic |
| // tags (and any unknown tag) are accepted but produce no work. |
| // |
| __kernel |
| SKC_RASTERIZE_KERNEL_ATTRIBS |
| void |
| skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __global float8 const * const transforms, // FIXME -- __constant |
| __global float4 const * const clips, // FIXME -- __constant |
| __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant |
| skc_uint const count) |
| { |
|   // |
|   // This is a subgroup/warp-centric kernel: declare the shared memory |
|   // block used by this subgroup and work out which command in the |
|   // grid this subgroup owns. |
|   // |
|   // TAKE NOTE: the Intel GEN compiler appears to recognize |
|   // get_group_id(0) as a uniform, but the alternative calculation |
|   // used with multiple subgroups per workgroup is not cooperating and |
|   // drives spillage elsewhere. |
|   // |
| #if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) |
|   __local struct skc_subgroup_smem volatile smem[1]; |
|   uint const cmd_index = get_group_id(0); |
| #else |
|   __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; |
|   __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); |
|   uint const cmd_index = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); |
| #endif |
| |
| #if 0 |
|   if (get_sub_group_local_id() == 0) |
|     printf("+cmd_idx = %u\n",cmd_index); |
| #endif |
| |
|   // |
|   // with multi-subgroup workgroups the final workgroup may contain |
|   // surplus subgroups that have no command to process |
|   // |
|   if (cmd_index >= count) { |
|     return; |
|   } |
| |
| #if 0 |
|   if (get_sub_group_local_id() == 0) |
|     printf("-cmd_idx = %u\n",cmd_index); |
| #endif |
| |
|   // |
|   // fetch the one command assigned to this subgroup |
|   // |
|   union skc_cmd_rasterize const cmd = cmds[cmd_index]; |
| |
| #if 0 |
|   if (get_sub_group_local_id() == 0) |
|     printf("[ %u ]< %u, %u, %u, %u >\n", |
|            cmd_index, |
|            cmd.nodeword, |
|            SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd), |
|            SKC_CMD_RASTERIZE_GET_CLIP(cmd), |
|            SKC_CMD_RASTERIZE_GET_COHORT(cmd)); |
| #endif |
| |
|   // |
|   // read the tagged block id at the command's nodeword and split it |
|   // into the path primitive tag and the block id |
|   // |
|   skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing |
|   skc_tagged_block_id_t head = bp_elems[nodeword].tag_id; |
|   skc_block_id_tag path_tag = SKC_TAGGED_BLOCK_ID_GET_TAG(head); |
|   skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(head); |
| |
|   // |
|   // Load the transform, clip and cohort selected by the command. All |
|   // three are uniform across the subgroup. |
|   // |
|   // transform v8 layout: { sx shx tx shy sy ty w0 w1 } |
|   // |
|   // The transform is expected to already be scaled up by |
|   // |
|   // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] |
|   // |
|   // so that coordinates land at subpixel resolution. All that |
|   // matters is that continuity is maintained between path element |
|   // endpoints. |
|   // |
|   // The host is responsible for that scaling, either by initializing |
|   // its transform stack with the subpixel-resolution-scaled identity |
|   // or by scaling each transform before it is loaded by a |
|   // rasterization grid. |
|   // |
|   // FIXME -- horizontal load might be better than this broadcast load |
|   // |
|   union skc_transform const xform = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load |
|   union skc_path_clip const clip = { .f32v4 = clips[SKC_CMD_RASTERIZE_GET_CLIP(cmd)] }; // uniform load |
|   skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted |
| |
|   // |
|   // dispatch on the primitive type of the head block |
|   // |
|   switch (path_tag) |
|     { |
|     case SKC_BLOCK_ID_TAG_PATH_LINE: |
|       skc_rasterize_lines(bp_atomics,bp_elems,bp_ids,bp_mask, |
|                           cohort_atomics,sk_extent, |
|                           smem, |
|                           &nodeword,&id, |
|                           &xform,&clip,cohort); |
|       break; |
| |
|     case SKC_BLOCK_ID_TAG_PATH_QUAD: |
|       skc_rasterize_quads(bp_atomics,bp_elems,bp_ids,bp_mask, |
|                           cohort_atomics,sk_extent, |
|                           smem, |
|                           &nodeword,&id, |
|                           &xform,&clip,cohort); |
|       break; |
| |
|     case SKC_BLOCK_ID_TAG_PATH_CUBIC: |
|       skc_rasterize_cubics(bp_atomics,bp_elems,bp_ids,bp_mask, |
|                            cohort_atomics,sk_extent, |
|                            smem, |
|                            &nodeword,&id, |
|                            &xform,&clip,cohort); |
|       break; |
| |
|     // rational primitives have no rasterizer here -- nothing is emitted |
|     case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD: |
|     case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: |
|     default: |
|       break; |
|     } |
| } |
| |
| // |
| // |
| // |
| |
| // |
| // Line-only rasterization entry point. |
| // |
| // Each subgroup loads one rasterize command and passes the block id |
| // found at the command's nodeword to skc_rasterize_lines(). The tag |
| // of that block is not inspected here. |
| // |
| __kernel |
| SKC_RASTERIZE_KERNEL_ATTRIBS |
| void |
| skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __global float8 const * const transforms, // FIXME -- __constant |
| __global float4 const * const clips, // FIXME -- __constant |
| __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant |
| skc_uint const count) |
| { |
|   // |
|   // This is a subgroup/warp-centric kernel: declare the shared memory |
|   // block used by this subgroup and work out which command in the |
|   // grid this subgroup owns. |
|   // |
|   // TAKE NOTE: the Intel GEN compiler appears to recognize |
|   // get_group_id(0) as a uniform, but the alternative calculation |
|   // used with multiple subgroups per workgroup is not cooperating and |
|   // drives spillage elsewhere. |
|   // |
| #if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) |
|   __local struct skc_subgroup_smem volatile smem[1]; |
|   uint const cmd_index = get_group_id(0); |
| #else |
|   __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; |
|   __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); |
|   uint const cmd_index = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); |
| #endif |
| |
|   // |
|   // with multi-subgroup workgroups the final workgroup may contain |
|   // surplus subgroups that have no command to process |
|   // |
|   if (cmd_index >= count) { |
|     return; |
|   } |
| |
| #if 0 |
|   if (get_sub_group_local_id() == 0) |
|     printf("cmd_idx = %u\n",cmd_index); |
| #endif |
| |
|   // |
|   // fetch the one command assigned to this subgroup |
|   // |
|   union skc_cmd_rasterize const cmd = cmds[cmd_index]; |
| |
|   // |
|   // read the tagged block id at the command's nodeword and keep only |
|   // its block id |
|   // |
|   skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing |
|   skc_tagged_block_id_t head = bp_elems[nodeword].tag_id; |
|   skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(head); |
| |
|   // |
|   // Load the transform, clip and cohort selected by the command. All |
|   // three are uniform across the subgroup. |
|   // |
|   // transform v8 layout: { sx shx tx shy sy ty w0 w1 } |
|   // |
|   // The transform is expected to already be scaled up by |
|   // |
|   // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] |
|   // |
|   // so that coordinates land at subpixel resolution. All that |
|   // matters is that continuity is maintained between path element |
|   // endpoints. |
|   // |
|   // The host is responsible for that scaling, either by initializing |
|   // its transform stack with the subpixel-resolution-scaled identity |
|   // or by scaling each transform before it is loaded by a |
|   // rasterization grid. |
|   // |
|   // FIXME -- horizontal load might be better than this broadcast load |
|   // |
|   union skc_transform const xform = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load |
|   union skc_path_clip const clip = { .f32v4 = clips[SKC_CMD_RASTERIZE_GET_CLIP(cmd)] }; // uniform load |
|   skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted |
| |
|   skc_rasterize_lines(bp_atomics,bp_elems,bp_ids,bp_mask, |
|                       cohort_atomics,sk_extent, |
|                       smem, |
|                       &nodeword,&id, |
|                       &xform,&clip,cohort); |
| } |
| |
| // |
| // |
| // |
| |
| // |
| // |
| // |
| |
| // |
| // Quad-only rasterization entry point. |
| // |
| // Each subgroup loads one rasterize command and passes the block id |
| // found at the command's nodeword to skc_rasterize_quads(). The tag |
| // of that block is not inspected here. |
| // |
| __kernel |
| SKC_RASTERIZE_KERNEL_ATTRIBS |
| void |
| skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __global float8 const * const transforms, // FIXME -- __constant |
| __global float4 const * const clips, // FIXME -- __constant |
| __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant |
| skc_uint const count) |
| { |
|   // |
|   // This is a subgroup/warp-centric kernel: declare the shared memory |
|   // block used by this subgroup and work out which command in the |
|   // grid this subgroup owns. |
|   // |
|   // TAKE NOTE: the Intel GEN compiler appears to recognize |
|   // get_group_id(0) as a uniform, but the alternative calculation |
|   // used with multiple subgroups per workgroup is not cooperating and |
|   // drives spillage elsewhere. |
|   // |
| #if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) |
|   __local struct skc_subgroup_smem volatile smem[1]; |
|   uint const cmd_index = get_group_id(0); |
| #else |
|   __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; |
|   __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); |
|   uint const cmd_index = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); |
| #endif |
| |
|   // |
|   // with multi-subgroup workgroups the final workgroup may contain |
|   // surplus subgroups that have no command to process |
|   // |
|   if (cmd_index >= count) { |
|     return; |
|   } |
| |
| #if 0 |
|   if (get_sub_group_local_id() == 0) |
|     printf("cmd_idx = %u\n",cmd_index); |
| #endif |
| |
|   // |
|   // fetch the one command assigned to this subgroup |
|   // |
|   union skc_cmd_rasterize const cmd = cmds[cmd_index]; |
| |
|   // |
|   // read the tagged block id at the command's nodeword and keep only |
|   // its block id |
|   // |
|   skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing |
|   skc_tagged_block_id_t head = bp_elems[nodeword].tag_id; |
|   skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(head); |
| |
|   // |
|   // Load the transform, clip and cohort selected by the command. All |
|   // three are uniform across the subgroup. |
|   // |
|   // transform v8 layout: { sx shx tx shy sy ty w0 w1 } |
|   // |
|   // The transform is expected to already be scaled up by |
|   // |
|   // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] |
|   // |
|   // so that coordinates land at subpixel resolution. All that |
|   // matters is that continuity is maintained between path element |
|   // endpoints. |
|   // |
|   // The host is responsible for that scaling, either by initializing |
|   // its transform stack with the subpixel-resolution-scaled identity |
|   // or by scaling each transform before it is loaded by a |
|   // rasterization grid. |
|   // |
|   // FIXME -- horizontal load might be better than this broadcast load |
|   // |
|   union skc_transform const xform = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load |
|   union skc_path_clip const clip = { .f32v4 = clips[SKC_CMD_RASTERIZE_GET_CLIP(cmd)] }; // uniform load |
|   skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted |
| |
|   skc_rasterize_quads(bp_atomics,bp_elems,bp_ids,bp_mask, |
|                       cohort_atomics,sk_extent, |
|                       smem, |
|                       &nodeword,&id, |
|                       &xform,&clip,cohort); |
| } |
| |
| // |
| // |
| // |
| |
| // |
| // Cubic-only rasterization entry point. |
| // |
| // Each subgroup loads one rasterize command and passes the block id |
| // found at the command's nodeword to skc_rasterize_cubics(). The tag |
| // of that block is not inspected here. |
| // |
| __kernel |
| SKC_RASTERIZE_KERNEL_ATTRIBS |
| void |
| skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __global float8 const * const transforms, // FIXME -- __constant |
| __global float4 const * const clips, // FIXME -- __constant |
| __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant |
| skc_uint const count) |
| { |
|   // |
|   // This is a subgroup/warp-centric kernel: declare the shared memory |
|   // block used by this subgroup and work out which command in the |
|   // grid this subgroup owns. |
|   // |
|   // TAKE NOTE: the Intel GEN compiler appears to recognize |
|   // get_group_id(0) as a uniform, but the alternative calculation |
|   // used with multiple subgroups per workgroup is not cooperating and |
|   // drives spillage elsewhere. |
|   // |
| #if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) |
|   __local struct skc_subgroup_smem volatile smem[1]; |
|   uint const cmd_index = get_group_id(0); |
| #else |
|   __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; |
|   __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); |
|   uint const cmd_index = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); |
| #endif |
| |
|   // |
|   // with multi-subgroup workgroups the final workgroup may contain |
|   // surplus subgroups that have no command to process |
|   // |
|   if (cmd_index >= count) { |
|     return; |
|   } |
| |
| #if 0 |
|   if (get_sub_group_local_id() == 0) |
|     printf("cmd_idx = %u\n",cmd_index); |
| #endif |
| |
|   // |
|   // fetch the one command assigned to this subgroup |
|   // |
|   union skc_cmd_rasterize const cmd = cmds[cmd_index]; |
| |
|   // |
|   // read the tagged block id at the command's nodeword and keep only |
|   // its block id |
|   // |
|   skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing |
|   skc_tagged_block_id_t head = bp_elems[nodeword].tag_id; |
|   skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(head); |
| |
|   // |
|   // Load the transform, clip and cohort selected by the command. All |
|   // three are uniform across the subgroup. |
|   // |
|   // transform v8 layout: { sx shx tx shy sy ty w0 w1 } |
|   // |
|   // The transform is expected to already be scaled up by |
|   // |
|   // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] |
|   // |
|   // so that coordinates land at subpixel resolution. All that |
|   // matters is that continuity is maintained between path element |
|   // endpoints. |
|   // |
|   // The host is responsible for that scaling, either by initializing |
|   // its transform stack with the subpixel-resolution-scaled identity |
|   // or by scaling each transform before it is loaded by a |
|   // rasterization grid. |
|   // |
|   // FIXME -- horizontal load might be better than this broadcast load |
|   // |
|   union skc_transform const xform = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load |
|   union skc_path_clip const clip = { .f32v4 = clips[SKC_CMD_RASTERIZE_GET_CLIP(cmd)] }; // uniform load |
|   skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted |
| |
|   skc_rasterize_cubics(bp_atomics,bp_elems,bp_ids,bp_mask, |
|                        cohort_atomics,sk_extent, |
|                        smem, |
|                        &nodeword,&id, |
|                        &xform,&clip,cohort); |
| } |
| |
| // |
| // |
| // |
| |
| // |
| // Placeholder kernel for rational quads: it takes the same arguments |
| // as the other rasterization kernels but its body performs no work. |
| // |
| __kernel |
| SKC_RASTERIZE_KERNEL_ATTRIBS |
| void |
| skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __global float8 const * const transforms, // FIXME -- __constant |
| __global float4 const * const clips, // FIXME -- __constant |
| __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant |
| skc_uint const count) |
| { |
|   // intentionally empty -- no rational quad rasterizer is implemented here |
| } |
| |
| // |
| // |
| // |
| |
| // |
| // Placeholder kernel for rational cubics: it takes the same arguments |
| // as the other rasterization kernels but its body performs no work. |
| // |
| __kernel |
| SKC_RASTERIZE_KERNEL_ATTRIBS |
| void |
| skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, |
| __global union skc_bp_elem * const bp_elems, |
| __global uint * const bp_ids, |
| skc_uint const bp_mask, |
| |
| __global SKC_ATOMIC_UINT volatile * const cohort_atomics, |
| __global skc_ttsk_s_t * const sk_extent, |
| |
| __global float8 const * const transforms, // FIXME -- __constant |
| __global float4 const * const clips, // FIXME -- __constant |
| __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant |
| skc_uint const count) |
| { |
|   // intentionally empty -- no rational cubic rasterizer is implemented here |
| } |
| |
| // |
| // |
| // |