| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include "tile.h" |
| #include "block.h" |
| #include "styling_types.h" |
| #include "atomic_cl.h" |
| #include "kernel_cl_12.h" |
| |
| // |
| // |
| // |
| |
// One less than the subgroup size; usable as a lane bit mask when
// SKC_RENDER_SUBGROUP_SIZE is a power of two.
#define SKC_RENDER_SUBGROUP_MASK  ((SKC_RENDER_SUBGROUP_SIZE) - 1)
| |
| // |
| // |
| // |
| |
| #if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1() |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0 |
| |
| #elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 ) |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2() |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1 |
| |
| #elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 ) |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4() |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3 |
| |
| #elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 ) |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8() |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7 |
| |
| #elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16) |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16() |
| #define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15 |
| #endif |
| |
//
// tile state flag bits
//
// Each enumerator is a distinct single bit so flags can be OR'ed
// together.
//

typedef enum skc_tile_flags_e {

  // FLUSH
  SKC_TILE_FLAGS_FLUSH_FINALIZE = 1 << 0, // 0x00000001
  SKC_TILE_FLAGS_FLUSH_UNWIND   = 1 << 1, // 0x00000002
  SKC_TILE_FLAGS_FLUSH_COMPLETE = 1 << 2, // 0x00000004

  // OPACITY
  SKC_TILE_FLAGS_SCATTER_SKIP   = 1 << 3, // 0x00000008

  //
  // Note: the opacity test that skips scattering is expected to become
  // a programmable option: compositing/blending may run back-to-front
  // and/or use group blend rules that ignore opacity.
  //
  // Those decisions should be encoded in styling commands and, as far
  // as possible, kept out of the final group/layer styling traversal
  // render loop.
  //

} skc_tile_flags_e;
| |
| // |
| // COVER -- assumes availability of either fp16 or fp32 |
| // |
| |
| union skc_tile_cover |
| { |
| struct { |
| SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH]; |
| } aN; |
| |
| #ifdef SKC_RENDER_TILE_COVER_VECTOR |
| struct { |
| SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT]; |
| } vN; |
| #endif |
| }; |
| |
| // |
| // COLOR -- assumes availability of either fp16 or fp32 |
| // |
| |
| union skc_tile_color |
| { |
| union { |
| struct { |
| SKC_RENDER_TILE_COLOR r; |
| SKC_RENDER_TILE_COLOR g; |
| SKC_RENDER_TILE_COLOR b; |
| SKC_RENDER_TILE_COLOR a; |
| } rgba[SKC_TILE_WIDTH]; |
| } aN; |
| |
| #ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED |
| union { |
| SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH]; |
| } iN; |
| #endif |
| |
| #ifdef SKC_RENDER_TILE_COLOR_VECTOR |
| union { |
| SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT]; |
| } vN; |
| #endif |
| |
| struct { |
| union { |
| struct { |
| SKC_RENDER_TILE_COLOR r; |
| SKC_RENDER_TILE_COLOR g; |
| }; |
| SKC_RENDER_GRADIENT_FLOAT distance; |
| }; |
| union { |
| struct { |
| SKC_RENDER_TILE_COLOR b; |
| SKC_RENDER_TILE_COLOR a; |
| }; |
| SKC_RENDER_GRADIENT_FLOAT stoplerp; |
| }; |
| } grad[SKC_TILE_WIDTH]; |
| }; |
| |
//
// SHARED MEMORY STATE
//

// Number of area words for one tile: (width + 1) columns of height rows.
#define SKC_RENDER_TILE_SMEM_WORDS  ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)

// Bytes of the int-sized area array divided evenly across the subgroup.
#define SKC_RENDER_WIDE_AA_BYTES    (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)

// Number of SKC_RENDER_WIDE_AA elements that fit in that per-lane share.
#define SKC_RENDER_WIDE_AA_WIDTH    (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))
| |
| // |
| // |
| // |
| |
| union skc_subgroup_smem |
| { |
| // |
| // The tiles are stored in column-major / height-major order |
| // |
| // The final column is a guard column that is OK to write to but |
| // will never be read. It simplifies the TTSB scatter but could be |
| // predicated if SMEM is really at a premium. |
| // |
| #if ( SKC_RENDER_SUBGROUP_SIZE > 1 ) |
| struct { |
| SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] |
| } atomic; |
| #endif |
| |
| struct { |
| int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] |
| } aN; |
| |
| struct { // assumption is that height = subgroup |
| SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE]; |
| } vN; |
| |
| struct { // assumption is that height = subgroup |
| SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE]; |
| } wide; |
| |
| union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; |
| |
| half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; |
| |
| #if 0 |
| // |
| // SPILL TO GMEM |
| // |
| #if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0) |
| struct { |
| |
| #if (SKC_REGS_COLOR_S > 0) |
| union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; |
| #endif |
| |
| #if (SKC_REGS_COVER_S > 0) |
| union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; |
| #endif |
| |
| } regs; |
| #endif |
| // |
| // |
| // |
| #endif |
| }; |
| |
| // |
| // |
| // |
| |
//
// skc_subgroup_lane() yields this work item's lane within its subgroup.
// With a subgroup size of 1 it is the constant 0.
//
#if ( SKC_RENDER_SUBGROUP_SIZE != 1 )

#define skc_subgroup_lane()  get_sub_group_local_id()

#else

#define skc_subgroup_lane()  0

#endif
| |
| // |
| // |
| // |
| |
| typedef skc_uint skc_ttsk_lo_t; |
| typedef skc_uint skc_ttsk_hi_t; |
| |
| typedef skc_uint skc_ttpk_lo_t; |
| typedef skc_uint skc_ttpk_hi_t; |
| |
| typedef skc_uint skc_ttxk_lo_t; |
| typedef skc_uint skc_ttxk_hi_t; |
| |
| typedef skc_uint skc_ttck_lo_t; |
| typedef skc_uint skc_ttck_hi_t; |
| |
| typedef skc_uint2 skc_ttck_t; |
| |
| typedef skc_int skc_ttxb_t; |
| |
| // |
| // TTCK (32-BIT COMPARE) v1: |
| // |
| // 0 63 |
| // | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | |
| // +----------------------+--------+--------+-------+-----+-----+ |
| // | 30 | 1 | 1 | 18 | 7 | 7 | |
| // |
| // |
| // TTCK (32-BIT COMPARE) v2: |
| // |
| // 0 63 |
| // | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | |
| // +----------------------+--------+--------+-------+-----+-----+ |
| // | 30 | 1 | 1 | 15 | 9 | 8 | |
| // |
| // |
| // TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: |
| // |
| // 0 63 |
| // | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | |
| // +----------------------+--------+--------+-------+-----+-----+ |
| // | 27 | 1 | 1 | 18 | 9 | 8 | |
| // |
| |
| static |
| skc_uint |
| skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) |
| { |
| return a & SKC_TTCK_LO_MASK_ID; |
| } |
| |
| static |
| skc_layer_id |
| skc_ttck_get_layer(skc_ttck_t const a) |
| { |
| // |
| // FIXME -- a union with a ulong and a shift down and mask is |
| // probably faster on some architectures |
| // |
| skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); |
| skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; |
| |
| return lo | hi; |
| } |
| |
| static |
| skc_uint |
| skc_ttck_hi_get_x(skc_ttck_hi_t const a) |
| { |
| return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); |
| } |
| |
| static |
| skc_uint |
| skc_ttck_hi_get_y(skc_ttck_hi_t const a) |
| { |
| return a >> SKC_TTCK_HI_OFFSET_Y; |
| } |
| |
| static |
| skc_bool |
| skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b) |
| { |
| skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); |
| skc_uint const hi = (a.hi ^ b.hi); |
| |
| return (lo | hi) == 0; |
| } |
| |
| static |
| skc_bool |
| skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) |
| { |
| return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; |
| } |
| |
| static |
| skc_bool |
| skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) |
| { |
| return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; |
| } |
| |
| // |
| // TILE TRACE SUBPIXEL |
| // |
| // The subpixels are encoded with either absolute tile coordinates |
| // (32-bits) or packed in delta-encoded form. |
| // |
| // For 32-bit subpixel packing of a 32x32 tile: |
| // |
| // A tile X is encoded as: |
| // |
| // TX : 10 : unsigned min(x0,x1) tile subpixel coordinate. |
| // |
| // SX : 6 : unsigned subpixel span from min to max x with range |
| // [0,32]. The original direction is not captured. Would |
| // be nice to capture dx but not necessary right now but |
| // could be in the future. <--- SPARE VALUES AVAILABLE |
| // |
| // A tile Y is encoded as: |
| // |
| // TY : 10 : unsigned min(y0,y1) tile subpixel coordinate. |
| // |
| // DY : 6 : signed subpixel delta y1-y0. The range of delta is |
| // [-32,32] but horizontal lines are not encoded so [1,32] |
| // is mapped to [0,31]. The resulting range [-32,31] fits |
| // in 6 bits. |
| // |
| // TTS: |
| // |
| // 0 31 |
| // | TX | SX | TY | DY | |
| // +-----+------+-----+------+ |
| // | 10 | 6 | 10 | 6 | |
| // |
| |
| static |
| SKC_RENDER_TTS_V_BITFIELD |
| skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a) |
| { |
| // |
| // extract the whole pixel y coordinate |
| // |
| return SKC_BFE(a, |
| SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2, |
| SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2); |
| } |
| |
| static |
| SKC_RENDER_TTS_V_BITFIELD |
| skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a) |
| { |
| // |
| // get the linear array tile index of the pixel |
| // |
| return (((a & SKC_TTS_MASK_TX_PIXEL) |
| |
| #if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2) |
| >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2) |
| #elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2) |
| << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2) |
| #endif |
| |
| ) | skc_tts_get_ty_pixel_v(a)); |
| } |
| |
| #if 0 |
| static |
| skc_ttx_v_s32_t |
| skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) |
| { |
| skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY; |
| |
| return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31)); |
| } |
| #else |
| static |
| SKC_RENDER_TTS_V_BITFIELD |
| skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) |
| { |
| SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY; |
| |
| return dy - (~a >> 31); |
| } |
| #endif |
| |
| static |
| SKC_RENDER_TTS_V_BITFIELD |
| skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a) |
| { |
| return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2); |
| } |
| |
| static |
| SKC_RENDER_TTS_V_BITFIELD |
| skc_tts_get_sx_v(SKC_RENDER_TTS_V const a) |
| { |
| return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem) |
| { |
| // |
| // SIMD / CPU |
| // |
| // & |
| // |
| // SIMT / GPU |
| // |
| // Note that atomic_init() is likely implemented as a simple |
| // assignment so there is no identifiable performance difference on |
| // current targets. |
| // |
| // If such an architecture appears in the future then we'll probably |
| // still want to implement this zero'ing operation as below but |
| // follow with an appropriate fence that occurs before any scatter |
| // operations. |
| // |
| // The baroque expansion below improves performance on Intel GEN by, |
| // presumably, achieving the 64-byte per clock SLM write as well as |
| // minimizing the overall number of SEND() block initializations and |
| // launches. |
| // |
| // Intel GENx has a documented 64 byte per cycle SLM write limit. |
| // So having each lane in an 8 lane subgroup zero-write 8 bytes is |
| // probably a safe bet (Later: benchmarking backs this up!). |
| // |
| // Note there is no reason at this time to unroll this loop. |
| // |
| for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++) |
| smem->wide.area[ii][skc_subgroup_lane()] = ( 0 ); |
| } |
| |
| // |
| // Note this is going to be vectorizable on most architectures. |
| // |
| // The return of the key translation feature might complicate things. |
| // |
| |
| static |
| void |
| skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, |
| __local union skc_subgroup_smem * SKC_RESTRICT const smem, |
| skc_block_id_t const pb_id) |
| { |
| skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); |
| |
| #if ( SKC_TILE_RATIO == 1 ) |
| |
| SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset]; |
| |
| #elif ( SKC_TILE_RATIO == 2 ) |
| |
| SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent); |
| |
| #else |
| |
| #error("tile ratio greater than 2 not supported") |
| |
| #endif |
| |
| // |
| // Note there is no need to use an atomic for this operation on the |
| // current group of target platforms... but this may change if |
| // atomic ops truly go through a different path. |
| // |
| // As noted above, this direct increment is probably faster and can |
| // always be followed by a fence. |
| // |
| // Furthermore, note that the key sorting orders all ttck keys |
| // before ttpk keys. |
| // |
| |
| // |
| // FIXME -- if the SMEM store is wider than bank word count then we |
| // might want to odd-even interleave the TTP values if the target |
| // device can't handle 64-bit stores |
| // |
| |
| // |
| // skipping per-key translation for now |
| // |
| smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1); |
| } |
| |
| // |
| // Note that skc_scatter_ttsb is *not* vectorizable unless the |
| // architecture supports a "scatter-add" capability. All relevant |
| // GPUs support atomic add on shared/local memory and thus support |
| // scatter-add. |
| // |
| |
| static |
| void |
| skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, |
| __local union skc_subgroup_smem * SKC_RESTRICT const smem, |
| skc_block_id_t const sb_id) |
| { |
| skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); |
| |
| SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset]; |
| |
| // |
| // Skipping per-key translation for now |
| // |
| |
| // Index into tile |
| // |
| // The tiles are stored in column-major / height-major order |
| // |
| // The final column is a guard column that is OK to write to but |
| // will never be read. It simplifies the TTSB scatter but could be |
| // predicated if SMEM is really at a premium. |
| // |
| |
| SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v); |
| |
| #if 0 |
| if (tts_v != SKC_TTS_INVALID) |
| printf("(%08X) = %u\n",tts_v,xy_idx); |
| #endif |
| |
| // |
| // adjust subpixel range to max y |
| // |
| // range is stored as [-32,31] and when read [0,31] is mapped to |
| // [1,32] because a dy of 0 is not possible. |
| // |
| // more succinctly: if dy >= 0 then ++dy |
| // |
| SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v); |
| |
| // |
| // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid? |
| // |
| |
| // this "min(x0) * 2 + dx" is equivalent to "x0 + x1" |
| SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v); |
| |
| // Calculate left and right coverage contribution trapezoids |
| SKC_RENDER_TTS_V_BITFIELD const left = dy * widths; |
| SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left; |
| |
| // |
| // Accumulate altitudes and areas |
| // |
| // Optimization: if the device supports an CPU/SIMD vector-add or |
| // GPU/SIMT scatter-add atomic int2 add operation then placing the |
| // ALT and AREA values side-by-side would halve the number of |
| // additions. |
| // |
| #if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) |
| // |
| // CPU/SIMD |
| // |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) \ |
| if (tts_v C != SKC_TTS_INVALID) { \ |
| smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \ |
| smem->aN.area[ xy_idx C] += right C; \ |
| } |
| |
| #else |
| // |
| // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD |
| // |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) \ |
| if (tts_v C != SKC_TTS_INVALID) { \ |
| SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \ |
| SKC_TILE_HEIGHT + xy_idx C, \ |
| left C); \ |
| SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \ |
| right C); \ |
| } |
| #endif |
| |
| SKC_RENDER_TTSB_EXPAND(); |
| } |
| |
//
// Note that 2048.0 can be represented exactly with fp16... fortuitous!
//

// Area value that corresponds to full coverage.
#define SKC_RENDER_FILL_MAX_AREA          (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)

// Twice the full-coverage area.
#define SKC_RENDER_FILL_MAX_AREA_2        (2u * SKC_RENDER_FILL_MAX_AREA)

// Mask used by the even-odd rule: one less than twice the full area.
#define SKC_RENDER_FILL_EVEN_ODD_MASK     (SKC_RENDER_FILL_MAX_AREA_2 - 1)

// Reciprocal of the full-coverage area, in the tile cover type.
#define SKC_RENDER_FILL_MAX_AREA_RCP_F32  (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem, |
| union skc_tile_cover * SKC_RESTRICT const cover, |
| union skc_tile_color * SKC_RESTRICT const color) |
| { |
| SKC_RENDER_ACC_COVER_INT area = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| { |
| area += smem->vN.area[ii][skc_subgroup_lane()]; |
| SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); |
| SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA)); |
| |
| cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32); |
| } |
| } |
| |
| static |
| void |
| skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem, |
| union skc_tile_cover * SKC_RESTRICT const cover, |
| union skc_tile_color * SKC_RESTRICT const color) |
| { |
| SKC_RENDER_ACC_COVER_INT area = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| { |
| area += smem->vN.area[ii][skc_subgroup_lane()]; |
| SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); |
| SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA)); |
| |
| cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32; |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands, |
| uint * SKC_RESTRICT const cmd_next, |
| union skc_tile_color * SKC_RESTRICT const color) |
| { |
| // |
| // rgba = solid fill |
| // |
| __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; |
| |
| *cmd_next += 2; |
| |
| #if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) |
| |
| SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->aN.rgba[ii].r = rg.lo; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->aN.rgba[ii].g = rg.hi; |
| |
| SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->aN.rgba[ii].b = ba.lo; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->aN.rgba[ii].a = ba.hi; |
| |
| #else |
| |
| SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); |
| SKC_RENDER_TILE_COLOR const r = rg.lo; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) |
| color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r); |
| |
| SKC_RENDER_TILE_COLOR const g = rg.hi; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) |
| color->vN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g); |
| |
| SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); |
| SKC_RENDER_TILE_COLOR const b = ba.lo; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) |
| color->vN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b); |
| |
| SKC_RENDER_TILE_COLOR const a = ba.hi; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) |
| color->vN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a); |
| |
| #endif |
| } |
| |
//
// SKC_LERP(a,b,t) -- linear interpolation from a (t=0) to b (t=1).
//
// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
//
//   https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
//
// Lerp in two fma/mad ops:
//
//    t * b + ((-t) * a + a)
//
// Note: OpenCL documents mix() as being implemented as:
//
//    a + (b - a) * t
//
// But this may be a native instruction on some devices.  For example,
// on GEN9 there is an LRP "linear interpolation" function but it
// doesn't appear to support half floats.
//

#if 0
#define SKC_LERP(a,b,t)  mix(a,b,t)
#else
#define SKC_LERP(a,b,t)  mad(t,b,mad(-(t),a,a))
#endif
| |
//
// Address space used for gradient data.
//
// CPUs have a mock local address space so copying the gradient header
// is probably not useful.  Just read directly from global.
//

#ifdef SKC_RENDER_GRADIENT_IS_GLOBAL
#define SKC_RENDER_GRADIENT_SPACE  __global
#else
#define SKC_RENDER_GRADIENT_SPACE  __local
#endif
| |
| // |
| // gradient is non-vertical |
| // |
| // removed the vertical (actually, horizontal) special case |
| // |
| |
| static |
| void |
| skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem, |
| __global union skc_styling_cmd const * SKC_RESTRICT const commands, |
| uint * SKC_RESTRICT const cmd_next, |
| union skc_tile_color * SKC_RESTRICT const color, |
| skc_ttck_hi_t const ttck_hi) |
| { |
| // |
| // Where is this tile? |
| // |
| // Note that the gradient is being sampled from pixel centers. |
| // |
| SKC_RENDER_GRADIENT_FLOAT const y = |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) I##.5f P |
| (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) + |
| (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE)); |
| |
| float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH); |
| |
| // |
| // Get starting numerator and denominator |
| // |
| // Note: if gh[0].dx is exactly 0.0f then this is a vertical |
| // gradient and can be handled by a special opcode. |
| // |
| // Note: the mad() ordering is slightly different than the original |
| // CUDA implementation. |
| // |
| union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) }; |
| |
| *cmd_next += 4; |
| |
| float const gv_x_dot = mad(x,gv.dx,gv.p0); |
| SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot); |
| |
| // |
| // Where are columns along gradient vector? |
| // |
| // TODO: Note that the gv_denom isn't multiplied through. |
| // |
| // Please doublecheck this... but I recall that in certain cases |
| // this wipes out some precision and results in minor but noticeable |
| // gradient artifacts. |
| // |
| // All arguments are scalars except gv_numer so a simpler |
| // evaluation might save some flops. |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom; |
| |
| // |
| // is gradient non-repeating, repeating or reflecting? |
| // |
| switch (commands[(*cmd_next)++].u32) |
| { |
| case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING: |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f); |
| break; |
| |
| case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING: |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->grad[ii].distance -= floor(color->grad[ii].distance); |
| break; |
| |
| default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING |
| // |
| // OPTIMIZATION: Can this be done in fewer than ~4 ops? |
| // |
| // Note: OpenCL "rint()" is round-to-nearest-even integer! |
| // |
| // Note: the floor() "round to -inf" op is implemented in the |
| // GEN op 'FRC' so probably don't use trunc() when floor will |
| // suffice. |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| { |
| SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance); |
| color->grad[ii].distance = fabs(dist_abs - rint(dist_abs)); |
| } |
| } |
| |
| // |
| // initialize "stoplerp" for all columns |
| // |
| uint const slope_count = commands[(*cmd_next)++].u32; |
| uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME |
| |
| { |
| float const slope = commands[(*cmd_next)++].f32; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->grad[ii].stoplerp = color->grad[ii].distance * slope; |
| } |
| |
| // |
| // compute stoplerp for remaining stops |
| // |
| for (int jj=1; jj<slope_count; jj++) |
| { |
| float const floor = (float)jj; |
| float const slope = commands[(*cmd_next)++].f32; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp); |
| } |
| |
| // |
| // copy gradient colors to local memory |
| // |
| uint const gd_n = slope_count + 1; |
| |
| #ifndef SKC_RENDER_GRADIENT_IS_GLOBAL |
| // |
| // copy entire gradient descriptor to local memory |
| // |
| for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE) |
| smem->cmds[ii].u32 = commands[*cmd_next + ii].u32; |
| |
| __local half const * const SKC_RESTRICT gc = smem->gc + 0; |
| #else |
| // |
| // prefetch entire gradient header |
| // |
| // no noticeable impact on performance |
| // |
| // prefetch(&commands[*cmd_next].u32,gh_words); |
| // |
| __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0; |
| #endif |
| |
| // |
| // adjust cmd_next so that V1 structure is consumed -- FIXME |
| // |
| *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n); |
| |
| // |
| // lerp between color pair stops |
| // |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| { |
| // |
| // Finally, we have the gradient stop index and the color stop |
| // pair lerp fraction |
| // |
| // Note that if these are vector values then a gather operation |
| // must occur -- there may be platforms (AVX-512?) that can |
| // perform an explicit gather on a vector type but it's not |
| // really expressible in OpenCL except implicitly with a |
| // workgroup of work items. |
| // |
| // *********************** |
| // |
| // FIXME -- USE HERB'S SINGLE FMA LERP |
| // |
| // *********************** |
| // |
| SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp); |
| SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp)); |
| |
| { |
| SKC_RENDER_TILE_COLOR lo, hi; |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) { \ |
| SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \ |
| lo C = cc.lo; \ |
| hi C = cc.hi; \ |
| } |
| |
| SKC_RENDER_SCANLINE_VECTOR_EXPAND(); |
| |
| color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac); |
| } |
| |
| // |
| // |
| // |
| { |
| SKC_RENDER_TILE_COLOR lo, hi; |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) { \ |
| SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \ |
| lo C = cc.lo; \ |
| hi C = cc.hi; \ |
| } |
| |
| SKC_RENDER_SCANLINE_VECTOR_EXPAND(); |
| |
| color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac); |
| } |
| |
| // |
| // |
| // |
| { |
| SKC_RENDER_TILE_COLOR lo, hi; |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) { \ |
| SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \ |
| lo C = cc.lo; \ |
| hi C = cc.hi; \ |
| } |
| |
| SKC_RENDER_SCANLINE_VECTOR_EXPAND(); |
| |
| color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac); |
| } |
| |
| // |
| // |
| // |
| { |
| SKC_RENDER_TILE_COLOR lo, hi; |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) { \ |
| SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \ |
| lo C = cc.lo; \ |
| hi C = cc.hi; \ |
| } |
| |
| SKC_RENDER_SCANLINE_VECTOR_EXPAND(); |
| |
| color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac); |
| } |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc, |
| union skc_tile_cover const * SKC_RESTRICT const cover_wip, |
| union skc_tile_color const * SKC_RESTRICT const color_wip) |
| { |
| // |
| // fralunco = cover.wip * acc.a |
| // |
| // acc.r = fralunco * wip.r + acc.r |
| // acc.g = fralunco * wip.g + acc.g |
| // acc.b = fralunco * wip.b + acc.b |
| // acc.a = -fralunco * wip.a + acc.a |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| { |
| SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a; |
| |
| color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); |
| color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); |
| color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); |
| color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc, |
| union skc_tile_cover const * SKC_RESTRICT const cover_wip, |
| union skc_tile_color const * SKC_RESTRICT const color_wip) |
| { |
| // |
| // cover_min = min(cover.wip,a.acc) |
| // |
| // r.acc = cover_min * r.wip + r.acc |
| // g.acc = cover_min * g.wip + g.acc |
| // b.acc = cover_min * b.wip + b.acc |
| // a.acc = -cover_min * a.wip + a.acc |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| { |
| SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a); |
| |
| color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); |
| color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); |
| color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); |
| color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc, |
| union skc_tile_cover const * SKC_RESTRICT const cover_wip, |
| union skc_tile_color const * SKC_RESTRICT const color_wip) |
| { |
| // |
| // MULTIPLY blend: every accumulator channel is scaled by the |
| // coverage-weighted wip channel of the same column: |
| // |
| // r.acc = r.acc * (cover.wip * r.wip) |
| // g.acc = g.acc * (cover.wip * g.wip) |
| // b.acc = b.acc * (cover.wip * b.wip) |
| // a.acc = a.acc * (cover.wip * a.wip) <-- the stored a.acc is already (1.0 - alpha) |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| { |
| color_acc->aN.rgba[col].r = color_acc->aN.rgba[col].r * (cover_wip->aN.c[col] * color_wip->aN.rgba[col].r); |
| color_acc->aN.rgba[col].g = color_acc->aN.rgba[col].g * (cover_wip->aN.c[col] * color_wip->aN.rgba[col].g); |
| color_acc->aN.rgba[col].b = color_acc->aN.rgba[col].b * (cover_wip->aN.c[col] * color_wip->aN.rgba[col].b); |
| color_acc->aN.rgba[col].a = color_acc->aN.rgba[col].a * (cover_wip->aN.c[col] * color_wip->aN.rgba[col].a); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc, |
| union skc_tile_color * SKC_RESTRICT const color_acc, |
| union skc_tile_cover const * SKC_RESTRICT const cover_wip, |
| union skc_tile_color const * SKC_RESTRICT const color_wip) |
| { |
| // |
| // KNOCKOUT blend: only the part of the wip coverage that isn't |
| // already present in the accumulated coverage contributes: |
| // |
| // visible = (1.0 - cover.acc) * cover.wip |
| // cover.acc = cover.acc + visible |
| // |
| // r.acc = visible * r.wip + r.acc |
| // g.acc = visible * g.wip + g.acc |
| // b.acc = visible * b.wip + b.acc |
| // a.acc = -visible * a.wip + a.acc |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| { |
| // computed from the coverage accumulated *before* this column is updated |
| SKC_RENDER_TILE_COVER const visible = (1 - cover_acc->aN.c[col]) * cover_wip->aN.c[col]; |
| |
| cover_acc->aN.c[col] = cover_acc->aN.c[col] + visible; |
| |
| color_acc->aN.rgba[col].r = mad(+visible,color_wip->aN.rgba[col].r,color_acc->aN.rgba[col].r); |
| color_acc->aN.rgba[col].g = mad(+visible,color_wip->aN.rgba[col].g,color_acc->aN.rgba[col].g); |
| color_acc->aN.rgba[col].b = mad(+visible,color_wip->aN.rgba[col].b,color_acc->aN.rgba[col].b); |
| color_acc->aN.rgba[col].a = mad(-visible,color_wip->aN.rgba[col].a,color_acc->aN.rgba[col].a); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk, |
| union skc_tile_cover const * SKC_RESTRICT const cover_wip) |
| { |
| // |
| // cover.msk = cover.wip |
| // |
| // copies through the vN member when SKC_RENDER_TILE_COVER_VECTOR is |
| // defined and SKC_ARCH_GEN9 is not, otherwise column by column |
| // through the aN member |
| // |
| #if defined( SKC_RENDER_TILE_COVER_VECTOR ) && !defined( SKC_ARCH_GEN9 ) |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ++vv) |
| cover_msk->vN.c[vv] = cover_wip->vN.c[vv]; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| cover_msk->aN.c[col] = cover_wip->aN.c[col]; |
| |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk, |
| union skc_tile_cover const * SKC_RESTRICT const cover_acc) |
| { |
| // |
| // cover.msk = cover.acc |
| // |
| // copies through the vN member when SKC_RENDER_TILE_COVER_VECTOR is |
| // defined and SKC_ARCH_GEN9 is not, otherwise column by column |
| // through the aN member |
| // |
| #if defined( SKC_RENDER_TILE_COVER_VECTOR ) && !defined( SKC_ARCH_GEN9 ) |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ++vv) |
| cover_msk->vN.c[vv] = cover_acc->vN.c[vv]; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| cover_msk->aN.c[col] = cover_acc->aN.c[col]; |
| |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc, |
| union skc_tile_cover const * SKC_RESTRICT const cover_wip) |
| { |
| // |
| // fold the wip coverage into the accumulated coverage: |
| // |
| // cover.acc = (1.0 - cover.acc) * cover.wip + cover.acc |
| // |
| uint col = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| while (col < SKC_TILE_WIDTH) |
| { |
| cover_acc->aN.c[col] = mad(1 - cover_acc->aN.c[col],cover_wip->aN.c[col],cover_acc->aN.c[col]); |
| col += 1; |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip, |
| union skc_tile_cover const * SKC_RESTRICT const cover_msk) |
| { |
| // |
| // scale the wip coverage of every column by the mask: |
| // |
| // cover.wip = cover.wip * cover.msk |
| // |
| uint col = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| while (col < SKC_TILE_WIDTH) |
| { |
| cover_wip->aN.c[col] = cover_wip->aN.c[col] * cover_msk->aN.c[col]; |
| col += 1; |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover) |
| { |
| // |
| // cover.wip = 0 |
| // |
| // SKC_ARCH_GEN9 is not consulted here: the vector form is used |
| // whenever SKC_RENDER_TILE_COVER_VECTOR is defined |
| // |
| #if defined( SKC_RENDER_TILE_COVER_VECTOR ) |
| // |
| // GEN9 compiler underperforms on this |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ++vv) |
| cover->vN.c[vv] = 0; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| cover->aN.c[col] = 0; |
| |
| #endif |
| } |
| |
| static |
| void |
| skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover) |
| { |
| // |
| // cover.acc = 0 |
| // |
| // SKC_ARCH_GEN9 is not consulted here: the vector form is used |
| // whenever SKC_RENDER_TILE_COVER_VECTOR is defined |
| // |
| #if defined( SKC_RENDER_TILE_COVER_VECTOR ) |
| // |
| // GEN9 compiler underperforms on this |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ++vv) |
| cover->vN.c[vv] = 0; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| cover->aN.c[col] = 0; |
| |
| #endif |
| } |
| |
| static |
| void |
| skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover) |
| { |
| // |
| // cover.msk = 0 |
| // |
| // the vector form is only used when SKC_RENDER_TILE_COVER_VECTOR is |
| // defined and SKC_ARCH_GEN9 is not |
| // |
| #if defined( SKC_RENDER_TILE_COVER_VECTOR ) && !defined( SKC_ARCH_GEN9 ) |
| // |
| // GEN9 compiler underperforms on this |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ++vv) |
| cover->vN.c[vv] = 0; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| cover->aN.c[col] = 0; |
| |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover) |
| { |
| // |
| // cover.msk = 1 |
| // |
| // the vector form is only used when SKC_RENDER_TILE_COVER_VECTOR is |
| // defined and SKC_ARCH_GEN9 is not |
| // |
| #if defined( SKC_RENDER_TILE_COVER_VECTOR ) && !defined( SKC_ARCH_GEN9 ) |
| // |
| // GEN9 compiler underperforms on this |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ++vv) |
| cover->vN.c[vv] = SKC_RENDER_TILE_COVER_VECTOR_ONE; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| cover->aN.c[col] = 1; |
| |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover) |
| { |
| // |
| // cover.msk = 1 - cover.msk |
| // |
| // the vector form is only used when SKC_RENDER_TILE_COVER_VECTOR is |
| // defined and SKC_ARCH_GEN9 is not |
| // |
| #if defined( SKC_RENDER_TILE_COVER_VECTOR ) && !defined( SKC_ARCH_GEN9 ) |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ++vv) |
| cover->vN.c[vv] = 1 - cover->vN.c[vv]; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| cover->aN.c[col] = 1 - cover->aN.c[col]; |
| |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color) |
| { |
| // |
| // reset the wip color: r = g = b = 0 and a = 1 in every column |
| // |
| #if defined( SKC_RENDER_TILE_COLOR_VECTOR ) && !defined( SKC_ARCH_GEN9 ) |
| // |
| // DISABLED ON GEN9 -- probably a compiler bug |
| // |
| // one pass per vector component -- presumably these four swizzles |
| // select r, g, b and a in that order, mirroring the scalar branch |
| // |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ++vv) |
| color->vN.rgba[vv].even.even = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ++vv) |
| color->vN.rgba[vv].odd.even = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ++vv) |
| color->vN.rgba[vv].even.odd = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ++vv) |
| color->vN.rgba[vv].odd.odd = 1; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| { |
| color->aN.rgba[col].r = 0; |
| color->aN.rgba[col].g = 0; |
| color->aN.rgba[col].b = 0; |
| color->aN.rgba[col].a = 1; |
| } |
| |
| #endif |
| } |
| |
| static |
| void |
| skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color) |
| { |
| // |
| // reset the accumulated color: r = g = b = 0 and a = 1 in every column |
| // |
| #if defined( SKC_RENDER_TILE_COLOR_VECTOR ) && !defined( SKC_ARCH_GEN9 ) |
| // |
| // DISABLED ON GEN9 -- probably a compiler bug |
| // |
| // one pass per vector component -- presumably these four swizzles |
| // select r, g, b and a in that order, mirroring the scalar branch |
| // |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ++vv) |
| color->vN.rgba[vv].even.even = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ++vv) |
| color->vN.rgba[vv].odd.even = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ++vv) |
| color->vN.rgba[vv].even.odd = 0; |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) |
| for (uint vv=0; vv<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ++vv) |
| color->vN.rgba[vv].odd.odd = 1; |
| |
| #else |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| { |
| color->aN.rgba[col].r = 0; |
| color->aN.rgba[col].g = 0; |
| color->aN.rgba[col].b = 0; |
| color->aN.rgba[col].a = 1; |
| } |
| |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| static |
| bool |
| skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color) |
| { |
| // |
| // returns true when the whole tile is opaque |
| // |
| // front-to-back rendering currently leaves the stored alpha at 0.0f |
| // once a pixel is opaque -- this could be harmonized with a |
| // traditional alpha if rendering in either direction is ever |
| // supported |
| // |
| // hack -- rather than testing each alpha, ADD/MAX/OR all of the |
| // column alphas together and test the result for non-zero |
| // |
| SKC_RENDER_TILE_COLOR alpha_sum = color->aN.rgba[0].a; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) |
| for (uint col=1; col<SKC_TILE_WIDTH; ++col) |
| alpha_sum = alpha_sum + color->aN.rgba[col].a; |
| |
| #if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) |
| // |
| // SIMD |
| // |
| return !any(alpha_sum != ( 0 )); |
| |
| #elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) |
| // |
| // SIMT - scalar per lane |
| // |
| return !sub_group_any(alpha_sum != 0); |
| |
| #else |
| // |
| // SIMT - vector per lane |
| // |
| return !sub_group_any(any(alpha_sum != ( 0 ))); |
| |
| #endif |
| |
| // |
| // TODO: the disabled vector-per-lane alternative below is *not* |
| // believed to be performant: its terse vector-wide test just hides |
| // a series of comparisons and is likely worse than blindly |
| // ADD/MAX/OR'ing all alphas and testing once. |
| // |
| #if 0 |
| // |
| // SIMT - vector per lane |
| // |
| |
| // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1))) |
| for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) |
| { |
| if (sub_group_any(any(color->vN.ba[ii].a != ( 0 )))) |
| return false; |
| } |
| |
| return true; |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands, |
| uint * SKC_RESTRICT const cmd_next, |
| union skc_tile_color * SKC_RESTRICT const color) |
| { |
| // |
| // composite a solid background color under the accumulated color: |
| // |
| // acc.r = acc.a * r + acc.r |
| // acc.g = acc.a * g + acc.g |
| // acc.b = acc.a * b + acc.b |
| // |
| // the background color is read from the command stream at |
| // *cmd_next, which is then advanced past the 2 words consumed |
| // |
| uint const cmd_base = *cmd_next; |
| |
| __global half const * const bg_ptr = commands[cmd_base].f16a2 + 0; |
| |
| *cmd_next = cmd_base + 2; |
| |
| // first pair: red (lo) and green (hi) |
| SKC_RENDER_TILE_COLOR_PAIR const bg_rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,bg_ptr); |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| color->aN.rgba[col].r = mad(color->aN.rgba[col].a,bg_rg.lo,color->aN.rgba[col].r); |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| color->aN.rgba[col].g = mad(color->aN.rgba[col].a,bg_rg.hi,color->aN.rgba[col].g); |
| |
| // second pair: only blue (lo) is used -- acc.a is left untouched |
| SKC_RENDER_TILE_COLOR_PAIR const bg_ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,bg_ptr); |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| color->aN.rgba[col].b = mad(color->aN.rgba[col].a,bg_ba.lo,color->aN.rgba[col].b); |
| } |
| |
| // |
| // |
| // |
| |
| // #define SKC_SURFACE_IS_BUFFER |
| #ifdef SKC_SURFACE_IS_BUFFER |
| |
| static |
| void |
| skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface, |
| skc_uint const surface_pitch, |
| union skc_tile_color const * SKC_RESTRICT const color, |
| skc_ttck_hi_t const ttck_hi) |
| { |
| // |
| // Buffer flavor of the surface composite: packs r, g and b of each |
| // tile column into bits 0-7, 8-15 and 16-23 of a pixel whose top |
| // byte is 0xFF and stores it to the linear surface. |
| // |
| // NEW MAJOR OPTIMIZATION: |
| // |
| // Rotating and rasterizing the original world transform by -90 |
| // degrees and then rendering the scene by +90 degrees enables the |
| // final surface composite to be performed in perfectly coalesced |
| // wide transactions. |
| // |
| // For this reason, linear access to the framebuffer is preferred. |
| // |
| // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv |
| // |
| // NOTE THIS IS TRANSPOSED BY 90 DEGREES |
| // |
| // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE |
| // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. |
| // |
| // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS |
| // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS |
| // |
| // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL |
| // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER |
| // |
| uint const row_pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE; |
| uint const tile_x = skc_ttck_hi_get_x(ttck_hi); |
| uint const tile_y = skc_ttck_hi_get_y(ttck_hi) ; |
| |
| // element index of this lane's pixel in the tile's first column |
| uint const first = tile_x * SKC_TILE_WIDTH * row_pitch + tile_y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane(); |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint col=0; col<SKC_TILE_WIDTH; ++col) |
| { |
| SKC_RENDER_SURFACE_U8_RGBA pixel = ( 0xFF000000 ); |
| |
| pixel |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[col].r * 255); |
| pixel |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[col].g * 255) << 8; |
| pixel |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[col].b * 255) << 16; |
| |
| // each tile column advances by one pitch |
| surface[first + col * row_pitch] = pixel; |
| |
| // printf("%08v2X\n",pixel); |
| } |
| } |
| |
| #else |
| |
| static |
| void |
| skc_surface_composite_u8_rgba(__write_only image2d_t surface, |
| union skc_tile_color const * SKC_RESTRICT const color, |
| skc_ttck_hi_t const ttck_hi) |
| { |
| // |
| // Image flavor: writes each tile column with SKC_RENDER_SURFACE_WRITE. |
| // |
| // NEW MAJOR OPTIMIZATION: |
| // |
| // Rotating and rasterizing the original world transform by -90 |
| // degrees and then rendering the scene by +90 degrees enables |
| // the final surface composite to be performed in perfectly |
| // coalesced wide transactions. |
| // For this reason, linear access to the framebuffer is preferred. |
| // |
| // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv |
| // NOTE THIS IS TRANSPOSED BY 90 DEGREES |
| // |
| // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE |
| // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. |
| // |
| // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS |
| // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS |
| // |
| // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL |
| // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER |
| // |
| |
| #if 1 |
| int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; |
| int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| { |
| #ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) { \ |
| SKC_RENDER_SURFACE_WRITE(surface, \ |
| (int2)(x,y+I), \ |
| color->iN.rgba[ii] A); \ |
| } |
| |
| #else |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) { \ |
| SKC_RENDER_SURFACE_COLOR const rgba = \ |
| (SKC_RENDER_SURFACE_COLOR) \ |
| (color->aN.rgba[ii].r C, \ |
| color->aN.rgba[ii].g C, \ |
| color->aN.rgba[ii].b C, \ |
| 1.0); \ |
| SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \ |
| } |
| |
| #endif |
| |
| SKC_RENDER_SCANLINE_VECTOR_EXPAND(); |
| |
| x += 1; |
| } |
| #else |
| int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); |
| int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; |
| |
| // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) |
| for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) |
| { |
| #ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) { \ |
| SKC_RENDER_SURFACE_WRITE(surface, \ |
| (int2)(x+I,y+ii), \ |
| color->iN.rgba[ii] A); \ |
| } |
| |
| #else |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,A) { \ |
| SKC_RENDER_SURFACE_COLOR const rgba = \ |
| (SKC_RENDER_SURFACE_COLOR) \ |
| (color->aN.rgba[ii].r C, \ |
| color->aN.rgba[ii].g C, \ |
| color->aN.rgba[ii].b C, \ |
| 1.0); \ |
| SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \ |
| } |
| |
| #endif |
| |
| SKC_RENDER_SCANLINE_VECTOR_EXPAND(); |
| } |
| |
| #endif |
| } |
| |
| #endif |
| |
| // |
| // |
| // |
| static |
| uint |
| skc_ttck_lane(uint const ttck_idx) |
| { |
| // |
| // returns the low bits of a ttck key index selected by |
| // SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1) |
| // |
| // presumably SKC_RENDER_SUBGROUP_SIZE is a power of two so this is |
| // the index's position within a subgroup-sized span of keys |
| // |
| // note: the return type is deliberately unqualified -- a const on a |
| // by-value return has no effect and only draws a compiler warning |
| // |
| return ttck_idx & SKC_RENDER_SUBGROUP_MASK; |
| } |
| |
| // |
| // RENDER KERNEL |
| // |
| |
| // |
| // Renders one tile per subgroup: scatters the tile's ttck keys into |
| // the subgroup's accumulators one layer at a time and then runs the |
| // styling commands that the layer/group hierarchy selects. |
| // |
| // layers - indexed by the layer id taken from a ttck key; supplies |
| // the layer's parent group and first styling command |
| // groups - indexed by group id; supplies parents { depth, base }, |
| // the group's layer range and its enter/leave commands |
| // commands - styling command words; also read for the parent trail |
| // of a group (commands[base + n].parent) |
| // ttck_keys - ttck keys, ttck_count of them |
| // ttck_offsets - index of the first ttck key for each subgroup's tile |
| // tile_count - not read by this kernel body |
| // ttxb_extent - block extent handed to the ttsb/ttpb scatter routines |
| // surface - destination of SKC_STYLING_OPCODE_SURFACE_COMPOSITE |
| // (plus surface_pitch when SKC_SURFACE_IS_BUFFER) |
| // tile_clip - tiles whose x or y lies outside [lo,hi) are skipped |
| // |
| __kernel |
| SKC_RENDER_KERNEL_ATTRIBS |
| void |
| skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers, |
| __global struct skc_group_node const * SKC_RESTRICT const groups, |
| __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename |
| |
| __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys |
| skc_uint const ttck_count, // rename: key_count |
| |
| __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets |
| skc_uint const tile_count, // rename: offset_count |
| |
| __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, |
| #ifdef SKC_SURFACE_IS_BUFFER |
| __global void * SKC_RESTRICT const surface, |
| #else |
| __write_only image2d_t surface, |
| #endif |
| #ifdef SKC_SURFACE_IS_BUFFER |
| skc_uint const surface_pitch, |
| #endif |
| uint4 const tile_clip) // rename: clip |
| { |
| // |
| // Each subgroup is responsible for a tile. No extra subgroups are |
| // launched. |
| // |
| // FIXME -- might be better implemented as a "grid stride loop" if |
| // Intel GEN really has a local memory "quantum" of 4KB which means |
| // we would need to launch 4 subgroups per workgroup. |
| // |
| // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB. |
| // |
| |
| // |
| // declare tile cover and color registers |
| // |
| // this used to be a neat unified struct but the Intel GEN compiler |
| // wasn't cooperating and spilling to private memory even though all |
| // registers were indexed by constants |
| // |
| union skc_tile_color color_wip; |
| union skc_tile_color color_acc; |
| |
| union skc_tile_cover cover_wip; |
| union skc_tile_cover cover_acc; |
| union skc_tile_cover cover_msk; |
| |
| // |
| // which subgroup in the grid is this? |
| // |
| // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0) |
| // as a uniform but the alternative calculation used when there are |
| // multiple subgroups per workgroup is not cooperating and |
| // driving spillage elsewhere. |
| // |
| #if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) |
| skc_uint const ttck_offset_idx = get_group_id(0); |
| #else |
| skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id(); |
| #endif |
| |
| // |
| // load the starting ttck for this offset and get a bound on the max |
| // number of keys that might be loaded |
| // |
| // these are uniform across all subgroup lanes |
| // |
| skc_uint ttck_idx = ttck_offsets[ttck_offset_idx]; |
| |
| // |
| // FIXME -- SIMD/CPU version should probably load a 256-bit (4-wide) |
| // vector of ttck keys |
| // |
| #ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK |
| |
| skc_ttck_t ttck = ttck_keys[ttck_idx]; |
| |
| #else |
| |
| // |
| // each lane loads one key of the subgroup-sized span that holds |
| // ttck_idx -- lanes below the starting lane load the starting key |
| // and the load index is clamped to the last key |
| // |
| uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK; |
| uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK; |
| skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)]; |
| |
| #endif |
| |
| // |
| // set up style group/layer state |
| // |
| struct skc_styling_group { |
| union skc_group_range range; |
| skc_uint depth; |
| skc_uint id; |
| } group; |
| |
| group.range.lo = 0; |
| group.range.hi = SKC_UINT_MAX; |
| group.depth = SKC_UINT_MAX; |
| group.id = SKC_UINT_MAX; |
| |
| // |
| // start with clear tile opacity, knockout and flag bits |
| // |
| // uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 |
| // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 |
| // |
| skc_uint flags = 0; |
| |
| // |
| // declare and initialize accumulators |
| // |
| #if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) |
| __local union skc_subgroup_smem smem[1]; |
| #else |
| __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS]; |
| __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id(); |
| #endif |
| |
| #ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK |
| // |
| // select the initial ttck key |
| // |
| skc_ttck_t ttck; |
| #if 0 |
| ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN |
| #else |
| ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND |
| ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane); |
| #endif |
| |
| #endif |
| |
| // |
| // save the first key so we know what tile we're in |
| // |
| skc_ttck_t ttck0 = ttck; |
| |
| // |
| // evaluate the coarse clip as late as possible |
| // |
| skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi); |
| |
| if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x)) |
| return; |
| |
| skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi); |
| |
| if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y)) |
| return; |
| |
| #if 0 |
| printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y); |
| #endif |
| |
| // |
| // load -> scatter -> flush |
| // |
| while (true) |
| { |
| // if scattering is disabled then just run through ttck keys |
| bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0; |
| |
| // need to clear accumulators before a scatter loop |
| if (is_scatter_enabled) |
| { |
| skc_tile_aa_zero(smem); |
| } |
| |
| do { |
| // skip scattering? |
| if (is_scatter_enabled) |
| { |
| skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo); |
| |
| if (skc_ttck_lo_is_prefix(ttck.lo)) { |
| skc_scatter_ttpb(ttxb_extent,smem,xb_id); |
| } else { |
| skc_scatter_ttsb(ttxb_extent,smem,xb_id); |
| } |
| } |
| |
| // |
| // any ttck keys left? |
| // |
| if (++ttck_idx >= ttck_count) |
| { |
| flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; |
| break; |
| } |
| |
| // |
| // process next ttck key |
| // |
| #ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK |
| // |
| // SIMD -- read next key |
| // |
| ttck = ttck_keys[ttck_idx]; |
| #else |
| // |
| // SIMT -- refresh the ttck_s? |
| // |
| uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK; |
| |
| if (ttck_lane_next == 0) |
| ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)]; |
| |
| // |
| // broadcast next key to entire subgroup |
| // |
| #if 0 |
| ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN |
| #else |
| ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND |
| ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next); |
| #endif |
| #endif |
| // continue scattering if on same YXL layer |
| } while (skc_ttck_equal_yxl(ttck0,ttck)); |
| |
| // finalize if no longer on same YX tile |
| if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi)) |
| { |
| // otherwise, unwind the tile styling and exit |
| flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; |
| } |
| |
| // |
| // given: new layer id from ttxk key |
| // |
| // load [layer id]{ group id, depth } |
| // |
| // if within current group's layer range |
| // |
| // if at same depth |
| // |
| // load and execute cover>[mask>]color>blend commands |
| // |
| // else if not at same depth then move deeper |
| // |
| // for all groups in group trail from cur depth to new depth |
| // enter group, saving and initializing regs as necessary |
| // increment depth and update layer range |
| // load and execute cover>[mask>]color>blend commands |
| // |
| // else not within layer range |
| // |
| // exit current group, restoring regs as necessary |
| // decrement depth and update layer range |
| // |
| // |
| skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi |
| union skc_layer_node const layer_node_new = layers[layer_id_new]; |
| |
| // clear flag that controls group/layer traversal |
| flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE; |
| |
| do { |
| bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0; |
| |
| // |
| // is layer a child of the current parent group? |
| // |
| uint cmd_next = 0; |
| |
| if (!unwind && (layer_node_new.parent == group.id)) |
| { |
| // execute this layer's cmds |
| cmd_next = layer_node_new.cmds; |
| |
| // if this is final then configure so groups get unwound, otherwise we're done |
| flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) |
| ? SKC_TILE_FLAGS_FLUSH_UNWIND |
| : SKC_TILE_FLAGS_FLUSH_COMPLETE); |
| } |
| else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi)) |
| { |
| // |
| // is layer in a child group? |
| // |
| union skc_group_parents const gp = groups[layer_node_new.parent].parents; |
| uint const gn = gp.depth - ++group.depth; |
| |
| if (gn == 0) |
| group.id = layer_node_new.parent; |
| else |
| group.id = commands[gp.base + gn - 1].parent; |
| |
| // update group layer range |
| group.range = groups[group.id].range; |
| |
| // enter current group |
| cmd_next = groups[group.id].cmds.enter; |
| } |
| else // otherwise, exit this group |
| { |
| // leave current group |
| cmd_next = groups[group.id].cmds.leave; |
| |
| // decrement group depth -- wrapping past depth 0 means the outermost group was left |
| if (--group.depth == SKC_UINT_MAX) |
| { |
| flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE; |
| } |
| else |
| { |
| // get path_base of current group |
| uint const gnpb = groups[group.id].parents.base; |
| |
| // get parent of current group |
| group.id = commands[gnpb].parent; |
| |
| // update group layer range |
| group.range = groups[group.id].range; |
| } |
| } |
| |
| // |
| // execute cmds |
| // |
| while (true) |
| { |
| union skc_styling_cmd const cmd = commands[cmd_next++]; |
| |
| switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE) |
| { |
| case SKC_STYLING_OPCODE_NOOP: |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_NONZERO: |
| skc_tile_cover_nonzero(smem,&cover_wip,&color_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_EVENODD: |
| skc_tile_cover_evenodd(smem,&cover_wip,&color_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_ACCUMULATE: |
| skc_tile_cover_accumulate(&cover_acc,&cover_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_MASK: |
| skc_tile_cover_wip_mask(&cover_wip,&cover_msk); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_WIP_ZERO: |
| skc_tile_cover_wip_zero(&cover_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_ACC_ZERO: |
| skc_tile_cover_acc_zero(&cover_acc); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_MASK_ZERO: |
| skc_tile_cover_msk_zero(&cover_msk); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_MASK_ONE: |
| skc_tile_cover_msk_one(&cover_msk); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_MASK_INVERT: |
| skc_tile_cover_msk_invert(&cover_msk); |
| break; |
| |
| case SKC_STYLING_OPCODE_COLOR_FILL_SOLID: |
| skc_tile_color_fill_solid(commands,&cmd_next,&color_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR: |
| // |
| // FIXME -- gradients shouldn't be executing so much |
| // conditional driven code at runtime since we *know* |
| // the gradient style on the host can just create a |
| // new styling command to exploit this. |
| // |
| // FIXME -- it might be time to try using the GPU's |
| // sampler on a linear array of half4 vectors -- it |
| // might outperform the explicit load/lerp routines. |
| // |
| // FIXME -- optimizing for vertical gradients (uhhh, |
| // they're actually horizontal due to the -90 degree |
| // view transform) is nice but is it worthwhile to |
| // have this in the kernel? Easy to add it back... |
| // |
| #if defined( SKC_ARCH_GEN9 ) |
| // disable gradients due to excessive spillage -- fix later |
| cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32); |
| #else |
| skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi); |
| #endif |
| break; |
| |
| case SKC_STYLING_OPCODE_COLOR_WIP_ZERO: |
| skc_tile_color_wip_zero(&color_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_COLOR_ACC_ZERO: |
| skc_tile_color_acc_zero(&color_acc); |
| break; |
| |
| case SKC_STYLING_OPCODE_BLEND_OVER: |
| skc_tile_blend_over(&color_acc,&cover_wip,&color_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_BLEND_PLUS: |
| skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_BLEND_MULTIPLY: |
| skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_BLEND_KNOCKOUT: |
| skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK: |
| skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip); |
| break; |
| |
| case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK: |
| skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc); |
| break; |
| |
| case SKC_STYLING_OPCODE_BACKGROUND_OVER: |
| skc_tile_background_over(commands,&cmd_next,&color_acc); |
| break; |
| |
| case SKC_STYLING_OPCODE_SURFACE_COMPOSITE: |
| #ifdef SKC_SURFACE_IS_BUFFER |
| skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi); |
| #else |
| skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi); |
| #endif |
| break; |
| |
| case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY: |
| if (skc_tile_color_test_opacity(&color_acc)) |
| flags |= SKC_TILE_FLAGS_SCATTER_SKIP; |
| break; |
| |
| default: |
| return; // this is an illegal opcode -- trap and die! |
| } |
| |
| // |
| // if sign bit is set then this was final command |
| // |
| if (cmd.s32 < 0) |
| break; |
| } |
| |
| // continue as long as tile flush isn't complete |
| } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0); |
| |
| // return if was the final flush |
| if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) |
| return; |
| |
| // update wip ttck_hi |
| ttck0 = ttck; |
| } |
| } |
| |
| // |
| // |
| // |