| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include "tile.h" |
| #include "common.h" |
| #include "raster.h" |
| #include "atomic_cl.h" |
| #include "kernel_cl_12.h" |
| |
| // |
| // |
| // |
| |
| #define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1) |
| #define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1) |
| |
| // |
| // |
| // |
| |
| #define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE) |
| #define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK |
| |
| // |
| // |
| // |
| |
| #define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE) |
| |
| // |
| // |
| // |
| |
| #if ( SKC_PLACE_X == 1 ) |
| #define SKC_PLACE_EXPAND() SKC_EXPAND_1() |
| #define SKC_PLACE_EXPAND_I_LAST 0 |
| |
| #elif ( SKC_PLACE_X == 2 ) |
| #define SKC_PLACE_EXPAND() SKC_EXPAND_2() |
| #define SKC_PLACE_EXPAND_I_LAST 1 |
| |
| #elif ( SKC_PLACE_X == 4 ) |
| #define SKC_PLACE_EXPAND() SKC_EXPAND_4() |
| #define SKC_PLACE_EXPAND_I_LAST 3 |
| |
| #elif ( SKC_PLACE_X == 8 ) |
| #define SKC_PLACE_EXPAND() SKC_EXPAND_8() |
| #define SKC_PLACE_EXPAND_I_LAST 7 |
| |
| #elif ( SKC_PLACE_X == 16) |
| #define SKC_PLACE_EXPAND() SKC_EXPAND_16() |
| #define SKC_PLACE_EXPAND_I_LAST 15 |
#else
#error "SKC_PLACE_X must be 1, 2, 4, 8 or 16"
#endif
| |
| // |
| // PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE |
| // COALESCED WRITES. LO FIRST, FOLLOWED BY HI. |
| // |
| // THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE |
| // KERNELS USE DIFFERENT SUBGROUP SIZES. |
| // |
| // THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE |
| // LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID. |
| // |
| // NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER |
| // OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY |
| // ONLY SUPPORT A SUBGROUP SIZE OF 16. |
| // |
| |
| #if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE ) |
| |
| #define SKC_PLACE_STRIDE_H(L) (L) |
| #define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) |
| #define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) |
| |
| #elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 |
| |
| #define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE) |
| #define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1) |
| #define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK)) |
| |
| #define SKC_PLACE_STRIDE_H(L) (L) |
| #define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE) |
| #define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE) |
| |
| #elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 |
| |
| #define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) |
| #define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask |
| |
| #define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK)) |
| #define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) |
| #define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO) |
| |
| #endif |
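
//
// worked example of the strides above (a sketch -- the sizes are
// illustrative, not configuration constants): assume PREFIX ran with
// a subgroup size of 16 and PLACE runs with a subgroup size of 8, so
// SKC_PLACE_SUBGROUP_RATIO is 2. PREFIX wrote each row of 16 keys as
// 16 lo words followed by 16 hi words, so PLACE lane L reading its
// row I finds:
//
//   lo dword: SKC_PLACE_STRIDE_V_LO(I) + L = (I/2)*32 + (I&1)*8 + L
//   hi dword: lo dword + 16
//
// i.e. two consecutive PLACE rows split one PREFIX row's span of lo
// words and each hi word sits one PREFIX subgroup (16 dwords) past
// its lo word.
//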
| |
| // |
| // A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE |
| // IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8) |
| // |
| |
| #define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS) |
| |
| #define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS) |
| |
| #define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS) |
| |
#define SKC_PLACE_IS_HEADER_ROW_KEY(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
| |
| |
| // |
// Note: the subtraction in HEADER_LESS_THAN purposefully wraps the
// unsigned index toward UINT_MAX on header lanes so they always fail
// the comparison
| // |
| #define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) |
| #define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k)) |
| |
| // |
| // TTSK v2: |
| // |
| // 0 63 |
| // | TTSB ID | PREFIX | SPAN | X | Y | |
| // +---------+--------+---------+-----+-----+ |
| // | 27 | 1 (=0) | 12 (=0) | 12 | 12 | |
| // |
| // |
| // TTPK v2: |
| // |
| // 0 63 |
| // | TTPB ID | PREFIX | SPAN | X | Y | |
| // +---------+--------+------+-----+-----+ |
| // | 27 | 1 (=1) | 12 | 12 | 12 | |
| // |
| // |
| |
| // |
| // TTCK (32-BIT COMPARE) v1: |
| // |
| // 0 63 |
| // | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | |
| // +----------------------+--------+--------+-------+-----+-----+ |
| // | 30 | 1 | 1 | 18 | 7 | 7 | |
| // |
| // |
| // TTCK (32-BIT COMPARE) v2: |
| // |
| // 0 63 |
| // | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | |
| // +----------------------+--------+--------+-------+-----+-----+ |
| // | 30 | 1 | 1 | 15 | 9 | 8 | |
| // |
| // |
| // TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: |
| // |
| // 0 63 |
| // | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | |
| // +----------------------+--------+--------+-------+-----+-----+ |
| // | 27 | 1 | 1 | 18 | 9 | 8 | |
| // |
| |
| union skc_subgroup_smem |
| { |
  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // scatter-scan max scratchpad
| |
| struct { |
| struct { |
| skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; |
| skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; |
| } lo; |
| |
| struct { |
| skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; |
| skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; |
| } hi; |
| |
| // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK]; |
| }; |
| |
| }; |
| |
| // |
| // scatter scan max |
| // |
| static |
| skc_int_v_t |
| skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem, |
| skc_int_v_t const iss, |
| skc_int_v_t const ess) |
| { |
| // |
| // prefix sums determine which lanes we're going to work on next |
| // |
| skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE); |
| skc_int_v_t const scratch_idx = max(ess,0); |
| |
| // |
| // SIMT |
| // |
| |
| // |
  // zero the volatile smem scratchpad
| // |
| smem->scratch[get_sub_group_local_id()] = ( 0 ); |
| |
| // |
| // store source lane at starting lane |
| // |
| if (is_scratch_store) { |
| smem->scratch[scratch_idx] = get_sub_group_local_id(); |
| } |
| |
| // |
| // propagate lanes to right using max scan |
| // |
| skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()]; |
| skc_int_v_t const source = sub_group_scan_inclusive_max(scratch); |
| |
| return source; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| skc_bool |
| skc_xk_clip(union skc_tile_clip const * const tile_clip, |
| skc_ttxk_t * const xk) |
| { |
| // |
| // clip the sk and pk keys |
| // |
| // if fully clipped then return false |
| // |
| // alternatively -- we can expand all these keys in place |
| // |
| // alternatively -- keep sk and pk keys segregated because sk |
| // represents the vast majority of keys and are easier to process. |
| // don't mess with the fastpath! |
| // |
| return false; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| skc_ttck_t |
| skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem, |
| union skc_cmd_place const * const cmd, |
| skc_uint const sk_idx) |
| { |
| skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0 |
| skc_uint const hi = smem->hi.sk[sk_idx]; |
| |
| skc_ttck_t ck; |
| |
| ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id |
| |
| // FIXME -- x and y should already be clipped and shifted |
| skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; |
| skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; |
| |
| ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; |
| |
| return ck; |
| } |
| |
| static |
| skc_ttck_t |
| skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem, |
| union skc_cmd_place const * const cmd, |
| skc_uint const pk_idx, |
| skc_uint const dx) |
| { |
| skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1 |
| skc_uint const hi = smem->hi.pk[pk_idx]; |
| |
| skc_ttck_t ck; |
| |
| ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id |
| |
| // FIXME -- x and y should already be clipped and shifted |
| skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; |
| skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; |
| |
| ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; |
| |
| return ck; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, |
| __global skc_ttck_t * const ck_extent, |
| __local union skc_subgroup_smem volatile * const smem, |
| union skc_cmd_place const * const cmd, |
| skc_uint const sk) |
| { |
| // |
  // an sk count of zero should never occur here
| // |
| skc_uint ck_base = 0; |
| |
  // first lane atomically allocates space in the ck extent
| if (get_sub_group_local_id() == 0) { |
| ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk); |
| } |
| |
| // broadcast base to all lanes |
| ck_base = sub_group_broadcast(ck_base,0); |
| |
| // convert sk keys to ck keys |
| for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE) |
| { |
| ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| skc_int |
| skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem, |
| skc_uint const idx) |
| { |
| skc_uint const lo = smem->lo.pk[idx]; |
| skc_uint const hi = smem->hi.pk[idx]; |
| |
| skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN; |
| skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN; |
| |
| return (span_lo | span_hi) + 1; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, |
| __global skc_ttck_t * const ck_extent, |
| __local union skc_subgroup_smem volatile * const smem, |
| union skc_cmd_place const * const cmd, |
| skc_uint const pk) |
| { |
| // bail out if pk queue is empty |
| if (pk == 0) |
| return; |
| |
| #if 0 |
| if (get_sub_group_local_id() == 0) |
| printf("%u\n",pk); |
| #endif |
| |
| // |
  // FIXME -- this nested loop iterates over the queue, processing a
  // subgroup of 64-bit keys at a time. This is probably not the most
  // efficient approach, so investigate storing and iterating over a
  // wider-than-subgroup (node-sized) queue of keys.
| // |
| |
| // round up so we work with full subgroups |
| skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK; |
| skc_uint ii = 0; |
| |
| // nested loop that expands all ttpk keys |
| #if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE) |
| for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE) |
| #endif |
| { |
| skc_uint idx = ii + get_sub_group_local_id(); |
| skc_int span = 0; |
| |
| // how many tiles does this ttpk span? |
| if (idx < pk) |
| span = skc_ttpk_get_span(smem,idx); |
| |
| // we need inclusive, exclusive and total |
| skc_int iss = sub_group_scan_inclusive_add(span); |
| skc_int ess = iss - span; |
| skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1); |
| |
| // printf("%u : %u\n",span,iss); |
| // continue; |
| |
| // atomically allocate space for the pk keys |
| skc_uint ck_base = 0; |
| |
    // first lane atomically allocates space in the ck extent
| if (get_sub_group_local_id() == 0) { |
| ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem); |
| } |
| |
| // broadcast atomically allocated extent base to all lanes |
| skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id(); |
| |
| // |
| // FIXME -- this loop would probably be faster if the ttpk keys |
| // were held in registers and accessed with shuffles instead of |
| // SMEM loads |
| // |
| |
| // |
| // loop until there are no more expanded pk keys |
| // |
| while (true) |
| { |
| skc_int const source = skc_scatter_scan_max(smem,iss,ess); |
| skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source); |
| |
| // store valid ck keys to gmem |
| if (get_sub_group_local_id() < rem) { |
| ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx); |
| } |
| |
| // decrement remainder |
| rem -= SKC_PLACE_SUBGROUP_SIZE; |
| |
| if (rem <= 0) |
| break; |
| |
| // increment/decrement indices |
| ck_idx += SKC_PLACE_SUBGROUP_SIZE; |
| iss -= SKC_PLACE_SUBGROUP_SIZE; |
| ess -= SKC_PLACE_SUBGROUP_SIZE; |
| } |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| skc_uint |
| skc_ballot(skc_uint * const xk, skc_uint const is_xk) |
| { |
| #if 0 |
| // |
| // FIXME -- when available, this should use the idiom: |
| // |
| // ballot() + lane_mask_less_than_or_equal + popcount() |
| // |
| // Supported by: |
| // |
| // - Vulkan 1.1 / SPIR-V 1.3 |
| // - CUDA |
| // - AVX2 (SSE*?) |
| // |
| #else |
| // |
| // otherwise, emulate with an inclusive scan (yuk) |
| // |
| skc_uint const prefix = sub_group_scan_inclusive_add(is_xk); |
| |
| skc_uint const xk_idx = *xk + prefix - is_xk; |
| |
| *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST); |
| |
| #if 0 |
| printf("< %3u >\n",xk_idx); |
| #endif |
| |
| return xk_idx; |
| #endif |
| } |
| |
| // |
| // |
| // |
| __kernel |
| SKC_PLACE_KERNEL_ATTRIBS |
| void |
| skc_kernel_place(__global skc_bp_elem_t * const bp_elems, |
| __global SKC_ATOMIC_UINT volatile * const place_atomics, |
| __global skc_ttck_t * const ck_extent, |
| __global union skc_cmd_place const * const cmds, |
| __global skc_block_id_t * const map, |
| skc_uint4 const clip, |
| skc_uint const count) |
| { |
| // |
| // declare shared memory block |
| // |
| #if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) |
| __local union skc_subgroup_smem volatile smem[1]; |
| #else |
| __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS]; |
| __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); |
| #endif |
| |
| // |
| // This is a subgroup-centric kernel |
| // |
| // Which subgroup in the grid is this? |
| // |
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform, but the alternative calculation
  // used when there are multiple subgroups per workgroup is not
  // recognized as uniform and drives register spillage elsewhere.
| // |
| // Test the raster's translated bounds against the composition's |
| // tile clip |
| // |
| // There are 3 cases: |
| // |
| // - the raster is completely clipped -> return |
  // - the raster is partially clipped -> all keys must be clipped
| // - the raster is not clipped -> no keys are tested |
| // |
| // |
| // There are at least 4 implementations of place and we want to |
| // special-case them as much as possible so that, at the least, the |
| // fastpath remains fast. |
| // |
| // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP |
| // |
| // - implement CLIPPED + NO TRANSLATION path |
| // |
| // - implement NO CLIP + TRANSLATION path |
| // |
| // - implement CLIPPED + TRANSLATION path |
| // |
| // |
| // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin |
| // 12:12:8 integer where: |
| // |
| // 12: ttsk |
| // 12: ttpk |
| // 8: /dev/null -- clipped or invalid key |
| // |
| // Three kinds of nodes in a raster's list: |
| // |
| // - the head node |
| // - an internal node |
| // - the final node |
| // |
| |
| #if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) |
| skc_uint const cmd_idx = get_group_id(0); |
| #else |
| skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id(); |
| #endif |
| |
| // load command |
| union skc_cmd_place const cmd = cmds[cmd_idx]; |
| |
| // get the raster header from the raster host id -- scalar |
| skc_block_id_t id = map[cmd.raster_h]; |
| |
| // |
| // load all of the head block ttxk keys into registers |
| // |
| // FIXME -- this pattern lends itself to using the higher |
| // performance Intel GEN block load instructions |
| // |
| skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| union skc_raster_node_elem const h##I = { \ |
| .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \ |
| bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \ |
| }; |
| |
| SKC_PLACE_EXPAND(); |
| |
| // |
  // load the raster header counts -- we only need the "nodes" and
  // "keys" words, which sit in the lo and hi halves of the header's
  // second doubleword (hence the lane-1 broadcasts below)
| // |
| // FIXME -- this can be made portable with compile-time macro expansion |
| // |
| skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES |
| skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS |
| |
| // |
| // |
| // |
| #if 0 |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ |
| nodes,keys, \ |
| I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ |
| h##I.u32v2.hi,h##I.u32v2.lo, \ |
| h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); |
| |
| SKC_PLACE_EXPAND(); |
| #endif |
| |
| // |
| #if 0 |
| if (get_sub_group_local_id() == 0) { |
| printf("place: %u / %u / %u\n",head_id,nodes,keys); |
| } |
| #endif |
| |
| { |
| // |
| // classify every key in the header |
| // |
| // keys: 0 is not a key / 1 is a key |
| // skpk: 0 is sk / 1 is pk |
| // |
| skc_uint bits_keys = 0; |
| skc_uint bits_skpk = 0; |
| |
| // |
| // calculate bits_keys |
| // |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ |
| skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \ |
| if (idx < keys) { \ |
| bits_keys |= (1u << I); \ |
| } \ |
| if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ |
| if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \ |
| if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ |
| bits_keys &= ~(1u << I); \ |
| } \ |
| } \ |
| } \ |
| } |
| |
| SKC_PLACE_EXPAND(); |
| |
| // |
| // blindly calculate bits_skpk |
| // |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ |
| bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ |
| } |
| |
| SKC_PLACE_EXPAND(); |
| |
| #if 0 |
| printf("%2X : %2X\n",bits_keys,bits_skpk); |
| #endif |
| |
| // |
    // the next-node pointer is the last element of the last row. save
    // it now because the compiler may recognize it as a
    // subgroup-uniform/scalar.
| // |
| id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); |
| |
| // |
| // append SK keys first |
| // |
| skc_uint const bits_sk = bits_keys & ~bits_skpk; |
| skc_uint sk = 0; |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ |
| skc_uint is_sk = (bits_sk >> I) & 1; \ |
| skc_uint sk_idx = skc_ballot(&sk,is_sk); \ |
| if (is_sk) { \ |
| smem->lo.sk[sk_idx] = h##I.xk.lo; \ |
| smem->hi.sk[sk_idx] = h##I.xk.hi; \ |
| } \ |
| } |
| |
| SKC_PLACE_EXPAND(); |
| |
| // |
| // append PK keys next |
| // |
| skc_uint const bits_pk = bits_keys & bits_skpk; |
| skc_uint pk = 0; |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ |
| skc_uint is_pk = (bits_pk >> I) & 1; \ |
| skc_uint pk_idx = skc_ballot(&pk,is_pk); \ |
| if (is_pk) { \ |
| smem->lo.pk[pk_idx] = h##I.xk.lo; \ |
| smem->hi.pk[pk_idx] = h##I.xk.hi; \ |
| } \ |
| } |
| |
| SKC_PLACE_EXPAND(); |
| |
| #if 0 |
| printf("%2u * %2u\n",sk,pk); |
| #endif |
| // |
| // flush the keys |
| // |
| skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); |
| skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); |
| } |
| |
| // |
| // we're done if there was only a head node |
| // |
| if (nodes == 0) |
| return; |
| |
| // |
| // decrement keys |
| // |
| keys -= SKC_RASTER_HEAD_COUNT_KEYS; |
| |
| // |
| // otherwise, append keys in trailing nodes to smem |
| // |
| while (true) |
| { |
| // |
| // load all of the node block ttxk keys into registers |
| // |
| // FIXME -- this pattern lends itself to using the higher |
| // performance Intel GEN block load instructions |
| // |
| skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| union skc_raster_node_elem const n##I = { \ |
| .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \ |
| bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \ |
| }; |
| |
| SKC_PLACE_EXPAND(); |
| |
| #if 0 |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \ |
| nodes,keys, \ |
| I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ |
| n##I.u32v2.hi,n##I.u32v2.lo, \ |
| n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); |
| |
| SKC_PLACE_EXPAND(); |
| #endif |
| |
| // |
    // classify every key in the node
| // |
| // keys: 0 is not a key / 1 is a key |
| // skpk: 0 is sk / 1 is pk |
| // |
| skc_uint bits_keys = 0; |
| skc_uint bits_skpk = 0; |
| |
| // |
| // calculate bits_keys |
| // |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) { \ |
| skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \ |
| if (idx < keys) { \ |
| bits_keys |= (1u << I); \ |
| } \ |
| if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ |
| if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \ |
| if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ |
| bits_keys &= ~(1u << I); \ |
| } \ |
| } \ |
| } \ |
| } |
| |
| SKC_PLACE_EXPAND(); |
| |
| // |
| // blindly calculate bits_skpk |
| // |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) { \ |
| bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ |
| } |
| |
| SKC_PLACE_EXPAND(); |
| |
| #if 0 |
| printf("%2X : %2X\n",bits_keys,bits_skpk); |
| #endif |
| |
| // |
    // the next-node pointer is the last element of the last row. save
    // it now because the compiler may recognize it as a
    // subgroup-uniform/scalar.
| // |
| id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); |
| |
| // |
| // append SK keys first |
| // |
| skc_uint const bits_sk = bits_keys & ~bits_skpk; |
| skc_uint sk = 0; |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) { \ |
| skc_uint is_sk = (bits_sk >> I) & 1; \ |
| skc_uint sk_idx = skc_ballot(&sk,is_sk); \ |
| if (is_sk) { \ |
| smem->lo.sk[sk_idx] = n##I.xk.lo; \ |
| smem->hi.sk[sk_idx] = n##I.xk.hi; \ |
| } \ |
| } |
| |
| SKC_PLACE_EXPAND(); |
| |
| // |
| // append PK keys next |
| // |
| skc_uint const bits_pk = bits_keys & bits_skpk; |
| skc_uint pk = 0; |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) { \ |
| skc_uint is_pk = (bits_pk >> I) & 1; \ |
| skc_uint pk_idx = skc_ballot(&pk,is_pk); \ |
| if (is_pk) { \ |
| smem->lo.pk[pk_idx] = n##I.xk.lo; \ |
| smem->hi.pk[pk_idx] = n##I.xk.hi; \ |
| } \ |
| } |
| |
| SKC_PLACE_EXPAND(); |
| |
| #if 0 |
| printf("%2u * %2u\n",sk,pk); |
| #endif |
| // |
    // flush this node's sk and pk keys to the extent -- there is
    // currently no highwater-mark batching across nodes
| // |
| skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); |
| skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); |
| |
| // |
| // if this was the last node then we're done |
| // |
| if (--nodes == 0) |
| return; |
| |
| // |
| // otherwise decrement keys |
| // |
| keys -= SKC_RASTER_NODE_COUNT_KEYS; |
| } |
| } |
| |
| // |
| // |
| // |