/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "common.h"
#include "raster.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1)
#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1)
//
//
//
#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK
//
//
//
#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
//
//
//
#if ( SKC_PLACE_X == 1 )
#define SKC_PLACE_EXPAND() SKC_EXPAND_1()
#define SKC_PLACE_EXPAND_I_LAST 0
#elif ( SKC_PLACE_X == 2 )
#define SKC_PLACE_EXPAND() SKC_EXPAND_2()
#define SKC_PLACE_EXPAND_I_LAST 1
#elif ( SKC_PLACE_X == 4 )
#define SKC_PLACE_EXPAND() SKC_EXPAND_4()
#define SKC_PLACE_EXPAND_I_LAST 3
#elif ( SKC_PLACE_X == 8 )
#define SKC_PLACE_EXPAND() SKC_EXPAND_8()
#define SKC_PLACE_EXPAND_I_LAST 7
#elif ( SKC_PLACE_X == 16)
#define SKC_PLACE_EXPAND() SKC_EXPAND_16()
#define SKC_PLACE_EXPAND_I_LAST 15
#endif
//
// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
// COALESCED WRITES. LO FIRST, FOLLOWED BY HI.
//
// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
// KERNELS USE DIFFERENT SUBGROUP SIZES.
//
// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
//
// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
// ONLY SUPPORT A SUBGROUP SIZE OF 16.
//
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
#define SKC_PLACE_STRIDE_H(L) (L)
#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1)
#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_H(L) (L)
#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
#endif
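//
// For example (illustrative, with equal prefix/place subgroup sizes):
// the prefix kernel writes key I of a block as a subgroup-wide row of
// lo words followed by a row of hi words, so the place kernel reads
// key I's lo word at lane + (I * 2    ) * SKC_PLACE_SUBGROUP_SIZE and
// its hi word at     lane + (I * 2 + 1) * SKC_PLACE_SUBGROUP_SIZE.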
//
// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
//
#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
#define SKC_PLACE_IS_HEADER_ROW_KEY(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
//
// Note: HEADER_LESS_THAN relies on unsigned wraparound -- an index
// that still falls inside the raster header underflows toward
// UINT_MAX and therefore fails the comparison
//
#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))
//
// TTSK v2:
//
//  0                                         63
//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
//  +---------+--------+---------+-----+-----+
//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |
//
//
// TTPK v2:
//
//  0                                        63
//  | TTPB ID | PREFIX |  SPAN  |  X  |  Y  |
//  +---------+--------+--------+-----+-----+
//  |    27   | 1 (=1) |   12   | 12  | 12  |
//
//
//
// TTCK (32-BIT COMPARE) v1:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   18  |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   15  |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          27          |    1   |    1   |   18  |  9  |  8  |
//
union skc_subgroup_smem
{
skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // per-lane scratchpad used by skc_scatter_scan_max()
struct {
struct {
skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
} lo;
struct {
skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
} hi;
// skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
};
};
//
// scatter scan max
//
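// Worked example (illustrative, 8-lane subgroup): if lanes 0..2 hold
// spans { 3, 2, 4 } then iss = { 3, 5, 9, 9, ... } and
// ess = { 0, 3, 5, 9, ... }. Lanes 0..2 scatter their lane ids to
// scratch[0], scratch[3] and scratch[5], and the inclusive max scan
// fills the gaps:
//
//   scratch : { 0, 0, 0, 1, 0, 2, 0, 0 }
//   source  : { 0, 0, 0, 1, 1, 2, 2, 2 }
//
// so each output lane knows which source key's span it falls in.
//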
static
skc_int_v_t
skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
skc_int_v_t const iss,
skc_int_v_t const ess)
{
//
// prefix sums determine which lanes we're going to work on next
//
skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
skc_int_v_t const scratch_idx = max(ess,0);
//
// SIMT
//
//
// zero the volatile smem scratchpad -- one dword per lane
//
smem->scratch[get_sub_group_local_id()] = ( 0 );
//
// store source lane at starting lane
//
if (is_scratch_store) {
smem->scratch[scratch_idx] = get_sub_group_local_id();
}
//
// propagate lanes to right using max scan
//
skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
skc_int_v_t const source = sub_group_scan_inclusive_max(scratch);
return source;
}
//
//
//
static
skc_bool
skc_xk_clip(union skc_tile_clip const * const tile_clip,
skc_ttxk_t * const xk)
{
//
// clip the sk and pk keys
//
// if fully clipped then return false
//
// alternatively -- we can expand all these keys in place
//
// alternatively -- keep sk and pk keys segregated because sk
// represents the vast majority of keys and are easier to process.
// don't mess with the fastpath!
//
return false;
}
//
//
//
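// The conversions below assemble a ttck key from a ttsk/ttpk key and
// a place command: the block id stays in the low bits, the layer id
// is split across the lo and hi words, and the key's x/y coordinates
// are translated by the command's tx/ty before being packed into the
// hi word.
//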
static
skc_ttck_t
skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
union skc_cmd_place const * const cmd,
skc_uint const sk_idx)
{
skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
skc_uint const hi = smem->hi.sk[sk_idx];
skc_ttck_t ck;
ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
// FIXME -- x and y should already be clipped and shifted
skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
return ck;
}
//
//
//
static
skc_ttck_t
skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
union skc_cmd_place const * const cmd,
skc_uint const pk_idx,
skc_uint const dx)
{
skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
skc_uint const hi = smem->hi.pk[pk_idx];
skc_ttck_t ck;
ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
// FIXME -- x and y should already be clipped and shifted
skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
return ck;
}
//
//
//
static
void
skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
__global skc_ttck_t * const ck_extent,
__local union skc_subgroup_smem volatile * const smem,
union skc_cmd_place const * const cmd,
skc_uint const sk)
{
//
// Note: an sk count of 0 is assumed to be impossible here, so there
// is no early-out (unlike skc_ttpk_flush)
//
skc_uint ck_base = 0;
// first lane reserves space in the ck extent with an atomic add
if (get_sub_group_local_id() == 0) {
ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
}
// broadcast base to all lanes
ck_base = sub_group_broadcast(ck_base,0);
// convert sk keys to ck keys
for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
{
ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
}
}
//
//
//
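// reconstruct a ttpk key's tile span, which straddles the lo and hi
// words of the key (the stored span is biased by -1, hence the +1)
//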
static
skc_int
skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
skc_uint const idx)
{
skc_uint const lo = smem->lo.pk[idx];
skc_uint const hi = smem->hi.pk[idx];
skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
return (span_lo | span_hi) + 1;
}
//
//
//
static
void
skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
__global skc_ttck_t * const ck_extent,
__local union skc_subgroup_smem volatile * const smem,
union skc_cmd_place const * const cmd,
skc_uint const pk)
{
// bail out if pk queue is empty
if (pk == 0)
return;
#if 0
if (get_sub_group_local_id() == 0)
printf("%u\n",pk);
#endif
//
// FIXME -- this nested loop iterates over the queue, processing a
// subgroup of 64-bit keys at a time. This is probably not the most
// efficient approach, so investigate storing and iterating over a
// wider-than-subgroup (node-sized) queue of keys.
//
// round up so we work with full subgroups
skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
skc_uint ii = 0;
// nested loop that expands all ttpk keys
#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
#endif
{
skc_uint idx = ii + get_sub_group_local_id();
skc_int span = 0;
// how many tiles does this ttpk span?
if (idx < pk)
span = skc_ttpk_get_span(smem,idx);
// we need inclusive, exclusive and total
skc_int iss = sub_group_scan_inclusive_add(span);
skc_int ess = iss - span;
skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
// printf("%u : %u\n",span,iss);
// continue;
// atomically allocate space for the pk keys
skc_uint ck_base = 0;
// first lane reserves space in the ck extent with an atomic add
if (get_sub_group_local_id() == 0) {
ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
}
// broadcast atomically allocated extent base to all lanes
skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
//
// FIXME -- this loop would probably be faster if the ttpk keys
// were held in registers and accessed with shuffles instead of
// SMEM loads
//
//
// loop until there are no more expanded pk keys
//
while (true)
{
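// each lane determines which pk key in smem it expands (source) and
// its tile offset within that key's span (dx)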
skc_int const source = skc_scatter_scan_max(smem,iss,ess);
skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
// store valid ck keys to gmem
if (get_sub_group_local_id() < rem) {
ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
}
// decrement remainder
rem -= SKC_PLACE_SUBGROUP_SIZE;
if (rem <= 0)
break;
// increment/decrement indices
ck_idx += SKC_PLACE_SUBGROUP_SIZE;
iss -= SKC_PLACE_SUBGROUP_SIZE;
ess -= SKC_PLACE_SUBGROUP_SIZE;
}
}
}
//
//
//
static
skc_uint
skc_ballot(skc_uint * const xk, skc_uint const is_xk)
{
#if 0
//
// FIXME -- when available, this should use the idiom:
//
// ballot() + lane_mask_less_than_or_equal + popcount()
//
// Supported by:
//
// - Vulkan 1.1 / SPIR-V 1.3
// - CUDA
// - AVX2 (SSE*?)
//
#else
//
// otherwise, emulate with an inclusive scan (yuk)
//
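// Example (illustrative, 8-lane subgroup): if is_xk across the
// subgroup is { 1,0,1,1,0,0,1,0 } and *xk is 4 on entry, then
// prefix = { 1,1,2,3,3,3,4,4 }, the active lanes receive xk_idx
// { 4,-,5,6,-,-,7,- } and *xk is advanced to 8.
//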
skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
skc_uint const xk_idx = *xk + prefix - is_xk;
*xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
#if 0
printf("< %3u >\n",xk_idx);
#endif
return xk_idx;
#endif
}
//
//
//
__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t * const bp_elems,
__global SKC_ATOMIC_UINT volatile * const place_atomics,
__global skc_ttck_t * const ck_extent,
__global union skc_cmd_place const * const cmds,
__global skc_block_id_t * const map,
skc_uint4 const clip,
skc_uint const count)
{
//
// declare shared memory block
//
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
__local union skc_subgroup_smem volatile smem[1];
#else
__local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
__local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif
//
// This is a subgroup-centric kernel
//
// Which subgroup in the grid is this?
//
// TAKE NOTE: the Intel GEN compiler appears to recognize
// get_group_id(0) as a uniform, but the alternative calculation used
// when there are multiple subgroups per workgroup is not being
// recognized as uniform and is driving register spillage elsewhere.
//
// Test the raster's translated bounds against the composition's
// tile clip
//
// There are 3 cases:
//
// - the raster is completely clipped -> return
// - the raster is partially clipped -> all keys must be clipped
// - the raster is not clipped -> no keys are tested
//
//
// There are at least 4 implementations of place and we want to
// special-case them as much as possible so that, at the least, the
// fastpath remains fast.
//
// - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
//
// - implement CLIPPED + NO TRANSLATION path
//
// - implement NO CLIP + TRANSLATION path
//
// - implement CLIPPED + TRANSLATION path
//
//
// FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
// 12:12:8 integer where:
//
// 12: ttsk
// 12: ttpk
// 8: /dev/null -- clipped or invalid key
//
// Three kinds of nodes in a raster's list:
//
// - the head node
// - an internal node
// - the final node
//
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
skc_uint const cmd_idx = get_group_id(0);
#else
skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif
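// each subgroup processes one place command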
// load command
union skc_cmd_place const cmd = cmds[cmd_idx];
// get the raster header from the raster host id -- scalar
skc_block_id_t id = map[cmd.raster_h];
//
// load all of the head block ttxk keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_raster_node_elem const h##I = { \
.u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \
bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \
};
SKC_PLACE_EXPAND();
//
// load raster header counts -- we only need the "nodes" and "keys"
// words but the keys we loaded are doublewords.
//
// FIXME -- this can be made portable with compile-time macro expansion
//
skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
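// nodes counts the trailing nodes in this raster's block list; keys
// counts the ttxk keys spread across the head and those nodes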
//
//
//
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
nodes,keys, \
I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
h##I.u32v2.hi,h##I.u32v2.lo, \
h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
SKC_PLACE_EXPAND();
#endif
//
#if 0
if (get_sub_group_local_id() == 0) {
printf("place: %u / %u / %u\n",head_id,nodes,keys);
}
#endif
{
//
// classify every key in the header
//
// keys: 0 is not a key / 1 is a key
// skpk: 0 is sk / 1 is pk
//
skc_uint bits_keys = 0;
skc_uint bits_skpk = 0;
//
// calculate bits_keys
//
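// a slot is a live key if its index relative to the end of the
// header is below the key count; when keys spill past this block the
// trailing row's last dword holds the next-node pointer, so that
// lane clears its bit
//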
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
if (idx < keys) { \
bits_keys |= (1u << I); \
} \
if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \
if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
bits_keys &= ~(1u << I); \
} \
} \
} \
}
SKC_PLACE_EXPAND();
//
// blindly calculate bits_skpk
//
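// each key's prefix bit is shifted down to bit I of bits_skpk for
// this lane
//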
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
}
SKC_PLACE_EXPAND();
#if 0
printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif
//
// next pointer is last element of last row. save it now because
// this might be recognized as a subgroup-uniform/scalar.
//
id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
//
// append SK keys first
//
skc_uint const bits_sk = bits_keys & ~bits_skpk;
skc_uint sk = 0;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
skc_uint is_sk = (bits_sk >> I) & 1; \
skc_uint sk_idx = skc_ballot(&sk,is_sk); \
if (is_sk) { \
smem->lo.sk[sk_idx] = h##I.xk.lo; \
smem->hi.sk[sk_idx] = h##I.xk.hi; \
} \
}
SKC_PLACE_EXPAND();
//
// append PK keys next
//
skc_uint const bits_pk = bits_keys & bits_skpk;
skc_uint pk = 0;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
skc_uint is_pk = (bits_pk >> I) & 1; \
skc_uint pk_idx = skc_ballot(&pk,is_pk); \
if (is_pk) { \
smem->lo.pk[pk_idx] = h##I.xk.lo; \
smem->hi.pk[pk_idx] = h##I.xk.hi; \
} \
}
SKC_PLACE_EXPAND();
#if 0
printf("%2u * %2u\n",sk,pk);
#endif
//
// flush the keys
//
skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
}
//
// we're done if there was only a head node
//
if (nodes == 0)
return;
//
// subtract the keys consumed by the head block
//
keys -= SKC_RASTER_HEAD_COUNT_KEYS;
//
// otherwise, append keys in trailing nodes to smem
//
while (true)
{
//
// load all of the node block ttxk keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_raster_node_elem const n##I = { \
.u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \
bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \
};
SKC_PLACE_EXPAND();
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
nodes,keys, \
I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
n##I.u32v2.hi,n##I.u32v2.lo, \
n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
SKC_PLACE_EXPAND();
#endif
//
// classify every key in the node
//
// keys: 0 is not a key / 1 is a key
// skpk: 0 is sk / 1 is pk
//
skc_uint bits_keys = 0;
skc_uint bits_skpk = 0;
//
// calculate bits_keys
//
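// same as the header pass but with no header offset to subtract;
// when keys spill past this node its last dword again holds the
// next-node pointer
//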
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
if (idx < keys) { \
bits_keys |= (1u << I); \
} \
if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \
if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
bits_keys &= ~(1u << I); \
} \
} \
} \
}
SKC_PLACE_EXPAND();
//
// blindly calculate bits_skpk
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
}
SKC_PLACE_EXPAND();
#if 0
printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif
//
// next pointer is last element of last row. save it now because
// this might be recognized as a subgroup-uniform/scalar.
//
id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
//
// append SK keys first
//
skc_uint const bits_sk = bits_keys & ~bits_skpk;
skc_uint sk = 0;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint is_sk = (bits_sk >> I) & 1; \
skc_uint sk_idx = skc_ballot(&sk,is_sk); \
if (is_sk) { \
smem->lo.sk[sk_idx] = n##I.xk.lo; \
smem->hi.sk[sk_idx] = n##I.xk.hi; \
} \
}
SKC_PLACE_EXPAND();
//
// append PK keys next
//
skc_uint const bits_pk = bits_keys & bits_skpk;
skc_uint pk = 0;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint is_pk = (bits_pk >> I) & 1; \
skc_uint pk_idx = skc_ballot(&pk,is_pk); \
if (is_pk) { \
smem->lo.pk[pk_idx] = n##I.xk.lo; \
smem->hi.pk[pk_idx] = n##I.xk.hi; \
} \
}
SKC_PLACE_EXPAND();
#if 0
printf("%2u * %2u\n",sk,pk);
#endif
//
// flush the sk and pk keys appended from this node to the extent
// (deferring the flush until a queue hits a highwater mark is a
// possible optimization -- right now both are flushed every node)
//
skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
//
// if this was the last node then we're done
//
if (--nodes == 0)
return;
//
// otherwise subtract the keys consumed by this node
//
keys -= SKC_RASTER_NODE_COUNT_KEYS;
}
}
//
//
//