/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "block.h"
#include "raster.h"
#include "atomic_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"
//
// INPUT:
//
//   TTRK (64-BIT COMPARE)
//
//    0                                    63
//    | TTSB ID |   X  |   Y  | COHORT ID |
//    +---------+------+------+-----------+
//    |    27   |  12  |  12  |     13    |
//
//
//   TTRK (32-BIT COMPARE)
//
//    0                                          63
//    | TTSB ID | N/A |   X  |   Y  | COHORT ID |
//    +---------+-----+------+------+-----------+
//    |    27   |  5  |  12  |  12  |     8     |
//
//
// OUTPUT:
//
//   TTSK v2:
//
//    0                                  63
//    | TTSB ID | PREFIX | N/A |  X |  Y |
//    +---------+--------+-----+----+----+
//    |    27   | 1 (=0) |  12 | 12 | 12 |
//
//
//   TTPK v1:
//
//    0                                       63
//    | TTPB ID | ALL ZEROES | SPAN |  X |  Y |
//    +---------+------------+------+----+----+
//    |    27   |      1     |  12  | 12 | 12 |
//
//
//   TTPK v2:
//
//    0                                  63
//    | TTPB ID | PREFIX | SPAN |  X |  Y |
//    +---------+--------+------+----+----+
//    |    27   | 1 (=1) |  12  | 12 | 12 |
//
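//
// For orientation only -- a sketch (not compiled) of how a tile's
// coordinate could be unpacked from a TTXK hi word, using the same
// offset macros and 12-bit masks as the debug code later in this
// file; "xk" is a hypothetical key variable:
//
#if 0
skc_uint const x = (xk.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF;
skc_uint const y = (xk.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF; // y occupies the topmost bits
#endif
//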
#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1)
//
// smem accumulator
//
union skc_subgroup_accum
{
  struct {
    SKC_ATOMIC_INT       ttp[SKC_TILE_HEIGHT];
  } atomic;

  struct {
    skc_ttp_t            ttp[SKC_TILE_HEIGHT];
  } aN;

  struct {
    SKC_PREFIX_TTP_V     ttp[SKC_PREFIX_SUBGROUP_SIZE];
  } vN;

  struct {
    SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH];
  } zero;
};
//
//
//
struct skc_subgroup_smem
{
  // prefix accumulator
  union skc_subgroup_accum accum;
};
//
//
//
static
skc_uint
skc_subgroup_lane()
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  return get_sub_group_local_id();
#else
  return 0;
#endif
}
//
//
//
static
SKC_PREFIX_TTS_V_BITFIELD
skc_tts_get_dy(skc_tts_v_t const ttsv)
{
  // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32]
  SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY;

  return dy - (~ttsv >> 31);
}
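//
// Worked example of the unpacking above (scalar case): the arithmetic
// shift (~ttsv >> 31) is -1 when the packed dy is non-negative and 0
// when it is negative, so the subtraction branchlessly skips zero --
// packed 0 -> +1, packed 31 -> +32, while packed -5 stays -5.
//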
static
SKC_PREFIX_TTS_V_BITFIELD
skc_tts_get_py(skc_tts_v_t const ttsv)
{
  return SKC_BFE(ttsv,
                 SKC_TTS_BITS_TY   - SKC_SUBPIXEL_RESL_Y_LOG2,
                 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
}
//
//
//
static
void
skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v)
{
  // get "altitude"
  SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v);

  // get the y pixel coordinate
  SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v);

  //
  // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid?
  //
  // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op
  //
#if 0
  if (tts_v != SKC_TTS_INVALID)
    printf("< %08X = %u : %d >\n",tts_v,py,dy);
#endif

  //
  // scatter-add the "altitude" to the accumulator
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                                \
  if (tts_v C != SKC_TTS_INVALID) {                                            \
    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C,dy C); \
  }
#else
  //
  // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS
  //
  // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)     \
  if (tts_v C == SKC_TTS_INVALID)   \
    return;                         \
  smem->accum.aN.ttp[py C] += dy C;
#endif

  SKC_PREFIX_TTS_VECTOR_INT_EXPAND();
}
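//
// Illustration only: SKC_PREFIX_TTS_VECTOR_INT_EXPAND() stamps out one
// copy of SKC_EXPAND_X per vector component. Assuming a 4-wide SIMD
// configuration, the call above would expand to roughly:
//
//   if (tts_v.s0 == SKC_TTS_INVALID) return;
//   smem->accum.aN.ttp[py.s0] += dy.s0;
//   if (tts_v.s1 == SKC_TTS_INVALID) return;
//   smem->accum.aN.ttp[py.s1] += dy.s1;
//   ...and likewise for .s2 and .s3
//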
//
// The implication here is that if our device configuration has a
// rectangular 1:2 tile then we need a block size of at least 2
// subblocks. The subblock size of course needs to match the length of
// the smallest tile side.
//
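//
// Example with assumed (not file-local) values: a 16x32 tile with
// 16-word subblocks gives SKC_TILE_RATIO == 2, so each of the 16
// lanes holds two of the 32 TTP rows and skc_accum_flush() below
// writes them out with a single vstore2() per lane.
//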
static
void
skc_accum_flush(__local struct skc_subgroup_smem * const smem,
                __global skc_bp_elem_t           * const bp_elems,
                skc_block_id_t                     const pb_id)
{
  // load the ttp elements
  SKC_PREFIX_TTP_V const ttp_v  = smem->accum.vN.ttp[get_sub_group_local_id()];
  skc_uint         const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();

#if   ( SKC_TILE_RATIO == 1 )
  bp_elems[offset] = ttp_v;
#elif ( SKC_TILE_RATIO == 2 )
  vstore2(ttp_v,offset,bp_elems);
#else
#error("tile ratio greater than 2 not supported")
#endif
}
//
//
//
static
void
skc_accum_reset(__local struct skc_subgroup_smem * const smem)
{
  for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++)
    smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 );
}
//
// get next sk key
//
static
skc_ttsk_s_t
skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v,
                    skc_uint     * const sk_next,
                    skc_int      * const rkpk_rem)
{
  // decrement count
  *rkpk_rem -= 1;

#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT with subgroup support is easy
  //
  // SIMT without subgroup support can always emulate with smem
  //
#if 0
  //
  // BUG TICKLED BY FILTHY CODE -- the Intel compiler doesn't properly
  // broadcast a uint2 cast to a long. It was probably bad to do this
  // anyway without a union wrapping the TTSK scalar type.
  //
  // Consider creating a union { ulong; uint2 } at a later date --
  // probably no need to ever do this unless it makes the broadcast
  // faster, which is unlikely since it will probably be implemented
  // as two 32-bit broadcasts anyway.
  //
  // Additionally, the TTRK and TTXK key bitfield sizes are probably
  // cast in stone and we aren't going to change them no matter what
  // architecture we're on.
  //
  skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++);
#else
  skc_ttsk_s_t sk_s;

  sk_s.lo   = sub_group_broadcast(sk_v->lo,*sk_next);
  sk_s.hi   = sub_group_broadcast(sk_v->hi,*sk_next);
  *sk_next += 1;
#endif
#else
  //
  // SIMD will always grab component .s0 and then rotate the vector
  //
  skc_ttsk_s_t const sk_s = ( sk_v->s0 );

  skc_ttsk_v_rotate_down(sk_v);
#endif

  return sk_s;
}
//
//
//
static
skc_raster_yx_s
skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next)
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT with subgroup support is easy
  //
  // SIMT without subgroup support can always emulate with smem
  //
  skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next);
#else
  //
  // SIMD will always grab component .s0 and then rotate the vector
  //
  skc_raster_yx_s const yx_s = ( sk_v->s0.hi );
#endif

  return yx_s;
}
//
// mask off ttsb id
//
static
skc_block_id_s_t
skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s)
{
  return ( sk_s->lo & SKC_TTXK_LO_MASK_ID );
}
//
// load tts_v as early as possible
//
static
skc_tts_v_t
skc_load_tts(__global skc_bp_elem_t * const bp_elems,
             skc_block_id_s_t         const sb_id)
{
  return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] );
}
//
// massage ttrk keys into ttsk keys
//
static
void
skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v)
{
  sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID;       // clear high (N/A) bits
  sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT;  // shift cohort away -- zeroes low bits
}
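//
// Illustration only, assuming the 32-bit-compare layout above (so
// SKC_TTRK_HI_BITS_COHORT is 8): the TTRK hi word
//
//   [ COHORT:8 | Y:12 | X:12 ]
//
// becomes the TTXK hi word
//
//   [ Y:12 | X:12 | 0:8 ]
//
// -- the cohort id is shifted away and x/y land at their TTXK offsets.
//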
//
// replenish ttsk keys
//
static
void
skc_ttsk_v_replenish(skc_ttsk_v_t                * const sk_v,
                     skc_uint                    * const sk_next,
                     skc_uint                    * const rks_next,
                     __global skc_ttrk_e_t const * const rks)
{
  // if there are still keys available then return
  if (*sk_next < SKC_PREFIX_TTXK_V_SIZE)
    return;

  //
  // otherwise, replenish sk_v
  //
  // NOTE NOTE NOTE -- we are assuming rks[] extent size is always
  // divisible by TTXK_V_SIZE and therefore loading some keys from the
  // next raster is OK.
  //
  *sk_next   = 0;
  *rks_next += SKC_PREFIX_SUBGROUP_SIZE;
  *sk_v      = rks[*rks_next];

#if 0
  printf("* %08X ( %3u, %3u )\n",
         sk_v->hi,
         (sk_v->hi >> 12) & 0xFFF,
         (sk_v->hi      ) & 0xFFF);
#endif

  skc_ttrk_to_ttsk(sk_v);

#if 0
  printf("! %08X ( %3u, %3u )\n",
         sk_v->hi,
         (sk_v->hi >> 20) & 0xFFF,
         (sk_v->hi >>  8) & 0xFFF);
#endif
}
//
// replenish block ids
//
// note that you can't overrun the block id pool since it's a ring
//
static
void
skc_blocks_replenish(skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                     __global skc_block_id_t const * const bp_ids)
{
  *blocks_idx  += SKC_PREFIX_BLOCK_ID_V_SIZE;
  *blocks       = bp_ids[*blocks_idx & bp_mask];
  *blocks_next  = 0;

#if 0
  printf("replenish blocks: %u\n",*blocks);
#endif
}
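//
// For example (pool size assumed): with a block pool ring of 2^20
// ids, bp_mask is 0xFFFFF and "*blocks_idx & bp_mask" wraps the
// monotonically increasing index around the pow2-sized ring.
//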
//
//
//
static
skc_block_id_t
skc_blocks_get_next(skc_uint                      * const blocks_next,
                    skc_uint                      * const blocks_idx,
                    skc_block_id_v_t              * const blocks,
                    skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                    __global skc_block_id_t const * const bp_ids)
{
  // replenish?
  if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE)
    {
      skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
    }

#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
#else
  //
  // SIMD
  //
  skc_block_id_t id = blocks->s0;

  skc_shuffle_down_1(*blocks);
#endif

  *blocks_next += 1;

  return id;
}
//
// subblock allocator
//
#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
static
skc_block_id_t
skc_subblocks_get_next_pb_id(skc_block_id_t                * const subblocks,
                             skc_uint                      * const blocks_next,
                             skc_uint                      * const blocks_idx,
                             skc_block_id_v_t              * const blocks,
                             skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                             __global skc_block_id_t const * const bp_ids)
{
  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
    }

  skc_block_id_t const pb_id = *subblocks;

  *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks

  return pb_id;
}
#endif
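//
// Illustration only (sizes assumed): with 128-word blocks and 16-word
// subblocks, SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK is 7, so a fresh
// block id is pulled from the pool every eighth subblock allocation --
// or every fourth when SKC_TILE_RATIO == 2, since pb ids then advance
// by two subblocks at a time.
//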
//
// append a ttsk key to the work-in-progress node
//
static
void
skc_node_v_append_sk(skc_ttsk_s_t            const * const sk_s,
                     skc_ttxk_v_t                  * const xk_v,
                     skc_uint                      * const xk_v_next,
                     skc_uint                      * const xk_v_idx,
                     __global skc_bp_elem_t        * const bp_elems,
                     skc_int                         const rkpk_rem,
                     skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask,
                     __global skc_block_id_t const * const bp_ids)
{
  //
  // Append an sk key to the in-register xk_v vector.
  //
  // If the work-in-progress node in gmem will only have room for one
  // more key then:
  //
  //   - if this was the final SK then write out xk_v and exit
  //
  //   - otherwise, acquire a block id, link it, write out xk_v, and
  //     prepare a new node
  //
  // Note that this does *not* try to squeeze a final key into the
  // next node slot. That optimization isn't worth the added
  // down-pipeline complexity.
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
    {
      *xk_v = *sk_s;
    }

  *xk_v_next += 1;

  // are there more keys coming?
  if (rkpk_rem > 0)
    {
      // is the node almost full?
      if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
        {
          skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

          // the last lane holds the link to the next node
          if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
            {
              xk_v->lo = id;
              xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
            }

          // store xk_v (uint2) to the bp (uint)
          bp_elems[*xk_v_idx                          ] = xk_v->lo;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

#if 0
          printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif

          // reinitialize xk_v
          xk_v->lo = SKC_UINT_MAX;
          xk_v->hi = SKC_UINT_MAX;

          // update the node elem idx
          *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

          // reset the node count
          *xk_v_next = 0;
        }
      // is xk_v full?
      else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
        {
          // store xk_v to the bp
          bp_elems[*xk_v_idx                          ] = xk_v->lo;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

#if 0
          printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif

          // reinitialize xk_v
          xk_v->lo = SKC_UINT_MAX;
          xk_v->hi = SKC_UINT_MAX;

          // increment the node elem idx
          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
        }
    }
  else
    {
      // this was the final key -- write out the node and pad the rest
      // of its block with invalid keys
      bp_elems[*xk_v_idx                          ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

#if 0
      printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif

      while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
        {
          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

          bp_elems[*xk_v_idx                          ] = SKC_UINT_MAX;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
        }
    }
#else
  //
  // SIMD
  //
#endif
}
//
//
//
static
skc_ttpk_s_t
skc_ttpk_create(skc_raster_yx_s const yx_prev,
                skc_raster_yx_s const yx_next,
                skc_block_id_t  const pb_id)
{
  // - yx_prev has already been advanced by one x unit by the caller
  // - yx_span is therefore already scaled by the hi word's x offset
  skc_uint const yx_span = yx_next - yx_prev;

  skc_ttpk_s_t pk;

  // turn on the prefix bit | shift the span bits upward
  pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN);

  // shift the high span bits down | yx of the tile
  pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("* %08v2X : %u\n",pk,yx_span);
#endif

  return pk;
}
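//
// Worked example (offsets assumed from the key layouts at the top of
// this file, with x at bit 8 of the hi word): sk tiles at x=2 and x=7
// on the same row arrive here as yx_prev advanced to x=3 and yx_next
// at x=7, so yx_span == (7-3) << 8 and the pk covers the 4 interior
// tiles x=3..6. The shifts then split the span across the two words:
// its low bits move up into pk.lo and its high bits down into pk.hi,
// reassembling the 12-bit SPAN field that straddles lo and hi.
//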
//
// append a ttpk key to the work-in-progress node
//
static
void
skc_node_v_append_pk(skc_ttpk_s_t            const * const pk_s,
                     skc_ttxk_v_t                  * const xk_v,
                     skc_uint                      * const xk_v_next,
                     skc_uint                      * const xk_v_idx,
                     __global skc_bp_elem_t        * const bp_elems,
                     skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask,
                     __global skc_block_id_t const * const bp_ids)
{
  //
  // Append a pk key to the in-register xk_v vector.
  //
  // If the work-in-progress node in gmem will only have room for one
  // more key then acquire a block id, link it, write out xk_v, and
  // prepare a new node.
  //
  // Unlike the sk path, a pk key is always followed by the sk key
  // that triggered it, so there is no final-key early exit here.
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
    {
      *xk_v = *pk_s;
    }

  *xk_v_next += 1;

  // is the node almost full?
  if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
    {
      skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

      // the last lane holds the link to the next node
      if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
        {
          xk_v->lo = id;
          xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
        }

      // store xk_v to the bp
      bp_elems[*xk_v_idx                          ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

#if 0
      printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif

      // reinitialize xk_v
      xk_v->lo = SKC_UINT_MAX;
      xk_v->hi = SKC_UINT_MAX;

      // update the node elem idx
      *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

      // reset the node count
      *xk_v_next = 0;
    }
  // is xk_v full?
  else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
    {
      // store xk_v to the bp
      bp_elems[*xk_v_idx                          ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

#if 0
      printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif

      // reinitialize xk_v
      xk_v->lo = SKC_UINT_MAX;
      xk_v->hi = SKC_UINT_MAX;

      // increment the node elem idx
      *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
    }
#else
  //
  // SIMD
  //
#endif
}
//
// append the first 3 fields of meta info to the raster header
//
static
void
skc_node_v_init_header(skc_ttxk_v_t                           * const xk_v,
                       skc_uint                               * const xk_v_next,
                       union skc_raster_cohort_meta_out const * const meta)
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  if (get_sub_group_local_id() < 2)
    {
      *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi;
    }

#if 0
  if (get_sub_group_local_id() == 0)
    printf("header: %08v4X\n",meta->u32v4);
#endif

  //
  // advance the counter: the header occupies 4 uint2 slots -- 2 for
  // the meta info and 2 for the still-uninitialized bounds
  //
  *xk_v_next = 2 + 2;
#else
  //
  // SIMD
  //
#endif
}
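//
// Sketch of the resulting raster head layout, one uint2 slot per
// entry (field order assumed from the debug print in the kernel
// below):
//
//   slots 0-1 : cohort meta -- blocks / offset / nodes / keys
//   slots 2-3 : bounds -- left uninitialized by this kernel
//   slots 4.. : TTSK and TTPK keys
//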
//
//
//
__kernel
SKC_PREFIX_KERNEL_ATTRIBS
void
skc_kernel_prefix(__global skc_uint       const * const bp_atomics,
                  __global skc_block_id_t const * const bp_ids,
                  __global skc_bp_elem_t        * const bp_elems,
                  skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                  __global skc_ttrk_e_t   const * const rks,
                  __global skc_block_id_t       * const map,
                  __global skc_uint       const * const metas,
                  skc_uint                        const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem                  smem[1];
#else
  __local struct skc_subgroup_smem                  smems[SKC_PREFIX_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id();
#endif

  //
  // where is this subgroup in the grid?
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const sgi = get_group_id(0);
#else
  skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  skc_uint const sgl = get_sub_group_local_id();

  //
  // return if this subgroup is excess
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 )
  if (sgi >= count)
    return;
#endif

  //
  // get the meta info for this subgroup's raster
  //
  union skc_raster_cohort_meta_out const meta  = { vload4(sgi,metas) };
  skc_uint                         const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%3u : %5u / %5u / %5u / %5u / %u\n",
           sgi,
           meta.blocks,
           meta.offset,
           meta.nodes,
           meta.keys,
           reads);
#endif

  //
  // preload blocks -- align on the subgroup
  //
  skc_uint         blocks_idx  = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
  skc_block_id_v_t blocks      = bp_ids[blocks_idx & bp_mask];
  skc_uint         blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK);
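  //
  // Worked example of the alignment above (subgroup size assumed to
  // be 8): if reads == 13 then blocks_idx == 8 + lane, so the load
  // fetches ids 8..15, and blocks_next == 5 -- broadcasting lane 5
  // yields id 13, the first block id this raster actually consumes.
  //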
  //
  // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset
  //
  skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

  //
  // initialize the raster header -- assumes the block is greater than 8 words (4 doublewords)
  //
  skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX };
  skc_uint     xk_v_next;

  skc_node_v_init_header(&xk_v,&xk_v_next,&meta);

  //
  // no keys -- this is an empty raster!
  //
  if (meta.keys == 0)
    {
      bp_elems[xk_v_idx                          ] = xk_v.lo;
      bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi;

      while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
        {
          xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

          bp_elems[xk_v_idx                          ] = SKC_UINT_MAX;
          bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
        }

      return;
    }

  //
  // load TTRK keys and convert them to TTSK keys in place
  //
  skc_uint     rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
  skc_ttsk_v_t sk_v     = rks[rks_next];
  skc_uint     sk_next  = (meta.offset & SKC_PREFIX_SUBGROUP_MASK);
  skc_int      rkpk_rem = meta.keys; // signed count of remaining rk+pk keys

#if 0
  printf("* %08X ( %3u, %3u )\n",
         sk_v.hi,
         (sk_v.hi >> 12) & 0xFFF,
         (sk_v.hi      ) & 0xFFF);
#endif

  skc_ttrk_to_ttsk(&sk_v);

#if 0
  printf("! %08X ( %3u, %3u )\n",
         sk_v.hi,
         (sk_v.hi >> 20) & 0xFFF,
         (sk_v.hi >>  8) & 0xFFF);
#endif

  //
  // subblocks
  //
#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
  skc_block_id_t subblocks = 0;
#endif

  //
  // begin the "scan" of tiles
  //
  skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next);

  //
  // zero the accumulator
  //
  skc_accum_reset(smem);
  while (true)
    {
      // get the next rk key
      skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem);

      // load the ttsb id
      skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s);

      // get the tts_v load "in flight" as early as possible
      skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id);

#if 0
      printf("{ %08X }\n",tts_v);
#endif

#if 0
      if (get_sub_group_local_id() == 0)
        printf("[ %d, %X ]\n",rkpk_rem,sb_id);
#endif

#if 0
      if (get_sub_group_local_id() == 0)
        printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF);
#endif

      //
      // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF
      // TIME AND SIMD'IZED
      //

      // if the yx keys don't match then we're either issuing a ttpk
      // or resetting the accumulator
      if (sk_s.hi != yx_prev)
        {
          // if the y coordinates match then only x changed -- we're
          // still on the same tile row, so flush and emit a ttpk
          if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0)
            {
              //
              // if the tile is not square then its ratio is 1:2
              //
#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2
              skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks,
                                                                        &blocks_next,
                                                                        &blocks_idx,
                                                                        &blocks,
                                                                        bp_mask,
                                                                        bp_ids);
#else
              skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next,
                                                               &blocks_idx,
                                                               &blocks,
                                                               bp_mask,
                                                               bp_ids);
#endif

              // flush the accumulated ttp vector to the block/subblock at pb_id
              skc_accum_flush(smem,bp_elems,pb_id);

#if 0
              if (get_sub_group_local_id() == 0)
                {
                  printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n",
                         pb_id,
                         (yx_prev >> SKC_TTXK_HI_OFFSET_Y),
                         (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF,
                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF,
                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF);
                }
#endif

              //
              // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP
              //
              rkpk_rem -= 1;

              // create the pk
              skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id);

              // append the pk key to the xk buffer
              skc_node_v_append_pk(&pk_s,
                                   &xk_v,
                                   &xk_v_next,
                                   &xk_v_idx,
                                   bp_elems,
                                   &blocks_next,
                                   &blocks_idx,
                                   &blocks,
                                   bp_mask,
                                   bp_ids);
            }
          else if (rkpk_rem > 0) // we're starting a new tile row
            {
              skc_accum_reset(smem);
            }
        }

      //
      // append the sk key to node_v -- if rkpk_rem is now zero this
      // also writes out the final node
      //
      skc_node_v_append_sk(&sk_s,
                           &xk_v,
                           &xk_v_next,
                           &xk_v_idx,
                           bp_elems,
                           rkpk_rem,
                           &blocks_next,
                           &blocks_idx,
                           &blocks,
                           bp_mask,
                           bp_ids);

      // we're done if there are no more sk keys
      if (rkpk_rem == 0)
        break;

      // move to the new tile
      yx_prev = sk_s.hi;

      // scatter the tts values into the accumulator
      skc_accum_scatter(smem,tts_v);

      // replenish sk keys
      skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks);
    }
}
//
//
//