/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "kernel_cl_12.h"
#include "raster_builder_cl_12.h" // need meta_in structure
#include "hs/cl/intel/gen8/u64/hs_config.h"
#include "hs/cl/intel/hs_cl_macros.h"
//
//
//
#define HS_LANE_MASK (HS_SLAB_WIDTH - 1)
//
// THE BEST TYPE TO ZERO SMEM
//
#define SKC_ZERO_TYPE ulong
#define SKC_ZERO_WORDS 2
//
// THE ORDER OF COMPONENTS IS:
//
// 0: blocks
// 1: offset
// 2: pk
// 3: rk
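//
// e.g., with a uint meta and uchar components (the HS_SLAB_KEYS < 256
// branch below), SKC_META_RESET and SKC_META_ADD further down pack
// the meta word as:
//
//   bits  0- 7 : blocks
//   bits  8-15 : offset
//   bits 16-23 : pk
//   bits 24-31 : rk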
//
#if (HS_SLAB_KEYS < 256)
#define SKC_META_TYPE uint
#define SKC_META_WORDS 1
#define SKC_COMPONENT_TYPE uchar
#else
#define SKC_META_TYPE uint2
#define SKC_META_WORDS 2
#define SKC_COMPONENT_TYPE ushort
#endif
//
//
//
#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
#define SKC_COHORT_TYPE uchar
#else
#define SKC_COHORT_TYPE ushort
#endif
//
//
//
#define SKC_COHORT_ID(row)                              \
  as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
//
// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
//
#define SKC_IS_BLOCK(row)                               \
  ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
#define SKC_YX(row,prev)                                \
  (as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
#define SKC_IS_PK(row,prev)                             \
  ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
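//
// The unsigned compare in SKC_IS_PK is true exactly when
// 0 < SKC_YX(row,prev) <= SKC_TTRK_HI_MASK_X -- i.e. when the two hi
// words differ only in their X bits (same Y and cohort, different X),
// assuming X occupies the low bits of the hi word per the FIXME above.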
//
// COHORT SIZE IS ALWAYS A POWER-OF-TWO
// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
//
// COHORT SIZE >= SUBGROUP SIZE
//
#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_SLAB_WIDTH_LOG2))
#define SKC_META_COMPONENTS 4
#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
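//
// A worked example with hypothetical values -- SKC_TTRK_HI_BITS_COHORT
// of 8, a uint meta and a ulong zero type:
//
//   SKC_COHORT_SIZE          = 256
//   SKC_ZERO_RATIO           = 2 / 1       = 2
//   SKC_META_ZERO_COUNT      = 256 * 4 / 8 = 128  ulongs
//   SKC_META_COMPONENT_COUNT = 256 * 4 / 1 = 1024 uchars
//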
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_SLAB_WIDTH)))
void
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
                        __global uint        * SKC_RESTRICT const metas)
{
  __local union
  {
    SKC_META_TYPE      volatile m[SKC_COHORT_SIZE];
    SKC_ZERO_TYPE               z[SKC_META_ZERO_COUNT];
    SKC_COMPONENT_TYPE          c[SKC_META_COMPONENT_COUNT];
  } shared;
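  //
  // addressing sketch (inferred from the HotSort slab macros): one
  // subgroup sorts one slab of HS_SLAB_KEYS keys -- gmem_base is the
  // slab's first key in the extent, gmem_idx is this lane's first key,
  // and gmem_off is the lane's slab-relative position once the
  // transpose below stores each lane's column of HS_SLAB_HEIGHT keys
  // contiguously
  //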
  uint const global_id = get_global_id(0);
  uint const gmem_base = (global_id >> HS_SLAB_WIDTH_LOG2) * HS_SLAB_KEYS;
  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);
  uint const gmem_off  = (global_id & HS_LANE_MASK) * HS_SLAB_HEIGHT;
  //
  // LOAD ALL THE ROWS
  //
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_SLAB_WIDTH];
  HS_SLAB_ROWS();
  //
  // LOAD LAST REGISTER FROM COLUMN TO LEFT
  //
  uint  diffs = 0;
  uint2 r0    = 0;

  if (gmem_base > 0) {
    // for any slab but the first, broadcast-load the last key of the
    // previous slab -- the shuffle below keeps it in lane 0 only
    r0.hi = as_uint2(vout[gmem_base - 1]).hi;
  } else {
    // otherwise broadcast the first key in the first slab
    r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
    // and mark it as an implicit diff
    if (get_sub_group_local_id() == 0)
      diffs = 1;
  }

  // now shuffle in the last key from the column to the left
  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);

  // shift away y/x
  SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
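  // note: the live code below only ever inspects r0.hi -- the cohort
  // id and y/x coordinates of a TTRK key live entirely in the hi dword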
  //
  // EXTRACT ALL COHORT IDS EARLY...
  //
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
  HS_SLAB_ROWS();
  //
  // DEBUG
  //
#if 0
  if (gmem_base == HS_SLAB_KEYS * 7)
    {
      if (get_sub_group_local_id() == 0)
        printf("\n%llX ",as_ulong(r0));
      else
        printf("%llX ",as_ulong(r0));
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
      if (get_sub_group_local_id() == 0)                \
        printf("\n%llX ",r##row);                       \
      else                                              \
        printf("%llX ",r##row);
      HS_SLAB_ROWS();
    }
#endif
  //
  // CAPTURE ALL CONDITIONS WE CARE ABOUT
  //
  // Diffs must be captured before cohorts
  //
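  // What the four masks record in bit `prev` per row (summarizing the
  // macros above):
  //
  //   diffs  : row's cohort id differs from the previous key's
  //   blocks : row's subblock id is block-aligned, i.e. the key points
  //            at the first subblock of a block
  //   pks    : row differs from the previous key only in its X bits
  //   valid  : row is not an all-ones (SKC_ULONG_MAX) padding key
  //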
  uint            valid  = 0;
  uint            blocks = 0;
  uint            pks    = 0;
  SKC_COHORT_TYPE c_max  = 0;
  //
  // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
  // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
  //
#if 0
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  diffs |= ((c##row != c##prev) << prev);
  HS_SLAB_ROWS();
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  blocks |= (SKC_IS_BLOCK(row) << prev);
  HS_SLAB_ROWS();
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  pks |= (SKC_IS_PK(row,prev) << prev);
  HS_SLAB_ROWS();
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  valid |= ((r##row != SKC_ULONG_MAX) << prev);
  HS_SLAB_ROWS();
#else
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  if (c##row != c##prev)                                \
    diffs |= 1<<prev;
  HS_SLAB_ROWS();
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  if (SKC_IS_BLOCK(row))                                \
    blocks |= 1<<prev;
  HS_SLAB_ROWS();
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  if (SKC_IS_PK(row,prev))                              \
    pks |= 1<<prev;
  HS_SLAB_ROWS();
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  if (r##row != SKC_ULONG_MAX) {                        \
    valid |= 1<<prev;                                   \
    c_max  = max(c_max,c##row);                         \
  }
  HS_SLAB_ROWS();
#endif
  //
  // TRANSPOSE THE SLAB AND STORE IT
  //
  HS_TRANSPOSE_SLAB();

  // the min cohort is the first key in the slab
  uint const c_min = sub_group_broadcast(c1,0);

  // the max cohort is the max across all lanes
  c_max = sub_group_reduce_max(c_max);

#if 0 // REMOVE ME LATER
  if (get_sub_group_local_id() == 0)
    printf("%3u : ( %3u , %3u )\n",
           get_global_id(0)>>HS_SLAB_WIDTH_LOG2,c_min,c_max);
#endif
  //
  // ZERO SMEM
  //
  // zero only the meta info for the cohort ids found in this slab
  //
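  // (the loop below is lane-strided: lane i clears zero-words i,
  // i + HS_SLAB_WIDTH, i + 2*HS_SLAB_WIDTH, ... starting from the
  // slab-width-aligned word containing c_min's meta entry)
  //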
#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
  uint       zz     = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
  uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;

  for (; zz<=zz_max; zz+=HS_SLAB_WIDTH)
    shared.z[zz] = 0;
#else
  // ERROR -- it's highly unlikely that the zero type is smaller than
  // the meta type
#error("Unsupported right now...")
#endif
  //
  // ACCUMULATE AND STORE META INFO
  //
  uint const valid_blocks = valid & blocks;
  uint const valid_pks    = valid & pks & ~diffs;

  SKC_META_TYPE meta = ( 0 );

#define SKC_META_LOCAL_ADD(meta)                        \
  atomic_add(shared.m+HS_REG_LAST(c),meta);

#define SKC_META_LOCAL_STORE(meta,prev)                 \
  shared.m[c##prev] = meta;

  // note this is purposefully off by +1
#define SKC_META_RESET(meta,curr)                       \
  meta = ((gmem_off + curr) << 8);
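
  // why the +1 works out (an inference from the code, not a statement
  // of intent): rows are numbered from 1, so (gmem_off + curr) lands
  // one past the cohort's first slab-relative key index, and the
  // (gmem_base - 1) adjustment at the end of the kernel appears to
  // cancel the +1, yielding an absolute extent offset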
#if 0
  // FIXME -- this can be tweaked to shift directly
#define SKC_META_ADD(meta,prev,blocks,pks,rks)          \
  meta += ((((blocks >> prev) & 1)      ) |             \
           (((pks    >> prev) & 1) << 16) |             \
           (((rks    >> prev) & 1) << 24));
#else
#define SKC_META_ADD(meta,prev,blocks,pks,rks)          \
  if (blocks & (1<<prev))                               \
    meta += 1;                                          \
  if (pks & (1<<prev))                                  \
    meta += 1<<16;                                      \
  if (rks & (1<<prev))                                  \
    meta += 1<<24;
#endif
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  if (diffs & (1<<prev)) {                              \
    SKC_META_LOCAL_STORE(meta,prev);                    \
    SKC_META_RESET(meta,row);                           \
  }                                                     \
  SKC_META_ADD(meta,prev,                               \
               valid_blocks,                            \
               valid_pks,                               \
               valid);
  HS_SLAB_ROWS();

  //
  // ATOMICALLY ADD THE CARRIED OUT METAS
  //
#if 0 // BUG
  if ((valid & (1<<(HS_SLAB_HEIGHT-1))) && (meta != 0))
    SKC_META_LOCAL_ADD(meta);
#else
  if (meta != 0)
    SKC_META_LOCAL_ADD(meta);
#endif
  //
  // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
  //
  // convert the slab offset to an extent offset
  bool const is_offset = (get_sub_group_local_id() & 3) == 1;
  uint const adjust    = is_offset ? gmem_base - 1 : 0;
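  // assuming HS_SLAB_WIDTH is a power-of-two multiple of 4, cc & 3
  // below always equals the lane id & 3, so the lanes flagged here are
  // exactly the ones that visit "offset" components (component 1 in
  // the ordering at the top of this file)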
  //
  // only process the meta components found in this slab
  //
  uint const cc_min = c_min * SKC_META_COMPONENTS;
  uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
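  // the first pass below is guarded because cc is rounded down to a
  // slab-width boundary and may start before cc_min; later passes
  // stride by HS_SLAB_WIDTH and only need the upper bound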
  uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();

  if ((cc >= cc_min) && (cc <= cc_max))
    {
      uint const c = shared.c[cc];

      if (c != 0)
        atomic_add(metas+cc,c+adjust);
    }

  cc += HS_SLAB_WIDTH;

  for (; cc<=cc_max; cc+=HS_SLAB_WIDTH)
    {
      uint const c = shared.c[cc];

      if (c != 0)
        atomic_add(metas+cc,c+adjust);
    }
}
//
//
//