blob: 2373e1c43011903975d948e024963ba469ac3a28 [file] [log] [blame]
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"
#include "hs/cl/intel/gen8/u64/hs_config.h"
#include "hs/cl/intel/hs_cl_macros.h"
//
//
//
#define HS_LANE_MASK (HS_SLAB_WIDTH - 1)
//
//
//
#define SKC_YX_NEQ(row,prev) \
(((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_SLAB_WIDTH)))
void
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const indices,
__global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
{
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_SLAB_WIDTH_LOG2) * HS_SLAB_KEYS;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_SLAB_HEIGHT;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_SLAB_WIDTH];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = r1;
if (gmem_base > 0) {
// if this is the first key in any slab but the first then it
// broadcast loads the last key in previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else if (get_sub_group_local_id() == 0) {
// if this is the first lane in the first slab
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
//
// FIND ALL DIFFERENCES IN SLAB
//
uint valid = 0;
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= (SKC_YX_NEQ(row,prev) << prev);
HS_SLAB_ROWS();
//
// SUM UP THE DIFFERENCES
//
uint const valid_diffs = valid & diffs;
uint const count = popcount(valid_diffs);
uint const inclusive = sub_group_scan_inclusive_add(count);
uint const exclusive = inclusive - count;
//
// RESERVE SPACE IN THE INDICES ARRAY
//
uint next = 0;
if (get_sub_group_local_id() == HS_SLAB_WIDTH-1)
next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
// distribute base across subgroup
next = exclusive + sub_group_broadcast(next,HS_SLAB_WIDTH-1);
//
// STORE THE INDICES
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (valid_diffs & (1 << prev)) \
indices[next++] = lane_idx + prev;
HS_SLAB_ROWS();
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
}
//
//
//