//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
#ifndef HS_CUDA_MACROS_ONCE
#define HS_CUDA_MACROS_ONCE
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#ifdef __cplusplus
}
#endif
//
// Define the type based on key and val sizes
//
#if HS_KEY_WORDS == 1
#if HS_VAL_WORDS == 0
#define HS_KEY_TYPE uint32_t
#endif
#elif HS_KEY_WORDS == 2
#define HS_KEY_TYPE uint64_t
#endif
//
// FYI, restrict shouldn't have any impact on these kernels, and
// benchmarks appear to bear that out
//
#define HS_RESTRICT __restrict__
//
//
//
#define HS_SCOPE() \
static
#define HS_KERNEL_QUALIFIER() \
__global__ void
//
// The sm_35 arch has a maximum of 16 blocks per multiprocessor. Just
// clamp it to 16 when targeting this arch.
//
// This only arises when compiling the 32-bit sorting kernels.
//
// You can also generate a narrower 16-warp wide 32-bit sorting kernel
// which is sometimes faster and sometimes slower than the 32-block
// configuration.
//
#if ( __CUDA_ARCH__ == 350 )
#define HS_CUDA_MAX_BPM 16
#else
#define HS_CUDA_MAX_BPM UINT32_MAX // effectively unclamped (32)
#endif
#define HS_CLAMPED_BPM(min_bpm) \
((min_bpm) < HS_CUDA_MAX_BPM ? (min_bpm) : HS_CUDA_MAX_BPM)
//
//
//
#define HS_LAUNCH_BOUNDS(max_tpb,min_bpm) \
__launch_bounds__(max_tpb,HS_CLAMPED_BPM(min_bpm))
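//
// For example, with illustrative values of 256 threads per block and
// a requested minimum of 32 blocks per multiprocessor,
//
//   HS_LAUNCH_BOUNDS(256,32)
//
// expands to __launch_bounds__(256,16) when targeting sm_35 and to
// __launch_bounds__(256,32) on every other arch.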
//
// KERNEL PROTOS
//
#define HS_BS_KERNEL_NAME(slab_count_ru_log2) \
hs_kernel_bs_##slab_count_ru_log2
#define HS_BS_KERNEL_PROTO(slab_count,slab_count_ru_log2) \
HS_SCOPE() \
HS_KERNEL_QUALIFIER() \
HS_LAUNCH_BOUNDS(HS_SLAB_THREADS*slab_count,1) \
HS_BS_KERNEL_NAME(slab_count_ru_log2)(HS_KEY_TYPE * const HS_RESTRICT vout, \
HS_KEY_TYPE const * const HS_RESTRICT vin)
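//
// As a sketch, assuming HS_SLAB_THREADS == 32 and 64-bit keys
// (HS_KEY_WORDS == 2), HS_BS_KERNEL_PROTO(16,4) expands to roughly:
//
//   static __global__ void
//   __launch_bounds__(512,1)
//   hs_kernel_bs_4(uint64_t       * const __restrict__ vout,
//                  uint64_t const * const __restrict__ vin)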
//
#define HS_OFFSET_BS_KERNEL_NAME(slab_count_ru_log2) \
hs_kernel_bs_##slab_count_ru_log2
#define HS_OFFSET_BS_KERNEL_PROTO(slab_count,slab_count_ru_log2) \
HS_SCOPE() \
HS_KERNEL_QUALIFIER() \
HS_LAUNCH_BOUNDS(HS_SLAB_THREADS*slab_count,HS_BS_SLABS/(1<<slab_count_ru_log2)) \
HS_OFFSET_BS_KERNEL_NAME(slab_count_ru_log2)(HS_KEY_TYPE * const HS_RESTRICT vout, \
HS_KEY_TYPE const * const HS_RESTRICT vin, \
uint32_t const slab_offset)
//
#define HS_BC_KERNEL_NAME(slab_count_log2) \
hs_kernel_bc_##slab_count_log2
#define HS_BC_KERNEL_PROTO(slab_count,slab_count_log2) \
HS_SCOPE() \
HS_KERNEL_QUALIFIER() \
HS_LAUNCH_BOUNDS(HS_SLAB_THREADS*slab_count,HS_BS_SLABS/(1<<slab_count_log2)) \
HS_BC_KERNEL_NAME(slab_count_log2)(HS_KEY_TYPE * const HS_RESTRICT vout)
//
#define HS_HM_KERNEL_NAME(s) \
hs_kernel_hm_##s
#define HS_HM_KERNEL_PROTO(s) \
HS_SCOPE() \
HS_KERNEL_QUALIFIER() \
HS_HM_KERNEL_NAME(s)(HS_KEY_TYPE * const HS_RESTRICT vout)
//
#define HS_FM_KERNEL_NAME(s,r) \
hs_kernel_fm_##s##_##r
#define HS_FM_KERNEL_PROTO(s,r) \
HS_SCOPE() \
HS_KERNEL_QUALIFIER() \
HS_FM_KERNEL_NAME(s,r)(HS_KEY_TYPE * const HS_RESTRICT vout)
//
#define HS_OFFSET_FM_KERNEL_NAME(s,r) \
hs_kernel_fm_##s##_##r
#define HS_OFFSET_FM_KERNEL_PROTO(s,r) \
HS_SCOPE() \
HS_KERNEL_QUALIFIER() \
HS_OFFSET_FM_KERNEL_NAME(s,r)(HS_KEY_TYPE * const HS_RESTRICT vout, \
uint32_t const span_offset)
//
#define HS_TRANSPOSE_KERNEL_NAME() \
hs_kernel_transpose
#define HS_TRANSPOSE_KERNEL_PROTO() \
HS_SCOPE() \
HS_KERNEL_QUALIFIER() \
HS_LAUNCH_BOUNDS(HS_SLAB_THREADS,1) \
HS_TRANSPOSE_KERNEL_NAME()(HS_KEY_TYPE * const HS_RESTRICT vout)
//
// BLOCK LOCAL MEMORY DECLARATION
//
#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \
__shared__ struct { \
HS_KEY_TYPE m[width * height]; \
} shared
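//
// For example, HS_BLOCK_LOCAL_MEM_DECL(32,16) (illustrative sizes)
// declares a single block-wide shared memory array of 32 * 16 keys:
//
//   __shared__ struct { HS_KEY_TYPE m[32 * 16]; } shared;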
//
// BLOCK BARRIER
//
#define HS_BLOCK_BARRIER() \
__syncthreads()
//
// GRID VARIABLES
//
#define HS_GLOBAL_SIZE_X() (gridDim.x * blockDim.x)
#define HS_GLOBAL_ID_X() (blockDim.x * blockIdx.x + threadIdx.x)
#define HS_LOCAL_ID_X() threadIdx.x
#define HS_WARP_ID_X() (threadIdx.x / 32)
#define HS_LANE_ID() (threadIdx.x & 31)
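//
// HS_WARP_ID_X() and HS_LANE_ID() hard-code the 32-lane CUDA warp.
// For example, with blockDim.x == 128 the thread with
// threadIdx.x == 70 has HS_WARP_ID_X() == 2 and HS_LANE_ID() == 6.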
//
// SLAB GLOBAL
//
#define HS_SLAB_GLOBAL_PREAMBLE() \
uint32_t const gmem_idx = \
(HS_GLOBAL_ID_X() & ~(HS_SLAB_THREADS-1)) * \
HS_SLAB_HEIGHT + HS_LANE_ID()
#define HS_OFFSET_SLAB_GLOBAL_PREAMBLE() \
uint32_t const gmem_idx = \
((slab_offset + HS_GLOBAL_ID_X()) & ~(HS_SLAB_THREADS-1)) * \
HS_SLAB_HEIGHT + HS_LANE_ID()
#define HS_SLAB_GLOBAL_LOAD(extent,row_idx) \
extent[gmem_idx + HS_SLAB_THREADS * row_idx]
#define HS_SLAB_GLOBAL_STORE(row_idx,reg) \
vout[gmem_idx + HS_SLAB_THREADS * row_idx] = reg
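//
// A worked example, assuming HS_SLAB_THREADS == 32 and
// HS_SLAB_HEIGHT == 16 (both are configuration values defined outside
// this header) and a thread in block 0 with threadIdx.x == 70:
//
//   gmem_idx = (70 & ~31) * 16 + (70 & 31)
//            = 64 * 16 + 6
//            = 1030
//
// so each 32-lane slab owns a contiguous 32 * 16 key region of the
// extent and HS_SLAB_GLOBAL_LOAD(vin,row_idx) strides through it
// 32 keys at a time.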
//
// SLAB LOCAL
//
#define HS_SLAB_LOCAL_L(offset) \
shared.m[smem_l_idx + (offset)]
#define HS_SLAB_LOCAL_R(offset) \
shared.m[smem_r_idx + (offset)]
//
// SLAB LOCAL VERTICAL LOADS
//
#define HS_BX_LOCAL_V(offset) \
shared.m[HS_LOCAL_ID_X() + (offset)]
//
// BLOCK SORT MERGE HORIZONTAL
//
#define HS_BS_MERGE_H_PREAMBLE(slab_count) \
uint32_t const smem_l_idx = \
HS_WARP_ID_X() * (HS_SLAB_THREADS * slab_count) + \
HS_LANE_ID(); \
uint32_t const smem_r_idx = \
(HS_WARP_ID_X() ^ 1) * (HS_SLAB_THREADS * slab_count) + \
(HS_LANE_ID() ^ (HS_SLAB_THREADS - 1))
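//
// A sketch of the indexing, assuming HS_SLAB_THREADS == 32 and
// slab_count == 2: warp 0, lane L computes
//
//   smem_l_idx = 0 * 64 + L         =      L
//   smem_r_idx = 1 * 64 + (L ^ 31)  = 64 + (31 - L)
//
// i.e. each warp reads its left elements from its own strip of shared
// memory and its right elements from the partner warp's strip in
// reversed lane order, which is the pairing a flip merge of two
// adjacent slab strips requires.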
//
// BLOCK CLEAN MERGE HORIZONTAL
//
#define HS_BC_MERGE_H_PREAMBLE(slab_count) \
uint32_t const gmem_l_idx = \
(HS_GLOBAL_ID_X() & ~(HS_SLAB_THREADS*slab_count-1)) * \
HS_SLAB_HEIGHT + HS_LOCAL_ID_X(); \
uint32_t const smem_l_idx = \
HS_WARP_ID_X() * (HS_SLAB_THREADS * slab_count) + \
HS_LANE_ID()
#define HS_BC_GLOBAL_LOAD_L(slab_idx) \
vout[gmem_l_idx + (HS_SLAB_THREADS * slab_idx)]
//
// SLAB FLIP AND HALF PREAMBLES
//
#define HS_SLAB_FLIP_PREAMBLE(mask) \
uint32_t const flip_lane_idx = HS_LANE_ID() ^ mask; \
int32_t const t_lt = HS_LANE_ID() < flip_lane_idx;
// if we want to shfl_xor: uint32_t const flip_lane_mask = mask;
#define HS_SLAB_HALF_PREAMBLE(mask) \
uint32_t const half_lane_idx = HS_LANE_ID() ^ mask; \
int32_t const t_lt = HS_LANE_ID() < half_lane_idx;
// if we want to shfl_xor: uint32_t const half_lane_mask = mask;
//
// Inter-lane compare exchange
//
// good
#define HS_CMP_XCHG_V0(a,b) \
{ \
HS_KEY_TYPE const t = min(a,b); \
b = max(a,b); \
a = t; \
}
// surprisingly fast -- #1 on 64-bit keys
#define HS_CMP_XCHG_V1(a,b) \
{ \
HS_KEY_TYPE const tmp = a; \
a = (a < b) ? a : b; \
b ^= a ^ tmp; \
}
// good
#define HS_CMP_XCHG_V2(a,b) \
if (a >= b) { \
HS_KEY_TYPE const t = a; \
a = b; \
b = t; \
}
// good
#define HS_CMP_XCHG_V3(a,b) \
{ \
int32_t const ge = a >= b; \
HS_KEY_TYPE const t = a; \
a = ge ? b : a; \
b = ge ? t : b; \
}
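//
// A quick trace of HS_CMP_XCHG_V1: tmp saves the original a, then a
// becomes min(a,b).  If a was already the smaller key, a ^ tmp == 0
// and b is left as the max; otherwise a now equals the original b, so
// b ^= a ^ tmp rewrites b to the original a, i.e. the max.  Either
// way (a,b) ends up as (min,max).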
//
//
//
#if (HS_KEY_WORDS == 1)
#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
#endif
//
// The flip/half comparisons rely on a "conditional min/max":
//
// - if the flag is true, return min(a,b)
// - otherwise, return max(a,b)
//
// What's a little surprising is that sequence (1), HS_COND_MIN_MAX_V1
// below, is faster than sequence (2), HS_COND_MIN_MAX_V0, for 32-bit
// keys.
//
// I suspect either a code generation problem or that the sequence
// maps well to the GEN instruction set.
//
// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
// fastest for this wider type.
//
// this is what you would normally use
#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a
// this seems to be faster for 32-bit keys
#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
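//
// Worked example of HS_COND_MIN_MAX_V0 with a == 7, b == 4:
//
//   lt == 1 : ((7 <= 4) ^ 1) ? 4 : 7  ->  true  -> 4 == min(a,b)
//   lt == 0 : ((7 <= 4) ^ 0) ? 4 : 7  ->  false -> 7 == max(a,b)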
//
//
//
#if (HS_KEY_WORDS == 1)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
#endif
//
// HotSort shuffles are always warp-wide
//
#define HS_SHFL_ALL 0xFFFFFFFF
//
// Conditional inter-subgroup flip/half compare exchange
//
#define HS_CMP_FLIP(i,a,b) \
{ \
HS_KEY_TYPE const ta = __shfl_sync(HS_SHFL_ALL,a,flip_lane_idx); \
HS_KEY_TYPE const tb = __shfl_sync(HS_SHFL_ALL,b,flip_lane_idx); \
a = HS_COND_MIN_MAX(t_lt,a,tb); \
b = HS_COND_MIN_MAX(t_lt,b,ta); \
}
#define HS_CMP_HALF(i,a) \
{ \
HS_KEY_TYPE const ta = __shfl_sync(HS_SHFL_ALL,a,half_lane_idx); \
a = HS_COND_MIN_MAX(t_lt,a,ta); \
}
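//
// For example, after HS_SLAB_HALF_PREAMBLE(1), HS_CMP_HALF() makes
// lanes 0 and 1 exchange their 'a' registers via __shfl_sync(); the
// lower lane of the pair (t_lt true) keeps the min and the upper lane
// keeps the max.  Repeating this per register row over the remaining
// masks forms the warp-wide merging network.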
//
// The device's comparison operator might return what we actually
// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
//
#define HS_CMP_IS_ZERO_ONE
#ifdef HS_CMP_IS_ZERO_ONE
// CUDA C++ (like OpenCL) scalar comparisons return {true: +1, false: 0}
// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
#else
// However, OpenCL requires { -1, 0 } for vectors
// (a < b) -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
#define HS_CMP_TO_MASK(a) (a)
#endif
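//
// For example, with 64-bit keys and a == 3, b == 5, HS_LTE_TO_MASK(a,b)
// is (uint64_t)(-1) == 0xFFFFFFFFFFFFFFFF, and it is 0 when a > b;
// HS_COND_MIN_MAX_V1() uses this all-ones/all-zeros mask to blend a
// and b.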
//
// The "flip-merge" and "half-merge" preambles are very similar
//
// For now, we're only using the .y dimension for the span idx
//
#define HS_OFFSET_HM_PREAMBLE(half_span,span_offset) \
uint32_t const span_idx = span_offset + blockIdx.y; \
uint32_t const span_stride = HS_GLOBAL_SIZE_X(); \
uint32_t const span_size = span_stride * half_span * 2; \
uint32_t const span_base = span_idx * span_size; \
uint32_t const span_off = HS_GLOBAL_ID_X(); \
uint32_t const span_l = span_base + span_off
#define HS_HM_PREAMBLE(half_span) \
HS_OFFSET_HM_PREAMBLE(half_span,0)
#define HS_FM_PREAMBLE(half_span) \
HS_HM_PREAMBLE(half_span); \
uint32_t const span_r = span_base + span_stride * (half_span + 1) - span_off - 1
#define HS_OFFSET_FM_PREAMBLE(half_span) \
HS_OFFSET_HM_PREAMBLE(half_span,span_offset); \
uint32_t const span_r = span_base + span_stride * (half_span + 1) - span_off - 1
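//
// A sketch of the span indexing with assumed values of
// span_stride == HS_GLOBAL_SIZE_X() == 256 and half_span == 8, for
// span_idx == 0:
//
//   span_size = 256 * 8 * 2 = 4096             // keys in the span
//   span_base = 0
//   span_l    = HS_GLOBAL_ID_X()               // 0 .. 255
//   span_r    = 256 * 9 - HS_GLOBAL_ID_X() - 1 // 2303 .. 2048
//
// and the HS_XM_GLOBAL_L()/HS_FM_GLOBAL_R() accessors below then step
// through the span in span_stride-sized rows from those two starting
// points.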
//
//
//
#define HS_XM_GLOBAL_L(stride_idx) \
vout[span_l + span_stride * stride_idx]
#define HS_XM_GLOBAL_LOAD_L(stride_idx) \
HS_XM_GLOBAL_L(stride_idx)
#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \
HS_XM_GLOBAL_L(stride_idx) = reg
#define HS_FM_GLOBAL_R(stride_idx) \
vout[span_r + span_stride * stride_idx]
#define HS_FM_GLOBAL_LOAD_R(stride_idx) \
HS_FM_GLOBAL_R(stride_idx)
#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \
HS_FM_GLOBAL_R(stride_idx) = reg
//
// This snarl of macros is for transposing a "slab" of sorted elements
// into linear order.
//
// This can occur as the last step in hs_sort() or via a custom kernel
// that inspects the slab and then transposes and stores it to memory.
//
// The slab format can be inspected more efficiently than a linear
// arrangement.
//
// The prime example is detecting when adjacent keys (in sort order)
// have differing high order bits ("key changes"). The index of each
// change is recorded to an auxiliary array.
//
// A post-processing step like this needs to be able to navigate the
// slab and eventually transpose and store the slab in linear order.
//
#define HS_SUBGROUP_SHUFFLE_XOR(v,m) __shfl_xor_sync(HS_SHFL_ALL,v,m)
#define HS_TRANSPOSE_REG(prefix,row) prefix##row
#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)
#define HS_TRANSPOSE_PRED(level) is_lo_##level
#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \
prefix_curr##row_ll##_##row_ur
#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \
HS_KEY_TYPE const HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)
#define HS_TRANSPOSE_STAGE(level) \
bool const HS_TRANSPOSE_PRED(level) = \
(HS_LANE_ID() & (1 << (level-1))) == 0;
#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) = \
HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ? \
HS_TRANSPOSE_REG(prefix_prev,row_ll) : \
HS_TRANSPOSE_REG(prefix_prev,row_ur), \
1<<(level-1)); \
\
HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
HS_TRANSPOSE_PRED(level) ? \
HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) : \
HS_TRANSPOSE_REG(prefix_prev,row_ll); \
\
HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
HS_TRANSPOSE_PRED(level) ? \
HS_TRANSPOSE_REG(prefix_prev,row_ur) : \
HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);
#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \
HS_TRANSPOSE_REG(prefix,row_from);
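//
// As a purely mechanical sketch, one level-1 transpose stage with the
// illustrative register prefixes 'r' (previous) and 's' (current)
// expands as follows:
//
//   HS_TRANSPOSE_STAGE(1)
//     bool const is_lo_1 = (HS_LANE_ID() & (1 << 0)) == 0;
//
//   HS_TRANSPOSE_BLEND(r,s,1,1,2)
//     HS_KEY_TYPE const s1_2 = __shfl_xor_sync(HS_SHFL_ALL,
//                                              is_lo_1 ? r1 : r2, 1);
//     HS_KEY_TYPE const s1   = is_lo_1 ? s1_2 : r1;
//     HS_KEY_TYPE const s2   = is_lo_1 ? r2   : s1_2;
//
// i.e. each lane swaps one of its two registers with the lane whose
// index differs in bit 0, the predicate deciding which register is
// sent and which is kept.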
//
//
//
#endif
//
//
//