/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
#include "common/cuda/assert_cuda.h"
#include "common/macros.h"
#include "common/util.h"
#ifdef __cplusplus
}
#endif
//
// We want concurrent kernel execution to occur in a few places.
//
// The summary is:
//
//   1) If necessary, some max valued keys are written to the end of
//      the vin/vout buffers.
//
//   2) Blocks of slabs of keys are sorted.
//
//   3) If necessary, the blocks of slabs are merged until complete.
//
//   4) If requested, the slabs will be converted from slab ordering
//      to linear ordering.
//
// Below is the general "happens-before" relationship between HotSort
// compute kernels.
//
// Note the diagram assumes vin and vout are different buffers.  If
// they're not, then the first merge doesn't include the pad_vout
// event in the wait list.
//
//      +----------+            +---------+
//      | pad_vout |            | pad_vin |
//      +----+-----+            +----+----+
//           |                       |
//           |                WAITFOR(pad_vin)
//           |                       |
//           |                 +-----v-----+
//           |                 |           |
//           |            +----v----+ +----v----+
//           |            | bs_full | | bs_frac |
//           |            +----+----+ +----+----+
//           |                 |           |
//           |                 +-----v-----+
//           |                       |
//           | +------NO------JUST ONE BLOCK?
//           | /
//           |/                     YES
//           +                       |
//           |                       v
//           |       END_WITH_EVENTS(bs_full,bs_frac)
//           |
//           |
//  WAITFOR(pad_vout,bs_full,bs_frac)    >>> first iteration of loop <<<
//                  |
//                  |
//                  +-----------<------------+
//                  |                        |
//            +-----v-----+                  |
//            |           |                  |
//       +----v----+ +----v----+             |
//       | fm_full | | fm_frac |             |
//       +----+----+ +----+----+             |
//            |           |                  ^
//            +-----v-----+                  |
//                  |                        |
//      WAITFOR(fm_full,fm_frac)             |
//                  |                        |
//                  v                        |
//               +--v--+                WAITFOR(bc)
//               | hm  |                     |
//               +-----+                     |
//                  |                        |
//             WAITFOR(hm)                   |
//                  |                        ^
//               +--v--+                     |
//               | bc  |                     |
//               +-----+                     |
//                  |                        |
//                  v                        |
//          MERGING COMPLETE?-------NO-------+
//                  |
//                 YES
//                  |
//                  v
//           END_WITH_EVENTS(bc)
//
//
// NOTE: CUDA streams are in-order so a dependency isn't required for
// kernels launched on the same stream.
//
// Scheduling these kernels is actually a more subtle problem than it
// appears.
//
// Rather than reason from the diagram, we'll declare the
// "happens-before" kernel relationships directly:
//
//   concurrent (pad_vin,pad_vout) -> (pad_vin)  happens_before (bs_full,bs_frac)
//                                    (pad_vout) happens_before (fm_full,fm_frac)
//
//   concurrent (bs_full,bs_frac)  -> (bs_full)  happens_before (fm_full,fm_frac)
//                                    (bs_frac)  happens_before (fm_full,fm_frac)
//
//   concurrent (fm_full,fm_frac)  -> (fm_full)  happens_before (hm or bc)
//                                    (fm_frac)  happens_before (hm or bc)
//
//   launch     (hm)               -> (hm)       happens_before (hm)
//                                    (hm)       happens_before (bc)
//
//   launch     (bc)               -> (bc)       happens_before (fm_full,fm_frac)
//
//
// We could permanently map kernel launches to our 3 streams but, as
// an optimization, we instead dynamically assign each kernel to the
// lowest available stream.  This transforms the problem into one that
// considers streams happening before streams -- which kernels are
// involved doesn't matter.
//
//   STREAM0   STREAM1   STREAM2
//   -------   -------   -------
//
//   pad_vin   pad_vout            (pad_vin)  happens_before (bs_full,bs_frac)
//                                 (pad_vout) happens_before (fm_full,fm_frac)
//
//   bs_full   bs_frac             (bs_full)  happens_before (fm_full,fm_frac)
//                                 (bs_frac)  happens_before (fm_full,fm_frac)
//
//   fm_full   fm_frac             (fm_full)  happens_before (hm or bc)
//                                 (fm_frac)  happens_before (hm or bc)
//
//   hm                            (hm)       happens_before (hm or bc)
//
//   bc                            (bc)       happens_before (fm_full,fm_frac)
//
// A single final kernel will always complete on stream 0.
//
// This simplifies reasoning about concurrency that's downstream of
// hs_cuda_sort().
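//
// For example (a hypothetical trace): with pool = 0b111 all three
// streams are free.  A stage needing two streams acquires streams 0
// and 1, leaving pool = 0b100.  A later stage that happens after
// those streams merges their bits back into the pool before acquiring
// the lowest available index for its own launch.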
//
typedef void (*hs_kernel_offset_bs_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout,
                                        HS_KEY_TYPE const * const HS_RESTRICT vin,
                                        uint32_t const slab_offset);
static hs_kernel_offset_bs_pfn const hs_kernels_offset_bs[]
{
#if HS_BS_SLABS_LOG2_RU >= 1
  hs_kernel_bs_0,
#endif
#if HS_BS_SLABS_LOG2_RU >= 2
  hs_kernel_bs_1,
#endif
#if HS_BS_SLABS_LOG2_RU >= 3
  hs_kernel_bs_2,
#endif
#if HS_BS_SLABS_LOG2_RU >= 4
  hs_kernel_bs_3,
#endif
#if HS_BS_SLABS_LOG2_RU >= 5
  hs_kernel_bs_4,
#endif
#if HS_BS_SLABS_LOG2_RU >= 6
  hs_kernel_bs_5,
#endif
#if HS_BS_SLABS_LOG2_RU >= 7
  hs_kernel_bs_6,
#endif
#if HS_BS_SLABS_LOG2_RU >= 8
  hs_kernel_bs_7,
#endif
};
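//
// The offset tables are indexed by the log2 of a power-of-two
// fractional slab count -- e.g. a frac_bs of 4 selects hs_kernel_bs_2
// via msb_idx_u32(4) = 2.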
//
//
//
typedef void (*hs_kernel_bc_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_bc_pfn const hs_kernels_bc[]
{
  hs_kernel_bc_0,
#if HS_BC_SLABS_LOG2_MAX >= 1
  hs_kernel_bc_1,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 2
  hs_kernel_bc_2,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 3
  hs_kernel_bc_3,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 4
  hs_kernel_bc_4,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 5
  hs_kernel_bc_5,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 6
  hs_kernel_bc_6,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 7
  hs_kernel_bc_7,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 8
  hs_kernel_bc_8,
#endif
};
//
//
//
typedef void (*hs_kernel_hm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_hm_pfn const hs_kernels_hm[]
{
#if (HS_HM_SCALE_MIN == 0)
  hs_kernel_hm_0,
#endif
#if (HS_HM_SCALE_MIN <= 1) && (1 <= HS_HM_SCALE_MAX)
  hs_kernel_hm_1,
#endif
#if (HS_HM_SCALE_MIN <= 2) && (2 <= HS_HM_SCALE_MAX)
  hs_kernel_hm_2,
#endif
};
//
//
//
typedef void (*hs_kernel_fm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_fm_pfn const hs_kernels_fm[]
{
#if (HS_FM_SCALE_MIN == 0)
#if (HS_BS_SLABS_LOG2_RU == 1)
  hs_kernel_fm_0_0,
#endif
#if (HS_BS_SLABS_LOG2_RU == 2)
  hs_kernel_fm_0_1,
#endif
#if (HS_BS_SLABS_LOG2_RU == 3)
  hs_kernel_fm_0_2,
#endif
#if (HS_BS_SLABS_LOG2_RU == 4)
  hs_kernel_fm_0_3,
#endif
#if (HS_BS_SLABS_LOG2_RU == 5)
  hs_kernel_fm_0_4,
#endif
#if (HS_BS_SLABS_LOG2_RU == 6)
  hs_kernel_fm_0_5,
#endif
#if (HS_BS_SLABS_LOG2_RU == 7)
  hs_kernel_fm_0_6,
#endif
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
  CONCAT_MACRO(hs_kernel_fm_1_,HS_BS_SLABS_LOG2_RU),
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
#if (HS_BS_SLABS_LOG2_RU == 1)
  hs_kernel_fm_2_2,
#endif
#if (HS_BS_SLABS_LOG2_RU == 2)
  hs_kernel_fm_2_3,
#endif
#if (HS_BS_SLABS_LOG2_RU == 3)
  hs_kernel_fm_2_4,
#endif
#if (HS_BS_SLABS_LOG2_RU == 4)
  hs_kernel_fm_2_5,
#endif
#if (HS_BS_SLABS_LOG2_RU == 5)
  hs_kernel_fm_2_6,
#endif
#if (HS_BS_SLABS_LOG2_RU == 6)
  hs_kernel_fm_2_7,
#endif
#if (HS_BS_SLABS_LOG2_RU == 7)
  hs_kernel_fm_2_8,
#endif
#endif
};
//
//
//
typedef void (*hs_kernel_offset_fm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout,
                                        uint32_t const span_offset);
#if (HS_FM_SCALE_MIN == 0)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_0[]
{
#if (HS_BS_SLABS_LOG2_RU >= 2)
  hs_kernel_fm_0_0,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
  hs_kernel_fm_0_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
  hs_kernel_fm_0_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
  hs_kernel_fm_0_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
  hs_kernel_fm_0_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
  hs_kernel_fm_0_5,
#endif
};
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_1[]
{
#if (HS_BS_SLABS_LOG2_RU >= 1)
  hs_kernel_fm_1_0,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 2)
  hs_kernel_fm_1_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
  hs_kernel_fm_1_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
  hs_kernel_fm_1_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
  hs_kernel_fm_1_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
  hs_kernel_fm_1_5,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
  hs_kernel_fm_1_6,
#endif
};
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_2[]
{
  hs_kernel_fm_2_0,
#if (HS_BS_SLABS_LOG2_RU >= 1)
  hs_kernel_fm_2_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 2)
  hs_kernel_fm_2_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
  hs_kernel_fm_2_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
  hs_kernel_fm_2_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
  hs_kernel_fm_2_5,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
  hs_kernel_fm_2_6,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
  hs_kernel_fm_2_7,
#endif
};
#endif
static hs_kernel_offset_fm_pfn const * const hs_kernels_offset_fm[]
{
#if (HS_FM_SCALE_MIN == 0)
  hs_kernels_offset_fm_0,
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
  hs_kernels_offset_fm_1,
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
  hs_kernels_offset_fm_2,
#endif
};
//
//
//
typedef uint32_t hs_indices_t;
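//
// hs_indices_t is a bitmask over the 3 streams in hs_state: bit i set
// means stream i is a member of the set -- free when the mask is the
// pool, or a dependency when the mask describes a stage's streams.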
//
//
//
struct hs_state
{
  // key buffers
  HS_KEY_TYPE * vin;
  HS_KEY_TYPE * vout; // can be vin
  cudaStream_t streams[3];
  // pool of stream indices
  hs_indices_t pool;
  // number of slabs in vin, rounded up
  uint32_t bx_ru;
};
//
//
//
static
uint32_t
hs_indices_acquire(hs_indices_t * const indices)
{
  //
  // FIXME -- an FFS intrinsic might be faster but there are so few
  // bits in this implementation that it might not matter.
  //
  if (*indices & 1)
    {
      *indices = *indices & ~1;
      return 0;
    }
  else if (*indices & 2)
    {
      *indices = *indices & ~2;
      return 1;
    }
  else // if (*indices & 4)
    {
      *indices = *indices & ~4;
      return 2;
    }
}
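//
// A minimal FFS-based sketch of the above, assuming a GCC/Clang-style
// toolchain where __builtin_ctz() is available:
//
//   uint32_t const idx = __builtin_ctz(*indices); // index of lowest set bit
//   *indices          &= *indices - 1;            // clear the lowest set bit
//   return idx;
//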
static
uint32_t
hs_state_acquire(struct hs_state * const state,
                 hs_indices_t    * const indices)
{
  //
  // FIXME -- an FFS intrinsic might be faster but there are so few
  // bits in this implementation that it might not matter.
  //
  if (state->pool & 1)
    {
      state->pool &= ~1;
      *indices    |= 1;
      return 0;
    }
  else if (state->pool & 2)
    {
      state->pool &= ~2;
      *indices    |= 2;
      return 1;
    }
  else // (state->pool & 4)
    {
      state->pool &= ~4;
      *indices    |= 4;
      return 2;
    }
}
static
void
hs_indices_merge(hs_indices_t * const to, hs_indices_t const from)
{
  *to |= from;
}
static
void
hs_barrier_enqueue(cudaStream_t to, cudaStream_t from)
{
  cudaEvent_t event_before;
  cuda(EventCreate(&event_before));
  cuda(EventRecord(event_before,from));
  cuda(StreamWaitEvent(to,event_before,0));
  cuda(EventDestroy(event_before));
}
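//
// Note that destroying the event immediately is safe: cudaEventDestroy()
// does not block on an incomplete event and the event's resources are
// released asynchronously once it completes.
//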
static
hs_indices_t
hs_barrier(struct hs_state * const state,
           hs_indices_t      const before,
           hs_indices_t    * const after,
           uint32_t          const count) // count is 1 or 2
{
  // return streams this stage depends on back into the pool
  hs_indices_merge(&state->pool,before);
  hs_indices_t indices = 0;
  // acquire 'count' stream indices for this stage
  for (uint32_t ii=0; ii<count; ii++)
    {
      hs_indices_t new_indices = 0;
      // new index
      uint32_t const idx = hs_state_acquire(state,&new_indices);
      // add the new index to the indices
      indices |= new_indices;
      // only enqueue barriers when streams are different
      uint32_t const wait = before & ~new_indices;
      if (wait != 0)
        {
          cudaStream_t to = state->streams[idx];
          //
          // FIXME -- an FFS loop might be slower for so few bits.  So
          // leave it as is for now.
          //
          if (wait & 1)
            hs_barrier_enqueue(to,state->streams[0]);
          if (wait & 2)
            hs_barrier_enqueue(to,state->streams[1]);
          if (wait & 4)
            hs_barrier_enqueue(to,state->streams[2]);
        }
    }
  hs_indices_merge(after,indices);
  return indices;
}
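//
// A hypothetical trace: if the prior stage ran on streams {0,1}
// (before = 0b011) and this stage needs one stream, both bits are
// returned to the pool, stream 0 is reacquired (new_indices = 0b001),
// and wait = 0b011 & ~0b001 = 0b010 enqueues a single barrier that
// makes stream 0 wait on stream 1.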
//
//
//
#ifndef NDEBUG
#include <stdio.h>
#define HS_STREAM_SYNCHRONIZE(s)      \
  do {                                \
    cuda(StreamSynchronize(s));       \
    fprintf(stderr,"%s\n",__func__);  \
  } while (0)
#else
#define HS_STREAM_SYNCHRONIZE(s)
#endif
//
//
//
static
void
hs_transpose(struct hs_state * const state)
{
  HS_TRANSPOSE_KERNEL_NAME()
    <<<state->bx_ru,HS_SLAB_THREADS,0,state->streams[0]>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(state->streams[0]);
}
//
//
//
static
void
hs_bc(struct hs_state * const state,
      hs_indices_t      const hs_bc,
      hs_indices_t    * const fm,
      uint32_t          const down_slabs,
      uint32_t          const clean_slabs_log2)
{
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,hs_bc,fm,1);
  // block-clean the minimal number of spans of 2^clean_slabs_log2 slabs
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const full    = (down_slabs + frac_ru) >> clean_slabs_log2;
  uint32_t const threads = HS_SLAB_THREADS << clean_slabs_log2;
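  //
  // e.g. with down_slabs = 5 and clean_slabs_log2 = 2: full =
  // (5 + 3) >> 2 = 2 blocks, each HS_SLAB_THREADS * 4 threads wide
  //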
  // stream will *always* be stream[0]
  cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
  hs_kernels_bc[clean_slabs_log2]
    <<<full,threads,0,stream>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(stream);
}
//
//
//
static
uint32_t
hs_hm(struct hs_state * const state,
      hs_indices_t      const hs_bc,
      hs_indices_t    * const hs_bc_tmp,
      uint32_t          const down_slabs,
      uint32_t          const clean_slabs_log2)
{
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,hs_bc,hs_bc_tmp,1);
  // how many scaled half-merge spans are there?
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const spans   = (down_slabs + frac_ru) >> clean_slabs_log2;
  // for now, just clamp to the max
  uint32_t const log2_rem   = clean_slabs_log2 - HS_BC_SLABS_LOG2_MAX;
  uint32_t const scale_log2 = MIN_MACRO(HS_HM_SCALE_MAX,log2_rem);
  uint32_t const log2_out   = log2_rem - scale_log2;
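  //
  // e.g. assuming HS_HM_SCALE_MAX = 2 with clean_slabs_log2 =
  // HS_BC_SLABS_LOG2_MAX + 3: log2_rem = 3, scale_log2 = 2 and
  // log2_out = 1
  //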
  //
  // Size the grid
  //
  // The simplifying choices below limit the maximum keys that can be
  // sorted with this grid scheme to around ~2B.
  //
  //   .x : slab height << clean_log2 -- this is the slab span
  //   .y : [1...65535]               -- this is the slab index
  //   .z : ( this could also be used to further expand .y )
  //
  // Note that OpenCL declares a grid in terms of global threads and
  // not grids and blocks
  //
  dim3 grid;
  grid.x = (HS_SLAB_HEIGHT / HS_HM_BLOCK_HEIGHT) << log2_out;
  grid.y = spans;
  grid.z = 1;
  cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
  hs_kernels_hm[scale_log2-HS_HM_SCALE_MIN]
    <<<grid,HS_SLAB_THREADS * HS_HM_BLOCK_HEIGHT,0,stream>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(stream);
  return log2_out;
}
//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//
static
uint32_t
hs_fm(struct hs_state * const state,
      hs_indices_t      const fm,
      hs_indices_t    * const hs_bc,
      uint32_t        * const down_slabs,
      uint32_t          const up_scale_log2)
{
  //
  // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
  // a performance win to bias toward launching the smaller flip merge
  // kernel in order to get more warps in flight (increased
  // occupancy).  This is useful when merging small numbers of slabs.
  //
  // Note that HS_FM_SCALE_MIN will always be 0 or 1.
  //
  // So, for now, just clamp to the max until there is a reason to
  // restore the fancier and probably low-impact approach.
  //
  uint32_t const scale_log2 = MIN_MACRO(HS_FM_SCALE_MAX,up_scale_log2);
  uint32_t const clean_log2 = up_scale_log2 - scale_log2;
  // number of slabs in a full-sized scaled flip-merge span
  uint32_t const full_span_slabs = HS_BS_SLABS << up_scale_log2;
  // how many full-sized scaled flip-merge spans are there?
  uint32_t full_fm = state->bx_ru / full_span_slabs;
  uint32_t frac_fm = 0;
  // initialize down_slabs
  *down_slabs = full_fm * full_span_slabs;
  // how many half-size scaled + fractional scaled spans are there?
  uint32_t const span_rem        = state->bx_ru - *down_slabs;
  uint32_t const half_span_slabs = full_span_slabs >> 1;
  // if we have over a half-span then fractionally merge it
  if (span_rem > half_span_slabs)
    {
      // the remaining slabs will be cleaned
      *down_slabs += span_rem;
      uint32_t const frac_rem      = span_rem - half_span_slabs;
      uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);
      if (frac_rem_pow2 >= half_span_slabs)
        {
          // bump it up to a full span
          full_fm += 1;
        }
      else
        {
          // otherwise, add fractional
          frac_fm = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
        }
    }
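  //
  // e.g. with bx_ru = 14, HS_BS_SLABS = 4, up_scale_log2 = 1 and
  // clean_log2 = 0: full_span_slabs = 8 so full_fm = 1, and span_rem =
  // 6 exceeds the half span of 4 -- down_slabs becomes 14, frac_rem =
  // 2 and frac_fm = 2
  //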
  // enqueue any necessary barriers
  bool const both = (full_fm != 0) && (frac_fm != 0);
  hs_indices_t indices = hs_barrier(state,fm,hs_bc,both ? 2 : 1);
  //
  // Size the grid
  //
  // The simplifying choices below limit the maximum keys that can be
  // sorted with this grid scheme to around ~2B.
  //
  //   .x : slab height << clean_log2 -- this is the slab span
  //   .y : [1...65535]               -- this is the slab index
  //   .z : ( this could also be used to further expand .y )
  //
  // Note that OpenCL declares a grid in terms of global threads and
  // not grids and blocks
  //
  dim3 grid;
  grid.x = (HS_SLAB_HEIGHT / HS_FM_BLOCK_HEIGHT) << clean_log2;
  grid.z = 1;
  if (full_fm > 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      grid.y = full_fm;
      hs_kernels_fm[scale_log2-HS_FM_SCALE_MIN]
        <<<grid,HS_SLAB_THREADS * HS_FM_BLOCK_HEIGHT,0,stream>>>
        (state->vout);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  if (frac_fm > 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      grid.y = 1;
      hs_kernels_offset_fm[scale_log2-HS_FM_SCALE_MIN][msb_idx_u32(frac_fm)]
        <<<grid,HS_SLAB_THREADS * HS_FM_BLOCK_HEIGHT,0,stream>>>
        (state->vout,full_fm);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  return clean_log2;
}
//
//
//
static
void
hs_bs(struct hs_state * const state,
      hs_indices_t      const bs,
      hs_indices_t    * const fm,
      uint32_t          const count_padded_in)
{
  uint32_t const slabs_in = count_padded_in / HS_SLAB_KEYS;
  uint32_t const full_bs  = slabs_in / HS_BS_SLABS;
  uint32_t const frac_bs  = slabs_in - full_bs * HS_BS_SLABS;
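  //
  // e.g. with slabs_in = 10 and HS_BS_SLABS = 4: full_bs = 2 and
  // frac_bs = 2 -- hs_cuda_pad() guarantees the fractional remainder
  // is a power of two, which the offset table indexing assumes
  //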
  bool const both = (full_bs != 0) && (frac_bs != 0);
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,bs,fm,both ? 2 : 1);
  if (full_bs != 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      CONCAT_MACRO(hs_kernel_bs_,HS_BS_SLABS_LOG2_RU)
        <<<full_bs,HS_BS_SLABS*HS_SLAB_THREADS,0,stream>>>
        (state->vout,state->vin);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  if (frac_bs != 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      hs_kernels_offset_bs[msb_idx_u32(frac_bs)]
        <<<1,frac_bs*HS_SLAB_THREADS,0,stream>>>
        (state->vout,state->vin,full_bs*HS_BS_SLABS*HS_SLAB_THREADS);
      HS_STREAM_SYNCHRONIZE(stream);
    }
}
//
//
//
static
void
hs_keyset_pre_merge(struct hs_state * const state,
                    hs_indices_t    * const fm,
                    uint32_t          const count_lo,
                    uint32_t          const count_hi)
{
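  // filling the pad with 0xFF bytes produces max-valued keys for the
  // unsigned integer key types HotSort supports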
  uint32_t const vout_span = count_hi - count_lo;
  cudaStream_t stream = state->streams[hs_state_acquire(state,fm)];
  cuda(MemsetAsync(state->vout + count_lo,
                   0xFF,
                   vout_span * sizeof(HS_KEY_TYPE),
                   stream));
}
//
//
//
static
void
hs_keyset_pre_sort(struct hs_state * const state,
                   hs_indices_t    * const bs,
                   uint32_t          const count,
                   uint32_t          const count_hi)
{
  uint32_t const vin_span = count_hi - count;
  cudaStream_t stream = state->streams[hs_state_acquire(state,bs)];
  cuda(MemsetAsync(state->vin + count,
                   0xFF,
                   vin_span * sizeof(HS_KEY_TYPE),
                   stream));
}
//
//
//
void
CONCAT_MACRO(hs_cuda_sort_,HS_KEY_TYPE_PRETTY)
  (HS_KEY_TYPE * const vin,
   HS_KEY_TYPE * const vout,
   uint32_t      const count,
   uint32_t      const count_padded_in,
   uint32_t      const count_padded_out,
   bool          const linearize,
   cudaStream_t stream0, // primary stream
   cudaStream_t stream1, // auxiliary
   cudaStream_t stream2) // auxiliary
{
  // is this sort in place?
  bool const is_in_place = (vout == NULL);
  // streams, buffers and slab count
  struct hs_state state;
  state.vin        = vin;
  state.vout       = is_in_place ? vin : vout;
  state.streams[0] = stream0;
  state.streams[1] = stream1;
  state.streams[2] = stream2;
  state.pool       = 0x7; // 3 bits
  state.bx_ru      = (count + HS_SLAB_KEYS - 1) / HS_SLAB_KEYS;
  // initialize vin
  uint32_t const count_hi                 = is_in_place ? count_padded_out : count_padded_in;
  bool     const is_pre_sort_keyset_reqd  = count_hi > count;
  bool     const is_pre_merge_keyset_reqd = !is_in_place && (count_padded_out > count_padded_in);
  hs_indices_t bs = 0;
  // initialize any trailing keys in vin before sorting
  if (is_pre_sort_keyset_reqd)
    hs_keyset_pre_sort(&state,&bs,count,count_hi);
  hs_indices_t fm = 0;
  // concurrently initialize any trailing keys in vout before merging
  if (is_pre_merge_keyset_reqd)
    hs_keyset_pre_merge(&state,&fm,count_padded_in,count_padded_out);
  // immediately sort blocks of slabs
  hs_bs(&state,bs,&fm,count_padded_in);
  //
  // we're done if this was a single bs block...
  //
  // otherwise, merge sorted spans of slabs until done
  //
  if (state.bx_ru > HS_BS_SLABS)
    {
      int32_t up_scale_log2 = 1;
      while (true)
        {
          hs_indices_t hm_or_bc = 0;
          uint32_t     down_slabs;
          // flip merge slabs -- return span of slabs that must be cleaned
          uint32_t clean_slabs_log2 = hs_fm(&state,
                                            fm,
                                            &hm_or_bc,
                                            &down_slabs,
                                            up_scale_log2);
          // if the span is larger than the largest block cleaner then half-merge
          while (clean_slabs_log2 > HS_BC_SLABS_LOG2_MAX)
            {
              hs_indices_t hm_or_bc_tmp;
              clean_slabs_log2 = hs_hm(&state,
                                       hm_or_bc,
                                       &hm_or_bc_tmp,
                                       down_slabs,
                                       clean_slabs_log2);
              hm_or_bc = hm_or_bc_tmp;
            }
          // reset fm
          fm = 0;
          // launch clean slab grid -- is it the final launch?
          hs_bc(&state,
                hm_or_bc,
                &fm,
                down_slabs,
                clean_slabs_log2);
          // was this the final block clean?
          if (((uint32_t)HS_BS_SLABS << up_scale_log2) >= state.bx_ru)
            break;
          // otherwise, merge twice as many slabs
          up_scale_log2 += 1;
        }
    }
  // slabs or linear?
  if (linearize) {
    // guaranteed to be on stream0
    hs_transpose(&state);
  }
}
//
// all grids will be computed as a function of the minimum number of slabs
//
void
CONCAT_MACRO(hs_cuda_pad_,HS_KEY_TYPE_PRETTY)
  (uint32_t   const count,
   uint32_t * const count_padded_in,
   uint32_t * const count_padded_out)
{
  //
  // round up the count to slabs
  //
  uint32_t const slabs_ru        = (count + HS_SLAB_KEYS - 1) / HS_SLAB_KEYS;
  uint32_t const blocks          = slabs_ru / HS_BS_SLABS;
  uint32_t const block_slabs     = blocks * HS_BS_SLABS;
  uint32_t const slabs_ru_rem    = slabs_ru - block_slabs;
  uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),HS_BS_SLABS);
  *count_padded_in  = (block_slabs + slabs_ru_rem_ru) * HS_SLAB_KEYS;
  *count_padded_out = *count_padded_in;
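  //
  // e.g. assuming HS_SLAB_KEYS = 256 and HS_BS_SLABS = 16, a count of
  // 10000 gives slabs_ru = 40, blocks = 2, block_slabs = 32,
  // slabs_ru_rem_ru = 8 and count_padded_in = 40 * 256 = 10240
  //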
  //
  // will merging be required?
  //
  if (slabs_ru > HS_BS_SLABS)
    {
      // more than one block
      uint32_t const blocks_lo       = pow2_rd_u32(blocks);
      uint32_t const block_slabs_lo  = blocks_lo * HS_BS_SLABS;
      uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;
      if (block_slabs_rem > 0)
        {
          uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem);
          uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru,
                                                    blocks_lo << (1 - HS_FM_SCALE_MIN));
          uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi,
                                                            block_slabs_lo*2); // clamp non-pow2 blocks
          *count_padded_out = block_slabs_padded_out * HS_SLAB_KEYS;
        }
    }
}
//
//
//
void
CONCAT_MACRO(hs_cuda_info_,HS_KEY_TYPE_PRETTY)
  (uint32_t * const key_words,
   uint32_t * const val_words,
   uint32_t * const slab_height,
   uint32_t * const slab_width_log2)
{
  *key_words       = HS_KEY_WORDS;
  *val_words       = HS_VAL_WORDS;
  *slab_height     = HS_SLAB_HEIGHT;
  *slab_width_log2 = HS_SLAB_WIDTH_LOG2;
}
//
//
//