/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
#include "common/cuda/assert_cuda.h"
#include "common/macros.h"
#include "common/util.h"
#ifdef __cplusplus
}
#endif
//
// We want concurrent kernel execution to occur in a few places.
//
// The summary is:
//
// 1) If necessary, some max-valued keys are written to the end of
// the vin/vout buffers.
//
// 2) Blocks of slabs of keys are sorted.
//
// 3) If necessary, the blocks of slabs are merged until complete.
//
// 4) If requested, the slabs will be converted from slab ordering
// to linear ordering.
//
// Below is the general "happens-before" relationship between HotSort
// compute kernels.
//
// Note the diagram assumes vin and vout are different buffers. If
// they're not, then the first merge doesn't include the pad_vout
// event in the wait list.
//
//        +----------+         +---------+
//        | pad_vout |         | pad_vin |
//        +----+-----+         +----+----+
//             |                    |
//             |            WAITFOR(pad_vin)
//             |                    |
//             |              +-----v-----+
//             |              |           |
//             |         +----v----+ +----v----+
//             |         | bs_full | | bs_frac |
//             |         +----+----+ +----+----+
//             |              |           |
//             |              +-----v-----+
//             |                    |
//             |    +------NO------JUST ONE BLOCK?
//             |  /                 |
//             |/                  YES
//             +                    |
//             |                    v
//             |    END_WITH_EVENTS(bs_full,bs_frac)
//             |
//             |
//   WAITFOR(pad_vout,bs_full,bs_frac)      >>> first iteration of loop <<<
//             |
//             |
//             +-----------<------------+
//             |                        |
//       +-----v-----+                  |
//       |           |                  |
//  +----v----+ +----v----+             |
//  | fm_full | | fm_frac |             |
//  +----+----+ +----+----+             |
//       |           |                  ^
//       +-----v-----+                  |
//             |                        |
//  WAITFOR(fm_full,fm_frac)            |
//             |                        |
//             v                        |
//          +--v--+                WAITFOR(bc)
//          | hm  |                     |
//          +-----+                     |
//             |                        |
//        WAITFOR(hm)                   |
//             |                        ^
//          +--v--+                     |
//          | bc  |                     |
//          +-----+                     |
//             |                        |
//             v                        |
//     MERGING COMPLETE?-------NO-------+
//             |
//            YES
//             |
//             v
//    END_WITH_EVENTS(bc)
//
//
// NOTE: CUDA streams are in-order so a dependency isn't required for
// kernels launched on the same stream.
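//
// For reference, a cross-stream dependency is expressed with an event
// recorded on the producing stream and waited on by the consuming
// stream -- this is exactly what hs_barrier_enqueue() below does.  A
// minimal sketch of the pattern (stream names are illustrative only):
//
//   cudaEvent_t event;
//   cuda(EventCreate(&event));
//   cuda(EventRecord(event,producer_stream));       // capture work enqueued so far
//   cuda(StreamWaitEvent(consumer_stream,event,0)); // consumer waits for producer
//   cuda(EventDestroy(event));                      // safe once the wait is enqueued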
//
// This is actually a more subtle problem than it appears.
//
// We'll take a different approach and declare the "happens before"
// kernel relationships:
//
//   concurrent (pad_vin,pad_vout) -> (pad_vin)  happens_before (bs_full,bs_frac)
//                                    (pad_vout) happens_before (fm_full,fm_frac)
//
//   concurrent (bs_full,bs_frac)  -> (bs_full)  happens_before (fm_full,fm_frac)
//                                    (bs_frac)  happens_before (fm_full,fm_frac)
//
//   concurrent (fm_full,fm_frac)  -> (fm_full)  happens_before (hm)
//                                    (fm_frac)  happens_before (hm)
//
//   launch     (hm)               -> (hm)       happens_before (hm)
//                                    (hm)       happens_before (bc)
//
//   launch     (bc)               -> (bc)       happens_before (fm_full,fm_frac)
//
//
// We can go ahead and permanently map kernel launches to our 3
// streams. As an optimization, we'll dynamically assign each kernel
// to the lowest available stream. This transforms the problem into
// one that considers streams happening before streams -- which
// kernels are involved doesn't matter.
//
//   STREAM0    STREAM1    STREAM2
//   -------    -------    -------
//
//   pad_vin    pad_vout              (pad_vin)  happens_before (bs_full,bs_frac)
//                                    (pad_vout) happens_before (fm_full,fm_frac)
//
//   bs_full    bs_frac               (bs_full)  happens_before (fm_full,fm_frac)
//                                    (bs_frac)  happens_before (fm_full,fm_frac)
//
//   fm_full    fm_frac               (fm_full)  happens_before (hm or bc)
//                                    (fm_frac)  happens_before (hm or bc)
//
//   hm                               (hm)       happens_before (hm or bc)
//
//   bc                               (bc)       happens_before (fm_full,fm_frac)
//
// A single final kernel will always complete on stream 0.
//
// This simplifies reasoning about concurrency that's downstream of
// hs_cuda_sort().
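//
// For example, once the sort has been enqueued, a consumer only needs
// to be ordered against stream 0 -- a sketch with a hypothetical
// consumer kernel and illustrative names:
//
//   consume_sorted_keys<<<grid,block,0,stream0>>>(keys); // in-order behind the final kernel
//
// or, to hand the result off to a different stream, record an event on
// stream 0 and wait on it there (see hs_barrier_enqueue() below).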
//
typedef void (*hs_kernel_offset_bs_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout,
                                        HS_KEY_TYPE const * const HS_RESTRICT vin,
                                        uint32_t const slab_offset);
static hs_kernel_offset_bs_pfn const hs_kernels_offset_bs[]
{
#if HS_BS_SLABS_LOG2_RU >= 1
hs_kernel_bs_0,
#endif
#if HS_BS_SLABS_LOG2_RU >= 2
hs_kernel_bs_1,
#endif
#if HS_BS_SLABS_LOG2_RU >= 3
hs_kernel_bs_2,
#endif
#if HS_BS_SLABS_LOG2_RU >= 4
hs_kernel_bs_3,
#endif
#if HS_BS_SLABS_LOG2_RU >= 5
hs_kernel_bs_4,
#endif
#if HS_BS_SLABS_LOG2_RU >= 6
hs_kernel_bs_5,
#endif
#if HS_BS_SLABS_LOG2_RU >= 7
hs_kernel_bs_6,
#endif
#if HS_BS_SLABS_LOG2_RU >= 8
hs_kernel_bs_7,
#endif
};
//
//
//
typedef void (*hs_kernel_bc_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_bc_pfn const hs_kernels_bc[]
{
hs_kernel_bc_0,
#if HS_BC_SLABS_LOG2_MAX >= 1
hs_kernel_bc_1,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 2
hs_kernel_bc_2,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 3
hs_kernel_bc_3,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 4
hs_kernel_bc_4,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 5
hs_kernel_bc_5,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 6
hs_kernel_bc_6,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 7
hs_kernel_bc_7,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 8
hs_kernel_bc_8,
#endif
};
//
//
//
typedef void (*hs_kernel_hm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_hm_pfn const hs_kernels_hm[]
{
#if (HS_HM_SCALE_MIN == 0)
hs_kernel_hm_0,
#endif
#if (HS_HM_SCALE_MIN <= 1) && (1 <= HS_HM_SCALE_MAX)
hs_kernel_hm_1,
#endif
#if (HS_HM_SCALE_MIN <= 2) && (2 <= HS_HM_SCALE_MAX)
hs_kernel_hm_2,
#endif
};
//
//
//
typedef void (*hs_kernel_fm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_fm_pfn const hs_kernels_fm[]
{
#if (HS_FM_SCALE_MIN == 0)
#if (HS_BS_SLABS_LOG2_RU == 1)
hs_kernel_fm_0_0,
#endif
#if (HS_BS_SLABS_LOG2_RU == 2)
hs_kernel_fm_0_1,
#endif
#if (HS_BS_SLABS_LOG2_RU == 3)
hs_kernel_fm_0_2,
#endif
#if (HS_BS_SLABS_LOG2_RU == 4)
hs_kernel_fm_0_3,
#endif
#if (HS_BS_SLABS_LOG2_RU == 5)
hs_kernel_fm_0_4,
#endif
#if (HS_BS_SLABS_LOG2_RU == 6)
hs_kernel_fm_0_5,
#endif
#if (HS_BS_SLABS_LOG2_RU == 7)
hs_kernel_fm_0_6,
#endif
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
CONCAT_MACRO(hs_kernel_fm_1_,HS_BS_SLABS_LOG2_RU),
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
#if (HS_BS_SLABS_LOG2_RU == 1)
hs_kernel_fm_2_2,
#endif
#if (HS_BS_SLABS_LOG2_RU == 2)
hs_kernel_fm_2_3,
#endif
#if (HS_BS_SLABS_LOG2_RU == 3)
hs_kernel_fm_2_4,
#endif
#if (HS_BS_SLABS_LOG2_RU == 4)
hs_kernel_fm_2_5,
#endif
#if (HS_BS_SLABS_LOG2_RU == 5)
hs_kernel_fm_2_6,
#endif
#if (HS_BS_SLABS_LOG2_RU == 6)
hs_kernel_fm_2_7,
#endif
#if (HS_BS_SLABS_LOG2_RU == 7)
hs_kernel_fm_2_8,
#endif
#endif
};
//
//
//
typedef void (*hs_kernel_offset_fm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout,
                                        uint32_t const span_offset);
#if (HS_FM_SCALE_MIN == 0)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_0[]
{
#if (HS_BS_SLABS_LOG2_RU >= 2)
hs_kernel_fm_0_0,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
hs_kernel_fm_0_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
hs_kernel_fm_0_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
hs_kernel_fm_0_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
hs_kernel_fm_0_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
hs_kernel_fm_0_5,
#endif
};
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_1[]
{
#if (HS_BS_SLABS_LOG2_RU >= 1)
hs_kernel_fm_1_0,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 2)
hs_kernel_fm_1_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
hs_kernel_fm_1_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
hs_kernel_fm_1_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
hs_kernel_fm_1_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
hs_kernel_fm_1_5,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
hs_kernel_fm_1_6,
#endif
};
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_2[]
{
hs_kernel_fm_2_0,
#if (HS_BS_SLABS_LOG2_RU >= 1)
hs_kernel_fm_2_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 2)
hs_kernel_fm_2_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
hs_kernel_fm_2_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
hs_kernel_fm_2_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
hs_kernel_fm_2_5,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
hs_kernel_fm_2_6,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
hs_kernel_fm_2_7,
#endif
};
#endif
static hs_kernel_offset_fm_pfn const * const hs_kernels_offset_fm[]
{
#if (HS_FM_SCALE_MIN == 0)
hs_kernels_offset_fm_0,
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
hs_kernels_offset_fm_1,
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
hs_kernels_offset_fm_2,
#endif
};
//
//
//
typedef uint32_t hs_indices_t;
//
//
//
struct hs_state
{
  // key buffers
  HS_KEY_TYPE * vin;
  HS_KEY_TYPE * vout; // can be vin
  cudaStream_t streams[3];
  // pool of stream indices
  hs_indices_t pool;
  // bx_ru is number of rounded up warps in vin
  uint32_t bx_ru;
};
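//
// For example, the pool starts at 0x7 (all three streams available);
// acquiring twice hands out indices 0 and 1 and leaves the pool at 0x4.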
//
//
//
static
uint32_t
hs_indices_acquire(hs_indices_t * const indices)
{
  //
  // FIXME -- an FFS intrinsic might be faster but there are so few
  // bits in this implementation that it might not matter.
  //
  if (*indices & 1)
    {
      *indices = *indices & ~1;
      return 0;
    }
  else if (*indices & 2)
    {
      *indices = *indices & ~2;
      return 1;
    }
  else // if (*indices & 4)
    {
      *indices = *indices & ~4;
      return 2;
    }
}
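//
// The FIXME above suggests an FFS intrinsic.  A sketch of that variant
// (illustrative name, host-side GCC/Clang builtin assumed; *indices is
// never zero when an index is acquired):
//
//   static
//   uint32_t
//   hs_indices_acquire_ffs(hs_indices_t * const indices)
//   {
//     uint32_t const idx = (uint32_t)__builtin_ctz(*indices); // index of lowest set bit
//     *indices &= *indices - 1;                               // clear that bit
//     return idx;
//   }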
static
uint32_t
hs_state_acquire(struct hs_state * const state,
                 hs_indices_t * const indices)
{
  //
  // FIXME -- an FFS intrinsic might be faster but there are so few
  // bits in this implementation that it might not matter.
  //
  if (state->pool & 1)
    {
      state->pool &= ~1;
      *indices |= 1;
      return 0;
    }
  else if (state->pool & 2)
    {
      state->pool &= ~2;
      *indices |= 2;
      return 1;
    }
  else // (state->pool & 4)
    {
      state->pool &= ~4;
      *indices |= 4;
      return 2;
    }
}
static
void
hs_indices_merge(hs_indices_t * const to, hs_indices_t const from)
{
  *to |= from;
}
static
void
hs_barrier_enqueue(cudaStream_t to, cudaStream_t from)
{
  cudaEvent_t event_before;
  cuda(EventCreate(&event_before));
  cuda(EventRecord(event_before,from));
  cuda(StreamWaitEvent(to,event_before,0));
  cuda(EventDestroy(event_before));
}
static
hs_indices_t
hs_barrier(struct hs_state * const state,
           hs_indices_t const before,
           hs_indices_t * const after,
           uint32_t const count) // count is 1 or 2
{
  // return streams this stage depends on back into the pool
  hs_indices_merge(&state->pool,before);
  hs_indices_t indices = 0;
  // acquire 'count' stream indices for this stage
  for (uint32_t ii=0; ii<count; ii++)
    {
      hs_indices_t new_indices = 0;
      // new index
      uint32_t const idx = hs_state_acquire(state,&new_indices);
      // add the new index to the indices
      indices |= new_indices;
      // only enqueue barriers when streams are different
      uint32_t const wait = before & ~new_indices;
      if (wait != 0)
        {
          cudaStream_t to = state->streams[idx];
          //
          // FIXME -- an FFS loop might be slower for so few bits. So
          // leave it as is for now.
          //
          if (wait & 1)
            hs_barrier_enqueue(to,state->streams[0]);
          if (wait & 2)
            hs_barrier_enqueue(to,state->streams[1]);
          if (wait & 4)
            hs_barrier_enqueue(to,state->streams[2]);
        }
    }
  hs_indices_merge(after,indices);
  return indices;
}
//
//
//
#ifndef NDEBUG
#include <stdio.h>
#define HS_STREAM_SYNCHRONIZE(s)        \
  cuda(StreamSynchronize(s));           \
  fprintf(stderr,"%s\n",__func__);
#else
#define HS_STREAM_SYNCHRONIZE(s)
#endif
//
//
//
static
void
hs_transpose(struct hs_state * const state)
{
  HS_TRANSPOSE_KERNEL_NAME()
    <<<state->bx_ru,HS_SLAB_THREADS,0,state->streams[0]>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(state->streams[0]);
}
//
//
//
static
void
hs_bc(struct hs_state * const state,
      hs_indices_t const hs_bc,
      hs_indices_t * const fm,
      uint32_t const down_slabs,
      uint32_t const clean_slabs_log2)
{
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,hs_bc,fm,1);
  // block clean the minimal number of down_slabs_log2 spans
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const full    = (down_slabs + frac_ru) >> clean_slabs_log2;
  uint32_t const threads = HS_SLAB_THREADS << clean_slabs_log2;
  // stream will *always* be stream[0]
  cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
  hs_kernels_bc[clean_slabs_log2]
    <<<full,threads,0,stream>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(stream);
}
//
//
//
static
uint32_t
hs_hm(struct hs_state * const state,
      hs_indices_t const hs_bc,
      hs_indices_t * const hs_bc_tmp,
      uint32_t const down_slabs,
      uint32_t const clean_slabs_log2)
{
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,hs_bc,hs_bc_tmp,1);
  // how many scaled half-merge spans are there?
  uint32_t const frac_ru = (1 << clean_slabs_log2) - 1;
  uint32_t const spans   = (down_slabs + frac_ru) >> clean_slabs_log2;
  // for now, just clamp to the max
  uint32_t const log2_rem   = clean_slabs_log2 - HS_BC_SLABS_LOG2_MAX;
  uint32_t const scale_log2 = MIN_MACRO(HS_HM_SCALE_MAX,log2_rem);
  uint32_t const log2_out   = log2_rem - scale_log2;
  //
  // Size the grid
  //
  // The simplifying choices below limit the maximum keys that can be
  // sorted with this grid scheme to around ~2B.
  //
  //   .x : slab height << clean_log2 -- this is the slab span
  //   .y : [1...65535]               -- this is the slab index
  //   .z : ( this could also be used to further expand .y )
  //
  // Note that OpenCL declares a grid in terms of global threads and
  // not grids and blocks
  //
  dim3 grid;
  grid.x = (HS_SLAB_HEIGHT / HS_HM_BLOCK_HEIGHT) << log2_out;
  grid.y = spans;
  grid.z = 1;
  cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
  hs_kernels_hm[scale_log2-HS_HM_SCALE_MIN]
    <<<grid,HS_SLAB_THREADS * HS_HM_BLOCK_HEIGHT,0,stream>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(stream);
  return log2_out;
}
//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//
static
uint32_t
hs_fm(struct hs_state * const state,
      hs_indices_t const fm,
      hs_indices_t * const hs_bc,
      uint32_t * const down_slabs,
      uint32_t const up_scale_log2)
{
  //
  // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
  // a performance win to bias toward launching the smaller flip merge
  // kernel in order to get more warps in flight (increased
  // occupancy). This is useful when merging small numbers of slabs.
  //
  // Note that HS_FM_SCALE_MIN will always be 0 or 1.
  //
  // So, for now, just clamp to the max until there is a reason to
  // restore the fancier and probably low-impact approach.
  //
  uint32_t const scale_log2 = MIN_MACRO(HS_FM_SCALE_MAX,up_scale_log2);
  uint32_t const clean_log2 = up_scale_log2 - scale_log2;
  // number of slabs in a full-sized scaled flip-merge span
  uint32_t const full_span_slabs = HS_BS_SLABS << up_scale_log2;
  // how many full-sized scaled flip-merge spans are there?
  uint32_t full_fm = state->bx_ru / full_span_slabs;
  uint32_t frac_fm = 0;
  // initialize down_slabs
  *down_slabs = full_fm * full_span_slabs;
  // how many half-size scaled + fractional scaled spans are there?
  uint32_t const span_rem        = state->bx_ru - *down_slabs;
  uint32_t const half_span_slabs = full_span_slabs >> 1;
  // if we have over a half-span then fractionally merge it
  if (span_rem > half_span_slabs)
    {
      // the remaining slabs will be cleaned
      *down_slabs += span_rem;
      uint32_t const frac_rem      = span_rem - half_span_slabs;
      uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);
      if (frac_rem_pow2 >= half_span_slabs)
        {
          // bump it up to a full span
          full_fm += 1;
        }
      else
        {
          // otherwise, add fractional
          frac_fm = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
        }
    }
  // enqueue any necessary barriers
  bool const both = (full_fm != 0) && (frac_fm != 0);
  hs_indices_t indices = hs_barrier(state,fm,hs_bc,both ? 2 : 1);
  //
  // Size the grid
  //
  // The simplifying choices below limit the maximum keys that can be
  // sorted with this grid scheme to around ~2B.
  //
  //   .x : slab height << clean_log2 -- this is the slab span
  //   .y : [1...65535]               -- this is the slab index
  //   .z : ( this could also be used to further expand .y )
  //
  // Note that OpenCL declares a grid in terms of global threads and
  // not grids and blocks
  //
  dim3 grid;
  grid.x = (HS_SLAB_HEIGHT / HS_FM_BLOCK_HEIGHT) << clean_log2;
  grid.z = 1;
  if (full_fm > 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      grid.y = full_fm;
      hs_kernels_fm[scale_log2-HS_FM_SCALE_MIN]
        <<<grid,HS_SLAB_THREADS * HS_FM_BLOCK_HEIGHT,0,stream>>>
        (state->vout);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  if (frac_fm > 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      grid.y = 1;
      hs_kernels_offset_fm[scale_log2-HS_FM_SCALE_MIN][msb_idx_u32(frac_fm)]
        <<<grid,HS_SLAB_THREADS * HS_FM_BLOCK_HEIGHT,0,stream>>>
        (state->vout,full_fm);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  return clean_log2;
}
//
//
//
static
void
hs_bs(struct hs_state * const state,
      hs_indices_t const bs,
      hs_indices_t * const fm,
      uint32_t const count_padded_in)
{
  uint32_t const slabs_in = count_padded_in / HS_SLAB_KEYS;
  uint32_t const full_bs  = slabs_in / HS_BS_SLABS;
  uint32_t const frac_bs  = slabs_in - full_bs * HS_BS_SLABS;
  bool const both = (full_bs != 0) && (frac_bs != 0);
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,bs,fm,both ? 2 : 1);
  if (full_bs != 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      CONCAT_MACRO(hs_kernel_bs_,HS_BS_SLABS_LOG2_RU)
        <<<full_bs,HS_BS_SLABS*HS_SLAB_THREADS,0,stream>>>
        (state->vout,state->vin);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  if (frac_bs != 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      hs_kernels_offset_bs[msb_idx_u32(frac_bs)]
        <<<1,frac_bs*HS_SLAB_THREADS,0,stream>>>
        (state->vout,state->vin,full_bs*HS_BS_SLABS*HS_SLAB_THREADS);
      HS_STREAM_SYNCHRONIZE(stream);
    }
}
//
//
//
static
void
hs_keyset_pre_merge(struct hs_state * const state,
                    hs_indices_t * const fm,
                    uint32_t const count_lo,
                    uint32_t const count_hi)
{
  uint32_t const vout_span = count_hi - count_lo;
  cudaStream_t stream = state->streams[hs_state_acquire(state,fm)];
  cuda(MemsetAsync(state->vout + count_lo,
                   0xFF,
                   vout_span * sizeof(HS_KEY_TYPE),
                   stream));
}
//
//
//
static
void
hs_keyset_pre_sort(struct hs_state * const state,
                   hs_indices_t * const bs,
                   uint32_t const count,
                   uint32_t const count_hi)
{
  uint32_t const vin_span = count_hi - count;
  cudaStream_t stream = state->streams[hs_state_acquire(state,bs)];
  cuda(MemsetAsync(state->vin + count,
                   0xFF,
                   vin_span * sizeof(HS_KEY_TYPE),
                   stream));
}
//
//
//
void
CONCAT_MACRO(hs_cuda_sort_,HS_KEY_TYPE_PRETTY)
  (HS_KEY_TYPE * const vin,
   HS_KEY_TYPE * const vout,
   uint32_t const count,
   uint32_t const count_padded_in,
   uint32_t const count_padded_out,
   bool const linearize,
   cudaStream_t stream0, // primary stream
   cudaStream_t stream1, // auxiliary
   cudaStream_t stream2) // auxiliary
{
  // is this sort in place?
  bool const is_in_place = (vout == NULL);
  // cq, buffers, wait list and slab count
  struct hs_state state;
  state.vin        = vin;
  state.vout       = is_in_place ? vin : vout;
  state.streams[0] = stream0;
  state.streams[1] = stream1;
  state.streams[2] = stream2;
  state.pool       = 0x7; // 3 bits
  state.bx_ru      = (count + HS_SLAB_KEYS - 1) / HS_SLAB_KEYS;
  // initialize vin
  uint32_t const count_hi = is_in_place ? count_padded_out : count_padded_in;
  bool const is_pre_sort_keyset_reqd  = count_hi > count;
  bool const is_pre_merge_keyset_reqd = !is_in_place && (count_padded_out > count_padded_in);
  hs_indices_t bs = 0;
  // initialize any trailing keys in vin before sorting
  if (is_pre_sort_keyset_reqd)
    hs_keyset_pre_sort(&state,&bs,count,count_hi);
  hs_indices_t fm = 0;
  // concurrently initialize any trailing keys in vout before merging
  if (is_pre_merge_keyset_reqd)
    hs_keyset_pre_merge(&state,&fm,count_padded_in,count_padded_out);
  // immediately sort blocks of slabs
  hs_bs(&state,bs,&fm,count_padded_in);
  //
  // we're done if this was a single bs block...
  //
  // otherwise, merge sorted spans of slabs until done
  //
  if (state.bx_ru > HS_BS_SLABS)
    {
      int32_t up_scale_log2 = 1;
      while (true)
        {
          hs_indices_t hs_or_bc = 0;
          uint32_t down_slabs;
          // flip merge slabs -- return span of slabs that must be cleaned
          uint32_t clean_slabs_log2 = hs_fm(&state,
                                            fm,
                                            &hs_or_bc,
                                            &down_slabs,
                                            up_scale_log2);
          // if span is gt largest slab block cleaner then half merge
          while (clean_slabs_log2 > HS_BC_SLABS_LOG2_MAX)
            {
              hs_indices_t hs_or_bc_tmp;
              clean_slabs_log2 = hs_hm(&state,
                                       hs_or_bc,
                                       &hs_or_bc_tmp,
                                       down_slabs,
                                       clean_slabs_log2);
              hs_or_bc = hs_or_bc_tmp;
            }
          // reset fm
          fm = 0;
          // launch clean slab grid -- is it the final launch?
          hs_bc(&state,
                hs_or_bc,
                &fm,
                down_slabs,
                clean_slabs_log2);
          // was this the final block clean?
          if (((uint32_t)HS_BS_SLABS << up_scale_log2) >= state.bx_ru)
            break;
          // otherwise, merge twice as many slabs
          up_scale_log2 += 1;
        }
    }
  // slabs or linear?
  if (linearize) {
    // guaranteed to be on stream0
    hs_transpose(&state);
  }
}
//
// all grids will be computed as a function of the minimum number of slabs
//
void
CONCAT_MACRO(hs_cuda_pad_,HS_KEY_TYPE_PRETTY)
  (uint32_t const count,
   uint32_t * const count_padded_in,
   uint32_t * const count_padded_out)
{
  //
  // round up the count to slabs
  //
  uint32_t const slabs_ru        = (count + HS_SLAB_KEYS - 1) / HS_SLAB_KEYS;
  uint32_t const blocks          = slabs_ru / HS_BS_SLABS;
  uint32_t const block_slabs     = blocks * HS_BS_SLABS;
  uint32_t const slabs_ru_rem    = slabs_ru - block_slabs;
  uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),HS_BS_SLABS);
  *count_padded_in  = (block_slabs + slabs_ru_rem_ru) * HS_SLAB_KEYS;
  *count_padded_out = *count_padded_in;
  //
  // will merging be required?
  //
  if (slabs_ru > HS_BS_SLABS)
    {
      // more than one block
      uint32_t const blocks_lo       = pow2_rd_u32(blocks);
      uint32_t const block_slabs_lo  = blocks_lo * HS_BS_SLABS;
      uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;
      if (block_slabs_rem > 0)
        {
          uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem);
          uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru,
                                                    blocks_lo << (1 - HS_FM_SCALE_MIN));
          uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi,
                                                            block_slabs_lo*2); // clamp non-pow2 blocks
          *count_padded_out = block_slabs_padded_out * HS_SLAB_KEYS;
        }
    }
}
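//
// Typical usage of the pad/sort pair -- a sketch that assumes a
// uint32_t key type (so the macro-generated entry points are
// hs_cuda_pad_u32() and hs_cuda_sort_u32()), that host_keys and the
// three streams already exist, and that error handling is elided:
//
//   uint32_t padded_in, padded_out;
//   hs_cuda_pad_u32(count,&padded_in,&padded_out);
//
//   uint32_t * keys;
//   cuda(Malloc(&keys,padded_out * sizeof(uint32_t))); // sized for the padded output
//   cuda(Memcpy(keys,host_keys,count * sizeof(uint32_t),cudaMemcpyHostToDevice));
//
//   // sort in place (vout == NULL) and linearize -- the final kernel lands on stream0
//   hs_cuda_sort_u32(keys,NULL,count,padded_in,padded_out,true,stream0,stream1,stream2);
//   cuda(StreamSynchronize(stream0));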
//
//
//
void
CONCAT_MACRO(hs_cuda_info_,HS_KEY_TYPE_PRETTY)
  (uint32_t * const key_words,
   uint32_t * const val_words,
   uint32_t * const slab_height,
   uint32_t * const slab_width_log2)
{
  *key_words       = HS_KEY_WORDS;
  *val_words       = HS_VAL_WORDS;
  *slab_height     = HS_SLAB_HEIGHT;
  *slab_width_log2 = HS_SLAB_WIDTH_LOG2;
}
//
//
//