/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include <stdlib.h>
//
//
//
#include "hs_cl_launcher.h"
#include "assert_cl.h"
#include "macros.h"
#include "util.h"
//
//
//
typedef uint32_t uint;
typedef uint64_t ulong;
//
//
//
#include "hs_cl.h"
//
//
//
#if 0 // #ifndef NDEBUG
#define HS_KERNEL_SOURCE
#else
#define HS_KERNEL_BINARY
#endif
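//
// the "#if 0" above currently forces the precompiled binary path --
// flip it to build the kernels from source when debugging
//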
//
// #define HS_KERNEL_SPIRV
//
//
//
//
#ifdef NDEBUG
#define HS_LAUNCH_TRACE(k,g,l)
#else
#include <stdio.h>
#define HS_KERNEL_NAME_MAX 20
static
void
hs_launch_trace(cl_kernel    kernel,
                size_t const global_work_size,
                size_t const local_work_size)
{
  if (kernel == NULL)
    return;

  char name[HS_KERNEL_NAME_MAX];

  cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL));

  fprintf(stderr,"%-19s ( %6zu, %4zu )\n",name,global_work_size,local_work_size);
}
#define HS_LAUNCH_TRACE(k,g,l) hs_launch_trace(k,g,l)
#endif
//
//
//
#ifdef NDEBUG
#define HS_EVENT_NEXT() NULL
#define HS_EVENT_PROFILE(cq)
#else
#define HS_EVENTS_MAX 128
static cl_event events[HS_EVENTS_MAX];
static uint32_t events_count;
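
//
// debug builds record one event per enqueue so hs_event_profile()
// can report per-command status and timing
//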
static
cl_event *
hs_event_next(void)
{
  if (events_count >= HS_EVENTS_MAX) // no event slots remaining?
    {
      return NULL;
    }
  else // return next event slot
    {
      return events + events_count++;
    }
}
static
void
hs_event_profile(cl_command_queue cq)
{
  cl(Finish(cq));

  cl_command_queue_properties props;

  cl(GetCommandQueueInfo(cq,
                         CL_QUEUE_PROPERTIES,
                         sizeof(props),
                         &props,
                         NULL));

  cl_ulong t_min=UINT64_MAX, t_max=0;

  for (uint32_t ee=0; ee<events_count; ee++)
    {
      cl_event event = events[ee];

      //
      // profiling
      //
      cl_ulong t_start=0, t_end=0;

      if (props & CL_QUEUE_PROFILING_ENABLE)
        {
          // start
          cl(GetEventProfilingInfo(event,
                                   CL_PROFILING_COMMAND_START,
                                   sizeof(cl_ulong),
                                   &t_start,
                                   NULL));
          // end
          cl(GetEventProfilingInfo(event,
                                   CL_PROFILING_COMMAND_END,
                                   sizeof(cl_ulong),
                                   &t_end,
                                   NULL));

          t_min = MIN_MACRO(t_min,t_start);
          t_max = MAX_MACRO(t_max,t_end);
        }

      //
      // status
      //
      cl_int          status;
      cl_command_type type;

      cl_get_event_info(event,&status,&type);

      fprintf(stdout,"%-3u, %-13s, %-28s, %20llu, %20llu, %20llu, %20llu\n",
              ee,
              cl_get_event_command_status_string(status),
              cl_get_event_command_type_string(type),
              t_start,t_end,t_end-t_start,t_max-t_min);

      // release
      cl(ReleaseEvent(event));
    }
}
#define HS_EVENT_NEXT() hs_event_next()
#define HS_EVENT_PROFILE(cq) hs_event_profile(cq)
#endif
//
//
//
struct hs_state
{
  cl_mem vin;
  cl_mem vout;

  // bx.ru is the number of warps in vin, rounded up
  struct {
    uint32_t ru;
  } bx;

  // these values change on each iteration
  union {
    struct {
      uint32_t full;
      uint32_t frac;
    } bs; // warps
    struct {
      uint32_t full;
      uint32_t na;
    } bc; // warps
    struct {
      uint32_t full;
      uint32_t frac;
    } fm; // rows
  };
};
//
//
//
#define HS_THREADS_PER_BLOCK (HS_BS_WARPS * HS_LANES_PER_WARP)
#define HS_KEYS_PER_WARP (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_BS_KEYS_PER_BLOCK (HS_KEYS_PER_WARP * HS_BS_WARPS)
#define HS_BS_BLOCK_SIZE (HS_BS_KEYS_PER_BLOCK * sizeof(HS_KEY_TYPE))
#define HS_BC_KEYS_PER_BLOCK (HS_KEYS_PER_WARP << HS_BC_WARPS_LOG2_MAX)
#define HS_BC_BLOCK_SIZE (HS_BC_KEYS_PER_BLOCK * sizeof(HS_KEY_TYPE))
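//
// for example, with hypothetical parameters HS_LANES_PER_WARP=8,
// HS_KEYS_PER_LANE=16, HS_BS_WARPS=16 and a 4-byte HS_KEY_TYPE:
//
//   HS_KEYS_PER_WARP     = 16 * 8   = 128  keys
//   HS_BS_KEYS_PER_BLOCK = 128 * 16 = 2048 keys
//   HS_BS_BLOCK_SIZE     = 2048 * 4 = 8192 bytes
//
// the actual values come from the target's hs_cl.h configuration
//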
//
//
//
#if defined( HS_KERNEL_SOURCE )
#include "hs_cl.pre.src.inl"
#elif defined( HS_KERNEL_BINARY )
#include "hs_cl.pre.bin.inl"
#elif defined( HS_KERNEL_SPIRV )
#include "hs_cl.pre.spv.inl"
#endif
//
//
//
struct hs_transpose_kernel
{
  cl_kernel    kernel;
  char const * name;
};
#define HS_TRANSPOSE_KERNEL_DECLARE(n) { .name = #n }
static struct hs_transpose_kernel transpose_kernels[] =
{
HS_TRANSPOSE_KERNEL_DECLARE(hs_kernel_transpose)
};
//
//
//
struct hs_bs_kernel
{
  cl_kernel    kernel;
  char const * name;
};
#define HS_BS_KERNEL_DECLARE(n) { .name = #n }
static struct hs_bs_kernel bs_kernels[] =
{
#if 0 <= HS_BS_WARPS_LOG2_RU
HS_BS_KERNEL_DECLARE(hs_kernel_bs_0),
#endif
#if 1 <= HS_BS_WARPS_LOG2_RU
HS_BS_KERNEL_DECLARE(hs_kernel_bs_1),
#endif
#if 2 <= HS_BS_WARPS_LOG2_RU
HS_BS_KERNEL_DECLARE(hs_kernel_bs_2),
#endif
#if 3 <= HS_BS_WARPS_LOG2_RU
HS_BS_KERNEL_DECLARE(hs_kernel_bs_3),
#endif
#if 4 <= HS_BS_WARPS_LOG2_RU
HS_BS_KERNEL_DECLARE(hs_kernel_bs_4),
#endif
#if 5 <= HS_BS_WARPS_LOG2_RU
HS_BS_KERNEL_DECLARE(hs_kernel_bs_5),
#endif
#if 6 <= HS_BS_WARPS_LOG2_RU
HS_BS_KERNEL_DECLARE(hs_kernel_bs_6),
#endif
#if 7 <= HS_BS_WARPS_LOG2_RU
HS_BS_KERNEL_DECLARE(hs_kernel_bs_7),
#endif
};
//
//
//
struct hs_bc_kernel
{
  cl_kernel    kernel;
  char const * name;
};
#define HS_BC_KERNEL_DECLARE(n) { .name = #n }
static struct hs_bc_kernel bc_kernels[] =
{
#if (0 >= HS_BC_WARPS_LOG2_MIN) && (0 <= HS_BC_WARPS_LOG2_MAX)
HS_BC_KERNEL_DECLARE(hs_kernel_bc_0),
#endif
#if (1 >= HS_BC_WARPS_LOG2_MIN) && (1 <= HS_BC_WARPS_LOG2_MAX)
HS_BC_KERNEL_DECLARE(hs_kernel_bc_1),
#endif
#if (2 >= HS_BC_WARPS_LOG2_MIN) && (2 <= HS_BC_WARPS_LOG2_MAX)
HS_BC_KERNEL_DECLARE(hs_kernel_bc_2),
#endif
#if (3 >= HS_BC_WARPS_LOG2_MIN) && (3 <= HS_BC_WARPS_LOG2_MAX)
HS_BC_KERNEL_DECLARE(hs_kernel_bc_3),
#endif
#if (4 >= HS_BC_WARPS_LOG2_MIN) && (4 <= HS_BC_WARPS_LOG2_MAX)
HS_BC_KERNEL_DECLARE(hs_kernel_bc_4),
#endif
#if (5 >= HS_BC_WARPS_LOG2_MIN) && (5 <= HS_BC_WARPS_LOG2_MAX)
HS_BC_KERNEL_DECLARE(hs_kernel_bc_5),
#endif
#if (6 >= HS_BC_WARPS_LOG2_MIN) && (6 <= HS_BC_WARPS_LOG2_MAX)
HS_BC_KERNEL_DECLARE(hs_kernel_bc_6),
#endif
#if (7 >= HS_BC_WARPS_LOG2_MIN) && (7 <= HS_BC_WARPS_LOG2_MAX)
HS_BC_KERNEL_DECLARE(hs_kernel_bc_7),
#endif
};
//
//
//
struct hs_fm_kernel
{
  cl_kernel      kernel;
  char const *   name;
  uint32_t const log2;
};
#define HS_FM_KERNEL_DECLARE(n,l) { .name = #n, .log2 = l }
static struct hs_fm_kernel fm_kernels[] =
{
#ifdef HS_FM_BLOCKS_LOG2_0
HS_FM_KERNEL_DECLARE(hs_kernel_fm_0,HS_FM_BLOCKS_LOG2_0),
#endif
#ifdef HS_FM_BLOCKS_LOG2_1
HS_FM_KERNEL_DECLARE(hs_kernel_fm_1,HS_FM_BLOCKS_LOG2_1),
#endif
#ifdef HS_FM_BLOCKS_LOG2_2
HS_FM_KERNEL_DECLARE(hs_kernel_fm_2,HS_FM_BLOCKS_LOG2_2),
#endif
#ifdef HS_FM_BLOCKS_LOG2_3
HS_FM_KERNEL_DECLARE(hs_kernel_fm_3,HS_FM_BLOCKS_LOG2_3),
#endif
#ifdef HS_FM_BLOCKS_LOG2_4
HS_FM_KERNEL_DECLARE(hs_kernel_fm_4,HS_FM_BLOCKS_LOG2_4),
#endif
#ifdef HS_FM_BLOCKS_LOG2_5
HS_FM_KERNEL_DECLARE(hs_kernel_fm_5,HS_FM_BLOCKS_LOG2_5),
#endif
#ifdef HS_FM_BLOCKS_LOG2_6
HS_FM_KERNEL_DECLARE(hs_kernel_fm_6,HS_FM_BLOCKS_LOG2_6),
#endif
#ifdef HS_FM_BLOCKS_LOG2_7
HS_FM_KERNEL_DECLARE(hs_kernel_fm_7,HS_FM_BLOCKS_LOG2_7),
#endif
#ifdef HS_FM_BLOCKS_LOG2_8
HS_FM_KERNEL_DECLARE(hs_kernel_fm_8,HS_FM_BLOCKS_LOG2_8),
#endif
#ifdef HS_FM_BLOCKS_LOG2_9
HS_FM_KERNEL_DECLARE(hs_kernel_fm_9,HS_FM_BLOCKS_LOG2_9),
#endif
#ifdef HS_FM_BLOCKS_LOG2_10
HS_FM_KERNEL_DECLARE(hs_kernel_fm_10,HS_FM_BLOCKS_LOG2_10),
#endif
#ifdef HS_FM_BLOCKS_LOG2_11
HS_FM_KERNEL_DECLARE(hs_kernel_fm_11,HS_FM_BLOCKS_LOG2_11),
#endif
#ifdef HS_FM_BLOCKS_LOG2_12
HS_FM_KERNEL_DECLARE(hs_kernel_fm_12,HS_FM_BLOCKS_LOG2_12),
#endif
#ifdef HS_FM_BLOCKS_LOG2_13
HS_FM_KERNEL_DECLARE(hs_kernel_fm_13,HS_FM_BLOCKS_LOG2_13),
#endif
#ifdef HS_FM_BLOCKS_LOG2_14
HS_FM_KERNEL_DECLARE(hs_kernel_fm_14,HS_FM_BLOCKS_LOG2_14),
#endif
#ifdef HS_FM_BLOCKS_LOG2_15
HS_FM_KERNEL_DECLARE(hs_kernel_fm_15,HS_FM_BLOCKS_LOG2_15),
#endif
#ifdef HS_FM_BLOCKS_LOG2_16
HS_FM_KERNEL_DECLARE(hs_kernel_fm_16,HS_FM_BLOCKS_LOG2_16),
#endif
};
//
//
//
struct hs_hm_kernel
{
  cl_kernel      kernel;
  char const *   name;
  uint32_t const log2;
};
#define HS_HM_KERNEL_DECLARE(n,l) { .name = #n, .log2 = l }
static struct hs_hm_kernel hm_kernels[] =
{
#ifdef HS_HM_BLOCKS_LOG2_0
HS_HM_KERNEL_DECLARE(hs_kernel_hm_0,HS_HM_BLOCKS_LOG2_0),
#endif
#ifdef HS_HM_BLOCKS_LOG2_1
HS_HM_KERNEL_DECLARE(hs_kernel_hm_1,HS_HM_BLOCKS_LOG2_1),
#endif
#ifdef HS_HM_BLOCKS_LOG2_2
HS_HM_KERNEL_DECLARE(hs_kernel_hm_2,HS_HM_BLOCKS_LOG2_2),
#endif
#ifdef HS_HM_BLOCKS_LOG2_3
HS_HM_KERNEL_DECLARE(hs_kernel_hm_3,HS_HM_BLOCKS_LOG2_3),
#endif
#ifdef HS_HM_BLOCKS_LOG2_4
HS_HM_KERNEL_DECLARE(hs_kernel_hm_4,HS_HM_BLOCKS_LOG2_4),
#endif
#ifdef HS_HM_BLOCKS_LOG2_5
HS_HM_KERNEL_DECLARE(hs_kernel_hm_5,HS_HM_BLOCKS_LOG2_5),
#endif
#ifdef HS_HM_BLOCKS_LOG2_6
HS_HM_KERNEL_DECLARE(hs_kernel_hm_6,HS_HM_BLOCKS_LOG2_6),
#endif
#ifdef HS_HM_BLOCKS_LOG2_7
HS_HM_KERNEL_DECLARE(hs_kernel_hm_7,HS_HM_BLOCKS_LOG2_7),
#endif
#ifdef HS_HM_BLOCKS_LOG2_8
HS_HM_KERNEL_DECLARE(hs_kernel_hm_8,HS_HM_BLOCKS_LOG2_8),
#endif
#ifdef HS_HM_BLOCKS_LOG2_9
HS_HM_KERNEL_DECLARE(hs_kernel_hm_9,HS_HM_BLOCKS_LOG2_9),
#endif
#ifdef HS_HM_BLOCKS_LOG2_10
HS_HM_KERNEL_DECLARE(hs_kernel_hm_10,HS_HM_BLOCKS_LOG2_10),
#endif
#ifdef HS_HM_BLOCKS_LOG2_11
HS_HM_KERNEL_DECLARE(hs_kernel_hm_11,HS_HM_BLOCKS_LOG2_11),
#endif
#ifdef HS_HM_BLOCKS_LOG2_12
HS_HM_KERNEL_DECLARE(hs_kernel_hm_12,HS_HM_BLOCKS_LOG2_12),
#endif
#ifdef HS_HM_BLOCKS_LOG2_13
HS_HM_KERNEL_DECLARE(hs_kernel_hm_13,HS_HM_BLOCKS_LOG2_13),
#endif
#ifdef HS_HM_BLOCKS_LOG2_14
HS_HM_KERNEL_DECLARE(hs_kernel_hm_14,HS_HM_BLOCKS_LOG2_14),
#endif
#ifdef HS_HM_BLOCKS_LOG2_15
HS_HM_KERNEL_DECLARE(hs_kernel_hm_15,HS_HM_BLOCKS_LOG2_15),
#endif
#ifdef HS_HM_BLOCKS_LOG2_16
HS_HM_KERNEL_DECLARE(hs_kernel_hm_16,HS_HM_BLOCKS_LOG2_16),
#endif
};
//
//
//
static
void
hs_barrier(cl_command_queue cq)
{
  cl(EnqueueBarrierWithWaitList(cq,0,NULL,NULL));
}
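
//
// hs_sort() is fed an out-of-order command queue, so explicit
// barriers are used to order the sorting phases against each other
//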
//
//
//
static
void
hs_launch_transpose(struct hs_state const * const state,
                    cl_command_queue              cq,
                    cl_kernel                     kernel,
                    size_t const                  global_work_size,
                    size_t const                  local_work_size)
{
  HS_LAUNCH_TRACE(kernel,global_work_size,local_work_size);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  cl(EnqueueNDRangeKernel(cq,
                          kernel,
                          1,
                          NULL,
                          &global_work_size,
                          &local_work_size,
                          0,
                          NULL,
                          HS_EVENT_NEXT()));
}
//
//
//
static
void
hs_launch_bs(struct hs_state const * const state,
             cl_command_queue              cq,
             cl_kernel                     kernel_full,
             cl_kernel                     kernel_frac,
             size_t const                  global_work_size_full,
             size_t const                  local_work_size_full,
             size_t const                  local_work_size_frac)
{
  HS_LAUNCH_TRACE(kernel_full,global_work_size_full,local_work_size_full);
  HS_LAUNCH_TRACE(kernel_frac,local_work_size_frac, local_work_size_frac);

  if (kernel_full != NULL)
    {
      cl(SetKernelArg(kernel_full,0,sizeof(state->vin), &state->vin));
      cl(SetKernelArg(kernel_full,1,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(cq,
                              kernel_full,
                              1,
                              NULL,
                              &global_work_size_full,
                              &local_work_size_full,
                              0,
                              NULL,
                              HS_EVENT_NEXT()));
    }
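
  //
  // the fractional tail is launched as a single work-group: the
  // global work offset equals the full-block span and the global
  // work size equals the local work size
  //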
  if (kernel_frac != NULL)
    {
      cl(SetKernelArg(kernel_frac,0,sizeof(state->vin), &state->vin));
      cl(SetKernelArg(kernel_frac,1,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(cq,
                              kernel_frac,
                              1,
                              &global_work_size_full,
                              &local_work_size_frac,
                              &local_work_size_frac,
                              0,
                              NULL,
                              HS_EVENT_NEXT()));
    }
}
//
//
//
static
void
hs_launch_bc(struct hs_state const * const state,
             cl_command_queue              cq,
             cl_kernel                     kernel,
             size_t const                  global_work_size,
             size_t const                  local_work_size)
{
  HS_LAUNCH_TRACE(kernel,global_work_size,local_work_size);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  cl(EnqueueNDRangeKernel(cq,
                          kernel,
                          1,
                          NULL,
                          &global_work_size,
                          &local_work_size,
                          0,
                          NULL,
                          HS_EVENT_NEXT()));
}
//
//
//
static
void
hs_launch_fm(struct hs_state const * const state,
             cl_command_queue              cq,
             cl_kernel                     kernel,
             size_t const                  global_work_size)
{
  HS_LAUNCH_TRACE(kernel,global_work_size,0);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),   &state->vout));
  cl(SetKernelArg(kernel,1,sizeof(state->fm.full),&state->fm.full));
  cl(SetKernelArg(kernel,2,sizeof(state->fm.frac),&state->fm.frac));

  cl(EnqueueNDRangeKernel(cq,
                          kernel,
                          1,
                          NULL,
                          &global_work_size,
                          NULL,
                          0,
                          NULL,
                          HS_EVENT_NEXT()));
}
//
//
//
static
void
hs_launch_hm(struct hs_state const * const state,
             cl_command_queue              cq,
             cl_kernel                     kernel,
             size_t const                  global_work_size)
{
  HS_LAUNCH_TRACE(kernel,global_work_size,0);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  cl(EnqueueNDRangeKernel(cq,
                          kernel,
                          1,
                          NULL,
                          &global_work_size,
                          NULL,
                          0,
                          NULL,
                          HS_EVENT_NEXT()));
}
//
//
//
static
void
hs_transpose_launcher(struct hs_state * const state,
                      cl_command_queue        cq)
{
  // transpose each slab
  size_t const global_work_size = state->bx.ru * HS_LANES_PER_WARP;
  size_t const local_work_size  = HS_LANES_PER_WARP; // FIXME -- might not always want to specify this

  hs_launch_transpose(state,
                      cq,
                      transpose_kernels[0].kernel,
                      global_work_size,
                      local_work_size);
}
//
//
//
static
void
hs_bs_launcher(struct hs_state * const state,
               uint32_t const          warps_in,
               cl_command_queue        cq)
{
  // warps_in is already rounded up
  uint32_t const full = (warps_in / HS_BS_WARPS) * HS_BS_WARPS;
  uint32_t const frac = warps_in - full;
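
  // warps_in was padded by hs_pad() so that frac is either zero or a
  // power of two; e.g., with a hypothetical HS_BS_WARPS of 16 and
  // warps_in of 24: full = 16, frac = 8, and msb_idx_u32(8) == 3
  // selects hs_kernel_bs_3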
  //
  // FIXME -- launch on different queues
  //
  cl_kernel kernel_full = (full == 0) ? NULL : bs_kernels[HS_BS_WARPS_LOG2_RU].kernel;
  cl_kernel kernel_frac = (frac == 0) ? NULL : bs_kernels[msb_idx_u32(frac)].kernel;

  hs_launch_bs(state,
               cq,
               kernel_full,
               kernel_frac,
               full * HS_LANES_PER_WARP,
               HS_BS_WARPS * HS_LANES_PER_WARP,
               frac * HS_LANES_PER_WARP);
}
//
//
//
static
void
hs_bc_launcher(struct hs_state * const state,
               uint32_t const          down_warps,
               uint32_t const          down_warps_log2,
               cl_command_queue        cq)
{
  // block clean the minimal number of down_warps_log2 spans
  uint32_t const frac_ru = (1u << down_warps_log2) - 1;

  state->bc.full = (down_warps + frac_ru) & ~frac_ru;
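
  // i.e., round down_warps up to a multiple of 2^down_warps_log2;
  // e.g., down_warps = 5 and down_warps_log2 = 2 gives bc.full = 8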
  // launch block slab sorting grid
  size_t const global_work_size = state->bc.full * HS_LANES_PER_WARP;
  size_t const local_work_size  = HS_LANES_PER_WARP << down_warps_log2;

  //
  // we'd better be capable of cleaning at least two warps!
  //
  hs_launch_bc(state,
               cq,
               bc_kernels[down_warps_log2].kernel,
               global_work_size,
               local_work_size);
}
//
//
//
static
uint32_t
hs_hm_launcher(struct hs_state * const state,
               uint32_t const          down_warps,
               uint32_t const          down_warps_log2_in,
               cl_command_queue        cq)
{
  // how many scaled half-merge spans are there?
  uint32_t const frac_ru  = (1u << down_warps_log2_in) - 1;
  uint32_t const spans_ru = (down_warps + frac_ru) >> down_warps_log2_in;
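
  // i.e., a round-up division: e.g., down_warps = 20 and
  // down_warps_log2_in = 3 gives spans_ru = (20 + 7) >> 3 = 3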
  // get the kernel record
  struct hs_hm_kernel const * const hm = hm_kernels + down_warps_log2_in - HS_BC_WARPS_LOG2_MAX - 1;

  // how large is the grid? -- the local work size is left to the runtime
  size_t const global_work_size = HS_LANES_PER_WARP * HS_KEYS_PER_LANE * (spans_ru << hm->log2);

  // launch the hm kernel
  hs_launch_hm(state,
               cq,
               hm->kernel,
               global_work_size);

  return hm->log2;
}
//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//
static
uint32_t
hs_fm_launcher(struct hs_state * const state,
               uint32_t const          up_scale_log2,
               uint32_t * const        down_warps,
               cl_command_queue        cq)
{
  // get the kernel record
  struct hs_fm_kernel const * const fm = fm_kernels + up_scale_log2 - 1;

  // number of warps in a full-sized scaled flip-merge span
  uint32_t const full_span_warps = HS_BS_WARPS << up_scale_log2;

  // how many full-sized scaled flip-merge spans are there?
  state->fm.full = state->bx.ru / full_span_warps;
  state->fm.frac = 0;

  // initialize down_warps
  *down_warps = state->fm.full * full_span_warps;

  // how many half-sized scaled + fractional scaled spans are there?
  uint32_t const span_rem        = state->bx.ru - state->fm.full * full_span_warps;
  uint32_t const half_span_warps = full_span_warps >> 1;

  if (span_rem > half_span_warps)
    {
      uint32_t const frac_rem      = span_rem - half_span_warps;
      uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);

      if (frac_rem_pow2 >= half_span_warps)
        {
          *down_warps    += full_span_warps;
          state->fm.full += 1;
        }
      else
        {
          uint32_t const frac_interleaved = frac_rem_pow2 >> fm->log2;

          *down_warps   += half_span_warps + frac_rem_pow2;
          state->fm.frac = MAX_MACRO(1,frac_interleaved);
        }
    }
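
  //
  // that is: a remainder larger than half a span either rounds up to
  // one more full span (when its power-of-two round-up reaches the
  // half-span size) or is merged as a fractional interleaved span
  //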
  // size the grid -- the local work size is left to the runtime
  uint32_t const spans_frac  = MIN_MACRO(state->fm.frac,1);
  uint32_t const spans_total = state->fm.full + spans_frac;
  uint32_t const scale       = spans_total << fm->log2;

  size_t const global_work_size = HS_LANES_PER_WARP * HS_KEYS_PER_LANE * scale;

  //
  // launch the fm kernel
  //
  hs_launch_fm(state,
               cq,
               fm->kernel,
               global_work_size);

  return fm->log2;
}
//
//
//
static
void
hs_keyset_launcher(cl_mem           mem,
                   uint32_t const   offset,
                   uint32_t const   span,
                   cl_command_queue cq)
{
  //
  // DOES NOT TEST FOR SPAN == 0
  //
  HS_KEY_TYPE const pattern = (HS_KEY_TYPE)-1L;

  cl(EnqueueFillBuffer(cq,
                       mem,
                       &pattern,
                       sizeof(HS_KEY_TYPE),
                       offset * sizeof(HS_KEY_TYPE),
                       span   * sizeof(HS_KEY_TYPE),
                       0,
                       NULL,
                       HS_EVENT_NEXT()));
}
//
// all grids will be computed as a function of the minimum number of warps
//
void
hs_pad(uint32_t const   count,
       uint32_t * const count_padded_in,
       uint32_t * const count_padded_out)
{
  //
  // round up the count to warps
  //
  uint32_t const warps_ru     = (count + HS_KEYS_PER_WARP - 1) / HS_KEYS_PER_WARP;
  uint32_t const blocks       = warps_ru / HS_BS_WARPS;
  uint32_t const warps_mod    = warps_ru % HS_BS_WARPS;
  uint32_t const warps_mod_ru = MIN_MACRO(pow2_ru_u32(warps_mod),HS_BS_WARPS);

  *count_padded_in  = (blocks * HS_BS_WARPS + warps_mod_ru) * HS_KEYS_PER_WARP;
  *count_padded_out = *count_padded_in;
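
  //
  // a worked example with hypothetical parameters HS_KEYS_PER_WARP=128
  // and HS_BS_WARPS=16: count = 3000 gives warps_ru = 24, blocks = 1,
  // warps_mod = 8 and warps_mod_ru = 8, so count_padded_in = 24 * 128
  // = 3072 -- the residual warps are rounded up to a power of two
  //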
  //
  // more than a single block sort?
  //
  if (warps_ru > HS_BS_WARPS)
    {
      // more than one block
      uint32_t const blocks_lo = pow2_rd_u32(blocks);
      uint32_t const warps_lo  = blocks_lo * HS_BS_WARPS;
      uint32_t const warps_rem = warps_ru - warps_lo;

      if (warps_rem > 0)
        {
          uint32_t const warps_rem_ru     = pow2_ru_u32(warps_rem);
          uint32_t const warps_hi         = MAX_MACRO(warps_rem_ru,blocks_lo << HS_FM_BLOCKS_LOG2_1);
          uint32_t const warps_padded_out = MIN_MACRO(warps_lo+warps_hi,warps_lo*2); // clamp non-pow2 blocks

          *count_padded_out = warps_padded_out * HS_KEYS_PER_WARP;
        }
    }
}
//
//
//
void
hs_sort(cl_command_queue cq, // out-of-order cq
        cl_mem           vin,
        cl_mem           vout,
        uint32_t const   count,
        uint32_t const   count_padded_in,
        uint32_t const   count_padded_out,
        bool const       linearize)
{
#ifndef NDEBUG
  events_count = 0;
#endif

  //
  // FIXME -- get rid of this vestigial structure
  //
  struct hs_state state = { .vin = vin, .vout = vout };

  // how many rounded-up key slabs are there?
  state.bx.ru = (count + HS_KEYS_PER_WARP - 1) / HS_KEYS_PER_WARP;

  //
  // init padding with max-valued keys
  //
  bool const split  = state.vout != state.vin; // FIXME -- careful, this comparison might not always be correct
  bool       keyset = false;

  if (!split)
    {
      uint32_t const vin_span = count_padded_out - count;

      if (vin_span > 0)
        {
          hs_keyset_launcher(state.vin,
                             count,vin_span,
                             cq);
          keyset = true;
        }
    }
  else
    {
      uint32_t const vin_span = count_padded_in - count;

      if (vin_span > 0)
        {
          hs_keyset_launcher(state.vin,
                             count,vin_span,
                             cq);
          keyset = true;
        }

      uint32_t const vout_span = count_padded_out - count_padded_in;

      if (vout_span > 0)
        {
          hs_keyset_launcher(state.vout,
                             count_padded_in,vout_span,
                             cq);
          keyset = true;
        }
    }

  if (keyset)
    {
      hs_barrier(cq);
    }

  //
  // sort blocks
  //
  uint32_t const warps_in = count_padded_in / HS_KEYS_PER_WARP;

  hs_bs_launcher(&state,warps_in,cq);

  hs_barrier(cq);

  //
  // we're done if only a single bs kernel block was required
  //
  if (state.bx.ru > HS_BS_WARPS)
    {
      //
      // otherwise... merge sorted spans of warps until done
      //
      uint32_t up_scale_log2 = 1;

      while (true)
        {
          uint32_t down_warps;

          // flip-merge warps -- returns the span of warps that must be cleaned
          uint32_t down_warps_log2 = hs_fm_launcher(&state,
                                                    up_scale_log2,
                                                    &down_warps,
                                                    cq);
          hs_barrier(cq);

          // while the span is wider than the largest slab block cleaner, half-merge
          while (down_warps_log2 > HS_BC_WARPS_LOG2_MAX)
            {
              down_warps_log2 = hs_hm_launcher(&state,
                                               down_warps,
                                               down_warps_log2,
                                               cq);
              hs_barrier(cq);
            }

          // launch the clean slab grid
          hs_bc_launcher(&state,
                         down_warps,
                         down_warps_log2,
                         cq);
          hs_barrier(cq);

          // was this the final block clean?
          if (((uint32_t)HS_BS_WARPS << up_scale_log2) >= state.bx.ru)
            break;

          // otherwise, merge twice as many slabs
          up_scale_log2 += 1;
        }
    }

  if (linearize)
    {
      // transpose the slabs back into linear order
      hs_transpose_launcher(&state,cq);
      hs_barrier(cq);
    }

  HS_EVENT_PROFILE(cq);
}
//
//
//
void
hs_create(cl_context             context,
          cl_device_id           device_id,
          struct hs_info * const info)
{
  //
  // create and build the program from source or a precompiled binary
  //
  if (info != NULL)
    {
      info->words = HS_KEY_WORDS;
      info->keys  = HS_KEYS_PER_LANE;
      info->lanes = HS_LANES_PER_WARP;
    }

#if defined( HS_KERNEL_SOURCE )

  cl_int err;

  size_t const strings_sizeof[] = { sizeof(hs_cl_pre_cl) };
  char const * strings[]        = { (char*)hs_cl_pre_cl };

  cl_program program = clCreateProgramWithSource(context,
                                                 1,
                                                 strings,
                                                 strings_sizeof,
                                                 &err);
  cl_ok(err);

  char const * const options =
    "-cl-std=CL2.0 -cl-fast-relaxed-math "
    "-cl-no-signed-zeros -cl-mad-enable "
    "-cl-denorms-are-zero "
    "-cl-kernel-arg-info";

  cl(BuildProgram(program,
                  1,
                  &device_id,
                  options,
                  NULL,
                  NULL));

#elif defined( HS_KERNEL_BINARY )

  cl_int status, err;

  size_t        const bins_sizeof[] = { sizeof(hs_cl_pre_ir) };
  unsigned char const * bins[]      = { hs_cl_pre_ir };

  cl_program program = clCreateProgramWithBinary(context,
                                                 1,
                                                 &device_id,
                                                 bins_sizeof,
                                                 bins,
                                                 &status,
                                                 &err);
  cl_ok(err);

  cl(BuildProgram(program,
                  1,
                  &device_id,
                  NULL,
                  NULL,
                  NULL));

#endif

  //
  // create all the kernels and release the program
  //
#define HS_CREATE_KERNELS(ks)                                   \
  for (uint32_t ii=0; ii<ARRAY_LENGTH(ks); ii++) {              \
    ks[ii].kernel = clCreateKernel(program,ks[ii].name,&err);   \
    cl_ok(err);                                                 \
  }

  HS_CREATE_KERNELS(bs_kernels);
  HS_CREATE_KERNELS(bc_kernels);
  HS_CREATE_KERNELS(fm_kernels);
  HS_CREATE_KERNELS(hm_kernels);
  HS_CREATE_KERNELS(transpose_kernels);

  cl(ReleaseProgram(program));
}
//
//
//
void
hs_release(void)
{
#define HS_RELEASE_KERNELS(ks)                          \
  for (uint32_t ii=0; ii<ARRAY_LENGTH(ks); ii++)        \
    cl(ReleaseKernel(ks[ii].kernel))

  HS_RELEASE_KERNELS(bs_kernels);
  HS_RELEASE_KERNELS(bc_kernels);
  HS_RELEASE_KERNELS(fm_kernels);
  HS_RELEASE_KERNELS(hm_kernels);
  HS_RELEASE_KERNELS(transpose_kernels);
}
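
//
// a minimal usage sketch (illustrative only -- context/queue setup,
// buffer flags and error handling are assumptions, not part of this
// file): pad the key count first, size the buffers to the padded
// counts, then sort
//
//   struct hs_info info;
//   hs_create(context,device_id,&info);
//
//   uint32_t padded_in, padded_out;
//   hs_pad(count,&padded_in,&padded_out);
//
//   cl_mem vin  = clCreateBuffer(context,CL_MEM_READ_WRITE,
//                                padded_in  * sizeof(HS_KEY_TYPE),NULL,&err);
//   cl_mem vout = clCreateBuffer(context,CL_MEM_READ_WRITE,
//                                padded_out * sizeof(HS_KEY_TYPE),NULL,&err);
//
//   // ... write 'count' keys into vin ...
//
//   hs_sort(cq,vin,vout,count,padded_in,padded_out,true);
//   cl(Finish(cq));
//
//   hs_release();
//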
//
//
//