/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "path.h"
#include "block_pool_cl.h"
#include "path_builder_cl_12.h"
#include "kernel_cl_12.h"
//
//
//
#if 0
//
// SIMD AVX2
//
#define SKC_PATHS_COPY_ELEM_WORDS 8
#define SKC_PATHS_COPY_SUBGROUP_SIZE 1
#define SKC_PATHS_COPY_KERNEL_ATTRIBS
typedef skc_uint8 skc_paths_copy_elem;
typedef skc_uint8 skc_pb_idx_v;
#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8()
#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + (I) >= SKC_PATH_HEAD_WORDS)
#endif
//
//
//
#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
// FIXME -- use SUBGROUP terminology everywhere
#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
//
//
//
#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \
(SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \
(SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
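//
// Worked example with purely illustrative values (the real values are
// defined elsewhere): if SKC_PATHS_COPY_ELEM_WORDS = 1,
// SKC_PATHS_COPY_SUBGROUP_SIZE = 8 and SKC_PATH_HEAD_WORDS = 12 then:
//
//   SKC_PATHS_COPY_SUBGROUP_WORDS         = 8 * 1              = 8
//   SKC_PATHS_COPY_ELEMS_BEFORE_HEADER    = 8 * ((12/1) / 8)   = 8
//   SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER = 8 * ((12+8-1) / 8) = 16
//
// i.e. elems [0,8) are entirely header words, elems [8,16) straddle
// the end of the header, and elems from 16 up are treated like a node
//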
//
//
//
//
// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
//
#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
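//
// A sketch of the layout implied by the shifts and masks above
// (illustrative, not a normative definition): the tag sits in the low
// SKC_TAGGED_BLOCK_ID_BITS_TAG bits, the rolling counter occupies the
// bits at and above SKC_CMD_PATHS_COPY_ONE_BITS, and the
// SKC_DEVICE_SUBBLOCK_WORDS_LOG2 bits in between are kept by
// UPDATE_ROLLING
//
#if 0
// 'ti' is a tagged block id and 'b' a block pool id -- both are
// placeholders for illustration only
skc_uint const tag     = SKC_CMD_PATHS_COPY_GET_TAG(ti);          // low tag bits
skc_uint const rolling = SKC_CMD_PATHS_COPY_GET_ROLLING(ti);      // ti >> ONE_BITS
skc_uint const remap   = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b); // low bits kept, b installed above the tag
#endif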
//
//
//
skc_uint
skc_sub_group_local_id()
{
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
return get_sub_group_local_id();
#else
return 0;
#endif
}
//
// convert an atomic read counter offset to a block id
//
skc_block_id_t
skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
skc_uint const bp_idx_mask,
skc_uint const bp_reads,
skc_uint const bp_off)
{
skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
return bp_ids[bp_idx];
}
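//
// Illustrative arithmetic only (the ring size here is an assumed
// value): with a 1024-entry pow2 ring, bp_idx_mask = 0x3FF, so
// bp_reads = 1030 and bp_off = 5 wrap to:
//
//   bp_idx = (1030 + 5) & 0x3FF = 11
//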
//
//
//
void
skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to
skc_uint const bp_elems_idx,
__global skc_paths_copy_elem const * const pb_elems, // from
skc_uint const pb_elems_idx)
{
for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
{
(bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
}
#if 0
//
// NOTE THIS IS PRINTING 8 ROWS
//
printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
(skc_uint)get_global_id(0),pb_elems_idx,
as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
(skc_uint)get_global_id(0),pb_elems_idx,
as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
#endif
}
//
//
//
void
skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to
skc_uint const bp_elems_idx,
__global skc_block_id_t const * const bp_ids,
skc_uint const bp_reads,
skc_uint const bp_idx_mask,
__global skc_paths_copy_elem const * const pb_elems, // from
skc_uint const pb_elems_idx,
skc_uint const pb_rolling)
{
//
// remap block id tags from the host-side rolling counter to a
// device-side block pool id
//
for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
{
// load block_id_tag words
skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];
// calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
// FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
//
// FIXME -- SIMD can be fully parallelized since a bp_ids[] load
// will _always_ be safe as long as we don't use the loaded
// value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
// of iterating over the vector components.
//
// only convert if original elem is not invalid
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
skc_block_id_t const b = bp_ids[bp_idx C]; \
elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
}
// printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
SKC_PATHS_COPY_ELEM_EXPAND();
// store the elem back
(bp_elems+bp_elems_idx)[ii] = elem;
}
}
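//
// For reference, a minimal sketch of what one expansion of the
// SKC_EXPAND_X above performs in the scalar configuration
// (SKC_PATHS_COPY_ELEM_WORDS == 1, so the component suffix C is
// empty):
//
#if 0
if (elem != SKC_TAGGED_BLOCK_ID_INVALID)
  {
    // fetch the allocated device-side block id for this tagged id
    skc_block_id_t const b = bp_ids[bp_idx];

    // swap the host-side rolling counter for the block id
    elem = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem,b);
  }
#endif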
//
//
//
void
skc_host_map_update(__global skc_uint * const host_map,
skc_uint const block,
skc_paths_copy_elem const elem)
{
//
// write first elem to map -- FIXME -- this is a little nasty
// because it relies on the host handle always being the first
// word in the path header.
//
// OTOH, this is not unreasonable. The alternative is to have a
// separate kernel initializing the map.
//
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
#endif
{
#if SKC_PATHS_COPY_ELEM_WORDS == 1
host_map[elem] = block;
#if 0
printf("[%u] = %u\n",elem,block);
#endif
#else
host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
#endif
}
}
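//
// Net effect, shown for the scalar case: the path's host handle --
// header word SKC_PATH_HEAD_OFFSET_HANDLE -- now maps to its head
// block, i.e. host_map[handle] = block
//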
//
//
//
void
skc_copy_head(__global skc_uint * const host_map,
skc_uint const block,
__global skc_paths_copy_elem * const bp_elems, // to
skc_uint const bp_elems_idx,
__global skc_block_id_t const * const bp_ids,
skc_uint const bp_reads,
skc_uint const bp_idx_mask,
__global skc_paths_copy_elem const * const pb_elems, // from
skc_uint const pb_elems_idx,
skc_uint const pb_rolling)
{
//
// if there are more path header words than there are threads in
// the subgroup then the initial strides are simple copies
//
#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
{
skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
(bp_elems+bp_elems_idx)[ii] = elem;
if (ii == 0) {
skc_host_map_update(host_map,block,elem);
}
}
#endif
//
// this is similar to copying a node except that words still within
// the first H words of the path header are copied unmodified
//
for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
{
skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
if (ii == 0) {
skc_host_map_update(host_map,block,elem);
}
#endif
// calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
//
// FIXME -- SIMD can be fully parallelized since a bp_ids[] load
// will _always_ be safe as long as we don't use the loaded
// value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
// of iterating over the vector components.
//
// FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
// FIXME -- MIX MIX MIX MIX / SELECT
// only convert if original elem is not invalid
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
skc_block_id_t const b = bp_ids[bp_idx C]; \
elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
}
// printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
SKC_PATHS_COPY_ELEM_EXPAND();
// store the elem back
(bp_elems+bp_elems_idx)[ii] = elem;
}
//
// the remaining words are treated like a node
//
for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
{
// load block_id_tag words
skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
// calculate ahead of time
skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
//
// FIXME -- SIMD can be fully parallelized since a bp_ids[] load
// will _always_ be safe as long as we don't use the loaded
// value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
// of iterating over the vector components.
//
// FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
// only convert if original elem is not invalid
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
skc_block_id_t const b = bp_ids[bp_idx C]; \
elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
}
// printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
SKC_PATHS_COPY_ELEM_EXPAND();
// store the elem
(bp_elems+bp_elems_idx)[ii] = elem;
}
}
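//
// Recap of the head-block handling above (element ranges per the
// macros near the top of this file):
//
//   [ 0, ELEMS_BEFORE_HEADER )                      : header words, copied as-is
//   [ ELEMS_BEFORE_HEADER, ELEMS_INCLUDING_HEADER ) : header tail + first block ids,
//                                                     guarded by SKC_IS_NOT_PATH_HEAD
//   [ ELEMS_INCLUDING_HEADER, ELEMS_PER_BLOCK )     : node-like, valid ids remapped
//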
//
// FIXME -- pack some of these constant integer args in a vec or struct
//
__kernel
SKC_PATHS_COPY_KERNEL_ATTRIBS
void
skc_kernel_paths_copy
(__global skc_uint * const host_map,
__global skc_block_id_t const * const bp_ids,
__global skc_paths_copy_elem * const bp_elems,
skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring
__global skc_uint const * const bp_alloc, // block pool ring base
skc_uint const bp_alloc_idx, // which subbuf
__global union skc_tagged_block_id const * const pb_cmds,
__global skc_paths_copy_elem const * const pb_elems,
skc_uint const pb_size, // # of commands/blocks in buffer
skc_uint const pb_rolling, // shifted rolling counter base
skc_uint const pb_prev_from,
skc_uint const pb_prev_span,
skc_uint const pb_curr_from)
{
//
// THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
//
// - HEAD
// - NODE
// - SEGS
//
// THESE ARE SUBGROUP ORIENTED KERNELS
//
// A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
//
//
// It's likely that peak bandwidth is achievable with a single
// workgroup.
//
// So let's keep the grids modestly sized and, for simplicity and
// portability, assume that a single workgroup can perform all steps
// in the copy.
//
// Launch as large a workgroup as possible.
//
// 1. ATOMICALLY ALLOCATE BLOCKS FROM POOL
// 2. CONVERT COMMANDS INTO BLOCK OFFSETS
// 3. FOR EACH COMMAND:
// - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
// - NODE: CONVERT AND COPY B INDICES
// - SEGS: BULK COPY
//
// B : number of words in block -- always pow2
// W : intelligently/arbitrarily chosen factor of B -- always pow2
//
//
// There are several approaches to processing the commands:
//
// 1. B threads are responsible for one block. All threads broadcast
// load a single command word. Workgroup size must be a factor of
// B.
//
// 2. W threads process an entire block. W will typically be the
// device's subgroup/warp/wave width. W threads broadcast load a
// single command word.
//
// 3. W threads process W blocks. W threads load W command words and
// process W blocks.
//
// Clearly (1) has low I/O intensity but will achieve high
// parallelism by activating as many threads as possible. The
// downside of this approach is that the kernel will occupy even a
// large GPU with low-intensity work and reduce opportunities for
// concurrent execution of other kernels.
//
// See Vasily Volkov's CUDA presentation describing these tradeoffs.
//
// Note that there are many other approaches. For example, similar
// to (1) but each thread loads a pow2 vector of block data.
//
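// The code below essentially follows approach (2): each subgroup of
// SKC_PATHS_COPY_SUBGROUP_SIZE threads broadcast-loads one command
// and processes the single block that command describes. With an
// illustrative subgroup size of 8, gids 0..7 form sg_idx 0 and handle
// the first command of the launch, gids 8..15 form sg_idx 1 and
// handle the second, and so on.
//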
// load the copied atomic read "base" from gmem
skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
// will always be less than 2^32
skc_uint const gid = get_global_id(0);
// every subgroup/simd that will work on the block loads the same command
skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
// path builder data can be spread across two spans
skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
// no need to make this branchless
if (pb_idx >= pb_size)
pb_idx -= pb_size;
// broadcast load the command
union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];
// what do we want to do with this block?
skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
// compute offset from rolling base to get index into block pool ring allocation
skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
// convert the pb_cmd's offset counter to a block id
skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
#if 0
if (get_sub_group_local_id() == 0) {
printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
printf("< %8u >\n",block);
}
#endif
// FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
// calculate bp_elems (to) / pb_elems (from)
skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid;
if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
{
#if 0
if (tid == 0)
printf("%3u, segs\n",bp_off);
#endif
skc_copy_segs(bp_elems,
bp_elems_idx,
pb_elems,
pb_elems_idx);
}
else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
{
#if 0
if (tid == 0)
printf("%3u, NODE\n",bp_off);
#endif
skc_copy_node(bp_elems, // to
bp_elems_idx,
bp_ids,
bp_reads,
bp_idx_mask,
pb_elems, // from
pb_elems_idx,
pb_rolling);
}
else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
{
#if 0
if (tid == 0)
printf("%3u, HEAD\n",bp_off);
#endif
skc_copy_head(host_map,
block,
bp_elems, // to
bp_elems_idx,
bp_ids,
bp_reads,
bp_idx_mask,
pb_elems, // from
pb_elems_idx,
pb_rolling);
}
}
//
//
//
__kernel
SKC_PATHS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
__global skc_uint * const bp_alloc,
skc_uint const bp_alloc_idx,
skc_uint const pb_cmd_count)
{
//
// allocate blocks in block pool
//
skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
// store in slot
bp_alloc[bp_alloc_idx] = reads;
#if 0
printf("pc: %8u + %u\n",reads,pb_cmd_count);
#endif
}
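//
// How the two kernels cooperate (a recap of the code above):
// skc_kernel_paths_alloc reserves pb_cmd_count block ids by bumping
// the READS atomic and parks the pre-increment value in
// bp_alloc[bp_alloc_idx]; skc_kernel_paths_copy later reloads that
// value as bp_reads and indexes the bp_ids ring with
// (bp_reads + offset) & bp_idx_mask.
//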
//
//
//