| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include "path.h" |
| #include "block_pool_cl.h" |
| #include "path_builder_cl_12.h" |
| #include "kernel_cl_12.h" |
| |
| // |
| // |
| // |
| |
| #if 0 |
| |
| // |
| // SIMD AVX2 |
| // |
| |
| #define SKC_PATHS_COPY_WORDS_PER_ELEM 8 |
| #define SKC_PATHS_COPY_SUBGROUP_SIZE 1 |
| #define SKC_PATHS_COPY_KERNEL_ATTRIBS |
| |
| typedef skc_uint8 skc_paths_copy_elem; |
| typedef skc_uint8 skc_pb_idx_v; |
| |
| #define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() |
| |
| #define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + (I) >= SKC_PATH_HEAD_WORDS) |
| |
| #endif |
| |
| // |
| // |
| // |
| |
| #define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) |
| #define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) |
| #define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) |
| #define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) |
| |
| // FIXME -- use SUBGROUP terminology everywhere |
| #define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) |
| |
| // |
| // |
| // |
| |
| #define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ |
| (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) |
| |
| #define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ |
| (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) |
| |
| // #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) |
| |
| // |
| // |
| // |
| |
| // |
| // BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL |
| // |
| |
| #define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) |
| |
| #define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) |
| |
| #define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) |
| |
| #define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) |
| |
| #define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) |
| |
| #define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) |
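| |
| // |
| // A worked example of this encoding -- the widths below are hypothetical |
| // since SKC_TAGGED_BLOCK_ID_BITS_TAG and SKC_DEVICE_SUBBLOCK_WORDS_LOG2 are |
| // defined elsewhere. Assuming 5 tag bits and 16-word subblocks (log2 == 4): |
| // |
| //   SKC_CMD_PATHS_COPY_ONE_BITS == 5 + 4 == 9 |
| //   SKC_CMD_PATHS_COPY_ONE      == 1 << 9 |
| //   SKC_CMD_PATHS_COPY_ONE_MASK == 0x1FF |
| // |
| //   For a command word ti == (rolling << 9) | tag: |
| // |
| //     SKC_CMD_PATHS_COPY_GET_ROLLING(ti) == rolling |
| //     SKC_CMD_PATHS_COPY_GET_TAG(ti)     == tag (per SKC_TAGGED_BLOCK_ID_GET_TAG) |
| // |
| //   SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) keeps the low 9 bits of ti and |
| //   ORs in the device-side block id b shifted above the tag bits. |
| // |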
| |
| // |
| // |
| // |
| |
| skc_uint |
| skc_sub_group_local_id() |
| { |
| #if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 |
| return get_sub_group_local_id(); |
| #else |
| return 0; |
| #endif |
| } |
| |
| // |
| // convert an atomic read counter offset to a block id |
| // |
| |
| skc_block_id_t |
| skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, |
| skc_uint const bp_idx_mask, |
| skc_uint const bp_reads, |
| skc_uint const bp_off) |
| { |
| skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; |
| |
| return bp_ids[bp_idx]; |
| } |
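| |
| // |
| // For illustration (hypothetical values): with a 16-entry block pool |
| // ring (bp_idx_mask == 0xF), bp_reads == 30 and bp_off == 5, the index |
| // wraps around the pow2 ring: |
| // |
| //   bp_idx == (30 + 5) & 0xF == 3   -->   returns bp_ids[3] |
| // |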
| |
| // |
| // |
| // |
| |
| void |
| skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to |
| skc_uint const bp_elems_idx, |
| __global skc_paths_copy_elem const * const pb_elems, // from |
| skc_uint const pb_elems_idx) |
| { |
| for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii]; |
| } |
| |
| #if 0 |
| // |
| // NOTE THIS IS PRINTING 8 ROWS |
| // |
| printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", |
| (skc_uint)get_global_id(0),pb_elems_idx, |
| as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE])); |
| printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", |
| (skc_uint)get_global_id(0),pb_elems_idx, |
| as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE])); |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| void |
| skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to |
| skc_uint const bp_elems_idx, |
| __global skc_block_id_t const * const bp_ids, |
| skc_uint const bp_reads, |
| skc_uint const bp_idx_mask, |
| __global skc_paths_copy_elem const * const pb_elems, // from |
| skc_uint const pb_elems_idx, |
| skc_uint const pb_rolling) |
| { |
| // |
| // remap block id tags from the host-side rolling counter to a |
| // device-side block pool id |
| // |
| for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| // load block_id_tag words |
| skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii]; |
| |
| // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid |
| skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; |
| |
| // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS |
| |
| // |
| // FIXME -- SIMD can be fully parallelized since a bp_ids[] load |
| // will _always_ be safe as long as we don't use the loaded |
| // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead |
| // of iterating over the vector components. |
| // |
| |
| // only convert if original elem is not invalid |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ |
| skc_block_id_t const b = bp_ids[bp_idx C]; \ |
| elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ |
| } |
| |
| // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C); |
| |
| SKC_PATHS_COPY_ELEM_EXPAND(); |
| |
| // store the elem back |
| (bp_elems+bp_elems_idx)[ii] = elem; |
| } |
| } |
| |
| // |
| // |
| // |
| |
| void |
| skc_host_map_update(__global skc_uint * const host_map, |
| skc_uint const block, |
| skc_paths_copy_elem const elem) |
| { |
| // |
| // write first elem to map -- FIXME -- this is a little nasty |
| // because it relies on the host handle always being the first |
| // word in the path header. |
| // |
| // OTOH, this is not unreasonable. The alternative is to have a |
| // separate kernel initializing the map. |
| // |
| #if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 |
| if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) |
| #endif |
| { |
| #if SKC_PATHS_COPY_ELEM_WORDS == 1 |
| host_map[elem] = block; |
| #if 0 |
| printf("[%u] = %u\n",elem,block); |
| #endif |
| #else |
| host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; |
| #endif |
| } |
| } |
| |
| // |
| // |
| // |
| |
| void |
| skc_copy_head(__global skc_uint * const host_map, |
| skc_uint const block, |
| __global skc_paths_copy_elem * const bp_elems, // to |
| skc_uint const bp_elems_idx, |
| __global skc_block_id_t const * const bp_ids, |
| skc_uint const bp_reads, |
| skc_uint const bp_idx_mask, |
| __global skc_paths_copy_elem const * const pb_elems, // from |
| skc_uint const pb_elems_idx, |
| skc_uint const pb_rolling) |
| { |
| // |
| // if the path header spans more than one subgroup's worth of words |
| // then the leading header words can simply be copied verbatim |
| // |
| #if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) |
| for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii]; |
| |
| (bp_elems+bp_elems_idx)[ii] = elem; |
| |
| if (ii == 0) { |
| skc_host_map_update(host_map,block,elem); |
| } |
| } |
| #endif |
| |
| // |
| // this is similar to copy node but the first H words of the path |
| // header are not modified and are simply copied |
| // |
| for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; |
| |
| #if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 ) |
| if (ii == 0) { |
| skc_host_map_update(host_map,block,elem); |
| } |
| #endif |
| // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid |
| skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; |
| |
| // |
| // FIXME -- SIMD can be fully parallelized since a bp_ids[] load |
| // will _always_ be safe as long as we don't use the loaded |
| // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead |
| // of iterating over the vector components. |
| // |
| |
| // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS |
| |
| // FIXME -- MIX MIX MIX MIX / SELECT |
| |
| // only convert if original elem is not invalid |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \ |
| skc_block_id_t const b = bp_ids[bp_idx C]; \ |
| elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ |
| } |
| |
| // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C); |
| |
| SKC_PATHS_COPY_ELEM_EXPAND(); |
| |
| // store the elem back |
| (bp_elems+bp_elems_idx)[ii] = elem; |
| } |
| |
| // |
| // the remaining words are treated like a node |
| // |
| for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| // load block_id_tag words |
| skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; |
| |
| // calculate ahead of time |
| skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; |
| |
| // |
| // FIXME -- SIMD can be fully parallelized since a bp_ids[] load |
| // will _always_ be safe as long as we don't use the loaded |
| // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead |
| // of iterating over the vector components. |
| // |
| |
| // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS |
| |
| // only convert if original elem is not invalid |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ |
| skc_block_id_t const b = bp_ids[bp_idx C]; \ |
| elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ |
| } |
| |
| // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C); |
| |
| SKC_PATHS_COPY_ELEM_EXPAND(); |
| |
| // store the elem |
| (bp_elems+bp_elems_idx)[ii] = elem; |
| } |
| } |
| |
| // |
| // FIXME -- pack some of these constant integer args in a vec or struct |
| // |
| |
| __kernel |
| SKC_PATHS_COPY_KERNEL_ATTRIBS |
| void |
| skc_kernel_paths_copy |
| (__global skc_uint * const host_map, |
| |
| __global skc_block_id_t const * const bp_ids, |
| __global skc_paths_copy_elem * const bp_elems, |
| skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring |
| |
| __global skc_uint const * const bp_alloc, // block pool ring base |
| skc_uint const bp_alloc_idx,// which subbuf |
| |
| __global union skc_tagged_block_id const * const pb_cmds, |
| __global skc_paths_copy_elem const * const pb_elems, |
| |
| skc_uint const pb_size, // # of commands/blocks in buffer |
| skc_uint const pb_rolling, // shifted rolling counter base |
| |
| skc_uint const pb_prev_from, |
| skc_uint const pb_prev_span, |
| skc_uint const pb_curr_from) |
| { |
| // |
| // THERE ARE 3 TYPES OF PATH COPYING COMMANDS: |
| // |
| // - HEAD |
| // - NODE |
| // - SEGS |
| // |
| // THESE ARE SUBGROUP ORIENTED KERNELS |
| // |
| // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS |
| // |
| |
| // |
| // It's likely that peak bandwidth is achievable with a single |
| // workgroup. |
| // |
| // So let's keep the grids modestly sized and for simplicity and |
| // portability, let's assume that a single workgroup can perform all |
| // steps in the copy. |
| // |
| // Launch as large a workgroup as possible. |
| // |
| // 1. ATOMICALLY ALLOCATE BLOCKS FROM THE POOL |
| // 2. CONVERT COMMANDS INTO BLOCK OFFSETS |
| // 3. FOR EACH COMMAND: |
| // - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES. |
| // - NODE: CONVERT AND COPY B INDICES |
| // - SEGS: BULK COPY |
| // |
| // B : number of words in block -- always pow2 |
| // W : intelligently/arbitrarily chosen factor of B -- always pow2 |
| // |
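| // As a hypothetical instance of the above (actual values come from the |
| // device configuration headers): with B == 256 block words, one-word |
| // elements and W == 8, each subgroup lane steps through 256 / 8 == 32 |
| // elements per block, which is what SKC_PATHS_COPY_ELEMS_PER_THREAD |
| // works out to. |
| // |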
| |
| // |
| // There are several approaches to processing the commands: |
| // |
| // 1. B threads are responsible for one block. All threads broadcast |
| // load a single command word. Workgroup size must be a factor of |
| // B. |
| // |
| // 2. W threads process an entire block. W will typically be the |
| // device's subgroup/warp/wave width. W threads broadcast load a |
| // single command word. |
| // |
| // 3. W threads process W blocks. W threads load W command words and |
| // process W blocks. |
| // |
| // Clearly (1) has low I/O intensity but will achieve high |
| // parallelism by activating as many threads as possible. The downside |
| // of this kind of approach is that the kernel will occupy even a |
| // large GPU with low intensity work and reduce opportunities for |
| // concurrent kernel execution (of other kernels). |
| // |
| // See Vasily Volkov's CUDA presentation describing these tradeoffs. |
| // |
| // Note that there are many other approaches. For example, similar |
| // to (1) but each thread loads a pow2 vector of block data. |
| // |
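| // As implemented below, this kernel takes the subgroup-per-block |
| // approach (2): each subgroup broadcast-loads one command and copies |
| // one block, while the disabled SIMD configuration at the top of this |
| // file would instead process a block on a single lane using 8-wide |
| // vector elements. |
| // |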
| |
| // load the copied atomic read "base" from gmem |
| skc_uint const bp_reads = bp_alloc[bp_alloc_idx]; |
| // will always be less than 2^32 |
| skc_uint const gid = get_global_id(0); |
| // every subgroup/simd that will work on the block loads the same command |
| skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE; |
| // path builder data can be spread across two spans |
| skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from); |
| |
| // no need to make this branchless |
| if (pb_idx >= pb_size) |
| pb_idx -= pb_size; |
| |
| // broadcast load the command |
| union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; |
| |
| // what do we want to do with this block? |
| skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); |
| |
| // compute offset from rolling base to get index into block pool ring allocation |
| skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); |
| |
| // convert the pb_cmd's offset counter to a block id |
| skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); |
| |
| #if 0 |
| if (get_sub_group_local_id() == 0) { |
| printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); |
| printf("< %8u >\n",block); |
| } |
| #endif |
| |
| // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() |
| skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; |
| |
| // calculate bp_elems (to) / pb_elems (from) |
| skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; |
| skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; |
| |
| if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) |
| { |
| #if 0 |
| if (tid == 0) |
| printf("%3u, segs\n",bp_off); |
| #endif |
| skc_copy_segs(bp_elems, |
| bp_elems_idx, |
| pb_elems, |
| pb_elems_idx); |
| } |
| else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) |
| { |
| #if 0 |
| if (tid == 0) |
| printf("%3u, NODE\n",bp_off); |
| #endif |
| skc_copy_node(bp_elems, // to |
| bp_elems_idx, |
| bp_ids, |
| bp_reads, |
| bp_idx_mask, |
| pb_elems, // from |
| pb_elems_idx, |
| pb_rolling); |
| } |
| else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) |
| { |
| #if 0 |
| if (tid == 0) |
| printf("%3u, HEAD\n",bp_off); |
| #endif |
| skc_copy_head(host_map, |
| block, |
| bp_elems, // to |
| bp_elems_idx, |
| bp_ids, |
| bp_reads, |
| bp_idx_mask, |
| pb_elems, // from |
| pb_elems_idx, |
| pb_rolling); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| __kernel |
| SKC_PATHS_ALLOC_KERNEL_ATTRIBS |
| void |
| skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, |
| __global skc_uint * const bp_alloc, |
| skc_uint const bp_alloc_idx, |
| skc_uint const pb_cmd_count) |
| { |
| // |
| // allocate blocks in block pool |
| // |
| skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); |
| |
| // store in slot |
| bp_alloc[bp_alloc_idx] = reads; |
| |
| #if 0 |
| printf("pc: %8u + %u\n",reads,pb_cmd_count); |
| #endif |
| } |
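| |
| // |
| // For illustration (hypothetical values): if the READS counter held 100 |
| // and pb_cmd_count == 12, the atomic_add above reserves ring entries |
| // 100..111 (wrapped by bp_idx_mask when read) and snapshots the base by |
| // storing bp_alloc[bp_alloc_idx] = 100, which skc_kernel_paths_copy |
| // later reloads as bp_reads. |
| // |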
| |
| // |
| // |
| // |