| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include "path.h" |
| #include "block_pool_cl.h" |
| #include "path_builder_cl_12.h" |
| #include "kernel_cl_12.h" |
| |
| // |
| // |
| // |
| |
| #if 0 |
| |
| // |
| // SIMD AVX2 |
| // |
| |
| #define SKC_PATHS_COPY_WORDS_PER_ELEM 8 |
| #define SKC_PATHS_COPY_SUBGROUP_SIZE 1 |
| #define SKC_PATHS_COPY_KERNEL_ATTRIBS |
| |
| typedef skc_uint8 skc_paths_copy_elem; |
| typedef skc_uint8 skc_pb_idx_v; |
| |
| #define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() |
| |
| #define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + (I) >= SKC_PATH_HEAD_WORDS) |
| |
| #endif |
| |
| // |
| // |
| // |
| |
| #define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) |
| #define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) |
| #define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) |
| #define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) |
| |
| // FIXME -- use SUBGROUP terminology everywhere |
| #define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) |
| |
| // |
| // |
| // |
| |
| #define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ |
| (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) |
| |
| #define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ |
| (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) |
| |
| // #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) |
| |
| // |
| // |
| // |
| |
| // |
| // BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL |
| // |
| |
| #define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) |
| |
| #define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) |
| |
| #define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) |
| |
| #define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) |
| |
| #define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) |
| |
| #define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) |
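| |
| // |
| // A worked example of this encoding -- the widths below are hypothetical |
| // since SKC_TAGGED_BLOCK_ID_BITS_TAG and SKC_DEVICE_SUBBLOCK_WORDS_LOG2 are |
| // defined elsewhere. Assuming 5 tag bits and 16-word subblocks (log2 == 4): |
| // |
| //   SKC_CMD_PATHS_COPY_ONE_BITS == 5 + 4 == 9 |
| //   SKC_CMD_PATHS_COPY_ONE      == 1 << 9 |
| //   SKC_CMD_PATHS_COPY_ONE_MASK == 0x1FF |
| // |
| //   For a command word ti == (rolling << 9) | tag: |
| // |
| //     SKC_CMD_PATHS_COPY_GET_ROLLING(ti) == rolling |
| //     SKC_CMD_PATHS_COPY_GET_TAG(ti)     == tag (per SKC_TAGGED_BLOCK_ID_GET_TAG) |
| // |
| //   SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) keeps the low 9 bits of ti and |
| //   ORs in the device-side block id b shifted above the tag bits. |
| // |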
| |
| // |
| // |
| // |
| |
| skc_uint |
| skc_sub_group_local_id() |
| { |
| #if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 |
| return get_sub_group_local_id(); |
| #else |
| return 0; |
| #endif |
| } |
| |
| // |
| // convert an atomic read counter offset to a block id |
| // |
| |
| skc_block_id_t |
| skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, |
| skc_uint const bp_idx_mask, |
| skc_uint const bp_reads, |
| skc_uint const bp_off) |
| { |
| skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; |
| |
| return bp_ids[bp_idx]; |
| } |
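| |
| // |
| // For illustration (hypothetical values): with a 16-entry block pool |
| // ring (bp_idx_mask == 0xF), bp_reads == 30 and bp_off == 5, the index |
| // wraps around the pow2 ring: |
| // |
| //   bp_idx == (30 + 5) & 0xF == 3   -->   returns bp_ids[3] |
| // |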
| |
| // |
| // |
| // |
| |
| void |
| skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to |
| skc_uint const bp_elems_idx, |
| __global skc_paths_copy_elem const * const pb_elems, // from |
| skc_uint const pb_elems_idx) |
| { |
| for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii]; |
| } |
| |
| #if 0 |
| // |
| // NOTE THIS IS PRINTING 8 ROWS |
| // |
| printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", |
| (skc_uint)get_global_id(0),pb_elems_idx, |
| as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE])); |
| printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", |
| (skc_uint)get_global_id(0),pb_elems_idx, |
| as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]), |
| as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE])); |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| void |
| skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to |
| skc_uint const bp_elems_idx, |
| __global skc_block_id_t const * const bp_ids, |
| skc_uint const bp_reads, |
| skc_uint const bp_idx_mask, |
| __global skc_paths_copy_elem const * const pb_elems, // from |
| skc_uint const pb_elems_idx, |
| skc_uint const pb_rolling) |
| { |
| // |
| // remap block id tags from the host-side rolling counter to a |
| // device-side block pool id |
| // |
| for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| // load block_id_tag words |
| skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii]; |
| |
| // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid |
| skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; |
| |
| // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS |
| |
| // |
| // FIXME -- SIMD can be fully parallelized since a bp_ids[] load |
| // will _always_ be safe as long as we don't use the loaded |
| // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead |
| // of iterating over the vector components. |
| // |
| |
| // only convert if original elem is not invalid |
| |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ |
| skc_block_id_t const b = bp_ids[bp_idx C]; \ |
| elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ |
| } |
| |
| // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C); |
| |
| SKC_PATHS_COPY_ELEM_EXPAND(); |
| |
| // store the elem back |
| (bp_elems+bp_elems_idx)[ii] = elem; |
| } |
| } |
| |
| // |
| // |
| // |
| |
| void |
| skc_host_map_update(__global skc_uint * const host_map, |
| skc_uint const block, |
| skc_paths_copy_elem const elem) |
| { |
| // |
| // write first elem to map -- FIXME -- this is a little nasty |
| // because it relies on the host handle always being the first |
| // word in the path header. |
| // |
| // OTOH, this is not unreasonable. The alternative is to have a |
| // separate kernel initializing the map. |
| // |
| #if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 |
| if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) |
| #endif |
| { |
| #if SKC_PATHS_COPY_ELEM_WORDS == 1 |
| host_map[elem] = block; |
| #if 0 |
| printf("[%u] = %u\n",elem,block); |
| #endif |
| #else |
| host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; |
| #endif |
| } |
| } |
| |
| // |
| // |
| // |
| |
| void |
| skc_copy_head(__global skc_uint * const host_map, |
| skc_uint const block, |
| __global skc_paths_copy_elem * const bp_elems, // to |
| skc_uint const bp_elems_idx, |
| __global skc_block_id_t const * const bp_ids, |
| skc_uint const bp_reads, |
| skc_uint const bp_idx_mask, |
| __global skc_paths_copy_elem const * const pb_elems, // from |
| skc_uint const pb_elems_idx, |
| skc_uint const pb_rolling) |
| { |
| // |
| // if the path header spans more than one subgroup's worth of words |
| // then the leading header words can simply be copied verbatim |
| // |
| #if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) |
| for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii]; |
| |
| (bp_elems+bp_elems_idx)[ii] = elem; |
| |
| if (ii == 0) { |
| skc_host_map_update(host_map,block,elem); |
| } |
| } |
| #endif |
| |
| // |
| // this is similar to copy node but the first H words of the path |
| // header are not modified and are simply copied |
| // |
| for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; |
| |
| #if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 ) |
| if (ii == 0) { |
| skc_host_map_update(host_map,block,elem); |
| } |
| #endif |
| // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid |
| skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; |
| |
| // |
| // FIXME -- SIMD can be fully parallelized since a bp_ids[] load |
| // will _always_ be safe as long as we don't use the loaded |
| // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead |
| // of iterating over the vector components. |
| // |
| |
| // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS |
| |
| // FIXME -- MIX MIX MIX MIX / SELECT |
| |
| // only convert if original elem is not invalid |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \ |
| skc_block_id_t const b = bp_ids[bp_idx C]; \ |
| elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ |
| } |
| |
| // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C); |
| |
| SKC_PATHS_COPY_ELEM_EXPAND(); |
| |
| // store the elem back |
| (bp_elems+bp_elems_idx)[ii] = elem; |
| } |
| |
| // |
| // the remaining words are treated like a node |
| // |
| for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) |
| { |
| // load block_id_tag words |
| skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; |
| |
| // calculate ahead of time |
| skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; |
| |
| // |
| // FIXME -- SIMD can be fully parallelized since a bp_ids[] load |
| // will _always_ be safe as long as we don't use the loaded |
| // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead |
| // of iterating over the vector components. |
| // |
| |
| // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS |
| |
| // only convert if original elem is not invalid |
| #undef SKC_EXPAND_X |
| #define SKC_EXPAND_X(I,S,C,P,R) \ |
| if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ |
| skc_block_id_t const b = bp_ids[bp_idx C]; \ |
| elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ |
| } |
| |
| // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C); |
| |
| SKC_PATHS_COPY_ELEM_EXPAND(); |
| |
| // store the elem |
| (bp_elems+bp_elems_idx)[ii] = elem; |
| } |
| } |
| |
| // |
| // FIXME -- pack some of these constant integer args in a vec or struct |
| // |
| |
| __kernel |
| SKC_PATHS_COPY_KERNEL_ATTRIBS |
| void |
| skc_kernel_paths_copy |
| (__global skc_uint * const host_map, |
| |
| __global skc_block_id_t const * const bp_ids, |
| __global skc_paths_copy_elem * const bp_elems, |
| skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring |
| |
| __global skc_uint const * const bp_alloc, // block pool ring base |
| skc_uint const bp_alloc_idx,// which subbuf |
| |
| __global union skc_tagged_block_id const * const pb_cmds, |
| __global skc_paths_copy_elem const * const pb_elems, |
| |
| skc_uint const pb_size, // # of commands/blocks in buffer |
| skc_uint const pb_rolling, // shifted rolling counter base |
| |
| skc_uint const pb_prev_from, |
| skc_uint const pb_prev_span, |
| skc_uint const pb_curr_from) |
| { |
| // |
| // THERE ARE 3 TYPES OF PATH COPYING COMMANDS: |
| // |
| // - HEAD |
| // - NODE |
| // - SEGS |
| // |
| // THESE ARE SUBGROUP ORIENTED KERNELS |
| // |
| // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS |
| // |
| |
| // |
| // It's likely that peak bandwidth is achievable with a single |
| // workgroup. |
| // |
| // So let's keep the grids modestly sized and for simplicity and |
| // portability, let's assume that a single workgroup can perform all |
| // steps in the copy. |
| // |
| // Launch as large a workgroup as possible. |
| // |
| // 1. ATOMICALLY ALLOCATE BLOCKS FROM THE POOL |
| // 2. CONVERT COMMANDS INTO BLOCK OFFSETS |
| // 3. FOR EACH COMMAND: |
| // - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES. |
| // - NODE: CONVERT AND COPY B INDICES |
| // - SEGS: BULK COPY |
| // |
| // B : number of words in block -- always pow2 |
| // W : intelligently/arbitrarily chosen factor of B -- always pow2 |
| // |
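| // As a hypothetical instance of the above (actual values come from the |
| // device configuration headers): with B == 256 block words, one-word |
| // elements and W == 8, each subgroup lane steps through 256 / 8 == 32 |
| // elements per block, which is what SKC_PATHS_COPY_ELEMS_PER_THREAD |
| // works out to. |
| // |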
| |
| // |
| // There are several approaches to processing the commands: |
| // |
| // 1. B threads are responsible for one block. All threads broadcast |
| // load a single command word. Workgroup size must be a factor of |
| // B. |
| // |
| // 2. W threads process an entire block. W will typically be the |
| // device's subgroup/warp/wave width. W threads broadcast load a |
| // single command word. |
| // |
| // 3. W threads process W blocks. W threads load W command words and |
| // process W blocks. |
| // |
| // Clearly (1) has low I/O intensity but will achieve high |
| // parallelism by activating as many threads as possible. The downside |
| // of this kind of approach is that the kernel will occupy even a |
| // large GPU with low intensity work and reduce opportunities for |
| // concurrent kernel execution (of other kernels). |
| // |
| // See Vasily Volkov's CUDA presentation describing these tradeoffs. |
| // |
| // Note that there are many other approaches. For example, similar |
| // to (1) but each thread loads a pow2 vector of block data. |
| // |
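| // As implemented below, this kernel takes the subgroup-per-block |
| // approach (2): each subgroup broadcast-loads one command and copies |
| // one block, while the disabled SIMD configuration at the top of this |
| // file would instead process a block on a single lane using 8-wide |
| // vector elements. |
| // |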
| |
| // load the copied atomic read "base" from gmem |
| skc_uint const bp_reads = bp_alloc[bp_alloc_idx]; |
| // will always be less than 2^32 |
| skc_uint const gid = get_global_id(0); |
| // every subgroup/simd that will work on the block loads the same command |
| skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE; |
| // path builder data can be spread across two spans |
| skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from); |
| |
| // no need to make this branchless |
| if (pb_idx >= pb_size) |
| pb_idx -= pb_size; |
| |
| // broadcast load the command |
| union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; |
| |
| // what do we want to do with this block? |
| skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); |
| |
| // compute offset from rolling base to get index into block pool ring allocation |
| skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); |
| |
| // convert the pb_cmd's offset counter to a block id |
| skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); |
| |
| #if 0 |
| if (get_sub_group_local_id() == 0) { |
| printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); |
| printf("< %8u >\n",block); |
| } |
| #endif |
| |
| // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() |
| skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; |
| |
| // calculate bp_elems (to) / pb_elems (from) |
| skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; |
| skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; |
| |
| if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) |
| { |
| #if 0 |
| if (tid == 0) |
| printf("%3u, segs\n",bp_off); |
| #endif |
| skc_copy_segs(bp_elems, |
| bp_elems_idx, |
| pb_elems, |
| pb_elems_idx); |
| } |
| else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) |
| { |
| #if 0 |
| if (tid == 0) |
| printf("%3u, NODE\n",bp_off); |
| #endif |
| skc_copy_node(bp_elems, // to |
| bp_elems_idx, |
| bp_ids, |
| bp_reads, |
| bp_idx_mask, |
| pb_elems, // from |
| pb_elems_idx, |
| pb_rolling); |
| } |
| else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) |
| { |
| #if 0 |
| if (tid == 0) |
| printf("%3u, HEAD\n",bp_off); |
| #endif |
| skc_copy_head(host_map, |
| block, |
| bp_elems, // to |
| bp_elems_idx, |
| bp_ids, |
| bp_reads, |
| bp_idx_mask, |
| pb_elems, // from |
| pb_elems_idx, |
| pb_rolling); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| __kernel |
| SKC_PATHS_ALLOC_KERNEL_ATTRIBS |
| void |
| skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, |
| __global skc_uint * const bp_alloc, |
| skc_uint const bp_alloc_idx, |
| skc_uint const pb_cmd_count) |
| { |
| // |
| // allocate blocks in block pool |
| // |
| skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); |
| |
| // store in slot |
| bp_alloc[bp_alloc_idx] = reads; |
| |
| #if 0 |
| printf("pc: %8u + %u\n",reads,pb_cmd_count); |
| #endif |
| } |
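| |
| // |
| // For illustration (hypothetical values): if the READS counter held 100 |
| // and pb_cmd_count == 12, the atomic_add above reserves ring entries |
| // 100..111 (wrapped by bp_idx_mask when read) and snapshots the base by |
| // storing bp_alloc[bp_alloc_idx] = 100, which skc_kernel_paths_copy |
| // later reloads as bp_reads. |
| // |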
| |
| // |
| // |
| // |