/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "common.h"
#include "raster.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1)
#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1)
//
//
//
#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK
//
//
//
#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
//
//
//
#if ( SKC_PLACE_X == 1 )
#define SKC_PLACE_EXPAND() SKC_EXPAND_1()
#define SKC_PLACE_EXPAND_I_LAST 0
#elif ( SKC_PLACE_X == 2 )
#define SKC_PLACE_EXPAND() SKC_EXPAND_2()
#define SKC_PLACE_EXPAND_I_LAST 1
#elif ( SKC_PLACE_X == 4 )
#define SKC_PLACE_EXPAND() SKC_EXPAND_4()
#define SKC_PLACE_EXPAND_I_LAST 3
#elif ( SKC_PLACE_X == 8 )
#define SKC_PLACE_EXPAND() SKC_EXPAND_8()
#define SKC_PLACE_EXPAND_I_LAST 7
#elif ( SKC_PLACE_X == 16)
#define SKC_PLACE_EXPAND() SKC_EXPAND_16()
#define SKC_PLACE_EXPAND_I_LAST 15
#endif
//
// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
// COALESCED WRITES. LO FIRST, FOLLOWED BY HI.
//
// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
// KERNELS USE DIFFERENT SUBGROUP SIZES.
//
// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
//
// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
// ONLY SUPPORT A SUBGROUP SIZE OF 16.
//
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
#define SKC_PLACE_STRIDE_H(L) (L)
#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1)
#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_H(L) (L)
#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
#endif
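//
// For example (illustrative, with equal prefix/place subgroup sizes):
// the prefix kernel writes key I of a block as a subgroup-wide row of
// lo words followed by a row of hi words, so the place kernel reads
// key I's lo word at lane + (I * 2    ) * SKC_PLACE_SUBGROUP_SIZE and
// its hi word at     lane + (I * 2 + 1) * SKC_PLACE_SUBGROUP_SIZE.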
//
// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
//
#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
#define SKC_PLACE_IS_HEADER_ROW_KEY(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
//
// Note: HEADER_LESS_THAN relies on unsigned wraparound -- an index
// that still falls inside the raster header underflows toward
// UINT_MAX and therefore fails the comparison
//
#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))
//
// TTSK v2:
//
//  0                                         63
//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
//  +---------+--------+---------+-----+-----+
//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |
//
//
// TTPK v2:
//
//  0                                        63
//  | TTPB ID | PREFIX |  SPAN  |  X  |  Y  |
//  +---------+--------+--------+-----+-----+
//  |    27   | 1 (=1) |   12   | 12  | 12  |
//
//
//
// TTCK (32-BIT COMPARE) v1:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   18  |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   15  |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          27          |    1   |    1   |   18  |  9  |  8  |
//
union skc_subgroup_smem
{
skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // per-lane scratchpad used by skc_scatter_scan_max()
struct {
struct {
skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
} lo;
struct {
skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
} hi;
// skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
};
};
//
// scatter scan max
//
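// Worked example (illustrative, 8-lane subgroup): if lanes 0..2 hold
// spans { 3, 2, 4 } then iss = { 3, 5, 9, 9, ... } and
// ess = { 0, 3, 5, 9, ... }. Lanes 0..2 scatter their lane ids to
// scratch[0], scratch[3] and scratch[5], and the inclusive max scan
// fills the gaps:
//
//   scratch : { 0, 0, 0, 1, 0, 2, 0, 0 }
//   source  : { 0, 0, 0, 1, 1, 2, 2, 2 }
//
// so each output lane knows which source key's span it falls in.
//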
static
skc_int_v_t
skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
skc_int_v_t const iss,
skc_int_v_t const ess)
{
//
// prefix sums determine which lanes we're going to work on next
//
skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
skc_int_v_t const scratch_idx = max(ess,0);
//
// SIMT
//
//
// zero the volatile smem scratchpad -- one dword per lane
//
smem->scratch[get_sub_group_local_id()] = ( 0 );
//
// store source lane at starting lane
//
if (is_scratch_store) {
smem->scratch[scratch_idx] = get_sub_group_local_id();
}
//
// propagate lanes to right using max scan
//
skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
skc_int_v_t const source = sub_group_scan_inclusive_max(scratch);
return source;
}
//
//
//
static
skc_bool
skc_xk_clip(union skc_tile_clip const * const tile_clip,
skc_ttxk_t * const xk)
{
//
// clip the sk and pk keys
//
// if fully clipped then return false
//
// alternatively -- we can expand all these keys in place
//
// alternatively -- keep sk and pk keys segregated because sk
// represents the vast majority of keys and are easier to process.
// don't mess with the fastpath!
//
return false;
}
//
//
//
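// The conversions below assemble a ttck key from a ttsk/ttpk key and
// a place command: the block id stays in the low bits, the layer id
// is split across the lo and hi words, and the key's x/y coordinates
// are translated by the command's tx/ty before being packed into the
// hi word.
//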
static
skc_ttck_t
skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
union skc_cmd_place const * const cmd,
skc_uint const sk_idx)
{
skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
skc_uint const hi = smem->hi.sk[sk_idx];
skc_ttck_t ck;
ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
// FIXME -- x and y should already be clipped and shifted
skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
return ck;
}
//
//
//
static
skc_ttck_t
skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
union skc_cmd_place const * const cmd,
skc_uint const pk_idx,
skc_uint const dx)
{
skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
skc_uint const hi = smem->hi.pk[pk_idx];
skc_ttck_t ck;
ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
// FIXME -- x and y should already be clipped and shifted
skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
return ck;
}
//
//
//
static
void
skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
__global skc_ttck_t * const ck_extent,
__local union skc_subgroup_smem volatile * const smem,
union skc_cmd_place const * const cmd,
skc_uint const sk)
{
//
// Note: an sk count of 0 is assumed to be impossible here, so there
// is no early-out (unlike skc_ttpk_flush)
//
skc_uint ck_base = 0;
// first lane reserves space in the ck extent with an atomic add
if (get_sub_group_local_id() == 0) {
ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
}
// broadcast base to all lanes
ck_base = sub_group_broadcast(ck_base,0);
// convert sk keys to ck keys
for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
{
ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
}
}
//
//
//
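// reconstruct a ttpk key's tile span, which straddles the lo and hi
// words of the key (the stored span is biased by -1, hence the +1)
//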
static
skc_int
skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
skc_uint const idx)
{
skc_uint const lo = smem->lo.pk[idx];
skc_uint const hi = smem->hi.pk[idx];
skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
return (span_lo | span_hi) + 1;
}
//
//
//
static
void
skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
__global skc_ttck_t * const ck_extent,
__local union skc_subgroup_smem volatile * const smem,
union skc_cmd_place const * const cmd,
skc_uint const pk)
{
// bail out if pk queue is empty
if (pk == 0)
return;
#if 0
if (get_sub_group_local_id() == 0)
printf("%u\n",pk);
#endif
//
// FIXME -- this nested loop iterates over the queue, processing a
// subgroup of 64-bit keys at a time. This is probably not the most
// efficient approach, so investigate storing and iterating over a
// wider-than-subgroup (node-sized) queue of keys.
//
// round up so we work with full subgroups
skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
skc_uint ii = 0;
// nested loop that expands all ttpk keys
#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
#endif
{
skc_uint idx = ii + get_sub_group_local_id();
skc_int span = 0;
// how many tiles does this ttpk span?
if (idx < pk)
span = skc_ttpk_get_span(smem,idx);
// we need inclusive, exclusive and total
skc_int iss = sub_group_scan_inclusive_add(span);
skc_int ess = iss - span;
skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
// printf("%u : %u\n",span,iss);
// continue;
// atomically allocate space for the pk keys
skc_uint ck_base = 0;
// first lane reserves space in the ck extent with an atomic add
if (get_sub_group_local_id() == 0) {
ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
}
// broadcast atomically allocated extent base to all lanes
skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
//
// FIXME -- this loop would probably be faster if the ttpk keys
// were held in registers and accessed with shuffles instead of
// SMEM loads
//
//
// loop until there are no more expanded pk keys
//
while (true)
{
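// each lane determines which pk key in smem it expands (source) and
// its tile offset within that key's span (dx)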
skc_int const source = skc_scatter_scan_max(smem,iss,ess);
skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
// store valid ck keys to gmem
if (get_sub_group_local_id() < rem) {
ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
}
// decrement remainder
rem -= SKC_PLACE_SUBGROUP_SIZE;
if (rem <= 0)
break;
// increment/decrement indices
ck_idx += SKC_PLACE_SUBGROUP_SIZE;
iss -= SKC_PLACE_SUBGROUP_SIZE;
ess -= SKC_PLACE_SUBGROUP_SIZE;
}
}
}
//
//
//
static
skc_uint
skc_ballot(skc_uint * const xk, skc_uint const is_xk)
{
#if 0
//
// FIXME -- when available, this should use the idiom:
//
// ballot() + lane_mask_less_than_or_equal + popcount()
//
// Supported by:
//
// - Vulkan 1.1 / SPIR-V 1.3
// - CUDA
// - AVX2 (SSE*?)
//
#else
//
// otherwise, emulate with an inclusive scan (yuk)
//
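// Example (illustrative, 8-lane subgroup): if is_xk across the
// subgroup is { 1,0,1,1,0,0,1,0 } and *xk is 4 on entry, then
// prefix = { 1,1,2,3,3,3,4,4 }, the active lanes receive xk_idx
// { 4,-,5,6,-,-,7,- } and *xk is advanced to 8.
//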
skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
skc_uint const xk_idx = *xk + prefix - is_xk;
*xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
#if 0
printf("< %3u >\n",xk_idx);
#endif
return xk_idx;
#endif
}
//
//
//
__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t * const bp_elems,
__global SKC_ATOMIC_UINT volatile * const place_atomics,
__global skc_ttck_t * const ck_extent,
__global union skc_cmd_place const * const cmds,
__global skc_block_id_t * const map,
skc_uint4 const clip,
skc_uint const count)
{
//
// declare shared memory block
//
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
__local union skc_subgroup_smem volatile smem[1];
#else
__local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
__local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif
//
// This is a subgroup-centric kernel
//
// Which subgroup in the grid is this?
//
// TAKE NOTE: the Intel GEN compiler appears to recognize
// get_group_id(0) as a uniform, but the alternative calculation used
// when there are multiple subgroups per workgroup is not being
// recognized as uniform and is driving register spillage elsewhere.
//
// Test the raster's translated bounds against the composition's
// tile clip
//
// There are 3 cases:
//
// - the raster is completely clipped -> return
// - the raster is partially clipped -> all keys must be clipped
// - the raster is not clipped -> no keys are tested
//
//
// There are at least 4 implementations of place and we want to
// special-case them as much as possible so that, at the least, the
// fastpath remains fast.
//
// - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
//
// - implement CLIPPED + NO TRANSLATION path
//
// - implement NO CLIP + TRANSLATION path
//
// - implement CLIPPED + TRANSLATION path
//
//
// FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
// 12:12:8 integer where:
//
// 12: ttsk
// 12: ttpk
// 8: /dev/null -- clipped or invalid key
//
// Three kinds of nodes in a raster's list:
//
// - the head node
// - an internal node
// - the final node
//
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
skc_uint const cmd_idx = get_group_id(0);
#else
skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif
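// each subgroup processes one place command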
// load command
union skc_cmd_place const cmd = cmds[cmd_idx];
// get the raster header from the raster host id -- scalar
skc_block_id_t id = map[cmd.raster_h];
//
// load all of the head block ttxk keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_raster_node_elem const h##I = { \
.u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \
bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \
};
SKC_PLACE_EXPAND();
//
// load raster header counts -- we only need the "nodes" and "keys"
// words but the keys we loaded are doublewords.
//
// FIXME -- this can be made portable with compile-time macro expansion
//
skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
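// nodes counts the trailing nodes in this raster's block list; keys
// counts the ttxk keys spread across the head and those nodes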
//
//
//
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
nodes,keys, \
I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
h##I.u32v2.hi,h##I.u32v2.lo, \
h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
SKC_PLACE_EXPAND();
#endif
//
#if 0
if (get_sub_group_local_id() == 0) {
printf("place: %u / %u / %u\n",head_id,nodes,keys);
}
#endif
{
//
// classify every key in the header
//
// keys: 0 is not a key / 1 is a key
// skpk: 0 is sk / 1 is pk
//
skc_uint bits_keys = 0;
skc_uint bits_skpk = 0;
//
// calculate bits_keys
//
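// a slot is a live key if its index relative to the end of the
// header is below the key count; when keys spill past this block the
// trailing row's last dword holds the next-node pointer, so that
// lane clears its bit
//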
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
if (idx < keys) { \
bits_keys |= (1u << I); \
} \
if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \
if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
bits_keys &= ~(1u << I); \
} \
} \
} \
}
SKC_PLACE_EXPAND();
//
// blindly calculate bits_skpk
//
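// each key's prefix bit is shifted down to bit I of bits_skpk for
// this lane
//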
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
}
SKC_PLACE_EXPAND();
#if 0
printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif
//
// next pointer is last element of last row. save it now because
// this might be recognized as a subgroup-uniform/scalar.
//
id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
//
// append SK keys first
//
skc_uint const bits_sk = bits_keys & ~bits_skpk;
skc_uint sk = 0;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
skc_uint is_sk = (bits_sk >> I) & 1; \
skc_uint sk_idx = skc_ballot(&sk,is_sk); \
if (is_sk) { \
smem->lo.sk[sk_idx] = h##I.xk.lo; \
smem->hi.sk[sk_idx] = h##I.xk.hi; \
} \
}
SKC_PLACE_EXPAND();
//
// append PK keys next
//
skc_uint const bits_pk = bits_keys & bits_skpk;
skc_uint pk = 0;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
skc_uint is_pk = (bits_pk >> I) & 1; \
skc_uint pk_idx = skc_ballot(&pk,is_pk); \
if (is_pk) { \
smem->lo.pk[pk_idx] = h##I.xk.lo; \
smem->hi.pk[pk_idx] = h##I.xk.hi; \
} \
}
SKC_PLACE_EXPAND();
#if 0
printf("%2u * %2u\n",sk,pk);
#endif
//
// flush the keys
//
skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
}
//
// we're done if there was only a head node
//
if (nodes == 0)
return;
//
// subtract the keys consumed by the head block
//
keys -= SKC_RASTER_HEAD_COUNT_KEYS;
//
// otherwise, append keys in trailing nodes to smem
//
while (true)
{
//
// load all of the node block ttxk keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_raster_node_elem const n##I = { \
.u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \
bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \
};
SKC_PLACE_EXPAND();
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
nodes,keys, \
I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
n##I.u32v2.hi,n##I.u32v2.lo, \
n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
SKC_PLACE_EXPAND();
#endif
//
// classify every key in the node
//
// keys: 0 is not a key / 1 is a key
// skpk: 0 is sk / 1 is pk
//
skc_uint bits_keys = 0;
skc_uint bits_skpk = 0;
//
// calculate bits_keys
//
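// same as the header pass but with no header offset to subtract;
// when keys spill past this node its last dword again holds the
// next-node pointer
//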
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
if (idx < keys) { \
bits_keys |= (1u << I); \
} \
if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \
if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
bits_keys &= ~(1u << I); \
} \
} \
} \
}
SKC_PLACE_EXPAND();
//
// blindly calculate bits_skpk
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
}
SKC_PLACE_EXPAND();
#if 0
printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif
//
// next pointer is last element of last row. save it now because
// this might be recognized as a subgroup-uniform/scalar.
//
id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
//
// append SK keys first
//
skc_uint const bits_sk = bits_keys & ~bits_skpk;
skc_uint sk = 0;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint is_sk = (bits_sk >> I) & 1; \
skc_uint sk_idx = skc_ballot(&sk,is_sk); \
if (is_sk) { \
smem->lo.sk[sk_idx] = n##I.xk.lo; \
smem->hi.sk[sk_idx] = n##I.xk.hi; \
} \
}
SKC_PLACE_EXPAND();
//
// append PK keys next
//
skc_uint const bits_pk = bits_keys & bits_skpk;
skc_uint pk = 0;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint is_pk = (bits_pk >> I) & 1; \
skc_uint pk_idx = skc_ballot(&pk,is_pk); \
if (is_pk) { \
smem->lo.pk[pk_idx] = n##I.xk.lo; \
smem->hi.pk[pk_idx] = n##I.xk.hi; \
} \
}
SKC_PLACE_EXPAND();
#if 0
printf("%2u * %2u\n",sk,pk);
#endif
//
// flush the sk and pk keys appended from this node to the extent
// (deferring the flush until a queue hits a highwater mark is a
// possible optimization -- right now both are flushed every node)
//
skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
//
// if this was the last node then we're done
//
if (--nodes == 0)
return;
//
// otherwise subtract the keys consumed by this node
//
keys -= SKC_RASTER_NODE_COUNT_KEYS;
}
}
//
//
//