/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
#include "common/cuda/assert_cuda.h"
#include "common/macros.h"
#include "common/util.h"
#ifdef __cplusplus
}
#endif
//
// We want concurrent kernel execution to occur in a few places.
//
// The summary is:
//
//   1) If necessary, some max valued keys are written to the end of
//      the vin/vout buffers.
//
//   2) Blocks of slabs of keys are sorted.
//
//   3) If necessary, the blocks of slabs are merged until complete.
//
//   4) If requested, the slabs will be converted from slab ordering
//      to linear ordering.
//
// Below is the general "happens-before" relationship between HotSort
// compute kernels.
//
// Note the diagram assumes vin and vout are different buffers.  If
// they're not, then the first merge doesn't include the pad_vout
// event in the wait list.
//
//      +----------+            +---------+
//      | pad_vout |            | pad_vin |
//      +----+-----+            +----+----+
//           |                       |
//           |                WAITFOR(pad_vin)
//           |                       |
//           |                 +-----v-----+
//           |                 |           |
//           |            +----v----+ +----v----+
//           |            | bs_full | | bs_frac |
//           |            +----+----+ +----+----+
//           |                 |           |
//           |                 +-----v-----+
//           |                       |
//           | +------NO------JUST ONE BLOCK?
//           | /
//           |/                     YES
//           +                       |
//           |                       v
//           |       END_WITH_EVENTS(bs_full,bs_frac)
//           |
//           |
//  WAITFOR(pad_vout,bs_full,bs_frac)    >>> first iteration of loop <<<
//                  |
//                  |
//                  +-----------<------------+
//                  |                        |
//            +-----v-----+                  |
//            |           |                  |
//       +----v----+ +----v----+             |
//       | fm_full | | fm_frac |             |
//       +----+----+ +----+----+             |
//            |           |                  ^
//            +-----v-----+                  |
//                  |                        |
//      WAITFOR(fm_full,fm_frac)             |
//                  |                        |
//                  v                        |
//               +--v--+                WAITFOR(bc)
//               | hm  |                     |
//               +-----+                     |
//                  |                        |
//             WAITFOR(hm)                   |
//                  |                        ^
//               +--v--+                     |
//               | bc  |                     |
//               +-----+                     |
//                  |                        |
//                  v                        |
//          MERGING COMPLETE?-------NO-------+
//                  |
//                 YES
//                  |
//                  v
//           END_WITH_EVENTS(bc)
//
//
// NOTE: CUDA streams are in-order so a dependency isn't required for
// kernels launched on the same stream.
//
// Scheduling these kernels is actually a more subtle problem than it
// appears.
//
// Rather than reason from the diagram, we'll declare the
// "happens-before" kernel relationships directly:
//
//   concurrent (pad_vin,pad_vout) -> (pad_vin)  happens_before (bs_full,bs_frac)
//                                    (pad_vout) happens_before (fm_full,fm_frac)
//
//   concurrent (bs_full,bs_frac)  -> (bs_full)  happens_before (fm_full,fm_frac)
//                                    (bs_frac)  happens_before (fm_full,fm_frac)
//
//   concurrent (fm_full,fm_frac)  -> (fm_full)  happens_before (hm or bc)
//                                    (fm_frac)  happens_before (hm or bc)
//
//   launch     (hm)               -> (hm)       happens_before (hm)
//                                    (hm)       happens_before (bc)
//
//   launch     (bc)               -> (bc)       happens_before (fm_full,fm_frac)
//
//
// We could permanently map kernel launches to our 3 streams but, as
// an optimization, we instead dynamically assign each kernel to the
// lowest available stream.  This transforms the problem into one that
// considers streams happening before streams -- which kernels are
// involved doesn't matter.
//
//   STREAM0   STREAM1   STREAM2
//   -------   -------   -------
//
//   pad_vin   pad_vout            (pad_vin)  happens_before (bs_full,bs_frac)
//                                 (pad_vout) happens_before (fm_full,fm_frac)
//
//   bs_full   bs_frac             (bs_full)  happens_before (fm_full,fm_frac)
//                                 (bs_frac)  happens_before (fm_full,fm_frac)
//
//   fm_full   fm_frac             (fm_full)  happens_before (hm or bc)
//                                 (fm_frac)  happens_before (hm or bc)
//
//   hm                            (hm)       happens_before (hm or bc)
//
//   bc                            (bc)       happens_before (fm_full,fm_frac)
//
// A single final kernel will always complete on stream 0.
//
// This simplifies reasoning about concurrency that's downstream of
// hs_cuda_sort().
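//
// For example (a hypothetical trace): with pool = 0b111 all three
// streams are free.  A stage needing two streams acquires streams 0
// and 1, leaving pool = 0b100.  A later stage that happens after
// those streams merges their bits back into the pool before acquiring
// the lowest available index for its own launch.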
//
typedef void (*hs_kernel_offset_bs_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout,
                                        HS_KEY_TYPE const * const HS_RESTRICT vin,
                                        uint32_t const slab_offset);
static hs_kernel_offset_bs_pfn const hs_kernels_offset_bs[]
{
#if HS_BS_SLABS_LOG2_RU >= 1
  hs_kernel_bs_0,
#endif
#if HS_BS_SLABS_LOG2_RU >= 2
  hs_kernel_bs_1,
#endif
#if HS_BS_SLABS_LOG2_RU >= 3
  hs_kernel_bs_2,
#endif
#if HS_BS_SLABS_LOG2_RU >= 4
  hs_kernel_bs_3,
#endif
#if HS_BS_SLABS_LOG2_RU >= 5
  hs_kernel_bs_4,
#endif
#if HS_BS_SLABS_LOG2_RU >= 6
  hs_kernel_bs_5,
#endif
#if HS_BS_SLABS_LOG2_RU >= 7
  hs_kernel_bs_6,
#endif
#if HS_BS_SLABS_LOG2_RU >= 8
  hs_kernel_bs_7,
#endif
};
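//
// The offset tables are indexed by the log2 of a power-of-two
// fractional slab count -- e.g. a frac_bs of 4 selects hs_kernel_bs_2
// via msb_idx_u32(4) = 2.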
//
//
//
typedef void (*hs_kernel_bc_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_bc_pfn const hs_kernels_bc[]
{
  hs_kernel_bc_0,
#if HS_BC_SLABS_LOG2_MAX >= 1
  hs_kernel_bc_1,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 2
  hs_kernel_bc_2,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 3
  hs_kernel_bc_3,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 4
  hs_kernel_bc_4,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 5
  hs_kernel_bc_5,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 6
  hs_kernel_bc_6,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 7
  hs_kernel_bc_7,
#endif
#if HS_BC_SLABS_LOG2_MAX >= 8
  hs_kernel_bc_8,
#endif
};
//
//
//
typedef void (*hs_kernel_hm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_hm_pfn const hs_kernels_hm[]
{
#if (HS_HM_SCALE_MIN == 0)
  hs_kernel_hm_0,
#endif
#if (HS_HM_SCALE_MIN <= 1) && (1 <= HS_HM_SCALE_MAX)
  hs_kernel_hm_1,
#endif
#if (HS_HM_SCALE_MIN <= 2) && (2 <= HS_HM_SCALE_MAX)
  hs_kernel_hm_2,
#endif
};
//
//
//
typedef void (*hs_kernel_fm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout);
static hs_kernel_fm_pfn const hs_kernels_fm[]
{
#if (HS_FM_SCALE_MIN == 0)
#if (HS_BS_SLABS_LOG2_RU == 1)
  hs_kernel_fm_0_0,
#endif
#if (HS_BS_SLABS_LOG2_RU == 2)
  hs_kernel_fm_0_1,
#endif
#if (HS_BS_SLABS_LOG2_RU == 3)
  hs_kernel_fm_0_2,
#endif
#if (HS_BS_SLABS_LOG2_RU == 4)
  hs_kernel_fm_0_3,
#endif
#if (HS_BS_SLABS_LOG2_RU == 5)
  hs_kernel_fm_0_4,
#endif
#if (HS_BS_SLABS_LOG2_RU == 6)
  hs_kernel_fm_0_5,
#endif
#if (HS_BS_SLABS_LOG2_RU == 7)
  hs_kernel_fm_0_6,
#endif
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
  CONCAT_MACRO(hs_kernel_fm_1_,HS_BS_SLABS_LOG2_RU),
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
#if (HS_BS_SLABS_LOG2_RU == 1)
  hs_kernel_fm_2_2,
#endif
#if (HS_BS_SLABS_LOG2_RU == 2)
  hs_kernel_fm_2_3,
#endif
#if (HS_BS_SLABS_LOG2_RU == 3)
  hs_kernel_fm_2_4,
#endif
#if (HS_BS_SLABS_LOG2_RU == 4)
  hs_kernel_fm_2_5,
#endif
#if (HS_BS_SLABS_LOG2_RU == 5)
  hs_kernel_fm_2_6,
#endif
#if (HS_BS_SLABS_LOG2_RU == 6)
  hs_kernel_fm_2_7,
#endif
#if (HS_BS_SLABS_LOG2_RU == 7)
  hs_kernel_fm_2_8,
#endif
#endif
};
//
//
//
typedef void (*hs_kernel_offset_fm_pfn)(HS_KEY_TYPE * const HS_RESTRICT vout,
                                        uint32_t const span_offset);
#if (HS_FM_SCALE_MIN == 0)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_0[]
{
#if (HS_BS_SLABS_LOG2_RU >= 2)
  hs_kernel_fm_0_0,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
  hs_kernel_fm_0_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
  hs_kernel_fm_0_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
  hs_kernel_fm_0_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
  hs_kernel_fm_0_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
  hs_kernel_fm_0_5,
#endif
};
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_1[]
{
#if (HS_BS_SLABS_LOG2_RU >= 1)
  hs_kernel_fm_1_0,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 2)
  hs_kernel_fm_1_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
  hs_kernel_fm_1_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
  hs_kernel_fm_1_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
  hs_kernel_fm_1_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
  hs_kernel_fm_1_5,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
  hs_kernel_fm_1_6,
#endif
};
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
static hs_kernel_offset_fm_pfn const hs_kernels_offset_fm_2[]
{
  hs_kernel_fm_2_0,
#if (HS_BS_SLABS_LOG2_RU >= 1)
  hs_kernel_fm_2_1,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 2)
  hs_kernel_fm_2_2,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 3)
  hs_kernel_fm_2_3,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 4)
  hs_kernel_fm_2_4,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 5)
  hs_kernel_fm_2_5,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 6)
  hs_kernel_fm_2_6,
#endif
#if (HS_BS_SLABS_LOG2_RU >= 7)
  hs_kernel_fm_2_7,
#endif
};
#endif
static hs_kernel_offset_fm_pfn const * const hs_kernels_offset_fm[]
{
#if (HS_FM_SCALE_MIN == 0)
  hs_kernels_offset_fm_0,
#endif
#if (HS_FM_SCALE_MIN <= 1) && (1 <= HS_FM_SCALE_MAX)
  hs_kernels_offset_fm_1,
#endif
#if (HS_FM_SCALE_MIN <= 2) && (2 <= HS_FM_SCALE_MAX)
  hs_kernels_offset_fm_2,
#endif
};
//
//
//
typedef uint32_t hs_indices_t;
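//
// hs_indices_t is a bitmask over the 3 streams in hs_state: bit i set
// means stream i is a member of the set -- free when the mask is the
// pool, or a dependency when the mask describes a stage's streams.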
//
//
//
struct hs_state
{
  // key buffers
  HS_KEY_TYPE * vin;
  HS_KEY_TYPE * vout; // can be vin
  cudaStream_t streams[3];
  // pool of stream indices
  hs_indices_t pool;
  // number of slabs in vin, rounded up
  uint32_t bx_ru;
};
//
//
//
static
uint32_t
hs_indices_acquire(hs_indices_t * const indices)
{
  //
  // FIXME -- an FFS intrinsic might be faster but there are so few
  // bits in this implementation that it might not matter.
  //
  if (*indices & 1)
    {
      *indices = *indices & ~1;
      return 0;
    }
  else if (*indices & 2)
    {
      *indices = *indices & ~2;
      return 1;
    }
  else // if (*indices & 4)
    {
      *indices = *indices & ~4;
      return 2;
    }
}
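//
// A minimal FFS-based sketch of the above, assuming a GCC/Clang-style
// toolchain where __builtin_ctz() is available:
//
//   uint32_t const idx = __builtin_ctz(*indices); // index of lowest set bit
//   *indices          &= *indices - 1;            // clear the lowest set bit
//   return idx;
//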
static
uint32_t
hs_state_acquire(struct hs_state * const state,
                 hs_indices_t    * const indices)
{
  //
  // FIXME -- an FFS intrinsic might be faster but there are so few
  // bits in this implementation that it might not matter.
  //
  if (state->pool & 1)
    {
      state->pool &= ~1;
      *indices    |= 1;
      return 0;
    }
  else if (state->pool & 2)
    {
      state->pool &= ~2;
      *indices    |= 2;
      return 1;
    }
  else // (state->pool & 4)
    {
      state->pool &= ~4;
      *indices    |= 4;
      return 2;
    }
}
static
void
hs_indices_merge(hs_indices_t * const to, hs_indices_t const from)
{
  *to |= from;
}
static
void
hs_barrier_enqueue(cudaStream_t to, cudaStream_t from)
{
  cudaEvent_t event_before;
  cuda(EventCreate(&event_before));
  cuda(EventRecord(event_before,from));
  cuda(StreamWaitEvent(to,event_before,0));
  cuda(EventDestroy(event_before));
}
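//
// Note that destroying the event immediately is safe: cudaEventDestroy()
// does not block on an incomplete event and the event's resources are
// released asynchronously once it completes.
//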
static
hs_indices_t
hs_barrier(struct hs_state * const state,
           hs_indices_t      const before,
           hs_indices_t    * const after,
           uint32_t          const count) // count is 1 or 2
{
  // return streams this stage depends on back into the pool
  hs_indices_merge(&state->pool,before);
  hs_indices_t indices = 0;
  // acquire 'count' stream indices for this stage
  for (uint32_t ii=0; ii<count; ii++)
    {
      hs_indices_t new_indices = 0;
      // new index
      uint32_t const idx = hs_state_acquire(state,&new_indices);
      // add the new index to the indices
      indices |= new_indices;
      // only enqueue barriers when streams are different
      uint32_t const wait = before & ~new_indices;
      if (wait != 0)
        {
          cudaStream_t to = state->streams[idx];
          //
          // FIXME -- an FFS loop might be slower for so few bits.  So
          // leave it as is for now.
          //
          if (wait & 1)
            hs_barrier_enqueue(to,state->streams[0]);
          if (wait & 2)
            hs_barrier_enqueue(to,state->streams[1]);
          if (wait & 4)
            hs_barrier_enqueue(to,state->streams[2]);
        }
    }
  hs_indices_merge(after,indices);
  return indices;
}
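//
// A hypothetical trace: if the prior stage ran on streams {0,1}
// (before = 0b011) and this stage needs one stream, both bits are
// returned to the pool, stream 0 is reacquired (new_indices = 0b001),
// and wait = 0b011 & ~0b001 = 0b010 enqueues a single barrier that
// makes stream 0 wait on stream 1.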
//
//
//
#ifndef NDEBUG
#include <stdio.h>
#define HS_STREAM_SYNCHRONIZE(s)      \
  do {                                \
    cuda(StreamSynchronize(s));       \
    fprintf(stderr,"%s\n",__func__);  \
  } while (0)
#else
#define HS_STREAM_SYNCHRONIZE(s)
#endif
//
//
//
static
void
hs_transpose(struct hs_state * const state)
{
  HS_TRANSPOSE_KERNEL_NAME()
    <<<state->bx_ru,HS_SLAB_THREADS,0,state->streams[0]>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(state->streams[0]);
}
//
//
//
static
void
hs_bc(struct hs_state * const state,
      hs_indices_t      const hs_bc,
      hs_indices_t    * const fm,
      uint32_t          const down_slabs,
      uint32_t          const clean_slabs_log2)
{
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,hs_bc,fm,1);
  // block-clean the minimal number of spans of 2^clean_slabs_log2 slabs
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const full    = (down_slabs + frac_ru) >> clean_slabs_log2;
  uint32_t const threads = HS_SLAB_THREADS << clean_slabs_log2;
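  //
  // e.g. with down_slabs = 5 and clean_slabs_log2 = 2: full =
  // (5 + 3) >> 2 = 2 blocks, each HS_SLAB_THREADS * 4 threads wide
  //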
  // stream will *always* be stream[0]
  cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
  hs_kernels_bc[clean_slabs_log2]
    <<<full,threads,0,stream>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(stream);
}
//
//
//
static
uint32_t
hs_hm(struct hs_state * const state,
      hs_indices_t      const hs_bc,
      hs_indices_t    * const hs_bc_tmp,
      uint32_t          const down_slabs,
      uint32_t          const clean_slabs_log2)
{
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,hs_bc,hs_bc_tmp,1);
  // how many scaled half-merge spans are there?
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const spans   = (down_slabs + frac_ru) >> clean_slabs_log2;
  // for now, just clamp to the max
  uint32_t const log2_rem   = clean_slabs_log2 - HS_BC_SLABS_LOG2_MAX;
  uint32_t const scale_log2 = MIN_MACRO(HS_HM_SCALE_MAX,log2_rem);
  uint32_t const log2_out   = log2_rem - scale_log2;
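  //
  // e.g. assuming HS_HM_SCALE_MAX = 2 with clean_slabs_log2 =
  // HS_BC_SLABS_LOG2_MAX + 3: log2_rem = 3, scale_log2 = 2 and
  // log2_out = 1
  //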
  //
  // Size the grid
  //
  // The simplifying choices below limit the maximum keys that can be
  // sorted with this grid scheme to around ~2B.
  //
  //   .x : slab height << clean_log2 -- this is the slab span
  //   .y : [1...65535]               -- this is the slab index
  //   .z : ( this could also be used to further expand .y )
  //
  // Note that OpenCL declares a grid in terms of global threads and
  // not grids and blocks
  //
  dim3 grid;
  grid.x = (HS_SLAB_HEIGHT / HS_HM_BLOCK_HEIGHT) << log2_out;
  grid.y = spans;
  grid.z = 1;
  cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
  hs_kernels_hm[scale_log2-HS_HM_SCALE_MIN]
    <<<grid,HS_SLAB_THREADS * HS_HM_BLOCK_HEIGHT,0,stream>>>
    (state->vout);
  HS_STREAM_SYNCHRONIZE(stream);
  return log2_out;
}
//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//
static
uint32_t
hs_fm(struct hs_state * const state,
      hs_indices_t      const fm,
      hs_indices_t    * const hs_bc,
      uint32_t        * const down_slabs,
      uint32_t          const up_scale_log2)
{
  //
  // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
  // a performance win to bias toward launching the smaller flip merge
  // kernel in order to get more warps in flight (increased
  // occupancy).  This is useful when merging small numbers of slabs.
  //
  // Note that HS_FM_SCALE_MIN will always be 0 or 1.
  //
  // So, for now, just clamp to the max until there is a reason to
  // restore the fancier and probably low-impact approach.
  //
  uint32_t const scale_log2 = MIN_MACRO(HS_FM_SCALE_MAX,up_scale_log2);
  uint32_t const clean_log2 = up_scale_log2 - scale_log2;
  // number of slabs in a full-sized scaled flip-merge span
  uint32_t const full_span_slabs = HS_BS_SLABS << up_scale_log2;
  // how many full-sized scaled flip-merge spans are there?
  uint32_t full_fm = state->bx_ru / full_span_slabs;
  uint32_t frac_fm = 0;
  // initialize down_slabs
  *down_slabs = full_fm * full_span_slabs;
  // how many half-size scaled + fractional scaled spans are there?
  uint32_t const span_rem        = state->bx_ru - *down_slabs;
  uint32_t const half_span_slabs = full_span_slabs >> 1;
  // if we have over a half-span then fractionally merge it
  if (span_rem > half_span_slabs)
    {
      // the remaining slabs will be cleaned
      *down_slabs += span_rem;
      uint32_t const frac_rem      = span_rem - half_span_slabs;
      uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);
      if (frac_rem_pow2 >= half_span_slabs)
        {
          // bump it up to a full span
          full_fm += 1;
        }
      else
        {
          // otherwise, add fractional
          frac_fm = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
        }
    }
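  //
  // e.g. with bx_ru = 14, HS_BS_SLABS = 4, up_scale_log2 = 1 and
  // clean_log2 = 0: full_span_slabs = 8 so full_fm = 1, and span_rem =
  // 6 exceeds the half span of 4 -- down_slabs becomes 14, frac_rem =
  // 2 and frac_fm = 2
  //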
  // enqueue any necessary barriers
  bool const both = (full_fm != 0) && (frac_fm != 0);
  hs_indices_t indices = hs_barrier(state,fm,hs_bc,both ? 2 : 1);
  //
  // Size the grid
  //
  // The simplifying choices below limit the maximum keys that can be
  // sorted with this grid scheme to around ~2B.
  //
  //   .x : slab height << clean_log2 -- this is the slab span
  //   .y : [1...65535]               -- this is the slab index
  //   .z : ( this could also be used to further expand .y )
  //
  // Note that OpenCL declares a grid in terms of global threads and
  // not grids and blocks
  //
  dim3 grid;
  grid.x = (HS_SLAB_HEIGHT / HS_FM_BLOCK_HEIGHT) << clean_log2;
  grid.z = 1;
  if (full_fm > 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      grid.y = full_fm;
      hs_kernels_fm[scale_log2-HS_FM_SCALE_MIN]
        <<<grid,HS_SLAB_THREADS * HS_FM_BLOCK_HEIGHT,0,stream>>>
        (state->vout);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  if (frac_fm > 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      grid.y = 1;
      hs_kernels_offset_fm[scale_log2-HS_FM_SCALE_MIN][msb_idx_u32(frac_fm)]
        <<<grid,HS_SLAB_THREADS * HS_FM_BLOCK_HEIGHT,0,stream>>>
        (state->vout,full_fm);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  return clean_log2;
}
//
//
//
static
void
hs_bs(struct hs_state * const state,
      hs_indices_t      const bs,
      hs_indices_t    * const fm,
      uint32_t          const count_padded_in)
{
  uint32_t const slabs_in = count_padded_in / HS_SLAB_KEYS;
  uint32_t const full_bs  = slabs_in / HS_BS_SLABS;
  uint32_t const frac_bs  = slabs_in - full_bs * HS_BS_SLABS;
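  //
  // e.g. with slabs_in = 10 and HS_BS_SLABS = 4: full_bs = 2 and
  // frac_bs = 2 -- hs_cuda_pad() guarantees the fractional remainder
  // is a power of two, which the offset table indexing assumes
  //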
  bool const both = (full_bs != 0) && (frac_bs != 0);
  // enqueue any necessary barriers
  hs_indices_t indices = hs_barrier(state,bs,fm,both ? 2 : 1);
  if (full_bs != 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      CONCAT_MACRO(hs_kernel_bs_,HS_BS_SLABS_LOG2_RU)
        <<<full_bs,HS_BS_SLABS*HS_SLAB_THREADS,0,stream>>>
        (state->vout,state->vin);
      HS_STREAM_SYNCHRONIZE(stream);
    }
  if (frac_bs != 0)
    {
      cudaStream_t stream = state->streams[hs_indices_acquire(&indices)];
      hs_kernels_offset_bs[msb_idx_u32(frac_bs)]
        <<<1,frac_bs*HS_SLAB_THREADS,0,stream>>>
        (state->vout,state->vin,full_bs*HS_BS_SLABS*HS_SLAB_THREADS);
      HS_STREAM_SYNCHRONIZE(stream);
    }
}
//
//
//
static
void
hs_keyset_pre_merge(struct hs_state * const state,
                    hs_indices_t    * const fm,
                    uint32_t          const count_lo,
                    uint32_t          const count_hi)
{
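  // filling the pad with 0xFF bytes produces max-valued keys for the
  // unsigned integer key types HotSort supports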
  uint32_t const vout_span = count_hi - count_lo;
  cudaStream_t stream = state->streams[hs_state_acquire(state,fm)];
  cuda(MemsetAsync(state->vout + count_lo,
                   0xFF,
                   vout_span * sizeof(HS_KEY_TYPE),
                   stream));
}
//
//
//
static
void
hs_keyset_pre_sort(struct hs_state * const state,
                   hs_indices_t    * const bs,
                   uint32_t          const count,
                   uint32_t          const count_hi)
{
  uint32_t const vin_span = count_hi - count;
  cudaStream_t stream = state->streams[hs_state_acquire(state,bs)];
  cuda(MemsetAsync(state->vin + count,
                   0xFF,
                   vin_span * sizeof(HS_KEY_TYPE),
                   stream));
}
//
//
//
void
CONCAT_MACRO(hs_cuda_sort_,HS_KEY_TYPE_PRETTY)
  (HS_KEY_TYPE * const vin,
   HS_KEY_TYPE * const vout,
   uint32_t      const count,
   uint32_t      const count_padded_in,
   uint32_t      const count_padded_out,
   bool          const linearize,
   cudaStream_t stream0, // primary stream
   cudaStream_t stream1, // auxiliary
   cudaStream_t stream2) // auxiliary
{
  // is this sort in place?
  bool const is_in_place = (vout == NULL);
  // streams, buffers and slab count
  struct hs_state state;
  state.vin        = vin;
  state.vout       = is_in_place ? vin : vout;
  state.streams[0] = stream0;
  state.streams[1] = stream1;
  state.streams[2] = stream2;
  state.pool       = 0x7; // 3 bits
  state.bx_ru      = (count + HS_SLAB_KEYS - 1) / HS_SLAB_KEYS;
  // initialize vin
  uint32_t const count_hi                 = is_in_place ? count_padded_out : count_padded_in;
  bool     const is_pre_sort_keyset_reqd  = count_hi > count;
  bool     const is_pre_merge_keyset_reqd = !is_in_place && (count_padded_out > count_padded_in);
  hs_indices_t bs = 0;
  // initialize any trailing keys in vin before sorting
  if (is_pre_sort_keyset_reqd)
    hs_keyset_pre_sort(&state,&bs,count,count_hi);
  hs_indices_t fm = 0;
  // concurrently initialize any trailing keys in vout before merging
  if (is_pre_merge_keyset_reqd)
    hs_keyset_pre_merge(&state,&fm,count_padded_in,count_padded_out);
  // immediately sort blocks of slabs
  hs_bs(&state,bs,&fm,count_padded_in);
  //
  // we're done if this was a single bs block...
  //
  // otherwise, merge sorted spans of slabs until done
  //
  if (state.bx_ru > HS_BS_SLABS)
    {
      int32_t up_scale_log2 = 1;
      while (true)
        {
          hs_indices_t hm_or_bc = 0;
          uint32_t     down_slabs;
          // flip merge slabs -- return span of slabs that must be cleaned
          uint32_t clean_slabs_log2 = hs_fm(&state,
                                            fm,
                                            &hm_or_bc,
                                            &down_slabs,
                                            up_scale_log2);
          // if the span is larger than the largest block cleaner then half-merge
          while (clean_slabs_log2 > HS_BC_SLABS_LOG2_MAX)
            {
              hs_indices_t hm_or_bc_tmp;
              clean_slabs_log2 = hs_hm(&state,
                                       hm_or_bc,
                                       &hm_or_bc_tmp,
                                       down_slabs,
                                       clean_slabs_log2);
              hm_or_bc = hm_or_bc_tmp;
            }
          // reset fm
          fm = 0;
          // launch clean slab grid -- is it the final launch?
          hs_bc(&state,
                hm_or_bc,
                &fm,
                down_slabs,
                clean_slabs_log2);
          // was this the final block clean?
          if (((uint32_t)HS_BS_SLABS << up_scale_log2) >= state.bx_ru)
            break;
          // otherwise, merge twice as many slabs
          up_scale_log2 += 1;
        }
    }
  // slabs or linear?
  if (linearize) {
    // guaranteed to be on stream0
    hs_transpose(&state);
  }
}
//
// all grids will be computed as a function of the minimum number of slabs
//
void
CONCAT_MACRO(hs_cuda_pad_,HS_KEY_TYPE_PRETTY)
  (uint32_t   const count,
   uint32_t * const count_padded_in,
   uint32_t * const count_padded_out)
{
  //
  // round up the count to slabs
  //
  uint32_t const slabs_ru        = (count + HS_SLAB_KEYS - 1) / HS_SLAB_KEYS;
  uint32_t const blocks          = slabs_ru / HS_BS_SLABS;
  uint32_t const block_slabs     = blocks * HS_BS_SLABS;
  uint32_t const slabs_ru_rem    = slabs_ru - block_slabs;
  uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),HS_BS_SLABS);
  *count_padded_in  = (block_slabs + slabs_ru_rem_ru) * HS_SLAB_KEYS;
  *count_padded_out = *count_padded_in;
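  //
  // e.g. assuming HS_SLAB_KEYS = 256 and HS_BS_SLABS = 16, a count of
  // 10000 gives slabs_ru = 40, blocks = 2, block_slabs = 32,
  // slabs_ru_rem_ru = 8 and count_padded_in = 40 * 256 = 10240
  //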
  //
  // will merging be required?
  //
  if (slabs_ru > HS_BS_SLABS)
    {
      // more than one block
      uint32_t const blocks_lo       = pow2_rd_u32(blocks);
      uint32_t const block_slabs_lo  = blocks_lo * HS_BS_SLABS;
      uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;
      if (block_slabs_rem > 0)
        {
          uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem);
          uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru,
                                                    blocks_lo << (1 - HS_FM_SCALE_MIN));
          uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi,
                                                            block_slabs_lo*2); // clamp non-pow2 blocks
          *count_padded_out = block_slabs_padded_out * HS_SLAB_KEYS;
        }
    }
}
//
//
//
void
CONCAT_MACRO(hs_cuda_info_,HS_KEY_TYPE_PRETTY)
  (uint32_t * const key_words,
   uint32_t * const val_words,
   uint32_t * const slab_height,
   uint32_t * const slab_width_log2)
{
  *key_words       = HS_KEY_WORDS;
  *val_words       = HS_VAL_WORDS;
  *slab_height     = HS_SLAB_HEIGHT;
  *slab_width_log2 = HS_SLAB_WIDTH_LOG2;
}
//
//
//