/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include "common/util.h"
#include "common/macros.h"
#include "common/vk/assert_vk.h"
#include "hs_vk.h"
#include "hs_vk_target.h"
//
// We want concurrent kernel execution to occur in a few places.
//
// The summary is:
//
// 1) If necessary, some max-valued keys are written to the end of
// the vin/vout buffers.
//
// 2) Blocks of slabs of keys are sorted.
//
// 3) If necessary, the blocks of slabs are merged until complete.
//
// 4) If requested, the slabs will be converted from slab ordering
// to linear ordering.
//
// Below is the general "happens-before" relationship between HotSort
// compute kernels.
//
// Note the diagram assumes vin and vout are different buffers. If
// they're not, then the first merge doesn't include the pad_vout
// event in the wait list.
//
// +----------+ +---------+
// | pad_vout | | pad_vin |
// +----+-----+ +----+----+
// | |
// | WAITFOR(pad_vin)
// | |
// | +-----v-----+
// | | |
// | +----v----+ +----v----+
// | | bs_full | | bs_frac |
// | +----+----+ +----+----+
// | | |
// | +-----v-----+
// | |
// | +------NO------JUST ONE BLOCK?
// | / |
// |/ YES
// + |
// | v
// | END_WITH_EVENTS(bs_full,bs_frac)
// |
// |
// WAITFOR(pad_vout,bs_full,bs_frac) >>> first iteration of loop <<<
// |
// |
// +-----------<------------+
// | |
// +-----v-----+ |
// | | |
// +----v----+ +----v----+ |
// | fm_full | | fm_frac | |
// +----+----+ +----+----+ |
// | | ^
// +-----v-----+ |
// | |
// WAITFOR(fm_full,fm_frac) |
// | |
// v |
// +--v--+ WAITFOR(bc)
// | hm | |
// +-----+ |
// | |
// WAITFOR(hm) |
// | ^
// +--v--+ |
// | bc | |
// +-----+ |
// | |
// v |
// MERGING COMPLETE?-------NO------+
// |
// YES
// |
// v
// END_WITH_EVENTS(bc)
//
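//
// Illustrative host-side call sequence (a sketch only -- buffer and
// descriptor pool creation, command buffer recording and queue
// submission are application-specific and omitted):
//
//   struct hs_vk * hs = hs_vk_create(target,device,allocator,pipeline_cache);
//
//   uint32_t padded_in, padded_out;
//   hs_vk_pad(hs,count,&padded_in,&padded_out);        // size vin/vout accordingly
//
//   VkDescriptorSet ds = hs_vk_ds_alloc(hs,desc_pool); // desc_pool is app-provided
//   hs_vk_ds_bind(hs,ds,cb,vin,vout);                  // vout == VK_NULL_HANDLE sorts in place
//
//   hs_vk_sort(hs,cb,
//              vin, vin_src_stage, vin_src_access,
//              vout,vout_src_stage,vout_src_access,
//              count,padded_in,padded_out,
//              true); // linearize
//
//   // ... end and submit cb, wait for completion, then eventually:
//
//   hs_vk_release(hs);
//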
struct hs_vk
{
VkAllocationCallbacks const * allocator;
VkDevice device;
struct {
struct {
VkDescriptorSetLayout vout_vin;
} layout;
} desc_set;
struct {
struct {
VkPipelineLayout vout_vin;
} layout;
} pipeline;
struct hs_vk_target_config config;
uint32_t key_val_size;
uint32_t slab_keys;
uint32_t bs_slabs_log2_ru;
uint32_t bc_slabs_log2_max;
struct {
uint32_t count;
VkPipeline * bs;
VkPipeline * bc;
VkPipeline * fm[3];
VkPipeline * hm[3];
VkPipeline * transpose;
VkPipeline all[];
} pipelines;
};
//
//
//
struct hs_state
{
VkCommandBuffer cb;
// If sorting in-place, then vout == vin
VkBuffer vout;
VkBuffer vin;
// bx_ru is the rounded-up number of slabs (one per warp/subgroup) in vin
uint32_t bx_ru;
};
//
//
//
static
void
hs_barrier_compute_w_to_compute_r(struct hs_state * const state)
{
static VkMemoryBarrier const shader_w_to_r = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT
};
vkCmdPipelineBarrier(state->cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&shader_w_to_r,
0,
NULL,
0,
NULL);
}
//
//
//
static
void
hs_barrier_to_compute_r(struct hs_state * const state,
VkPipelineStageFlags const src_stage,
VkAccessFlagBits const src_access)
{
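// a src_stage of zero means there are no prior writes to make
// visible, so no barrier is recorded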
if (src_stage == 0)
return;
VkMemoryBarrier const compute_r = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = src_access,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT
};
vkCmdPipelineBarrier(state->cb,
src_stage,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&compute_r,
0,
NULL,
0,
NULL);
}
//
//
//
static
void
hs_barrier_to_transfer_fill(struct hs_state * const state,
VkPipelineStageFlags const src_stage,
VkAccessFlagBits const src_access)
{
if (src_stage == 0)
return;
VkMemoryBarrier const fill_w = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = src_access,
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT
};
vkCmdPipelineBarrier(state->cb,
src_stage,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
1,
&fill_w,
0,
NULL,
0,
NULL);
}
//
//
//
static
void
hs_transpose(struct hs_vk const * const hs,
struct hs_state * const state)
{
hs_barrier_compute_w_to_compute_r(state);
vkCmdBindPipeline(state->cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
hs->pipelines.transpose[0]);
vkCmdDispatch(state->cb,state->bx_ru,1,1);
}
//
//
//
static
void
hs_bc(struct hs_vk const * const hs,
struct hs_state * const state,
uint32_t const down_slabs,
uint32_t const clean_slabs_log2)
{
hs_barrier_compute_w_to_compute_r(state);
// block clean the minimal number of spans of 2^clean_slabs_log2 slabs covering down_slabs
uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
uint32_t const full_bc = (down_slabs + frac_ru) >> clean_slabs_log2;
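// e.g. (illustrative values) down_slabs = 9 and clean_slabs_log2 = 2
// yield full_bc = (9 + 3) >> 2 = 3 workgroups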
vkCmdBindPipeline(state->cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
hs->pipelines.bc[clean_slabs_log2]);
vkCmdDispatch(state->cb,full_bc,1,1);
}
//
//
//
static
uint32_t
hs_hm(struct hs_vk const * const hs,
struct hs_state * const state,
uint32_t const down_slabs,
uint32_t const clean_slabs_log2)
{
hs_barrier_compute_w_to_compute_r(state);
// how many scaled half-merge spans are there?
uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
uint32_t const spans = (down_slabs + frac_ru) >> clean_slabs_log2;
// for now, just clamp to the max
uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max;
uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem);
uint32_t const log2_out = log2_rem - scale_log2;
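// e.g. (illustrative, assuming bc_slabs_log2_max = 4 and hm.scale_max = 1):
// clean_slabs_log2 = 7 gives log2_rem = 3, scale_log2 = 1 and log2_out = 2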
// size the grid
uint32_t const slab_span = hs->config.slab.height << log2_out;
vkCmdBindPipeline(state->cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
hs->pipelines.hm[scale_log2][0]);
vkCmdDispatch(state->cb,slab_span,spans,1);
return log2_out;
}
//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//
static
uint32_t
hs_fm(struct hs_vk const * const hs,
struct hs_state * const state,
uint32_t * const down_slabs,
uint32_t const up_scale_log2)
{
//
// FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
// a performance win to bias toward launching the smaller flip merge
// kernel in order to get more warps in flight (increased
// occupancy). This is useful when merging small numbers of slabs.
//
// Note that HS_FM_SCALE_MIN will always be 0 or 1.
//
// So, for now, just clamp to the max until there is a reason to
// restore the fancier and probably low-impact approach.
//
uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2);
uint32_t const clean_log2 = up_scale_log2 - scale_log2;
// number of slabs in a full-sized scaled flip-merge span
uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2;
// how many full-sized scaled flip-merge spans are there?
uint32_t full_fm = state->bx_ru / full_span_slabs;
uint32_t frac_fm = 0;
// initialize down_slabs
*down_slabs = full_fm * full_span_slabs;
// how many half-size scaled + fractional scaled spans are there?
uint32_t const span_rem = state->bx_ru - *down_slabs;
uint32_t const half_span_slabs = full_span_slabs >> 1;
// if we have over a half-span then fractionally merge it
if (span_rem > half_span_slabs)
{
// the remaining slabs will be cleaned
*down_slabs += span_rem;
uint32_t const frac_rem = span_rem - half_span_slabs;
uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);
if (frac_rem_pow2 >= half_span_slabs)
{
// bump it up to a full span
full_fm += 1;
}
else
{
// otherwise, add fractional
frac_fm = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
}
}
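//
// e.g. (illustrative, assuming block.slabs = 16 and fm.scale_max = 1):
// up_scale_log2 = 1 and bx_ru = 50 give scale_log2 = 1, clean_log2 = 0,
// full_span_slabs = 32, full_fm = 1, frac_fm = 2 and down_slabs = 50
//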
//
// Size the grid
//
// The simplifying choices below limit the maximum number of keys that
// can be sorted with this grid scheme to roughly 2 billion.
//
// .x : slab height << clean_log2 -- this is the slab span
// .y : [1...65535] -- this is the slab index
// .z : ( this could also be used to further expand .y )
//
// Note that OpenCL declares a grid in terms of global threads and
// not grids and blocks
//
//
// size the grid
//
uint32_t const slab_span = hs->config.slab.height << clean_log2;
if (full_fm > 0)
{
uint32_t const full_idx = hs->bs_slabs_log2_ru - 1 + scale_log2;
vkCmdBindPipeline(state->cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
hs->pipelines.fm[scale_log2][full_idx]);
vkCmdDispatch(state->cb,slab_span,full_fm,1);
}
if (frac_fm > 0)
{
vkCmdBindPipeline(state->cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
hs->pipelines.fm[scale_log2][msb_idx_u32(frac_fm)]);
vkCmdDispatchBase(state->cb,
0,full_fm,0,
slab_span,1,1);
}
return clean_log2;
}
//
//
//
static
void
hs_bs(struct hs_vk const * const hs,
struct hs_state * const state,
uint32_t const count_padded_in)
{
uint32_t const slabs_in = count_padded_in / hs->slab_keys;
uint32_t const full_bs = slabs_in / hs->config.block.slabs;
uint32_t const frac_bs = slabs_in - full_bs * hs->config.block.slabs;
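// when count_padded_in was produced by hs_vk_pad(), a nonzero frac_bs
// is a power of two -- e.g. (illustrative) block.slabs = 16 and
// slabs_in = 20 give full_bs = 1 and frac_bs = 4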
if (full_bs > 0)
{
vkCmdBindPipeline(state->cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
hs->pipelines.bs[hs->bs_slabs_log2_ru]);
vkCmdDispatch(state->cb,full_bs,1,1);
}
if (frac_bs > 0)
{
uint32_t const frac_idx = msb_idx_u32(frac_bs);
uint32_t const full_to_frac_log2 = hs->bs_slabs_log2_ru - frac_idx;
vkCmdBindPipeline(state->cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
hs->pipelines.bs[frac_idx]);
vkCmdDispatchBase(state->cb,
full_bs<<full_to_frac_log2,0,0,
1,1,1);
}
}
//
//
//
static
void
hs_keyset_pre_fm(struct hs_vk const * const hs,
struct hs_state * const state,
uint32_t const count_lo,
uint32_t const count_hi)
{
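// fill vout[count_lo,count_hi) with max-valued keys (all ones) so the
// first flip merge reads valid padding -- see the overview above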
uint32_t const vout_span = count_hi - count_lo;
vkCmdFillBuffer(state->cb,
state->vout,
count_lo * hs->key_val_size,
vout_span * hs->key_val_size,
UINT32_MAX);
}
//
//
//
static
void
hs_keyset_pre_bs(struct hs_vk const * const hs,
struct hs_state * const state,
uint32_t const count,
uint32_t const count_hi)
{
uint32_t const vin_span = count_hi - count;
vkCmdFillBuffer(state->cb,
state->vin,
count * hs->key_val_size,
vin_span * hs->key_val_size,
UINT32_MAX);
}
//
//
//
void
hs_vk_ds_bind(struct hs_vk const * const hs,
VkDescriptorSet hs_ds,
VkCommandBuffer cb,
VkBuffer vin,
VkBuffer vout)
{
//
// initialize the HotSort descriptor set
//
VkDescriptorBufferInfo const dbi[] = {
{
.buffer = vout == VK_NULL_HANDLE ? vin : vout,
.offset = 0,
.range = VK_WHOLE_SIZE
},
{
.buffer = vin,
.offset = 0,
.range = VK_WHOLE_SIZE
}
};
VkWriteDescriptorSet const wds[] = {
{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.pNext = NULL,
.dstSet = hs_ds,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 2,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.pImageInfo = NULL,
.pBufferInfo = dbi,
.pTexelBufferView = NULL
}
};
vkUpdateDescriptorSets(hs->device,
ARRAY_LENGTH_MACRO(wds),
wds,
0,
NULL);
//
// All HotSort kernels can use the same descriptor set:
//
// {
// HS_KEY_TYPE vout[];
// HS_KEY_TYPE vin[];
// }
//
// Note that only the bs() kernels read from vin[].
//
vkCmdBindDescriptorSets(cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
hs->pipeline.layout.vout_vin,
0,
1,
&hs_ds,
0,
NULL);
}
//
//
//
void
hs_vk_sort(struct hs_vk const * const hs,
VkCommandBuffer cb,
VkBuffer vin,
VkPipelineStageFlags const vin_src_stage,
VkAccessFlagBits const vin_src_access,
VkBuffer vout,
VkPipelineStageFlags const vout_src_stage,
VkAccessFlagBits const vout_src_access,
uint32_t const count,
uint32_t const count_padded_in,
uint32_t const count_padded_out,
bool const linearize)
{
// is this sort in place?
bool const is_in_place = (vout == VK_NULL_HANDLE);
//
// create some common state
//
struct hs_state state = {
.cb = cb,
.vin = vin,
.vout = is_in_place ? vin : vout,
.bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys
};
// initialize vin
uint32_t const count_hi = is_in_place ? count_padded_out : count_padded_in;
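// when sorting in place, vin doubles as vout, so it must be padded all
// the way out to count_padded_out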
bool const is_pre_sort_reqd = count_hi > count;
bool const is_pre_merge_reqd = !is_in_place && (count_padded_out > count_padded_in);
//
// pre-sort keyset needs to happen before bs()
// pre-merge keyset needs to happen before fm()
//
VkPipelineStageFlags bs_src_stage = 0;
VkAccessFlagBits bs_src_access = 0;
// initialize any trailing keys in vin before sorting
if (is_pre_sort_reqd)
{
hs_barrier_to_transfer_fill(&state,vin_src_stage,vin_src_access);
hs_keyset_pre_bs(hs,&state,count,count_hi);
bs_src_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT;
bs_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT;
}
else
{
bs_src_stage = vin_src_stage;
bs_src_access = vin_src_access;
}
hs_barrier_to_compute_r(&state,bs_src_stage,bs_src_access);
// sort blocks of slabs... after hs_keyset_pre_bs()
hs_bs(hs,&state,count_padded_in);
VkPipelineStageFlags fm_src_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
VkAccessFlagBits fm_src_access = VK_ACCESS_SHADER_READ_BIT;
// initialize any trailing keys in vout before merging
if (is_pre_merge_reqd)
{
hs_barrier_to_transfer_fill(&state,vout_src_stage,vout_src_access);
hs_keyset_pre_fm(hs,&state,count_padded_in,count_padded_out);
fm_src_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT;
fm_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT;
}
else
{
fm_src_stage |= vout_src_stage;
fm_src_access |= vout_src_access;
}
//
// if this was a single bs block then there is no merging
//
if (state.bx_ru > hs->config.block.slabs)
{
hs_barrier_to_compute_r(&state,fm_src_stage,fm_src_access);
//
// otherwise, merge sorted spans of slabs until done
//
int32_t up_scale_log2 = 1;
while (true)
{
uint32_t down_slabs;
// flip merge slabs -- returns the log2 of the span of slabs that must be cleaned
uint32_t clean_slabs_log2 = hs_fm(hs,&state,
&down_slabs,
up_scale_log2);
// if the span is larger than the largest slab block cleaner then half-merge
while (clean_slabs_log2 > hs->bc_slabs_log2_max)
{
clean_slabs_log2 = hs_hm(hs,&state,
down_slabs,
clean_slabs_log2);
}
// launch clean slab grid -- is it the final launch?
hs_bc(hs,&state,down_slabs,clean_slabs_log2);
// was this the final block clean?
if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru)
break;
// otherwise, merge twice as many slabs
up_scale_log2 += 1;
// drop a barrier
hs_barrier_compute_w_to_compute_r(&state);
}
}
// slabs or linear?
if (linearize)
hs_transpose(hs,&state);
}
//
//
//
#ifdef HS_VK_VERBOSE_STATISTICS_AMD
#include <stdio.h>
static
void
hs_vk_verbose_statistics_amd(VkDevice device, struct hs_vk const * const hs)
{
PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD =
(PFN_vkGetShaderInfoAMD)
vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD");
if (vkGetShaderInfoAMD == NULL)
return;
fprintf(stdout,
" PHY PHY AVAIL AVAIL\n"
"VGPRs SGPRs LDS_MAX LDS/WG SPILL VGPRs SGPRs VGPRs SGPRs WORKGROUP_SIZE\n");
for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
{
VkShaderStatisticsInfoAMD ssi_amd;
size_t ssi_amd_size = sizeof(ssi_amd);
if (vkGetShaderInfoAMD(hs->device,
hs->pipelines.all[ii],
VK_SHADER_STAGE_COMPUTE_BIT,
VK_SHADER_INFO_TYPE_STATISTICS_AMD,
&ssi_amd_size,
&ssi_amd) == VK_SUCCESS)
{
fprintf(stdout,
"%5" PRIu32 " "
"%5" PRIu32 " "
"%5" PRIu32 " "
"%6zu "
"%6zu "
"%5" PRIu32 " "
"%5" PRIu32 " "
"%5" PRIu32 " "
"%5" PRIu32 " "
"( %6" PRIu32 ", " "%6" PRIu32 ", " "%6" PRIu32 " )\n",
ssi_amd.resourceUsage.numUsedVgprs,
ssi_amd.resourceUsage.numUsedSgprs,
ssi_amd.resourceUsage.ldsSizePerLocalWorkGroup,
ssi_amd.resourceUsage.ldsUsageSizeInBytes, // size_t
ssi_amd.resourceUsage.scratchMemUsageInBytes, // size_t
ssi_amd.numPhysicalVgprs,
ssi_amd.numPhysicalSgprs,
ssi_amd.numAvailableVgprs,
ssi_amd.numAvailableSgprs,
ssi_amd.computeWorkGroupSize[0],
ssi_amd.computeWorkGroupSize[1],
ssi_amd.computeWorkGroupSize[2]);
}
}
}
#endif
//
//
//
#ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD
#include <stdio.h>
static
void
hs_vk_verbose_disassembly_amd(VkDevice device, struct hs_vk const * const hs)
{
PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD =
(PFN_vkGetShaderInfoAMD)
vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD");
if (vkGetShaderInfoAMD == NULL)
return;
for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
{
size_t disassembly_amd_size;
if (vkGetShaderInfoAMD(hs->device,
hs->pipelines.all[ii],
VK_SHADER_STAGE_COMPUTE_BIT,
VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD,
&disassembly_amd_size,
NULL) == VK_SUCCESS)
{
void * disassembly_amd = malloc(disassembly_amd_size);
if (vkGetShaderInfoAMD(hs->device,
hs->pipelines.all[ii],
VK_SHADER_STAGE_COMPUTE_BIT,
VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD,
&disassembly_amd_size,
disassembly_amd) == VK_SUCCESS)
{
fprintf(stdout,"%s",(char*)disassembly_amd);
}
free(disassembly_amd);
}
}
}
#endif
//
//
//
struct hs_vk *
hs_vk_create(struct hs_vk_target const * const target,
VkDevice device,
VkAllocationCallbacks const * allocator,
VkPipelineCache pipeline_cache)
{
//
// we reference these values a lot
//
uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));
//
// how many kernels will be created?
//
uint32_t const count_bs = bs_slabs_log2_ru + 1;
uint32_t const count_bc = bc_slabs_log2_max + 1;
uint32_t count_fm[3] = { 0 };
uint32_t count_hm[3] = { 0 };
// guaranteed to be in range [0,2]
for (uint32_t scale = target->config.merge.fm.scale_min;
scale <= target->config.merge.fm.scale_max;
scale++)
{
uint32_t fm_left = (target->config.block.slabs / 2) << scale;
count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1;
}
// guaranteed to be in range [0,2]
for (uint32_t scale = target->config.merge.hm.scale_min;
scale <= target->config.merge.hm.scale_max;
scale++)
{
count_hm[scale] = 1;
}
uint32_t const count_bc_fm_hm_transpose =
count_bc +
count_fm[0] + count_fm[1] + count_fm[2] +
count_hm[0] + count_hm[1] + count_hm[2] +
1; // transpose
uint32_t const count_all = count_bs + count_bc_fm_hm_transpose;
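//
// e.g. (illustrative, assuming block.slabs = 16, fm scales 0..1 and hm scale 1):
// count_bs = 5, count_bc = 5, count_fm = { 4, 5, 0 }, count_hm = { 0, 1, 0 }
// and one transpose kernel give count_all = 21
//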
//
// allocate hs_vk
//
struct hs_vk * hs;
if (allocator == NULL)
{
hs = malloc(sizeof(*hs) + sizeof(VkPipeline) * count_all);
}
else
{
hs = allocator->pfnAllocation(NULL,
sizeof(*hs) + sizeof(VkPipeline) * count_all,
0,
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
}
// save device & allocator
hs->device = device;
hs->allocator = allocator;
//
// create one descriptor set layout
//
static VkDescriptorSetLayoutBinding const dslb_vout_vin[] = {
{
.binding = 0, // vout
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = NULL
},
{
.binding = 1, // vin
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = NULL
}
};
static VkDescriptorSetLayoutCreateInfo const dscli = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.pNext = NULL,
.flags = 0,
.bindingCount = 2, // 0:vout[], 1:vin[]
.pBindings = dslb_vout_vin
};
vk(CreateDescriptorSetLayout(device,
&dscli,
allocator,
&hs->desc_set.layout.vout_vin));
//
// create one pipeline layout
//
VkPipelineLayoutCreateInfo plci = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.pNext = NULL,
.flags = 0,
.setLayoutCount = 1,
.pSetLayouts = &hs->desc_set.layout.vout_vin,
.pushConstantRangeCount = 0,
.pPushConstantRanges = NULL
};
vk(CreatePipelineLayout(device,
&plci,
allocator,
&hs->pipeline.layout.vout_vin));
//
// copy the config from the target -- we need these values later
//
memcpy(&hs->config,&target->config,sizeof(hs->config));
// save some frequently used calculated values
hs->key_val_size = (target->config.words.key + target->config.words.val) * 4;
hs->slab_keys = target->config.slab.height << target->config.slab.width_log2;
hs->bs_slabs_log2_ru = bs_slabs_log2_ru;
hs->bc_slabs_log2_max = bc_slabs_log2_max;
// save kernel count
hs->pipelines.count = count_all;
//
// create all the compute pipelines by reusing this info
//
VkComputePipelineCreateInfo cpci = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.pNext = NULL,
.flags = VK_PIPELINE_CREATE_DISPATCH_BASE, // | VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT,
.stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.pNext = NULL,
.flags = 0,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = VK_NULL_HANDLE,
.pName = "main",
.pSpecializationInfo = NULL
},
.layout = hs->pipeline.layout.vout_vin,
.basePipelineHandle = VK_NULL_HANDLE,
.basePipelineIndex = 0
};
//
// Create a shader module, use it to create a pipeline... and
// dispose of the shader module.
//
// The BS compute shaders have the same layout
// The non-BS compute shaders have the same layout
//
VkShaderModuleCreateInfo smci = {
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
.pNext = NULL,
.flags = 0,
.codeSize = 0,
.pCode = (uint32_t const *)target->modules // FIXME -- unfortunate typecast
};
//
// bs kernels have layout: (vout,vin)
// remaining have layout: (vout)
//
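//
// Each record in target->modules is laid out as the loop below expects:
//
//   [ uint32_t byte count (big-endian) ][ SPIR-V words ... ]
//
// so each iteration converts the byte count with NTOHL_MACRO() and
// points pCode at the words that immediately follow it.
//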
for (uint32_t ii=0; ii<count_all; ii++)
{
// step over the previous module -- codeSize is in bytes, pCode is in words
uint32_t const * const module = smci.pCode + smci.codeSize / sizeof(*module);
smci.codeSize = NTOHL_MACRO(module[0]);
smci.pCode = module + 1;
vk(CreateShaderModule(device,
&smci,
allocator,
&cpci.stage.module));
vk(CreateComputePipelines(device,
pipeline_cache,
1,
&cpci,
allocator,
hs->pipelines.all+ii));
vkDestroyShaderModule(device,
cpci.stage.module,
allocator);
}
//
// initialize pointers to pipeline handles
//
VkPipeline * pipeline_next = hs->pipelines.all;
// BS
hs->pipelines.bs = pipeline_next;
pipeline_next += count_bs;
// BC
hs->pipelines.bc = pipeline_next;
pipeline_next += count_bc;
// FM[0]
hs->pipelines.fm[0] = count_fm[0] ? pipeline_next : NULL;
pipeline_next += count_fm[0];
// FM[1]
hs->pipelines.fm[1] = count_fm[1] ? pipeline_next : NULL;
pipeline_next += count_fm[1];
// FM[2]
hs->pipelines.fm[2] = count_fm[2] ? pipeline_next : NULL;
pipeline_next += count_fm[2];
// HM[0]
hs->pipelines.hm[0] = count_hm[0] ? pipeline_next : NULL;
pipeline_next += count_hm[0];
// HM[1]
hs->pipelines.hm[1] = count_hm[1] ? pipeline_next : NULL;
pipeline_next += count_hm[1];
// HM[2]
hs->pipelines.hm[2] = count_hm[2] ? pipeline_next : NULL;
pipeline_next += count_hm[2];
// TRANSPOSE
hs->pipelines.transpose = pipeline_next;
pipeline_next += 1;
//
// optionally dump pipeline stats
//
#ifdef HS_VK_VERBOSE_STATISTICS_AMD
hs_vk_verbose_statistics_amd(device,hs);
#endif
#ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD
hs_vk_verbose_disassembly_amd(device,hs);
#endif
//
//
//
return hs;
}
//
//
//
void
hs_vk_release(struct hs_vk * const hs)
{
vkDestroyDescriptorSetLayout(hs->device,
hs->desc_set.layout.vout_vin,
hs->allocator);
vkDestroyPipelineLayout(hs->device,
hs->pipeline.layout.vout_vin,
hs->allocator);
for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
{
vkDestroyPipeline(hs->device,
hs->pipelines.all[ii],
hs->allocator);
}
if (hs->allocator == NULL)
{
free(hs);
}
else
{
hs->allocator->pfnFree(NULL,hs);
}
}
//
// Allocate a per-thread descriptor set for the vin and vout
// VkBuffers. Note that HotSort uses only one descriptor set.
//
VkDescriptorSet
hs_vk_ds_alloc(struct hs_vk const * const hs, VkDescriptorPool desc_pool)
{
VkDescriptorSetAllocateInfo const ds_alloc_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.pNext = NULL,
.descriptorPool = desc_pool,
.descriptorSetCount = 1,
.pSetLayouts = &hs->desc_set.layout.vout_vin
};
VkDescriptorSet hs_ds;
vk(AllocateDescriptorSets(hs->device,
&ds_alloc_info,
&hs_ds));
return hs_ds;
}
//
//
//
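//
// Illustrative example (config values assumed, not from any particular
// target): with slab_keys = 256, block.slabs = 16 and fm.scale_min = 0,
// a count of 8400 keys yields:
//
//   slabs_ru         = 33
//   count_padded_in  = 33 * 256 = 8448
//   count_padded_out = 36 * 256 = 9216  (room for the first flip merge)
//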
void
hs_vk_pad(struct hs_vk const * const hs,
uint32_t const count,
uint32_t * const count_padded_in,
uint32_t * const count_padded_out)
{
//
// round up the count to slabs
//
uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys;
uint32_t const blocks = slabs_ru / hs->config.block.slabs;
uint32_t const block_slabs = blocks * hs->config.block.slabs;
uint32_t const slabs_ru_rem = slabs_ru - block_slabs;
uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs);
*count_padded_in = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys;
*count_padded_out = *count_padded_in;
//
// will merging be required?
//
if (slabs_ru > hs->config.block.slabs)
{
// more than one block
uint32_t const blocks_lo = pow2_rd_u32(blocks);
uint32_t const block_slabs_lo = blocks_lo * hs->config.block.slabs;
uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;
if (block_slabs_rem > 0)
{
uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem);
uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru,
blocks_lo << (1 - hs->config.merge.fm.scale_min));
uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi,
block_slabs_lo*2); // clamp non-pow2 blocks
*count_padded_out = block_slabs_padded_out * hs->slab_keys;
}
}
}
//
//
//