/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*
*/
//
//
//
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <getopt.h>
//
//
//
#include "networks.h"
#include "macros.h"
#include "util.h"
//
//
//
#define HSG_INDENT 2
//
//
//
#undef HSG_OP_EXPAND_X
#define HSG_OP_EXPAND_X(t) #t ,
static
char const * const
hsg_op_type_string[] =
{
HSG_OP_EXPAND_ALL()
};
//
//
//
#define EXIT() (struct hsg_op){ HSG_OP_TYPE_EXIT }
#define END() (struct hsg_op){ HSG_OP_TYPE_END }
#define BEGIN() (struct hsg_op){ HSG_OP_TYPE_BEGIN }
#define ELSE() (struct hsg_op){ HSG_OP_TYPE_ELSE }
#define STORE_SLAB_EARLY_EXIT() (struct hsg_op){ HSG_OP_TYPE_STORE_SLAB_EARLY_EXIT }
#define FILE_HEADER() (struct hsg_op){ HSG_OP_TYPE_FILE_HEADER }
#define FILE_FOOTER() (struct hsg_op){ HSG_OP_TYPE_FILE_FOOTER }
#define TRANSPOSE_KERNEL_PROTO() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO }
#define TRANSPOSE_KERNEL_PREAMBLE() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE }
#define TRANSPOSE_KERNEL_BODY() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY }
#define BS_KERNEL_PROTO(i) (struct hsg_op){ HSG_OP_TYPE_BS_KERNEL_PROTO, { i } }
#define BS_KERNEL_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BS_KERNEL_PREAMBLE, { i } }
#define BC_KERNEL_PROTO(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PROTO, { i } }
#define BC_KERNEL_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PREAMBLE, { i } }
#define FM_KERNEL_PROTO(l,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { l, s } }
#define FM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { w, s } }
#define HM_KERNEL_PROTO(d,w) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { d, w } }
#define HM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { w, s } }
#define BX_REG_GLOBAL_LOAD(n,v) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_LOAD, { n, v } }
#define BX_REG_GLOBAL_STORE(n) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_STORE, { n } }
#define FM_REG_GLOBAL_LOAD_LEFT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT, { n, i } }
#define FM_REG_GLOBAL_STORE_LEFT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT, { n, i } }
#define FM_REG_GLOBAL_LOAD_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT, { n, i } }
#define FM_REG_GLOBAL_STORE_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT, { n, i } }
#define HM_REG_GLOBAL_LOAD(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_LOAD, { n, i } }
#define HM_REG_GLOBAL_STORE(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_STORE, { n, i } }
#define WARP_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_WARP_FLIP, { f } }
#define WARP_HALF(h) (struct hsg_op){ HSG_OP_TYPE_WARP_HALF, { h } }
#define CMP_FLIP(a,b,c) (struct hsg_op){ HSG_OP_TYPE_CMP_FLIP, { a, b, c } }
#define CMP_HALF(a,b) (struct hsg_op){ HSG_OP_TYPE_CMP_HALF, { a, b } }
#define CMP_XCHG(a,b,p) (struct hsg_op){ HSG_OP_TYPE_CMP_XCHG, { a, b, p } }
#define BS_REG_SHARED_STORE_V(m,i,r) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_STORE_V, { m, i, r } }
#define BS_REG_SHARED_LOAD_V(m,i,r) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_LOAD_V, { m, i, r } }
#define BC_REG_SHARED_LOAD_V(m,i,r) (struct hsg_op){ HSG_OP_TYPE_BC_REG_SHARED_LOAD_V, { m, i, r } }
#define BX_REG_SHARED_STORE_LEFT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT, { r, i, p } }
#define BS_REG_SHARED_STORE_RIGHT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT, { r, i, p } }
#define BS_REG_SHARED_LOAD_LEFT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT, { r, i, p } }
#define BS_REG_SHARED_LOAD_RIGHT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT, { r, i, p } }
#define BC_REG_GLOBAL_LOAD_LEFT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT, { r, i, p } }
#define REG_F_PREAMBLE(s) (struct hsg_op){ HSG_OP_TYPE_REG_F_PREAMBLE, { s } }
#define REG_SHARED_STORE_F(r,i,s) (struct hsg_op){ HSG_OP_TYPE_REG_SHARED_STORE_F, { r, i, s } }
#define REG_SHARED_LOAD_F(r,i,s) (struct hsg_op){ HSG_OP_TYPE_REG_SHARED_LOAD_F, { r, i, s } }
#define REG_GLOBAL_STORE_F(r,i,s) (struct hsg_op){ HSG_OP_TYPE_REG_GLOBAL_STORE_F, { r, i, s } }
#define BLOCK_SYNC() (struct hsg_op){ HSG_OP_TYPE_BLOCK_SYNC }
#define BS_FRAC_PRED(m,w) (struct hsg_op){ HSG_OP_TYPE_BS_FRAC_PRED, { m, w } }
#define BS_MERGE_H_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BS_MERGE_H_PREAMBLE, { i } }
#define BC_MERGE_H_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BC_MERGE_H_PREAMBLE, { i } }
#define BX_MERGE_H_PRED(p) (struct hsg_op){ HSG_OP_TYPE_BX_MERGE_H_PRED, { p } }
#define BS_ACTIVE_PRED(m,l) (struct hsg_op){ HSG_OP_TYPE_BS_ACTIVE_PRED, { m, l } }
#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } }
//
// DEFAULTS
//
struct hsg_config hsg_config = // FIXME -- how useful is this?
{
.merge = {
.flip = {
.lo = 1,
.hi = 1
},
.half = {
.lo = 1,
.hi = 1
},
.max_log2 = 27 // 2^27 = 128M keys
},
.block = {
.warps_min = 1, // min warps for a block that uses smem barriers
.warps_max = UINT32_MAX, // max warps for the entire multiprocessor
.warps_mod = 2, // the number of warps necessary to load balance horizontal merging
.smem_min = 0,
.smem_quantum = 1,
.smem_bs = 49152,
.smem_bc = UINT32_MAX // implies field not set
},
.warp = {
.lanes = 32,
},
.thread = {
.regs = 24,
.xtra = 0
},
.type = {
.words = 2
}
};
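//
// Worked example (illustrative only, assuming the defaults above and
// the 32-warp hsg_merge[0] initialized in main()):
//
//   key size       = words * 4      = 2 * 4   = 8 bytes (64-bit keys)
//   keys per slab  = lanes * regs   = 32 * 24 = 768 keys per warp
//   keys per block = 32 slabs * 768 = 24576 keys
//   smem_bs keys   = 49152 / 8      = 6144 keys of shared storage
//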
//
// ZERO HSG_MERGE STRUCT
//
struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2] = { 0 };
//
//
//
static const hsg_target_pfn hsg_target_pfns[] =
{
hsg_target_debug,
hsg_target_cuda_sm3x,
hsg_target_igp_genx,
// hsg_target_adreno_5xx,
// hsg_target_amd_gcn,
// hsg_target_x86_sse,
// hsg_target_x86_avx2,
};
static const char * hsg_target_pfn_string[] =
{
"hs_debug",
"hs_cuda",
"hs_cl"
};
static const char * hsg_file_type_string[][2] =
{
{ ".h", ".txt" },
{ ".h", ".cu" },
{ ".h", ".cl" }
};
//
//
//
#define HSG_TARGET_PFN_COUNT ARRAY_LENGTH(hsg_target_pfns)
//
//
//
static uint32_t hsg_op_type_counts[HSG_OP_TYPE_COUNT] = { 0 };
//
//
//
static
void
hsg_op_debug(void)
{
for (hsg_op_type t=HSG_OP_TYPE_EXIT; t<HSG_OP_TYPE_COUNT; t++)
fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],hsg_op_type_counts[t]);
}
//
//
//
static
void
hsg_config_init_shared(void)
{
//
// The assumption here is that a proper smem_bs value was provided
// that represents the maximum fraction of the multiprocessor's
// available shared memory that can be accessed by the initial block
// sorting kernel.
//
// With CUDA devices this is 48KB out of 48KB, 64KB or 96KB.
//
// Intel subslices are a little trickier: on pre-Skylake IGPs the
// minimum allocation is 4KB and the maximum is 64KB, allocated in
// 1KB increments.  If a maximum of two block sorters can occupy a
// subslice then each should be assigned 32KB of shared memory.
//
// News Flash: apparently GEN9+ IGPs can allocate 1KB of SMEM per
// workgroup so all the previously written logic to support this
// issue is being removed.
//
uint32_t const bs_keys = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t));
hsg_config.warp.skpw_bs = bs_keys / hsg_merge[0].warps;
}
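//
// Worked example (illustrative only, same defaults as above):
//
//   bs_keys = 49152 / (2 * 4) = 6144 shared keys for the block sorter
//   skpw_bs = 6144  / 32      = 192  shared keys per warp
//
// The actual values depend on the -s/-t/-b options since this runs
// after option processing.
//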
static
void
hsg_merge_levels_init_shared(struct hsg_merge * const merge)
{
{
//
// What is the max amount of shared in each possible bs block config?
//
// The provided smem_bs size will be allocated for each sorting block.
//
uint32_t const bs_threads = merge->warps * hsg_config.warp.lanes;
uint32_t const bs_keys = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t));
uint32_t const bs_kpt = bs_keys / bs_threads;
uint32_t const bs_kpt_mod = (bs_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod;
uint32_t const bs_rows_even = bs_kpt_mod & ~1; // must be even because flip merge only works on row pairs
// this is a showstopper
if (bs_rows_even < 2)
{
fprintf(stderr,"Error: need at least 2 rows of shared memory.\n");
exit(-1);
}
// clamp to number of registers
merge->rows_bs = min(bs_rows_even, hsg_config.thread.regs);
}
//
// smem key allocation rule for BC kernels is that a single block
// can't allocate more than smem_bs and must allocate at least
// smem_min in smem_quantum steps.
//
// Note that BC blocks will always be less than or equal to BS
// blocks.
//
{
//
// if merge->warps is not pow2 then we're going to skip creating a bc elsewhere
//
uint32_t const bc_warps_min = max(merge->warps,hsg_config.block.warps_min);
uint32_t const bc_threads = bc_warps_min * hsg_config.warp.lanes;
uint32_t const bc_block_rd = (((hsg_config.block.smem_bc * bc_warps_min) / hsg_config.block.warps_max) /
hsg_config.block.smem_quantum) * hsg_config.block.smem_quantum;
uint32_t const bc_block_max = max(bc_block_rd,hsg_config.block.smem_min);
uint32_t const bc_block_smem = min(bc_block_max,hsg_config.block.smem_bs);
// what is the max amount of shared in each possible bc block config?
uint32_t const bc_keys = bc_block_smem / (hsg_config.type.words * sizeof(uint32_t));
uint32_t const bc_kpt = bc_keys / bc_threads;
uint32_t const bc_kpt_mod = (bc_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod;
merge->rows_bc = min(bc_kpt_mod, hsg_config.thread.regs);
merge->skpw_bc = bc_keys / bc_warps_min;
}
}
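//
// Worked example (illustrative only, assuming a 32-warp merge[0] and
// an invocation like "-b 32 -s 49152" so that warps_max == 32 and
// smem_bc == smem_bs == 49152):
//
//   BS: bs_threads    = 32 * 32           = 1024
//       bs_kpt        = 6144 / 1024       = 6
//       bs_rows_even  = ((6/2)*2) & ~1    = 6
//       rows_bs       = min(6,24)         = 6 smem rows per pass
//
//   BC: bc_warps_min  = max(32,1)         = 32
//       bc_block_smem = 49152 (rounded to the smem quantum, clamped
//                       between smem_min and smem_bs)
//       bc_kpt_mod    = ((6144/1024)/2)*2 = 6
//       rows_bc       = min(6,24)         = 6 smem rows per pass
//       skpw_bc       = 6144 / 32         = 192 shared keys per warp
//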
//
//
//
static
void
hsg_merge_levels_init_1(struct hsg_merge * const merge, uint32_t const warps, uint32_t const level, uint32_t const offset)
{
uint32_t const even_odd = warps & 1;
merge->levels[level].evenodds[even_odd]++;
merge->levels[level].networks[even_odd] = warps;
if (warps == 1)
return;
merge->levels[level].active.b64 |= BITS_TO_MASK_AT_64(warps,offset);
uint32_t const count = merge->levels[level].count++;
uint32_t const index = (1 << level) + count;
uint32_t const bit = 1 << count;
merge->levels[level].evenodd_masks[even_odd] |= bit;
if (count > 0)
{
// offset from network to left of this network
uint32_t const diff = offset - merge->offsets[index-1];
uint32_t const diff_0 = merge->levels[level].diffs[0];
uint32_t const diff_1 = merge->levels[level].diffs[1];
uint32_t diff_idx = UINT32_MAX;
if ((diff_0 == 0) || (diff_0 == diff)) {
diff_idx = 0;
} else if ((diff_1 == 0) || (diff_1 == diff)) {
diff_idx = 1;
} else {
fprintf(stderr, "*** MORE THAN TWO DIFFS ***\n");
exit(-1);
}
merge->levels[level].diffs [diff_idx] = diff;
merge->levels[level].diff_masks[diff_idx] |= 1 << (count-1);
}
merge->networks[index] = warps;
merge->offsets [index] = offset;
uint32_t const l = (warps+1)/2; // lower/larger on left
uint32_t const r = (warps+0)/2; // higher/smaller on right
hsg_merge_levels_init_1(merge,l,level+1,offset);
hsg_merge_levels_init_1(merge,r,level+1,offset+l);
}
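//
// Example of the recursive left/right split (illustrative only): a
// 32-warp merge[0] splits 32 -> 16+16 -> 8+8 -> ... down to 1-warp
// networks.  An odd network, say 5 warps, splits with the larger
// half on the left: l = (5+1)/2 = 3 and r = 5/2 = 2.
//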
static
void
hsg_merge_levels_debug(struct hsg_merge * const merge)
{
for (uint32_t level=0; level<MERGE_LEVELS_MAX_LOG2; level++)
{
uint32_t count = merge->levels[level].count;
if (count == 0)
break;
fprintf(stderr,
"%-4u : %016llX \n",
count,
merge->levels[level].active.b64);
fprintf(stderr,
"%-4u : %08X (%2u)\n"
"%-4u : %08X (%2u)\n",
merge->levels[level].diffs[0],
merge->levels[level].diff_masks[0],
__popcnt(merge->levels[level].diff_masks[0]),
merge->levels[level].diffs[1],
merge->levels[level].diff_masks[1],
__popcnt(merge->levels[level].diff_masks[1]));
fprintf(stderr,
"EVEN : %08X (%2u)\n"
"ODD : %08X (%2u)\n",
merge->levels[level].evenodd_masks[0],
__popcnt(merge->levels[level].evenodd_masks[0]),
merge->levels[level].evenodd_masks[1],
__popcnt(merge->levels[level].evenodd_masks[1]));
for (uint32_t ii=0; ii<2; ii++)
{
if (merge->levels[level].networks[ii] > 1)
{
fprintf(stderr,
"%-4s : ( %2u x %2u )\n",
(ii == 0) ? "EVEN" : "ODD",
merge->levels[level].evenodds[ii],
merge->levels[level].networks[ii]);
}
}
uint32_t index = 1 << level;
while (count-- > 0)
{
fprintf(stderr,
"[ %2u %2u ] ",
merge->offsets [index],
merge->networks[index]);
index += 1;
}
fprintf(stderr,"\n\n");
}
}
static
void
hsg_merge_levels_hint(struct hsg_merge * const merge, bool const autotune)
{
// clamp against merge levels
for (uint32_t level=0; level<MERGE_LEVELS_MAX_LOG2; level++)
{
// max network
uint32_t const n_max = max(merge->levels[level].networks[0],
merge->levels[level].networks[1]);
if (n_max <= (merge->rows_bs + hsg_config.thread.xtra))
break;
if (autotune)
{
hsg_config.thread.xtra = n_max - merge->rows_bs;
uint32_t const r_total = hsg_config.thread.regs + hsg_config.thread.xtra;
uint32_t const r_limit = (hsg_config.type.words == 1) ? 120 : 58;
if (r_total <= r_limit)
{
fprintf(stderr,"autotune: %u + %u\n",
hsg_config.thread.regs,
hsg_config.thread.xtra);
break;
}
else
{
fprintf(stderr,"skipping autotune: %u + %u > %u\n",
hsg_config.thread.regs,
hsg_config.thread.xtra,
r_limit);
exit(-1);
}
}
fprintf(stderr,"*** HINT *** Try extra registers: %u\n",
n_max - merge->rows_bs);
exit(-1);
}
}
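//
// Worked example of the hint/autotune path (illustrative only, using
// the earlier example values rows_bs = 6 and a 32-warp merge[0] with
// 64-bit keys): n_max = 32 > 6 + 0, so either the hint "try 26 extra
// registers" is printed, or with -z autotune sets xtra = 26, which
// passes the register check since 24 + 26 = 50 <= 58.
//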
//
//
//
static
struct hsg_op *
hsg_op(struct hsg_op * ops, struct hsg_op const opcode)
{
hsg_op_type_counts[opcode.type] += 1;
*ops = opcode;
return ops+1;
}
static
struct hsg_op *
hsg_exit(struct hsg_op * ops)
{
return hsg_op(ops,EXIT());
}
static
struct hsg_op *
hsg_end(struct hsg_op * ops)
{
return hsg_op(ops,END());
}
static
struct hsg_op *
hsg_begin(struct hsg_op * ops)
{
return hsg_op(ops,BEGIN());
}
static
struct hsg_op *
hsg_else(struct hsg_op * ops)
{
return hsg_op(ops,ELSE());
}
static
struct hsg_op *
hsg_network_copy(struct hsg_op * ops,
struct hsg_network const * const nets,
uint32_t const idx,
uint32_t const prefix)
{
uint32_t const len = nets[idx].length;
struct hsg_op const * const cxa = nets[idx].network;
for (uint32_t ii=0; ii<len; ii++)
{
const struct hsg_op * const cx = cxa + ii;
ops = hsg_op(ops,CMP_XCHG(cx->a,cx->b,prefix));
}
return ops;
}
static
struct hsg_op *
hsg_thread_sort(struct hsg_op * ops)
{
uint32_t const idx = hsg_config.thread.regs / 2 - 1;
return hsg_network_copy(ops,hsg_networks_sorting,idx,UINT32_MAX);
}
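//
// With the default regs = 24 this selects hsg_networks_sorting[11].
// The regs/2-1 indexing presumably keys the table by half the
// network size (i.e. index 11 holds the 24-key sorting network), but
// that depends on how networks.h lays out the table.
//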
static
struct hsg_op *
hsg_thread_merge_prefix(struct hsg_op * ops, uint32_t const network, uint32_t const prefix)
{
if (network <= 1)
return ops;
return hsg_network_copy(ops,hsg_networks_merging,network-2,prefix);
}
static
struct hsg_op *
hsg_thread_merge(struct hsg_op * ops, uint32_t const network)
{
return hsg_thread_merge_prefix(ops,network,UINT32_MAX);
}
static
struct hsg_op *
hsg_thread_merge_offset_prefix(struct hsg_op * ops, uint32_t const offset, uint32_t const network, uint32_t const prefix)
{
if (network <= 1)
return ops;
uint32_t const idx = network - 2;
uint32_t const len = hsg_networks_merging[idx].length;
struct hsg_op const * const cxa = hsg_networks_merging[idx].network;
for (uint32_t ii=0; ii<len; ii++)
{
struct hsg_op const * const cx = cxa + ii;
ops = hsg_op(ops,CMP_XCHG(offset + cx->a,offset + cx->b,prefix));
}
return ops;
}
static
struct hsg_op *
hsg_thread_merge_offset(struct hsg_op * ops, uint32_t const offset, uint32_t const network)
{
return hsg_thread_merge_offset_prefix(ops,offset,network,UINT32_MAX);
}
static
struct hsg_op *
hsg_thread_merge_left_right_prefix(struct hsg_op * ops, uint32_t const left, uint32_t const right, uint32_t const prefix)
{
for (uint32_t l=left,r=left+1; r<=left+right; l--,r++)
{
ops = hsg_op(ops,CMP_XCHG(l,r,prefix));
}
return ops;
}
static
struct hsg_op *
hsg_thread_merge_left_right(struct hsg_op * ops, uint32_t const left, uint32_t const right)
{
return hsg_thread_merge_left_right_prefix(ops,left,right,UINT32_MAX);
}
static
struct hsg_op *
hsg_warp_half_network(struct hsg_op * ops)
{
uint32_t const n = hsg_config.thread.regs;
for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,CMP_HALF(r-1,r));
return ops;
}
static
struct hsg_op *
hsg_warp_half_downto(struct hsg_op * ops, uint32_t h)
{
//
// *** from h: downto[f/2,1)
// **** lane_half(h)
//
for (; h > 1; h/=2)
{
ops = hsg_begin(ops);
ops = hsg_op(ops,WARP_HALF(h));
ops = hsg_warp_half_network(ops);
ops = hsg_end(ops);
}
return ops;
}
static
struct hsg_op *
hsg_warp_flip_network(struct hsg_op * ops)
{
uint32_t const n = hsg_config.thread.regs;
for (uint32_t r=1; r<=n/2; r++)
ops = hsg_op(ops,CMP_FLIP(r-1,r,n+1-r));
return ops;
}
static
struct hsg_op *
hsg_warp_flip(struct hsg_op * ops, uint32_t f)
{
ops = hsg_begin(ops);
ops = hsg_op(ops,WARP_FLIP(f));
ops = hsg_warp_flip_network(ops);
ops = hsg_end(ops);
return ops;
}
static
struct hsg_op *
hsg_bx_warp_load(struct hsg_op * ops, const int32_t vin_or_vout)
{
uint32_t const n = hsg_config.thread.regs;
for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,BX_REG_GLOBAL_LOAD(r,vin_or_vout));
return ops;
}
static
struct hsg_op *
hsg_bx_warp_store(struct hsg_op * ops)
{
uint32_t const n = hsg_config.thread.regs;
for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,BX_REG_GLOBAL_STORE(r));
return ops;
}
//
//
//
static
struct hsg_op *
hsg_warp_transpose(struct hsg_op * ops)
{
// func proto
ops = hsg_op(ops,TRANSPOSE_KERNEL_PROTO());
// begin
ops = hsg_begin(ops);
// preamble
ops = hsg_op(ops,TRANSPOSE_KERNEL_PREAMBLE());
// load
ops = hsg_bx_warp_load(ops,1); // 1 = load from vout[]
// emit transpose blend and remap macros ...
ops = hsg_op(ops,TRANSPOSE_KERNEL_BODY());
// ... done!
ops = hsg_end(ops);
return ops;
}
//
//
//
static
struct hsg_op *
hsg_warp_half(struct hsg_op * ops, uint32_t const h)
{
//
// *** from h: downto[f/2,1)
// **** lane_half(h)
// *** thread_merge
//
ops = hsg_warp_half_downto(ops,h);
ops = hsg_thread_merge(ops,hsg_config.thread.regs);
return ops;
}
static
struct hsg_op *
hsg_warp_merge(struct hsg_op * ops)
{
//
// * from f: upto[2,warp.lanes]
// ** lane_flip(f)
// *** from h: downto[f/2,1)
// **** lane_half(h)
// *** thread_merge
//
uint32_t const level = hsg_config.warp.lanes;
for (uint32_t f=2; f<=level; f*=2)
{
ops = hsg_warp_flip(ops,f);
ops = hsg_warp_half(ops,f/2);
}
return ops;
}
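//
// Example schedule for the default 32-lane warp (illustrative only):
//
//   f =  2 : flip(2)                   + thread_merge(24)
//   f =  4 : flip(4)  half(2)          + thread_merge(24)
//   f =  8 : flip(8)  half(4,2)        + thread_merge(24)
//   f = 16 : flip(16) half(8,4,2)      + thread_merge(24)
//   f = 32 : flip(32) half(16,8,4,2)   + thread_merge(24)
//
// i.e. a bitonic-style merge that doubles the sorted span per slab
// until it covers all 32 lanes.
//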
//
//
//
static
struct hsg_op *
hsg_bc_half_merge_level(struct hsg_op * ops,
struct hsg_merge const * const merge,
uint32_t const r_lo,
uint32_t const s_count)
{
// guaranteed to be an even network
uint32_t const net_even = merge->levels[0].networks[0];
// min of warps in block and remaining horizontal rows
uint32_t const active = min(s_count, net_even);
// conditional on blockIdx.x
if (active < merge->warps)
ops = hsg_op(ops,BX_MERGE_H_PRED(active)); // FIXME BX_MERGE
// body begin
ops = hsg_begin(ops);
// scale for min block
uint32_t const scale = net_even >= hsg_config.block.warps_min ? 1 : hsg_config.block.warps_min / net_even;
// loop if more smem rows than warps
for (uint32_t rr=0; rr<s_count; rr+=active)
{
// body begin
ops = hsg_begin(ops);
// skip down slab
uint32_t const gmem_base = r_lo - 1 + rr;
// load registers horizontally -- striding across slabs
for (uint32_t ll=1; ll<=net_even; ll++)
ops = hsg_op(ops,BC_REG_GLOBAL_LOAD_LEFT(ll,gmem_base+(ll-1)*hsg_config.thread.regs,0));
// merge all registers
ops = hsg_thread_merge_prefix(ops,net_even,0);
// if we're looping then there is a base
uint32_t const smem_base = rr * net_even * scale;
// store all registers
for (uint32_t ll=1; ll<=net_even; ll++)
ops = hsg_op(ops,BX_REG_SHARED_STORE_LEFT(ll,smem_base+ll-1,0));
// body end
ops = hsg_end(ops);
}
// body end
ops = hsg_end(ops);
return ops;
}
static
struct hsg_op *
hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
{
//
// will only be called with merge->warps >= 2
//
uint32_t const warps = max(merge->warps,hsg_config.block.warps_min);
// guaranteed to be an even network
uint32_t const net_even = merge->levels[0].networks[0];
// set up left SMEM pointer
ops = hsg_op(ops,BC_MERGE_H_PREAMBLE(merge->index));
// trim to number of warps in block -- FIXME -- try to make this a
// multiple of the local processor count (Intel = 8, NVIDIA = 4)
uint32_t const s_max = merge->rows_bc;
// for all the registers
for (uint32_t r_lo = 1; r_lo <= hsg_config.thread.regs; r_lo += s_max)
{
// compute store count
uint32_t const r_rem = hsg_config.thread.regs + 1 - r_lo;
uint32_t const s_count = min(s_max,r_rem);
// block sync -- can skip if first
if (r_lo > 1)
ops = hsg_op(ops,BLOCK_SYNC());
// merge loop
ops = hsg_bc_half_merge_level(ops,merge,r_lo,s_count);
// block sync
ops = hsg_op(ops,BLOCK_SYNC());
// load rows from shared
for (uint32_t c=0; c<s_count; c++)
ops = hsg_op(ops,BC_REG_SHARED_LOAD_V(warps,r_lo+c,c));
}
return ops;
}
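//
// Worked example (illustrative only, assuming regs = 24 and the
// earlier rows_bc = 6): r_lo steps through 1,7,13,19, so the slab's
// 24 register rows are merged through shared memory in four passes
// of 6 rows each.
//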
//
//
//
static
struct hsg_op *
hsg_bs_flip_merge_level(struct hsg_op * ops,
struct hsg_merge const * const merge,
uint32_t const level,
uint32_t const s_pairs)
{
//
// Note there are a number of ways to flip merge these warps.  The
// merge structure carries a magic number that indicates which warp
// to activate as well as what network size to invoke.
//
// That more complex scheme was used in the past.
//
// The newest scheme is far simpler and just directs a warp to
// gather up the network associated with a row and merge them.
//
// This scheme may use more registers per thread, but not every
// compiler handles the more complex scheme well.
//
// If there are more warps than smem row pairs to merge then we
// disable the spare warps.
//
// If there are more row pairs than warps then each warp works on
// an equal number of rows.
//
// Note that it takes two warps to flip merge two smem rows.
//
// FIXME -- We may want to apply the warp smem "mod" value here to
// attempt to balance the load>merge>store operations across the
// multiprocessor cores.
//
// FIXME -- the old scheme attempted to keep all the warps active
// but the iteration logic was more complex. See 2016 checkins.
//
// where are we in computed merge?
uint32_t const count = merge->levels[level].count;
uint32_t const index = 1 << level;
uint32_t s_rows = s_pairs * 2;
uint32_t base = 0;
while (s_rows > 0)
{
uint32_t active = merge->warps;
// disable warps if necessary
if (merge->warps > s_rows) {
active = s_rows;
ops = hsg_op(ops,BX_MERGE_H_PRED(active));
}
// body begin
ops = hsg_begin(ops);
// how many rows does each active warp merge?
uint32_t loops = s_rows / active;
// decrement
s_rows -= loops * active;
for (uint32_t ss=0; ss<loops; ss++)
{
// load all registers
for (uint32_t ii=0; ii<count; ii++)
{
// body begin
ops = hsg_begin(ops);
uint32_t const offset = merge->offsets [index+ii];
uint32_t const network = merge->networks[index+ii];
uint32_t const lo = (network + 1) / 2;
for (uint32_t ll=1; ll<=lo; ll++)
ops = hsg_op(ops,BS_REG_SHARED_LOAD_LEFT(ll,base+offset+ll-1,ii));
for (uint32_t rr=lo+1; rr<=network; rr++)
ops = hsg_op(ops,BS_REG_SHARED_LOAD_RIGHT(rr,base+offset+rr-1,ii));
// compare left and right
ops = hsg_thread_merge_left_right_prefix(ops,lo,network-lo,ii);
// right merging network
ops = hsg_thread_merge_offset_prefix(ops,lo,network-lo,ii);
// left merging network
ops = hsg_thread_merge_prefix(ops,lo,ii);
for (uint32_t ll=1; ll<=lo; ll++)
ops = hsg_op(ops,BX_REG_SHARED_STORE_LEFT(ll,base+offset+ll-1,ii));
for (uint32_t rr=lo+1; rr<=network; rr++)
ops = hsg_op(ops,BS_REG_SHARED_STORE_RIGHT(rr,base+offset+rr-1,ii));
// body end
ops = hsg_end(ops);
}
base += active * merge->warps;
}
// body end
ops = hsg_end(ops);
}
return ops;
}
static
struct hsg_op *
hsg_bs_flip_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
{
// set up horizontal smem pointer
ops = hsg_op(ops,BS_MERGE_H_PREAMBLE(merge->index));
// begin merge
uint32_t level = MERGE_LEVELS_MAX_LOG2;
while (level-- > 0)
{
uint32_t const count = merge->levels[level].count;
if (count == 0)
continue;
uint32_t const r_mid = hsg_config.thread.regs/2 + 1;
uint32_t const s_pairs_max = merge->rows_bs/2; // this is warp mod
// for all the registers
for (uint32_t r_lo=1; r_lo<r_mid; r_lo+=s_pairs_max)
{
uint32_t r_hi = hsg_config.thread.regs + 1 - r_lo;
// compute store count
uint32_t const s_pairs = min(s_pairs_max,r_mid - r_lo);
// store rows to shared
for (uint32_t c=0; c<s_pairs; c++)
{
ops = hsg_op(ops,BS_REG_SHARED_STORE_V(merge->index,r_lo+c,c*2+0));
ops = hsg_op(ops,BS_REG_SHARED_STORE_V(merge->index,r_hi-c,c*2+1));
}
// block sync
ops = hsg_op(ops,BLOCK_SYNC());
// merge loop
ops = hsg_bs_flip_merge_level(ops,merge,level,s_pairs);
// block sync
ops = hsg_op(ops,BLOCK_SYNC());
// load rows from shared
for (uint32_t c=0; c<s_pairs; c++)
{
ops = hsg_op(ops,BS_REG_SHARED_LOAD_V(merge->index,r_lo+c,c*2+0));
ops = hsg_op(ops,BS_REG_SHARED_LOAD_V(merge->index,r_hi-c,c*2+1));
}
}
// conditionally clean -- no-op if equal to number of warps/block
if (merge->levels[level].active.b64 != BITS_TO_MASK_64(merge->warps))
ops = hsg_op(ops,BS_ACTIVE_PRED(merge->index,level));
// clean warp
ops = hsg_begin(ops);
ops = hsg_warp_half(ops,hsg_config.warp.lanes);
ops = hsg_end(ops);
}
return ops;
}
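//
// Worked example (illustrative only, assuming regs = 24 and the
// earlier rows_bs = 6, so s_pairs_max = 3): r_mid = 13 and r_lo
// steps through 1,4,7,10.  Each pass stores 3 register pairs to
// shared -- (1,24),(2,23),(3,22) on the first pass, (4,21),(5,20),
// (6,19) on the second, and so on -- flip merges them, and loads
// them back, covering all 12 low/high pairs in four passes.
//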
/*
//
// DELETE ME WHEN READY
//
static
struct hsg_op *
hsg_bs_flip_merge_all(struct hsg_op * ops, const struct hsg_merge * const merge)
{
for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
{
const struct hsg_merge* const m = merge + merge_idx;
if (m->warps < 2)
break;
ops = hsg_op(ops,BS_FRAC_PRED(merge_idx,m->warps));
ops = hsg_begin(ops);
ops = hsg_bs_flip_merge(ops,m);
ops = hsg_end(ops);
}
return ops;
}
*/
//
// GENERATE SORT KERNEL
//
static
struct hsg_op *
hsg_bs_sort(struct hsg_op * ops, const struct hsg_merge * const merge)
{
// func proto
ops = hsg_op(ops,BS_KERNEL_PROTO(merge->index));
// begin
ops = hsg_begin(ops);
// shared declare
ops = hsg_op(ops,BS_KERNEL_PREAMBLE(merge->index));
// load
ops = hsg_bx_warp_load(ops,0); // 0 = load from vin[]
// thread sorting network
ops = hsg_thread_sort(ops);
// warp merging network
ops = hsg_warp_merge(ops);
// slab merging network
if (merge->warps > 1)
ops = hsg_bs_flip_merge(ops,merge);
// store
ops = hsg_bx_warp_store(ops);
// end
ops = hsg_end(ops);
return ops;
}
//
// GENERATE SORT KERNELS
//
static
struct hsg_op *
hsg_bs_sort_all(struct hsg_op * ops)
{
for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
{
const struct hsg_merge* const m = hsg_merge + merge_idx;
if (m->warps == 0)
break;
ops = hsg_bs_sort(ops,m);
}
return ops;
}
//
// GENERATE CLEAN KERNEL FOR A POWER-OF-TWO
//
static
struct hsg_op *
hsg_bc_clean(struct hsg_op * ops, const struct hsg_merge * const merge)
{
// func proto
ops = hsg_op(ops,BC_KERNEL_PROTO(merge->index));
// begin
ops = hsg_begin(ops);
// shared declare
ops = hsg_op(ops,BC_KERNEL_PREAMBLE(merge->index));
// if warps == 1 then smem isn't used for merging
if (merge->warps == 1)
{
// load slab directly
ops = hsg_bx_warp_load(ops,1); // load from vout[]
}
else
{
// block merging network -- strided load of slabs
ops = hsg_bc_half_merge(ops,merge);
}
// clean warp
ops = hsg_begin(ops);
ops = hsg_warp_half(ops,hsg_config.warp.lanes);
ops = hsg_end(ops);
// store
ops = hsg_bx_warp_store(ops);
// end
ops = hsg_end(ops);
return ops;
}
//
// GENERATE CLEAN KERNELS
//
static
struct hsg_op *
hsg_bc_clean_all(struct hsg_op * ops)
{
for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
{
const struct hsg_merge* const m = hsg_merge + merge_idx;
if (m->warps == 0)
break;
// only generate pow2 clean kernels less than or equal to the max
// warps in a block -- if a wider sort kernel had been feasible we
// would've generated one, so a wider clean kernel isn't needed
if (!is_pow2_u32(m->warps))
continue;
ops = hsg_bc_clean(ops,m);
}
return ops;
}
//
// GENERATE FLIP MERGE KERNEL
//
static
struct hsg_op *
hsg_fm_thread_load_left(struct hsg_op * ops, uint32_t const n)
{
uint32_t const mid = n/2;
for (uint32_t r=1; r<=mid; r++)
ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_LEFT(r,r-1));
return ops;
}
static
struct hsg_op *
hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n)
{
uint32_t const mid = n/2;
for (uint32_t r=mid; r>=1; r--)
ops = hsg_op(ops,FM_REG_GLOBAL_STORE_LEFT(r,r-1));
return ops;
}
static
struct hsg_op *
hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
{
uint32_t const mid = n / 2;
uint32_t const first = mid + 1;
uint32_t const last = mid + span_right;
for (uint32_t r=first; r<=last; r++)
ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,r-first));
return ops;
}
static
struct hsg_op *
hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
{
uint32_t const mid = n / 2;
uint32_t const first = mid + 1;
uint32_t const last = mid + span_right;
for (uint32_t r=last; r>=first; r--)
ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,r-first));
return ops;
}
static
struct hsg_op *
hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
{
// conditional
ops = hsg_op(ops,FM_MERGE_RIGHT_PRED(n/2,span_right));
// begin
ops = hsg_begin(ops);
// load
ops = hsg_fm_thread_load_right(ops,n,span_right);
// compare left and right
ops = hsg_thread_merge_left_right(ops,n/2,span_right);
// right merging network
ops = hsg_thread_merge_offset(ops,n/2,span_right);
// store
ops = hsg_fm_thread_store_right(ops,n,span_right);
// end
ops = hsg_end(ops);
return ops;
}
static
struct hsg_op *
hsg_fm_thread_merge_right_all(struct hsg_op * ops, uint32_t const span)
{
ops = hsg_fm_thread_merge_right(ops,span,span/2);
for (uint32_t span_pow2 = pow2_ru_u32(span) / 4; span_pow2 >= 1; span_pow2 /= 2)
{
ops = hsg_fm_thread_merge_right(ops,span,span_pow2);
}
return ops;
}
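//
// Example (illustrative only): for span = 64 -- e.g. a 32-warp
// merge[0] at flip level 1 -- the right-hand span sequence is 32,
// then pow2_ru(64)/4 = 16, 8, 4, 2, 1, each emitted as a predicated
// load/merge/store of that many right-side registers.
//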
static
struct hsg_op *
hsg_fm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const fm_scale)
{
// func proto
ops = hsg_op(ops,FM_KERNEL_PROTO(level,fm_scale));
// begin
ops = hsg_begin(ops);
// shared declare
ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span,fm_scale));
// load
ops = hsg_fm_thread_load_left(ops,span);
// right merging network
ops = hsg_fm_thread_merge_right_all(ops,span);
// left merging network
ops = hsg_thread_merge(ops,span/2);
// store
ops = hsg_fm_thread_store_left(ops,span);
// end
ops = hsg_end(ops);
return ops;
}
//
// GENERATE HALF MERGE KERNELS
//
static
struct hsg_op *
hsg_hm_thread_load(struct hsg_op * ops, uint32_t const n)
{
for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,HM_REG_GLOBAL_LOAD(r,r-1));
return ops;
}
static
struct hsg_op *
hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n)
{
for (uint32_t r=n; r>=1; r--)
ops = hsg_op(ops,HM_REG_GLOBAL_STORE(r,r-1));
return ops;
}
static
struct hsg_op *
hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const hm_scale)
{
// func proto
ops = hsg_op(ops,HM_KERNEL_PROTO(level,level-msb_idx_u32(span)));
// begin
ops = hsg_begin(ops);
// declarations
ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span,hm_scale));
// load
ops = hsg_hm_thread_load(ops,span);
// thread merging network
ops = hsg_thread_merge(ops,span);
// store
ops = hsg_hm_thread_store(ops,span);
// end
ops = hsg_end(ops);
return ops;
}
//
//
//
static
struct hsg_op *
hsg_fm_merge_level(struct hsg_op * ops, uint32_t const level)
{
uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps);
uint32_t const bc_max_log2 = msb_idx_u32(bc_max);
uint32_t const fm_level = (level <= bc_max_log2) ? hsg_config.merge.flip.lo : min(level - bc_max_log2,hsg_config.merge.flip.hi);
uint32_t const fm_scale = level - fm_level;
ops = hsg_fm_merge(ops,
level,
hsg_merge[0].warps * (1u << fm_level),
fm_scale);
return ops;
}
//
//
//
static
struct hsg_op *
hsg_hm_merge_level(struct hsg_op * ops, uint32_t const level)
{
uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps);
uint32_t const bc_max_log2 = msb_idx_u32(bc_max);
uint32_t const fm_log2_max = bc_max_log2 + hsg_config.merge.flip.hi;
if (level > fm_log2_max)
{
uint32_t const down_warps_log2 = level - fm_log2_max;
uint32_t const hm_level = max(hsg_config.merge.half.lo,min(hsg_config.merge.half.hi,down_warps_log2));
ops = hsg_hm_merge(ops,
level - hsg_config.merge.flip.hi,
bc_max * (1u << hm_level),
down_warps_log2 - hm_level);
}
return ops;
}
//
// GENERATE MERGE KERNELS
//
static
struct hsg_op *
hsg_xm_merge_all(struct hsg_op * ops)
{
uint32_t const keys_per_block = hsg_merge[0].warps * hsg_config.warp.lanes * hsg_config.thread.regs;
uint32_t const blocks = ((1U << hsg_config.merge.max_log2) + keys_per_block - 1) / keys_per_block;
uint32_t const blocks_ru = pow2_ru_u32(blocks);
uint32_t const blocks_log2 = msb_idx_u32(blocks_ru);
for (uint32_t level=1; level<=blocks_log2; level+=1)
{
//
// GENERATE FLIP MERGE KERNELS
//
ops = hsg_fm_merge_level(ops,level);
//
// GENERATE HALF MERGE KERNELS
//
ops = hsg_hm_merge_level(ops,level);
}
return ops;
}
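//
// Worked example (illustrative only, using the defaults: 32 warps,
// 32 lanes, 24 regs, max_log2 = 27):
//
//   keys_per_block = 32 * 32 * 24       = 24576
//   blocks         = ceil(2^27 / 24576) = 5462
//   blocks_ru      = pow2_ru(5462)      = 8192
//   blocks_log2    = 13
//
// so flip/half merge kernels are generated for levels 1..13.
//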
//
//
//
void
hsg_target_indent(struct hsg_file * const files, uint32_t const depth)
{
fprintf(files[HSG_FILE_TYPE_SOURCE].file,
"%*s",
depth*HSG_INDENT,"");
}
void
hsg_target_debug(struct hsg_file * const files,
const struct hsg_merge * const merge,
const struct hsg_op * const ops,
uint32_t const depth)
{
hsg_target_indent(files,depth);
fprintf(files[HSG_FILE_TYPE_SOURCE].file,
"%s\n",
hsg_op_type_string[ops->type]);
}
//
//
//
static
struct hsg_file*
hsg_files_open(const char * prefix, const char ** suffix)
{
#define STR_BUF_SIZE 80
struct hsg_file * files = malloc(sizeof(struct hsg_file) * HSG_FILE_TYPE_COUNT);
for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++)
{
char * name = files[ii].name;
// save prefix
files[ii].prefix = prefix;
// build filename
strcpy_s(name,STR_BUF_SIZE,prefix);
strcat_s(name,STR_BUF_SIZE,suffix[ii]);
// open file
fopen_s(&files[ii].file,name,"w+");
}
return files;
}
static
void
hsg_files_close(struct hsg_file * files)
{
for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++)
fclose(files[ii].file);
}
//
//
//
static
const struct hsg_op *
hsg_op_translate_depth(hsg_target_pfn target_pfn,
struct hsg_file * const files,
const struct hsg_merge * const merge,
const struct hsg_op * ops,
uint32_t const depth)
{
while (ops->type != HSG_OP_TYPE_EXIT)
{
switch (ops->type)
{
case HSG_OP_TYPE_END:
target_pfn(files,merge,ops,depth-1);
return ops + 1;
case HSG_OP_TYPE_BEGIN:
target_pfn(files,merge,ops,depth);
ops = hsg_op_translate_depth(target_pfn,files,merge,ops+1,depth+1);
break;
default:
target_pfn(files,merge,ops++,depth);
}
}
return ops;
}
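//
// Example of how the recursion maps BEGIN/END nesting onto the depth
// (and thus indentation) passed to the target backend -- a
// hypothetical op stream:
//
//   BS_KERNEL_PROTO      depth 0
//   BEGIN                depth 0
//     BS_KERNEL_PREAMBLE depth 1
//     BEGIN              depth 1
//       CMP_XCHG         depth 2
//     END                depth 1
//   END                  depth 0
//   EXIT                 (stops translation)
//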
static
void
hsg_op_translate(hsg_target_pfn target_pfn,
struct hsg_file * const files,
const struct hsg_merge * const merge,
const struct hsg_op * ops)
{
hsg_op_translate_depth(target_pfn,files,merge,ops,0);
}
//
//
//
int
main(int argc, char * argv[])
{
//
// INIT
//
for (uint32_t ii=0; ii<MERGE_LEVELS_MAX_LOG2; ii++)
{
hsg_merge[ii].index = ii;
hsg_merge[ii].warps = 32 / (1u << ii);
}
//
// PROCESS OPTIONS
//
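//
// Option summary (inferred from the switch below):
//
//   -h        print help and exit
//   -q        quiet -- suppress debug/inspection output
//   -a <n>    target architecture index into hsg_target_pfns[]
//   -g/-G     block smem_min / smem_quantum
//   -s/-S     smem_bs / smem_bc byte budgets (-s also seeds smem_bc)
//   -w <n>    lanes per warp
//   -b <n>    warps per sorting block (must be even, seeds warps_max)
//   -B <n>    max warps per multiprocessor
//   -m/-M     warps_min / warps_mod
//   -k <n>    log2 of max keys to merge
//   -r <n>    registers (keys) per thread (must be even)
//   -x <n>    extra registers per thread
//   -t <n>    key size in 32-bit words
//   -f/-F     flip merge lo/hi levels
//   -c/-C     half merge lo/hi levels
//   -z        enable autotune of extra registers
//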
int32_t arch = 0;
int32_t opt = 0;
bool quiet = false;
bool autotune = false;
while ((opt = getopt(argc,argv,"hqa:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF)
{
switch (opt)
{
case 'h':
fprintf(stderr,"Help goes here...\n");
return -1;
case 'q':
quiet = true;
break;
case 'a':
arch = atoi(optarg);
break;
case 'g':
hsg_config.block.smem_min = atoi(optarg);
break;
case 'G':
hsg_config.block.smem_quantum = atoi(optarg);
break;
case 's':
hsg_config.block.smem_bs = atoi(optarg);
// set smem_bc if not already set
if (hsg_config.block.smem_bc == UINT32_MAX)
hsg_config.block.smem_bc = hsg_config.block.smem_bs;
break;
case 'S':
hsg_config.block.smem_bc = atoi(optarg);
break;
case 'w':
hsg_config.warp.lanes = atoi(optarg);
break;
case 'b':
// maximum warps in a workgroup / cta / thread block
{
uint32_t const warps = atoi(optarg);
uint32_t const warps_ru_pow2 = pow2_ru_u32(warps);
// set warps_max if not already set
if (hsg_config.block.warps_max == UINT32_MAX)
hsg_config.block.warps_max = warps_ru_pow2;
// must always be even
if ((warps&1) != 0)
{
fprintf(stderr,"Error: -b must be even.\n");
exit(-1);
}
hsg_merge[0].warps = warps;
for (uint32_t ii=1; ii<MERGE_LEVELS_MAX_LOG2; ii++)
hsg_merge[ii].warps = warps_ru_pow2 / (1u << ii);
}
break;
case 'B':
// maximum warps that can fit in a multiprocessor
hsg_config.block.warps_max = atoi(optarg);
break;
case 'm':
// blocks using smem barriers must have at least this many warps
hsg_config.block.warps_min = atoi(optarg);
break;
case 'M':
// the number of warps necessary to load balance horizontal merging
hsg_config.block.warps_mod = atoi(optarg);
break;
case 'k':
hsg_config.merge.max_log2 = atoi(optarg);
break;
case 'r':
{
uint32_t const regs = atoi(optarg);
if ((regs&1) != 0)
{
fprintf(stderr,"Error: -r must be even.\n");
exit(-1);
}
hsg_config.thread.regs = regs;
}
break;
case 'x':
hsg_config.thread.xtra = atoi(optarg);
break;
case 't':
hsg_config.type.words = atoi(optarg);
break;
case 'f':
hsg_config.merge.flip.lo = atoi(optarg);
break;
case 'F':
hsg_config.merge.flip.hi = atoi(optarg);
break;
case 'c':
hsg_config.merge.half.lo = atoi(optarg);
break;
case 'C':
hsg_config.merge.half.hi = atoi(optarg);
break;
case 'z':
autotune = true;
break;
}
}
//
// WHICH ARCH TARGET?
//
if ((uint32_t)arch >= HSG_TARGET_PFN_COUNT)
arch = 0; // out-of-range arch falls back to the debug target
hsg_target_pfn hsg_target_pfn = hsg_target_pfns[arch];
//
// OPEN FILES
//
struct hsg_file * files = hsg_files_open(hsg_target_pfn_string[arch],hsg_file_type_string[arch]);
//
// INIT SHARED KEY ALLOCATION
//
hsg_config_init_shared();
//
// INIT MERGE MAGIC
//
for (uint32_t ii=0; ii<MERGE_LEVELS_MAX_LOG2; ii++)
{
struct hsg_merge * const merge = hsg_merge + ii;
if (merge->warps == 0)
break;
fprintf(stderr,">>> Generating: %1u %5u %5u %3u %3u ...\n",
hsg_config.type.words,
hsg_config.block.smem_bs,
hsg_config.block.smem_bc,
hsg_config.thread.regs,
merge->warps);
hsg_merge_levels_init_shared(merge);
hsg_merge_levels_init_1(merge,merge->warps,0,0);
hsg_merge_levels_hint(merge,autotune);
//
// THESE ARE FOR DEBUG/INSPECTION
//
if (!quiet)
{
hsg_merge_levels_debug(merge);
}
}
if (!quiet)
fprintf(stderr,"\n\n");
//
//
//
uint32_t const op_count = 1024*1024; // 2^20 ops for now!
struct hsg_op * const ops_begin = malloc(op_count * sizeof(*ops_begin));
struct hsg_op * ops = ops_begin;
//
// APPEND HEADER
//
ops = hsg_op(ops,FILE_HEADER());
//
// GENERATE TRANSPOSE KERNEL
//
ops = hsg_warp_transpose(ops);
//
// GENERATE SORT KERNEL
//
ops = hsg_bs_sort_all(ops);
//
// GENERATE CLEAN KERNELS
//
ops = hsg_bc_clean_all(ops);
//
// GENERATE MERGE KERNELS
//
ops = hsg_xm_merge_all(ops);
//
// APPEND FOOTER
//
ops = hsg_op(ops,FILE_FOOTER());
//
// ... WE'RE DONE!
//
ops = hsg_exit(ops);
//
// APPLY TARGET TRANSLATOR TO ACCUMULATED OPS
//
hsg_op_translate(hsg_target_pfn,files,hsg_merge,ops_begin);
//
//
//
if (!quiet)
hsg_op_debug();
//
//
//
hsg_files_close(files);
return 0;
}
//
//
//