/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"
// #define SKC_ARCH_AVX2
// #define SKC_RASTERIZE_SIMD_USES_SMEM
#define PRINTF_ENABLE 0
#define PRINTF_BLOCK_COUNT 0
//
// NOTE:
//
// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
//
// NOTE:
//
// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP.
//
//
#if 0 // SKC_ARCH_AVX2
// #define SKC_RASTERIZE_SUBGROUP_SIZE 1
// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3
// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1
// #define SKC_TTXB_WORDS 8
// #define SKC_RASTERIZE_FLOAT float8
// #define SKC_RASTERIZE_UINT uint8
// #define SKC_RASTERIZE_INT int8
// #define SKC_RASTERIZE_PREDICATE int8
// #define SKC_RASTERIZE_BIN_BLOCK uint16
// #define SKC_RASTERIZE_BIN uint8
// #define SKC_RASTERIZE_POOL uint8
// #define SKC_RASTERIZE_POOL_SCALE 6
// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1
// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2
// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8()
#endif
//
// SIMT
//
#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1)
//
//
//
#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)
//
//
//
#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 }
#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 }
//
//
//
#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)
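//
// e.g. the AVX2 configuration above uses X_BITS = 1 and Y_BITS = 2,
// giving 3 hash bits, 8 bins, and a 4-bit per-bin count field for the
// packed scans further below
//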
//
// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
//
// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
//
// Lerp in two fma/mad ops:
//
// t * b + ((-t) * a + a)
//
// Note: OpenCL documents mix() as being implemented as:
//
// a + (b - a) * t
//
// But this may be a native instruction on some devices. For example,
// on GEN9 there is an LRP "linear interpolation" opcode but it
// doesn't appear to support half floats.
//
// Feel free to toggle this option and then benchmark and inspect the
// generated code. We really want the double FMA to be generated when
// there isn't support for a LERP/MIX operation.
//
#if 1
#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a))
#else
#define SKC_LERP(a,b,t) mix(a,b,t)
#endif
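//
// Quick algebraic check (a sketch, not part of the build): both forms
// compute a + (b - a) * t, e.g. a = 2, b = 6, t = 0.25 yields 3.
//
#if 0
static void skc_lerp_example()
{
  float const a = 2.0f, b = 6.0f, t = 0.25f;
  float const fast = SKC_LERP(a,b,t); // mad(t,b,mad(-t,a,a)) = 1.5f + 1.5f = 3.0f
  float const slow = mix(a,b,t);      // a + (b - a) * t      = 2.0f + 1.0f = 3.0f
  // equal up to rounding differences in how mad() is contracted
}
#endif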
//
// There is no integer MAD in OpenCL with "don't care" overflow
// semantics.
//
// FIXME -- verify if the platform needs explicit MAD operations even
// if a "--fastmath" option is available at compile time. It might
// make sense to explicitly use MAD calls if the platform requires it.
//
#if 1
#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c))
#else
#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c)
#endif
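//
// Illustrative difference between the two forms: with a = 0xFFFFFFFF,
// b = 2 and c = 5 the wrapping form returns 3 while mad_sat() clamps
// to 0xFFFFFFFF.
//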
//
//
//
#define SKC_RASTERIZE_SEGMENT(id) ((id) * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())
//
//
//
union skc_bp_elem
{
skc_uint u32;
skc_tagged_block_id_t tag_id;
skc_float coord;
};
//
//
//
struct skc_subgroup_smem
{
//
// SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
//
#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
struct {
union {
skc_uint winner;
struct {
skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
} aN;
struct {
SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
} vN;
};
} subgroup;
#endif
//
// work-in-progress TTSB blocks and associated YX keys
//
union {
struct {
// FIXME -- some typedefs are valid here
skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];
skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
} aN;
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
struct {
SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
SKC_RASTERIZE_BIN yx;
SKC_RASTERIZE_BIN id;
SKC_RASTERIZE_BIN count;
} vN;
#endif
} bin;
};
//
//
//
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
#define skc_subgroup_lane() 0
#else
#define skc_subgroup_lane() get_sub_group_local_id()
#endif
//
//
//
#define SKC_PROJECT(tv,x,y,xp,yp) \
{ \
float const d = native_recip(fma(x,tv->w0,fma(y,tv->w1,1.0f))); \
xp *= d; \
yp *= d; \
}
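//
// i.e. the perspective divide for a 3x3 transform whose bottom row is
// ( w0, w1, 1 ): xp/yp must already hold the affine-transformed point
// and are then scaled by 1 / (x*w0 + y*w1 + 1)
//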
//
// replenish block ids
//
// note that you can't overrun the block id pool since it's a ring
//
static
void
skc_blocks_replenish(skc_uint * const blocks_next,
skc_block_id_v_t * const blocks,
__global SKC_ATOMIC_UINT volatile * const bp_atomics,
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const bp_ids)
{
//
// get a new vector of block ids -- this is kind of a narrow
// allocation but subblocks help stretch out the pool.
//
// FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
//
skc_uint bp_idx = 0;
if (skc_subgroup_lane() == 0)
{
bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
#if 0
printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
#endif
}
bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
*blocks = bp_ids[bp_idx];
*blocks_next = 0;
}
//
//
//
static
skc_block_id_t
skc_blocks_get_next(skc_uint * const blocks_next,
skc_block_id_v_t * const blocks,
__global SKC_ATOMIC_UINT volatile * const bp_atomics,
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const bp_ids)
{
// replenish?
if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
{
skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
}
#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
//
// SIMT
//
skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
#else
//
// SIMD
//
skc_block_id_t id = blocks->s0;
skc_shuffle_down_1(*blocks);
#endif
*blocks_next += 1;
return id;
}
//
// subblock allocator
//
#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
static
skc_block_id_t
skc_subblocks_get_next(skc_block_id_t * const subblocks,
skc_uint * const blocks_next,
skc_block_id_v_t * const blocks,
__global SKC_ATOMIC_UINT volatile * const bp_atomics,
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const bp_ids)
{
if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
{
*subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
}
skc_block_id_t const sb_id = *subblocks;
*subblocks += 1;
#if 0
if (get_sub_group_local_id() == 0)
printf("= %u\n",sb_id);
#endif
return sb_id;
}
#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks
#else
#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks
#endif
//
//
//
static
skc_block_id_t
skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
skc_uint * const blocks_next,
__global SKC_ATOMIC_UINT volatile * const bp_atomics,
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const bp_ids,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
skc_ttsk_v_t * const sk_v,
skc_uint * const sk_v_next,
__global skc_ttsk_s_t * const sk_extent,
skc_uint const new_yx)
{
#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
blocks_next,
blocks,
bp_atomics,
bp_mask,
bp_ids);
#else
skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
blocks,
bp_atomics,
bp_mask, // pow2 modulo mask for block pool ring
bp_ids);
#endif
if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
{
sk_v->lo = new_id;
sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
#if 0
printf("@ ( %3u, %3u ) %u\n",
(new_yx >> 12) & 0xFFF,
(new_yx ) & 0xFFF,
new_id);
#endif
}
*sk_v_next += 1;
if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
{
*sk_v_next = 0;
skc_uint sk_idx = 0;
if (skc_subgroup_lane() == 0)
{
sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
(cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
#if 0
printf("+ %u\n",sk_idx);
#endif
}
sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
#endif
{
sk_extent[sk_idx] = *sk_v;
#if 0
printf("> %u : %v2u\n",sk_idx,*sk_v);
#endif
}
}
return new_id;
}
//
//
//
static
SKC_RASTERIZE_FLOAT
skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
// Note that there isn't a built-in horizontal scan for vectors so
// we'll define some here for various widths.
//
// FIXME -- a scalar version might be faster so put in a
// compile-time switch to select between implementations
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
// 01
// 0 +
// --
// 01
SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
return w;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
// 0123
// 012 +
// ----
// 0123
// 01 +
// ----
// 0123
//
SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
return x;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
// 01234567
// 0123456 +
// --------
// 01234567
// 012345 +
// --------
// 01234567
// 0123 +
// --------
// 01234567
//
SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
return y;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
// 0123456789abcdef
// 0123456789abcde +
// ----------------
// 0123456789abcdef
// 0123456789abcd +
// ----------------
// 0123456789abcdef
// 0123456789ab +
// ----------------
// 0123456789abcdef
// 01234567 +
// ----------------
// 0123456789abcdef
//
SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
return z;
#endif
#else
//
// SIMT
//
return sub_group_scan_inclusive_add(v);
#endif
}
//
//
//
static
SKC_RASTERIZE_UINT
skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
// Note that there isn't a built-in horizontal scan for vectors so
// we'll define some here for various widths.
//
// FIXME -- a scalar version might be faster so put in a
// compile-time switch to select between implementations
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
// 01
// 0 +
// --
// 01
SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
return w;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
// 0123
// 012 +
// ----
// 0123
// 01 +
// ----
// 0123
//
SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
return x;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
// 01234567
// 0123456 +
// --------
// 01234567
// 012345 +
// --------
// 01234567
// 0123 +
// --------
// 01234567
//
SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
return y;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
// 0123456789abcdef
// 0123456789abcde +
// ----------------
// 0123456789abcdef
// 0123456789abcd +
// ----------------
// 0123456789abcdef
// 0123456789ab +
// ----------------
// 0123456789abcdef
// 01234567 +
// ----------------
// 0123456789abcdef
//
SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
return z;
#endif
#else
//
// SIMT
//
return sub_group_scan_inclusive_add(v);
#endif
}
//
//
//
static
SKC_RASTERIZE_UINT
skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
// Note that there isn't a built-in horizontal scan for vectors so
// we'll define some here for various widths.
//
// FIXME -- a scalar version might be faster so put in a
// compile-time switch to select between implementations
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
// 01
// 00 max
// --
// 01
SKC_RASTERIZE_UINT const w = max(v.s00,v);
return w;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
// 0123
// 0012 +
// ----
// 0123
// 0101 +
// ----
// 0123
//
SKC_RASTERIZE_UINT const w = max(v.s0012,v);
SKC_RASTERIZE_UINT const x = max(w.s0101,w);
return x;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
// 01234567
// 00123456 +
// --------
// 01234567
// 01012345 +
// --------
// 01234567
// 01230123 +
// --------
// 01234567
//
SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
return y;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
// 0123456789abcdef
// 00123456789abcde +
// ----------------
// 0123456789abcdef
// 010123456789abcd +
// ----------------
// 0123456789abcdef
// 01230123456789ab +
// ----------------
// 0123456789abcdef
// 0123456701234567 +
// ----------------
// 0123456789abcdef
//
SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
return z;
#endif
#else
//
// SIMT
//
return sub_group_scan_inclusive_max(v);
#endif
}
//
//
//
static
float
skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
return v.s1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
return v.s3;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
return v.s7;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
return v.sf;
#endif
#else
//
// SIMT
//
return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
#endif
}
//
//
//
static
SKC_RASTERIZE_UINT
skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
return v.s1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
return v.s3;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
return v.s7;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
return v.sf;
#endif
#else
//
// SIMT
//
return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
#endif
}
//
//
//
static
float
skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return v;
#else
return v.s0;
#endif
#else
//
// SIMT
//
return sub_group_broadcast(v,0);
#endif
}
//
//
//
static
SKC_RASTERIZE_FLOAT
skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
SKC_RASTERIZE_UINT const i)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return v;
#else
return shuffle(v,i);
#endif
#else
//
// SIMT
//
return intel_sub_group_shuffle(v,i);
#endif
}
//
//
//
static
SKC_RASTERIZE_FLOAT
skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
SKC_RASTERIZE_FLOAT const c) // current
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
// FIXME -- there are alternative formulations here:
//
// Option 1:
//
// select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
//
// Option 2:
//
// p is a scalar
// t = c.rotate(+1)
// t.s0 = p;
//
// Option 3: ...
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return p;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
return shuffle2(p,c,(uint2)(1,2));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
return shuffle2(p,c,(uint4)(3,4,5,6));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
#endif
#else
//
// SIMT
//
return intel_sub_group_shuffle_up(p,c,1);
#endif
}
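//
// e.g. for the 4-wide SIMD case above, shuffle2(p,c,(uint4)(3,4,5,6))
// selects ( p.s3, c.s0, c.s1, c.s2 ) -- the current vector shifted up
// by one with the previous vector's last element shifted in
//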
//
//
//
static
bool
skc_is_lane_first()
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
//
// SIMD
//
return true;
#else
//
// SIMT
//
return get_sub_group_local_id() == 0;
#endif
}
//
//
//
static
SKC_RASTERIZE_FLOAT
skc_delta_offset()
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
return 1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
return (SKC_RASTERIZE_FLOAT)( 1, 2 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
#endif
#else
//
// SIMT
//
return 1.0f + get_sub_group_local_id();
#endif
}
//
//
//
static
int
skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
return any(p);
#else
//
// SIMT
//
return sub_group_any(p);
#endif
}
//
//
//
#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)
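//
// Advance to the next subblock of segment coordinates. When the
// incremented block id crosses a block boundary, the next tagged
// block id is loaded from the path node -- and when the incremented
// node word is the last word of its block, that word links to the
// next node block.
//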
static
void
skc_segment_next(__global union skc_bp_elem * const bp_elems,
skc_uint * const nodeword,
skc_block_id_t * const id)
{
if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
{
if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
{
*nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
}
skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;
*id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
}
}
//
//
//
static
SKC_RASTERIZE_FLOAT
skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
{
return native_sqrt(x * x + y * y);
}
//
// Wang's Formula (1985)
//
#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned
#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)
#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON))
#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y)
#define SKC_WANG_SQRT(x) native_sqrt(x)
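//
// For reference (standard formulation): a degree n Bezier is within
// epsilon of the polyline obtained by L uniform parametric splits when
//
//   L >= sqrt( ( n * (n-1) / (8 * epsilon) ) * max | P[i+2] - 2 P[i+1] + P[i] | )
//
// which gives the (3*2)/(8*eps) and 2/(8*eps) constants above for
// cubics (n=3) and quadratics (n=2).
//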
//
//
//
static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
{
//
// Return the number of evenly spaced (in the parametric sense) line
// segments that are guaranteed to be within "epsilon" error of the
// curve.
//
// We're then going to take multiples of the reciprocal of this
// number so that the segmentation can be distributed across the
// subgroup.
//
// Note, this can probably be slightly optimized per architecture
// but it's probably far from being a hotspot since it's all
// straight-line unpredicated code.
//
// The result is an integer ranging from [1.0,#segments]
//
// Note that even if all of the control points are coincident, the
// max(1.0f) will categorize this as a line of 1 segment.
//
// This is what we want! We want to convert cubics to lines as
// easily as possible and *then* cull lines that are either
// horizontal or zero length.
//
return max(1.0f,
ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
fabs(t3x - 2.0f * t2x + t1x)),
max(fabs(t2y - 2.0f * t1y + t0y),
fabs(t3y - 2.0f * t2y + t1y))))));
}
static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
{
return max(1.0f,
ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
SKC_WANG_LENGTH(t2x - 2.0f * t1x + t0x,
t2y - 2.0f * t1y + t0y))));
}
//
// rational curves -- unimplemented placeholders for now
//
static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_cubic_rat()
{
return 0.0f;
}
static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_quad_rat()
{
return 0.0f;
}
//
// flush any work-in-progress blocks and return unused block ids
//
static
void
skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
skc_block_id_v_t * const blocks,
skc_uint const blocks_next,
skc_ttsk_v_t * const sk_v,
skc_uint const sk_v_next,
__global skc_ttsk_s_t * const sk_extent,
__local struct skc_subgroup_smem volatile * const smem)
{
//
// flush non-empty bins
//
// FIXME -- accelerate this iteration/search with a subgroup operation
//
for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
{
if (smem->bin.aN.count[ii] > 0)
{
skc_block_id_v_t const id = smem->bin.aN.id[ii];
skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
skc_uint const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
#if 0
printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
#endif
bp_elems[idx].u32 = tts;
}
//
// FIXME -- vectorize with vstoreN()
//
}
//
// return remaining block ids back to the pool
//
skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;
if (blocks_rem > 0)
{
skc_uint bp_idx = 0;
if (skc_subgroup_lane() == 0)
{
bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);
#if 0
printf("r-: %8u + %u\n",bp_idx,blocks_rem);
#endif
}
bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;
if (skc_subgroup_lane() >= blocks_next)
{
bp_ids[bp_idx] = *blocks;
}
}
//
// flush work-in-progress ryx keys
//
if (sk_v_next > 0)
{
skc_uint sk_idx = 0;
if (skc_subgroup_lane() == 0)
{
sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
(cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
#if 0
printf("* %u\n",sk_idx);
#endif
}
sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
if (skc_subgroup_lane() < sk_v_next)
{
sk_extent[sk_idx] = *sk_v;
}
}
}
//
// If there are lanes that were unable to append to a bin because
// their hashes collided with the bin's current ryx key, then those
// bins must be ejected.
//
// Note that we do not eject "full" bins because lazily waiting for a
// collision results in simpler code.
//
static
void
skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
skc_block_id_t * const subblocks,
skc_block_id_v_t * const blocks,
skc_uint * const blocks_next,
skc_ttsk_v_t * const sk_v,
skc_uint * const sk_v_next,
__global skc_ttsk_s_t * const sk_extent,
__local struct skc_subgroup_smem volatile * const smem,
SKC_RASTERIZE_UINT const hash,
SKC_RASTERIZE_UINT const yx,
SKC_RASTERIZE_PREDICATE is_collision) // pass by value
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
//
// FIXME -- this code is now stale with the changes to the
// subblock/block allocation strategy
//
//
// get local TTSB ID queue count
//
skc_uint ttsb_id_count = smem->pool.count; // scalar
// init hash bit mask
skc_uint component_mask = 0;
for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
{
// if no collision continue
if (((int*)&is_collision)[cc] == 0)
continue;
uint const winner = ((uint*)&hash)[cc];
uint const component_bit = 1u << winner;
// if already processed this hash then continue
if (component_mask & component_bit)
continue;
// update component mask
component_mask |= component_bit;
//
// new winner requires ejecting the old TTSB
//
if (smem->bin.aN.count[winner] > 0)
{
skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
}
//
// ensure there is at least one TTSK and TTSB ID
//
if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
{
//
// update remaining count
//
ttsb_id_count = 0;
//
// flush accumulated ttsk_ryx keys
//
uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
(cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count
#if 0
printf("# %u\n",idx);
#endif
for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
{
ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);
}
//
// allocate more ttsb ids from pool
//
uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads
for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
smem->pool.aN.id[ii] = bp_ids[id + ii];
}
//
// invalidate the winning block
//
//
// update bin with winning yx, new ttsb id and zero count
//
// all lanes are loading/storing from/to the same index
//
smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count];
smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
smem->bin.aN.count[winner] = 0;
//
// update count
//
ttsb_id_count += 1;
}
//
// save count
//
smem->pool.count = ttsb_id_count;
#else
//
// SIMT
//
do {
//
// only one lane will win!
//
if (is_collision)
smem->subgroup.winner = hash;
barrier(CLK_LOCAL_MEM_FENCE);
//
// which bin is being ejected?
//
skc_uint const winner = smem->subgroup.winner;
//
// which colliding hash is taking over the bin?
//
SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);
//
// all lanes with the same hash will try to store but only one
// lane will win
//
if (is_winner)
smem->subgroup.winner = yx;
barrier(CLK_LOCAL_MEM_FENCE);
//
// flush this block to the pool
//
if (smem->bin.aN.count[winner] > 0)
{
skc_block_id_v_t const id = smem->bin.aN.id[winner];
skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
#if 0
printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
#endif
bp_elems[idx].u32 = tts;
}
//
// append new ttsk
//
skc_uint const new_yx = smem->subgroup.winner;
skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
blocks_next,
bp_atomics,
bp_mask, // pow2 modulo mask for block pool ring
bp_ids,
cohort_atomics,
sk_v,
sk_v_next,
sk_extent,
new_yx);
#if 0
if (get_sub_group_local_id() == 0) {
printf(">>> %9u\n",new_id);
}
#endif
//
// update bin with winning yx, new ttsb id and zero count
//
smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
smem->bin.aN.yx [winner] = new_yx;
smem->bin.aN.id [winner] = new_id;
smem->bin.aN.count[winner] = 0;
//
// remove all lanes matching this hash
//
is_collision = is_collision && !is_winner;
//
// exit if nothing left to do
//
} while (sub_group_any(is_collision));
#endif
}
//
// scatter scan max
//
static
SKC_RASTERIZE_UINT
skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
SKC_RASTERIZE_FLOAT const iss,
SKC_RASTERIZE_FLOAT const ess)
{
//
// prefix sums determine which lanes we're going to work on next
//
SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
//
// SIMD APPROACH 1: SIMT'ISH
//
// zero the volatile smem scratchpad using vector syntax
smem->subgroup.vN.scratch[0] = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) \
if (is_scratch_store C) \
smem->subgroup.aN.scratch[scratch_idx C] = I;
SKC_RASTERIZE_VECTOR_EXPAND();
// propagate lanes to right using max scan
SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch);
#else
//
// SIMD APPROACH 2: SCALAR'ISH
//
SKC_RASTERIZE_UINT source = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) \
if (is_scratch_store C) \
((uint *)&source)[scratch_idx C] = I;
SKC_RASTERIZE_VECTOR_EXPAND();
for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
#endif
#else
//
// SIMT
//
//
// zero the volatile smem scratchpad using vector syntax
//
smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );
//
// store source lane at starting lane
//
if (is_scratch_store)
smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();
//
// propagate lanes to right using max scan
//
SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch);
#endif
return source;
}
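//
// Worked example (a sketch) with a 4-lane subgroup and per-lane
// segment counts ( 2, 1, 3, 1 ):
//
//   iss = ( 2, 3, 6, 7 )  ess = ( 0, 2, 3, 6 )
//
// lanes 0..2 store their indices at scratch[0], scratch[2] and
// scratch[3] while lane 3's start falls beyond this pass, so after
// the max scan source = ( 0, 0, 1, 2 ): lanes 0-1 work off lane 0's
// segments, lane 2 works off lane 1's and lane 3 begins lane 2's.
//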
//
// sliver lines into subpixels
//
static
void
skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
skc_block_id_t * const subblocks,
skc_block_id_v_t * const blocks,
skc_uint * const blocks_next,
skc_ttsk_v_t * const sk_v,
skc_uint * const sk_v_next,
__global skc_ttsk_s_t * const sk_extent,
__local struct skc_subgroup_smem volatile * const smem,
SKC_RASTERIZE_FLOAT const l0x,
SKC_RASTERIZE_FLOAT const l0y,
SKC_RASTERIZE_FLOAT const l1x,
SKC_RASTERIZE_FLOAT const l1y)
{
//
// Y-SLIVERING
// -----------
//
// immediately sliver all multi-pixel lines into 1-pixel high
// lines
//
// note this implicitly squelches horizontal lines
//
// there is another test for horizontal lines after x-slivering
// is complete
//
//
// will we need to flip the sign of y_delta ?
//
SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y);
SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000;
//
// save 1/dy
//
SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);
//
// how many non-horizontal subpixel y-axis slivers are there?
//
SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max;
SKC_RASTERIZE_FLOAT y_segs = y_max - y_min;
//
// inclusive subgroup scan of y_segs
//
SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs);
SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs;
float y_rem = skc_subgroup_last_float(y_iss);
//
// if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
//
if (y_segs == 0.0f)
y_iss = 0.0f;
#if 0
printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
#endif
//
// these values don't matter on first iteration
//
SKC_RASTERIZE_FLOAT n1x_prev = 0;
SKC_RASTERIZE_FLOAT n1y_prev = 0;
//
// loop until done
//
while (y_rem > 0.0f)
{
//
// distribute work across lanes
//
SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);
//
// get line at lane y_source
//
SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);
//
// every lane will create a 1 pixel tall line "sliver"
//
// FIXME -- this gets expanded on SIMD
//
// if numerator == 1 then this is the first lane
// if numerator == s then this is the last lane
//
SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source);
SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count);
// toggle y_delta sign
SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));
//
// calculate "right" line segment endpoint
//
SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t));
//
// override c1 if this is last point
//
n1y = select(n1y,m1y,is_y_last);
n1x = select(n1x,m1x,is_y_last);
//
// shuffle up "left" line segment endpoint
//
// NOTE: Intel's shuffle_up is unique with its elegant
// "previous" argument so don't get used to it
//
SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);
//
// override shuffle up if this is the first line segment
//
n0y = select(n0y,m0y,is_y_first);
n0x = select(n0x,m0x,is_y_first);
//
// save previous right endpoint
//
n1x_prev = n1x;
n1y_prev = n1y;
//
// decrement by subgroup size
//
y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
#if 0
//
// debug
//
if (n0y != n1y) {
printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
}
#endif
//
// X-SLIVERING
// -----------
//
// now sliver 1-pixel high lines into either vertical or
// 1-pixel wide lines
//
// save original direction and work with increasing x
//
SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x);
SKC_RASTERIZE_UINT const dx_xor = x_lt ? 0 : 0x80000000;
//
// save 1/dx
//
SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x);
//
// how many subpixel x-axis slivers are there?
//
SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
SKC_RASTERIZE_FLOAT const x_base = x_lt ? x_min : x_max;
SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f);
//
// inclusive subgroup scan of x_segs
//
SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs);
SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs;
float x_rem = skc_subgroup_last_float(x_iss);
//
// if this is a vertical line then tweak x_iss so "is_scratch_store" always fails
//
//if (x_segs == 0.0f)
// x_iss = 0.0f;
//
// these values don't matter on first iteration
//
SKC_RASTERIZE_FLOAT p1x_prev = 0;
SKC_RASTERIZE_FLOAT p1y_prev = 0;
//
// loop until done
//
while (x_rem > 0)
{
//
// distribute work across lanes
//
SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);
//
// get line at lane x_source
//
SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);
//
// every lane will create a 1 pixel wide line "sliver"
//
// FIXME -- this gets expanded on SIMD
//
// if numerator == 1 then this is the first lane
// if numerator == s then this is the last lane
//
SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source);
SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count);
// toggle x_delta sign
SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));
//
// calculate "right" line segment endpoint
//
SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t));
//
// override c1 if this is last point
//
p1x = select(p1x,o1x,is_x_last);
p1y = select(p1y,o1y,is_x_last);
//
// shuffle up "left" line segment endpoint
//
// NOTE: Intel's shuffle_up is unique with its elegant
// "previous" argument so don't get used to it
//
SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);
//
// override shuffle up if this is the first line segment
//
p0x = select(p0x,o0x,is_x_first);
p0y = select(p0y,o0y,is_x_first);
//
// save previous right endpoint
//
p1x_prev = p1x;
p1y_prev = p1y;
//
// decrement by subgroup size
//
x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
//
// only non-horizontal subpixel lines are valid
//
SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);
//
// if no lanes are active then continue
//
// FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
// IMPACTS PERFORMANCE (+12% ?)
//
// IT SHOULDN'T !!!
//
#if 0
if (!skc_subgroup_any(is_active))
continue;
#endif
//
// Option 1: use SLM for explicitly managed coalesced stores
//
// 1. which tile does this line belong?
// 2. hash tile coordinates
// 3. lookup hash
// 4. if tile matches then SLM append keys
// 5. if tile doesn't match
// a. flush
// b. create new TTSK_RYX
// c. obtain TTSB block from pool
// d. goto 3.
//
//
// Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
//
// 1. which tile does this line belong?
// 2. hash tile coordinates
// 3. lookup hash
// 4. if tile matches then GMEM append keys
// 5. if tile doesn't match
// a. flush (and invalidate empty elems)
// b. create new TTSK_RYX
// c. obtain TTSB block from pool
// d. goto 3.
//
//
// The virtual rasterization surface is very large and
// signed: +/- ~64K-256K, depending on the architecture.
//
// Rasters must be clipped to the virtual surface and,
// optionally, clipped even further on a per raster
// basis.
//
//
// Clip to the per-raster clip
//
/*
CLIP HERE
*/
//
// Hash the tile coordinates
//
// This table lists nominal values for each architecture.
// We want to choose values that naturally fit the
// "width" of the architecture.
//
// SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS
// ---- ------- ---- --------- -------- ---------
// 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ?
// 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX*
// 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,?
// 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN*
// 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon
//
// NOTE: When possible, bias the hash toward using more y
// bits because of:
//
// 1. the 90 degree counter-clockwise rotation that we put
// in place to offset the render-time clockwise
// rotation
//
// 2. the likely presence of left-to-right or
// right-to-left glyphs.
//
// For power-of-two bins, the hash is easy.
//
// For non-power-of-two, we may want to either implement a
// fast mod (compiler should do this for us... hahahaha) or
// drop down to the next power-of-two.
//
//
// FIXME -- this snarl is not good -- can probably reduce
// some of the sign casting but some is there to vectorize a
// scalar
//
SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);
SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);
SKC_RASTERIZE_INT const min_y = min(z0y,z1y);
SKC_RASTERIZE_INT const max_y = max(z0y,z1y);
SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;
SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);
//
// map [+1,+32] to [ 0,+31]
// map [-1,-32] to [-1,-32]
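//
// e.g. dy = +1 -> 0 and dy = -1 -> -1, later shifted into the top
// 6 bits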
//
SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26;
SKC_RASTERIZE_INT const min_x = min(z0x,z1x);
SKC_RASTERIZE_INT const max_x = max(z0x,z1x);
SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;
SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);
SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx;
SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
(SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));
SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));
#if 0
printf("(%3u, %3u)\n",tile_y,tile_x);
#endif
#if 0
if (is_active)
printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
#endif
//
// debug
//
#if 0 // PRINTF_ENABLE
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) \
if (is_active C) \
printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);
SKC_RASTERIZE_VECTOR_EXPAND();
#else
if (is_active)
printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
#endif
#endif
//
// flush all active lanes
//
while (true)
{
//
// either gather load or vector load+shuffle the yx keys
//
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx;
SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash);
#else
SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash];
#endif
//
// does yx for lane match yx for hash?
//
SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx);
//
// OpenCL spec: "When casting a bool to a vector integer
// data type, the vector components will be set to -1
// (i.e. all bits set) if the vector bool value is true
// and 0 otherwise."
//
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match;
#else
SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0}
#endif
//
// how many new elements for each matching hash bin?
//
SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
SKC_RASTERIZE_UINT const h = h_match << h_shl;
//
// prefix sum all of the bins in parallel
//
SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h);
SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss);
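//
// each bin's running count occupies its own
// SKC_RASTERIZE_TILE_HASH_BIN_BITS-wide field of the packed word,
// so this single scan yields prefix counts for every bin at once
//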
//
// current bin counts
//
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count;
SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash);
#else
SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash];
#endif
//
// calculate where each cache-hit and in-bounds tts should be stored
//
SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;
//
// which lanes can append to a matching bin?
//
SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);
//
// scatter append tts elements to bin blocks
//
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
//
// SIMD
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) \
if (is_append C) \
{ \
smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \
smem->bin.aN.count[hash C] = count_new C; \
}
SKC_RASTERIZE_VECTOR_EXPAND();
#else
//
// SIMT
//
if (is_append)
{
smem->bin.aN.ttsb [hash][ttsb_index] = tts;
smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
}
#endif
//
// try to keep predicate updates SIMD-friendly and
// outside of predicated code paths -- this is not
// always how we would normally do things on SIMT but
// either approach is acceptable
//
//
// mask off lanes/components that successfully appended
//
is_active = is_active && !is_append;
//
// are there any active lanes left?
//
if (!skc_subgroup_any(is_active))
break;
//
// There are active lanes that couldn't append to a bin
// because their hashes collided with the bin's current
// ryx key, so those bins must be ejected.
//
// Note that we do not eject "full" bins because lazily
// waiting for a collision results in simpler code.
//
skc_flush(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
subblocks,
blocks,
blocks_next,
sk_v,
sk_v_next,
sk_extent,
smem,
hash,
yx,
is_active);
}
}
}
}
//
// INITIALIZE SMEM
//
// Note that SIMD/SIMT have nearly the same syntax.
//
static
void
skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
{
//
// initialize smem bins
//
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
//
// SIMD
//
smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT );
smem->bin.vN.count = ( 0 );
#else
//
// SIMT
//
int idx = skc_subgroup_lane();
#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
#endif
{
smem->bin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT );
smem->bin.aN.count[idx] = ( 0 );
}
#endif
}
//
// RASTERIZE CUBIC KERNEL
//
static
void
skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__local struct skc_subgroup_smem volatile * const smem,
skc_uint * const nodeword,
skc_block_id_t * const id,
union skc_transform const * const tv,
union skc_path_clip const * const cv,
skc_uint const cohort)
{
//
// the initial segment idx and segments-per-block constant determine
// how many block ids will need to be loaded
//
SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
//
// apply transform
//
// note that we only care that the end points are rounded to subpixel precision
//
// FIXME -- transformation is currently affine-only; support perspective later
//
// the affine transformation requires 8 FMA + 2 ROUND operations
//
SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx + c0y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy + tv->ty;
SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx + c1y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy + tv->ty;
SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx + c2y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy + tv->ty;
SKC_RASTERIZE_FLOAT t3x = c3x * tv->sx + c3y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT t3y = c3x * tv->shy + c3y * tv->sy + tv->ty;
//
// FIXME -- this is temporary support for projection
//
bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);
if (!is_affine)
{
SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
SKC_PROJECT(tv,c3x,c3y,t3x,t3y);
}
b0x = round(b0x);
b0y = round(b0y);
t3x = round(t3x);
t3y = round(t3y);
//
//
//
#if PRINTF_ENABLE
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) \
printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \
" { %.02f, %.02f }, { %.02f, %.02f } },\n", \
b0x C,b0y C,t1x C,t1y C, \
t2x C,t2y C,t3x C,t3y C);
SKC_RASTERIZE_VECTOR_EXPAND();
#else
printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
#endif
#endif
//
// OLD APPROACH
// ------------
//
// The Spinel CUDA rasterizer was significantly more complex and
// performed a few different tasks that are probably best kept
// separate.
//
// The Spinel rasterizer held a Bezier's 4-element x and y coordinates
// in adjacent lanes. This simplified intermingling single-lane
// 4-coordinate line segments with two-lane cubic Beziers.
//
// After transformation of the input segments, the Spinel rasterizer
// would test cubics for flatness and, if flat, collapse the
// adjacent lanes into a single line lane and an empty lane.
//
// Any lines would then be appended to a line queue.
//
// Any cubics would then be subdivided.
//
// The reclassification process would be repeated.
//
// NEW APPROACH
// ------------
//
// Assume we're only working with cubics in this kernel.
//
// Optimization: if the cubic is a special case -- a cusp, has 1+
// inflections, or a loop -- it might be beneficial to subdivide the
// control cage 1+ times in order to separate the flatter segments
// from the high-velocity region(s).
//
// This means we want to split using [a,b] formulation to _directly_
// subdivide producing a new control cage.
//
// Wang's Formula is still useful even if we subdivide once or twice
// as it's so cheap that it might give some useful hints about where
// the high-velocity sections of curve reside.
//
// But it seems like using Wang's and directly flattening to line
// segments without any subdivision is good enough for the limited
// set of test cases that I've tried.
//
// So... use Wang's Formula to estimate how many line segments are
// required to properly flatten the cubics.
//
// Then use inclusive/exclusive scans to put all the lanes to work:
//
// 1. segmenting cubics to line segments
//
// 2. slivering line segments into 1-pixel high line segments
//
// 3. slivering 1-pixel high line segments into 1-pixel wide line
// segments
//
// MORE BACKGROUND ON NEW APPROACH
// -------------------------------
//
// Two options for handling line segments:
//
// 1. append the line segments onto an SLM array until enough
// work has been accrued (Spinel does this)
//
// 2. immediately sliver the potentially multi-pixel line
// segments into subpixel lines
//
// The advantage of (1) is that it guarantees the slivering
// process will almost always be emitting a full subgroup
// of subpixel lines.
//
// The advantage of (2) is that it reduces code complexity and
// leaves more room for SLM tile bins. The difference between Spinel
// and Skia Compute is that Wang's Formula guarantees there will be
// a full subgroup of multi-pixel lines unless this is the warp's
// final iteration over its multi-pixel lines.
//
// Note that wider GPU architectures might benefit from (1) and
// other work accumulation strategies because it will minimize
// partial warp workloads in the final iteration of each stage. It
// also minimizes the sunk cost of the uniform control logic steps.
//
// So let's implement (2) for now...
//
//
// And... begin!
//
// Estimate how many line segments are in the quad/cubic curve.
//
// Wang's Formula will return zero if the control points are
// collinear but we bump it up to 1.0f.
//
SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
//
// if there are free registers then precalculate the reciprocal for
// each lane's estimated segment count since it will never change
//
SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
//
// inclusive add scan of estimated line segments
// exclusive add scan of estimated line segments
// total number of estimated line segments
//
SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
float s_rem = skc_subgroup_last_float(s_iss); // scalar
//
// Precompute cubic polynomial coefficients from transformed control
// cage so we can shuffle them in on each iteration of the outer
// loop and then evaluate the polynomial in Horner form.
//
// | 1 0 0 0 | | c0 |
// | | | |
// | -3 3 0 0 | | c1 |
// B(t) = [ 1 t^1 t^2 t^3 ] | | | |
// | 3 -6 3 0 | | c2 |
// | | | |
// | -1 3 -3 1 | | c3 |
//
//
SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL
SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL
SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL
SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL
SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
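//
// these coefficients are consumed below in Horner form:
//
//   B(t) = ((b3 * t + b2) * t + b1) * t + b0
//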
//
// these values don't matter on the first iteration
//
SKC_RASTERIZE_FLOAT l1x_prev = 0;
SKC_RASTERIZE_FLOAT l1y_prev = 0;
//
// allocate and init in-register TTSK keys
//
skc_uint sk_v_next = 0;
skc_ttsk_v_t sk_v;
sk_v.hi = cohort;
//
// initialize smem
//
skc_smem_init(smem);
//
// initialize blocks / subblocks
//
skc_block_id_v_t blocks;
skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
skc_block_id_t subblocks = 0;
#endif
//
// loop until done
//
while (s_rem > 0)
{
//
// distribute work across lanes
//
SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
//
// every lane has a fraction to work off of
//
// FIXME -- this gets expanded on SIMD
//
// if delta == 1 then this is the first lane
// if count == s_segs then this is the last lane
//
SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source);
SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count);
//
// init parametric t
//
SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
//
// if last then override to a hard 1.0f
//
s_t = is_s_last ? 1.0f : s_t;
//
// decrement by subgroup size
//
s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
//
// now every lane knows what to do and the following lines will
// pump out up to SUBGROUP_SIZE line segments
//
// obtain the src vertices through shared or via a shuffle
//
//
// shuffle in the polynomial coefficients from their source lane
//
SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);
//
// calculate "right" line segment endpoint using Horner form
//
SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND
//
// shuffle up "left" line segment endpoint
//
// NOTE: Intel's shuffle_up is unique with its elegant
// "previous" argument so don't get used to it
//
SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
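//
// i.e. lane k takes lane k-1's right endpoint as its left endpoint
// while lane 0 takes the last lane of the previous iteration's
// endpoints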
//
// save previous right endpoint
//
l1x_prev = l1x;
l1y_prev = l1y;
//
// override shuffle up if this is the first line segment
//
l0x = select(l0x,s0x,is_s_first);
l0y = select(l0y,s0y,is_s_first);
//
// sliver lines
//
skc_sliver(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
&subblocks,
&blocks,
&blocks_next,
&sk_v,
&sk_v_next,
sk_extent,
smem,
l0x,l0y,l1x,l1y);
}
//
// - flush work-in-progress blocks
// - return unused block ids
//
skc_finalize(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
&blocks,
blocks_next,
&sk_v,
sk_v_next,
sk_extent,
smem);
}
//
// RASTERIZE QUAD KERNEL
//
static
void
skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__local struct skc_subgroup_smem volatile * const smem,
skc_uint * const nodeword,
skc_block_id_t * const id,
union skc_transform const * const tv,
union skc_path_clip const * const cv,
skc_uint const cohort)
{
//
// the initial segment idx and segments-per-block constant determine
// how many block ids will need to be loaded
//
SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
//
// apply transform
//
// note that we only care that the endpoints are rounded to subpixel precision
//
// FIXME -- transformation is currently affine-only
// FIXME -- support perspective later
//
// the affine transformation requires 12 FMA + 4 ROUND operations
//
SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx + c0y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy + tv->ty;
SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx + c1y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy + tv->ty;
SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx + c2y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy + tv->ty;
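//
// i.e. the affine 2x3 matrix applied above:
//
//   [ x' ]   [ sx  shx  tx ]   [ x ]
//   [    ] = [             ] * [ y ]
//   [ y' ]   [ shy  sy  ty ]   [ 1 ]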
//
// FIXME -- this is temporary support for projection
//
bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);
if (!is_affine)
{
SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
}
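//
// where SKC_PROJECT is presumed to apply the perspective divide --
// consistent with the is_affine test on w0/w1 above:
//
//   x' = (x * sx  + y * shx + tx) / (x * w0 + y * w1 + 1)
//   y' = (x * shy + y * sy  + ty) / (x * w0 + y * w1 + 1)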
b0x = round(b0x);
b0y = round(b0y);
t2x = round(t2x);
t2y = round(t2y);
//
// Estimate how many line segments are in quad/cubic curve.
//
// Wang's Formula will return zero if the control points are
// collinear but we bump it up to 1.0f.
//
SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);
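//
// For reference -- a scalar sketch of the classic Wang bound for a
// quadratic, assuming an explicit tolerance parameter. Illustrative
// only and not necessarily identical to the implementation of
// skc_wangs_formula_quadratic():
//
#if 0
static float
skc_wang_quad_segments_sketch(float const x0, float const y0,  // p0
                              float const x1, float const y1,  // p1
                              float const x2, float const y2,  // p2
                              float const tol)                 // max deviation
{
  // second difference of the control cage: p0 - 2*p1 + p2
  float const ddx = x0 - 2.0f * x1 + x2;
  float const ddy = y0 - 2.0f * y1 + y2;

  // segments >= sqrt( n*(n-1)/8 * ||dd|| / tol ) with degree n = 2
  float const n = sqrt(0.25f * sqrt(ddx * ddx + ddy * ddy) / tol);

  // collinear control points yield zero so bump up to 1.0f
  return fmax(1.0f, ceil(n));
}
#endif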
//
// if there are free registers then precalculate the reciprocal of
// each lane's segment estimate since it will never change
//
SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
//
// inclusive add scan of estimated line segments
// exclusive add scan of estimated line segments
// total number of estimated line segments
//
SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
float s_rem = skc_subgroup_last_float(s_iss); // scalar
//
// Precompute quadratic polynomial coefficients from control cage so
// we can shuffle them in on each iteration of the outer loop and
// then evaluate the polynomial in Horner form.
//
// | 1 0 0 | | c0 |
// | | | |
// B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 |
// | | | |
// | 1 -2 1 | | c2 |
//
//
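// Multiplying out the basis matrix gives the monomial coefficients
// computed below from the transformed control points:
//
//   b1 = 2*(t1 - b0)
//   b2 = b0 - 2*t1 + t2
//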
SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL
SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD
SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD
//
// these values don't matter on the first iteration
//
SKC_RASTERIZE_FLOAT l1x_prev = 0;
SKC_RASTERIZE_FLOAT l1y_prev = 0;
//
// allocate and init in-register TTSK keys
//
skc_uint sk_v_next = 0;
skc_ttsk_v_t sk_v;
sk_v.hi = cohort;
//
// initialize smem
//
skc_smem_init(smem);
//
// initialize blocks / subblocks
//
skc_block_id_v_t blocks;
skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
skc_block_id_t subblocks = 0;
#endif
//
// loop until done
//
while (s_rem > 0)
{
//
// distribute work across lanes
//
SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
//
// every lane has a fraction to work off of
//
// FIXME -- this gets expanded on SIMD
//
// if delta == 1 then this is the first lane
// if delta >= count then this is the last lane
//
SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source);
SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count);
//
// init parametric t
//
SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
//
// if last then override to a hard 1.0f
//
s_t = is_s_last ? 1.0f : s_t;
//
// decrement by subgroup size
//
s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
//
// now every lane knows what to do and the following lines will
// pump out up to SUBGROUP_SIZE line segments
//
// obtain the src vertices through shared memory or via a shuffle
//
//
// shuffle in the polynomial coefficients from their source lane
//
SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
//
// calculate "right" line segment endpoint using Horner form
//
SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND
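//
// i.e. x(t) = round((s2x * t + s1x) * t + s0x) and similarly for y(t)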
//
// shuffle up "left" line segment endpoint
//
// NOTE: Intel's shuffle_up is unique with its elegant
// "previous" argument so don't get used to it
//
SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
//
// save previous right endpoint
//
l1x_prev = l1x;
l1y_prev = l1y;
//
// override shuffle up if this is the first line segment
//
l0x = select(l0x,s0x,is_s_first);
l0y = select(l0y,s0y,is_s_first);
//
// sliver lines
//
skc_sliver(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
&subblocks,
&blocks,
&blocks_next,
&sk_v,
&sk_v_next,
sk_extent,
smem,
l0x,l0y,l1x,l1y);
}
//
// - flush work-in-progress blocks
// - return unused block ids
//
skc_finalize(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
&blocks,
blocks_next,
&sk_v,
sk_v_next,
sk_extent,
smem);
}
//
// RASTERIZE LINE KERNEL
//
static
void
skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__local struct skc_subgroup_smem volatile * const smem,
skc_uint * const nodeword,
skc_block_id_t * const id,
union skc_transform const * const tv,
union skc_path_clip const * const cv,
skc_uint const cohort)
{
//
// the initial segment idx and segments-per-block constant determine
// how many block ids will need to be loaded
//
SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
skc_segment_next(bp_elems,nodeword,id);
SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
#if 0
printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
#endif
//
// apply transform
//
// note that we only care that the endpoints are rounded to subpixel precision
//
// FIXME -- transformation is currently affine-only
// FIXME -- support perspective later
//
// the affine transformation requires 8 FMA + 4 ROUND operations
//
SKC_RASTERIZE_FLOAT l0x = c0x * tv->sx + c0y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT l0y = c0x * tv->shy + c0y * tv->sy + tv->ty;
SKC_RASTERIZE_FLOAT l1x = c1x * tv->sx + c1y * tv->shx + tv->tx;
SKC_RASTERIZE_FLOAT l1y = c1x * tv->shy + c1y * tv->sy + tv->ty;
//
// FIXME -- this is temporary support for projection
//
bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);
if (!is_affine)
{
SKC_PROJECT(tv,c0x,c0y,l0x,l0y);
SKC_PROJECT(tv,c1x,c1y,l1x,l1y);
}
l0x = round(l0x);
l0y = round(l0y);
l1x = round(l1x);
l1y = round(l1y);
#if 0
printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
#endif
//
// allocate and init in-register TTSK keys
//
skc_uint sk_v_next = 0;
skc_ttsk_v_t sk_v;
sk_v.hi = cohort;
//
// initialize smem
//
skc_smem_init(smem);
//
// initialize blocks / subblocks
//
skc_block_id_v_t blocks;
skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
skc_block_id_t subblocks = 0;
#endif
//
// sliver lines
//
skc_sliver(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
&subblocks,
&blocks,
&blocks_next,
&sk_v,
&sk_v_next,
sk_extent,
smem,
l0x,l0y,l1x,l1y);
//
// - flush work-in-progress blocks
// - return unused block ids
//
skc_finalize(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
&blocks,
blocks_next,
&sk_v,
sk_v_next,
sk_extent,
smem);
}
//
//
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__global float8 const * const transforms, // FIXME -- __constant
__global float4 const * const clips, // FIXME -- __constant
__global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
skc_uint const count)
{
//
// declare shared memory block
//
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
__local struct skc_subgroup_smem volatile smem[1];
#else
__local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
__local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif
//
// this is a subgroup/warp-centric kernel
//
// which subgroup in the grid is this?
//
// TAKE NOTE: the Intel GEN compiler appears to recognize
// get_group_id(0) as a uniform but the alternative calculation used
// when there are multiple subgroups per workgroup is not being
// recognized as uniform and is driving register spillage elsewhere.
//
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
uint const cmd_idx = get_group_id(0);
#else
uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif
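//
// e.g. with SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 4 the subgroups of
// workgroup 3 process cmds { 12, 13, 14, 15 }
//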
#if 0
if (get_sub_group_local_id() == 0)
printf("+cmd_idx = %u\n",cmd_idx);
#endif
//
// if workgroups are multi-subgroup then there may be excess
// subgroups in the final workgroup
//
if (cmd_idx >= count)
return;
#if 0
if (get_sub_group_local_id() == 0)
printf("-cmd_idx = %u\n",cmd_idx);
#endif
//
// load a single command for this subgroup
//
union skc_cmd_rasterize const cmd = cmds[cmd_idx];
#if 0
if (get_sub_group_local_id() == 0)
printf("[ %u ]< %u, %u, %u, %u >\n",
cmd_idx,
cmd.nodeword,
SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
SKC_CMD_RASTERIZE_GET_CLIP(cmd),
SKC_CMD_RASTERIZE_GET_COHORT(cmd));
#endif
//
// get first block node command word and its subblock
//
skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
//
// load transform -- uniform across subgroup
//
// v8: { sx shx tx shy sy ty w0 w1 }
//
// NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
//
// [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
//
// Coordinates are scaled to subpixel resolution. All that matters
// is that continuity is maintained between adjacent path elements'
// endpoints.
//
// It's the responsibility of the host to ensure that the transforms
// are properly scaled either by initializing a transform stack
// with the subpixel-resolution-scaled identity or by scaling the
// transform before it's loaded by a rasterization grid.
//
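// For example -- a host-side sketch of pre-concatenating the
// subpixel scale onto an affine user transform. This is a
// hypothetical helper, not part of this file:
//
#if 0
static void
skc_transform_scale_subpixel_sketch(float * const t) // { sx shx tx shy sy ty w0 w1 }
{
  // scale row 0 by the x subpixel resolution
  t[0] *= SKC_SUBPIXEL_RESL_X_F32; // sx
  t[1] *= SKC_SUBPIXEL_RESL_X_F32; // shx
  t[2] *= SKC_SUBPIXEL_RESL_X_F32; // tx

  // scale row 1 by the y subpixel resolution
  t[3] *= SKC_SUBPIXEL_RESL_Y_F32; // shy
  t[4] *= SKC_SUBPIXEL_RESL_Y_F32; // sy
  t[5] *= SKC_SUBPIXEL_RESL_Y_F32; // ty
}
#endif
//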
// FIXME -- horizontal load might be better than this broadcast load
//
union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
switch (tag)
{
case SKC_BLOCK_ID_TAG_PATH_LINE:
skc_rasterize_lines(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
sk_extent,
smem,
&nodeword,&id,
&tv,&cv,cohort);
break;
case SKC_BLOCK_ID_TAG_PATH_QUAD:
skc_rasterize_quads(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
sk_extent,
smem,
&nodeword,&id,
&tv,&cv,cohort);
break;
case SKC_BLOCK_ID_TAG_PATH_CUBIC:
skc_rasterize_cubics(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
sk_extent,
smem,
&nodeword,&id,
&tv,&cv,cohort);
break;
case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD: // FIXME -- not yet implemented
break;
case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: // FIXME -- not yet implemented
break;
default:
break;
}
}
//
//
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__global float8 const * const transforms, // FIXME -- __constant
__global float4 const * const clips, // FIXME -- __constant
__global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
skc_uint const count)
{
//
// declare shared memory block
//
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
__local struct skc_subgroup_smem volatile smem[1];
#else
__local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
__local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif
//
// this is a subgroup/warp-centric kernel
//
// which subgroup in the grid is this?
//
// TAKE NOTE: the Intel GEN compiler appears to recognize
// get_group_id(0) as a uniform but the alternative calculation used
// when there are multiple subgroups per workgroup is not being
// recognized as uniform and is driving register spillage elsewhere.
//
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
uint const cmd_idx = get_group_id(0);
#else
uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif
//
// if workgroups are multi-subgroup then there may be excess
// subgroups in the final workgroup
//
if (cmd_idx >= count)
return;
#if 0
if (get_sub_group_local_id() == 0)
printf("cmd_idx = %u\n",cmd_idx);
#endif
//
// load a single command for this subgroup
//
union skc_cmd_rasterize const cmd = cmds[cmd_idx];
//
// get first block node command word and its subblock
//
skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
//
// load transform -- uniform across subgroup
//
// v8: { sx shx tx shy sy ty w0 w1 }
//
// NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
//
// [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
//
// Coordinates are scaled to subpixel resolution. All that matters
// is that continuity is maintained between adjacent path elements'
// endpoints.
//
// It's the responsibility of the host to ensure that the transforms
// are properly scaled either by initializing a transform stack
// with the subpixel-resolution-scaled identity or by scaling the
// transform before it's loaded by a rasterization grid.
//
// FIXME -- horizontal load might be better than this broadcast load
//
union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
skc_rasterize_lines(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
sk_extent,
smem,
&nodeword,&id,
&tv,&cv,cohort);
}
//
//
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__global float8 const * const transforms, // FIXME -- __constant
__global float4 const * const clips, // FIXME -- __constant
__global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
skc_uint const count)
{
//
// declare shared memory block
//
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
__local struct skc_subgroup_smem volatile smem[1];
#else
__local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
__local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif
//
// this is a subgroup/warp-centric kernel
//
// which subgroup in the grid is this?
//
// TAKE NOTE: the Intel GEN compiler appears to recognize
// get_group_id(0) as a uniform but the alternative calculation used
// when there are multiple subgroups per workgroup is not being
// recognized as uniform and is driving register spillage elsewhere.
//
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
uint const cmd_idx = get_group_id(0);
#else
uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif
//
// if workgroups are multi-subgroup then there may be excess
// subgroups in the final workgroup
//
if (cmd_idx >= count)
return;
#if 0
if (get_sub_group_local_id() == 0)
printf("cmd_idx = %u\n",cmd_idx);
#endif
//
// load a single command for this subgroup
//
union skc_cmd_rasterize const cmd = cmds[cmd_idx];
//
// get first block node command word and its subblock
//
skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
//
// load transform -- uniform across subgroup
//
// v8: { sx shx tx shy sy ty w0 w1 }
//
// NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
//
// [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
//
// Coordinates are scaled to subpixel resolution. All that matters
// is that continuity is maintained between adjacent path elements'
// endpoints.
//
// It's the responsibility of the host to ensure that the transforms
// are properly scaled either by initializing a transform stack
// with the subpixel-resolution-scaled identity or by scaling the
// transform before it's loaded by a rasterization grid.
//
// FIXME -- horizontal load might be better than this broadcast load
//
union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
skc_rasterize_quads(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
sk_extent,
smem,
&nodeword,&id,
&tv,&cv,cohort);
}
//
//
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__global float8 const * const transforms, // FIXME -- __constant
__global float4 const * const clips, // FIXME -- __constant
__global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
skc_uint const count)
{
//
// declare shared memory block
//
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
__local struct skc_subgroup_smem volatile smem[1];
#else
__local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
__local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif
//
// this is a subgroup/warp-centric kernel
//
// which subgroup in the grid is this?
//
// TAKE NOTE: the Intel GEN compiler appears to recognize
// get_group_id(0) as a uniform but the alternative calculation used
// when there are multiple subgroups per workgroup is not being
// recognized as uniform and is driving register spillage elsewhere.
//
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
uint const cmd_idx = get_group_id(0);
#else
uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif
//
// if workgroups are multi-subgroup then there may be excess
// subgroups in the final workgroup
//
if (cmd_idx >= count)
return;
#if 0
if (get_sub_group_local_id() == 0)
printf("cmd_idx = %u\n",cmd_idx);
#endif
//
// load a single command for this subgroup
//
union skc_cmd_rasterize const cmd = cmds[cmd_idx];
//
// get first block node command word and its subblock
//
skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
//
// load transform -- uniform across subgroup
//
// v8: { sx shx tx shy sy ty w0 w1 }
//
// NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
//
// [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
//
// Coordinates are scaled to subpixel resolution. All that matters
// is that continuity is maintained between adjacent path elements'
// endpoints.
//
// It's the responsibility of the host to ensure that the transforms
// are properly scaled either by initializing a transform stack
// with the subpixel-resolution-scaled identity or by scaling the
// transform before it's loaded by a rasterization grid.
//
// FIXME -- horizontal load might be better than this broadcast load
//
union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
skc_rasterize_cubics(bp_atomics,
bp_elems,
bp_ids,
bp_mask,
cohort_atomics,
sk_extent,
smem,
&nodeword,&id,
&tv,&cv,cohort);
}
//
//
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__global float8 const * const transforms, // FIXME -- __constant
__global float4 const * const clips, // FIXME -- __constant
__global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
skc_uint const count)
{
; // FIXME -- rational quads are not yet implemented
}
//
//
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global union skc_bp_elem * const bp_elems,
__global uint * const bp_ids,
skc_uint const bp_mask,
__global SKC_ATOMIC_UINT volatile * const cohort_atomics,
__global skc_ttsk_s_t * const sk_extent,
__global float8 const * const transforms, // FIXME -- __constant
__global float4 const * const clips, // FIXME -- __constant
__global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
skc_uint const count)
{
; // FIXME -- rational cubics are not yet implemented
}
//
//
//