/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "block.h"
#include "styling_types.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1)
//
//
//
#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1()
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0
#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 )
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2()
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1
#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 )
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4()
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3
#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 )
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8()
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7
#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16)
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16()
#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15
#endif
//
// tile state flag bits
//
typedef enum skc_tile_flags_e {
// FLUSH
SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001,
SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002,
SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004,
// OPACITY
SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008,
//
// Note: testing for opacity and skipping scattering is on its way
// to becoming a much more programmable option because sometimes we
// may be compositing/blending from back-to-front and/or be using
// group blend rules that ignore opacity.
//
// The point is that all of these decisions should be encoded in
// styling commands and, as much as possible, removed from the final
// group/layer styling traversal render loop.
//
} skc_tile_flags_e;
//
// COVER -- assumes availability of either fp16 or fp32
//
union skc_tile_cover
{
struct {
SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH];
} aN;
#ifdef SKC_RENDER_TILE_COVER_VECTOR
struct {
SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT];
} vN;
#endif
};
//
// COLOR -- assumes availability of either fp16 or fp32
//
union skc_tile_color
{
union {
struct {
SKC_RENDER_TILE_COLOR r;
SKC_RENDER_TILE_COLOR g;
SKC_RENDER_TILE_COLOR b;
SKC_RENDER_TILE_COLOR a;
} rgba[SKC_TILE_WIDTH];
} aN;
#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
union {
SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH];
} iN;
#endif
#ifdef SKC_RENDER_TILE_COLOR_VECTOR
union {
SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT];
} vN;
#endif
struct {
union {
struct {
SKC_RENDER_TILE_COLOR r;
SKC_RENDER_TILE_COLOR g;
};
SKC_RENDER_GRADIENT_FLOAT distance;
};
union {
struct {
SKC_RENDER_TILE_COLOR b;
SKC_RENDER_TILE_COLOR a;
};
SKC_RENDER_GRADIENT_FLOAT stoplerp;
};
} grad[SKC_TILE_WIDTH];
};
//
// SHARED MEMORY STATE
//
#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)
#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)
#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))
//
//
//
union skc_subgroup_smem
{
//
// The tiles are stored in column-major / height-major order
//
// The final column is a guard column that is OK to write to but
// will never be read. It simplifies the TTSB scatter but could be
// predicated if SMEM is really at a premium.
//
#if ( SKC_RENDER_SUBGROUP_SIZE > 1 )
struct {
SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
} atomic;
#endif
struct {
int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
} aN;
struct { // assumption is that height = subgroup
SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE];
} vN;
struct { // assumption is that height = subgroup
SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE];
} wide;
union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT];
half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2];
#if 0
//
// SPILL TO GMEM
//
#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0)
struct {
#if (SKC_REGS_COLOR_S > 0)
union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
#endif
#if (SKC_REGS_COVER_S > 0)
float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
#endif
} regs;
#endif
//
//
//
#endif
};
//
//
//
#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
#define skc_subgroup_lane() 0
#else
#define skc_subgroup_lane() get_sub_group_local_id()
#endif
//
//
//
typedef skc_uint skc_ttsk_lo_t;
typedef skc_uint skc_ttsk_hi_t;
typedef skc_uint skc_ttpk_lo_t;
typedef skc_uint skc_ttpk_hi_t;
typedef skc_uint skc_ttxk_lo_t;
typedef skc_uint skc_ttxk_hi_t;
typedef skc_uint skc_ttck_lo_t;
typedef skc_uint skc_ttck_hi_t;
typedef skc_uint2 skc_ttck_t;
typedef skc_int skc_ttxb_t;
//
// TTCK (32-BIT COMPARE) v1:
//
//   0                                                            63
//   | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//   +----------------------+--------+--------+-------+-----+-----+
//   |          30          |    1   |    1   |   18  |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//   0                                                            63
//   | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//   +----------------------+--------+--------+-------+-----+-----+
//   |          30          |    1   |    1   |   15  |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
//
//   0                                                            63
//   | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//   +----------------------+--------+--------+-------+-----+-----+
//   |          27          |    1   |    1   |   18  |  9  |  8  |
//
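//
// A quick check of the "4K x 4K" claim above (a sketch, assuming the
// 64-bit-compare layout and an 8-wide x 16-tall pixel tile):
//
//   X : 9 bits -> 2^9 tiles *  8 pixels/tile = 4096 pixels
//   Y : 8 bits -> 2^8 tiles * 16 pixels/tile = 4096 pixels
//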
static
skc_uint
skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a)
{
return a & SKC_TTCK_LO_MASK_ID;
}
static
skc_layer_id
skc_ttck_get_layer(skc_ttck_t const a)
{
//
// FIXME -- a union with a ulong and a shift down and mask is
// probably faster on some architectures
//
skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER;
return lo | hi;
}
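//
// Note on the extraction above (an observation, not new behavior): it
// relies on the LAYER field straddling the two 32-bit words, as in the
// 64-bit-compare layout -- the low bits of LAYER sit above the
// ID/PREFIX/ESCAPE fields in .lo and the remaining bits occupy the
// bottom of .hi, which is why the two pieces are shifted into place
// and OR'd together.
//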
static
skc_uint
skc_ttck_hi_get_x(skc_ttck_hi_t const a)
{
return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X);
}
static
skc_uint
skc_ttck_hi_get_y(skc_ttck_hi_t const a)
{
return a >> SKC_TTCK_HI_OFFSET_Y;
}
static
skc_bool
skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b)
{
skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
skc_uint const hi = (a.hi ^ b.hi);
return (lo | hi) == 0;
}
static
skc_bool
skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b)
{
return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0;
}
static
skc_bool
skc_ttck_lo_is_prefix(skc_ttck_lo_t const a)
{
return (a & SKC_TTCK_LO_MASK_PREFIX) != 0;
}
//
// TILE TRACE SUBPIXEL
//
// The subpixels are encoded with either absolute tile coordinates
// (32-bits) or packed in delta-encoded form.
//
// For 32-bit subpixel packing of a 32x32 tile:
//
// A tile X is encoded as:
//
// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate.
//
// SX : 6 : unsigned subpixel span from min to max x with range
// [0,32]. The original direction is not captured. It would
// be nice to capture dx, but that isn't necessary right now
// and could be added later. <--- SPARE VALUES AVAILABLE
//
// A tile Y is encoded as:
//
// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.
//
// DY : 6 : signed subpixel delta y1-y0. The range of delta is
// [-32,32] but horizontal lines are not encoded so [1,32]
// is mapped to [0,31]. The resulting range [-32,31] fits
// in 6 bits.
//
// TTS:
//
//   0                            31
//   |  TX  |  SX  |  TY  |  DY  |
//   +------+------+------+------+
//   |  10  |   6  |  10  |   6  |
//
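// An illustrative (hypothetical) encoding under the layout above: a
// segment from subpixel (x0,y0) = (3,7) to (x1,y1) = (11,2) within a
// tile would pack as:
//
//   TX = min(3,11) = 3
//   SX = |11 - 3|  = 8
//   TY = min(7,2)  = 2
//   DY = y1 - y0   = -5   (negative deltas are stored as-is; a
//                          positive delta d is stored as d-1 so that
//                          [1,32] fits in [0,31])
//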
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)
{
//
// extract the whole pixel y coordinate
//
return SKC_BFE(a,
SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2,
SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
}
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)
{
//
// get the linear array tile index of the pixel
//
return (((a & SKC_TTS_MASK_TX_PIXEL)
#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)
>> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)
#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)
<< (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2)
#endif
) | skc_tts_get_ty_pixel_v(a));
}
#if 0
static
skc_ttx_v_s32_t
skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
{
skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;
return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));
}
#else
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
{
SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY;
return dy - (~a >> 31);
}
#endif
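//
// A short sketch of why the branchless decode above works, assuming
// SKC_RENDER_TTS_V_BITFIELD is a signed type so both shifts are
// arithmetic (DY occupies the top 6 bits, so the sign bit of 'a' is
// DY's sign bit):
//
//   stored dy >= 0 : sign bit of a is 0, so (~a >> 31) == -1
//                    and dy - (-1) == dy + 1   -- maps [0,31] to [1,32]
//   stored dy <  0 : sign bit of a is 1, so (~a >> 31) ==  0
//                    and dy is returned unchanged
//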
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)
{
return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2);
}
static
SKC_RENDER_TTS_V_BITFIELD
skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)
{
return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX);
}
//
//
//
static
void
skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem)
{
//
// SIMD / CPU
//
// &
//
// SIMT / GPU
//
// Note that atomic_init() is likely implemented as a simple
// assignment so there is no identifiable performance difference on
// current targets.
//
// If such an architecture appears in the future then we'll probably
// still want to implement this zero'ing operation as below but
// follow with an appropriate fence that occurs before any scatter
// operations.
//
// The baroque expansion below improves performance on Intel GEN by,
// presumably, achieving the 64-byte per clock SLM write as well as
// minimizing the overall number of SEND() block initializations and
// launches.
//
// Intel GENx has a documented 64 byte per cycle SLM write limit.
// So having each lane in an 8 lane subgroup zero-write 8 bytes is
// probably a safe bet (Later: benchmarking backs this up!).
//
// Note there is no reason at this time to unroll this loop.
//
for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)
smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );
}
//
// Note this is going to be vectorizable on most architectures.
//
// The return of the key translation feature might complicate things.
//
static
void
skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
__local union skc_subgroup_smem * SKC_RESTRICT const smem,
skc_block_id_t const pb_id)
{
skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
#if ( SKC_TILE_RATIO == 1 )
SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];
#elif ( SKC_TILE_RATIO == 2 )
SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);
#else
#error("tile ratio greater than 2 not supported")
#endif
//
// Note there is no need to use an atomic for this operation on the
// current group of target platforms... but this may change if
// atomic ops truly go through a different path.
//
// As noted above, this direct increment is probably faster and can
// always be followed by a fence.
//
// Furthermore, note that the key sorting orders all ttck keys
// before ttpk keys.
//
//
// FIXME -- if the SMEM store is wider than bank word count then we
// might want to odd-even interleave the TTP values if the target
// device can't handle 64-bit stores
//
//
// skipping per-key translation for now
//
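// Note on the shift below (an observation, not new behavior): a TTP
// prefix value is an accumulated altitude expressed in subpixel rows,
// so scaling it by 2 * SKC_SUBPIXEL_RESL_X puts it in the same
// fixed-point area units produced by the TTSB scatter, where a fully
// covered column of height dy contributes left + right, i.e.
// dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1).
//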
smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);
}
//
// Note that skc_scatter_ttsb is *not* vectorizable unless the
// architecture supports a "scatter-add" capability. All relevant
// GPUs support atomic add on shared/local memory and thus support
// scatter-add.
//
static
void
skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
__local union skc_subgroup_smem * SKC_RESTRICT const smem,
skc_block_id_t const sb_id)
{
skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset];
//
// Skipping per-key translation for now
//
// Index into tile
//
// The tiles are stored in column-major / height-major order
//
// The final column is a guard column that is OK to write to but
// will never be read. It simplifies the TTSB scatter but could be
// predicated if SMEM is really at a premium.
//
SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);
#if 0
if (tts_v != SKC_TTS_INVALID)
printf("(%08X) = %u\n",tts_v,xy_idx);
#endif
//
// adjust subpixel range to max y
//
// range is stored as [-32,31] and when read [0,31] is mapped to
// [1,32] because a dy of 0 is not possible.
//
// more succinctly: if dy >= 0 then ++dy
//
SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v);
//
// FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?
//
// this "min(x0) * 2 + dx" is equivalent to "x0 + x1"
SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);
// Calculate left and right coverage contribution trapezoids
SKC_RENDER_TTS_V_BITFIELD const left = dy * widths;
SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;
//
// Accumulate altitudes and areas
//
// Optimization: if the device supports a CPU/SIMD vector-add or
// GPU/SIMT scatter-add atomic int2 add operation then placing the
// ALT and AREA values side-by-side would halve the number of
// additions.
//
#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
//
// CPU/SIMD
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) \
if (tts_v C != SKC_TTS_INVALID) { \
smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \
smem->aN.area[ xy_idx C] += right C; \
}
#else
//
// GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) \
if (tts_v C != SKC_TTS_INVALID) { \
SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \
SKC_TILE_HEIGHT + xy_idx C, \
left C); \
SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \
right C); \
}
#endif
SKC_RENDER_TTSB_EXPAND();
}
//
// Note that 2048.0 can be represented exactly with fp16... fortuitous!
//
#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)
#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA)
#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1)
#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)
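//
// A quick arithmetic check of the fp16 note above, assuming a 32x32
// subpixel resolution (SKC_SUBPIXEL_RESL_X == SKC_SUBPIXEL_RESL_Y == 32):
//
//   SKC_RENDER_FILL_MAX_AREA = 2 * 32 * 32 = 2048 = 2^11
//
// and 2^11 is exactly representable in fp16, as is its reciprocal 2^-11.
//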
//
//
//
static
void
skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
union skc_tile_cover * SKC_RESTRICT const cover,
union skc_tile_color * SKC_RESTRICT const color)
{
SKC_RENDER_ACC_COVER_INT area = 0;
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
area += smem->vN.area[ii][skc_subgroup_lane()];
SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA));
cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);
}
}
static
void
skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
union skc_tile_cover * SKC_RESTRICT const cover,
union skc_tile_color * SKC_RESTRICT const color)
{
SKC_RENDER_ACC_COVER_INT area = 0;
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
area += smem->vN.area[ii][skc_subgroup_lane()];
SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));
cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;
}
}
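//
// A worked example of the even/odd reflection above (a sketch,
// assuming SKC_RENDER_FILL_MAX_AREA == 2048 so the mask is 4095):
//
//   accumulated area = 3000      ->  trapabs = 3000
//   |(3000 & 4095) - 2048|       ->  reflect = 952
//   cover = (2048 - 952) / 2048  ->  ~0.535
//
// i.e. an accumulated coverage of 3000/2048 ~ 1.465 "winds back" to
// 2 - 1.465 ~ 0.535, as even/odd filling requires.
//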
//
//
//
static
void
skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
uint * SKC_RESTRICT const cmd_next,
union skc_tile_color * SKC_RESTRICT const color)
{
//
// rgba = solid fill
//
__global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
*cmd_next += 2;
#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )
SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->aN.rgba[ii].r = rg.lo;
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->aN.rgba[ii].g = rg.hi;
SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->aN.rgba[ii].b = ba.lo;
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->aN.rgba[ii].a = ba.hi;
#else
SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
SKC_RENDER_TILE_COLOR const r = rg.lo;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);
SKC_RENDER_TILE_COLOR const g = rg.hi;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);
SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
SKC_RENDER_TILE_COLOR const b = ba.lo;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);
SKC_RENDER_TILE_COLOR const a = ba.hi;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);
#endif
}
//
// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
//
// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
//
// Lerp in two fma/mad ops:
//
// t * b + ((-t) * a + a)
//
// Note: OpenCL documents mix() as being implemented as:
//
// a + (b - a) * t
//
// But this may be a native instruction on some devices. For example,
// on GEN9 there is an LRP "linear interpolation" function but it
// doesn't appear to support half floats.
//
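// A one-line algebraic check that the fma form matches mix():
//
//   t*b + ((-t)*a + a)  =  t*b - t*a + a  =  a + (b - a)*t
//
// and the left-hand form maps directly onto the two mad() ops noted
// above.
//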
#if 1
#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a))
#else
#define SKC_LERP(a,b,t) mix(a,b,t)
#endif
//
// CPUs have a mock local address space so copying the gradient header
// is probably not useful. Just read directly from global.
//
#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
#define SKC_RENDER_GRADIENT_SPACE __local
#else
#define SKC_RENDER_GRADIENT_SPACE __global
#endif
//
// gradient is non-vertical
//
// removed the vertical (actually, horizontal) special case
//
static
void
skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
__global union skc_styling_cmd const * SKC_RESTRICT const commands,
uint * SKC_RESTRICT const cmd_next,
union skc_tile_color * SKC_RESTRICT const color,
skc_ttck_hi_t const ttck_hi)
{
//
// Where is this tile?
//
// Note that the gradient is being sampled from pixel centers.
//
SKC_RENDER_GRADIENT_FLOAT const y =
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P
(SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +
(skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));
float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);
//
// Get starting numerator and denominator
//
// Note: if gh[0].dx is exactly 0.0f then this is a vertical
// gradient and can be handled by a special opcode.
//
// Note: the mad() ordering is slightly different than the original
// CUDA implementation.
//
union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) };
*cmd_next += 4;
float const gv_x_dot = mad(x,gv.dx,gv.p0);
SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);
//
// Where are columns along gradient vector?
//
// TODO: Note that the gv_denom isn't multiplied through.
//
// Please double-check this... but I recall that in certain cases
// this wipes out some precision and results in minor but noticeable
// gradient artifacts.
//
// All arguments are scalars except gv_numer so a simpler
// evaluation might save some flops.
//
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;
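//
// Putting the pieces above together (an observation, not new math):
// for column ii of the tile the computed value is
//
//   distance[ii] = ((x + ii)*gv.dx + y*gv.dy + gv.p0) * gv.denom
//
// i.e. presumably the pixel center projected onto the gradient vector
// and normalized by the precomputed denominator.
//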
//
// is gradient non-repeating, repeating or reflecting?
//
switch (commands[(*cmd_next)++].u32)
{
case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);
break;
case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->grad[ii].distance -= floor(color->grad[ii].distance);
break;
default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING
//
// OPTIMIZATION: Can this be done in fewer than ~4 ops?
//
// Note: OpenCL "rint()" is round-to-nearest-even integer!
//
// Note: the floor() "round to -inf" op is implemented in the
// GEN op 'FRC' so probably don't use trunc() when floor will
// suffice.
//
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);
color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));
}
}
//
// initialize "stoplerp" for all columns
//
uint const slope_count = commands[(*cmd_next)++].u32;
uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME
{
float const slope = commands[(*cmd_next)++].f32;
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->grad[ii].stoplerp = color->grad[ii].distance * slope;
}
//
// compute stoplerp for remaining stops
//
for (int jj=1; jj<slope_count; jj++)
{
float const floor = (float)jj;
float const slope = commands[(*cmd_next)++].f32;
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);
}
//
// copy gradient colors to local memory
//
uint const gd_n = slope_count + 1;
#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
//
// copy entire gradient descriptor to local memory
//
for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)
smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;
__local half const * const SKC_RESTRICT gc = smem->gc + 0;
#else
//
// prefetch entire gradient header
//
// no noticeable impact on performance
//
// prefetch(&commands[*cmd_next].u32,gh_words);
//
__global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;
#endif
//
// adjust cmd_next so that V1 structure is consumed -- FIXME
//
*cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);
//
// lerp between color pair stops
//
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
//
// Finally, we have the gradient stop index and the color stop
// pair lerp fraction
//
// Note that if these are vector values then a gather operation
// must occur -- there may be platforms (AVX-512?) that can
// perform an explicit gather on a vector type but it's not
// really expressible in OpenCL except implicitly with a
// workgroup of work items.
//
// ***********************
//
// FIXME -- USE HERB'S SINGLE FMA LERP
//
// ***********************
//
SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);
SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));
{
SKC_RENDER_TILE_COLOR lo, hi;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) { \
SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \
lo C = cc.lo; \
hi C = cc.hi; \
}
SKC_RENDER_SCANLINE_VECTOR_EXPAND();
color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);
}
//
//
//
{
SKC_RENDER_TILE_COLOR lo, hi;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) { \
SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \
lo C = cc.lo; \
hi C = cc.hi; \
}
SKC_RENDER_SCANLINE_VECTOR_EXPAND();
color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);
}
//
//
//
{
SKC_RENDER_TILE_COLOR lo, hi;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) { \
SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \
lo C = cc.lo; \
hi C = cc.hi; \
}
SKC_RENDER_SCANLINE_VECTOR_EXPAND();
color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);
}
//
//
//
{
SKC_RENDER_TILE_COLOR lo, hi;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) { \
SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \
lo C = cc.lo; \
hi C = cc.hi; \
}
SKC_RENDER_SCANLINE_VECTOR_EXPAND();
color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);
}
}
}
//
//
//
static
void
skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc,
union skc_tile_cover const * SKC_RESTRICT const cover_wip,
union skc_tile_color const * SKC_RESTRICT const color_wip)
{
//
// fralunco = cover.wip * acc.a
//
// acc.r = fralunco * wip.r + acc.r
// acc.g = fralunco * wip.g + acc.g
// acc.b = fralunco * wip.b + acc.b
// acc.a = -fralunco * wip.a + acc.a
//
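// Note (an observation on the accumulator convention, not new
// behavior): acc.a holds the *remaining* transparency (1 - accumulated
// alpha) since the accumulator is cleared to { 0, 0, 0, 1 }, so
// fralunco is the WIP coverage attenuated by whatever transparency is
// still unclaimed -- the usual front-to-back "over" formulation.
//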
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a;
color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
}
}
//
//
//
static
void
skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc,
union skc_tile_cover const * SKC_RESTRICT const cover_wip,
union skc_tile_color const * SKC_RESTRICT const color_wip)
{
//
// cover_min = min(cover.wip,a.acc)
//
// r.acc = cover_min * r.wip + r.acc
// g.acc = cover_min * g.wip + g.acc
// b.acc = cover_min * b.wip + b.acc
// a.acc = -cover_min * a.wip + a.acc
//
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a);
color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
}
}
//
//
//
static
void
skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc,
union skc_tile_cover const * SKC_RESTRICT const cover_wip,
union skc_tile_color const * SKC_RESTRICT const color_wip)
{
//
// r.acc = (cover.wip * r.wip) * r.acc
// g.acc = (cover.wip * g.wip) * g.acc
// b.acc = (cover.wip * b.wip) * b.acc
// a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha)
//
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r;
color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g;
color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b;
color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a;
}
}
//
//
//
static
void
skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc,
union skc_tile_color * SKC_RESTRICT const color_acc,
union skc_tile_cover const * SKC_RESTRICT const cover_wip,
union skc_tile_color const * SKC_RESTRICT const color_wip)
{
//
// cover.wip.contrib = (1.0 - cover.acc) * cover.wip
// cover.acc = cover.acc + cover.wip.contrib
//
// r.acc = cover.wip.contrib * r.wip + r.acc
// g.acc = cover.wip.contrib * g.wip + g.acc
// b.acc = cover.wip.contrib * b.wip + b.acc
// a.acc = -cover.wip.contrib * a.wip * a.acc
//
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii];
cover_acc->aN.c[ii] += contrib;
color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
}
}
//
//
//
static
void
skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk,
union skc_tile_cover const * SKC_RESTRICT const cover_wip)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover_msk->aN.c[ii] = cover_wip->aN.c[ii];
#else
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
cover_msk->vN.c[ii] = cover_wip->vN.c[ii];
#endif
}
//
//
//
static
void
skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk,
union skc_tile_cover const * SKC_RESTRICT const cover_acc)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover_msk->aN.c[ii] = cover_acc->aN.c[ii];
#else
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
cover_msk->vN.c[ii] = cover_acc->vN.c[ii];
#endif
}
//
//
//
static
void
skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc,
union skc_tile_cover const * SKC_RESTRICT const cover_wip)
{
//
// cover.wip.contrib = (1.0 - cover.acc) * cover.wip
// cover.acc = cover.acc + cover.wip.contrib
//
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]);
}
//
//
//
static
void
skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip,
union skc_tile_cover const * SKC_RESTRICT const cover_msk)
{
//
// cover.wip *= cover.msk
//
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover_wip->aN.c[ii] *= cover_msk->aN.c[ii];
}
//
//
//
static
void
skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover->aN.c[ii] = 0;
#else
//
// GEN9 compiler underperforms on this
//
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
cover->vN.c[ii] = 0;
#endif
}
static
void
skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover->aN.c[ii] = 0;
#else
//
// GEN9 compiler underperforms on this
//
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
cover->vN.c[ii] = 0;
#endif
}
static
void
skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover->aN.c[ii] = 0;
#else
//
// GEN9 compiler underperforms on this
//
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
cover->vN.c[ii] = 0;
#endif
}
//
//
//
static
void
skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover->aN.c[ii] = 1;
#else
//
// GEN9 compiler underperforms on this
//
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;
#endif
}
//
//
//
static
void
skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)
{
#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
cover->aN.c[ii] = 1 - cover->aN.c[ii];
#else
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
cover->vN.c[ii] = 1 - cover->vN.c[ii];
#endif
}
//
//
//
static
void
skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)
{
#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
color->aN.rgba[ii].r = 0;
color->aN.rgba[ii].g = 0;
color->aN.rgba[ii].b = 0;
color->aN.rgba[ii].a = 1;
}
#else
//
// DISABLED ON GEN9 -- probably a compiler bug
//
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].even.even = 0;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].odd.even = 0;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].even.odd = 0;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].odd.odd = 1;
#endif
}
static
void
skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)
{
#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
color->aN.rgba[ii].r = 0;
color->aN.rgba[ii].g = 0;
color->aN.rgba[ii].b = 0;
color->aN.rgba[ii].a = 1;
}
#else
//
// DISABLED ON GEN9 -- probably a compiler bug
//
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].even.even = 0;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].odd.even = 0;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].even.odd = 0;
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
color->vN.rgba[ii].odd.odd = 1;
#endif
}
//
//
//
static
bool
skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)
{
//
// returns true if tile is opaque
//
// various hacks to test for complete tile opacity
//
// note that front-to-back currently has alpha at 0.0f -- this can
// be harmonized to use a traditional alpha if we want to support
// rendering in either direction
//
// hack -- ADD/MAX/OR all alphas together and test for non-zero
//
SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)
t += color->aN.rgba[ii].a;
#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
//
// SIMD
//
return !any(t != ( 0 ));
#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
//
// SIMT - scalar per lane
//
return !sub_group_any(t != 0);
#else
//
// SIMT - vector per lane
//
return !sub_group_any(any(t != ( 0 )));
#endif
//
// TODO: The alternative vector-per-lane implementation below is
// *not* believed to be performant because the terse vector-wide
// test is just hiding a series of comparisons and is likely worse
// than the blind ADD/MAX/OR'ing of all alphas followed by a single
// test.
//
#if 0
//
// SIMT - vector per lane
//
// __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1)))
for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
{
if (sub_group_any(any(color->vN.ba[ii].a != ( 0 ))))
return false;
}
return true;
#endif
}
//
//
//
static
void
skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
uint * SKC_RESTRICT const cmd_next,
union skc_tile_color * SKC_RESTRICT const color)
{
//
// acc.r = acc.a * r + acc.r
// acc.g = acc.a * g + acc.g
// acc.b = acc.a * b + acc.b
//
__global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
*cmd_next += 2;
SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r);
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g);
SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b);
}
//
//
//
// #define SKC_SURFACE_IS_BUFFER
#ifdef SKC_SURFACE_IS_BUFFER
static
void
skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface,
skc_uint const surface_pitch,
union skc_tile_color const * SKC_RESTRICT const color,
skc_ttck_hi_t const ttck_hi)
{
//
// NEW MAJOR OPTIMIZATION:
//
// Rotating and rasterizing the original world transform by -90
// degrees and then rendering the scene by +90 degrees enables
// the final surface composite to be performed in perfectly
// coalesced wide transactions.
//
// For this reason, linear access to the framebuffer is preferred.
//
// vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
//
// NOTE THIS IS TRANSPOSED BY 90 DEGREES
//
// INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
// CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
//
// IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
// CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
//
// FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
// ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
//
uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE;
uint const x = skc_ttck_hi_get_x(ttck_hi);
uint const y = skc_ttck_hi_get_y(ttck_hi) ;
uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane();
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 );
rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255);
rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8;
rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16;
surface[base + ii * pitch] = rgba;
// printf("%08v2X\n",rgba);
}
}
#else
static
void
skc_surface_composite_u8_rgba(__write_only image2d_t surface,
union skc_tile_color const * SKC_RESTRICT const color,
skc_ttck_hi_t const ttck_hi)
{
//
// NEW MAJOR OPTIMIZATION:
//
// Rotating and rasterizing the original world transform by -90
// degrees and then rendering the scene by +90 degrees enables
// the final surface composite to be performed in perfectly
// coalesced wide transactions.
//
// For this reason, linear access to the framebuffer is preferred.
//
// vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
//
// NOTE THIS IS TRANSPOSED BY 90 DEGREES
//
// INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
// CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
//
// IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
// CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
//
// FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
// ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
//
#if 1
int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) { \
SKC_RENDER_SURFACE_WRITE(surface, \
(int2)(x,y+I), \
color->iN.rgba[ii] A); \
}
#else
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) { \
SKC_RENDER_SURFACE_COLOR const rgba = \
(SKC_RENDER_SURFACE_COLOR) \
(color->aN.rgba[ii].r C, \
color->aN.rgba[ii].g C, \
color->aN.rgba[ii].b C, \
1.0); \
SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \
}
#endif
SKC_RENDER_SCANLINE_VECTOR_EXPAND();
x += 1;
}
#else
int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
// __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
{
#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) { \
SKC_RENDER_SURFACE_WRITE(surface, \
(int2)(x+I,y+ii), \
color->iN.rgba[ii] A); \
}
#else
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A) { \
SKC_RENDER_SURFACE_COLOR const rgba = \
(SKC_RENDER_SURFACE_COLOR) \
(color->aN.rgba[ii].r C, \
color->aN.rgba[ii].g C, \
color->aN.rgba[ii].b C, \
1.0); \
SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \
}
#endif
SKC_RENDER_SCANLINE_VECTOR_EXPAND();
}
#endif
}
#endif
//
//
//
static
uint const
skc_ttck_lane(uint const ttck_idx)
{
return ttck_idx & SKC_RENDER_SUBGROUP_MASK;
}
//
// RENDER KERNEL
//
__kernel
SKC_RENDER_KERNEL_ATTRIBS
void
skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers,
__global struct skc_group_node const * SKC_RESTRICT const groups,
__global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename
__global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys
skc_uint const ttck_count, // rename: key_count
__global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets
skc_uint const tile_count, // rename: offset_count
__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
#ifdef SKC_SURFACE_IS_BUFFER
__global void * SKC_RESTRICT const surface,
#else
__write_only image2d_t surface,
#endif
#ifdef SKC_SURFACE_IS_BUFFER
skc_uint const surface_pitch,
#endif
uint4 const tile_clip) // rename: clip
{
//
// Each subgroup is responsible for a tile. No extra subgroups are
// launched.
//
// FIXME -- might be better implemented as a "grid stride loop" if
// Intel GEN really has a local memory "quantum" of 4KB which means
// we would need to launch 4 subgroups per workgroup.
//
// Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB.
//
//
// declare tile cover and color registers
//
// this used to be a neat unified struct but the Intel GEN compiler
// wasn't cooperating and spilling to private memory even though all
// registers were indexed by constants
//
union skc_tile_color color_wip;
union skc_tile_color color_acc;
union skc_tile_cover cover_wip;
union skc_tile_cover cover_acc;
union skc_tile_cover cover_msk;
//
// which subgroup in the grid is this?
//
// TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0)
// as a uniform but the alternative calculation used when there are
// multiple subgroups per workgroup is not cooperating and is
// driving spillage elsewhere.
//
#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
skc_uint const ttck_offset_idx = get_group_id(0);
#else
skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif
//
// load the starting ttck for this offset and get a bound on the max
// number of keys that might be loaded
//
// these are uniform across all subgroup lanes
//
skc_uint ttck_idx = ttck_offsets[ttck_offset_idx];
//
// FIXME -- SIMD/CPU version should probably load a 256-bit (4-wide)
// vector of ttck keys
//
#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
skc_ttck_t ttck = ttck_keys[ttck_idx];
#else
uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK;
uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)];
#endif
//
// set up style group/layer state
//
struct skc_styling_group {
union skc_group_range range;
skc_uint depth;
skc_uint id;
} group;
group.range.lo = 0;
group.range.hi = SKC_UINT_MAX;
group.depth = SKC_UINT_MAX;
group.id = SKC_UINT_MAX;
//
// start with clear tile opacity, knockout and flag bits
//
// uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
// uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
//
skc_uint flags = 0;
//
// declare and initialize accumulators
//
#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
__local union skc_subgroup_smem smem[1];
#else
__local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS];
__local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id();
#endif
#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
//
// select the initial ttck key
//
skc_ttck_t ttck;
#if 0
ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN
#else
ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND
ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane);
#endif
#endif
//
// save the first key so we know what tile we're in
//
skc_ttck_t ttck0 = ttck;
//
// evaluate the coarse clip as late as possible
//
skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi);
if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x))
return;
skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi);
if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y))
return;
#if 0
printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y);
#endif
//
// load -> scatter -> flush
//
while (true)
{
// if scattering is disabled then just run through ttck keys
bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0;
// need to clear accumulators before a scatter loop
if (is_scatter_enabled)
{
skc_tile_aa_zero(smem);
}
do {
// skip scattering?
if (is_scatter_enabled)
{
skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo);
if (skc_ttck_lo_is_prefix(ttck.lo)) {
skc_scatter_ttpb(ttxb_extent,smem,xb_id);
} else {
skc_scatter_ttsb(ttxb_extent,smem,xb_id);
}
}
//
// any ttck keys left?
//
if (++ttck_idx >= ttck_count)
{
flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
break;
}
//
// process next ttck key
//
#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
//
// SIMD -- read next key
//
ttck = ttck_keys[ttck_idx];
#else
//
// SIMT -- refresh the ttck_s?
//
uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
if (ttck_lane_next == 0)
ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)];
//
// broadcast next key to entire subgroup
//
#if 0
ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN
#else
ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND
ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next);
#endif
#endif
// continue scattering if on same YXL layer
} while (skc_ttck_equal_yxl(ttck0,ttck));
// finalize if no longer on same YX tile
if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi))
{
// otherwise, unwind the tile styling and exit
flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
}
//
// given: new layer id from ttxk key
//
// load [layer id]{ group id, depth }
//
// if within current group's layer range
//
// if at same depth
//
// load and execute cover>[mask>]color>blend commands
//
// else if not at same depth then move deeper
//
// for all groups in group trail from cur depth to new depth
// enter group, saving and initializing regs as necessary
// increment depth and update layer range
// load and execute cover>[mask>]color>blend commands
//
// else not within layer range
//
// exit current group, restoring regs as necessary
// decrement depth and update layer range
//
//
skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi
union skc_layer_node const layer_node_new = layers[layer_id_new];
// clear flag that controls group/layer traversal
flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE;
do {
bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0;
//
// is layer a child of the current parent group?
//
uint cmd_next = 0;
if (!unwind && (layer_node_new.parent == group.id))
{
// execute this layer's cmds
cmd_next = layer_node_new.cmds;
// if this is final then configure so groups get unwound, otherwise we're done
flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
? SKC_TILE_FLAGS_FLUSH_UNWIND
: SKC_TILE_FLAGS_FLUSH_COMPLETE);
}
else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi))
{
//
// is layer in a child group?
//
union skc_group_parents const gp = groups[layer_node_new.parent].parents;
uint const gn = gp.depth - ++group.depth;
if (gn == 0)
group.id = layer_node_new.parent;
else
group.id = commands[gp.base + gn - 1].parent;
// update group layer range
group.range = groups[group.id].range;
// enter current group
cmd_next = groups[group.id].cmds.enter;
}
else // otherwise, exit this group
{
// leave current group
cmd_next = groups[group.id].cmds.leave;
// decrement group depth
if (--group.depth == SKC_UINT_MAX)
{
flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE;
}
else
{
// get path_base of current group
uint const gnpb = groups[group.id].parents.base;
// get parent of current group
group.id = commands[gnpb].parent;
// update group layer range
group.range = groups[group.id].range;
}
}
//
// execute cmds
//
while (true)
{
union skc_styling_cmd const cmd = commands[cmd_next++];
switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE)
{
case SKC_STYLING_OPCODE_NOOP:
break;
case SKC_STYLING_OPCODE_COVER_NONZERO:
skc_tile_cover_nonzero(smem,&cover_wip,&color_wip);
break;
case SKC_STYLING_OPCODE_COVER_EVENODD:
skc_tile_cover_evenodd(smem,&cover_wip,&color_wip);
break;
case SKC_STYLING_OPCODE_COVER_ACCUMULATE:
skc_tile_cover_accumulate(&cover_acc,&cover_wip);
break;
case SKC_STYLING_OPCODE_COVER_MASK:
skc_tile_cover_wip_mask(&cover_wip,&cover_msk);
break;
case SKC_STYLING_OPCODE_COVER_WIP_ZERO:
skc_tile_cover_wip_zero(&cover_wip);
break;
case SKC_STYLING_OPCODE_COVER_ACC_ZERO:
skc_tile_cover_acc_zero(&cover_acc);
break;
case SKC_STYLING_OPCODE_COVER_MASK_ZERO:
skc_tile_cover_msk_zero(&cover_msk);
break;
case SKC_STYLING_OPCODE_COVER_MASK_ONE:
skc_tile_cover_msk_one(&cover_msk);
break;
case SKC_STYLING_OPCODE_COVER_MASK_INVERT:
skc_tile_cover_msk_invert(&cover_msk);
break;
case SKC_STYLING_OPCODE_COLOR_FILL_SOLID:
skc_tile_color_fill_solid(commands,&cmd_next,&color_wip);
break;
case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
//
// FIXME -- gradients shouldn't be executing so much
// conditional driven code at runtime since we *know*
// the gradient style on the host can just create a
// new styling command to exploit this.
//
// FIXME -- it might be time to try using the GPU's
// sampler on a linear array of half4 vectors -- it
// might outperform the explicit load/lerp routines.
//
// FIXME -- optimizing for vertical gradients (uhhh,
// they're actually horizontal due to the -90 degree
// view transform) is nice but is it worthwhile to
// have this in the kernel? Easy to add it back...
//
#if defined( SKC_ARCH_GEN9 )
// disable gradients due to excessive spillage -- fix later
cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32);
#else
skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);
#endif
break;
case SKC_STYLING_OPCODE_COLOR_WIP_ZERO:
skc_tile_color_wip_zero(&color_wip);
break;
case SKC_STYLING_OPCODE_COLOR_ACC_ZERO:
skc_tile_color_acc_zero(&color_acc);
break;
case SKC_STYLING_OPCODE_BLEND_OVER:
skc_tile_blend_over(&color_acc,&cover_wip,&color_wip);
break;
case SKC_STYLING_OPCODE_BLEND_PLUS:
skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip);
break;
case SKC_STYLING_OPCODE_BLEND_MULTIPLY:
skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip);
break;
case SKC_STYLING_OPCODE_BLEND_KNOCKOUT:
skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip);
break;
case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip);
break;
case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc);
break;
case SKC_STYLING_OPCODE_BACKGROUND_OVER:
skc_tile_background_over(commands,&cmd_next,&color_acc);
break;
case SKC_STYLING_OPCODE_SURFACE_COMPOSITE:
#ifdef SKC_SURFACE_IS_BUFFER
skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi);
#else
skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi);
#endif
break;
case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
if (skc_tile_color_test_opacity(&color_acc))
flags |= SKC_TILE_FLAGS_SCATTER_SKIP;
break;
default:
return; // this is an illegal opcode -- trap and die!
}
//
// if sign bit is set then this was final command
//
if (cmd.s32 < 0)
break;
}
// continue as long as tile flush isn't complete
} while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0);
// return if was the final flush
if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
return;
// update wip ttck_hi
ttck0 = ttck;
}
}
//
//
//