/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
#include <stdio.h>
#include "common/cl/assert_cl.h"
#include "context.h"
#include "handle.h"
#include "grid.h"
#include "path.h"
#include "path_builder.h"
#include "config_cl.h"
#include "export_cl_12.h"
#include "runtime_cl_12.h"
#include "path_builder_cl_12.h"
//
// OpenCL 1.2 devices support mapping of buffers into the host address
// space.
//
// Mapped buffers must be aligned on a MIN_DATA_TYPE_ALIGN_SIZE byte
// boundary (e.g. 128 bytes).  This complicates coordinating sharing
// of data between the host and the device.
//
// Some OpenCL 2.0 devices support fine-grained shared virtual memory
// pointers with byte-addressing and allow simpler coordination
// strategies at the cost of maintaining cache coherency.
//
// The path builder is focused on moving bulk path data from the host
// into the device-managed "block" memory pool and arranging it into a
// SIMT/SIMD-friendly data structure that can be efficiently read by
// the rasterizer.
//
// Note that one simplifying assumption is that a *single* path can
// never be larger than what fits in the single extent (which is
// split into M subbuffers).  This would be a very long path and a
// legitimate size limitation.
//
// For some systems, it may be appropriate to never pull path data
// into the device-managed block pool and instead present the path
// data to the device in a temporarily allocated memory "zone" of
// paths that can be discarded all at once.
//
// For other systems, it may be appropriate to simply copy the path
// data from host to device.
//
// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be
// targeting support basic map/unmap functionality similar to OpenCL
// 1.2.  Furthermore, not all OpenCL 2.0 devices support fine-grained
// sharing of memory, so a map/unmap step is still required... but
// note that they all support byte-aligned mapping and subbuffers.
//
// The general strategy that this particular CL_12 implementation uses
// is to allocate a large mappable bulk-data path buffer and an
// auxiliary mappable command buffer.
//
// The buffers are split into a reasonable number of properly aligned
// subbuffers to enable simultaneous host and device access.
//
//
// Blocks:
//   1 extent
//   M mapped subbuffers (configurable) to allow for concurrency
//
// Commands:
//   1 extent
//   M mapped subbuffers (configurable) to allow for concurrency
//
// Spans:
//   M hi/lo structures
//
//   { cl_sub, void*, event, base }
//
//   - size of sub buffer
//   - remaining
//
//   - counts
//
//
// For any kernel launch, at most one path will be discontiguous and
// defined across two sub-buffers.
//
// Nodes are updated locally until full and then stored so they will
// never be incomplete. Headers are stored locally until the path is
// ended so they will never be incomplete.
//
// A line, quad or cubic acquires 4/6/8 coordinate subblocks which may
// be spread across one or more contiguous blocks.
//
// If a flush() occurs then the remaining columns of multi-segment
// paths are initialized with zero-length line, quad, cubic elements.
//
// Every block's command word has a type and a count acquired from a
// rolling counter.
//
// The kernel is passed two spans of blocks { base, count } to
// process.  The grid must process (lo.count + hi.count) blocks.
//
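//
// For example (purely illustrative sizing): with 4 subbufs of 1024
// blocks each, the ring holds 4096 blocks, and a launch with one span
// of 6 blocks starting at block 4090 (wrapping past the end of the
// ring) plus a second span of 2 blocks starting at block 0 dispatches
// a 1D grid of 6 + 2 = 8 work items.
//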
struct skc_subbuffer_blocks
{
cl_mem device;
void * host;
};
struct skc_subbuffer_cmds
{
cl_mem device;
void * host;
cl_event map;
};
//
// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer )
//
typedef skc_uint skc_ringdex_t;
union skc_ringdex_expand
{
div_t qr;
struct {
#ifndef SKC_DIV_REM_BEFORE_QUOT // define SKC_DIV_REM_BEFORE_QUOT when offsetof(div_t,quot) != 0
skc_uint subbuf;
skc_uint block;
#else
skc_uint block;
skc_uint subbuf;
#endif
};
};
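//
// A compile-time sanity sketch (not part of the original source) for
// the aliasing above -- it assumes skc_uint matches the width of
// div_t's int members so that subbuf/block exactly overlay quot/rem:
//
//   _Static_assert(sizeof(div_t) == 2 * sizeof(skc_uint),
//                  "skc_ringdex_expand relies on div_t layout");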
//
// this record is executed by the grid
//
struct skc_release_record
{
struct skc_path_builder_impl * impl; // back pointer to impl
skc_grid_t grid; // pointer to scheduled grid
skc_uint from; // inclusive starting index : [from,to)
skc_uint to; // non-inclusive ending index : [from,to)
};
//
//
//
struct skc_path_builder_impl
{
struct skc_path_builder * path_builder;
struct skc_runtime * runtime;
cl_command_queue cq;
struct {
cl_kernel alloc;
cl_kernel copy;
} kernels;
//
// FIXME -- make this pointer to constant config
//
// vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
struct {
skc_uint subbufs; // how many subbufs in the buffer?
struct {
skc_uint buffer; // how many blocks in the buffer?
skc_uint subbuf; // how many blocks in a subbuf?
} blocks_per;
} ring;
//
// ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^
//
struct {
cl_mem buffer; // backing buffer for blocks
struct skc_subbuffer_blocks * subbufs; // array of structures
} blocks;
struct {
cl_mem buffer; // backing buffer for commands
struct skc_subbuffer_cmds * subbufs; // array of structures
} cmds;
struct {
struct skc_release_record * records; // max release records is equal to max subbufs
skc_path_t * paths; // max paths is less than or equal to max commands
} release;
cl_mem reads; // each kernel only requires one word to store the block pool "base"
struct {
skc_uint rolling; // rolling counter used by cmds to map to block pool alloc
skc_ringdex_t from;
skc_ringdex_t to;
} prev;
struct {
skc_ringdex_t from;
skc_ringdex_t to;
} curr;
struct {
struct skc_path_head * head; // pointer to local path header -- not written until path end
struct skc_path_node * node; // pointer to local node -- may alias head until head is full
struct {
skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated
union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids
skc_uint rem; // how many id slots left in node block
} ids;
struct {
skc_uint rem; // how many subblocks left in block?
skc_uint rolling; // rolling counter of block of subblocks
float * next; // next subblock in current subblock block
skc_uint idx; // index of next subblock
} subblocks;
struct {
skc_uint one; // .block = 1
skc_uint next; // rolling counter used by cmds to map to block pool alloc
} rolling;
skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current
} wip;
};
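//
// Rough lifecycle of the ring indices above (see
// skc_path_builder_grid_pfn_waiting): wip.to is the next block to be
// acquired, curr tracks the blocks acquired since the last kernel
// launch, and prev carries the partially filled tail of the previous
// launch so that a path still in progress at launch time is completed
// by the next launch.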
//
// FIXME -- move to a pow2 subbuffer size and dispense with division
// and modulo operations
//
static
union skc_ringdex_expand
skc_ringdex_expand(struct skc_path_builder_impl * const impl,
skc_ringdex_t const ringdex)
{
return (union skc_ringdex_expand){
.qr = div(ringdex,impl->ring.blocks_per.subbuf)
};
}
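//
// For example (illustrative sizing): with blocks_per.subbuf == 1024,
// a ringdex of 2051 expands to { .subbuf = 2, .block = 3 }.
//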
static
void
skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl)
{
//
// FIXME - which is faster?
//
#if 1
impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer;
#else
  impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer - 1) ? -1 : impl->wip.to;
#endif
// this path is too long -- for now assert() and die
assert(impl->wip.to != impl->curr.from);
}
static
skc_ringdex_t
skc_ringdex_span(struct skc_path_builder_impl * const impl,
skc_ringdex_t const from,
skc_ringdex_t const to)
{
return (to - from) % impl->ring.blocks_per.buffer;
}
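//
// For example (illustrative pow2 sizing): with blocks_per.buffer ==
// 4096, span(3,10) == 7 and the wrapped span(4090,6) == 12 -- the
// unsigned subtraction wraps correctly because 4096 divides 2^32.
//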
static
void
skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl)
{
union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
// nothing to do if this is the first block in the subbuf
if (to.block == 0)
return;
  // otherwise, round wip.to up to the start of the next subbuf
  skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs;
  impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf;
}
static
skc_bool
skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl)
{
return impl->curr.from == impl->curr.to;
}
static
skc_bool
skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl)
{
return impl->prev.from == impl->prev.to;
}
static
skc_uint
skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl,
skc_uint const to_block)
{
// no blocks acquired OR this is last block in subbuf
return !((impl->wip.to == impl->curr.to) || (to_block == 0));
}
//
//
//
static
struct skc_release_record *
skc_release_curr(struct skc_path_builder_impl * const impl)
{
union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
return impl->release.records + curr_from.subbuf;
}
//
// FIXME -- get rid of all distant config references -- grab them all at creation time
//
static
void
skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl)
{
// init header counters // { handle, blocks, nodes, prims }
impl->wip.head->header = (union skc_path_header){
.handle = 0,
.blocks = 0,
.nodes = 0,
.prims = 0
};
// FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS
impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN };
// point wip ids at local head node
impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node
impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere
// start with no subblocks
impl->wip.subblocks.rem = 0;
}
//
//
//
static
void
skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl)
{
#if 1
//
// FIXME -- a Duff's device might be optimal here but would have to
// be customized per device since nodes could be 16-128+ words
//
while (impl->wip.ids.rem > 0)
{
impl->wip.ids.rem -= 1;
impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID;
impl->wip.ids.next += 1;
}
#else
memset(&impl->wip.ids.next->u32,
SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF
sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem);
impl->wip.ids.next += impl->wip.ids.rem;
impl->wip.ids.rem = 0;
#endif
}
//
//
//
static
void
skc_zero_float(skc_float * p, skc_uint rem)
{
memset(p,0,sizeof(*p)*rem);
}
static
void
skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder)
{
//
// FIXME -- it might be more performant to zero the remaining
// columns in a subblock -- a subblock at a time -- instead of the
// same column across all the subblocks
//
#if 0
while (path_builder->line.rem > 0)
{
--path_builder->line.rem;
*path_builder->line.coords[0]++ = 0.0f;
*path_builder->line.coords[1]++ = 0.0f;
*path_builder->line.coords[2]++ = 0.0f;
*path_builder->line.coords[3]++ = 0.0f;
}
while (path_builder->quad.rem > 0)
{
--path_builder->quad.rem;
      *path_builder->quad.coords[0]++ = 0.0f;
      *path_builder->quad.coords[1]++ = 0.0f;
      *path_builder->quad.coords[2]++ = 0.0f;
      *path_builder->quad.coords[3]++ = 0.0f;
      *path_builder->quad.coords[4]++ = 0.0f;
      *path_builder->quad.coords[5]++ = 0.0f;
}
while (path_builder->cubic.rem > 0)
{
--path_builder->cubic.rem;
      *path_builder->cubic.coords[0]++ = 0.0f;
      *path_builder->cubic.coords[1]++ = 0.0f;
      *path_builder->cubic.coords[2]++ = 0.0f;
      *path_builder->cubic.coords[3]++ = 0.0f;
      *path_builder->cubic.coords[4]++ = 0.0f;
      *path_builder->cubic.coords[5]++ = 0.0f;
      *path_builder->cubic.coords[6]++ = 0.0f;
      *path_builder->cubic.coords[7]++ = 0.0f;
}
#else
if (path_builder->line.rem > 0)
{
skc_zero_float(path_builder->line.coords[0],path_builder->line.rem);
skc_zero_float(path_builder->line.coords[1],path_builder->line.rem);
skc_zero_float(path_builder->line.coords[2],path_builder->line.rem);
skc_zero_float(path_builder->line.coords[3],path_builder->line.rem);
path_builder->line.rem = 0;
}
if (path_builder->quad.rem > 0)
{
skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem);
skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem);
skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem);
skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem);
skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem);
skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem);
path_builder->quad.rem = 0;
}
if (path_builder->cubic.rem > 0)
{
skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem);
skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem);
skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem);
skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem);
skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem);
skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem);
skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem);
skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem);
path_builder->cubic.rem = 0;
}
#endif
}
//
//
//
static
void
skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl,
skc_uint from,
skc_uint to)
{
// to might be out of range
to = to % impl->ring.subbufs;
#if 0
fprintf(stderr,"unmap: [%2u,%2u)\n",from,to);
#endif
  while (from != to)
    {
      // keep 'from' in range
      from = from % impl->ring.subbufs;
struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from;
cl(EnqueueUnmapMemObject(impl->cq,
blocks->device,
blocks->host,
0,NULL,NULL));
cl(EnqueueUnmapMemObject(impl->cq,
cmds->device,
cmds->host,
0,NULL,NULL));
      // advance 'from' and keep it in range
      from = (from + 1) % impl->ring.subbufs;
}
}
//
// FIXME -- reuse this in create()
//
static
void
skc_path_builder_impl_map(struct skc_path_builder_impl * const impl,
skc_uint from,
skc_uint to)
{
// to might be out of range
to = to % impl->ring.subbufs;
#if 0
fprintf(stderr," map: [%2u,%2u)\n",from,to);
#endif
while (from != to)
{
cl_int cl_err;
struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from;
blocks->host = clEnqueueMapBuffer(impl->cq,
blocks->device,
CL_FALSE,
CL_MAP_WRITE_INVALIDATE_REGION,
0,impl->runtime->config->paths_copy.block.subbuf,
0,NULL,NULL,
&cl_err); cl_ok(cl_err);
cl(ReleaseEvent(cmds->map));
cmds->host = clEnqueueMapBuffer(impl->cq,
cmds->device,
CL_FALSE,
CL_MAP_WRITE_INVALIDATE_REGION,
0,impl->runtime->config->paths_copy.command.subbuf,
0,NULL,&cmds->map,
&cl_err); cl_ok(cl_err);
      // advance 'from' and keep it in range
      from = (from + 1) % impl->ring.subbufs;
}
//
// FIXME -- when we switch to out of order queues we'll need a barrier here
//
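  // one possible sketch for that out-of-order case (assuming the
  // OpenCL 1.2 API) is a single barrier so the remaps can't be
  // reordered ahead of the copy kernel:
  //
  //   cl(EnqueueBarrierWithWaitList(impl->cq,0,NULL,NULL));
  //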
}
//
//
//
static
void
skc_path_builder_release_dispose(struct skc_release_record * const release,
struct skc_path_builder_impl * const impl)
{
struct skc_runtime * runtime = impl->runtime;
if (release->from <= release->to) // no wrap
{
skc_path_t const * paths = impl->release.paths + release->from;
skc_uint count = release->to - release->from;
skc_grid_deps_unmap(runtime->deps,paths,count);
skc_runtime_path_device_release(runtime,paths,count);
}
else // from > to implies wrap
{
skc_path_t const * paths_lo = impl->release.paths + release->from;
skc_uint count_lo = impl->ring.blocks_per.buffer - release->from;
skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo);
skc_runtime_path_device_release(runtime,paths_lo,count_lo);
skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to);
skc_runtime_path_device_release(runtime,impl->release.paths,release->to);
}
release->to = release->from;
}
static
void
skc_path_builder_grid_pfn_dispose(skc_grid_t const grid)
{
struct skc_release_record * const release = skc_grid_get_data(grid);
struct skc_path_builder_impl * const impl = release->impl;
skc_path_builder_release_dispose(release,impl);
}
static
void
// skc_path_builder_complete(struct skc_release_record * const release)
skc_path_builder_complete(skc_grid_t grid)
{
//
// notify deps that this grid is complete enough for other grids to
// proceed
//
// the path builder still has some cleanup to do before all its
// resources can be reused
//
skc_grid_complete(grid);
}
static
void
skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid)
{
SKC_CL_CB(status);
struct skc_release_record * const release = skc_grid_get_data(grid);
SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid);
}
//
//
//
static
void
skc_path_builder_grid_pfn_waiting(skc_grid_t const grid)
{
struct skc_release_record * const release = skc_grid_get_data(grid);
struct skc_path_builder_impl * const impl = release->impl;
// 1. flush incomplete subblocks of path elements
// 2. unmap subbuffer on cq.unmap
// 3. flush cq.unmap
// 4. launch kernel on cq.kernel but wait for unmap completion
// 5. flush cq.kernel
// 6. remap relevant subbuffers on cq.map but wait for kernel completion
// 7. flush cq.map
//
// FIXME -- can be smarter about flushing if the wip paths are not
// in the same subbuf as curr.to
//
// THIS IS IMPORTANT TO FIX
//
// flush incomplete subblocks
skc_path_builder_finalize_subblocks(impl->path_builder);
//
// get range of subbufs that need to be unmapped
//
// note that impl->prev subbufs have already been unmapped
//
union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to);
skc_uint const is_partial = curr_to.block > 0;
skc_uint const unmap_to = curr_to.subbuf + is_partial;
//
// unmap all subbufs in range [from,to)
//
skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to);
//
// launch kernels
//
skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to);
skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to);
skc_uint const pb_cmds = pb_prev_span + pb_curr_span;
//
// 1) allocate blocks from pool
//
//
// FIXME -- pack integers into struct/vector
//
cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw)));
cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads)));
cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf)));
cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds)));
skc_device_enqueue_kernel(impl->runtime->device,
SKC_DEVICE_KERNEL_ID_PATHS_ALLOC,
impl->cq,
impl->kernels.alloc,
1,
0,NULL,NULL);
//
// 2) copy blocks from unmapped device-accessible memory
//
//
// FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7
//
cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw)));
cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw)));
cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw)));
cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask)));
cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads)));
cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf)));
cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer)));
cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer)));
cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer)));
cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling)));
cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from)));
cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span)));
cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from)));
cl_event complete;
skc_device_enqueue_kernel(impl->runtime->device,
SKC_DEVICE_KERNEL_ID_PATHS_COPY,
impl->cq,
impl->kernels.copy,
pb_cmds,
0,NULL,&complete);
// set a callback on completion
cl(SetEventCallback(complete,CL_COMPLETE,
skc_path_builder_paths_copy_cb,
grid));
// immediately release
cl(ReleaseEvent(complete));
//
// remap as many subbuffers as possible after the kernel completes
//
// note that remaps are async and enqueued on the same command queue
// as the kernel launch
//
// we can't remap subbuffers that are in the possibly empty range
//
// cases:
//
// - curr.to == wip.to which means no blocks have been acquired
// - curr.to points to first block in (next) subbuf
// - otherwise, wip acquired blocks in the curr.to subbuf
//
// check for these first 2 cases!
//
union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from);
skc_uint const no_wip = impl->curr.to == impl->wip.to;
skc_uint map_to = curr_to.subbuf + (is_partial && no_wip);
// remap all subbufs in range [from,to)
skc_path_builder_impl_map(impl,prev_from.subbuf,map_to);
// flush command queue
cl(Flush(impl->cq));
// save rolling
impl->prev.rolling = impl->wip.rolling.next;
// update prev and curr
if (no_wip)
{
//
// if there was no wip then round up to the next subbuf
//
skc_ringdex_wip_to_subbuf_inc(impl);
//
      // update prev/curr with the incremented wip
//
impl->prev.from = impl->prev.to = impl->wip.to;
impl->curr.from = impl->curr.to = impl->wip.to;
}
else
{
//
// update prev with wip partials
//
impl->prev.from = impl->curr.to;
impl->prev.to = impl->wip .to;
//
// start curr on a new subbuf boundary
//
skc_ringdex_wip_to_subbuf_inc(impl);
impl->curr.from = impl->wip.to;
impl->curr.to = impl->wip.to;
}
}
//
//
//
static
void
skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl,
skc_uint const subbuf)
{
//
// FIXME -- move to a power-of-two subbuf size and kickstart path
// copies as early as possible
//
// FIXME -- the subbufs "self-clock" (flow control) the kernel
// launches and accounting. Combine all the subbuffers and release
// records into a single indexable struct instead of 3.
//
struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf;
struct skc_release_record * const release = impl->release.records + subbuf;
struct skc_scheduler * const scheduler = impl->runtime->scheduler;
// can't proceed until the paths have been released
SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to);
// throw in a scheduler yield ... FIXME -- get rid of
skc_scheduler_yield(scheduler);
// can't proceed until the subbuffer is mapped
cl(WaitForEvents(1,&sc->map));
}
//
//
//
static
union skc_ringdex_expand
skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl)
{
// break ringdex into components
union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
// does wip ringdex point to a new subbuffer?
if (to.block == 0)
{
// potentially spin/block waiting for subbuffer
skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf);
}
// post increment wip.to
skc_ringdex_wip_to_block_inc(impl);
return to;
}
//
//
//
static
skc_uint
skc_rolling_block(skc_uint const rolling, skc_uint const tag)
{
return rolling | tag;
}
static
skc_uint
skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag)
{
return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag;
}
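//
// Note: the rolling counter only ever advances in strides of
// wip.rolling.one (SKC_BLOCK_ID_TAG_COUNT * subblocks per block --
// see skc_rolling_inc() and create()).  Assuming power-of-two sizing,
// that leaves the low subblock and tag bit fields clear so the plain
// ORs above never collide with the counter.
//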
static
void
skc_rolling_inc(struct skc_path_builder_impl * const impl)
{
impl->wip.rolling.next += impl->wip.rolling.one;
}
//
//
//
static
void *
skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl,
skc_uint const rolling,
skc_cmd_paths_copy_tag const tag)
{
// bump blocks count
impl->wip.head->header.blocks += 1;
// acquire a block
union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl);
// make a pointer
union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host;
// store command for block
cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag);
#if 0
// store command for block
cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag);
// increment rolling
skc_rolling_inc(impl);
#endif
// return pointer to block
float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host;
// FIXME -- make it easier to get config constant
return blocks_subbuf + (to.block * impl->runtime->config->block.words);
}
//
//
//
static
void
skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl)
{
// store command to subbuf and get pointer to blocks subbuf
void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling,
SKC_CMD_PATHS_COPY_TAG_NODE);
// copy head to blocks subbuf -- write-only
memcpy(block,impl->wip.node,impl->runtime->config->block.bytes);
}
static
void
skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl)
{
// store command to subbuf and get pointer to blocks subbuf
void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
SKC_CMD_PATHS_COPY_TAG_HEAD);
// copy head to blocks subbuf -- write-only
memcpy(block,impl->wip.head,impl->runtime->config->block.bytes);
// increment rolling
skc_rolling_inc(impl);
// the 'to' index is non-inclusive so assign wip.to after flush_head
impl->curr.to = impl->wip.to;
}
//
//
//
static
void
skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl)
{
// update final block id in node
impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT);
// if wip.ids is not the header then flush now full wip node
if (impl->wip.head->header.nodes > 0)
skc_path_builder_impl_flush_node(impl);
// bump node count
impl->wip.head->header.nodes += 1;
// save current rolling
impl->wip.ids.rolling = impl->wip.rolling.next;
// increment rolling
skc_rolling_inc(impl);
// update wip.ids.*
impl->wip.ids.next = impl->wip.node->tag_ids;
impl->wip.ids.rem = impl->runtime->config->block.words;
}
static
void
skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl)
{
impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure
impl->wip.subblocks.rolling = impl->wip.rolling.next;
impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
SKC_CMD_PATHS_COPY_TAG_SEGS);
impl->wip.subblocks.idx = 0;
// increment rolling
skc_rolling_inc(impl);
}
//
//
//
static
void
skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl,
skc_block_id_tag tag,
skc_uint vertices,
float * * subblocks)
{
//
// FIRST TAG RECORDS THE ELEMENT TYPE
//
while (true)
{
      // if only one block id slot is left in the node then acquire a
      // new node block and append its block id with a 'next' tag
if (impl->wip.ids.rem == 1)
skc_path_builder_impl_new_node_block(impl);
// if zero subblocks left then acquire a new subblock block and
// append its block id
if (impl->wip.subblocks.rem == 0)
skc_path_builder_impl_new_segs_block(impl);
// save first command -- tag and subblocks may have been updated
impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag);
// increment node block subblock pointer
impl->wip.ids.next += 1;
impl->wip.ids.rem -= 1;
// how many vertices can we store
skc_uint rem = min(vertices,impl->wip.subblocks.rem);
// decrement vertices
vertices -= rem;
impl->wip.subblocks.rem -= rem;
impl->wip.subblocks.idx += rem;
// assign subblocks
do {
*subblocks++ = impl->wip.subblocks.next;
impl->wip.subblocks.next += impl->runtime->config->subblock.words;
// FIXME -- move constants closer to structure
} while (--rem > 0);
// anything left to do?
if (vertices == 0)
break;
// any tag after this will be a caboose command
tag = SKC_BLOCK_ID_TAG_PATH_NEXT;
}
}
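//
// For example: a cubic needs 8 coordinate subblocks.  If only 5
// remain in the current segs block, the first pass hands out those 5
// under the element tag, a fresh segs block is acquired, and a second
// pass hands out the remaining 3 under the SKC_BLOCK_ID_TAG_PATH_NEXT
// caboose tag.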
//
//
//
static
void
skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path)
{
// finalize incomplete active subblocks -- we don't care about any
// remaining unused subblocks in block
skc_path_builder_finalize_subblocks(impl->path_builder);
  // mark remaining wip.ids in the head or node as invalid
skc_path_builder_impl_finalize_node(impl);
// flush node if rem > 0 and node is not actually head
if (impl->wip.head->header.nodes >= 1)
skc_path_builder_impl_flush_node(impl);
// acquire path host id
*path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN
// save path host handle
impl->wip.head->header.handle = *path;
// flush head -- acquires a block and bumps head->header.blocks
skc_path_builder_impl_flush_head(impl);
// get current release
struct skc_release_record * const release = skc_release_curr(impl);
// acquire grid if null
if (release->grid == NULL)
{
release->grid =
SKC_GRID_DEPS_ATTACH(impl->runtime->deps,
&release->grid, // NULL on start/force
release, // data payload
skc_path_builder_grid_pfn_waiting,
NULL, // no execute pfn
skc_path_builder_grid_pfn_dispose);
}
// update grid map
skc_grid_map(release->grid,*path);
// update path release
impl->release.paths[release->to] = *path;
// increment release.to
release->to = (release->to + 1) % impl->ring.blocks_per.buffer;
// add guard bit
*path |= SKC_TYPED_HANDLE_TYPE_IS_PATH;
#if 1
//
// eager kernel launch?
//
{
union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from);
union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to);
if (curr_from.subbuf != curr_to.subbuf)
{
skc_grid_start(release->grid);
// skc_scheduler_yield(impl->runtime->scheduler);
}
}
#endif
}
//
// FIXME -- clean up accessing of CONFIG constants in these 3 routines
//
static
void
skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl)
{
// acquire subblock pointers
skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4,
impl->path_builder->line.coords);
  // increment prim count
  impl->wip.head->header.prims += 1;
  // reset the remaining line count for the newly acquired subblocks
impl->path_builder->line.rem = impl->runtime->config->subblock.words;
}
static
void
skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl)
{
// acquire subblock pointers
skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6,
impl->path_builder->quad.coords);
  // increment prim count
  impl->wip.head->header.prims += 1;
  // reset the remaining quad count for the newly acquired subblocks
impl->path_builder->quad.rem = impl->runtime->config->subblock.words;
}
static
void
skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl)
{
// acquire subblock pointers
skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8,
impl->path_builder->cubic.coords);
  // increment prim count
  impl->wip.head->header.prims += 1;
  // reset the remaining cubic count for the newly acquired subblocks
impl->path_builder->cubic.rem = impl->runtime->config->subblock.words;
}
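//
// Note: <elem>.rem counts the float slots remaining in each of the
// just-acquired coordinate subblocks, i.e. how many more segments of
// that element type the front end can append before new subblocks
// must be acquired.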
//
//
//
static
void
skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl)
{
// decrement reference count
if (--impl->path_builder->refcount != 0)
return;
//
// otherwise, dispose of everything
//
struct skc_runtime * const runtime = impl->runtime;
// free path builder
skc_runtime_host_perm_free(impl->runtime,impl->path_builder);
// release cq
skc_runtime_release_cq_in_order(runtime,impl->cq);
// release kernels
cl(ReleaseKernel(impl->kernels.alloc));
cl(ReleaseKernel(impl->kernels.copy));
// free blocks extents
cl(ReleaseMemObject(impl->blocks.buffer));
skc_runtime_host_perm_free(runtime,impl->blocks.subbufs);
cl(ReleaseMemObject(impl->cmds.buffer));
skc_runtime_host_perm_free(runtime,impl->cmds.subbufs);
// free records
skc_runtime_host_perm_free(runtime,impl->release.records);
skc_runtime_host_perm_free(runtime,impl->release.paths);
// release staging head and node
skc_runtime_host_perm_free(runtime,impl->wip.head);
skc_runtime_host_perm_free(runtime,impl->wip.node);
// release reads scratch array
cl(ReleaseMemObject(impl->reads));
  // FIXME -- for each subbuffer: unmap and then release it -- the
  // subbuffer cl_mem objects are currently not released here
  // printf("%s not releasing subbuffers\n",__func__);
skc_runtime_host_perm_free(impl->runtime,impl);
}
//
//
//
skc_err
skc_path_builder_cl_12_create(struct skc_context * const context,
struct skc_path_builder * * const path_builder)
{
//
// retain the context
// skc_context_retain(context);
//
struct skc_runtime * const runtime = context->runtime;
// allocate path builder
(*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder));
// init state
SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY);
(*path_builder)->context = context;
  // install the impl-specific function pointers
(*path_builder)->begin = skc_path_builder_pfn_begin;
(*path_builder)->end = skc_path_builder_pfn_end;
(*path_builder)->new_line = skc_path_builder_pfn_new_line;
(*path_builder)->new_quad = skc_path_builder_pfn_new_quad;
(*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic;
(*path_builder)->release = skc_path_builder_pfn_release;
// initialize path builder counts
(*path_builder)->line.rem = 0;
(*path_builder)->quad.rem = 0;
(*path_builder)->cubic.rem = 0;
(*path_builder)->refcount = 1;
struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));
(*path_builder)->impl = impl;
//
// init impl
//
impl->path_builder = *path_builder;
impl->runtime = runtime;
impl->cq = skc_runtime_acquire_cq_in_order(runtime);
impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);
//
// FIXME -- let these config constants remain constant and in place
//
struct skc_config const * const config = runtime->config;
impl->ring.subbufs = config->paths_copy.buffer.count;
impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
//
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
//
cl_int cl_err;
// allocate large device-side extent for path data
impl->blocks.buffer = clCreateBuffer(runtime->cl.context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
NULL,&cl_err); cl_ok(cl_err);
// allocate small host-side array of pointers to mapped subbufs
impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
impl->ring.subbufs *
sizeof(*impl->blocks.subbufs));
// allocate large device-side extent for path copy commands
impl->cmds.buffer = clCreateBuffer(runtime->cl.context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
config->paths_copy.command.buffer,
NULL,&cl_err); cl_ok(cl_err);
// allocate small host-side array of pointers to mapped subbufs
impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
impl->ring.subbufs *
sizeof(*impl->cmds.subbufs));
// allocate small host-side array of intervals of path handles
impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
impl->ring.subbufs *
sizeof(*impl->release.records));
// allocate large host-side array that is max # of path handles in flight
impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
impl->ring.blocks_per.buffer *
sizeof(*impl->release.paths));
// small scratch used by kernels
impl->reads = clCreateBuffer(runtime->cl.context,
CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
sizeof(skc_uint) * impl->ring.subbufs,
NULL,&cl_err); cl_ok(cl_err);
// initialize release record with impl backpointer
for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
{
struct skc_release_record * record = impl->release.records + ii;
record->impl = impl;
record->grid = NULL;
record->from = record->to = ii * impl->ring.blocks_per.subbuf;
}
//
// allocate and map subbuffers -- we always check the command
// subbuffer's map/unmap events before touching it or its associated
// block subbuffer.
//
struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
struct skc_subbuffer_cmds * sc = impl->cmds .subbufs;
cl_buffer_region rb = { 0, config->paths_copy.block.subbuf };
cl_buffer_region rc = { 0, config->paths_copy.command.subbuf };
// for each subbuffer
for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
{
sb->device = clCreateSubBuffer(impl->blocks.buffer,
CL_MEM_HOST_WRITE_ONLY,
CL_BUFFER_CREATE_TYPE_REGION,
&rb,
&cl_err); cl_ok(cl_err);
sb->host = clEnqueueMapBuffer(impl->cq,
sb->device,
CL_FALSE,
CL_MAP_WRITE_INVALIDATE_REGION,
0,rb.size,
0,NULL,NULL,
&cl_err); cl_ok(cl_err);
sc->device = clCreateSubBuffer(impl->cmds.buffer,
CL_MEM_HOST_WRITE_ONLY,
CL_BUFFER_CREATE_TYPE_REGION,
&rc,
&cl_err); cl_ok(cl_err);
sc->host = clEnqueueMapBuffer(impl->cq,
sc->device,
CL_FALSE,
CL_MAP_WRITE_INVALIDATE_REGION,
0,rc.size,
0,NULL,&sc->map,
&cl_err); cl_ok(cl_err);
sb += 1;
sc += 1;
rb.origin += rb.size;
rc.origin += rc.size;
}
//
// initialize remaining members
//
impl->prev.from = 0;
impl->prev.to = 0;
impl->prev.rolling = 0;
impl->curr.from = 0;
impl->curr.to = 0;
impl->wip.to = 0;
impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
impl->wip.rolling.next = 0;
// for now, completely initialize builder before returning
cl(Finish(impl->cq));
return SKC_ERR_SUCCESS;
}
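//
// Typical flow, as driven by the public path builder front end: the
// begin/new_line/new_quad/new_cubic/end hooks installed above are
// invoked per path, end() returns a path handle via its out
// parameter, and release() tears down the impl once the last
// reference is dropped.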
//
//
//