blob: c45b652d0edf87d96ad25f8c571811229bd8f587 [file] [log] [blame]
/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
//
// squelch OpenCL 1.2 deprecation warning
//
#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#include "common/macros.h"
#include "common/cl/assert_cl.h"
#include "common/cl/find_cl.h"
//
//
//
#include "hs_cl.h"
//
// FIXME -- LIMITED TO INTEL / GEN8+ FOR NOW
//
#include "intel/gen8/u32/hs_target.h"
#include "intel/gen8/u64/hs_target.h"
// #include "intel/gen9lp/u32/hs_target.h"
// #include "intel/gen9lp/u64/hs_target.h"
//
// The quality of the RNG doesn't matter. The same number of
// instructions will be run no matter what the key distribution looks
// like. So here is something small and fast.
//
// Returns the next value of a 32-bit "Numerical Recipes" LCG seeded
// with 0xDEADBEEF. Unsigned wraparound mod 2^32 is well-defined.
//
// Note: not thread-safe (static state) -- fine for this single-threaded
// benchmark.
//
static
uint32_t
hs_rand_u32(void) // '(void)' -- empty parens would declare an unspecified-argument function
{
  static uint32_t seed = 0xDEADBEEF;

  // Numerical Recipes
  seed = seed * 1664525 + 1013904223;

  return seed;
}
//
// Fill a buffer of 'count' keys, each 'words' 32-bit words wide, with
// test data. The active '#if' branch selects the key distribution;
// the inactive branches are kept for debugging.
//
static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
#if 1   // random keys (default)
  uint32_t const total = count * words;

  for (uint32_t idx=0; idx<total; idx++)
    vin_h[idx] = hs_rand_u32();
#elif 0 // in-order keys
  memset(vin_h,0,count*words*sizeof(uint32_t));

  for (uint32_t idx=0; idx<count; idx++)
    vin_h[idx*words] = idx;
#else   // reverse-order keys
  memset(vin_h,0,count*words*sizeof(uint32_t));

  for (uint32_t idx=0; idx<count; idx++)
    vin_h[idx*words] = count - 1 - idx;
#endif
}
//
//
//
char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
//
// Dispatch to the 32-bit or 64-bit CPU reference sort based on the key
// width in words. Returns the name of the CPU algorithm used and the
// elapsed time via 'cpu_ns'.
//
static
char const *
hs_cpu_sort(void * sorted_h,
            uint32_t const hs_words,
            uint32_t const count,
            double * const cpu_ns)
{
  return (hs_words == 1)
    ? hs_cpu_sort_u32(sorted_h,count,cpu_ns)
    : hs_cpu_sort_u64(sorted_h,count,cpu_ns);
}
//
// Transpose each slab of 32-bit keys in place: key (row,col) of the
// row-major slab moves to (col,row). Only whole slabs are processed
// (count is expected to be slab-padded by the caller).
//
static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint32_t * vout_h,
                       uint32_t const count)
{
  uint32_t const   slab_keys = hs_width * hs_height;
  size_t const     slab_size = sizeof(uint32_t) * hs_words * slab_keys;
  uint32_t * const scratch   = ALLOCA_MACRO(slab_size);

  for (uint32_t remaining = count / slab_keys; remaining > 0; remaining--)
    {
      // copy the slab out, then write it back transposed
      memcpy(scratch,vout_h,slab_size);

      for (uint32_t rr=0; rr<hs_height; rr++)
        for (uint32_t cc=0; cc<hs_width; cc++)
          vout_h[cc * hs_height + rr] = scratch[rr * hs_width + cc];

      vout_h += slab_keys;
    }
}
//
// Transpose each slab of 64-bit keys in place: key (row,col) of the
// row-major slab moves to (col,row). Only whole slabs are processed.
//
static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint64_t * vout_h,
                       uint32_t const count)
{
  uint32_t const   slab_keys = hs_width * hs_height;
  // hs_words is the key width in 32-bit words, so for u64 keys this
  // equals sizeof(uint64_t) * slab_keys
  size_t const     slab_size = sizeof(uint32_t) * hs_words * slab_keys;
  uint64_t * const scratch   = ALLOCA_MACRO(slab_size);

  for (uint32_t remaining = count / slab_keys; remaining > 0; remaining--)
    {
      // copy the slab out, then write it back transposed
      memcpy(scratch,vout_h,slab_size);

      for (uint32_t rr=0; rr<hs_height; rr++)
        for (uint32_t cc=0; cc<hs_width; cc++)
          vout_h[cc * hs_height + rr] = scratch[rr * hs_width + cc];

      vout_h += slab_keys;
    }
}
//
// Transpose every slab in the output buffer, dispatching on the key
// width in 32-bit words.
//
static
void
hs_transpose_slabs(uint32_t const hs_words,
                   uint32_t const hs_width,
                   uint32_t const hs_height,
                   void * vout_h,
                   uint32_t const count)
{
  if (hs_words != 1)
    hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
  else
    hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
}
//
// Dump 32-bit keys to stderr, one slab at a time, 'hs_width' keys per
// line -- used to inspect a failed verification.
//
static
void
hs_debug_u32(uint32_t const hs_width,
             uint32_t const hs_height,
             uint32_t const * vout_h,
             uint32_t const count)
{
  uint32_t const slab_keys = hs_width * hs_height;
  uint32_t const slabs     = (count + slab_keys - 1) / slab_keys; // round up to whole slabs

  for (uint32_t slab=0; slab<slabs; slab++)
    {
      fprintf(stderr,"%u\n",slab);

      for (uint32_t row=0; row<hs_height; row++)
        {
          for (uint32_t col=0; col<hs_width; col++)
            fprintf(stderr,"%8" PRIX32 " ",*vout_h++);

          fprintf(stderr,"\n");
        }
    }
}
//
// Dump 64-bit keys to stderr, one slab at a time, 'hs_width' keys per
// line -- used to inspect a failed verification.
//
static
void
hs_debug_u64(uint32_t const hs_width,
             uint32_t const hs_height,
             uint64_t const * vout_h,
             uint32_t const count)
{
  uint32_t const slab_keys = hs_width * hs_height;
  uint32_t const slabs     = (count + slab_keys - 1) / slab_keys; // round up to whole slabs

  for (uint32_t slab=0; slab<slabs; slab++)
    {
      fprintf(stderr,"%u\n",slab);

      for (uint32_t row=0; row<hs_height; row++)
        {
          for (uint32_t col=0; col<hs_width; col++)
            fprintf(stderr,"%16" PRIX64 " ",*vout_h++);

          fprintf(stderr,"\n");
        }
    }
}
//
// Used for benchmarking on out-of-order queues. Attaching an event
// to a kernel on an OOQ with profiling enabled will result in a
// synchronization point and block concurrent execution of kernels.
//
// The workaround that enables measuring the entire runtime of the
// sort is to launch a dummy kernel with an event, a barrier without
// an event, then the call to hs_sort(), followed by a final dummy
// kernel with an event.
//
// The end time of the first dummy and start time of the second dummy
// will provide a conservative estimate of the total execution time of
// the hs_sort() routine.
//
// Note that once kernels are enqueued they are scheduled with only
// microseconds between them so this should only be a small number of
// microseconds longer than the true hs_sort() execution time.
//
// a single no-op kernel -- see the profiling workaround comment above
#define HS_DUMMY_KERNEL_PROGRAM "kernel void hs_dummy_kernel() { ; }"

static cl_kernel hs_dummy_kernel;

//
// Compile the dummy kernel program and retain the kernel object in the
// file-scope 'hs_dummy_kernel'.
//
static
void
hs_dummy_kernel_create(cl_context context, cl_device_id device_id)
{
  char const * source[]        = { HS_DUMMY_KERNEL_PROGRAM };
  size_t const source_sizeof[] = { sizeof(HS_DUMMY_KERNEL_PROGRAM) };

  cl_int err;

  cl_program program = clCreateProgramWithSource(context,
                                                 1,
                                                 source,
                                                 source_sizeof,
                                                 &err);
  cl_ok(err);

  cl(BuildProgram(program,1,&device_id,NULL,NULL,NULL));

  hs_dummy_kernel = clCreateKernel(program,"hs_dummy_kernel",&err);
  cl_ok(err);

  // the kernel retains what it needs from the program object
  cl(ReleaseProgram(program));
}
//
// Release the kernel object created by hs_dummy_kernel_create().
//
static
void
hs_dummy_kernel_release(void) // '(void)' -- empty parens would declare an unspecified-argument function
{
  cl(ReleaseKernel(hs_dummy_kernel));
}
//
// Enqueue a single work-item of the dummy kernel on 'cq' so that
// 'event' brackets the surrounding work (see the profiling workaround
// comment above).
//
static
void
hs_dummy_kernel_enqueue(cl_command_queue cq,
                        uint32_t wait_list_size,
                        cl_event const * wait_list,
                        cl_event * event)
{
  size_t const one_work_item = 1;

  cl(EnqueueNDRangeKernel(cq,
                          hs_dummy_kernel,
                          1,              // work_dim
                          NULL,           // no global offset
                          &one_work_item, // global work size
                          NULL,           // runtime picks the local size
                          wait_list_size,
                          wait_list,
                          event));
}
//
//
//
//
// Benchmark hs_cl_sort() over a range of key counts.
//
// For every count in [count_lo, count_hi] stepped by count_step:
//   1. pad the count to the sorter's slab geometry via hs_cl_pad(),
//   2. run the sort warmup+loops times, timing each run with the
//      dummy-kernel/event workaround described above,
//   3. map the input/output buffers back to the host, sort the same
//      keys on the CPU, and memcmp-verify the GPU result,
//   4. print one CSV row of CPU and GPU timing statistics.
//
// 'hs_words' is the key width in 32-bit words (1 = uint, 2 = ulong).
// If 'linearize' is false the GPU output remains in slab order, so the
// CPU-sorted reference output is transposed into slab order before
// comparison.
//
static
void
hs_bench(cl_context context,
         cl_command_queue cq,
         cl_command_queue cq_profile,
         char const * const device_name,
         char const * const driver_version,
         uint32_t const hs_words,
         uint32_t const hs_width,
         uint32_t const hs_height,
         struct hs_cl const * const hs,
         uint32_t const count_lo,
         uint32_t const count_hi,
         uint32_t const count_step,
         uint32_t const loops,
         uint32_t const warmup,
         bool const linearize)
{
  //
  // return if nothing to do
  //
  if (count_hi <= 1)
    return;

  //
  // size the arrays -- the sorter may require input/output buffers
  // padded beyond the requested key count
  //
  uint32_t count_hi_padded_in, count_hi_padded_out;

  hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out);

  //
  // SIZE
  //
  size_t const key_size = sizeof(uint32_t) * hs_words;
  size_t const size_hi_in = count_hi_padded_in * key_size;
  size_t const size_hi_out = count_hi_padded_out * key_size;

  //
  // ALLOCATE
  //
  cl_int cl_err;

  // host-side copy of the input keys for the CPU reference sort
  void * sorted_h = malloc(size_hi_in);

  // 'random' holds the untouched random keys; 'vin' is re-initialized
  // from it before each count so every trial sorts identical data
  cl_mem random = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                 size_hi_in,
                                 NULL,&cl_err); cl_ok(cl_err);

  cl_mem vin = clCreateBuffer(context,
                              CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                              size_hi_in,
                              NULL,&cl_err); cl_ok(cl_err);

  cl_mem vout = clCreateBuffer(context,
                               CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                               size_hi_out,
                               NULL,&cl_err); cl_ok(cl_err);

  //
  // BLOCKING MAP AND INIT KEYS
  //
  {
    void * random_h = clEnqueueMapBuffer(cq,
                                         random,
                                         CL_TRUE, // blocking map
                                         CL_MAP_WRITE_INVALIDATE_REGION,
                                         0,size_hi_in,
                                         0,NULL,NULL,
                                         &cl_err); cl_ok(cl_err);

    // fill with random numbers
    hs_fill_rand(random_h,count_hi,hs_words);

    //
    // UNMAP
    //
    cl(EnqueueUnmapMemObject(cq,random,random_h,0,NULL,NULL));
  }

  //
  // BENCHMARK
  //
  for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
    {
      // compute padding before sorting
      uint32_t count_padded_in, count_padded_out;

      hs_cl_pad(hs,count,&count_padded_in,&count_padded_out);

      cl_ulong elapsed_ns_min = UINT64_MAX;
      cl_ulong elapsed_ns_max = 0;
      cl_ulong elapsed_ns_sum = 0;

      // load the first 'count' random keys into vin
      cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
      cl(Finish(cq));

      for (uint32_t ii=0; ii<warmup+loops; ii++)
        {
          // discard the statistics gathered during warmup
          if (ii == warmup)
            {
              elapsed_ns_min = UINT64_MAX;
              elapsed_ns_max = 0;
              elapsed_ns_sum = 0;
            }

#if 0
          //
          // optionally, initialize vin on every loop -- no need
          //
          cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
          cl(Finish(cq));
#endif

          //
          // sort vin -- bracket the sort with dummy kernels on the
          // profiling queue so the whole multi-kernel sort can be
          // timed on an out-of-order queue (see comment block above)
          //
          cl_event start, complete, end;

          hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start);

          // note hs_sort enqueues a final barrier
          hs_cl_sort(hs,
                     cq,
                     1,&start,&complete,
                     vin,vout,
                     count,
                     count_padded_in,
                     count_padded_out,
                     linearize);

          hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end);

          cl(Finish(cq_profile));

          //
          // measure duration -- end of the first dummy to start of the
          // second dummy conservatively brackets the sort
          //
          cl_ulong t_start=0, t_end=0;

          // start
          cl(GetEventProfilingInfo(start,
                                   CL_PROFILING_COMMAND_END,
                                   sizeof(cl_ulong),
                                   &t_start,
                                   NULL));
          // end
          cl(GetEventProfilingInfo(end,
                                   CL_PROFILING_COMMAND_START,
                                   sizeof(cl_ulong),
                                   &t_end,
                                   NULL));

          cl_ulong const t = t_end - t_start;

          elapsed_ns_min = MIN_MACRO(elapsed_ns_min,t);
          elapsed_ns_max = MAX_MACRO(elapsed_ns_max,t);
          elapsed_ns_sum += t;

          cl(ReleaseEvent(start));
          cl(ReleaseEvent(complete));
          cl(ReleaseEvent(end));
        }

      //
      // COPY KEYS BACK FOR VERIFICATION
      //
      size_t const size_padded_in = count_padded_in * key_size;

      void * vin_h = clEnqueueMapBuffer(cq,
                                        vin,
                                        CL_FALSE, // non-blocking -- Finish() below
                                        CL_MAP_READ,
                                        0,size_padded_in,
                                        0,NULL,NULL,
                                        &cl_err); cl_ok(cl_err);

      void * vout_h = clEnqueueMapBuffer(cq,
                                         vout,
                                         CL_FALSE, // non-blocking -- Finish() below
                                         CL_MAP_READ,
                                         0,size_padded_in,
                                         0,NULL,NULL,
                                         &cl_err); cl_ok(cl_err);

      cl(Finish(cq));

      //
      // SORT THE UNTOUCHED RANDOM INPUT
      //
      memcpy(sorted_h,vin_h,size_padded_in);

      double cpu_ns;

      char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);

      //
      // EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING
      //
      if (!linearize)
        hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);

      //
      // VERIFY
      //
      bool const verified = memcmp(sorted_h,vout_h,size_padded_in) == 0;

#ifndef NDEBUG
      // dump the mismatching slabs in debug builds
      if (!verified)
        {
          if (hs_words == 1)
            hs_debug_u32(hs_width,hs_height,vout_h,count);
          else // ulong
            hs_debug_u64(hs_width,hs_height,vout_h,count);
        }
#endif

      cl(EnqueueUnmapMemObject(cq,vin, vin_h, 0,NULL,NULL));
      cl(EnqueueUnmapMemObject(cq,vout,vout_h,0,NULL,NULL));

      cl(Finish(cq));

      //
      // REPORT -- one CSV row per count, matching the header in main()
      //
      fprintf(stdout,"%s, %s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
              device_name,
              driver_version,
              (hs_words == 1) ? "uint" : "ulong",
              linearize ? "linear" : "slab",
              verified ? " OK " : "*FAIL*",
              count,
              count_padded_in,
              count_padded_out,
              // CPU
              algo,
              cpu_ns / 1000000.0,      // milliseconds
              1000.0 * count / cpu_ns, // mkeys / sec
              // GPU
              loops,
              elapsed_ns_sum / 1000000.0 / loops, // avg msecs
              elapsed_ns_min / 1000000.0,         // min msecs
              elapsed_ns_max / 1000000.0,         // max msecs
              1000.0 * count * loops / elapsed_ns_sum, // mkeys / sec - avg
              1000.0 * count / elapsed_ns_min);        // mkeys / sec - max

      // quit early if not verified
      if (!verified)
        break;
    }

  //
  // dispose
  //
  cl(ReleaseMemObject(vout));
  cl(ReleaseMemObject(vin));
  cl(ReleaseMemObject(random));

  free(sorted_h);
}
//
//
//
//
// Benchmark driver.
//
// Usage: <exe> [words] [count_lo] [count_hi] [count_step] [loops] [warmup] [linearize]
//   words     -- key width in 32-bit words: 1 = uint, otherwise ulong (default 2)
//   count_lo  -- smallest key count (default: one slab of keys)
//   count_hi  -- largest key count (default: count_lo)
//   count_step-- key count increment (default: count_lo)
//   loops     -- timed trials per count
//   warmup    -- untimed warmup trials per count
//   linearize -- nonzero to linearize the slab-ordered output (default true)
//
int
main(int argc, char const * argv[])
{
  // NOTE(review): device selection is hard-wired to an Intel GPU --
  // see the FIXME near the target includes at the top of the file
  char const * const target_platform_substring = "Intel";
  char const * const target_device_substring = "Graphics";

  //
  // find platform and device ids
  //
  cl_platform_id platform_id;
  cl_device_id device_id;

#define HS_DEVICE_NAME_SIZE 64

  char device_name[HS_DEVICE_NAME_SIZE];
  size_t device_name_size;

  cl(FindIdsByName(target_platform_substring,
                   target_device_substring,
                   &platform_id,
                   &device_id,
                   HS_DEVICE_NAME_SIZE,
                   device_name,
                   &device_name_size,
                   true));

  //
  // create context
  //
  cl_context_properties context_properties[] =
    {
      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id,
      0
    };

  cl_int cl_err;

  cl_context context = clCreateContext(context_properties,
                                       1,
                                       &device_id,
                                       NULL,
                                       NULL,
                                       &cl_err);
  cl_ok(cl_err);

  //
  // create command queues -- 'cq' runs the sort on an out-of-order
  // queue, 'cq_profile' runs the dummy timing kernels with profiling
  // enabled
  //
#if 0 // OPENCL 2.0
  cl_queue_properties props[] = {
    CL_QUEUE_PROPERTIES,
    (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
#ifndef NDEBUG
    (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
#endif
    0
  };

  cl_queue_properties props_profile[] = {
    CL_QUEUE_PROPERTIES,
    (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
    0
  };

  cl_command_queue cq = clCreateCommandQueueWithProperties(context,
                                                           device_id,
                                                           props,
                                                           &cl_err); cl_ok(cl_err);

  cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context,
                                                                   device_id,
                                                                   props_profile,
                                                                   &cl_err); cl_ok(cl_err);
#else // OPENCL 1.2
  cl_command_queue cq = clCreateCommandQueue(context,
                                             device_id,
#ifndef NDEBUG
                                             CL_QUEUE_PROFILING_ENABLE |
#endif
                                             CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
                                             &cl_err); cl_ok(cl_err);

  cl_command_queue cq_profile = clCreateCommandQueue(context,
                                                     device_id,
                                                     CL_QUEUE_PROFILING_ENABLE,
                                                     &cl_err); cl_ok(cl_err);
#endif

  //
  // Intel GEN workaround -- create dummy kernel for semi-accurate
  // profiling on an out-of-order queue.
  //
  hs_dummy_kernel_create(context,device_id);

  //
  // select the target -- key width in words from argv[1]
  //
  uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0);

  struct hs_cl_target const * hs_target;

  if (key_val_words == 1)
    hs_target = &hs_intel_gen8_u32;
  else
    hs_target = &hs_intel_gen8_u64;

  //
  // create kernels
  //
  fprintf(stdout,"Creating... ");
  struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id);
  fprintf(stdout,"done.\n");

  //
  // default trial counts: many trials in release, a quick single run
  // in debug
  //
#ifdef NDEBUG
#define HS_BENCH_LOOPS 100
#define HS_BENCH_WARMUP 100
#else
#define HS_BENCH_LOOPS 1
#define HS_BENCH_WARMUP 0
#endif

  //
  // sort sizes and loops -- 'kpb' is keys per slab:
  // slab height * slab width
  //
  uint32_t const kpb = hs_target->config.slab.height << hs_target->config.slab.width_log2;

  uint32_t const count_lo = (argc <= 2) ? kpb : strtoul(argv[2],NULL,0);
  uint32_t const count_hi = (argc <= 3) ? count_lo : strtoul(argv[3],NULL,0);
  uint32_t const count_step = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0);
  uint32_t const loops = (argc <= 5) ? HS_BENCH_LOOPS : strtoul(argv[5],NULL,0);
  uint32_t const warmup = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0);
  bool const linearize = (argc <= 7) ? true : strtoul(argv[7],NULL,0);

  //
  // labels -- CSV header matching the rows printed by hs_bench()
  //
  fprintf(stdout,
          "Device, "
          "Driver, "
          "Type, "
          "Slab/Linear, "
          "Verified?, "
          "Keys, "
          "Keys Padded In, "
          "Keys Padded Out, "
          "CPU Algorithm, "
          "CPU Msecs, "
          "CPU Mkeys/s, "
          "Trials, "
          "Avg. Msecs, "
          "Min Msecs, "
          "Max Msecs, "
          "Avg. Mkeys/s, "
          "Max. Mkeys/s\n");

  //
  // we want to track driver versions -- query the string size first,
  // then the string itself
  //
  size_t driver_version_size;

  cl(GetDeviceInfo(device_id,
                   CL_DRIVER_VERSION,
                   0,
                   NULL,
                   &driver_version_size));

  char * const driver_version = ALLOCA_MACRO(driver_version_size);

  cl(GetDeviceInfo(device_id,
                   CL_DRIVER_VERSION,
                   driver_version_size,
                   driver_version,
                   NULL));

  //
  // benchmark
  //
  hs_bench(context,
           cq,cq_profile,
           device_name,
           driver_version,
           hs_target->config.words.key + hs_target->config.words.val,
           1 << hs_target->config.slab.width_log2,
           hs_target->config.slab.height,
           hs,
           count_lo,
           count_hi,
           count_step,
           loops,
           warmup,
           linearize);

  //
  // release everything
  //
  hs_cl_release(hs);
  hs_dummy_kernel_release();

  cl(ReleaseCommandQueue(cq));
  cl(ReleaseCommandQueue(cq_profile));

  cl(ReleaseContext(context));

  return 0;
}