| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <inttypes.h> |
| |
| // |
| // squelch OpenCL 1.2 deprecation warning |
| // |
| |
| #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS |
| #define CL_USE_DEPRECATED_OPENCL_1_2_APIS |
| #endif |
| |
| #include "common/macros.h" |
| #include "common/cl/assert_cl.h" |
| #include "common/cl/find_cl.h" |
| // |
| // |
| // |
| |
| #include "hs_cl.h" |
| |
| // |
| // FIXME -- LIMITED TO INTEL / GEN8+ FOR NOW |
| // |
| |
| #include "intel/gen8/u32/hs_target.h" |
| #include "intel/gen8/u64/hs_target.h" |
| |
| // #include "intel/gen9lp/u32/hs_target.h" |
| // #include "intel/gen9lp/u64/hs_target.h" |
| |
| // |
| // The quality of the RNG doesn't matter. The same number of |
| // instructions will be run no matter what the key distribution looks |
| // like. So here is something small and fast. |
| // |
| |
//
// Returns the next value of a small 32-bit linear congruential
// generator (multiplier and increment from Numerical Recipes).
//
// The state lives in a function-local static that starts at
// 0xDEADBEEF, so the sequence is the same on every run and the
// function is not reentrant.  The arithmetic is unsigned and wraps.
//
static
uint32_t
hs_rand_u32(void)
{
  static uint32_t seed = 0xDEADBEEF;

  // Numerical Recipes
  seed = seed * 1664525u + 1013904223u;

  return seed;
}
| |
| // |
| // |
| // |
| |
//
// Fills the first count * words 32-bit words of vin_h.
//
//   vin_h : destination; must hold at least count * words uint32_t
//   count : number of keys
//   words : number of 32-bit words per key
//
// The enabled branch writes pseudo-random words from hs_rand_u32().
// The two disabled branches zero the buffer and store an ascending or
// descending index in the first word of every key; flip the
// preprocessor conditions to select one of them.
//
static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
  // compute the word total once, in size_t, so it cannot wrap at 32 bits
  size_t const total = (size_t)count * words;

#if 1
  for (size_t ii=0; ii<total; ii++)
    vin_h[ii] = hs_rand_u32();
#elif 0 // in-order
  memset(vin_h,0,total*sizeof(uint32_t));
  for (uint32_t ii=0; ii<count; ii++)
    vin_h[(size_t)ii*words] = ii;
#else   // reverse order
  memset(vin_h,0,total*sizeof(uint32_t));
  for (uint32_t ii=0; ii<count; ii++)
    vin_h[(size_t)ii*words] = count - 1 - ii;
#endif
}
| |
| // |
| // |
| // |
| |
| char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns); |
| char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns); |
| |
| // |
| // |
| // |
| |
//
// Dispatches to the CPU reference sort that matches the key width.
//
//   sorted_h : keys to sort, handed to the selected routine unchanged
//   hs_words : 1 selects hs_cpu_sort_u32(); any other value selects
//              hs_cpu_sort_u64()
//   count    : number of keys
//   cpu_ns   : forwarded to the selected routine
//
// Returns the string returned by the selected routine.
//
static
char const *
hs_cpu_sort(void     *       sorted_h,
            uint32_t   const hs_words,
            uint32_t   const count,
            double   * const cpu_ns)
{
  return (hs_words == 1)
    ? hs_cpu_sort_u32(sorted_h,count,cpu_ns)
    : hs_cpu_sort_u64(sorted_h,count,cpu_ns);
}
| |
//
// Transposes, in place, every complete slab of 32-bit keys in vout_h.
//
//   hs_words  : unused; the slab size is derived from the element type
//               so it can never disagree with the buffer
//   hs_width  : number of columns in a slab
//   hs_height : number of rows in a slab
//   vout_h    : keys; only the first (count / slab_keys) * slab_keys
//               keys are touched
//   count     : number of keys in vout_h
//
// Each slab is read as hs_height rows of hs_width keys and written
// back so that element (row,col) lands at index col * hs_height + row.
// A trailing partial slab is left untouched.  A degenerate slab shape
// (width or height of zero) is a no-op.
//
static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint32_t *     vout_h,
                       uint32_t const count)
{
  (void)hs_words;

  uint32_t const slab_keys = hs_width * hs_height;

  // nothing to transpose, and the division below would trap
  if (slab_keys == 0)
    return;

  size_t     const slab_size = sizeof(*vout_h) * slab_keys;
  uint32_t * const slab      = malloc(slab_size);

  if (slab == NULL)
    {
      fprintf(stderr,"hs_transpose_slabs_u32: out of memory\n");
      abort();
    }

  for (uint32_t slab_count = count / slab_keys; slab_count > 0; slab_count--)
    {
      // snapshot the slab so the in-place writes don't clobber unread keys
      memcpy(slab,vout_h,slab_size);

      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }

  free(slab);
}
| |
//
// Transposes, in place, every complete slab of 64-bit keys in vout_h.
//
//   hs_words  : unused; the slab size is derived from the element type
//               so it can never disagree with the buffer
//   hs_width  : number of columns in a slab
//   hs_height : number of rows in a slab
//   vout_h    : keys; only the first (count / slab_keys) * slab_keys
//               keys are touched
//   count     : number of keys in vout_h
//
// Each slab is read as hs_height rows of hs_width keys and written
// back so that element (row,col) lands at index col * hs_height + row.
// A trailing partial slab is left untouched.  A degenerate slab shape
// (width or height of zero) is a no-op.
//
static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint64_t *     vout_h,
                       uint32_t const count)
{
  (void)hs_words;

  uint32_t const slab_keys = hs_width * hs_height;

  // nothing to transpose, and the division below would trap
  if (slab_keys == 0)
    return;

  size_t     const slab_size = sizeof(*vout_h) * slab_keys;
  uint64_t * const slab      = malloc(slab_size);

  if (slab == NULL)
    {
      fprintf(stderr,"hs_transpose_slabs_u64: out of memory\n");
      abort();
    }

  for (uint32_t slab_count = count / slab_keys; slab_count > 0; slab_count--)
    {
      // snapshot the slab so the in-place writes don't clobber unread keys
      memcpy(slab,vout_h,slab_size);

      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }

  free(slab);
}
| |
//
// Transposes the slabs in vout_h with the routine that matches the
// key width: hs_words == 1 uses the 32-bit variant, anything else the
// 64-bit variant.  All arguments are forwarded unchanged.
//
static
void
hs_transpose_slabs(uint32_t const hs_words,
                   uint32_t const hs_width,
                   uint32_t const hs_height,
                   void *         vout_h,
                   uint32_t const count)
{
  if (hs_words != 1)
    {
      hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
      return;
    }

  hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
}
| |
| // |
| // |
| // |
| |
//
// Dumps 32-bit keys to stderr, one slab at a time.
//
// For every slab the slab index is printed on its own line, followed
// by hs_height lines of hs_width keys in 8-wide uppercase hex.
//
// The slab count is count rounded up to whole slabs, so when count is
// not a multiple of hs_width * hs_height the dump reads past key
// `count`; vout_h must cover the rounded-up number of keys.
//
static
void
hs_debug_u32(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint32_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const keys_per_slab = hs_width * hs_height;
  uint32_t const slab_total    = (count + keys_per_slab - 1) / keys_per_slab;

  uint32_t const * key = vout_h;

  for (uint32_t slab=0; slab<slab_total; slab++)
    {
      fprintf(stderr,"%u\n",slab);

      for (uint32_t line=0; line<hs_height; line++)
        {
          for (uint32_t col=0; col<hs_width; col++, key++)
            fprintf(stderr,"%8" PRIX32 " ",*key);

          fprintf(stderr,"\n");
        }
    }
}
| |
//
// Dumps 64-bit keys to stderr, one slab at a time.
//
// For every slab the slab index is printed on its own line, followed
// by hs_height lines of hs_width keys in 16-wide uppercase hex.
//
// The slab count is count rounded up to whole slabs, so when count is
// not a multiple of hs_width * hs_height the dump reads past key
// `count`; vout_h must cover the rounded-up number of keys.
//
static
void
hs_debug_u64(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint64_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const keys_per_slab = hs_width * hs_height;
  uint32_t const slab_total    = (count + keys_per_slab - 1) / keys_per_slab;

  uint64_t const * key = vout_h;

  for (uint32_t slab=0; slab<slab_total; slab++)
    {
      fprintf(stderr,"%u\n",slab);

      for (uint32_t line=0; line<hs_height; line++)
        {
          for (uint32_t col=0; col<hs_width; col++, key++)
            fprintf(stderr,"%16" PRIX64 " ",*key);

          fprintf(stderr,"\n");
        }
    }
}
| |
| // |
| // Used for benchmarking on out-of-order queues. Attaching an event |
| // to a kernel on an OOQ with profiling enabled will result in a |
| // synchronization point and block concurrent execution of kernels. |
| // |
| // The workaround that enables measuring the entire runtime of the |
| // sort is to launch a dummy kernel with an event, a barrier without |
| // an event, then the call to hs_sort(), followed by a final dummy |
| // kernel with an event. |
| // |
| // The end time of the first dummy and start time of the second dummy |
| // will provide a conservative estimate of the total execution time of |
| // the hs_sort() routine. |
| // |
| // Note that once kernels are enqueued they are scheduled with only |
| // microseconds between them so this should only be a small number of |
| // microseconds longer than the true hs_sort() execution time. |
| // |
| |
| #define HS_DUMMY_KERNEL_PROGRAM "kernel void hs_dummy_kernel() { ; }" |
| |
| static cl_kernel hs_dummy_kernel; |
| |
| static |
| void |
| hs_dummy_kernel_create(cl_context context, cl_device_id device_id) |
| { |
| cl_int err; |
| |
| char const * strings[] = { HS_DUMMY_KERNEL_PROGRAM }; |
| size_t const strings_sizeof[] = { sizeof(HS_DUMMY_KERNEL_PROGRAM) }; |
| |
| cl_program program = clCreateProgramWithSource(context, |
| 1, |
| strings, |
| strings_sizeof, |
| &err); cl_ok(err); |
| cl(BuildProgram(program, |
| 1, |
| &device_id, |
| NULL, |
| NULL, |
| NULL)); |
| |
| hs_dummy_kernel = clCreateKernel(program,"hs_dummy_kernel",&err); cl_ok(err); |
| |
| cl(ReleaseProgram(program)); |
| } |
| |
| static |
| void |
| hs_dummy_kernel_release() |
| { |
| cl(ReleaseKernel(hs_dummy_kernel)); |
| } |
| |
| static |
| void |
| hs_dummy_kernel_enqueue(cl_command_queue cq, |
| uint32_t wait_list_size, |
| cl_event const * wait_list, |
| cl_event * event) |
| { |
| size_t const global_work_size = 1; |
| |
| cl(EnqueueNDRangeKernel(cq, |
| hs_dummy_kernel, |
| 1, |
| NULL, |
| &global_work_size, |
| NULL, |
| wait_list_size, |
| wait_list, |
| event)); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_bench(cl_context context, |
| cl_command_queue cq, |
| cl_command_queue cq_profile, |
| char const * const device_name, |
| char const * const driver_version, |
| uint32_t const hs_words, |
| uint32_t const hs_width, |
| uint32_t const hs_height, |
| struct hs_cl const * const hs, |
| uint32_t const count_lo, |
| uint32_t const count_hi, |
| uint32_t const count_step, |
| uint32_t const loops, |
| uint32_t const warmup, |
| bool const linearize) |
| { |
| // |
| // return if nothing to do |
| // |
| if (count_hi <= 1) |
| return; |
| |
| // |
| // size the arrays |
| // |
| uint32_t count_hi_padded_in, count_hi_padded_out; |
| |
| hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out); |
| |
| // |
| // SIZE |
| // |
| size_t const key_size = sizeof(uint32_t) * hs_words; |
| |
| size_t const size_hi_in = count_hi_padded_in * key_size; |
| size_t const size_hi_out = count_hi_padded_out * key_size; |
| |
| // |
| // ALLOCATE |
| // |
| cl_int cl_err; |
| |
| void * sorted_h = malloc(size_hi_in); |
| |
| cl_mem random = clCreateBuffer(context, |
| CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, |
| size_hi_in, |
| NULL,&cl_err); cl_ok(cl_err); |
| |
| cl_mem vin = clCreateBuffer(context, |
| CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, |
| size_hi_in, |
| NULL,&cl_err); cl_ok(cl_err); |
| |
| cl_mem vout = clCreateBuffer(context, |
| CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, |
| size_hi_out, |
| NULL,&cl_err); cl_ok(cl_err); |
| // |
| // BLOCKING MAP AND INIT KEYS |
| // |
| { |
| void * random_h = clEnqueueMapBuffer(cq, |
| random, |
| CL_TRUE, |
| CL_MAP_WRITE_INVALIDATE_REGION, |
| 0,size_hi_in, |
| 0,NULL,NULL, |
| &cl_err); cl_ok(cl_err); |
| |
| // fill with random numbers |
| hs_fill_rand(random_h,count_hi,hs_words); |
| |
| // |
| // UNMAP |
| // |
| cl(EnqueueUnmapMemObject(cq,random,random_h,0,NULL,NULL)); |
| } |
| |
| // |
| // BENCHMARK |
| // |
| for (uint32_t count=count_lo; count<=count_hi; count+=count_step) |
| { |
| // compute padding before sorting |
| uint32_t count_padded_in, count_padded_out; |
| |
| hs_cl_pad(hs,count,&count_padded_in,&count_padded_out); |
| |
| cl_ulong elapsed_ns_min = UINT64_MAX; |
| cl_ulong elapsed_ns_max = 0; |
| cl_ulong elapsed_ns_sum = 0; |
| |
| cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); |
| cl(Finish(cq)); |
| |
| for (uint32_t ii=0; ii<warmup+loops; ii++) |
| { |
| if (ii == warmup) |
| { |
| elapsed_ns_min = UINT64_MAX; |
| elapsed_ns_max = 0; |
| elapsed_ns_sum = 0; |
| } |
| |
| #if 0 |
| // |
| // optionally, initialize vin on every loop -- no need |
| // |
| cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); |
| cl(Finish(cq)); |
| #endif |
| |
| // |
| // sort vin |
| // |
| cl_event start, complete, end; |
| |
| hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start); |
| |
| // note hs_sort enqueues a final barrier |
| hs_cl_sort(hs, |
| cq, |
| 1,&start,&complete, |
| vin,vout, |
| count, |
| count_padded_in, |
| count_padded_out, |
| linearize); |
| |
| hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end); |
| |
| cl(Finish(cq_profile)); |
| |
| // |
| // measure duration |
| // |
| cl_ulong t_start=0, t_end=0; |
| |
| // start |
| cl(GetEventProfilingInfo(start, |
| CL_PROFILING_COMMAND_END, |
| sizeof(cl_ulong), |
| &t_start, |
| NULL)); |
| |
| // end |
| cl(GetEventProfilingInfo(end, |
| CL_PROFILING_COMMAND_START, |
| sizeof(cl_ulong), |
| &t_end, |
| NULL)); |
| |
| cl_ulong const t = t_end - t_start; |
| |
| elapsed_ns_min = MIN_MACRO(elapsed_ns_min,t); |
| elapsed_ns_max = MAX_MACRO(elapsed_ns_max,t); |
| elapsed_ns_sum += t; |
| |
| cl(ReleaseEvent(start)); |
| cl(ReleaseEvent(complete)); |
| cl(ReleaseEvent(end)); |
| } |
| |
| // |
| // COPY KEYS BACK FOR VERIFICATION |
| // |
| size_t const size_padded_in = count_padded_in * key_size; |
| |
| void * vin_h = clEnqueueMapBuffer(cq, |
| vin, |
| CL_FALSE, |
| CL_MAP_READ, |
| 0,size_padded_in, |
| 0,NULL,NULL, |
| &cl_err); cl_ok(cl_err); |
| |
| void * vout_h = clEnqueueMapBuffer(cq, |
| vout, |
| CL_FALSE, |
| CL_MAP_READ, |
| 0,size_padded_in, |
| 0,NULL,NULL, |
| &cl_err); cl_ok(cl_err); |
| cl(Finish(cq)); |
| |
| // |
| // SORT THE UNTOUCHED RANDOM INPUT |
| // |
| memcpy(sorted_h,vin_h,size_padded_in); |
| |
| double cpu_ns; |
| |
| char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns); |
| |
| // |
| // EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING |
| // |
| if (!linearize) |
| hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in); |
| |
| // |
| // VERIFY |
| // |
| bool const verified = memcmp(sorted_h,vout_h,size_padded_in) == 0; |
| |
| #ifndef NDEBUG |
| if (!verified) |
| { |
| if (hs_words == 1) |
| hs_debug_u32(hs_width,hs_height,vout_h,count); |
| else // ulong |
| hs_debug_u64(hs_width,hs_height,vout_h,count); |
| } |
| #endif |
| |
| cl(EnqueueUnmapMemObject(cq,vin, vin_h, 0,NULL,NULL)); |
| cl(EnqueueUnmapMemObject(cq,vout,vout_h,0,NULL,NULL)); |
| |
| cl(Finish(cq)); |
| |
| // |
| // REPORT |
| // |
| fprintf(stdout,"%s, %s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n", |
| device_name, |
| driver_version, |
| (hs_words == 1) ? "uint" : "ulong", |
| linearize ? "linear" : "slab", |
| verified ? " OK " : "*FAIL*", |
| count, |
| count_padded_in, |
| count_padded_out, |
| // CPU |
| algo, |
| cpu_ns / 1000000.0, // milliseconds |
| 1000.0 * count / cpu_ns, // mkeys / sec |
| // GPU |
| loops, |
| elapsed_ns_sum / 1000000.0 / loops, // avg msecs |
| elapsed_ns_min / 1000000.0, // min msecs |
| elapsed_ns_max / 1000000.0, // max msecs |
| 1000.0 * count * loops / elapsed_ns_sum, // mkeys / sec - avg |
| 1000.0 * count / elapsed_ns_min); // mkeys / sec - max |
| |
| // quit early if not verified |
| if (!verified) |
| break; |
| } |
| |
| // |
| // dispose |
| // |
| cl(ReleaseMemObject(vout)); |
| cl(ReleaseMemObject(vin)); |
| cl(ReleaseMemObject(random)); |
| free(sorted_h); |
| } |
| |
| // |
| // |
| // |
| |
| int |
| main(int argc, char const * argv[]) |
| { |
| char const * const target_platform_substring = "Intel"; |
| char const * const target_device_substring = "Graphics"; |
| |
| // |
| // find platform and device ids |
| // |
| cl_platform_id platform_id; |
| cl_device_id device_id; |
| |
| #define HS_DEVICE_NAME_SIZE 64 |
| |
| char device_name[HS_DEVICE_NAME_SIZE]; |
| size_t device_name_size; |
| |
| cl(FindIdsByName(target_platform_substring, |
| target_device_substring, |
| &platform_id, |
| &device_id, |
| HS_DEVICE_NAME_SIZE, |
| device_name, |
| &device_name_size, |
| true)); |
| // |
| // create context |
| // |
| cl_context_properties context_properties[] = |
| { |
| CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, |
| 0 |
| }; |
| |
| cl_int cl_err; |
| cl_context context = clCreateContext(context_properties, |
| 1, |
| &device_id, |
| NULL, |
| NULL, |
| &cl_err); |
| cl_ok(cl_err); |
| |
| // |
| // create command queue |
| // |
| #if 0 // OPENCL 2.0 |
| |
| cl_queue_properties props[] = { |
| CL_QUEUE_PROPERTIES, |
| (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, |
| #ifndef NDEBUG |
| (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, |
| #endif |
| 0 |
| }; |
| |
| cl_queue_properties props_profile[] = { |
| CL_QUEUE_PROPERTIES, |
| (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, |
| 0 |
| }; |
| |
| cl_command_queue cq = clCreateCommandQueueWithProperties(context, |
| device_id, |
| props, |
| &cl_err); cl_ok(cl_err); |
| |
| cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context, |
| device_id, |
| props_profile, |
| &cl_err); cl_ok(cl_err); |
| #else // OPENCL 1.2 |
| |
| cl_command_queue cq = clCreateCommandQueue(context, |
| device_id, |
| #ifndef NDEBUG |
| CL_QUEUE_PROFILING_ENABLE | |
| #endif |
| CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, |
| &cl_err); cl_ok(cl_err); |
| |
| cl_command_queue cq_profile = clCreateCommandQueue(context, |
| device_id, |
| CL_QUEUE_PROFILING_ENABLE, |
| &cl_err); cl_ok(cl_err); |
| #endif |
| |
| // |
| // Intel GEN workaround -- create dummy kernel for semi-accurate |
| // profiling on an out-of-order queue. |
| // |
| hs_dummy_kernel_create(context,device_id); |
| |
| // |
| // select the target |
| // |
| |
| uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0); |
| |
| struct hs_cl_target const * hs_target; |
| |
| if (key_val_words == 1) |
| hs_target = &hs_intel_gen8_u32; |
| else |
| hs_target = &hs_intel_gen8_u64; |
| |
| // |
| // create kernels |
| // |
| fprintf(stdout,"Creating... "); |
| |
| struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id); |
| |
| fprintf(stdout,"done.\n"); |
| |
| // |
| // |
| // |
| |
| #ifdef NDEBUG |
| #define HS_BENCH_LOOPS 100 |
| #define HS_BENCH_WARMUP 100 |
| #else |
| #define HS_BENCH_LOOPS 1 |
| #define HS_BENCH_WARMUP 0 |
| #endif |
| |
| // |
| // sort sizes and loops |
| // |
| uint32_t const kpb = hs_target->config.slab.height << hs_target->config.slab.width_log2; |
| |
| uint32_t const count_lo = (argc <= 2) ? kpb : strtoul(argv[2],NULL,0); |
| uint32_t const count_hi = (argc <= 3) ? count_lo : strtoul(argv[3],NULL,0); |
| uint32_t const count_step = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0); |
| uint32_t const loops = (argc <= 5) ? HS_BENCH_LOOPS : strtoul(argv[5],NULL,0); |
| uint32_t const warmup = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0); |
| bool const linearize = (argc <= 7) ? true : strtoul(argv[7],NULL,0); |
| |
| // |
| // labels |
| // |
| fprintf(stdout, |
| "Device, " |
| "Driver, " |
| "Type, " |
| "Slab/Linear, " |
| "Verified?, " |
| "Keys, " |
| "Keys Padded In, " |
| "Keys Padded Out, " |
| "CPU Algorithm, " |
| "CPU Msecs, " |
| "CPU Mkeys/s, " |
| "Trials, " |
| "Avg. Msecs, " |
| "Min Msecs, " |
| "Max Msecs, " |
| "Avg. Mkeys/s, " |
| "Max. Mkeys/s\n"); |
| |
| // |
| // we want to track driver versions |
| // |
| size_t driver_version_size; |
| |
| cl(GetDeviceInfo(device_id, |
| CL_DRIVER_VERSION, |
| 0, |
| NULL, |
| &driver_version_size)); |
| |
| char * const driver_version = ALLOCA_MACRO(driver_version_size); |
| |
| cl(GetDeviceInfo(device_id, |
| CL_DRIVER_VERSION, |
| driver_version_size, |
| driver_version, |
| NULL)); |
| // |
| // benchmark |
| // |
| hs_bench(context, |
| cq,cq_profile, |
| device_name, |
| driver_version, |
| hs_target->config.words.key + hs_target->config.words.val, |
| 1 << hs_target->config.slab.width_log2, |
| hs_target->config.slab.height, |
| hs, |
| count_lo, |
| count_hi, |
| count_step, |
| loops, |
| warmup, |
| linearize); |
| |
| // |
| // release everything |
| // |
| hs_cl_release(hs); |
| |
| hs_dummy_kernel_release(); |
| |
| cl(ReleaseCommandQueue(cq)); |
| cl(ReleaseCommandQueue(cq_profile)); |
| |
| cl(ReleaseContext(context)); |
| |
| return 0; |
| } |