| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <inttypes.h> |
| #include <float.h> |
| #include <stdbool.h> |
| |
| // |
| // |
| // |
| |
| #include <cuda_runtime.h> |
| |
| // |
| // |
| // |
| |
| #include "common/cuda/assert_cuda.h" |
| #include "common/macros.h" |
| |
| // |
| // |
| // |
| |
| #include "hs/cuda/sm_35/u32/hs_cuda.h" |
| #include "hs/cuda/sm_35/u64/hs_cuda.h" |
| |
| // |
| // PFNs to select between different key widths |
| // |
| |
| typedef void (*hs_cuda_info_pfn)(uint32_t * const key_words, |
| uint32_t * const val_words, |
| uint32_t * const slab_height, |
| uint32_t * const slab_width_log2); |
| |
| typedef void (*hs_cuda_pad_pfn)(uint32_t const count, |
| uint32_t * const count_padded_in, |
| uint32_t * const count_padded_out); |
| |
| typedef void (*hs_cuda_sort_pfn)(void * const vin, |
| void * const vout, |
| uint32_t const count, |
| uint32_t const count_padded_in, |
| uint32_t const count_padded_out, |
| bool const linearize, |
| cudaStream_t stream0, |
| cudaStream_t stream1, |
| cudaStream_t stream2); |
| |
| // |
| // The quality of the RNG doesn't matter. The same number of |
| // instructions will be run no matter what the key distribution looks |
| // like. So here is something small and fast. |
| // |
| |
| static |
| uint32_t |
| hs_rand_u32() |
| { |
| static uint32_t seed = 0xDEADBEEF; |
| |
| // Numerical Recipes |
| seed = seed * 1664525 + 1013904223; |
| |
| return seed; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words) |
| { |
| #if 1 |
| for (uint32_t ii=0; ii<count*words; ii++) |
| vin_h[ii] = hs_rand_u32(); |
| #elif 0 // in-order |
| memset(vin_h,0,count*words*sizeof(uint32_t)); |
| for (uint32_t ii=0; ii<count; ii++) |
| vin_h[ii*words] = ii; |
| #else // reverse order |
| memset(vin_h,0,count*words*sizeof(uint32_t)); |
| for (uint32_t ii=0; ii<count; ii++) |
| vin_h[ii*words] = count - 1 - ii; |
| #endif |
| } |
| |
| // |
| // |
| // |
| |
| char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns); |
| char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns); |
| |
| // |
| // |
| // |
| |
| static |
| char const * |
| hs_cpu_sort(void * sorted_h, |
| uint32_t const hs_words, |
| uint32_t const count, |
| double * const cpu_ns) |
| { |
| if (hs_words == 1) |
| return hs_cpu_sort_u32(sorted_h,count,cpu_ns); |
| else |
| return hs_cpu_sort_u64(sorted_h,count,cpu_ns); |
| } |
| |
| static |
| bool |
| hs_verify_linear(uint32_t const hs_words, |
| void * sorted_h, |
| void * vout_h, |
| uint32_t const count) |
| { |
| return memcmp(sorted_h, vout_h, sizeof(uint32_t) * hs_words * count) == 0; |
| } |
| |
| static |
| void |
| hs_transpose_slabs_u32(uint32_t const hs_words, |
| uint32_t const hs_width, |
| uint32_t const hs_height, |
| uint32_t * vout_h, |
| uint32_t const count) |
| { |
| uint32_t const slab_keys = hs_width * hs_height; |
| size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; |
| uint32_t * const slab = ALLOCA_MACRO(slab_size); |
| uint32_t slab_count = count / slab_keys; |
| |
| while (slab_count-- > 0) |
| { |
| memcpy(slab,vout_h,slab_size); |
| |
| for (uint32_t row=0; row<hs_height; row++) |
| for (uint32_t col=0; col<hs_width; col++) |
| vout_h[col * hs_height + row] = slab[row * hs_width + col]; |
| |
| vout_h += slab_keys; |
| } |
| } |
| |
| static |
| void |
| hs_transpose_slabs_u64(uint32_t const hs_words, |
| uint32_t const hs_width, |
| uint32_t const hs_height, |
| uint64_t * vout_h, |
| uint32_t const count) |
| { |
| uint32_t const slab_keys = hs_width * hs_height; |
| size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; |
| uint64_t * const slab = ALLOCA_MACRO(slab_size); |
| uint32_t slab_count = count / slab_keys; |
| |
| while (slab_count-- > 0) |
| { |
| memcpy(slab,vout_h,slab_size); |
| |
| for (uint32_t row=0; row<hs_height; row++) |
| for (uint32_t col=0; col<hs_width; col++) |
| vout_h[col * hs_height + row] = slab[row * hs_width + col]; |
| |
| vout_h += slab_keys; |
| } |
| } |
| |
| static |
| void |
| hs_transpose_slabs(uint32_t const hs_words, |
| uint32_t const hs_width, |
| uint32_t const hs_height, |
| void * vout_h, |
| uint32_t const count) |
| { |
| if (hs_words == 1) |
| hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count); |
| else |
| hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_debug_u32(uint32_t const hs_width, |
| uint32_t const hs_height, |
| uint32_t const * vout_h, |
| uint32_t const count) |
| { |
| uint32_t const slab_keys = hs_width * hs_height; |
| uint32_t const slabs = (count + slab_keys - 1) / slab_keys; |
| |
| for (uint32_t ss=0; ss<slabs; ss++) { |
| fprintf(stderr,"%u\n",ss); |
| for (uint32_t cc=0; cc<hs_height; cc++) { |
| for (uint32_t rr=0; rr<hs_width; rr++) |
| fprintf(stderr,"%8" PRIX32 " ",*vout_h++); |
| fprintf(stderr,"\n"); |
| } |
| } |
| } |
| |
| static |
| void |
| hs_debug_u64(uint32_t const hs_width, |
| uint32_t const hs_height, |
| uint64_t const * vout_h, |
| uint32_t const count) |
| { |
| uint32_t const slab_keys = hs_width * hs_height; |
| uint32_t const slabs = (count + slab_keys - 1) / slab_keys; |
| |
| for (uint32_t ss=0; ss<slabs; ss++) { |
| fprintf(stderr,"%u\n",ss); |
| for (uint32_t cc=0; cc<hs_height; cc++) { |
| for (uint32_t rr=0; rr<hs_width; rr++) |
| fprintf(stderr,"%16" PRIX64 " ",*vout_h++); |
| fprintf(stderr,"\n"); |
| } |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_bench(hs_cuda_pad_pfn hs_pad, |
| hs_cuda_sort_pfn hs_sort, |
| cudaStream_t stream0, |
| cudaStream_t stream1, |
| cudaStream_t stream2, |
| struct cudaDeviceProp const * props, |
| int const driver_version, |
| uint32_t const hs_words, |
| uint32_t const hs_height, |
| uint32_t const hs_width, |
| uint32_t const count_lo, |
| uint32_t const count_hi, |
| uint32_t const count_step, |
| uint32_t const loops, |
| uint32_t const warmup, |
| bool const linearize, |
| bool const verify) |
| { |
| // |
| // return if nothing to do |
| // |
| if (count_hi <= 1) |
| return; |
| |
| // |
| // size for the largest array |
| // |
| uint32_t count_hi_padded_in, count_hi_padded_out; |
| |
| hs_pad(count_hi,&count_hi_padded_in,&count_hi_padded_out); |
| |
| size_t const key_size = sizeof(uint32_t) * hs_words; |
| size_t const size_hi_in = count_hi_padded_in * key_size; |
| size_t const size_hi_out = count_hi_padded_out * key_size; |
| |
| // |
| // allocate device extents |
| // |
| void * random_d; |
| void * vin_d; |
| void * vout_d; |
| |
| cuda(Malloc(&random_d,size_hi_in)); |
| cuda(Malloc(&vin_d, size_hi_in)); |
| cuda(Malloc(&vout_d, size_hi_out)); |
| |
| // |
| // initialize device random extent |
| // |
| void * random_h = malloc(size_hi_in); |
| |
| // fill with random numbers |
| hs_fill_rand(random_h,count_hi,hs_words); |
| |
| // copy to device |
| cuda(Memcpy(random_d,random_h,size_hi_in,cudaMemcpyHostToDevice)); |
| |
| free(random_h); |
| |
| // |
| // allocate host result extent |
| // |
| void * sorted_h = malloc(size_hi_in); |
| void * vout_h = malloc(size_hi_in); |
| |
| // |
| // LABELS |
| // |
| fprintf(stdout, |
| "Device, " |
| "Driver, " |
| "Type, " |
| "Slab/Linear, " |
| "Verified?, " |
| "Keys, " |
| "Keys Padded In, " |
| "Keys Padded Out, " |
| "CPU Algorithm, " |
| "CPU Msecs, " |
| "CPU Mkeys/s, " |
| "Trials, " |
| "Avg. Msecs, " |
| "Min Msecs, " |
| "Max Msecs, " |
| "Avg. Mkeys/s, " |
| "Max. Mkeys/s\n"); |
| // |
| // BENCHMARK |
| // |
| cudaEvent_t start, end; |
| |
| cuda(EventCreate(&start)); |
| cuda(EventCreate(&end)); |
| |
| for (uint32_t count=count_lo; count<=count_hi; count+=count_step) |
| { |
| // compute padding before sorting |
| uint32_t count_padded_in, count_padded_out; |
| |
| hs_pad(count,&count_padded_in,&count_padded_out); |
| |
| cuda(Memcpy(vin_d,random_d,count*key_size,cudaMemcpyDeviceToDevice)); |
| |
| float elapsed_ms_min = FLT_MAX; |
| float elapsed_ms_max = 0.0f; |
| float elapsed_ms_sum = 0.0f; |
| |
| for (uint32_t ii=0; ii<warmup+loops; ii++) |
| { |
| if (ii == warmup) |
| { |
| elapsed_ms_min = FLT_MAX; |
| elapsed_ms_max = 0.0f; |
| elapsed_ms_sum = 0.0f; |
| } |
| |
| // |
| // sort vin/vout |
| // |
| cuda(EventRecord(start,stream0)); |
| cuda(StreamWaitEvent(stream1,start,0)); |
| cuda(StreamWaitEvent(stream2,start,0)); |
| |
| hs_sort(vin_d, |
| vout_d, |
| count, |
| count_padded_in, |
| count_padded_out, |
| linearize, |
| stream0, |
| stream1, |
| stream2); |
| |
| cuda(EventRecord(end,stream0)); |
| |
| cuda(EventSynchronize(end)); |
| |
| float elapsed; |
| |
| cuda(EventElapsedTime(&elapsed,start,end)); |
| |
| elapsed_ms_min = MIN_MACRO(elapsed_ms_min,elapsed); |
| elapsed_ms_max = MAX_MACRO(elapsed_ms_max,elapsed); |
| elapsed_ms_sum += elapsed; |
| } |
| |
| // |
| // verify |
| // |
| char const * cpu_algo = NULL; |
| double cpu_ns = 1.0; |
| bool verified = false; |
| |
| if (verify) |
| { |
| // |
| // copy back the results |
| // |
| size_t const size_padded_in = count_padded_in * key_size; |
| |
| cuda(Memcpy(sorted_h,vin_d, size_padded_in,cudaMemcpyDeviceToHost)); |
| cuda(Memcpy(vout_h, vout_d,size_padded_in,cudaMemcpyDeviceToHost)); |
| |
| // |
| // sort the input with another algorithm |
| // |
| cpu_algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns); |
| |
| // transpose the cpu sorted slabs before comparison |
| if (!linearize) { |
| hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in); |
| } |
| |
| verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in); |
| |
| #ifndef NDEBUG |
| if (!verified) |
| { |
| if (hs_words == 1) { |
| hs_debug_u32(hs_width,hs_height,vout_h, count); |
| hs_debug_u32(hs_width,hs_height,sorted_h,count); |
| } else { // ulong |
| hs_debug_u64(hs_width,hs_height,vout_h, count); |
| hs_debug_u64(hs_width,hs_height,sorted_h,count); |
| } |
| } |
| #endif |
| } |
| |
| // |
| // REPORT |
| // |
| fprintf(stdout,"%s, %u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n", |
| props->name, |
| driver_version, |
| (hs_words == 1) ? "uint32_t" : "uint64_t", |
| linearize ? "linear" : "slab", |
| verify ? (verified ? " OK " : "*FAIL*") : "UNVERIFIED", |
| count, |
| count_padded_in, |
| count_padded_out, |
| // CPU |
| verify ? cpu_algo : "UNVERIFIED", |
| verify ? (cpu_ns / 1000000.0) : 0.0, // milliseconds |
| verify ? (1000.0 * count / cpu_ns) : 0.0, // mkeys / sec |
| // GPU |
| loops, |
| elapsed_ms_sum / loops, // avg msecs |
| elapsed_ms_min, // min msecs |
| elapsed_ms_max, // max msecs |
| (double)(count * loops) / (1000.0 * elapsed_ms_sum), // mkeys / sec - avg |
| (double) count / (1000.0 * elapsed_ms_min)); // mkeys / sec - max |
| |
| // quit early if not verified |
| if (verify && !verified) |
| break; |
| } |
| |
| // |
| // dispose |
| // |
| cuda(EventDestroy(start)); |
| cuda(EventDestroy(end)); |
| |
| free(sorted_h); |
| free(vout_h); |
| |
| cuda(Free(random_d)); |
| cuda(Free(vin_d)); |
| cuda(Free(vout_d)); |
| } |
| |
| // |
| // |
| // |
| |
| int |
| main(int argc, char const * argv[]) |
| { |
| // |
| // which CUDA device? |
| // |
| const int32_t device = (argc == 1) ? 0 : atoi(argv[1]); |
| |
| struct cudaDeviceProp props; |
| cuda(GetDeviceProperties(&props,device)); |
| |
| cuda(SetDeviceFlags(cudaDeviceScheduleBlockingSync)); |
| cuda(SetDevice(device)); |
| |
| int driver_version; |
| |
| cuda(DriverGetVersion(&driver_version)); |
| |
| #ifndef NDEBUG |
| fprintf(stdout,"%s (%2d) : %u\n", |
| props.name, |
| props.multiProcessorCount, |
| driver_version); |
| #endif |
| |
| // |
| // create some streams |
| // |
| cudaStream_t stream0,stream1,stream2; |
| |
| cuda(StreamCreate(&stream0)); |
| cuda(StreamCreate(&stream1)); |
| cuda(StreamCreate(&stream2)); |
| |
| // |
| // |
| // |
| #ifdef NDEBUG |
| #define HS_BENCH_LOOPS 100 |
| #define HS_BENCH_WARMUP 100 |
| #else |
| #define HS_BENCH_LOOPS 1 |
| #define HS_BENCH_WARMUP 0 |
| #endif |
| |
| // |
| // are we sorting 32-bit or 64-bit keys? |
| // |
| uint32_t const key_size = (argc <= 2) ? 2 : strtoul(argv[2],NULL,0); |
| |
| hs_cuda_info_pfn hs_info; |
| hs_cuda_pad_pfn hs_pad; |
| hs_cuda_sort_pfn hs_sort; |
| |
| if (key_size == 1) |
| { |
| hs_info = hs_cuda_info_u32; |
| hs_pad = hs_cuda_pad_u32; |
| hs_sort = hs_cuda_sort_u32; |
| } |
| else |
| { |
| hs_info = hs_cuda_info_u64; |
| hs_pad = hs_cuda_pad_u64; |
| hs_sort = hs_cuda_sort_u64; |
| } |
| |
| // |
| // get some configuration info |
| // |
| uint32_t key_words, val_words, slab_height, slab_width_log2; |
| |
| hs_info(&key_words,&val_words,&slab_height,&slab_width_log2); |
| |
| // |
| // sort sizes and loops |
| // |
| uint32_t const kpb = slab_height << slab_width_log2; |
| uint32_t const count_lo = (argc <= 3) ? kpb : strtoul(argv[3],NULL,0); |
| uint32_t const count_hi = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0); |
| uint32_t const count_step = (argc <= 5) ? count_lo : strtoul(argv[5],NULL,0); |
| uint32_t const loops = (argc <= 6) ? HS_BENCH_LOOPS : strtoul(argv[6],NULL,0); |
| uint32_t const warmup = (argc <= 7) ? HS_BENCH_WARMUP : strtoul(argv[7],NULL,0); |
| bool const linearize = (argc <= 8) ? true : strtoul(argv[8],NULL,0); |
| bool const verify = (argc <= 9) ? true : strtoul(argv[9],NULL,0); |
| |
| // |
| // benchmark |
| // |
| hs_bench(hs_pad, |
| hs_sort, |
| stream0, |
| stream1, |
| stream2, |
| &props, |
| driver_version, |
| key_words + val_words, |
| slab_height, |
| 1 << slab_width_log2, |
| count_lo, |
| count_hi, |
| count_step, |
| loops, |
| warmup, |
| linearize, |
| verify); |
| |
| // |
| // cleanup |
| // |
| cuda(StreamDestroy(stream0)); |
| cuda(StreamDestroy(stream1)); |
| cuda(StreamDestroy(stream2)); |
| |
| cuda(DeviceReset()); |
| |
| return EXIT_SUCCESS; |
| } |