| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <inttypes.h> |
| |
| // |
| // |
| // |
| |
| #include "common/cl/assert_cl.h" |
| #include "common/macros.h" |
| #include "common/util.h" |
| |
| // |
| // |
| // |
| |
| #include "hs_cl.h" |
| |
| // |
| // |
| // |
| |
| struct hs_cl |
| { |
| struct hs_cl_target_config config; |
| |
uint32_t key_val_size;      // bytes per key-val pair
uint32_t slab_keys;         // keys per slab
uint32_t bs_slabs_log2_ru;  // log2 of block slab count, rounded up to a pow2
uint32_t bc_slabs_log2_max; // log2 of block slab count, rounded down to a pow2
| |
struct {
uint32_t count;
cl_kernel * bs;        // indexed by log2 slab count
cl_kernel * bc;        // indexed by log2 slab count
cl_kernel * fm[3];     // indexed by scale, then by log2 span count
cl_kernel * hm[3];     // indexed by scale
cl_kernel * transpose; // single kernel
cl_kernel all[];       // flexible array -- the pointers above point into it
} kernels;
| }; |
| |
| // |
| // |
| // |
| |
| struct hs_state |
| { |
| #ifndef NDEBUG |
cl_ulong t_total; // accumulated profiled execution time
| #endif |
| |
| cl_command_queue cq; |
| |
| // key buffers |
| cl_mem vin; |
| cl_mem vout; // can be vin |
| |
| // enforces ordering on out-of-order queue |
| cl_event wait_list[3]; // worst case |
| uint32_t wait_list_size; |
| |
// bx_ru is the number of slabs in vin, rounded up
| uint32_t bx_ru; |
| }; |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_state_wait_list_release(struct hs_state * const state) |
| { |
| for (uint32_t ii=0; ii<state->wait_list_size; ii++) |
| cl(ReleaseEvent(state->wait_list[ii])); |
| |
| state->wait_list_size = 0; |
| } |
| |
| static |
| void |
| hs_state_wait_list_update(struct hs_state * const state, |
| uint32_t const wait_list_size, |
| cl_event const * const wait_list) |
| { |
| uint32_t const new_size = state->wait_list_size + wait_list_size; |
| |
for (uint32_t ii=0; ii<wait_list_size; ii++)
state->wait_list[state->wait_list_size + ii] = wait_list[ii];
| |
| state->wait_list_size = new_size; |
| } |
| |
| // |
| // |
| // |
| |
| #ifdef NDEBUG |
| |
| #define HS_STATE_WAIT_LIST_PROFILE(state) |
| #define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list) |
| |
| #else |
| |
| #define HS_STATE_WAIT_LIST_PROFILE(state) \ |
| hs_state_wait_list_profile(state, \ |
| state->wait_list_size, \ |
| state->wait_list) |
| |
| #define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list) \ |
| hs_state_wait_list_profile(state, \ |
| wait_list_size, \ |
| wait_list) |
| |
| static |
| void |
| hs_state_wait_list_profile(struct hs_state * const state, |
| uint32_t const wait_list_size, |
| cl_event const * const wait_list) |
| { |
| cl(Finish(state->cq)); |
| |
| cl_command_queue_properties props; |
| |
| cl(GetCommandQueueInfo(state->cq, |
| CL_QUEUE_PROPERTIES, |
| sizeof(props), |
| &props, |
| NULL)); |
| |
| for (uint32_t ii=0; ii<wait_list_size; ii++) |
| { |
| cl_event event = wait_list[ii]; |
| |
| // |
| // profiling |
| // |
| cl_ulong t_start=0, t_end=0; |
| |
| if (props & CL_QUEUE_PROFILING_ENABLE) |
| { |
| // start |
| cl(GetEventProfilingInfo(event, |
| CL_PROFILING_COMMAND_START, |
| sizeof(cl_ulong), |
| &t_start, |
| NULL)); |
| // end |
| cl(GetEventProfilingInfo(event, |
| CL_PROFILING_COMMAND_END, |
| sizeof(cl_ulong), |
| &t_end, |
| NULL)); |
| |
| state->t_total += t_end - t_start; |
| } |
| |
| // |
| // status |
| // |
| cl_int status; |
| cl_command_type type; |
| |
| cl_get_event_info(event,&status,&type); |
| |
| fprintf(stdout,"%-13s, %-28s, %20"PRIu64", %20"PRIu64", %20"PRIu64", %20"PRIu64"\n", |
| cl_get_event_command_status_string(status), |
| cl_get_event_command_type_string(type), |
| t_start,t_end,t_end-t_start,state->t_total); |
| } |
| } |
| |
| #endif |
| |
| // |
| // |
| // |
| |
| #ifdef NDEBUG |
| |
#define HS_TRACE(k,d,g)
| |
| #else |
| |
| #define HS_KERNEL_NAME_MAX 20 |
| |
| static |
| void |
| hs_trace(cl_kernel kernel, |
| uint32_t const dim, |
| size_t const * const global_work_size) |
| { |
| if (kernel == NULL) |
| return; |
| |
| char name[HS_KERNEL_NAME_MAX]; |
| |
| cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL)); |
| |
| fprintf(stderr,"%-19s ( %6zu, %6zu, %6zu )\n", |
| name, |
| global_work_size[0], |
| dim < 2 ? 0 : global_work_size[1], |
| dim < 3 ? 0 : global_work_size[2]); |
| } |
| |
| #define HS_TRACE(k,d,g) hs_trace(k,d,g) |
| |
| #endif |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_transpose(struct hs_cl const * const hs, |
| struct hs_state * const state) |
| { |
| size_t const size[1] = { state->bx_ru << hs->config.slab.threads_log2 }; |
| cl_kernel kernel = hs->kernels.transpose[0]; |
| |
| HS_TRACE(kernel,1,size); |
| |
| // |
| // The transpose kernel operates on a single slab. For now, let's |
| // rely on the driver to choose a workgroup size. |
| // |
| // size_t local_work_size[1] = { HS_SLAB_THREADS }; |
| // |
| cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout)); |
| |
| cl_event wait_list_out[1]; |
| |
| cl(EnqueueNDRangeKernel(state->cq, |
| kernel, |
| 1, |
| NULL, |
| size, |
| NULL, |
| state->wait_list_size, |
| state->wait_list, |
| wait_list_out)); |
| |
| hs_state_wait_list_release(state); |
| hs_state_wait_list_update(state,1,wait_list_out); |
| |
| HS_STATE_WAIT_LIST_PROFILE(state); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_hm_enqueue(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const scale_log2, |
| uint32_t const spans, |
| uint32_t const span_threads) |
| { |
| // |
| // Note that some platforms might need to use .z on large grids |
| // |
| size_t const size[3] = { span_threads, spans, 1 }; |
| cl_kernel kernel = hs->kernels.hm[scale_log2][0]; |
| |
| HS_TRACE(kernel,3,size); |
| |
| cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout)); |
| |
| cl_event wait_list_out[1]; |
| |
| cl(EnqueueNDRangeKernel(state->cq, |
| kernel, |
| 3, |
| NULL, |
| size, |
| NULL, |
| state->wait_list_size, |
| state->wait_list, |
| wait_list_out)); |
| |
| hs_state_wait_list_release(state); |
| hs_state_wait_list_update(state,1,wait_list_out); |
| |
| HS_STATE_WAIT_LIST_PROFILE(state); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| uint32_t |
| hs_hm(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const down_slabs, |
| uint32_t const clean_slabs_log2) |
| { |
| // how many scaled half-merge spans are there? |
| uint32_t const frac_ru = (1 << clean_slabs_log2) - 1; |
| uint32_t const spans = (down_slabs + frac_ru) >> clean_slabs_log2; |
| |
| // for now, just clamp to the max |
| uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max; |
| uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem); |
| uint32_t const log2_out = log2_rem - scale_log2; |
| |
| // size the grid |
| uint32_t const span_threads = hs->slab_keys << log2_out; |
| |
| // launch the hm kernel |
| hs_hm_enqueue(hs, |
| state, |
| scale_log2, |
| spans, |
| span_threads); |
| |
| return log2_out; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_bc_enqueue(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const full, |
| uint32_t const clean_slabs_log2) |
| { |
| size_t const size[1] = { full << hs->config.slab.threads_log2 }; |
| cl_kernel kernel = hs->kernels.bc[clean_slabs_log2]; |
| |
| HS_TRACE(kernel,1,size); |
| |
| cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout)); |
| |
| cl_event wait_list_out[1]; |
| |
| cl(EnqueueNDRangeKernel(state->cq, |
| kernel, |
| 1, |
| NULL, |
| size, |
| NULL, |
| state->wait_list_size, |
| state->wait_list, |
| wait_list_out)); |
| |
| hs_state_wait_list_release(state); |
| hs_state_wait_list_update(state,1,wait_list_out); |
| |
| HS_STATE_WAIT_LIST_PROFILE(state); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_bc(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const down_slabs, |
| uint32_t const clean_slabs_log2) |
| { |
// round down_slabs up so that only whole clean_slabs_log2 spans are cleaned
| uint32_t const frac_ru = (1u << clean_slabs_log2) - 1; |
| uint32_t const full = (down_slabs + frac_ru) & ~frac_ru; |
| |
// we'd better be capable of cleaning at least two slabs!
| hs_bc_enqueue(hs,state,full,clean_slabs_log2); |
| } |
| |
| // |
| // FIXME -- some of this logic can be skipped if BS is a power-of-two |
| // |
| |
| static |
| void |
| hs_fm_enqueue(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const scale_log2, |
| uint32_t const fm_full, |
| uint32_t const fm_frac, |
| uint32_t const span_threads) |
| { |
| // |
| // Note that some platforms might need to use .z on large grids |
| // |
| uint32_t wait_list_out_size = 0; |
| cl_event wait_list_out[2]; |
| |
| if (fm_full > 0) |
| { |
| size_t const size_full[3] = { span_threads, fm_full, 1 }; |
| cl_kernel kernel_full = hs->kernels.fm[scale_log2][hs->bs_slabs_log2_ru-1+scale_log2]; |
| |
| HS_TRACE(kernel_full,3,size_full); |
| |
| cl(SetKernelArg(kernel_full,0,sizeof(state->vout),&state->vout)); |
| |
| cl(EnqueueNDRangeKernel(state->cq, |
| kernel_full, |
| 3, |
| NULL, |
| size_full, |
| NULL, |
| state->wait_list_size, |
| state->wait_list, |
| wait_list_out+wait_list_out_size++)); |
| } |
| |
| if (fm_frac > 0) |
| { |
| size_t const offset_frac[3] = { 0, fm_full, 0 }; |
| size_t const size_frac [3] = { span_threads, 1, 1 }; |
| cl_kernel kernel_frac = hs->kernels.fm[scale_log2][msb_idx_u32(fm_frac)]; |
| |
| HS_TRACE(kernel_frac,3,size_frac); |
| |
| cl(SetKernelArg(kernel_frac,0,sizeof(state->vout),&state->vout)); |
| |
| cl(EnqueueNDRangeKernel(state->cq, |
| kernel_frac, |
| 3, |
| offset_frac, |
| size_frac, |
| NULL, |
| state->wait_list_size, |
| state->wait_list, |
| wait_list_out+wait_list_out_size++)); |
| } |
| |
| hs_state_wait_list_release(state); |
| hs_state_wait_list_update(state,wait_list_out_size,wait_list_out); |
| |
| HS_STATE_WAIT_LIST_PROFILE(state); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| uint32_t |
| hs_fm(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t * const down_slabs, |
| uint32_t const up_scale_log2) |
| { |
| // |
| // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes |
| // a performance win to bias toward launching the smaller flip merge |
| // kernel in order to get more warps in flight (increased |
| // occupancy). This is useful when merging small numbers of slabs. |
| // |
| // Note that HS_FM_SCALE_MIN will always be 0 or 1. |
| // |
| // So, for now, just clamp to the max until there is a reason to |
| // restore the fancier and probably low-impact approach. |
| // |
| uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2); |
| uint32_t const clean_log2 = up_scale_log2 - scale_log2; |
| |
| // number of slabs in a full-sized scaled flip-merge span |
| uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2; |
| |
| // how many full-sized scaled flip-merge spans are there? |
| uint32_t fm_full = state->bx_ru / full_span_slabs; |
| uint32_t fm_frac = 0; |
| |
| // initialize down_slabs |
| *down_slabs = fm_full * full_span_slabs; |
| |
| // how many half-size scaled + fractional scaled spans are there? |
| uint32_t const span_rem = state->bx_ru - *down_slabs; |
| uint32_t const half_span_slabs = full_span_slabs >> 1; |
| |
| // if we have over a half-span then fractionally merge it |
| if (span_rem > half_span_slabs) |
| { |
| // the remaining slabs will be cleaned |
| *down_slabs += span_rem; |
| |
| uint32_t const frac_rem = span_rem - half_span_slabs; |
| uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem); |
| |
| if (frac_rem_pow2 >= half_span_slabs) |
| { |
| // bump it up to a full span |
| fm_full += 1; |
| } |
| else |
| { |
| // otherwise, add fractional |
| fm_frac = MAX_MACRO(1,frac_rem_pow2 >> clean_log2); |
| } |
| } |
| |
| // size the grid |
| uint32_t const span_threads = hs->slab_keys << clean_log2; |
| |
| // |
| // launch the fm kernel |
| // |
| hs_fm_enqueue(hs, |
| state, |
| scale_log2, |
| fm_full, |
| fm_frac, |
| span_threads); |
| |
| return clean_log2; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_bs_enqueue(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const full, |
| uint32_t const frac, |
| uint32_t const wait_list_size, |
| cl_event * wait_list) |
| { |
| uint32_t wait_list_out_size = 0; |
| cl_event wait_list_out[2]; |
| |
| if (full > 0) |
| { |
| size_t const size_full[1] = { full << hs->config.slab.threads_log2 }; |
| cl_kernel kernel_full = hs->kernels.bs[hs->bs_slabs_log2_ru]; |
| |
| HS_TRACE(kernel_full,1,size_full); |
| |
| cl(SetKernelArg(kernel_full,0,sizeof(state->vin), &state->vin)); |
| cl(SetKernelArg(kernel_full,1,sizeof(state->vout),&state->vout)); |
| |
| cl(EnqueueNDRangeKernel(state->cq, |
| kernel_full, |
| 1, |
| NULL, |
| size_full, |
| NULL, |
| wait_list_size, |
| wait_list, |
| wait_list_out+wait_list_out_size++)); |
| } |
| |
| if (frac > 0) |
| { |
| size_t const offset_frac[1] = { full << hs->config.slab.threads_log2 }; |
| size_t const size_frac [1] = { frac << hs->config.slab.threads_log2 }; |
| cl_kernel kernel_frac = hs->kernels.bs[msb_idx_u32(frac)]; |
| |
| HS_TRACE(kernel_frac,1,size_frac); |
| |
| cl(SetKernelArg(kernel_frac,0,sizeof(state->vin), &state->vin)); |
| cl(SetKernelArg(kernel_frac,1,sizeof(state->vout),&state->vout)); |
| |
| cl(EnqueueNDRangeKernel(state->cq, |
| kernel_frac, |
| 1, |
| offset_frac, |
| size_frac, |
| NULL, |
| wait_list_size, |
| wait_list, |
| wait_list_out+wait_list_out_size++)); |
| } |
| |
// note: the state wait list is not released here -- it may already
// hold the pad_vout event that the first flip merge must wait on
hs_state_wait_list_update(state,wait_list_out_size,wait_list_out);
| |
| HS_STATE_WAIT_LIST_PROFILE(state); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_bs(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const count_padded_in, |
| uint32_t const wait_list_size, |
| cl_event * wait_list) |
| { |
| uint32_t const slabs_in = count_padded_in / hs->slab_keys; |
| uint32_t const full = (slabs_in / hs->config.block.slabs) * hs->config.block.slabs; |
| uint32_t const frac = slabs_in - full; |
| |
| hs_bs_enqueue(hs,state, |
| full,frac, |
| wait_list_size,wait_list); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_keyset_pre_sort(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const count, |
| uint32_t const count_hi, |
| uint32_t const wait_list_size, |
| cl_event * wait_list, |
| cl_event * event) |
| { |
| uint32_t const vin_span = count_hi - count; |
| uint32_t const pattern = UINT32_MAX; |
| |
| cl(EnqueueFillBuffer(state->cq, |
| state->vin, |
| &pattern, |
| sizeof(pattern), |
| count * hs->key_val_size, |
| vin_span * hs->key_val_size, |
| wait_list_size, |
| wait_list, |
| event)); |
| |
| HS_STATE_WAIT_LIST_PROFILE_EX(state,1,event); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_keyset_pre_merge(struct hs_cl const * const hs, |
| struct hs_state * const state, |
| uint32_t const count_lo, |
| uint32_t const count_hi, |
| uint32_t const wait_list_size, |
| cl_event * wait_list) |
| { |
| uint32_t const vout_span = count_hi - count_lo; |
| uint32_t const pattern = UINT32_MAX; |
| |
| // appends event to incoming wait list |
| cl(EnqueueFillBuffer(state->cq, |
| state->vout, |
| &pattern, |
| sizeof(pattern), |
| count_lo * hs->key_val_size, |
| vout_span * hs->key_val_size, |
| wait_list_size, |
| wait_list, |
| state->wait_list+state->wait_list_size++)); |
| |
| HS_STATE_WAIT_LIST_PROFILE(state); |
| } |
| |
| // |
| // We want concurrent kernel execution to occur in a few places. |
| // |
| // The summary is: |
| // |
| // 1) If necessary, some max valued keys are written to the end of |
| // the vin/vout buffers. |
| // |
| // 2) Blocks of slabs of keys are sorted. |
| // |
| // 3) If necesary, the blocks of slabs are merged until complete. |
| // |
| // 4) If requested, the slabs will be converted from slab ordering |
| // to linear ordering. |
| // |
| // Below is the general "happens-before" relationship between HotSort |
| // compute kernels. |
| // |
| // Note the diagram assumes vin and vout are different buffers. If |
| // they're not, then the first merge doesn't include the pad_vout |
| // event in the wait list. |
| // |
| // +----------+ +---------+ |
| // | pad_vout | | pad_vin | |
| // +----+-----+ +----+----+ |
| // | | |
| // | WAITFOR(pad_vin) |
| // | | |
| // | +-----v-----+ |
| // | | | |
| // | +----v----+ +----v----+ |
| // | | bs_full | | bs_frac | |
| // | +----+----+ +----+----+ |
| // | | | |
| // | +-----v-----+ |
| // | | |
| // | +------NO------JUST ONE BLOCK? |
| // | / | |
| // |/ YES |
| // + | |
| // | v |
| // | END_WITH_EVENTS(bs_full,bs_frac) |
| // | |
| // | |
| // WAITFOR(pad_vout,bs_full,bs_frac) >>> first iteration of loop <<< |
| // | |
| // | |
| // +-----------<------------+ |
| // | | |
| // +-----v-----+ | |
| // | | | |
| // +----v----+ +----v----+ | |
| // | fm_full | | fm_frac | | |
| // +----+----+ +----+----+ | |
| // | | ^ |
| // +-----v-----+ | |
| // | | |
| // WAITFOR(fm_full,fm_frac) | |
| // | | |
| // v | |
| // +--v--+ WAITFOR(bc) |
| // | hm | | |
| // +-----+ | |
| // | | |
| // WAITFOR(hm) | |
| // | ^ |
| // +--v--+ | |
| // | bc | | |
| // +-----+ | |
| // | | |
| // v | |
| // MERGING COMPLETE?-------NO------+ |
| // | |
| // YES |
| // | |
| // v |
| // END_WITH_EVENTS(bc) |
| // |
| |
| void |
| hs_cl_sort(struct hs_cl const * const hs, |
| cl_command_queue cq, |
| uint32_t const wait_list_size, |
| cl_event * wait_list, |
| cl_event * event, |
| cl_mem vin, |
| cl_mem vout, |
| uint32_t const count, |
| uint32_t const count_padded_in, |
| uint32_t const count_padded_out, |
| bool const linearize) |
| { |
| // is this sort in place? |
| bool const is_in_place = (vout == NULL); |
| |
| // cq, buffers, wait list and slab count |
| struct hs_state state = { |
| #ifndef NDEBUG |
| .t_total = 0, |
| #endif |
| .cq = cq, |
| .vin = vin, |
| .vout = is_in_place ? vin : vout, |
| .wait_list_size = 0, |
| .bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys |
| }; |
| |
| // initialize vin |
| uint32_t const count_hi = is_in_place ? count_padded_out : count_padded_in; |
| bool const is_pre_sort_keyset_reqd = count_hi > count; |
| cl_event event_keyset_pre_sort[1]; |
| |
| // initialize any trailing keys in vin before sorting |
| if (is_pre_sort_keyset_reqd) |
| { |
| hs_keyset_pre_sort(hs,&state, |
| count,count_hi, |
| wait_list_size,wait_list, |
| event_keyset_pre_sort); |
| } |
| |
| // initialize any trailing keys in vout before merging |
| if (!is_in_place && (count_padded_out > count_padded_in)) |
| { |
| hs_keyset_pre_merge(hs,&state, |
| count_padded_in,count_padded_out, |
| wait_list_size,wait_list); |
| } |
| |
| // |
| // sort blocks of slabs |
| // |
| hs_bs(hs,&state, |
| count_padded_in, |
| is_pre_sort_keyset_reqd ? 1 : wait_list_size, |
| is_pre_sort_keyset_reqd ? event_keyset_pre_sort : wait_list); |
| |
| // release the event |
| if (is_pre_sort_keyset_reqd) |
| cl(ReleaseEvent(event_keyset_pre_sort[0])); |
| |
| // |
| // we're done if this was a single bs block... |
| // |
| // otherwise, merge sorted spans of slabs until done |
| // |
| if (state.bx_ru > hs->config.block.slabs) |
| { |
| int32_t up_scale_log2 = 1; |
| |
| while (true) |
| { |
| uint32_t down_slabs; |
| |
| // flip merge slabs -- return span of slabs that must be cleaned |
| uint32_t clean_slabs_log2 = hs_fm(hs,&state, |
| &down_slabs, |
| up_scale_log2); |
| |
// if the span is greater than the largest block cleaner, then half-merge
| while (clean_slabs_log2 > hs->bc_slabs_log2_max) |
| { |
| clean_slabs_log2 = hs_hm(hs,&state, |
| down_slabs, |
| clean_slabs_log2); |
| } |
| |
// launch the slab clean grid
| hs_bc(hs,&state,down_slabs,clean_slabs_log2); |
| |
| // was this the final block clean? |
| if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru) |
| break; |
| |
| // otherwise, merge twice as many slabs |
| up_scale_log2 += 1; |
| } |
| } |
| |
| // slabs or linear? |
| if (linearize) { |
| hs_transpose(hs,&state); |
| } |
| |
| // does the caller want the final event? |
| if (event != NULL) { |
| *event = state.wait_list[0]; |
| } else { |
| cl(ReleaseEvent(state.wait_list[0])); |
| } |
| } |
| |
| // |
| // all grids will be computed as a function of the minimum number of slabs |
| // |
| |
| void |
| hs_cl_pad(struct hs_cl const * const hs, |
| uint32_t const count, |
| uint32_t * const count_padded_in, |
| uint32_t * const count_padded_out) |
| { |
| // |
| // round up the count to slabs |
| // |
| uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys; |
| uint32_t const blocks = slabs_ru / hs->config.block.slabs; |
| uint32_t const block_slabs = blocks * hs->config.block.slabs; |
| uint32_t const slabs_ru_rem = slabs_ru - block_slabs; |
| uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs); |
| |
| *count_padded_in = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys; |
| *count_padded_out = *count_padded_in; |
| |
| // |
| // will merging be required? |
| // |
| if (slabs_ru > hs->config.block.slabs) |
| { |
| // more than one block |
| uint32_t const blocks_lo = pow2_rd_u32(blocks); |
| uint32_t const block_slabs_lo = blocks_lo * hs->config.block.slabs; |
| uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo; |
| |
| if (block_slabs_rem > 0) |
| { |
| uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem); |
| |
| uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru, |
| blocks_lo << (1 - hs->config.merge.fm.scale_min)); |
| |
| uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi, |
| block_slabs_lo*2); // clamp non-pow2 blocks |
| |
| *count_padded_out = block_slabs_padded_out * hs->slab_keys; |
| } |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_create_kernel(cl_program program, |
| cl_kernel * const kernel, |
| char const * const name) |
| { |
| cl_int err; |
| |
| *kernel = clCreateKernel(program,name,&err); |
| |
| cl_ok(err); |
| } |
| |
| static |
| void |
| hs_create_kernels(cl_program program, |
| cl_kernel * kernels, |
| char name_template[], |
| size_t const name_template_size, |
| uint32_t const count) |
| { |
// single-digit template substitution -- assumes count <= 10
char const n_max = '0'+(char)count;
| |
| for (char n = '0'; n<n_max; n++) |
| { |
| cl_int err; |
| |
| name_template[name_template_size-2] = n; |
| |
| *kernels++ = clCreateKernel(program,name_template,&err); |
| |
| cl_ok(err); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| struct hs_cl * |
| hs_cl_create(struct hs_cl_target const * const target, |
| cl_context context, |
| cl_device_id device_id) |
| { |
| // |
| // immediately try to build the OpenCL program |
| // |
// target->program layout: byte 0 is a source/binary flag, bytes 1-4
// hold the payload size in network byte order, payload starts at +5
bool const is_binary = (target->program[0] == 0);
uint32_t const program_size = NPBTOHL_MACRO(target->program+1);
| |
| cl_program program; |
| |
| if (is_binary) // program is a binary |
| { |
| cl_int status, err; |
| |
| size_t const bins_sizeof[] = { program_size }; |
| unsigned char const * bins[] = { target->program+5 }; |
| |
| program = clCreateProgramWithBinary(context, |
| 1, |
| &device_id, |
| bins_sizeof, |
| bins, |
| &status, |
| &err); |
| cl_ok(err); |
| |
| fprintf(stdout,"Building binary... "); |
| |
| fflush(stdout); |
| |
| cl(BuildProgram(program, |
| 1, |
| &device_id, |
| NULL, |
| NULL, |
| NULL)); |
| } |
| else // program is source code |
| { |
| cl_int err; |
| |
| size_t const strings_sizeof[] = { program_size }; |
| char const * strings[] = { (char*)target->program+5 }; |
| |
| program = clCreateProgramWithSource(context, |
| 1, |
| strings, |
| strings_sizeof, |
| &err); |
| cl_ok(err); |
| |
| char const * const options = |
| "-cl-std=CL1.2 -cl-fast-relaxed-math " |
| "-cl-no-signed-zeros -cl-mad-enable " |
| "-cl-denorms-are-zero " |
| "-cl-kernel-arg-info"; |
| |
| fprintf(stdout,"Building source... "); |
| |
| fflush(stdout); |
| |
| cl(BuildProgram(program, |
| 1, |
| &device_id, |
| options, |
| NULL, |
| NULL)); |
| } |
| |
| // |
| // we reference these values a lot |
| // |
| uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs)); |
| uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs)); |
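
// e.g. block.slabs = 16 yields 4 for both values, while a
// non-power-of-two block.slabs = 12 yields bs_slabs_log2_ru = 4
// (rounded up) and bc_slabs_log2_max = 3 (rounded down)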
| |
| // |
| // how many kernels will be created? |
| // |
| uint32_t const count_bs = bs_slabs_log2_ru + 1; |
| uint32_t const count_bc = bc_slabs_log2_max + 1; |
| uint32_t count_fm[3] = { 0 }; |
| uint32_t count_hm[3] = { 0 }; |
| |
| // guaranteed to be in range [0,2] |
| for (uint32_t scale = target->config.merge.fm.scale_min; |
| scale <= target->config.merge.fm.scale_max; |
| scale++) |
| { |
| uint32_t fm_left = (target->config.block.slabs / 2) << scale; |
| |
| count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1; |
| } |
| |
| // guaranteed to be in range [0,2] |
| for (uint32_t scale = target->config.merge.hm.scale_min; |
| scale <= target->config.merge.hm.scale_max; |
| scale++) |
| { |
| count_hm[scale] = 1; |
| } |
| |
| uint32_t const count_all = |
| 1 |
| + count_bs |
| + count_bc |
| + count_fm[0] + count_fm[1] + count_fm[2] |
| + count_hm[0] + count_hm[1] + count_hm[2]; |
| |
| // |
| // allocate hs_cl |
| // |
| struct hs_cl * hs = malloc(sizeof(*hs) + sizeof(cl_kernel) * count_all); |
| |
| memcpy(&hs->config,&target->config,sizeof(hs->config)); |
| |
| // save some frequently used calculated values |
| hs->key_val_size = (target->config.words.key + target->config.words.val) * 4; |
| hs->slab_keys = target->config.slab.height << target->config.slab.width_log2; |
| hs->bs_slabs_log2_ru = bs_slabs_log2_ru; |
| hs->bc_slabs_log2_max = bc_slabs_log2_max; |
| |
| // save kernel count |
| hs->kernels.count = count_all; |
| |
| // |
| // create all the kernels and release the program |
| // |
| cl_kernel * kernel_next = hs->kernels.all; |
| |
| // |
| // BS |
| // |
| { |
| hs->kernels.bs = kernel_next; |
| |
| char bs_name[] = { "hs_kernel_bs_X" }; |
| |
| hs_create_kernels(program, |
| kernel_next, |
| bs_name,sizeof(bs_name), |
| count_bs); |
| |
| kernel_next += count_bs; |
| } |
| |
| // |
| // BC |
| // |
| { |
| hs->kernels.bc = kernel_next; |
| |
| char bc_name[] = { "hs_kernel_bc_X" }; |
| |
| hs_create_kernels(program, |
| kernel_next, |
| bc_name,sizeof(bc_name), |
| count_bc); |
| |
| kernel_next += count_bc; |
| } |
| |
| // |
| // FM |
| // |
| if (count_fm[0] > 0) |
| { |
| hs->kernels.fm[0] = kernel_next; |
| |
| char fm_0_name[] = { "hs_kernel_fm_0_X" }; |
| |
| hs_create_kernels(program, |
| kernel_next, |
| fm_0_name,sizeof(fm_0_name), |
| count_fm[0]); |
| |
| kernel_next += count_fm[0]; |
| } |
| |
| if (count_fm[1] > 0) |
| { |
| hs->kernels.fm[1] = kernel_next; |
| |
| char fm_1_name[] = { "hs_kernel_fm_1_X" }; |
| |
| hs_create_kernels(program, |
| kernel_next, |
| fm_1_name,sizeof(fm_1_name), |
| count_fm[1]); |
| |
| kernel_next += count_fm[1]; |
| } |
| |
| if (count_fm[2] > 0) |
| { |
| hs->kernels.fm[2] = kernel_next; |
| |
| char fm_2_name[] = { "hs_kernel_fm_2_X" }; |
| |
| hs_create_kernels(program, |
| kernel_next, |
| fm_2_name,sizeof(fm_2_name), |
| count_fm[2]); |
| |
| kernel_next += count_fm[2]; |
| } |
| |
| // |
| // HM |
| // |
| if (count_hm[0] > 0) |
| { |
| hs->kernels.hm[0] = kernel_next; |
| |
| hs_create_kernel(program, |
| kernel_next, |
| "hs_kernel_hm_0"); |
| |
| kernel_next += count_hm[0]; |
| } |
| |
| if (count_hm[1] > 0) |
| { |
| hs->kernels.hm[1] = kernel_next; |
| |
| hs_create_kernel(program, |
| kernel_next, |
| "hs_kernel_hm_1"); |
| |
| kernel_next += count_hm[1]; |
| } |
| |
| if (count_hm[2] > 0) |
| { |
| hs->kernels.hm[2] = kernel_next; |
| |
| hs_create_kernel(program, |
| kernel_next, |
| "hs_kernel_hm_2"); |
| |
| kernel_next += count_hm[2]; // unnecessary |
| } |
| |
| // |
| // TRANSPOSE |
| // |
| { |
| hs->kernels.transpose = kernel_next; |
| |
| hs_create_kernel(program, |
| kernel_next, |
| "hs_kernel_transpose"); |
| |
| kernel_next += 1; |
| } |
| |
| return hs; |
| } |
| |
| // |
| // |
| // |
| |
| void |
| hs_cl_release(struct hs_cl * const hs) |
| { |
| for (uint32_t ii=0; ii<hs->kernels.count; ii++) |
| cl(ReleaseKernel(hs->kernels.all[ii])); |
| |
| free(hs); |
| } |
| |
| // |
| // |
| // |