| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can |
| * be found in the LICENSE file. |
| * |
| */ |
| |
| #include <stdlib.h> |
| #include <string.h> |
| #include <inttypes.h> |
| |
| #include "common/util.h" |
| #include "common/macros.h" |
| #include "common/vk/assert_vk.h" |
| |
| #include "hs_vk.h" |
| #include "hs_vk_target.h" |
| |
| // |
| // We want concurrent kernel execution to occur in a few places. |
| // |
| // The summary is: |
| // |
| // 1) If necessary, some max-valued keys are written to the end of |
| // the vin/vout buffers. |
| // |
| // 2) Blocks of slabs of keys are sorted. |
| // |
| // 3) If necessary, the blocks of slabs are merged until complete. |
| // |
| // 4) If requested, the slabs will be converted from slab ordering |
| // to linear ordering. |
| // |
| // Below is the general "happens-before" relationship between HotSort |
| // compute kernels. |
| // |
| // Note the diagram assumes vin and vout are different buffers. If |
| // they're not, then the first merge doesn't include the pad_vout |
| // event in the wait list. |
| // |
| // +----------+ +---------+ |
| // | pad_vout | | pad_vin | |
| // +----+-----+ +----+----+ |
| // | | |
| // | WAITFOR(pad_vin) |
| // | | |
| // | +-----v-----+ |
| // | | | |
| // | +----v----+ +----v----+ |
| // | | bs_full | | bs_frac | |
| // | +----+----+ +----+----+ |
| // | | | |
| // | +-----v-----+ |
| // | | |
| // | +------NO------JUST ONE BLOCK? |
| // | / | |
| // |/ YES |
| // + | |
| // | v |
| // | END_WITH_EVENTS(bs_full,bs_frac) |
| // | |
| // | |
| // WAITFOR(pad_vout,bs_full,bs_frac) >>> first iteration of loop <<< |
| // | |
| // | |
| // +-----------<------------+ |
| // | | |
| // +-----v-----+ | |
| // | | | |
| // +----v----+ +----v----+ | |
| // | fm_full | | fm_frac | | |
| // +----+----+ +----+----+ | |
| // | | ^ |
| // +-----v-----+ | |
| // | | |
| // WAITFOR(fm_full,fm_frac) | |
| // | | |
| // v | |
| // +--v--+ WAITFOR(bc) |
| // | hm | | |
| // +-----+ | |
| // | | |
| // WAITFOR(hm) | |
| // | ^ |
| // +--v--+ | |
| // | bc | | |
| // +-----+ | |
| // | | |
| // v | |
| // MERGING COMPLETE?-------NO------+ |
| // | |
| // YES |
| // | |
| // v |
| // END_WITH_EVENTS(bc) |
| // |
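| // A minimal usage sketch of the flow above. The hs, cb, desc_pool, |
| // vin/vout handles and the *_src_* arguments below are assumed to |
| // have been created by the caller and are illustrative only: |
| // |
| // uint32_t padded_in, padded_out; |
| // |
| // hs_vk_pad(hs,count,&padded_in,&padded_out); |
| // |
| // VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool); |
| // |
| // hs_vk_ds_bind(hs,hs_ds,cb,vin,vout); |
| // |
| // hs_vk_sort(hs,cb, |
| // vin, vin_src_stage, vin_src_access, |
| // vout,vout_src_stage,vout_src_access, |
| // count,padded_in,padded_out, |
| // true); // linearize the slabs |
| // |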
| |
| struct hs_vk |
| { |
| VkAllocationCallbacks const * allocator; |
| VkDevice device; |
| |
| struct { |
| struct { |
| VkDescriptorSetLayout vout_vin; |
| } layout; |
| } desc_set; |
| |
| struct { |
| struct { |
| VkPipelineLayout vout_vin; |
| } layout; |
| } pipeline; |
| |
| struct hs_vk_target_config config; |
| |
| uint32_t key_val_size; |
| uint32_t slab_keys; |
| uint32_t bs_slabs_log2_ru; |
| uint32_t bc_slabs_log2_max; |
| |
| struct { |
| uint32_t count; |
| VkPipeline * bs; |
| VkPipeline * bc; |
| VkPipeline * fm[3]; |
| VkPipeline * hm[3]; |
| VkPipeline * transpose; |
| VkPipeline all[]; |
| } pipelines; |
| }; |
| |
| // |
| // |
| // |
| |
| struct hs_state |
| { |
| VkCommandBuffer cb; |
| |
| // If sorting in-place, then vout == vin |
| VkBuffer vout; |
| VkBuffer vin; |
| |
| // bx_ru is the rounded-up count of slabs in vin |
| uint32_t bx_ru; |
| }; |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_barrier_compute_w_to_compute_r(struct hs_state * const state) |
| { |
| static VkMemoryBarrier const shader_w_to_r = { |
| .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, |
| .pNext = NULL, |
| .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, |
| .dstAccessMask = VK_ACCESS_SHADER_READ_BIT |
| }; |
| |
| vkCmdPipelineBarrier(state->cb, |
| VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, |
| VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, |
| 0, |
| 1, |
| &shader_w_to_r, |
| 0, |
| NULL, |
| 0, |
| NULL); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_barrier_to_compute_r(struct hs_state * const state, |
| VkPipelineStageFlags const src_stage, |
| VkAccessFlagBits const src_access) |
| { |
| if (src_stage == 0) |
| return; |
| |
| VkMemoryBarrier const compute_r = { |
| .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, |
| .pNext = NULL, |
| .srcAccessMask = src_access, |
| .dstAccessMask = VK_ACCESS_SHADER_READ_BIT |
| }; |
| |
| vkCmdPipelineBarrier(state->cb, |
| src_stage, |
| VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, |
| 0, |
| 1, |
| &compute_r, |
| 0, |
| NULL, |
| 0, |
| NULL); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_barrier_to_transfer_fill(struct hs_state * const state, |
| VkPipelineStageFlags const src_stage, |
| VkAccessFlagBits const src_access) |
| { |
| if (src_stage == 0) |
| return; |
| |
| VkMemoryBarrier const fill_w = { |
| .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, |
| .pNext = NULL, |
| .srcAccessMask = src_access, |
| .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT |
| }; |
| |
| vkCmdPipelineBarrier(state->cb, |
| src_stage, |
| VK_PIPELINE_STAGE_TRANSFER_BIT, |
| 0, |
| 1, |
| &fill_w, |
| 0, |
| NULL, |
| 0, |
| NULL); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_transpose(struct hs_vk const * const hs, |
| struct hs_state * const state) |
| { |
| hs_barrier_compute_w_to_compute_r(state); |
| |
| vkCmdBindPipeline(state->cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| hs->pipelines.transpose[0]); |
| |
| vkCmdDispatch(state->cb,state->bx_ru,1,1); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_bc(struct hs_vk const * const hs, |
| struct hs_state * const state, |
| uint32_t const down_slabs, |
| uint32_t const clean_slabs_log2) |
| { |
| hs_barrier_compute_w_to_compute_r(state); |
| |
| // block clean the down_slabs in spans of 1<<clean_slabs_log2 slabs |
| uint32_t const frac_ru = (1u << clean_slabs_log2) - 1; |
| uint32_t const full_bc = (down_slabs + frac_ru) >> clean_slabs_log2; |
| |
| vkCmdBindPipeline(state->cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| hs->pipelines.bc[clean_slabs_log2]); |
| |
| vkCmdDispatch(state->cb,full_bc,1,1); |
| } |
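| |
| // |
| // For example (illustrative values): down_slabs = 32 and |
| // clean_slabs_log2 = 3 yield frac_ru = 7 and full_bc = (32+7)>>3 = 4, |
| // so bc[3] is dispatched across 4 workgroups. |
| // |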
| |
| // |
| // |
| // |
| |
| static |
| uint32_t |
| hs_hm(struct hs_vk const * const hs, |
| struct hs_state * const state, |
| uint32_t const down_slabs, |
| uint32_t const clean_slabs_log2) |
| { |
| hs_barrier_compute_w_to_compute_r(state); |
| |
| // how many scaled half-merge spans are there? |
| uint32_t const frac_ru = (1u << clean_slabs_log2) - 1; |
| uint32_t const spans = (down_slabs + frac_ru) >> clean_slabs_log2; |
| |
| // for now, just clamp to the max |
| uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max; |
| uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem); |
| uint32_t const log2_out = log2_rem - scale_log2; |
| |
| // size the grid |
| uint32_t const slab_span = hs->config.slab.height << log2_out; |
| |
| vkCmdBindPipeline(state->cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| hs->pipelines.hm[scale_log2][0]); |
| |
| vkCmdDispatch(state->cb,slab_span,spans,1); |
| |
| return log2_out; |
| } |
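| |
| // |
| // For example (illustrative values): with bc_slabs_log2_max = 4, |
| // hm.scale_max = 1 and slab.height = 16, calling with down_slabs = 64 |
| // and clean_slabs_log2 = 6 yields spans = 1, scale_log2 = 1 and |
| // slab_span = 32, so hm[1] is dispatched as (32,1,1) and log2_out = 1 |
| // is returned for the block cleaner to finish. |
| // |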
| |
| // |
| // FIXME -- some of this logic can be skipped if BS is a power-of-two |
| // |
| |
| static |
| uint32_t |
| hs_fm(struct hs_vk const * const hs, |
| struct hs_state * const state, |
| uint32_t * const down_slabs, |
| uint32_t const up_scale_log2) |
| { |
| // |
| // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes |
| // a performance win to bias toward launching the smaller flip merge |
| // kernel in order to get more warps in flight (increased |
| // occupancy). This is useful when merging small numbers of slabs. |
| // |
| // Note that config.merge.fm.scale_min will always be 0 or 1. |
| // |
| // So, for now, just clamp to the max until there is a reason to |
| // restore the fancier and probably low-impact approach. |
| // |
| uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2); |
| uint32_t const clean_log2 = up_scale_log2 - scale_log2; |
| |
| // number of slabs in a full-sized scaled flip-merge span |
| uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2; |
| |
| // how many full-sized scaled flip-merge spans are there? |
| uint32_t full_fm = state->bx_ru / full_span_slabs; |
| uint32_t frac_fm = 0; |
| |
| // initialize down_slabs |
| *down_slabs = full_fm * full_span_slabs; |
| |
| // how many half-size scaled + fractional scaled spans are there? |
| uint32_t const span_rem = state->bx_ru - *down_slabs; |
| uint32_t const half_span_slabs = full_span_slabs >> 1; |
| |
| // if we have over a half-span then fractionally merge it |
| if (span_rem > half_span_slabs) |
| { |
| // the remaining slabs will be cleaned |
| *down_slabs += span_rem; |
| |
| uint32_t const frac_rem = span_rem - half_span_slabs; |
| uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem); |
| |
| if (frac_rem_pow2 >= half_span_slabs) |
| { |
| // bump it up to a full span |
| full_fm += 1; |
| } |
| else |
| { |
| // otherwise, add fractional |
| frac_fm = MAX_MACRO(1,frac_rem_pow2 >> clean_log2); |
| } |
| } |
| |
| // |
| // Size the grid |
| // |
| // The simplifying choices below limit the maximum number of keys |
| // that can be sorted with this grid scheme to ~2 billion. |
| // |
| // .x : slab height << clean_log2 -- this is the slab span |
| // .y : [1...65535] -- this is the span index |
| // .z : ( this could also be used to further expand .y ) |
| // |
| // Note that OpenCL declares a grid in terms of global threads and |
| // not grids and blocks |
| // |
| |
| uint32_t const slab_span = hs->config.slab.height << clean_log2; |
| |
| if (full_fm > 0) |
| { |
| uint32_t const full_idx = hs->bs_slabs_log2_ru - 1 + scale_log2; |
| |
| vkCmdBindPipeline(state->cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| hs->pipelines.fm[scale_log2][full_idx]); |
| |
| vkCmdDispatch(state->cb,slab_span,full_fm,1); |
| } |
| |
| if (frac_fm > 0) |
| { |
| vkCmdBindPipeline(state->cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| hs->pipelines.fm[scale_log2][msb_idx_u32(frac_fm)]); |
| |
| vkCmdDispatchBase(state->cb, |
| 0,full_fm,0, |
| slab_span,1,1); |
| } |
| |
| return clean_log2; |
| } |
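| |
| // |
| // For example (illustrative values): with block.slabs = 16, |
| // slab.height = 16, fm.scale_max = 1, bs_slabs_log2_ru = 4, |
| // bx_ru = 40 and up_scale_log2 = 1: scale_log2 = 1, clean_log2 = 0, |
| // full_span_slabs = 32 and full_fm = 1; span_rem = 8 is not greater |
| // than half_span_slabs = 16, so only fm[1][4] is dispatched as |
| // (16,1,1), *down_slabs = 32, and clean_log2 = 0 is returned. |
| // |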
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_bs(struct hs_vk const * const hs, |
| struct hs_state * const state, |
| uint32_t const count_padded_in) |
| { |
| uint32_t const slabs_in = count_padded_in / hs->slab_keys; |
| uint32_t const full_bs = slabs_in / hs->config.block.slabs; |
| uint32_t const frac_bs = slabs_in - full_bs * hs->config.block.slabs; |
| |
| if (full_bs > 0) |
| { |
| vkCmdBindPipeline(state->cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| hs->pipelines.bs[hs->bs_slabs_log2_ru]); |
| |
| vkCmdDispatch(state->cb,full_bs,1,1); |
| } |
| |
| if (frac_bs > 0) |
| { |
| uint32_t const frac_idx = msb_idx_u32(frac_bs); |
| uint32_t const full_to_frac_log2 = hs->bs_slabs_log2_ru - frac_idx; |
| |
| vkCmdBindPipeline(state->cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| hs->pipelines.bs[frac_idx]); |
| |
| vkCmdDispatchBase(state->cb, |
| full_bs<<full_to_frac_log2,0,0, |
| 1,1,1); |
| } |
| } |
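| |
| // |
| // For example (illustrative values): with slab_keys = 256, |
| // block.slabs = 16 and bs_slabs_log2_ru = 4, count_padded_in = 10240 |
| // yields slabs_in = 40, full_bs = 2 and frac_bs = 8, so bs[4] is |
| // dispatched as (2,1,1) and bs[3] is dispatched with a base x |
| // workgroup of 2<<1 = 4. |
| // |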
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_keyset_pre_fm(struct hs_vk const * const hs, |
| struct hs_state * const state, |
| uint32_t const count_lo, |
| uint32_t const count_hi) |
| { |
| uint32_t const vout_span = count_hi - count_lo; |
| |
| vkCmdFillBuffer(state->cb, |
| state->vout, |
| count_lo * hs->key_val_size, |
| vout_span * hs->key_val_size, |
| UINT32_MAX); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hs_keyset_pre_bs(struct hs_vk const * const hs, |
| struct hs_state * const state, |
| uint32_t const count, |
| uint32_t const count_hi) |
| { |
| uint32_t const vin_span = count_hi - count; |
| |
| vkCmdFillBuffer(state->cb, |
| state->vin, |
| count * hs->key_val_size, |
| vin_span * hs->key_val_size, |
| UINT32_MAX); |
| } |
| |
| // |
| // |
| // |
| |
| void |
| hs_vk_ds_bind(struct hs_vk const * const hs, |
| VkDescriptorSet hs_ds, |
| VkCommandBuffer cb, |
| VkBuffer vin, |
| VkBuffer vout) |
| { |
| // |
| // initialize the HotSort descriptor set |
| // |
| VkDescriptorBufferInfo const dbi[] = { |
| { |
| .buffer = vout == VK_NULL_HANDLE ? vin : vout, |
| .offset = 0, |
| .range = VK_WHOLE_SIZE |
| }, |
| { |
| .buffer = vin, |
| .offset = 0, |
| .range = VK_WHOLE_SIZE |
| } |
| }; |
| |
| VkWriteDescriptorSet const wds[] = { |
| { |
| .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, |
| .pNext = NULL, |
| .dstSet = hs_ds, |
| .dstBinding = 0, |
| .dstArrayElement = 0, |
| .descriptorCount = 2, |
| .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| .pImageInfo = NULL, |
| .pBufferInfo = dbi, |
| .pTexelBufferView = NULL |
| } |
| }; |
| |
| vkUpdateDescriptorSets(hs->device, |
| ARRAY_LENGTH_MACRO(wds), |
| wds, |
| 0, |
| NULL); |
| |
| // |
| // All HotSort kernels can use the same descriptor set: |
| // |
| // { |
| // HS_KEY_TYPE vout[]; |
| // HS_KEY_TYPE vin[]; |
| // } |
| // |
| // Note that only the bs() kernels read from vin[]. |
| // |
| vkCmdBindDescriptorSets(cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| hs->pipeline.layout.vout_vin, |
| 0, |
| 1, |
| &hs_ds, |
| 0, |
| NULL); |
| } |
| |
| // |
| // |
| // |
| |
| void |
| hs_vk_sort(struct hs_vk const * const hs, |
| VkCommandBuffer cb, |
| VkBuffer vin, |
| VkPipelineStageFlags const vin_src_stage, |
| VkAccessFlagBits const vin_src_access, |
| VkBuffer vout, |
| VkPipelineStageFlags const vout_src_stage, |
| VkAccessFlagBits const vout_src_access, |
| uint32_t const count, |
| uint32_t const count_padded_in, |
| uint32_t const count_padded_out, |
| bool const linearize) |
| { |
| // is this sort in place? |
| bool const is_in_place = (vout == VK_NULL_HANDLE); |
| |
| // |
| // create some common state |
| // |
| struct hs_state state = { |
| .cb = cb, |
| .vin = vin, |
| .vout = is_in_place ? vin : vout, |
| .bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys |
| }; |
| |
| // how much of vin must be initialized before sorting? |
| uint32_t const count_hi = is_in_place ? count_padded_out : count_padded_in; |
| bool const is_pre_sort_reqd = count_hi > count; |
| bool const is_pre_merge_reqd = !is_in_place && (count_padded_out > count_padded_in); |
| |
| // |
| // pre-sort keyset needs to happen before bs() |
| // pre-merge keyset needs to happen before fm() |
| // |
| |
| VkPipelineStageFlags bs_src_stage = 0; |
| VkAccessFlagBits bs_src_access = 0; |
| |
| // initialize any trailing keys in vin before sorting |
| if (is_pre_sort_reqd) |
| { |
| hs_barrier_to_transfer_fill(&state,vin_src_stage,vin_src_access); |
| |
| hs_keyset_pre_bs(hs,&state,count,count_hi); |
| |
| bs_src_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; |
| bs_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT; |
| } |
| else |
| { |
| bs_src_stage = vin_src_stage; |
| bs_src_access = vin_src_access; |
| } |
| |
| hs_barrier_to_compute_r(&state,bs_src_stage,bs_src_access); |
| |
| // sort blocks of slabs -- must follow hs_keyset_pre_bs() |
| hs_bs(hs,&state,count_padded_in); |
| |
| VkPipelineStageFlags fm_src_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| VkAccessFlagBits fm_src_access = VK_ACCESS_SHADER_READ_BIT; |
| |
| // initialize any trailing keys in vout before merging |
| if (is_pre_merge_reqd) |
| { |
| hs_barrier_to_transfer_fill(&state,vout_src_stage,vout_src_access); |
| |
| hs_keyset_pre_fm(hs,&state,count_padded_in,count_padded_out); |
| |
| fm_src_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; |
| fm_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT; |
| } |
| else |
| { |
| fm_src_stage |= vout_src_stage; |
| fm_src_access |= vout_src_access; |
| } |
| |
| // |
| // merging is only required if there was more than one bs block |
| // |
| if (state.bx_ru > hs->config.block.slabs) |
| { |
| hs_barrier_to_compute_r(&state,fm_src_stage,fm_src_access); |
| |
| // |
| // merge sorted spans of slabs until done |
| // |
| int32_t up_scale_log2 = 1; |
| |
| while (true) |
| { |
| uint32_t down_slabs; |
| |
| // flip merge slabs -- return span of slabs that must be cleaned |
| uint32_t clean_slabs_log2 = hs_fm(hs,&state, |
| &down_slabs, |
| up_scale_log2); |
| |
| // if the span is greater than the largest block cleaner then half-merge |
| while (clean_slabs_log2 > hs->bc_slabs_log2_max) |
| { |
| clean_slabs_log2 = hs_hm(hs,&state, |
| down_slabs, |
| clean_slabs_log2); |
| } |
| |
| // launch the slab block cleaner grid |
| hs_bc(hs,&state,down_slabs,clean_slabs_log2); |
| |
| // was this the final block clean? |
| if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru) |
| break; |
| |
| // otherwise, merge twice as many slabs |
| up_scale_log2 += 1; |
| |
| // drop a barrier |
| hs_barrier_compute_w_to_compute_r(&state); |
| } |
| } |
| |
| // slabs or linear? |
| if (linearize) |
| hs_transpose(hs,&state); |
| } |
| |
| // |
| // |
| // |
| |
| #ifdef HS_VK_VERBOSE_STATISTICS_AMD |
| |
| #include <stdio.h> |
| |
| static |
| void |
| hs_vk_verbose_statistics_amd(VkDevice device, struct hs_vk const * const hs) |
| { |
| PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD = |
| (PFN_vkGetShaderInfoAMD) |
| vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD"); |
| |
| if (vkGetShaderInfoAMD == NULL) |
| return; |
| |
| fprintf(stdout, |
| " PHY PHY AVAIL AVAIL\n" |
| "VGPRs SGPRs LDS_MAX LDS/WG SPILL VGPRs SGPRs VGPRs SGPRs WORKGROUP_SIZE\n"); |
| |
| for (uint32_t ii=0; ii<hs->pipelines.count; ii++) |
| { |
| VkShaderStatisticsInfoAMD ssi_amd; |
| size_t ssi_amd_size = sizeof(ssi_amd); |
| |
| if (vkGetShaderInfoAMD(hs->device, |
| hs->pipelines.all[ii], |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| VK_SHADER_INFO_TYPE_STATISTICS_AMD, |
| &ssi_amd_size, |
| &ssi_amd) == VK_SUCCESS) |
| { |
| fprintf(stdout, |
| "%5" PRIu32 " " |
| "%5" PRIu32 " " |
| "%5" PRIu32 " " |
| |
| "%6zu " |
| "%6zu " |
| |
| "%5" PRIu32 " " |
| "%5" PRIu32 " " |
| "%5" PRIu32 " " |
| "%5" PRIu32 " " |
| |
| "( %6" PRIu32 ", " "%6" PRIu32 ", " "%6" PRIu32 " )\n", |
| ssi_amd.resourceUsage.numUsedVgprs, |
| ssi_amd.resourceUsage.numUsedSgprs, |
| ssi_amd.resourceUsage.ldsSizePerLocalWorkGroup, |
| ssi_amd.resourceUsage.ldsUsageSizeInBytes, // size_t |
| ssi_amd.resourceUsage.scratchMemUsageInBytes, // size_t |
| ssi_amd.numPhysicalVgprs, |
| ssi_amd.numPhysicalSgprs, |
| ssi_amd.numAvailableVgprs, |
| ssi_amd.numAvailableSgprs, |
| ssi_amd.computeWorkGroupSize[0], |
| ssi_amd.computeWorkGroupSize[1], |
| ssi_amd.computeWorkGroupSize[2]); |
| } |
| } |
| } |
| |
| #endif |
| |
| // |
| // |
| // |
| |
| #ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD |
| |
| #include <stdio.h> |
| |
| static |
| void |
| hs_vk_verbose_disassembly_amd(VkDevice device, struct hs_vk const * const hs) |
| { |
| PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD = |
| (PFN_vkGetShaderInfoAMD) |
| vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD"); |
| |
| if (vkGetShaderInfoAMD == NULL) |
| return; |
| |
| for (uint32_t ii=0; ii<hs->pipelines.count; ii++) |
| { |
| size_t disassembly_amd_size; |
| |
| if (vkGetShaderInfoAMD(hs->device, |
| hs->pipelines.all[ii], |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD, |
| &disassembly_amd_size, |
| NULL) == VK_SUCCESS) |
| { |
| void * disassembly_amd = malloc(disassembly_amd_size); |
| |
| if (disassembly_amd == NULL) |
| continue; |
| |
| if (vkGetShaderInfoAMD(hs->device, |
| hs->pipelines.all[ii], |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD, |
| &disassembly_amd_size, |
| disassembly_amd) == VK_SUCCESS) |
| { |
| fprintf(stdout,"%s",(char*)disassembly_amd); |
| } |
| |
| free(disassembly_amd); |
| } |
| } |
| } |
| |
| #endif |
| |
| // |
| // |
| // |
| |
| struct hs_vk * |
| hs_vk_create(struct hs_vk_target const * const target, |
| VkDevice device, |
| VkAllocationCallbacks const * allocator, |
| VkPipelineCache pipeline_cache) |
| { |
| // |
| // we reference these values a lot |
| // |
| uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs)); |
| uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs)); |
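| |
| // e.g. block.slabs = 12 (illustrative) would yield bs_slabs_log2_ru = 4 |
| // and bc_slabs_log2_max = 3 |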
| |
| // |
| // how many kernels will be created? |
| // |
| uint32_t const count_bs = bs_slabs_log2_ru + 1; |
| uint32_t const count_bc = bc_slabs_log2_max + 1; |
| uint32_t count_fm[3] = { 0 }; |
| uint32_t count_hm[3] = { 0 }; |
| |
| // guaranteed to be in range [0,2] |
| for (uint32_t scale = target->config.merge.fm.scale_min; |
| scale <= target->config.merge.fm.scale_max; |
| scale++) |
| { |
| uint32_t fm_left = (target->config.block.slabs / 2) << scale; |
| |
| count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1; |
| } |
| |
| // guaranteed to be in range [0,2] |
| for (uint32_t scale = target->config.merge.hm.scale_min; |
| scale <= target->config.merge.hm.scale_max; |
| scale++) |
| { |
| count_hm[scale] = 1; |
| } |
| |
| uint32_t const count_bc_fm_hm_transpose = |
| count_bc + |
| count_fm[0] + count_fm[1] + count_fm[2] + |
| count_hm[0] + count_hm[1] + count_hm[2] + |
| 1; // transpose |
| |
| uint32_t const count_all = count_bs + count_bc_fm_hm_transpose; |
| |
| // |
| // allocate hs_vk |
| // |
| struct hs_vk * hs; |
| |
| // note: the flexible array holds VkPipeline handles, which are |
| // 64-bit even on 32-bit platforms, so size it with |
| // sizeof(VkPipeline) rather than a pointer type |
| if (allocator == NULL) |
| { |
| hs = malloc(sizeof(*hs) + sizeof(VkPipeline) * count_all); |
| } |
| else |
| { |
| hs = allocator->pfnAllocation(NULL, |
| sizeof(*hs) + sizeof(VkPipeline) * count_all, |
| 0, |
| VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); |
| } |
| |
| // save device & allocator |
| hs->device = device; |
| hs->allocator = allocator; |
| |
| // |
| // create one descriptor set layout |
| // |
| static VkDescriptorSetLayoutBinding const dslb_vout_vin[] = { |
| { |
| .binding = 0, // vout |
| .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| .descriptorCount = 1, |
| .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| .pImmutableSamplers = NULL |
| }, |
| { |
| .binding = 1, // vin |
| .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| .descriptorCount = 1, |
| .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |
| .pImmutableSamplers = NULL |
| } |
| }; |
| |
| static VkDescriptorSetLayoutCreateInfo const dscli = { |
| .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, |
| .pNext = NULL, |
| .flags = 0, |
| .bindingCount = 2, // 0:vout[], 1:vin[] |
| .pBindings = dslb_vout_vin |
| }; |
| |
| vk(CreateDescriptorSetLayout(device, |
| &dscli, |
| allocator, |
| &hs->desc_set.layout.vout_vin)); |
| |
| // |
| // create one pipeline layout |
| // |
| VkPipelineLayoutCreateInfo plci = { |
| .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, |
| .pNext = NULL, |
| .flags = 0, |
| .setLayoutCount = 1, |
| .pSetLayouts = &hs->desc_set.layout.vout_vin, |
| .pushConstantRangeCount = 0, |
| .pPushConstantRanges = NULL |
| }; |
| |
| vk(CreatePipelineLayout(device, |
| &plci, |
| allocator, |
| &hs->pipeline.layout.vout_vin)); |
| |
| // |
| // copy the config from the target -- we need these values later |
| // |
| memcpy(&hs->config,&target->config,sizeof(hs->config)); |
| |
| // save some frequently used calculated values |
| hs->key_val_size = (target->config.words.key + target->config.words.val) * 4; |
| hs->slab_keys = target->config.slab.height << target->config.slab.width_log2; |
| hs->bs_slabs_log2_ru = bs_slabs_log2_ru; |
| hs->bc_slabs_log2_max = bc_slabs_log2_max; |
| |
| // save kernel count |
| hs->pipelines.count = count_all; |
| |
| // |
| // create all the compute pipelines by reusing this info |
| // |
| VkComputePipelineCreateInfo cpci = { |
| .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, |
| .pNext = NULL, |
| .flags = VK_PIPELINE_CREATE_DISPATCH_BASE, // | VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT, |
| .stage = { |
| .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, |
| .pNext = NULL, |
| .flags = 0, |
| .stage = VK_SHADER_STAGE_COMPUTE_BIT, |
| .module = VK_NULL_HANDLE, |
| .pName = "main", |
| .pSpecializationInfo = NULL |
| }, |
| .layout = hs->pipeline.layout.vout_vin, |
| .basePipelineHandle = VK_NULL_HANDLE, |
| .basePipelineIndex = 0 |
| }; |
| |
| // |
| // Create a shader module, use it to create a pipeline... and |
| // dispose of the shader module. |
| // |
| // All of the compute shaders are created with the same (vout,vin) |
| // pipeline layout. |
| // |
| VkShaderModuleCreateInfo smci = { |
| .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, |
| .pNext = NULL, |
| .flags = 0, |
| .codeSize = 0, |
| .pCode = (uint32_t const *)target->modules // FIXME -- unfortunate typecast |
| }; |
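| |
| // |
| // As the loop below implies, target->modules is a concatenation of |
| // SPIR-V modules, each prefixed with its big-endian size in bytes: |
| // |
| // [ size0 | spirv0 words... | size1 | spirv1 words... | ... ] |
| // |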
| |
| // |
| // create one pipeline per module |
| // |
| for (uint32_t ii=0; ii<count_all; ii++) |
| { |
| // advance past the previous module -- codeSize is in bytes |
| uint32_t const * const module = smci.pCode + smci.codeSize / sizeof(*module); |
| |
| smci.codeSize = NTOHL_MACRO(module[0]); |
| smci.pCode = module + 1; |
| |
| vk(CreateShaderModule(device, |
| &smci, |
| allocator, |
| &cpci.stage.module)); |
| |
| vk(CreateComputePipelines(device, |
| pipeline_cache, |
| 1, |
| &cpci, |
| allocator, |
| hs->pipelines.all+ii)); |
| |
| vkDestroyShaderModule(device, |
| cpci.stage.module, |
| allocator); |
| } |
| |
| // |
| // initialize pointers to pipeline handles |
| // |
| VkPipeline * pipeline_next = hs->pipelines.all; |
| |
| // BS |
| hs->pipelines.bs = pipeline_next; |
| pipeline_next += count_bs; |
| |
| // BC |
| hs->pipelines.bc = pipeline_next; |
| pipeline_next += count_bc; |
| |
| // FM[0] |
| hs->pipelines.fm[0] = count_fm[0] ? pipeline_next : NULL; |
| pipeline_next += count_fm[0]; |
| |
| // FM[1] |
| hs->pipelines.fm[1] = count_fm[1] ? pipeline_next : NULL; |
| pipeline_next += count_fm[1]; |
| |
| // FM[2] |
| hs->pipelines.fm[2] = count_fm[2] ? pipeline_next : NULL; |
| pipeline_next += count_fm[2]; |
| |
| // HM[0] |
| hs->pipelines.hm[0] = count_hm[0] ? pipeline_next : NULL; |
| pipeline_next += count_hm[0]; |
| |
| // HM[1] |
| hs->pipelines.hm[1] = count_hm[1] ? pipeline_next : NULL; |
| pipeline_next += count_hm[1]; |
| |
| // HM[2] |
| hs->pipelines.hm[2] = count_hm[2] ? pipeline_next : NULL; |
| pipeline_next += count_hm[2]; |
| |
| // TRANSPOSE |
| hs->pipelines.transpose = pipeline_next; |
| pipeline_next += 1; |
| |
| // |
| // optionally dump pipeline stats |
| // |
| #ifdef HS_VK_VERBOSE_STATISTICS_AMD |
| hs_vk_verbose_statistics_amd(device,hs); |
| #endif |
| #ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD |
| hs_vk_verbose_disassembly_amd(device,hs); |
| #endif |
| |
| // |
| // |
| // |
| |
| return hs; |
| } |
| |
| // |
| // |
| // |
| |
| void |
| hs_vk_release(struct hs_vk * const hs) |
| { |
| vkDestroyDescriptorSetLayout(hs->device, |
| hs->desc_set.layout.vout_vin, |
| hs->allocator); |
| |
| vkDestroyPipelineLayout(hs->device, |
| hs->pipeline.layout.vout_vin, |
| hs->allocator); |
| |
| for (uint32_t ii=0; ii<hs->pipelines.count; ii++) |
| { |
| vkDestroyPipeline(hs->device, |
| hs->pipelines.all[ii], |
| hs->allocator); |
| } |
| |
| if (hs->allocator == NULL) |
| { |
| free(hs); |
| } |
| else |
| { |
| hs->allocator->pfnFree(NULL,hs); |
| } |
| } |
| |
| // |
| // Allocate a per-thread descriptor set for the vin and vout |
| // VkBuffers. Note that HotSort uses only one descriptor set. |
| // |
| |
| VkDescriptorSet |
| hs_vk_ds_alloc(struct hs_vk const * const hs, VkDescriptorPool desc_pool) |
| { |
| VkDescriptorSetAllocateInfo const ds_alloc_info = { |
| .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, |
| .pNext = NULL, |
| .descriptorPool = desc_pool, |
| .descriptorSetCount = 1, |
| .pSetLayouts = &hs->desc_set.layout.vout_vin |
| }; |
| |
| VkDescriptorSet hs_ds; |
| |
| vk(AllocateDescriptorSets(hs->device, |
| &ds_alloc_info, |
| &hs_ds)); |
| |
| return hs_ds; |
| } |
| |
| // |
| // |
| // |
| |
| void |
| hs_vk_pad(struct hs_vk const * const hs, |
| uint32_t const count, |
| uint32_t * const count_padded_in, |
| uint32_t * const count_padded_out) |
| { |
| // |
| // round up the count to slabs |
| // |
| uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys; |
| uint32_t const blocks = slabs_ru / hs->config.block.slabs; |
| uint32_t const block_slabs = blocks * hs->config.block.slabs; |
| uint32_t const slabs_ru_rem = slabs_ru - block_slabs; |
| uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs); |
| |
| *count_padded_in = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys; |
| *count_padded_out = *count_padded_in; |
| |
| // |
| // will merging be required? |
| // |
| if (slabs_ru > hs->config.block.slabs) |
| { |
| // more than one block |
| uint32_t const blocks_lo = pow2_rd_u32(blocks); |
| uint32_t const block_slabs_lo = blocks_lo * hs->config.block.slabs; |
| uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo; |
| |
| if (block_slabs_rem > 0) |
| { |
| uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem); |
| |
| uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru, |
| blocks_lo << (1 - hs->config.merge.fm.scale_min)); |
| |
| uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi, |
| block_slabs_lo*2); // clamp non-pow2 blocks |
| |
| *count_padded_out = block_slabs_padded_out * hs->slab_keys; |
| } |
| } |
| } |
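| |
| // |
| // For example (illustrative values): with slab_keys = 256, |
| // block.slabs = 16 and fm.scale_min = 1, a count of 3000 rounds up to |
| // slabs_ru = 12 and both padded counts are 16 * 256 = 4096 with no |
| // merging required, while a count of 10000 rounds up to slabs_ru = 40 |
| // and both padded counts are 40 * 256 = 10240. |
| // |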
| |
| // |
| // |
| // |