/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

#pragma once

//
// TODO:
//
//   Add Key-Val sorting support -- easy.
//

#include <stdio.h>
#include <stdint.h>

//
// All code generation is driven by the specified architectural
// details and host platform API.
//
// In general, the warps-per-block and keys-per-thread are the
// critical knobs for tuning performance.
//
// NOTE(review): the per-group notes below are inferred from the
// field names only -- confirm against the generator before relying
// on them.
//

struct hsg_config
{
  // merge parameters: one { warps, lo, hi } triple each for the
  // "flip" and "half" variants, plus a log2 upper bound
  struct {

    struct {
      uint32_t warps;
      uint32_t lo;
      uint32_t hi;
    } flip;

    struct {
      uint32_t warps;
      uint32_t lo;
      uint32_t hi;
    } half;

    uint32_t max_log2;

  } merge;

  // per-block limits: warp count bounds/modulus and shared memory
  // sizing (presumably "bs"/"bc" match the BS/BC kernel op prefixes)
  struct {
    uint32_t warps_min;
    uint32_t warps_max;
    uint32_t warps_mod;

    uint32_t smem_min;
    uint32_t smem_quantum;

    uint32_t smem_bs;
    uint32_t smem_bc;
  } block;

  // warp geometry: lane count and its log2
  struct {
    uint32_t lanes;
    uint32_t lanes_log2;
    uint32_t skpw_bs;
  } warp;

  // per-thread register budget
  struct {
    uint32_t regs;
    uint32_t xtra;
  } thread;

  // key type size, expressed in words
  struct {
    uint32_t words;
  } type;
};

//
// HotSort can merge non-power-of-two blocks of warps
//
// Per-level merge description.  Every array below has two entries.
// The "active" union lets the same 8 bytes be accessed either as a
// single 64-bit value or as two 32-bit words.
//

struct hsg_level
{
  uint32_t count; // networks >= 2

  uint32_t diffs        [2];
  uint32_t diff_masks   [2];
  uint32_t evenodds     [2];
  uint32_t evenodd_masks[2];
  uint32_t networks     [2];

  union {
    uint64_t b64;
    uint32_t b32a2[2];
  } active;
};

//
// Merge level limits: MERGE_LEVELS_MAX_SIZE is 2^MERGE_LEVELS_MAX_LOG2.
//

#define MERGE_LEVELS_MAX_LOG2  7 // merge up to 128 warps
#define MERGE_LEVELS_MAX_SIZE  (1 << MERGE_LEVELS_MAX_LOG2)

| // |
| // This is computed |
| // |
| |
| struct hsg_merge |
| { |
| uint32_t offsets [MERGE_LEVELS_MAX_SIZE]; |
| uint32_t networks[MERGE_LEVELS_MAX_SIZE]; |
| |
| struct hsg_level levels[MERGE_LEVELS_MAX_LOG2]; |
| |
| uint32_t index; |
| |
| uint32_t warps; |
| |
| uint32_t rows_bs; |
| uint32_t rows_bc; |
| |
| uint32_t skpw_bc; |
| }; |
| |
//
// Disabled: everything between "#if 0" and "#endif" is compiled out.
//

#if 0

#define HSG_FILE_NAME_SIZE  80

struct hsg_file
{
  FILE       * file;
  char const * prefix;
  char         name[HSG_FILE_NAME_SIZE];
};

//
//
//

typedef enum hsg_file_type {

  HSG_FILE_TYPE_HEADER,
  HSG_FILE_TYPE_SOURCE,

  HSG_FILE_TYPE_COUNT

} hsg_file_type;

#endif

//
// X-macro list of every op type.
//
// Define HSG_OP_EXPAND_X(t) before expanding HSG_OP_EXPAND_ALL().
//
// HSG_OP_TYPE_COUNT must remain the last entry so that, when the
// list is expanded into an enum, it equals the number of op types
// that precede it.
//

#define HSG_OP_EXPAND_ALL()                                     \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_EXIT)                             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_END)                              \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN)                            \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE)                             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN)                     \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END)                       \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO)           \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE)        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY)            \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_LOAD)               \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_STORE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT)          \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT)        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD)               \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP)                        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF)                        \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP)                         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF)                         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_XCHG)                         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_V)            \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_V)             \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_SHARED_LOAD_V)             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT)        \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT)          \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT)         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT)          \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BLOCK_SYNC)                       \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_FRAC_PRED)                     \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_MERGE_H_PREAMBLE)              \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_MERGE_H_PREAMBLE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_MERGE_H_PRED)                  \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED)                   \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT)

//
// Expand the list into enumerators: each entry becomes "t ,", so the
// values run sequentially from HSG_OP_TYPE_EXIT == 0.
//
// HSG_OP_EXPAND_X is left defined as "t ," after this point; any
// later expansion with a different meaning must #undef it first.
//

#undef  HSG_OP_EXPAND_X
#define HSG_OP_EXPAND_X(t)  t ,

typedef enum hsg_op_type {

  HSG_OP_EXPAND_ALL()

} hsg_op_type;

| // |
| // |
| // |
| |
| struct hsg_op |
| { |
| hsg_op_type type; |
| |
| union { |
| |
| struct { |
| int32_t a; |
| int32_t b; |
| int32_t c; |
| }; |
| |
| struct { |
| int32_t n; |
| int32_t v; |
| }; |
| |
| struct { |
| int32_t m; |
| int32_t w; |
| }; |
| |
| }; |
| }; |
| |
//
// Table of op type names; declared here, defined in another
// translation unit.
//
// NOTE(review): presumably indexed by hsg_op_type -- confirm against
// the definition.
//

extern char const * const hsg_op_type_string[];

//
// Per-target context handed to every target function.
//
// "state" points to struct hsg_target_state, which is not defined in
// this header and is therefore opaque here.
//

struct hsg_target
{
  char const              * define;
  struct hsg_target_state * state;
};

//
// All targets share this prototype
//
// The target itself is passed as a mutable pointer; the config,
// merge and ops arguments are pointers to const.
//

typedef
void
(*hsg_target_pfn)(struct hsg_target       * const target,
                  struct hsg_config const * const config,
                  struct hsg_merge  const * const merge,
                  struct hsg_op     const * const ops,
                  uint32_t                  const depth);
//
// Target entry points.  Each one is declared with the same parameter
// list as hsg_target_pfn and is defined in another translation unit.
//

//
// NOTE(review): presumably the debug target -- confirm.
//

extern
void
hsg_target_debug(struct hsg_target       * const target,
                 struct hsg_config const * const config,
                 struct hsg_merge  const * const merge,
                 struct hsg_op     const * const ops,
                 uint32_t                  const depth);

//
// NOTE(review): presumably the CUDA target -- confirm.  Same
// parameter list as hsg_target_pfn.
//

extern
void
hsg_target_cuda(struct hsg_target       * const target,
                struct hsg_config const * const config,
                struct hsg_merge  const * const merge,
                struct hsg_op     const * const ops,
                uint32_t                  const depth);

//
// NOTE(review): presumably the OpenCL target -- confirm.  Same
// parameter list as hsg_target_pfn.
//

extern
void
hsg_target_opencl(struct hsg_target       * const target,
                  struct hsg_config const * const config,
                  struct hsg_merge  const * const merge,
                  struct hsg_op     const * const ops,
                  uint32_t                  const depth);

//
// NOTE(review): presumably the GLSL target -- confirm.  Same
// parameter list as hsg_target_pfn.
//

extern
void
hsg_target_glsl(struct hsg_target       * const target,
                struct hsg_config const * const config,
                struct hsg_merge  const * const merge,
                struct hsg_op     const * const ops,
                uint32_t                  const depth);

//
//
//