| /* |
| * Copyright 2018 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include "transpose.h" |
| #include "common/macros.h" |
| |
| // |
| // Rows must be an even number. This is enforced elsewhere. |
| // |
| // The transpose requires (cols_log2 * rows/2) row-pair blends. |
| // |
| void |
| hsg_transpose(uint32_t const cols_log2, |
| uint32_t const rows, |
| void (*pfn_blend)(uint32_t const cols_log2, |
| uint32_t const row_ll, // lower-left |
| uint32_t const row_ur, // upper-right |
| void * blend), |
| void * blend, |
| void (*pfn_remap)(uint32_t const row_from, |
| uint32_t const row_to, |
| void * remap), |
| void * remap) |
| { |
| // get mapping array |
| uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr)); |
| uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next)); |
| |
| // init the mapping array |
| for (uint32_t ii=0; ii<rows; ii++) |
| map_curr[ii] = ii; |
| |
| // successively transpose rows using blends |
| for (uint32_t cc=1; cc<=cols_log2; cc++) |
| { |
| uint32_t const mask = BITS_TO_MASK(cc); |
| |
| for (uint32_t ii=0; ii<rows; ii++) |
| { |
| uint32_t const left = map_curr[ii]; |
| uint32_t const stay = left & ~mask; |
| |
| if (left != stay) // will be swapped away |
| { |
| for (uint32_t jj=0; jj<rows; jj++) |
| { |
| if (map_curr[jj] == stay) |
| { |
| map_next[jj] = stay; |
| map_next[ii] = stay + (rows << (cc-1)); |
| |
| pfn_blend(cc,ii,jj,blend); // log2,left,right,payload |
| |
| break; |
| } |
| } |
| } |
| } |
| |
| uint32_t * tmp = map_curr; |
| |
| map_curr = map_next; |
| map_next = tmp; |
| } |
| |
| // write out the remapping |
| for (uint32_t ii=0; ii<rows; ii++) |
| pfn_remap(ii,map_curr[ii] >> cols_log2,remap); |
| } |
| |
| // |
| // test it! |
| // |
| |
| #ifdef HS_TRANSPOSE_DEBUG |
| |
| #include <stdio.h> |
| |
| static uint32_t cols; // implicit on SIMD/GPU |
| |
| static |
| void |
| hsg_debug_blend(uint32_t const cols_log2, |
| uint32_t const row_ll, // lower-left |
| uint32_t const row_ur, // upper-right |
| uint32_t * b) |
| { |
| fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur); |
| |
| uint32_t * const ll = ALLOCA_MACRO(cols * sizeof(*b)); |
| uint32_t * const ur = ALLOCA_MACRO(cols * sizeof(*b)); |
| |
| memcpy(ll,b+row_ll*cols,cols * sizeof(*b)); |
| memcpy(ur,b+row_ur*cols,cols * sizeof(*b)); |
| |
| for (uint32_t ii=0; ii<cols; ii++) |
| b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)]; |
| |
| for (uint32_t ii=0; ii<cols; ii++) |
| b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii]; |
| } |
| |
| static |
| void |
| hsg_debug_remap(uint32_t const row_from, |
| uint32_t const row_to, |
| uint32_t * const r) |
| { |
| fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to); |
| |
| r[row_to] = row_from; |
| } |
| |
| static |
| void |
| hsg_debug_print(uint32_t const rows, |
| uint32_t const * const m, |
| uint32_t const * const r) |
| { |
| for (uint32_t rr=0; rr<rows; rr++) { |
| for (uint32_t cc=0; cc<cols; cc++) |
| fprintf(stdout,"%4u ",m[r[rr]*cols + cc]); |
| fprintf(stdout,"\n"); |
| } |
| } |
| |
| int |
| main(int argc, char * argv[]) |
| { |
| uint32_t const cols_log2 = (argc <= 1) ? 3 : strtoul(argv[1],NULL,0); |
| uint32_t const rows = (argc <= 2) ? 6 : strtoul(argv[2],NULL,0); |
| |
| if (rows & 1) |
| return; |
| |
| cols = 1 << cols_log2; |
| |
| uint32_t * const b = ALLOCA_MACRO(cols * rows * sizeof(*b)); |
| uint32_t * const r = ALLOCA_MACRO( rows * sizeof(*r)); |
| |
| for (uint32_t rr=0; rr<rows; rr++) { |
| r[rr] = rr; |
| for (uint32_t cc=0; cc<cols; cc++) |
| b[rr*cols+cc] = cc*rows+rr; |
| } |
| |
| hsg_debug_print(rows,b,r); |
| |
| hsg_transpose(cols_log2,rows, |
| hsg_debug_blend,b, |
| hsg_debug_remap,r); |
| |
| hsg_debug_print(rows,b,r); |
| |
| return EXIT_SUCCESS; |
| } |
| |
| #endif |
| |
| // |
| // |
| // |