src/compute/hs/gen/transpose.c - skia.git - Git at Google

 /*
  * Copyright 2018 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  *
  */

 //
 //
 //

 #include "transpose.h"
 #include "common/macros.h"

 //
 // Rows must be an even number.  This is enforced elsewhere.
 //
 // The transpose requires (cols_log2 * rows/2) row-pair blends.
 //
 void
 hsg_transpose(uint32_t                   const cols_log2,
               uint32_t                   const rows,
               void (*pfn_blend)(uint32_t const cols_log2,
                                 uint32_t const row_ll, // lower-left
                                 uint32_t const row_ur, // upper-right
                                 void *         blend),
               void *                           blend,
               void (*pfn_remap)(uint32_t const row_from,
                                 uint32_t const row_to,
                                 void *         remap),
               void *                           remap)
 {
   // get mapping array
   uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
   uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));

   // init the mapping array
   for (uint32_t ii=0; ii<rows; ii++)
     map_curr[ii] = ii;

   // successively transpose rows using blends
   for (uint32_t cc=1; cc<=cols_log2; cc++)
     {
       uint32_t const mask = BITS_TO_MASK(cc);

       for (uint32_t ii=0; ii<rows; ii++)
         {
           uint32_t const left = map_curr[ii];
           uint32_t const stay = left & ~mask;

           if (left != stay) // will be swapped away
             {
               for (uint32_t jj=0; jj<rows; jj++)
                 {
                   if (map_curr[jj] == stay)
                     {
                       map_next[jj] = stay;
                       map_next[ii] = stay + (rows << (cc-1));

                       pfn_blend(cc,ii,jj,blend); // log2,left,right,payload

                       break;
                     }
                 }
             }
         }

       uint32_t * tmp = map_curr;

       map_curr = map_next;
       map_next = tmp;
     }

   // write out the remapping
   for (uint32_t ii=0; ii<rows; ii++)
     pfn_remap(ii,map_curr[ii] >> cols_log2,remap);
 }

 //
 // test it!
 //

 #ifdef HS_TRANSPOSE_DEBUG

 #include <stdio.h>

 static uint32_t cols; // implicit on SIMD/GPU

 static
 void
 hsg_debug_blend(uint32_t const cols_log2,
                 uint32_t const row_ll, // lower-left
                 uint32_t const row_ur, // upper-right
                 uint32_t *     b)
 {
   fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);

   uint32_t * const ll = ALLOCA_MACRO(cols * sizeof(*b));
   uint32_t * const ur = ALLOCA_MACRO(cols * sizeof(*b));

   memcpy(ll,b+row_ll*cols,cols * sizeof(*b));
   memcpy(ur,b+row_ur*cols,cols * sizeof(*b));

   for (uint32_t ii=0; ii<cols; ii++)
     b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];

   for (uint32_t ii=0; ii<cols; ii++)
     b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
 }

 static
 void
 hsg_debug_remap(uint32_t   const row_from,
                 uint32_t   const row_to,
                 uint32_t * const r)
 {
   fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);

   r[row_to] = row_from;
 }

 static
 void
 hsg_debug_print(uint32_t         const rows,
                 uint32_t const * const m,
                 uint32_t const * const r)
 {
   for (uint32_t rr=0; rr<rows; rr++) {
     for (uint32_t cc=0; cc<cols; cc++)
       fprintf(stdout,"%4u ",m[r[rr]*cols + cc]);
     fprintf(stdout,"\n");
   }
 }

 int
 main(int argc, char * argv[])
 {
   uint32_t const cols_log2 = (argc <= 1) ? 3 : strtoul(argv[1],NULL,0);
   uint32_t const rows      = (argc <= 2) ? 6 : strtoul(argv[2],NULL,0);

   if (rows & 1)
     return;

   cols = 1 << cols_log2;

   uint32_t * const b = ALLOCA_MACRO(cols * rows * sizeof(*b));
   uint32_t * const r = ALLOCA_MACRO(       rows * sizeof(*r));

   for (uint32_t rr=0; rr<rows; rr++) {
     r[rr] = rr;
     for (uint32_t cc=0; cc<cols; cc++)
       b[rr*cols+cc] = cc*rows+rr;
   }

   hsg_debug_print(rows,b,r);

   hsg_transpose(cols_log2,rows,
                 hsg_debug_blend,b,
                 hsg_debug_remap,r);

   hsg_debug_print(rows,b,r);

   return EXIT_SUCCESS;
 }

 #endif

 //
 //
 //
	/*
	* Copyright 2018 Google Inc.
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*
	*/

	//
	//
	//

	#include "transpose.h"
	#include "common/macros.h"

	//
	// Rows must be an even number. This is enforced elsewhere.
	//
	// The transpose requires (cols_log2 * rows/2) row-pair blends.
	//
	void
	hsg_transpose(uint32_t const cols_log2,
	uint32_t const rows,
	void (*pfn_blend)(uint32_t const cols_log2,
	uint32_t const row_ll, // lower-left
	uint32_t const row_ur, // upper-right
	void * blend),
	void * blend,
	void (*pfn_remap)(uint32_t const row_from,
	uint32_t const row_to,
	void * remap),
	void * remap)
	{
	// get mapping array
	uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
	uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));

	// init the mapping array
	for (uint32_t ii=0; ii<rows; ii++)
	map_curr[ii] = ii;

	// successively transpose rows using blends
	for (uint32_t cc=1; cc<=cols_log2; cc++)
	{
	uint32_t const mask = BITS_TO_MASK(cc);

	for (uint32_t ii=0; ii<rows; ii++)
	{
	uint32_t const left = map_curr[ii];
	uint32_t const stay = left & ~mask;

	if (left != stay) // will be swapped away
	{
	for (uint32_t jj=0; jj<rows; jj++)
	{
	if (map_curr[jj] == stay)
	{
	map_next[jj] = stay;
	map_next[ii] = stay + (rows << (cc-1));

	pfn_blend(cc,ii,jj,blend); // log2,left,right,payload

	break;
	}
	}
	}
	}

	uint32_t * tmp = map_curr;

	map_curr = map_next;
	map_next = tmp;
	}

	// write out the remapping
	for (uint32_t ii=0; ii<rows; ii++)
	pfn_remap(ii,map_curr[ii] >> cols_log2,remap);
	}

	//
	// test it!
	//

	#ifdef HS_TRANSPOSE_DEBUG

	#include <stdio.h>

	static uint32_t cols; // implicit on SIMD/GPU

	static
	void
	hsg_debug_blend(uint32_t const cols_log2,
	uint32_t const row_ll, // lower-left
	uint32_t const row_ur, // upper-right
	uint32_t * b)
	{
	fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);

	uint32_t * const ll = ALLOCA_MACRO(cols * sizeof(*b));
	uint32_t * const ur = ALLOCA_MACRO(cols * sizeof(*b));

	memcpy(ll,b+row_llcols,cols sizeof(*b));
	memcpy(ur,b+row_urcols,cols sizeof(*b));

	for (uint32_t ii=0; ii<cols; ii++)
	b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];

	for (uint32_t ii=0; ii<cols; ii++)
	b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
	}

	static
	void
	hsg_debug_remap(uint32_t const row_from,
	uint32_t const row_to,
	uint32_t * const r)
	{
	fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);

	r[row_to] = row_from;
	}

	static
	void
	hsg_debug_print(uint32_t const rows,
	uint32_t const * const m,
	uint32_t const * const r)
	{
	for (uint32_t rr=0; rr<rows; rr++) {
	for (uint32_t cc=0; cc<cols; cc++)
	fprintf(stdout,"%4u ",m[r[rr]*cols + cc]);
	fprintf(stdout,"\n");
	}
	}

	int
	main(int argc, char * argv[])
	{
	uint32_t const cols_log2 = (argc <= 1) ? 3 : strtoul(argv[1],NULL,0);
	uint32_t const rows = (argc <= 2) ? 6 : strtoul(argv[2],NULL,0);

	if (rows & 1)
	return;

	cols = 1 << cols_log2;

	uint32_t * const b = ALLOCA_MACRO(cols * rows * sizeof(*b));
	uint32_t * const r = ALLOCA_MACRO( rows * sizeof(*r));

	for (uint32_t rr=0; rr<rows; rr++) {
	r[rr] = rr;
	for (uint32_t cc=0; cc<cols; cc++)
	b[rrcols+cc] = ccrows+rr;
	}

	hsg_debug_print(rows,b,r);

	hsg_transpose(cols_log2,rows,
	hsg_debug_blend,b,
	hsg_debug_remap,r);

	hsg_debug_print(rows,b,r);

	return EXIT_SUCCESS;
	}

	#endif

	//
	//
	//