blob: b0eb7ea7aea6b5275fdb5bc66637bd2383a19881 [file] [log] [blame]
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "block.h"
#include "raster.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
//
//
//
#if ( SKC_RASTERS_RECLAIM_X == 1 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_RASTERS_RECLAIM_X == 2 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_RASTERS_RECLAIM_X == 4 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_RASTERS_RECLAIM_X == 8 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_RASTERS_RECLAIM_X == 16)
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_RASTERS_RECLAIM_X"
#endif
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
(I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // raster host-to-device map
struct skc_reclaim const reclaim) // array of host raster ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host raster id
skc_raster_h const raster = reclaim.aN[reclaim_idx];
// get block id of raster header
skc_block_id_t id = map[raster];
//
// load all of the head block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
// load raster header counts -- we only need the blocks and
// nodes words the keys are doublewords.
//
// FIXME -- this can be made portable with compile-time macro expansion
//
skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I & SKC_TTXK_LO_MASK_ID; \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",h##I,h##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
printf("%08X\n",h##I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as subblocks
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_UINT_MAX; \
} \
} \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
}
// printf("R %7u ! %u\n",bp_ids_idx,h##I);
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
//
// load all of the node block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I & SKC_TTXK_LO_MASK_ID;
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",n##I,n##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
// printf("R %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//