blob: 9bb84fbf96b2934e05343761e3d7fb0edef6ee0d [file] [log] [blame]
/*
* Copyright 2022 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "include/core/SkSpan.h"
#include "include/core/SkTypes.h"
#include "include/private/base/SkTArray.h"
#include "src/core/SkRasterPipelineOpList.h"
#include "src/core/SkTHash.h"
#include "src/core/SkUtils.h"
#include <cstdint>
#include <initializer_list>
#include <memory>
// Forward declarations. This header only refers to these types through pointers
// (SkArenaAlloc*, SkRasterPipeline*, SkWStream*), so their full definitions are not needed here.
class SkArenaAlloc;
class SkRasterPipeline;
class SkWStream;
namespace SkSL {
class SkRPDebugTrace;
namespace RP {
// Every scalar in a program occupies exactly one slot; slots are named by integer index.
using Slot = int;

// Marker for "no slot". `Instruction` uses it as the default for each of its slot operands.
constexpr Slot NA{-1};

// A run of `count` consecutive slots starting at slot `index`. Scalars, vectors, and matrices are
// all described this way. A default-constructed range is empty and starts at slot zero.
struct SlotRange {
    Slot index{0};
    int count{0};
};
// Ops that the builder will contextually rewrite into different RasterPipeline stages.
// The native Raster Pipeline ops are listed first and the Builder-only ops follow them, so the
// declaration order below determines each enumerator's numeric value; do not reorder.
enum class BuilderOp {
// We support all the native Raster Pipeline ops. `M` turns each op name supplied by
// SK_RASTER_PIPELINE_OPS_ALL into an enumerator of the same name.
#define M(stage) stage,
SK_RASTER_PIPELINE_OPS_ALL(M)
#undef M
// We also support Builder-specific ops; these are converted into real RP ops during
// `appendStages`.
// Emitted by `Builder::push_literal_i` for non-zero literals; the literal's bits are carried in
// the instruction's first immediate.
push_literal,
// Becomes copy_slots_unmasked (from values into temp stack).
push_slots,
// Becomes copy_constants (from uniforms into temp stack).
push_uniform,
// Becomes zero_slot_unmasked; the first immediate holds the number of zeros to push.
push_zeros,
// Clones an item that lives on the current temp stack.
push_clone,
// Clones an item that lives on any temp stack.
push_clone_from_stack,
// Becomes copy_slots_masked (from temp stack to values); nothing is discarded from the stack.
copy_stack_to_slots,
// Becomes copy_slots_unmasked (from temp stack to values); nothing is discarded from the stack.
copy_stack_to_slots_unmasked,
// Shrinks the temp stack, discarding values on top.
discard_stack,
// Overlays the top two stack entries into one; the execution mask chooses which lanes are kept.
select,
// Push/pop ops for the condition, loop and return masks.
push_condition_mask,
pop_condition_mask,
push_loop_mask,
pop_loop_mask,
push_return_mask,
pop_return_mask,
// Switches the current temp stack; the first immediate holds the stack index.
set_current_stack,
// Marks the position of a label in the program; the first immediate holds the label ID.
label,
// NOTE(review): presumably a placeholder for operations with no Raster Pipeline equivalent --
// confirm against the code that produces this value.
unsupported
};
// One raster-pipeline SkSL instruction: an op plus up to three slot operands and up to three
// integer immediates.
struct Instruction {
    // Builds an instruction for `op`.
    //   slots   - zero to three slots, assigned in order to fSlotA, fSlotB and fSlotC. Operands
    //             that are not supplied remain NA. Supplying more than three trips an assertion.
    //   a, b, c - stored as fImmA, fImmB and fImmC respectively.
    Instruction(BuilderOp op, std::initializer_list<Slot> slots, int a = 0, int b = 0, int c = 0)
            : fOp(op), fImmA(a), fImmB(b), fImmC(c) {
        auto next = slots.begin();
        // Walk the operand fields in declaration order, filling each from the next supplied slot
        // until the caller's list runs out.
        for (Slot* operand : {&fSlotA, &fSlotB, &fSlotC}) {
            if (next == slots.end()) {
                break;
            }
            *operand = *next++;
        }
        // Anything left over means the caller passed more slots than an instruction can hold.
        SkASSERT(next == slots.end());
    }

    BuilderOp fOp;
    Slot fSlotA = NA;
    Slot fSlotB = NA;
    Slot fSlotC = NA;
    int fImmA = 0;
    int fImmB = 0;
    int fImmC = 0;
};
// A finished list of Instructions, together with the slot, label and branch counts that go with
// it. `Builder::finish` returns one of these.
class Program {
public:
    // Takes ownership of the instruction list `instrs` along with the counts describing it.
    // `debugTrace` is passed by pointer and is not owned by this declaration's signature.
    Program(SkTArray<Instruction> instrs,
            int numValueSlots,
            int numUniformSlots,
            int numLabels,
            int numBranches,
            SkRPDebugTrace* debugTrace);

#if !defined(SKSL_STANDALONE)
    // Converts the Builder ops into real Raster Pipeline ops and appends them to `pipeline`.
    // Not available in SKSL_STANDALONE builds.
    void appendStages(SkRasterPipeline* pipeline,
                      SkArenaAlloc* alloc,
                      SkSpan<const float> uniforms);
#endif

    // Writes a dump of the program to `s`.
    // NOTE(review): presumably a human-readable listing -- confirm against the implementation.
    void dump(SkWStream* s);

private:
    using StackDepthMap = SkTHashMap<int, int>; // <stack index, depth of stack>

    // The two float spans that back a program: one for value slots and one for the temp stack.
    struct SlotData {
        SkSpan<float> values;
        SkSpan<float> stack;
    };
    SlotData allocateSlotData(SkArenaAlloc* alloc);

    // A Raster Pipeline op paired with its context pointer.
    struct Stage {
        SkRasterPipelineOp op;
        void*              ctx;
    };
    void makeStages(SkTArray<Stage>* pipeline,
                    SkArenaAlloc* alloc,
                    SkSpan<const float> uniforms,
                    const SlotData& slots);
    void optimize();
    StackDepthMap tempStackMaxDepths();

    // These methods are used to split up large multi-slot operations into multiple ops as needed.
    void appendCopy(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                    SkRasterPipelineOp baseStage,
                    float* dst, int dstStride, const float* src, int srcStride, int numSlots);
    void appendCopySlotsUnmasked(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                 float* dst, const float* src, int numSlots);
    void appendCopySlotsMasked(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                               float* dst, const float* src, int numSlots);
    void appendCopyConstants(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                             float* dst, const float* src, int numSlots);

    // Appends a single-slot single-input math operation to the pipeline. The op `stage` will be
    // appended `numSlots` times, starting at position `dst` and advancing one slot for each
    // subsequent invocation.
    void appendSingleSlotUnaryOp(SkTArray<Stage>* pipeline, SkRasterPipelineOp stage,
                                 float* dst, int numSlots);

    // Appends a multi-slot single-input math operation to the pipeline. `baseStage` must refer to
    // a single-slot "apply_op" stage, which must be immediately followed by specializations for
    // 2-4 slots. For instance, {`zero_slot`, `zero_2_slots`, `zero_3_slots`, `zero_4_slots`}
    // must be contiguous ops in the stage list, listed in that order; pass `zero_slot` and we
    // pick the appropriate op based on `numSlots`.
    void appendMultiSlotUnaryOp(SkTArray<Stage>* pipeline, SkRasterPipelineOp baseStage,
                                float* dst, int numSlots);

    // Appends a multi-slot two-input math operation to the pipeline. `src` must be _immediately_
    // after `dst` in memory. `baseStage` must refer to an unbounded "apply_to_n_slots" stage, which
    // must be immediately followed by specializations for 1-4 slots. For instance, {`add_n_floats`,
    // `add_float`, `add_2_floats`, `add_3_floats`, `add_4_floats`} must be contiguous ops in the
    // stage list, listed in that order; pass `add_n_floats` and we pick the appropriate op based on
    // `numSlots`.
    void appendAdjacentMultiSlotBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                         SkRasterPipelineOp baseStage,
                                         float* dst, const float* src, int numSlots);

    // Appends a multi-slot math operation having three inputs (dst, src0, src1) and one output
    // (dst) to the pipeline. The three inputs must be _immediately_ adjacent in memory. `stage`
    // must refer to an unbounded "apply_to_n_slots" stage, which must be immediately followed by
    // specializations for 1-4 slots.
    void appendAdjacentMultiSlotTernaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                          SkRasterPipelineOp stage, float* dst,
                                          const float* src0, const float* src1, int numSlots);

    // Appends a stack_rewind op on platforms where it is needed (when SK_HAS_MUSTTAIL is not set).
    void appendStackRewind(SkTArray<Stage>* pipeline);

    SkTArray<Instruction> fInstructions;
    int fNumValueSlots = 0;
    int fNumUniformSlots = 0;
    int fNumTempStackSlots = 0;
    int fNumLabels = 0;
    int fNumBranches = 0;
    // Same map type that `tempStackMaxDepths` returns: <stack index, depth of stack>.
    StackDepthMap fTempStackMaxDepths;
    SkRPDebugTrace* fDebugTrace = nullptr;
};
class Builder {
public:
/** Finalizes and optimizes the program. */
std::unique_ptr<Program> finish(int numValueSlots,
int numUniformSlots,
SkRPDebugTrace* debugTrace = nullptr);
/**
* Peels off a label ID for use in the program. Set the label's position in the program with
* the `label` instruction. Actually branch to the target with an instruction like
* `branch_if_any_active_lanes` or `jump`.
*/
int nextLabelID() {
return fNumLabels++;
}
/**
* The builder keeps track of the state of execution masks; when we know that the execution
* mask is unaltered, we can generate simpler code. Code which alters the execution mask is
* required to enable this flag.
*/
void enableExecutionMaskWrites() {
++fExecutionMaskWritesEnabled;
}
void disableExecutionMaskWrites() {
SkASSERT(this->executionMaskWritesAreEnabled());
--fExecutionMaskWritesEnabled;
}
bool executionMaskWritesAreEnabled() {
return fExecutionMaskWritesEnabled > 0;
}
/** Assemble a program from the Raster Pipeline instructions below. */
void init_lane_masks() {
fInstructions.push_back({BuilderOp::init_lane_masks, {}});
}
void store_src_rg(SlotRange slots) {
SkASSERT(slots.count == 2);
fInstructions.push_back({BuilderOp::store_src_rg, {slots.index}});
}
void store_src(SlotRange slots) {
SkASSERT(slots.count == 4);
fInstructions.push_back({BuilderOp::store_src, {slots.index}});
}
void store_dst(SlotRange slots) {
SkASSERT(slots.count == 4);
fInstructions.push_back({BuilderOp::store_dst, {slots.index}});
}
void load_src(SlotRange slots) {
SkASSERT(slots.count == 4);
fInstructions.push_back({BuilderOp::load_src, {slots.index}});
}
void load_dst(SlotRange slots) {
SkASSERT(slots.count == 4);
fInstructions.push_back({BuilderOp::load_dst, {slots.index}});
}
void set_current_stack(int stackIdx) {
fInstructions.push_back({BuilderOp::set_current_stack, {}, stackIdx});
}
void label(int labelID) {
SkASSERT(labelID >= 0 && labelID < fNumLabels);
fInstructions.push_back({BuilderOp::label, {}, labelID});
}
void jump(int labelID) {
SkASSERT(labelID >= 0 && labelID < fNumLabels);
if (!fInstructions.empty() && fInstructions.back().fOp == BuilderOp::jump) {
// The previous instruction was also `jump`, so this branch could never possibly occur.
return;
}
fInstructions.push_back({BuilderOp::jump, {}, labelID});
++fNumBranches;
}
void branch_if_any_active_lanes(int labelID) {
if (!this->executionMaskWritesAreEnabled()) {
this->jump(labelID);
return;
}
SkASSERT(labelID >= 0 && labelID < fNumLabels);
if (!fInstructions.empty() &&
(fInstructions.back().fOp == BuilderOp::branch_if_any_active_lanes ||
fInstructions.back().fOp == BuilderOp::jump)) {
// The previous instruction was `jump` or `branch_if_any_active_lanes`, so this branch
// could never possibly occur.
return;
}
fInstructions.push_back({BuilderOp::branch_if_any_active_lanes, {}, labelID});
++fNumBranches;
}
void branch_if_no_active_lanes(int labelID) {
if (!this->executionMaskWritesAreEnabled()) {
return;
}
SkASSERT(labelID >= 0 && labelID < fNumLabels);
if (!fInstructions.empty() &&
(fInstructions.back().fOp == BuilderOp::branch_if_no_active_lanes ||
fInstructions.back().fOp == BuilderOp::jump)) {
// The previous instruction was `jump` or `branch_if_any_active_lanes`, so this branch
// could never possibly occur.
return;
}
fInstructions.push_back({BuilderOp::branch_if_no_active_lanes, {}, labelID});
++fNumBranches;
}
// We use the same SkRasterPipeline op regardless of the literal type, and bitcast the value.
void immediate_f(float val) {
fInstructions.push_back({BuilderOp::immediate_f, {}, sk_bit_cast<int32_t>(val)});
}
void immediate_i(int32_t val) {
fInstructions.push_back({BuilderOp::immediate_f, {}, val});
}
void immediate_u(uint32_t val) {
fInstructions.push_back({BuilderOp::immediate_f, {}, sk_bit_cast<int32_t>(val)});
}
void push_literal_f(float val) {
this->push_literal_i(sk_bit_cast<int32_t>(val));
}
void push_literal_i(int32_t val) {
if (val == 0) {
this->push_zeros(1);
} else {
fInstructions.push_back({BuilderOp::push_literal, {}, val});
}
}
void push_literal_u(uint32_t val) {
this->push_literal_i(sk_bit_cast<int32_t>(val));
}
// Translates into copy_constants (from uniforms into temp stack) in Raster Pipeline.
void push_uniform(SlotRange src);
void push_zeros(int count) {
// Translates into zero_slot_unmasked in Raster Pipeline.
if (!fInstructions.empty() && fInstructions.back().fOp == BuilderOp::push_zeros) {
// Coalesce adjacent push_zero ops into a single op.
fInstructions.back().fImmA += count;
} else {
fInstructions.push_back({BuilderOp::push_zeros, {}, count});
}
}
// Translates into copy_slots_unmasked (from values into temp stack) in Raster Pipeline.
void push_slots(SlotRange src);
// Translates into copy_slots_masked (from temp stack to values) in Raster Pipeline.
// Does not discard any values on the temp stack.
void copy_stack_to_slots(SlotRange dst) {
this->copy_stack_to_slots(dst, /*offsetFromStackTop=*/dst.count);
}
void copy_stack_to_slots(SlotRange dst, int offsetFromStackTop);
// Translates into copy_slots_unmasked (from temp stack to values) in Raster Pipeline.
// Does not discard any values on the temp stack.
void copy_stack_to_slots_unmasked(SlotRange dst) {
this->copy_stack_to_slots_unmasked(dst, /*offsetFromStackTop=*/dst.count);
}
void copy_stack_to_slots_unmasked(SlotRange dst, int offsetFromStackTop);
// Performs a unary op (like `bitwise_not`), given a slot count of `slots`. The stack top is
// replaced with the result.
void unary_op(BuilderOp op, int32_t slots);
// Performs a binary op (like `add_n_floats` or `cmpeq_n_ints`), given a slot count of
// `slots`. Two n-slot input values are consumed, and the result is pushed onto the stack.
void binary_op(BuilderOp op, int32_t slots);
// Performs a ternary op (like `mix` or `smoothstep`), given a slot count of
// `slots`. Three n-slot input values are consumed, and the result is pushed onto the stack.
void ternary_op(BuilderOp op, int32_t slots);
// Shrinks the temp stack, discarding values on top.
void discard_stack(int32_t count = 1);
void pop_slots(SlotRange dst) {
// The opposite of push_slots; copies values from the temp stack into value slots, then
// shrinks the temp stack.
this->copy_stack_to_slots(dst);
this->discard_stack(dst.count);
}
// Creates many clones of the top single-slot item on the temp stack.
void push_duplicates(int count);
// Creates a single clone of an item on the current temp stack. The cloned item can consist of
// any number of slots, and can be copied from an earlier position on the stack.
void push_clone(int numSlots, int offsetFromStackTop = 0) {
fInstructions.push_back({BuilderOp::push_clone, {}, numSlots,
numSlots + offsetFromStackTop});
}
// Creates a single clone from an item on any temp stack. The cloned item can consist of any
// number of slots.
void push_clone_from_stack(int numSlots, int otherStackIndex, int offsetFromStackTop = 0) {
fInstructions.push_back({BuilderOp::push_clone_from_stack, {}, numSlots, otherStackIndex,
numSlots + offsetFromStackTop});
}
void select(int slots) {
// Overlays the top two entries on the stack, making one hybrid entry. The execution mask
// is used to select which lanes are preserved.
SkASSERT(slots > 0);
fInstructions.push_back({BuilderOp::select, {}, slots});
}
// The opposite of push_slots; copies values from the temp stack into value slots, then
// shrinks the temp stack.
void pop_slots_unmasked(SlotRange dst);
void load_unmasked(Slot slot) {
fInstructions.push_back({BuilderOp::load_unmasked, {slot}});
}
void store_unmasked(Slot slot) {
fInstructions.push_back({BuilderOp::store_unmasked, {slot}});
}
void store_masked(Slot slot) {
fInstructions.push_back({BuilderOp::store_masked, {slot}});
}
void copy_slots_masked(SlotRange dst, SlotRange src) {
SkASSERT(dst.count == src.count);
fInstructions.push_back({BuilderOp::copy_slot_masked, {dst.index, src.index}, dst.count});
}
void copy_slots_unmasked(SlotRange dst, SlotRange src) {
SkASSERT(dst.count == src.count);
fInstructions.push_back({BuilderOp::copy_slot_unmasked, {dst.index, src.index}, dst.count});
}
void copy_constant(Slot slot, int constantValue) {
fInstructions.push_back({BuilderOp::copy_constant, {slot}, constantValue});
}
// Stores zeros across the entire slot range.
void zero_slots_unmasked(SlotRange dst);
// Consumes `consumedSlots` elements on the stack, then generates `components.size()` elements.
void swizzle(int consumedSlots, SkSpan<const int8_t> components);
// Transposes a matrix of size CxR on the stack (into a matrix of size RxC).
void transpose(int columns, int rows);
// Generates a CxR diagonal matrix from the top two scalars on the stack. The second scalar is
// used as the diagonal value; the first scalar (usually zero) fills in the rest of the slots.
void diagonal_matrix(int columns, int rows);
// Resizes a CxR matrix at the top of the stack to C'xR'.
void matrix_resize(int origColumns, int origRows, int newColumns, int newRows);
void push_condition_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::push_condition_mask, {}});
}
void pop_condition_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::pop_condition_mask, {}});
}
void merge_condition_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::merge_condition_mask, {}});
}
void push_loop_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::push_loop_mask, {}});
}
void pop_loop_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::pop_loop_mask, {}});
}
void mask_off_loop_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::mask_off_loop_mask, {}});
}
void reenable_loop_mask(SlotRange src) {
SkASSERT(this->executionMaskWritesAreEnabled());
SkASSERT(src.count == 1);
fInstructions.push_back({BuilderOp::reenable_loop_mask, {src.index}});
}
void merge_loop_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::merge_loop_mask, {}});
}
void push_return_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::push_return_mask, {}});
}
void pop_return_mask();
void mask_off_return_mask() {
SkASSERT(this->executionMaskWritesAreEnabled());
fInstructions.push_back({BuilderOp::mask_off_return_mask, {}});
}
private:
SkTArray<Instruction> fInstructions;
int fNumLabels = 0;
int fNumBranches = 0;
int fExecutionMaskWritesEnabled = 0;
};
} // namespace RP
} // namespace SkSL