| /* |
| * Copyright 2022 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "include/core/SkSpan.h" |
| #include "include/core/SkTypes.h" |
| #include "include/private/base/SkTArray.h" |
| #include "src/core/SkRasterPipelineOpList.h" |
| #include "src/core/SkTHash.h" |
| #include "src/core/SkUtils.h" |
| |
| #include <cstdint> |
| #include <initializer_list> |
| #include <memory> |
| |
| class SkArenaAlloc; |
| class SkRasterPipeline; |
| class SkWStream; |
| |
| namespace SkSL { |
| |
| class SkRPDebugTrace; |
| |
| namespace RP { |
| |
// Each scalar value in a program occupies exactly one slot; a slot is addressed by an integer.
using Slot = int;

// Placeholder slot index (-1) meaning "no slot".
constexpr Slot NA{-1};
| |
| // Scalars, vectors, and matrices can be represented as a range of slot indices. |
| struct SlotRange { |
| Slot index = 0; |
| int count = 0; |
| }; |
| |
| // Ops that the builder will contextually rewrite into different RasterPipeline stages. |
| enum class BuilderOp { |
| // We support all the native Raster Pipeline ops. |
| #define M(stage) stage, |
| SK_RASTER_PIPELINE_OPS_ALL(M) |
| #undef M |
| // We also support Builder-specific ops; these are converted into real RP ops during |
| // `appendStages`. |
| push_literal, |
| push_slots, |
| push_uniform, |
| push_zeros, |
| push_clone, |
| push_clone_from_stack, |
| copy_stack_to_slots, |
| copy_stack_to_slots_unmasked, |
| discard_stack, |
| select, |
| push_condition_mask, |
| pop_condition_mask, |
| push_loop_mask, |
| pop_loop_mask, |
| push_return_mask, |
| pop_return_mask, |
| set_current_stack, |
| label, |
| unsupported |
| }; |
| |
| // Represents a single raster-pipeline SkSL instruction. |
| struct Instruction { |
| Instruction(BuilderOp op, std::initializer_list<Slot> slots, int a = 0, int b = 0, int c = 0) |
| : fOp(op), fImmA(a), fImmB(b), fImmC(c) { |
| auto iter = slots.begin(); |
| if (iter != slots.end()) { fSlotA = *iter++; } |
| if (iter != slots.end()) { fSlotB = *iter++; } |
| if (iter != slots.end()) { fSlotC = *iter++; } |
| SkASSERT(iter == slots.end()); |
| } |
| |
| BuilderOp fOp; |
| Slot fSlotA = NA; |
| Slot fSlotB = NA; |
| Slot fSlotC = NA; |
| int fImmA = 0; |
| int fImmB = 0; |
| int fImmC = 0; |
| }; |
| |
| class Program { |
| public: |
| Program(SkTArray<Instruction> instrs, |
| int numValueSlots, |
| int numUniformSlots, |
| int numLabels, |
| int numBranches, |
| SkRPDebugTrace* debugTrace); |
| |
| #if !defined(SKSL_STANDALONE) |
| void appendStages(SkRasterPipeline* pipeline, |
| SkArenaAlloc* alloc, |
| SkSpan<const float> uniforms); |
| #endif |
| |
| void dump(SkWStream* s); |
| |
| private: |
| using StackDepthMap = SkTHashMap<int, int>; // <stack index, depth of stack> |
| |
| struct SlotData { |
| SkSpan<float> values; |
| SkSpan<float> stack; |
| }; |
| SlotData allocateSlotData(SkArenaAlloc* alloc); |
| |
| struct Stage { |
| SkRasterPipelineOp op; |
| void* ctx; |
| }; |
| void makeStages(SkTArray<Stage>* pipeline, |
| SkArenaAlloc* alloc, |
| SkSpan<const float> uniforms, |
| const SlotData& slots); |
| void optimize(); |
| StackDepthMap tempStackMaxDepths(); |
| |
| // These methods are used to split up large multi-slot operations into multiple ops as needed. |
| void appendCopy(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc, |
| SkRasterPipelineOp baseStage, |
| float* dst, int dstStride, const float* src, int srcStride, int numSlots); |
| void appendCopySlotsUnmasked(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc, |
| float* dst, const float* src, int numSlots); |
| void appendCopySlotsMasked(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc, |
| float* dst, const float* src, int numSlots); |
| void appendCopyConstants(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc, |
| float* dst, const float* src, int numSlots); |
| |
| // Appends a single-slot single-input math operation to the pipeline. The op `stage` will |
| // appended `numSlots` times, starting at position `dst` and advancing one slot for each |
| // subsequent invocation. |
| void appendSingleSlotUnaryOp(SkTArray<Stage>* pipeline, SkRasterPipelineOp stage, |
| float* dst, int numSlots); |
| |
| // Appends a multi-slot single-input math operation to the pipeline. `baseStage` must refer to |
| // an single-slot "apply_op" stage, which must be immediately followed by specializations for |
| // 2-4 slots. For instance, {`zero_slot`, `zero_2_slots`, `zero_3_slots`, `zero_4_slots`} |
| // must be contiguous ops in the stage list, listed in that order; pass `zero_slot` and we |
| // pick the appropriate op based on `numSlots`. |
| void appendMultiSlotUnaryOp(SkTArray<Stage>* pipeline, SkRasterPipelineOp baseStage, |
| float* dst, int numSlots); |
| |
| // Appends a multi-slot two-input math operation to the pipeline. `src` must be _immediately_ |
| // after `dst` in memory. `baseStage` must refer to an unbounded "apply_to_n_slots" stage, which |
| // must be immediately followed by specializations for 1-4 slots. For instance, {`add_n_floats`, |
| // `add_float`, `add_2_floats`, `add_3_floats`, `add_4_floats`} must be contiguous ops in the |
| // stage list, listed in that order; pass `add_n_floats` and we pick the appropriate op based on |
| // `numSlots`. |
| void appendAdjacentMultiSlotBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc, |
| SkRasterPipelineOp baseStage, |
| float* dst, const float* src, int numSlots); |
| |
| // Appends a multi-slot math operation having three inputs (dst, src0, src1) and one output |
| // (dst) to the pipeline. The three inputs must be _immediately_ adjacent in memory. `baseStage` |
| // must refer to an unbounded "apply_to_n_slots" stage, which must be immediately followed by |
| // specializations for 1-4 slots. |
| void appendAdjacentMultiSlotTernaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc, |
| SkRasterPipelineOp stage, float* dst, |
| const float* src0, const float* src1, int numSlots); |
| |
| // Appends a stack_rewind op on platforms where it is needed (when SK_HAS_MUSTTAIL is not set). |
| void appendStackRewind(SkTArray<Stage>* pipeline); |
| |
| SkTArray<Instruction> fInstructions; |
| int fNumValueSlots = 0; |
| int fNumUniformSlots = 0; |
| int fNumTempStackSlots = 0; |
| int fNumLabels = 0; |
| int fNumBranches = 0; |
| SkTHashMap<int, int> fTempStackMaxDepths; |
| SkRPDebugTrace* fDebugTrace = nullptr; |
| }; |
| |
| class Builder { |
| public: |
| /** Finalizes and optimizes the program. */ |
| std::unique_ptr<Program> finish(int numValueSlots, |
| int numUniformSlots, |
| SkRPDebugTrace* debugTrace = nullptr); |
| /** |
| * Peels off a label ID for use in the program. Set the label's position in the program with |
| * the `label` instruction. Actually branch to the target with an instruction like |
| * `branch_if_any_active_lanes` or `jump`. |
| */ |
| int nextLabelID() { |
| return fNumLabels++; |
| } |
| |
| /** |
| * The builder keeps track of the state of execution masks; when we know that the execution |
| * mask is unaltered, we can generate simpler code. Code which alters the execution mask is |
| * required to enable this flag. |
| */ |
| void enableExecutionMaskWrites() { |
| ++fExecutionMaskWritesEnabled; |
| } |
| |
| void disableExecutionMaskWrites() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| --fExecutionMaskWritesEnabled; |
| } |
| |
| bool executionMaskWritesAreEnabled() { |
| return fExecutionMaskWritesEnabled > 0; |
| } |
| |
| /** Assemble a program from the Raster Pipeline instructions below. */ |
| void init_lane_masks() { |
| fInstructions.push_back({BuilderOp::init_lane_masks, {}}); |
| } |
| |
| void store_src_rg(SlotRange slots) { |
| SkASSERT(slots.count == 2); |
| fInstructions.push_back({BuilderOp::store_src_rg, {slots.index}}); |
| } |
| |
| void store_src(SlotRange slots) { |
| SkASSERT(slots.count == 4); |
| fInstructions.push_back({BuilderOp::store_src, {slots.index}}); |
| } |
| |
| void store_dst(SlotRange slots) { |
| SkASSERT(slots.count == 4); |
| fInstructions.push_back({BuilderOp::store_dst, {slots.index}}); |
| } |
| |
| void load_src(SlotRange slots) { |
| SkASSERT(slots.count == 4); |
| fInstructions.push_back({BuilderOp::load_src, {slots.index}}); |
| } |
| |
| void load_dst(SlotRange slots) { |
| SkASSERT(slots.count == 4); |
| fInstructions.push_back({BuilderOp::load_dst, {slots.index}}); |
| } |
| |
| void set_current_stack(int stackIdx) { |
| fInstructions.push_back({BuilderOp::set_current_stack, {}, stackIdx}); |
| } |
| |
| void label(int labelID) { |
| SkASSERT(labelID >= 0 && labelID < fNumLabels); |
| fInstructions.push_back({BuilderOp::label, {}, labelID}); |
| } |
| |
| void jump(int labelID) { |
| SkASSERT(labelID >= 0 && labelID < fNumLabels); |
| if (!fInstructions.empty() && fInstructions.back().fOp == BuilderOp::jump) { |
| // The previous instruction was also `jump`, so this branch could never possibly occur. |
| return; |
| } |
| fInstructions.push_back({BuilderOp::jump, {}, labelID}); |
| ++fNumBranches; |
| } |
| |
| void branch_if_any_active_lanes(int labelID) { |
| if (!this->executionMaskWritesAreEnabled()) { |
| this->jump(labelID); |
| return; |
| } |
| |
| SkASSERT(labelID >= 0 && labelID < fNumLabels); |
| if (!fInstructions.empty() && |
| (fInstructions.back().fOp == BuilderOp::branch_if_any_active_lanes || |
| fInstructions.back().fOp == BuilderOp::jump)) { |
| // The previous instruction was `jump` or `branch_if_any_active_lanes`, so this branch |
| // could never possibly occur. |
| return; |
| } |
| fInstructions.push_back({BuilderOp::branch_if_any_active_lanes, {}, labelID}); |
| ++fNumBranches; |
| } |
| |
| void branch_if_no_active_lanes(int labelID) { |
| if (!this->executionMaskWritesAreEnabled()) { |
| return; |
| } |
| |
| SkASSERT(labelID >= 0 && labelID < fNumLabels); |
| if (!fInstructions.empty() && |
| (fInstructions.back().fOp == BuilderOp::branch_if_no_active_lanes || |
| fInstructions.back().fOp == BuilderOp::jump)) { |
| // The previous instruction was `jump` or `branch_if_any_active_lanes`, so this branch |
| // could never possibly occur. |
| return; |
| } |
| fInstructions.push_back({BuilderOp::branch_if_no_active_lanes, {}, labelID}); |
| ++fNumBranches; |
| } |
| |
| // We use the same SkRasterPipeline op regardless of the literal type, and bitcast the value. |
| void immediate_f(float val) { |
| fInstructions.push_back({BuilderOp::immediate_f, {}, sk_bit_cast<int32_t>(val)}); |
| } |
| |
| void immediate_i(int32_t val) { |
| fInstructions.push_back({BuilderOp::immediate_f, {}, val}); |
| } |
| |
| void immediate_u(uint32_t val) { |
| fInstructions.push_back({BuilderOp::immediate_f, {}, sk_bit_cast<int32_t>(val)}); |
| } |
| |
| void push_literal_f(float val) { |
| this->push_literal_i(sk_bit_cast<int32_t>(val)); |
| } |
| |
| void push_literal_i(int32_t val) { |
| if (val == 0) { |
| this->push_zeros(1); |
| } else { |
| fInstructions.push_back({BuilderOp::push_literal, {}, val}); |
| } |
| } |
| |
| void push_literal_u(uint32_t val) { |
| this->push_literal_i(sk_bit_cast<int32_t>(val)); |
| } |
| |
| // Translates into copy_constants (from uniforms into temp stack) in Raster Pipeline. |
| void push_uniform(SlotRange src); |
| |
| void push_zeros(int count) { |
| // Translates into zero_slot_unmasked in Raster Pipeline. |
| if (!fInstructions.empty() && fInstructions.back().fOp == BuilderOp::push_zeros) { |
| // Coalesce adjacent push_zero ops into a single op. |
| fInstructions.back().fImmA += count; |
| } else { |
| fInstructions.push_back({BuilderOp::push_zeros, {}, count}); |
| } |
| } |
| |
| // Translates into copy_slots_unmasked (from values into temp stack) in Raster Pipeline. |
| void push_slots(SlotRange src); |
| |
| // Translates into copy_slots_masked (from temp stack to values) in Raster Pipeline. |
| // Does not discard any values on the temp stack. |
| void copy_stack_to_slots(SlotRange dst) { |
| this->copy_stack_to_slots(dst, /*offsetFromStackTop=*/dst.count); |
| } |
| |
| void copy_stack_to_slots(SlotRange dst, int offsetFromStackTop); |
| |
| // Translates into copy_slots_unmasked (from temp stack to values) in Raster Pipeline. |
| // Does not discard any values on the temp stack. |
| void copy_stack_to_slots_unmasked(SlotRange dst) { |
| this->copy_stack_to_slots_unmasked(dst, /*offsetFromStackTop=*/dst.count); |
| } |
| |
| void copy_stack_to_slots_unmasked(SlotRange dst, int offsetFromStackTop); |
| |
| // Performs a unary op (like `bitwise_not`), given a slot count of `slots`. The stack top is |
| // replaced with the result. |
| void unary_op(BuilderOp op, int32_t slots); |
| |
| // Performs a binary op (like `add_n_floats` or `cmpeq_n_ints`), given a slot count of |
| // `slots`. Two n-slot input values are consumed, and the result is pushed onto the stack. |
| void binary_op(BuilderOp op, int32_t slots); |
| |
| // Performs a ternary op (like `mix` or `smoothstep`), given a slot count of |
| // `slots`. Three n-slot input values are consumed, and the result is pushed onto the stack. |
| void ternary_op(BuilderOp op, int32_t slots); |
| |
| // Shrinks the temp stack, discarding values on top. |
| void discard_stack(int32_t count = 1); |
| |
| void pop_slots(SlotRange dst) { |
| // The opposite of push_slots; copies values from the temp stack into value slots, then |
| // shrinks the temp stack. |
| this->copy_stack_to_slots(dst); |
| this->discard_stack(dst.count); |
| } |
| |
| // Creates many clones of the top single-slot item on the temp stack. |
| void push_duplicates(int count); |
| |
| // Creates a single clone of an item on the current temp stack. The cloned item can consist of |
| // any number of slots, and can be copied from an earlier position on the stack. |
| void push_clone(int numSlots, int offsetFromStackTop = 0) { |
| fInstructions.push_back({BuilderOp::push_clone, {}, numSlots, |
| numSlots + offsetFromStackTop}); |
| } |
| |
| // Creates a single clone from an item on any temp stack. The cloned item can consist of any |
| // number of slots. |
| void push_clone_from_stack(int numSlots, int otherStackIndex, int offsetFromStackTop = 0) { |
| fInstructions.push_back({BuilderOp::push_clone_from_stack, {}, numSlots, otherStackIndex, |
| numSlots + offsetFromStackTop}); |
| } |
| |
| void select(int slots) { |
| // Overlays the top two entries on the stack, making one hybrid entry. The execution mask |
| // is used to select which lanes are preserved. |
| SkASSERT(slots > 0); |
| fInstructions.push_back({BuilderOp::select, {}, slots}); |
| } |
| |
| // The opposite of push_slots; copies values from the temp stack into value slots, then |
| // shrinks the temp stack. |
| void pop_slots_unmasked(SlotRange dst); |
| |
| void load_unmasked(Slot slot) { |
| fInstructions.push_back({BuilderOp::load_unmasked, {slot}}); |
| } |
| |
| void store_unmasked(Slot slot) { |
| fInstructions.push_back({BuilderOp::store_unmasked, {slot}}); |
| } |
| |
| void store_masked(Slot slot) { |
| fInstructions.push_back({BuilderOp::store_masked, {slot}}); |
| } |
| |
| void copy_slots_masked(SlotRange dst, SlotRange src) { |
| SkASSERT(dst.count == src.count); |
| fInstructions.push_back({BuilderOp::copy_slot_masked, {dst.index, src.index}, dst.count}); |
| } |
| |
| void copy_slots_unmasked(SlotRange dst, SlotRange src) { |
| SkASSERT(dst.count == src.count); |
| fInstructions.push_back({BuilderOp::copy_slot_unmasked, {dst.index, src.index}, dst.count}); |
| } |
| |
| void copy_constant(Slot slot, int constantValue) { |
| fInstructions.push_back({BuilderOp::copy_constant, {slot}, constantValue}); |
| } |
| |
| // Stores zeros across the entire slot range. |
| void zero_slots_unmasked(SlotRange dst); |
| |
| // Consumes `consumedSlots` elements on the stack, then generates `components.size()` elements. |
| void swizzle(int consumedSlots, SkSpan<const int8_t> components); |
| |
| // Transposes a matrix of size CxR on the stack (into a matrix of size RxC). |
| void transpose(int columns, int rows); |
| |
| // Generates a CxR diagonal matrix from the top two scalars on the stack. The second scalar is |
| // used as the diagonal value; the first scalar (usually zero) fills in the rest of the slots. |
| void diagonal_matrix(int columns, int rows); |
| |
| // Resizes a CxR matrix at the top of the stack to C'xR'. |
| void matrix_resize(int origColumns, int origRows, int newColumns, int newRows); |
| |
| void push_condition_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::push_condition_mask, {}}); |
| } |
| |
| void pop_condition_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::pop_condition_mask, {}}); |
| } |
| |
| void merge_condition_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::merge_condition_mask, {}}); |
| } |
| |
| void push_loop_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::push_loop_mask, {}}); |
| } |
| |
| void pop_loop_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::pop_loop_mask, {}}); |
| } |
| |
| void mask_off_loop_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::mask_off_loop_mask, {}}); |
| } |
| |
| void reenable_loop_mask(SlotRange src) { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| SkASSERT(src.count == 1); |
| fInstructions.push_back({BuilderOp::reenable_loop_mask, {src.index}}); |
| } |
| |
| void merge_loop_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::merge_loop_mask, {}}); |
| } |
| |
| void push_return_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::push_return_mask, {}}); |
| } |
| |
| void pop_return_mask(); |
| |
| void mask_off_return_mask() { |
| SkASSERT(this->executionMaskWritesAreEnabled()); |
| fInstructions.push_back({BuilderOp::mask_off_return_mask, {}}); |
| } |
| |
| private: |
| SkTArray<Instruction> fInstructions; |
| int fNumLabels = 0; |
| int fNumBranches = 0; |
| int fExecutionMaskWritesEnabled = 0; |
| }; |
| |
| } // namespace RP |
| } // namespace SkSL |