Add RP builder support for swizzle-copy-to-slots-indirect.
There is a bit of logic here to pack all of the values into slots
and immediates on the builder side, and then to unpack them again
when assembling the program, but the approach is standard.
Change-Id: Iad65e8d2cb548b0ca66129964eb1647e6b389acb
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/657636
Auto-Submit: John Stiles <johnstiles@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: John Stiles <johnstiles@google.com>
diff --git a/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp b/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
index b42a585..bdcafff 100644
--- a/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
+++ b/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
@@ -693,12 +693,51 @@
}
}
+static int max_packed_nybble(uint32_t components, size_t numComponents) {
+ int largest = 0;
+ for (size_t index = 0; index < numComponents; ++index) {
+ largest = std::max<int>(largest, components & 0xF);
+ components >>= 4;
+ }
+ return largest;
+}
+
void Builder::swizzle_copy_stack_to_slots(SlotRange dst,
SkSpan<const int8_t> components,
int offsetFromStackTop) {
- // An unmasked version of this op could squeeze out a little bit of extra speed, if needed.
+ // When the execution-mask writes-enabled flag is off, we could squeeze out a little bit of
+ // extra speed here by implementing and using an unmasked version of this op.
+
+ // SlotA: fixed-range start
+ // immA: number of swizzle components
+ // immB: swizzle components
+ // immC: offset from stack top
fInstructions.push_back({BuilderOp::swizzle_copy_stack_to_slots, {dst.index},
- (int)components.size(), offsetFromStackTop, pack_nybbles(components)});
+ (int)components.size(),
+ pack_nybbles(components),
+ offsetFromStackTop});
+}
+
+void Builder::swizzle_copy_stack_to_slots_indirect(SlotRange fixedRange,
+ int dynamicStackID,
+ SlotRange limitRange,
+ SkSpan<const int8_t> components,
+ int offsetFromStackTop) {
+ // When the execution-mask writes-enabled flag is off, we could squeeze out a little bit of
+ // extra speed here by implementing and using an unmasked version of this op.
+
+ // SlotA: fixed-range start
+ // SlotB: limit-range end
+ // immA: number of swizzle components
+ // immB: swizzle components
+ // immC: offset from stack top
+ // immD: dynamic stack ID
+ fInstructions.push_back({BuilderOp::swizzle_copy_stack_to_slots_indirect,
+ {fixedRange.index, limitRange.index + limitRange.count},
+ (int)components.size(),
+ pack_nybbles(components),
+ offsetFromStackTop,
+ dynamicStackID});
}
void Builder::swizzle(int consumedSlots, SkSpan<const int8_t> components) {
@@ -1553,11 +1592,15 @@
break;
}
case BuilderOp::swizzle_copy_stack_to_slots: {
+ // SlotA: fixed-range start
+ // immA: number of swizzle components
+ // immB: swizzle components
+ // immC: offset from stack top
auto stage = (ProgramOp)((int)ProgramOp::swizzle_copy_slot_masked + inst.fImmA - 1);
auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyCtx>();
- ctx->src = tempStackPtr - (inst.fImmB * N);
+ ctx->src = tempStackPtr - (inst.fImmC * N);
ctx->dst = SlotA();
- unpack_nybbles_to_offsets(inst.fImmC, SkSpan(ctx->offsets));
+ unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
pipeline->push_back({stage, ctx});
break;
}
@@ -1594,6 +1637,25 @@
pipeline->push_back({ProgramOp::copy_from_indirect_unmasked, ctx});
break;
}
+ case BuilderOp::swizzle_copy_stack_to_slots_indirect: {
+ // SlotA: fixed-range start
+ // SlotB: limit-range end
+ // immA: number of swizzle components
+ // immB: swizzle components
+ // immC: offset from stack top
+ // immD: dynamic stack ID
+ auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyIndirectCtx>();
+ ctx->src = tempStackPtr - (inst.fImmC * N);
+ ctx->dst = SlotA();
+ ctx->indirectOffset =
+ reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmD]) - (1 * N);
+ ctx->indirectLimit =
+ inst.fSlotB - inst.fSlotA - (max_packed_nybble(inst.fImmB, inst.fImmA) + 1);
+ ctx->slots = inst.fImmA;
+ unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
+ pipeline->push_back({ProgramOp::swizzle_copy_to_indirect_masked, ctx});
+ break;
+ }
case BuilderOp::case_op: {
auto* ctx = alloc->make<SkRasterPipeline_CaseOpCtx>();
ctx->ptr = reinterpret_cast<int*>(tempStackPtr - 2 * N);
@@ -1916,15 +1978,9 @@
return Adjacent3PtrCtx(ctx->dst, numSlots);
};
- // Stringize a swizzled pointer. Note that the slot-width of the original expression is not
- // preserved in the instruction encoding, so we need to do our best using the data we have.
- // (e.g., myFloat4.y would be indistinguishable from myFloat2.y.)
- auto SwizzlePtr = [&](const float* ptr, SkSpan<const uint16_t> offsets) {
- size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
- (N * sizeof(float));
-
- std::string src = "(" + PtrCtx(ptr, std::max(offsets.size(), highestComponent + 1)) +
- ").";
+ // Stringize a span of swizzle offsets to the textual equivalent (`xyzw`).
+ auto SwizzleOffsetSpan = [&](SkSpan<const uint16_t> offsets) {
+ std::string src;
for (uint16_t offset : offsets) {
if (offset == (0 * N * sizeof(float))) {
src.push_back('x');
@@ -1941,6 +1997,21 @@
return src;
};
+ // When we decode a swizzle, we don't know the slot width of the original value; that's not
+ // preserved in the instruction encoding. (e.g., myFloat4.y would be indistinguishable from
+ // myFloat2.y.) We do our best to make a readable dump using the data we have.
+ auto SwizzleWidth = [&](SkSpan<const uint16_t> offsets) {
+ size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
+ (N * sizeof(float));
+ size_t swizzleWidth = offsets.size();
+ return std::max(swizzleWidth, highestComponent + 1);
+ };
+
+ // Stringize a swizzled pointer.
+ auto SwizzlePtr = [&](const float* ptr, SkSpan<const uint16_t> offsets) {
+ return "(" + PtrCtx(ptr, SwizzleWidth(offsets)) + ")." + SwizzleOffsetSpan(offsets);
+ };
+
// Interpret the context value as a Swizzle structure.
auto SwizzleCtx = [&](ProgramOp op, const void* v) -> std::tuple<std::string, std::string> {
const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCtx*>(v);
@@ -1978,7 +2049,7 @@
return std::make_tuple(dst, src);
};
- std::string opArg1, opArg2, opArg3;
+ std::string opArg1, opArg2, opArg3, opSwizzle;
using POp = ProgramOp;
switch (stage.op) {
case POp::label:
@@ -2144,6 +2215,14 @@
opArg3 = PtrCtx(ctx->indirectOffset, 1);
break;
}
+ case POp::swizzle_copy_to_indirect_masked: {
+ const auto* ctx = static_cast<SkRasterPipeline_SwizzleCopyIndirectCtx*>(stage.ctx);
+ opArg1 = PtrCtx(ctx->dst, SwizzleWidth(SkSpan(ctx->offsets, ctx->slots)));
+ opArg2 = PtrCtx(ctx->src, ctx->slots);
+ opArg3 = PtrCtx(ctx->indirectOffset, 1);
+ opSwizzle = SwizzleOffsetSpan(SkSpan(ctx->offsets, ctx->slots));
+ break;
+ }
case POp::merge_condition_mask:
case POp::add_float: case POp::add_int:
case POp::sub_float: case POp::sub_int:
@@ -2441,6 +2520,11 @@
opText = "Indirect(" + opArg1 + " + " + opArg3 + ") = Mask(" + opArg2 + ")";
break;
+ case POp::swizzle_copy_to_indirect_masked:
+ opText = "Indirect(" + opArg1 + " + " + opArg3 + ")." + opSwizzle + " = Mask(" +
+ opArg2 + ")";
+ break;
+
case POp::zero_slot_unmasked: case POp::zero_2_slots_unmasked:
case POp::zero_3_slots_unmasked: case POp::zero_4_slots_unmasked:
opText = opArg1 + " = 0";
diff --git a/src/sksl/codegen/SkSLRasterPipelineBuilder.h b/src/sksl/codegen/SkSLRasterPipelineBuilder.h
index d814f43..7a30ade 100644
--- a/src/sksl/codegen/SkSLRasterPipelineBuilder.h
+++ b/src/sksl/codegen/SkSLRasterPipelineBuilder.h
@@ -97,6 +97,7 @@
copy_stack_to_slots_unmasked,
copy_stack_to_slots_indirect,
swizzle_copy_stack_to_slots,
+ swizzle_copy_stack_to_slots_indirect,
discard_stack,
select,
push_condition_mask,
@@ -405,6 +406,14 @@
SkSpan<const int8_t> components,
int offsetFromStackTop);
+ // Translates into swizzle_copy_to_indirect_masked (from temp stack to values) in Raster
+ // Pipeline. Does not discard any values on the temp stack.
+ void swizzle_copy_stack_to_slots_indirect(SlotRange fixedRange,
+ int dynamicStackID,
+ SlotRange limitRange,
+ SkSpan<const int8_t> components,
+ int offsetFromStackTop);
+
// Translates into copy_slots_unmasked (from temp stack to values) in Raster Pipeline.
// Does not discard any values on the temp stack.
void copy_stack_to_slots_unmasked(SlotRange dst) {
diff --git a/tests/RasterPipelineBuilderTest.cpp b/tests/RasterPipelineBuilderTest.cpp
index 3920e93..28d3c89 100644
--- a/tests/RasterPipelineBuilderTest.cpp
+++ b/tests/RasterPipelineBuilderTest.cpp
@@ -228,6 +228,10 @@
builder.push_slots_indirect(four_slots_at(10), /*dynamicStack=*/1, ten_slots_at(10));
builder.push_uniform_indirect(one_slot_at(0), /*dynamicStack=*/1, five_slots_at(0));
builder.push_uniform_indirect(three_slots_at(5), /*dynamicStack=*/1, five_slots_at(5));
+ builder.swizzle_copy_stack_to_slots_indirect(three_slots_at(6), /*dynamicStackID=*/1,
+ ten_slots_at(0), {2, 1, 0},
+ /*offsetFromStackTop=*/3);
+ builder.copy_stack_to_slots_indirect(three_slots_at(4), /*dynamicStackID=*/1, ten_slots_at(0));
builder.pop_slots_indirect(five_slots_at(0), /*dynamicStackID=*/1, ten_slots_at(0));
builder.pop_slots_indirect(five_slots_at(10), /*dynamicStackID=*/1, ten_slots_at(10));
builder.set_current_stack(1);
@@ -240,8 +244,10 @@
3. copy_from_indirect_unmasked $2..5 = Indirect(v10..13 + $10)
4. copy_from_indirect_uniform_unm $6 = Indirect(u0 + $10)
5. copy_from_indirect_uniform_unm $7..9 = Indirect(u5..7 + $10)
- 6. copy_to_indirect_masked Indirect(v0..4 + $10) = Mask($5..9)
- 7. copy_to_indirect_masked Indirect(v10..14 + $10) = Mask($0..4)
+ 6. swizzle_copy_to_indirect_maske Indirect(v6..8 + $10).zyx = Mask($7..9)
+ 7. copy_to_indirect_masked Indirect(v4..6 + $10) = Mask($7..9)
+ 8. copy_to_indirect_masked Indirect(v0..4 + $10) = Mask($5..9)
+ 9. copy_to_indirect_masked Indirect(v10..14 + $10) = Mask($0..4)
)");
}