Add RP builder support for swizzle-copy-to-slots-indirect.

The builder packs the operands of the new op into an instruction's slots and
immediates, and program generation unpacks them again into the stage context.
The operand layout is documented in comments at both sites.

Change-Id: Iad65e8d2cb548b0ca66129964eb1647e6b389acb
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/657636
Auto-Submit: John Stiles <johnstiles@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: John Stiles <johnstiles@google.com>
diff --git a/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp b/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
index b42a585..bdcafff 100644
--- a/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
+++ b/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
@@ -693,12 +693,51 @@
     }
 }
 
+static int max_packed_nybble(uint32_t components, size_t numComponents) {
+    // Scan the low `numComponents` nybbles of `components`; return the largest (0 if none).
+    int largest = 0;
+    for (; numComponents > 0; --numComponents, components >>= 4) {
+        largest = std::max(largest, static_cast<int>(components & 0xF));
+    }
+    return largest;
+}
+
 void Builder::swizzle_copy_stack_to_slots(SlotRange dst,
                                           SkSpan<const int8_t> components,
                                           int offsetFromStackTop) {
-    // An unmasked version of this op could squeeze out a little bit of extra speed, if needed.
+    // When the execution-mask writes-enabled flag is off, we could squeeze out a little bit of
+    // extra speed here by implementing and using an unmasked version of this op.
+
+    // SlotA: fixed-range start (the first slot of `dst`)
+    // immA: number of swizzle components
+    // immB: swizzle components (nybble-packed by pack_nybbles)
+    // immC: offset from stack top
     fInstructions.push_back({BuilderOp::swizzle_copy_stack_to_slots, {dst.index},
-                             (int)components.size(), offsetFromStackTop, pack_nybbles(components)});
+                             (int)components.size(),
+                             pack_nybbles(components),
+                             offsetFromStackTop});
+}
+
+void Builder::swizzle_copy_stack_to_slots_indirect(SlotRange fixedRange,
+                                                   int dynamicStackID,
+                                                   SlotRange limitRange,
+                                                   SkSpan<const int8_t> components,
+                                                   int offsetFromStackTop) {
+    // Only a masked form of this op exists. An unmasked form could be added, and used whenever
+    // the execution-mask writes-enabled flag is off, if the extra speed were ever needed.
+    const auto limitEnd = limitRange.index + limitRange.count;  // end of the limit range
+    const int numComponents = (int)components.size();
+
+    // Instruction encoding (decoded again by the matching `case` when the program is built):
+    //   SlotA = fixed-range start        SlotB = limit-range end
+    //   immA  = number of swizzle components
+    //   immB  = swizzle components (nybble-packed by pack_nybbles)
+    //   immC  = offset from stack top
+    //   immD  = dynamic stack ID
+    fInstructions.push_back({BuilderOp::swizzle_copy_stack_to_slots_indirect,
+                             {fixedRange.index, limitEnd},
+                             numComponents, pack_nybbles(components),
+                             offsetFromStackTop, dynamicStackID});
 }
 
 void Builder::swizzle(int consumedSlots, SkSpan<const int8_t> components) {
@@ -1553,11 +1592,15 @@
                 break;
             }
             case BuilderOp::swizzle_copy_stack_to_slots: {
+                // SlotA: fixed-range start
+                // immA: number of swizzle components
+                // immB: swizzle components
+                // immC: offset from stack top
                 auto stage = (ProgramOp)((int)ProgramOp::swizzle_copy_slot_masked + inst.fImmA - 1);
                 auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyCtx>();
-                ctx->src = tempStackPtr - (inst.fImmB * N);
+                ctx->src = tempStackPtr - (inst.fImmC * N);
                 ctx->dst = SlotA();
-                unpack_nybbles_to_offsets(inst.fImmC, SkSpan(ctx->offsets));
+                unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
                 pipeline->push_back({stage, ctx});
                 break;
             }
@@ -1594,6 +1637,25 @@
                 pipeline->push_back({ProgramOp::copy_from_indirect_unmasked, ctx});
                 break;
             }
+            case BuilderOp::swizzle_copy_stack_to_slots_indirect: {
+                // SlotA: fixed-range start; SlotB: limit-range end
+                // immA: number of swizzle components; immB: swizzle components (packed nybbles)
+                // immC: offset from stack top; immD: dynamic stack ID
+                auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyIndirectCtx>();
+                ctx->src = tempStackPtr - (inst.fImmC * N);
+                ctx->dst = SlotA();
+                // Dynamic offset source: one slot below stack immD's pointer (presumably its top).
+                ctx->indirectOffset =
+                        reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmD]) - (1 * N);
+                // Limit for the dynamic offset: the slot count from fixed start to limit end,
+                // minus the slots the swizzle itself touches (its highest component, plus one).
+                ctx->indirectLimit =
+                        inst.fSlotB - inst.fSlotA - (max_packed_nybble(inst.fImmB, inst.fImmA) + 1);
+                ctx->slots = inst.fImmA;
+                unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
+                pipeline->push_back({ProgramOp::swizzle_copy_to_indirect_masked, ctx});
+                break;
+            }
             case BuilderOp::case_op: {
                 auto* ctx = alloc->make<SkRasterPipeline_CaseOpCtx>();
                 ctx->ptr = reinterpret_cast<int*>(tempStackPtr - 2 * N);
@@ -1916,15 +1978,9 @@
             return Adjacent3PtrCtx(ctx->dst, numSlots);
         };
 
-        // Stringize a swizzled pointer. Note that the slot-width of the original expression is not
-        // preserved in the instruction encoding, so we need to do our best using the data we have.
-        // (e.g., myFloat4.y would be indistinguishable from myFloat2.y.)
-        auto SwizzlePtr = [&](const float* ptr, SkSpan<const uint16_t> offsets) {
-            size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
-                                      (N * sizeof(float));
-
-            std::string src = "(" + PtrCtx(ptr, std::max(offsets.size(), highestComponent + 1)) +
-                              ").";
+        // Stringize a span of swizzle offsets to the textual equivalent (`xyzw`).
+        auto SwizzleOffsetSpan = [&](SkSpan<const uint16_t> offsets) {
+            std::string src;
             for (uint16_t offset : offsets) {
                 if (offset == (0 * N * sizeof(float))) {
                     src.push_back('x');
@@ -1941,6 +1997,21 @@
             return src;
         };
 
+        // The instruction encoding does not preserve the slot width of the value being swizzled
+        // (e.g., myFloat4.y would be indistinguishable from myFloat2.y), so for a readable dump we
+        // estimate it: the width is at least the swizzle length, and at least large enough to
+        // hold the highest component that the offsets reference.
+        auto SwizzleWidth = [&](SkSpan<const uint16_t> offsets) {
+            size_t maxOffset = *std::max_element(offsets.begin(), offsets.end());
+            size_t highestComponent = maxOffset / (N * sizeof(float));
+            return std::max(offsets.size(), highestComponent + 1);
+        };
+
+        // Stringize a swizzled pointer as `(` + PtrCtx text + `).` + swizzle letters (`(...).zyx`).
+        auto SwizzlePtr = [&](const float* ptr, SkSpan<const uint16_t> offsets) {
+            return "(" + PtrCtx(ptr, SwizzleWidth(offsets)) + ")." + SwizzleOffsetSpan(offsets);
+        };
+
         // Interpret the context value as a Swizzle structure.
         auto SwizzleCtx = [&](ProgramOp op, const void* v) -> std::tuple<std::string, std::string> {
             const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCtx*>(v);
@@ -1978,7 +2049,7 @@
             return std::make_tuple(dst, src);
         };
 
-        std::string opArg1, opArg2, opArg3;
+        std::string opArg1, opArg2, opArg3, opSwizzle;
         using POp = ProgramOp;
         switch (stage.op) {
             case POp::label:
@@ -2144,6 +2215,14 @@
                 opArg3 = PtrCtx(ctx->indirectOffset, 1);
                 break;
             }
+            case POp::swizzle_copy_to_indirect_masked: {
+                const auto* ctx = static_cast<SkRasterPipeline_SwizzleCopyIndirectCtx*>(stage.ctx);
+                opArg1 = PtrCtx(ctx->dst, SwizzleWidth(SkSpan(ctx->offsets, ctx->slots)));
+                opArg2 = PtrCtx(ctx->src, ctx->slots);
+                opArg3 = PtrCtx(ctx->indirectOffset, 1);
+                opSwizzle = SwizzleOffsetSpan(SkSpan(ctx->offsets, ctx->slots));
+                break;
+            }
             case POp::merge_condition_mask:
             case POp::add_float:   case POp::add_int:
             case POp::sub_float:   case POp::sub_int:
@@ -2441,6 +2520,11 @@
                 opText = "Indirect(" + opArg1 + " + " + opArg3 + ") = Mask(" + opArg2 + ")";
                 break;
 
+            case POp::swizzle_copy_to_indirect_masked:
+                opText = "Indirect(" + opArg1 + " + " + opArg3 + ")." + opSwizzle + " = Mask(" +
+                         opArg2 + ")";
+                break;
+
             case POp::zero_slot_unmasked:    case POp::zero_2_slots_unmasked:
             case POp::zero_3_slots_unmasked: case POp::zero_4_slots_unmasked:
                 opText = opArg1 + " = 0";
diff --git a/src/sksl/codegen/SkSLRasterPipelineBuilder.h b/src/sksl/codegen/SkSLRasterPipelineBuilder.h
index d814f43..7a30ade 100644
--- a/src/sksl/codegen/SkSLRasterPipelineBuilder.h
+++ b/src/sksl/codegen/SkSLRasterPipelineBuilder.h
@@ -97,6 +97,7 @@
     copy_stack_to_slots_unmasked,
     copy_stack_to_slots_indirect,
     swizzle_copy_stack_to_slots,
+    swizzle_copy_stack_to_slots_indirect,
     discard_stack,
     select,
     push_condition_mask,
@@ -405,6 +406,14 @@
                                      SkSpan<const int8_t> components,
                                      int offsetFromStackTop);
 
+    // Translates into swizzle_copy_to_indirect_masked (from temp stack to values) in Raster
+    // Pipeline. Does not discard any values on the temp stack.
+    void swizzle_copy_stack_to_slots_indirect(SlotRange fixedRange,
+                                              int dynamicStackID,
+                                              SlotRange limitRange,
+                                              SkSpan<const int8_t> components,
+                                              int offsetFromStackTop);
+
     // Translates into copy_slots_unmasked (from temp stack to values) in Raster Pipeline.
     // Does not discard any values on the temp stack.
     void copy_stack_to_slots_unmasked(SlotRange dst) {
diff --git a/tests/RasterPipelineBuilderTest.cpp b/tests/RasterPipelineBuilderTest.cpp
index 3920e93..28d3c89 100644
--- a/tests/RasterPipelineBuilderTest.cpp
+++ b/tests/RasterPipelineBuilderTest.cpp
@@ -228,6 +228,10 @@
     builder.push_slots_indirect(four_slots_at(10), /*dynamicStack=*/1, ten_slots_at(10));
     builder.push_uniform_indirect(one_slot_at(0), /*dynamicStack=*/1, five_slots_at(0));
     builder.push_uniform_indirect(three_slots_at(5), /*dynamicStack=*/1, five_slots_at(5));
+    builder.swizzle_copy_stack_to_slots_indirect(three_slots_at(6), /*dynamicStackID=*/1,
+                                                 ten_slots_at(0), {2, 1, 0},
+                                                 /*offsetFromStackTop=*/3);
+    builder.copy_stack_to_slots_indirect(three_slots_at(4), /*dynamicStackID=*/1, ten_slots_at(0));
     builder.pop_slots_indirect(five_slots_at(0), /*dynamicStackID=*/1, ten_slots_at(0));
     builder.pop_slots_indirect(five_slots_at(10), /*dynamicStackID=*/1, ten_slots_at(10));
     builder.set_current_stack(1);
@@ -240,8 +244,10 @@
     3. copy_from_indirect_unmasked    $2..5 = Indirect(v10..13 + $10)
     4. copy_from_indirect_uniform_unm $6 = Indirect(u0 + $10)
     5. copy_from_indirect_uniform_unm $7..9 = Indirect(u5..7 + $10)
-    6. copy_to_indirect_masked        Indirect(v0..4 + $10) = Mask($5..9)
-    7. copy_to_indirect_masked        Indirect(v10..14 + $10) = Mask($0..4)
+    6. swizzle_copy_to_indirect_maske Indirect(v6..8 + $10).zyx = Mask($7..9)
+    7. copy_to_indirect_masked        Indirect(v4..6 + $10) = Mask($7..9)
+    8. copy_to_indirect_masked        Indirect(v0..4 + $10) = Mask($5..9)
+    9. copy_to_indirect_masked        Indirect(v10..14 + $10) = Mask($0..4)
 )");
 }