Check the trace-coordinate in SkRP when debug traces are enabled.

This CL adds a new boolean, `writeTraceOps`, to MakeRasterPipelineProgram to
enable trace-op writing. When it is true and a debug trace is supplied, we
generate a trace mask at the start of the program. (This mask isn't used
anywhere yet.)

Change-Id: Ifb8f13d8d793decfc35bcff4668cc5efab088fa0
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/661216
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: John Stiles <johnstiles@google.com>
Auto-Submit: John Stiles <johnstiles@google.com>
diff --git a/include/effects/SkRuntimeEffect.h b/include/effects/SkRuntimeEffect.h
index 81246d4..ef4fa11 100644
--- a/include/effects/SkRuntimeEffect.h
+++ b/include/effects/SkRuntimeEffect.h
@@ -36,6 +36,7 @@
 
 namespace SkSL {
 class DebugTrace;
+class DebugTracePriv;
 class ErrorReporter;
 class FunctionDefinition;
 struct Program;
@@ -309,7 +310,7 @@
     bool alwaysOpaque()       const { return (fFlags & kAlwaysOpaque_Flag);       }
 
     const SkFilterColorProgram* getFilterColorProgram() const;
-    const SkSL::RP::Program* getRPProgram() const;
+    const SkSL::RP::Program* getRPProgram(SkSL::DebugTracePriv* debugTrace) const;
 
 #if defined(SK_GANESH)
     friend class GrSkSLFP;             // fBaseProgram, fSampleUsages
diff --git a/src/core/SkRuntimeEffect.cpp b/src/core/SkRuntimeEffect.cpp
index c477fb1..5cce54d 100644
--- a/src/core/SkRuntimeEffect.cpp
+++ b/src/core/SkRuntimeEffect.cpp
@@ -198,17 +198,25 @@
     return data ? data : originalData;
 }
 
-const SkSL::RP::Program* SkRuntimeEffect::getRPProgram() const {
+const SkSL::RP::Program* SkRuntimeEffect::getRPProgram(SkSL::DebugTracePriv* debugTrace) const {
     // Lazily compile the program the first time `getRPProgram` is called.
     // By using an SkOnce, we avoid thread hazards and behave in a conceptually const way, but we
     // can avoid the cost of invoking the RP code generator until it's actually needed.
     fCompileRPProgramOnce([&] {
 #ifdef SK_ENABLE_SKSL_IN_RASTER_PIPELINE
-        SkSL::DebugTracePriv debugTrace;
-        const_cast<SkRuntimeEffect*>(this)->fRPProgram =
-                MakeRasterPipelineProgram(*fBaseProgram,
-                                          fMain,
-                                          kRPEnableLiveTrace ? &debugTrace : nullptr);
+        SkSL::DebugTracePriv tempDebugTrace;
+        if (debugTrace) {
+            const_cast<SkRuntimeEffect*>(this)->fRPProgram = MakeRasterPipelineProgram(
+                    *fBaseProgram, fMain, debugTrace, /*writeTraceOps=*/true);
+        } else if (kRPEnableLiveTrace) {
+            debugTrace = &tempDebugTrace;
+            const_cast<SkRuntimeEffect*>(this)->fRPProgram = MakeRasterPipelineProgram(
+                    *fBaseProgram, fMain, debugTrace, /*writeTraceOps=*/false);
+        } else {
+            const_cast<SkRuntimeEffect*>(this)->fRPProgram = MakeRasterPipelineProgram(
+                    *fBaseProgram, fMain, /*debugTrace=*/nullptr, /*writeTraceOps=*/false);
+        }
+
         if (kRPEnableLiveTrace) {
             if (fRPProgram) {
                 SkDebugf("-----\n\n");
@@ -345,8 +353,8 @@
     SkASSERT(flattenable_is_valid_as_child(fChild.get()));
 }
 
-static sk_sp<SkSL::DebugTracePriv> make_skvm_debug_trace(SkRuntimeEffect* effect,
-                                                         const SkIPoint& coord) {
+static sk_sp<SkSL::DebugTracePriv> make_debug_trace(SkRuntimeEffect* effect,
+                                                    const SkIPoint& coord) {
     auto debugTrace = sk_make_sp<SkSL::DebugTracePriv>();
     debugTrace->setSource(effect->source());
     debugTrace->setTraceCoord(coord);
@@ -1260,7 +1268,7 @@
             // usage in runtime effects to just #version 100.
             return false;
         }
-        if (const SkSL::RP::Program* program = fEffect->getRPProgram()) {
+        if (const SkSL::RP::Program* program = fEffect->getRPProgram(/*debugTrace=*/nullptr)) {
             SkSpan<const float> uniforms = uniforms_as_span(fEffect->uniforms(),
                                                             fUniforms,
                                                             rec.fDstCS,
@@ -1406,7 +1414,7 @@
 
     SkRuntimeEffect::TracedShader makeTracedClone(const SkIPoint& coord) {
         sk_sp<SkRuntimeEffect> unoptimized = fEffect->makeUnoptimizedClone();
-        sk_sp<SkSL::DebugTracePriv> debugTrace = make_skvm_debug_trace(unoptimized.get(), coord);
+        sk_sp<SkSL::DebugTracePriv> debugTrace = make_debug_trace(unoptimized.get(), coord);
         auto debugShader = sk_make_sp<SkRTShader>(
                 unoptimized, debugTrace, this->uniformData(nullptr), SkSpan(fChildren));
 
@@ -1477,11 +1485,7 @@
             // usage in runtime effects to just #version 100.
             return false;
         }
-        if (fDebugTrace) {
-            // SkRP doesn't support debug traces yet; fall back to SkVM until this is implemented.
-            return false;
-        }
-        if (const SkSL::RP::Program* program = fEffect->getRPProgram()) {
+        if (const SkSL::RP::Program* program = fEffect->getRPProgram(fDebugTrace.get())) {
             std::optional<MatrixRec> newMRec = mRec.apply(rec);
             if (!newMRec.has_value()) {
                 return false;
@@ -1636,7 +1640,7 @@
             // usage in runtime effects to just #version 100.
             return false;
         }
-        if (const SkSL::RP::Program* program = fEffect->getRPProgram()) {
+        if (const SkSL::RP::Program* program = fEffect->getRPProgram(/*debugTrace=*/nullptr)) {
             SkSpan<const float> uniforms = uniforms_as_span(fEffect->uniforms(),
                                                             fUniforms,
                                                             rec.fDstCS,
diff --git a/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.cpp b/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.cpp
index fc2d1e2..4d427c3 100644
--- a/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.cpp
+++ b/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.cpp
@@ -5,6 +5,7 @@
  * found in the LICENSE file.
  */
 
+#include "include/core/SkPoint.h"
 #include "include/core/SkSpan.h"
 #include "include/private/SkSLDefines.h"
 #include "include/private/SkSLIRNode.h"
@@ -79,6 +80,10 @@
     return false;
 }
 
+class AutoContinueMask;
+class Generator;
+class LValue;
+
 class SlotManager {
 public:
     SlotManager(std::vector<SlotDebugInfo>* i) : fSlotDebugInfo(i) {}
@@ -119,17 +124,48 @@
     std::vector<SlotDebugInfo>* fSlotDebugInfo;
 };
 
-class AutoContinueMask;
-class LValue;
+class AutoStack {
+public:
+    /**
+     * Creates a temporary stack. The caller is responsible for discarding every entry on this
+     * stack before ~AutoStack is reached.
+     */
+    explicit AutoStack(Generator* g);
+    ~AutoStack();
+
+    /** Activates the associated stack. */
+    void enter();
+
+    /** Undoes a call to `enter`, returning to the previously-active stack. */
+    void exit();
+
+    /** Returns the stack ID of this AutoStack. */
+    int stackID() const { return fStackID; }
+
+    /** Clones values from the top of this stack onto the active stack. */
+    void pushClone(int slots);
+
+    /** Clones values from a fixed range of this stack onto the active stack. */
+    void pushClone(SlotRange range, int offsetFromStackTop);
+
+    /** Clones values from a dynamic range of this stack onto the active stack. */
+    void pushCloneIndirect(SlotRange range, int dynamicStackID, int offsetFromStackTop);
+
+private:
+    Generator* fGenerator;
+    int fStackID = 0;
+    int fParentStackID = 0;
+};
 
 class Generator {
 public:
-    Generator(const SkSL::Program& program, DebugTracePriv* debugTrace)
+    Generator(const SkSL::Program& program, DebugTracePriv* debugTrace, bool writeTraceOps)
             : fProgram(program)
             , fContext(fProgram.fContext->fTypes,
                        fProgram.fContext->fCaps,
                        *fProgram.fContext->fErrors)
             , fDebugTrace(debugTrace)
+            , fWriteTraceOps(writeTraceOps)
             , fProgramSlots(debugTrace ? &debugTrace->fSlotInfo : nullptr)
             , fUniformSlots(debugTrace ? &debugTrace->fUniformInfo : nullptr) {
         fContext.fModifiersPool = &fModifiersPool;
@@ -137,6 +173,12 @@
         fContext.fModule = fProgram.fContext->fModule;
     }
 
+    ~Generator() {
+        // ~AutoStack calls into the Generator, so we need to make sure the trace mask is reset
+        // before the Generator is destroyed.
+        fTraceMask.reset();
+    }
+
     /** Converts the SkSL main() function into a set of Instructions. */
     bool writeProgram(const FunctionDefinition& function);
 
@@ -352,11 +394,13 @@
     SkSL::ModifiersPool fModifiersPool;
     Builder fBuilder;
     DebugTracePriv* fDebugTrace = nullptr;
+    bool fWriteTraceOps = false;
     SkTHashMap<const Variable*, int> fChildEffectMap;
 
     SlotManager fProgramSlots;
     SlotManager fUniformSlots;
 
+    std::optional<AutoStack> fTraceMask;
     const FunctionDefinition* fCurrentFunction = nullptr;
     SlotRange fCurrentFunctionResult;
     AutoContinueMask* fCurrentContinueMask = nullptr;
@@ -426,48 +470,36 @@
     friend class AutoContinueMask;
 };
 
-class AutoStack {
-public:
-    explicit AutoStack(Generator* g)
-            : fGenerator(g)
-            , fStackID(g->createStack()) {}
+AutoStack::AutoStack(Generator* g)
+        : fGenerator(g)
+        , fStackID(g->createStack()) {}
 
-    ~AutoStack() {
-        fGenerator->recycleStack(fStackID);
-    }
+AutoStack::~AutoStack() {
+    fGenerator->recycleStack(fStackID);
+}
 
-    void enter() {
-        fParentStackID = fGenerator->currentStack();
-        fGenerator->setCurrentStack(fStackID);
-    }
+void AutoStack::enter() {
+    fParentStackID = fGenerator->currentStack();
+    fGenerator->setCurrentStack(fStackID);
+}
 
-    void exit() {
-        SkASSERT(fGenerator->currentStack() == fStackID);
-        fGenerator->setCurrentStack(fParentStackID);
-    }
+void AutoStack::exit() {
+    SkASSERT(fGenerator->currentStack() == fStackID);
+    fGenerator->setCurrentStack(fParentStackID);
+}
 
-    void pushClone(int slots) {
-        this->pushClone(SlotRange{0, slots}, /*offsetFromStackTop=*/slots);
-    }
+void AutoStack::pushClone(int slots) {
+    this->pushClone(SlotRange{0, slots}, /*offsetFromStackTop=*/slots);
+}
 
-    void pushClone(SlotRange range, int offsetFromStackTop) {
-        fGenerator->builder()->push_clone_from_stack(range, fStackID, offsetFromStackTop);
-    }
+void AutoStack::pushClone(SlotRange range, int offsetFromStackTop) {
+    fGenerator->builder()->push_clone_from_stack(range, fStackID, offsetFromStackTop);
+}
 
-    void pushCloneIndirect(SlotRange range, int dynamicStackID, int offsetFromStackTop) {
-        fGenerator->builder()->push_clone_indirect_from_stack(
-                range, dynamicStackID, /*otherStackID=*/fStackID, offsetFromStackTop);
-    }
-
-    int stackID() const {
-        return fStackID;
-    }
-
-private:
-    Generator* fGenerator;
-    int fStackID = 0;
-    int fParentStackID = 0;
-};
+void AutoStack::pushCloneIndirect(SlotRange range, int dynamicStackID, int offsetFromStackTop) {
+    fGenerator->builder()->push_clone_indirect_from_stack(
+            range, dynamicStackID, /*otherStackID=*/fStackID, offsetFromStackTop);
+}
 
 class AutoContinueMask {
 public:
@@ -3360,7 +3392,24 @@
     if (fDebugTrace) {
         // Copy the program source into the debug info so that it will be written in the trace file.
         fDebugTrace->setSource(*fProgram.fSource);
+
+        // The Raster Pipeline blitter generates centered pixel coordinates. (0.5, 1.5, 2.5,
+        // etc.) Add 0.5 to the requested trace coordinate to match this, then compare against
+        // src.rg, which contains the shader's coordinates. We keep this result in a dedicated
+        // trace-mask stack.
+        if (fWriteTraceOps) {
+            fTraceMask.emplace(this);
+            fTraceMask->enter();
+            fBuilder.push_src_rgba();
+            fBuilder.discard_stack(2);
+            fBuilder.push_literal_f(fDebugTrace->fTraceCoord.fX + 0.5f);
+            fBuilder.push_literal_f(fDebugTrace->fTraceCoord.fY + 0.5f);
+            fBuilder.binary_op(BuilderOp::cmpeq_n_floats, 2);
+            fBuilder.binary_op(BuilderOp::bitwise_and_n_ints, 1);
+            fTraceMask->exit();
+        }
     }
+
     // Assign slots to the parameters of main; copy src and dst into those slots as appropriate.
     for (const SkSL::Variable* param : function.declaration().parameters()) {
         switch (param->modifiers().fLayout.fBuiltin) {
@@ -3421,6 +3470,14 @@
     } else {
         fBuilder.pop_src_rgba();
     }
+
+    // Discard the trace mask.
+    if (fTraceMask.has_value()) {
+        fTraceMask->enter();
+        fBuilder.discard_stack(1);
+        fTraceMask->exit();
+    }
+
     return true;
 }
 
@@ -3432,8 +3489,9 @@
 
 std::unique_ptr<RP::Program> MakeRasterPipelineProgram(const SkSL::Program& program,
                                                        const FunctionDefinition& function,
-                                                       DebugTracePriv* debugTrace) {
-    RP::Generator generator(program, debugTrace);
+                                                       DebugTracePriv* debugTrace,
+                                                       bool writeTraceOps) {
+    RP::Generator generator(program, debugTrace, writeTraceOps);
     if (!generator.writeProgram(function)) {
         return nullptr;
     }
diff --git a/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.h b/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.h
index af782fd..b50a670 100644
--- a/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.h
+++ b/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.h
@@ -25,7 +25,8 @@
 //   -- src/dst in src.rgba and dst.rgba for blenders
 std::unique_ptr<RP::Program> MakeRasterPipelineProgram(const Program& program,
                                                        const FunctionDefinition& function,
-                                                       DebugTracePriv* debugTrace = nullptr);
+                                                       DebugTracePriv* debugTrace = nullptr,
+                                                       bool writeTraceOps = false);
 
 }  // namespace SkSL
 
diff --git a/tests/sksl/runtime/ArrayIndexing.skrp b/tests/sksl/runtime/ArrayIndexing.skrp
index 15e08ca..c87719a 100644
--- a/tests/sksl/runtime/ArrayIndexing.skrp
+++ b/tests/sksl/runtime/ArrayIndexing.skrp
@@ -1,79 +1,84 @@
-    1. store_src_rg                   xy = src.rg
-    2. init_lane_masks                CondMask = LoopMask = RetMask = true
-    3. copy_constant                  $0 = u1[0]
-    4. label                          label 0x00000000
-    5. zero_slot_unmasked             sum = 0
-    6. copy_constant                  i = 0x00000003 (4.203895e-45)
-    7. label                          label 0x00000003
-    8. copy_slot_unmasked             $1 = sum
-    9. copy_slot_unmasked             $7 = i
-   10. copy_from_indirect_uniform_unm $2 = Indirect(u2[0] + $7)
-   11. add_float                      $1 += $2
-   12. copy_slot_unmasked             sum = $1
-   13. copy_slot_unmasked             $1 = i
-   14. copy_constant                  $2 = 0x00000001 (1.401298e-45)
-   15. sub_int                        $1 -= $2
-   16. copy_slot_unmasked             i = $1
-   17. zero_slot_unmasked             $1 = 0
-   18. copy_slot_unmasked             $2 = i
-   19. cmple_int                      $1 = lessThanEqual($1, $2)
-   20. stack_rewind
-   21. branch_if_no_active_lanes_eq   branch -14 (label 3 at #7) if no lanes of $1 == 0x00000000 (0.0)
-   22. label                          label 0x00000002
-   23. copy_slot_unmasked             $1 = sum
-   24. label                          label 0x00000001
-   25. copy_constant                  prod = 0x3F800000 (1.0)
-   26. zero_slot_unmasked             i₁ = 0
-   27. label                          label 0x00000006
-   28. copy_slot_unmasked             $2 = prod
-   29. copy_slot_unmasked             $7 = i₁
-   30. copy_constant                  $8 = 0x00000002 (2.802597e-45)
-   31. cmplt_int                      $7 = lessThan($7, $8)
-   32. copy_slot_unmasked             $8 = i₁
-   33. zero_slot_unmasked             $9 = 0
-   34. mix_int                        $7 = mix($8, $9, $7)
-   35. copy_from_indirect_uniform_unm $3 = Indirect(u3[0] + $7)
-   36. mul_float                      $2 *= $3
-   37. copy_slot_unmasked             prod = $2
-   38. copy_slot_unmasked             $2 = i₁
-   39. copy_constant                  $3 = 0x00000001 (1.401298e-45)
-   40. add_int                        $2 += $3
-   41. copy_slot_unmasked             i₁ = $2
-   42. copy_constant                  $3 = 0x00000004 (5.605194e-45)
-   43. cmplt_int                      $2 = lessThan($2, $3)
-   44. stack_rewind
-   45. branch_if_no_active_lanes_eq   branch -18 (label 6 at #27) if no lanes of $2 == 0x00000000 (0.0)
-   46. label                          label 0x00000005
-   47. copy_slot_unmasked             $2 = prod
-   48. label                          label 0x00000004
-   49. zero_slot_unmasked             sum₁ = 0
-   50. copy_constant                  f = 0xC0133333 (-2.3)
-   51. label                          label 0x0000000A
-   52. store_condition_mask           $3 = CondMask
-   53. zero_slot_unmasked             $4 = 0
-   54. copy_slot_unmasked             $5 = f
-   55. cmplt_float                    $4 = lessThan($4, $5)
-   56. copy_slot_unmasked             $5 = f
-   57. copy_constant                  $6 = 0x41800000 (16.0)
-   58. cmplt_float                    $5 = lessThan($5, $6)
-   59. bitwise_and_int                $4 &= $5
-   60. merge_condition_mask           CondMask = $3 & $4
-   61. copy_slot_unmasked             $5 = sum₁
-   62. copy_slot_unmasked             $7 = f
-   63. cast_to_int_from_float         $7 = FloatToInt($7)
-   64. copy_from_indirect_uniform_unm $6 = Indirect(u4[0] + $7)
-   65. sub_float                      $5 -= $6
-   66. copy_slot_masked               sum₁ = Mask($5)
-   67. load_condition_mask            CondMask = $3
-   68. copy_slot_unmasked             $3 = f
-   69. copy_constant                  $4 = 0x406CCCCD (3.7)
-   70. add_float                      $3 += $4
-   71. copy_slot_unmasked             f = $3
-   72. copy_constant                  $4 = 0x41880000 (17.0)
-   73. cmplt_float                    $3 = lessThan($3, $4)
-   74. stack_rewind
-   75. branch_if_no_active_lanes_eq   branch -24 (label 10 at #51) if no lanes of $3 == 0x00000000 (0.0)
-   76. label                          label 0x00000009
-   77. copy_slot_unmasked             $3 = sum₁
-   78. label                          label 0x00000008
-   79. load_src                       src.rgba = $0..3
+    1. store_src                      $10..13 = src.rgba
+    2. copy_constant                  $12 = 0x3F000000 (0.5)
+    3. copy_constant                  $13 = 0x3F000000 (0.5)
+    4. cmpeq_2_floats                 $10..11 = equal($10..11, $12..13)
+    5. bitwise_and_int                $10 &= $11
+    6. store_src_rg                   xy = src.rg
+    7. init_lane_masks                CondMask = LoopMask = RetMask = true
+    8. copy_constant                  $0 = u1[0]
+    9. label                          label 0x00000000
+   10. zero_slot_unmasked             sum = 0
+   11. copy_constant                  i = 0x00000003 (4.203895e-45)
+   12. label                          label 0x00000003
+   13. copy_slot_unmasked             $1 = sum
+   14. copy_slot_unmasked             $7 = i
+   15. copy_from_indirect_uniform_unm $2 = Indirect(u2[0] + $7)
+   16. add_float                      $1 += $2
+   17. copy_slot_unmasked             sum = $1
+   18. copy_slot_unmasked             $1 = i
+   19. copy_constant                  $2 = 0x00000001 (1.401298e-45)
+   20. sub_int                        $1 -= $2
+   21. copy_slot_unmasked             i = $1
+   22. zero_slot_unmasked             $1 = 0
+   23. copy_slot_unmasked             $2 = i
+   24. cmple_int                      $1 = lessThanEqual($1, $2)
+   25. stack_rewind
+   26. branch_if_no_active_lanes_eq   branch -14 (label 3 at #12) if no lanes of $1 == 0x00000000 (0.0)
+   27. label                          label 0x00000002
+   28. copy_slot_unmasked             $1 = sum
+   29. label                          label 0x00000001
+   30. copy_constant                  prod = 0x3F800000 (1.0)
+   31. zero_slot_unmasked             i₁ = 0
+   32. label                          label 0x00000006
+   33. copy_slot_unmasked             $2 = prod
+   34. copy_slot_unmasked             $7 = i₁
+   35. copy_constant                  $8 = 0x00000002 (2.802597e-45)
+   36. cmplt_int                      $7 = lessThan($7, $8)
+   37. copy_slot_unmasked             $8 = i₁
+   38. zero_slot_unmasked             $9 = 0
+   39. mix_int                        $7 = mix($8, $9, $7)
+   40. copy_from_indirect_uniform_unm $3 = Indirect(u3[0] + $7)
+   41. mul_float                      $2 *= $3
+   42. copy_slot_unmasked             prod = $2
+   43. copy_slot_unmasked             $2 = i₁
+   44. copy_constant                  $3 = 0x00000001 (1.401298e-45)
+   45. add_int                        $2 += $3
+   46. copy_slot_unmasked             i₁ = $2
+   47. copy_constant                  $3 = 0x00000004 (5.605194e-45)
+   48. cmplt_int                      $2 = lessThan($2, $3)
+   49. stack_rewind
+   50. branch_if_no_active_lanes_eq   branch -18 (label 6 at #32) if no lanes of $2 == 0x00000000 (0.0)
+   51. label                          label 0x00000005
+   52. copy_slot_unmasked             $2 = prod
+   53. label                          label 0x00000004
+   54. zero_slot_unmasked             sum₁ = 0
+   55. copy_constant                  f = 0xC0133333 (-2.3)
+   56. label                          label 0x0000000A
+   57. store_condition_mask           $3 = CondMask
+   58. zero_slot_unmasked             $4 = 0
+   59. copy_slot_unmasked             $5 = f
+   60. cmplt_float                    $4 = lessThan($4, $5)
+   61. copy_slot_unmasked             $5 = f
+   62. copy_constant                  $6 = 0x41800000 (16.0)
+   63. cmplt_float                    $5 = lessThan($5, $6)
+   64. bitwise_and_int                $4 &= $5
+   65. merge_condition_mask           CondMask = $3 & $4
+   66. copy_slot_unmasked             $5 = sum₁
+   67. copy_slot_unmasked             $7 = f
+   68. cast_to_int_from_float         $7 = FloatToInt($7)
+   69. copy_from_indirect_uniform_unm $6 = Indirect(u4[0] + $7)
+   70. sub_float                      $5 -= $6
+   71. copy_slot_masked               sum₁ = Mask($5)
+   72. load_condition_mask            CondMask = $3
+   73. copy_slot_unmasked             $3 = f
+   74. copy_constant                  $4 = 0x406CCCCD (3.7)
+   75. add_float                      $3 += $4
+   76. copy_slot_unmasked             f = $3
+   77. copy_constant                  $4 = 0x41880000 (17.0)
+   78. cmplt_float                    $3 = lessThan($3, $4)
+   79. stack_rewind
+   80. branch_if_no_active_lanes_eq   branch -24 (label 10 at #56) if no lanes of $3 == 0x00000000 (0.0)
+   81. label                          label 0x00000009
+   82. copy_slot_unmasked             $3 = sum₁
+   83. label                          label 0x00000008
+   84. load_src                       src.rgba = $0..3
diff --git a/tests/sksl/runtime/Commutative.skrp b/tests/sksl/runtime/Commutative.skrp
index 0dab6db..9ed5970 100644
--- a/tests/sksl/runtime/Commutative.skrp
+++ b/tests/sksl/runtime/Commutative.skrp
@@ -1,128 +1,133 @@
-    1. store_src_rg                   xy = src.rg
-    2. init_lane_masks                CondMask = LoopMask = RetMask = true
-    3. copy_constant                  ok = 0xFFFFFFFF
-    4. copy_constant                  $0 = testMatrix2x2(0)
-    5. cast_to_int_from_float         $0 = FloatToInt($0)
-    6. copy_slot_unmasked             a = $0
-    7. copy_2_constants               $0..1 = testMatrix2x2(0..1)
-    8. swizzle_1                      $0 = ($0..1).y
-    9. cast_to_int_from_float         $0 = FloatToInt($0)
-   10. copy_slot_unmasked             b = $0
-   11. copy_constant                  $0 = testMatrix2x2(2)
-   12. copy_slot_unmasked             c = $0
-   13. copy_2_constants               $0..1 = testMatrix2x2(2..3)
-   14. swizzle_1                      $0 = ($0..1).y
-   15. copy_slot_unmasked             d = $0
-   16. copy_2_slots_unmasked          $0..1 = a, b
-   17. bitwise_and_int                $0 &= $1
-   18. copy_slot_unmasked             a_and_b = $0
-   19. copy_slot_unmasked             $0 = b
-   20. copy_slot_unmasked             $1 = a
-   21. bitwise_and_int                $0 &= $1
-   22. copy_slot_unmasked             b_and_a = $0
-   23. copy_slot_unmasked             $0 = ok
-   24. copy_2_slots_unmasked          $1..2 = a_and_b, b_and_a
-   25. cmpeq_int                      $1 = equal($1, $2)
+    1. store_src                      $12..15 = src.rgba
+    2. copy_constant                  $14 = 0x3F000000 (0.5)
+    3. copy_constant                  $15 = 0x3F000000 (0.5)
+    4. cmpeq_2_floats                 $12..13 = equal($12..13, $14..15)
+    5. bitwise_and_int                $12 &= $13
+    6. store_src_rg                   xy = src.rg
+    7. init_lane_masks                CondMask = LoopMask = RetMask = true
+    8. copy_constant                  ok = 0xFFFFFFFF
+    9. copy_constant                  $0 = testMatrix2x2(0)
+   10. cast_to_int_from_float         $0 = FloatToInt($0)
+   11. copy_slot_unmasked             a = $0
+   12. copy_2_constants               $0..1 = testMatrix2x2(0..1)
+   13. swizzle_1                      $0 = ($0..1).y
+   14. cast_to_int_from_float         $0 = FloatToInt($0)
+   15. copy_slot_unmasked             b = $0
+   16. copy_constant                  $0 = testMatrix2x2(2)
+   17. copy_slot_unmasked             c = $0
+   18. copy_2_constants               $0..1 = testMatrix2x2(2..3)
+   19. swizzle_1                      $0 = ($0..1).y
+   20. copy_slot_unmasked             d = $0
+   21. copy_2_slots_unmasked          $0..1 = a, b
+   22. bitwise_and_int                $0 &= $1
+   23. copy_slot_unmasked             a_and_b = $0
+   24. copy_slot_unmasked             $0 = b
+   25. copy_slot_unmasked             $1 = a
    26. bitwise_and_int                $0 &= $1
-   27. copy_slot_unmasked             ok = $0
-   28. copy_2_slots_unmasked          $0..1 = a, b
-   29. bitwise_or_int                 $0 |= $1
-   30. copy_slot_unmasked             a_or_b = $0
-   31. copy_slot_unmasked             $0 = b
-   32. copy_slot_unmasked             $1 = a
-   33. bitwise_or_int                 $0 |= $1
-   34. copy_slot_unmasked             b_or_a = $0
-   35. copy_slot_unmasked             $0 = ok
-   36. copy_2_slots_unmasked          $1..2 = a_or_b, b_or_a
-   37. cmpeq_int                      $1 = equal($1, $2)
-   38. bitwise_and_int                $0 &= $1
-   39. copy_slot_unmasked             ok = $0
-   40. copy_2_slots_unmasked          $0..1 = a, b
-   41. bitwise_xor_int                $0 ^= $1
-   42. copy_slot_unmasked             a_xor_b = $0
-   43. copy_slot_unmasked             $0 = b
-   44. copy_slot_unmasked             $1 = a
-   45. bitwise_xor_int                $0 ^= $1
-   46. copy_slot_unmasked             b_xor_a = $0
-   47. copy_slot_unmasked             $0 = ok
-   48. copy_2_slots_unmasked          $1..2 = a_xor_b, b_xor_a
-   49. cmpeq_int                      $1 = equal($1, $2)
-   50. bitwise_and_int                $0 &= $1
-   51. copy_slot_unmasked             ok = $0
-   52. copy_2_slots_unmasked          $0..1 = a, b
-   53. cmpeq_int                      $0 = equal($0, $1)
-   54. copy_slot_unmasked             a_eq_b = $0
-   55. copy_slot_unmasked             $0 = b
-   56. copy_slot_unmasked             $1 = a
-   57. cmpeq_int                      $0 = equal($0, $1)
-   58. copy_slot_unmasked             b_eq_a = $0
-   59. copy_slot_unmasked             $0 = ok
-   60. copy_2_slots_unmasked          $1..2 = a_eq_b, b_eq_a
-   61. cmpeq_int                      $1 = equal($1, $2)
-   62. bitwise_and_int                $0 &= $1
-   63. copy_slot_unmasked             ok = $0
-   64. copy_2_slots_unmasked          $0..1 = a, b
-   65. cmpne_int                      $0 = notEqual($0, $1)
-   66. copy_slot_unmasked             a_neq_b = $0
-   67. copy_slot_unmasked             $0 = b
-   68. copy_slot_unmasked             $1 = a
-   69. cmpne_int                      $0 = notEqual($0, $1)
-   70. copy_slot_unmasked             b_neq_a = $0
-   71. copy_slot_unmasked             $0 = ok
-   72. copy_2_slots_unmasked          $1..2 = a_neq_b, b_neq_a
-   73. cmpeq_int                      $1 = equal($1, $2)
-   74. bitwise_and_int                $0 &= $1
-   75. copy_slot_unmasked             ok = $0
-   76. copy_2_slots_unmasked          $0..1 = a, b
-   77. add_int                        $0 += $1
-   78. copy_slot_unmasked             a_add_b = $0
-   79. copy_slot_unmasked             $0 = b
-   80. copy_slot_unmasked             $1 = a
-   81. add_int                        $0 += $1
-   82. copy_slot_unmasked             b_add_a = $0
-   83. copy_slot_unmasked             $0 = ok
-   84. copy_2_slots_unmasked          $1..2 = a_add_b, b_add_a
-   85. cmpeq_int                      $1 = equal($1, $2)
-   86. bitwise_and_int                $0 &= $1
-   87. copy_slot_unmasked             ok = $0
-   88. copy_2_slots_unmasked          $0..1 = c, d
-   89. add_float                      $0 += $1
-   90. copy_slot_unmasked             c_add_d = $0
-   91. copy_slot_unmasked             $0 = d
-   92. copy_slot_unmasked             $1 = c
-   93. add_float                      $0 += $1
-   94. copy_slot_unmasked             d_add_c = $0
-   95. copy_slot_unmasked             $0 = ok
-   96. copy_2_slots_unmasked          $1..2 = c_add_d, d_add_c
-   97. cmpeq_float                    $1 = equal($1, $2)
-   98. bitwise_and_int                $0 &= $1
-   99. copy_slot_unmasked             ok = $0
-  100. copy_2_slots_unmasked          $0..1 = a, b
-  101. mul_int                        $0 *= $1
-  102. copy_slot_unmasked             a_mul_b = $0
-  103. copy_slot_unmasked             $0 = b
-  104. copy_slot_unmasked             $1 = a
-  105. mul_int                        $0 *= $1
-  106. copy_slot_unmasked             b_mul_a = $0
-  107. copy_slot_unmasked             $0 = ok
-  108. copy_2_slots_unmasked          $1..2 = a_mul_b, b_mul_a
-  109. cmpeq_int                      $1 = equal($1, $2)
-  110. bitwise_and_int                $0 &= $1
-  111. copy_slot_unmasked             ok = $0
-  112. copy_2_slots_unmasked          $0..1 = c, d
-  113. mul_float                      $0 *= $1
-  114. copy_slot_unmasked             c_mul_d = $0
-  115. copy_slot_unmasked             $0 = d
-  116. copy_slot_unmasked             $1 = c
-  117. mul_float                      $0 *= $1
-  118. copy_slot_unmasked             d_mul_c = $0
-  119. copy_slot_unmasked             $0 = ok
-  120. copy_2_slots_unmasked          $1..2 = c_mul_d, d_mul_c
-  121. cmpeq_float                    $1 = equal($1, $2)
-  122. bitwise_and_int                $0 &= $1
-  123. copy_slot_unmasked             ok = $0
-  124. swizzle_4                      $0..3 = ($0..3).xxxx
-  125. copy_4_constants               $4..7 = colorRed
-  126. copy_4_constants               $8..11 = colorGreen
-  127. mix_4_ints                     $0..3 = mix($4..7, $8..11, $0..3)
-  128. load_src                       src.rgba = $0..3
+   27. copy_slot_unmasked             b_and_a = $0
+   28. copy_slot_unmasked             $0 = ok
+   29. copy_2_slots_unmasked          $1..2 = a_and_b, b_and_a
+   30. cmpeq_int                      $1 = equal($1, $2)
+   31. bitwise_and_int                $0 &= $1
+   32. copy_slot_unmasked             ok = $0
+   33. copy_2_slots_unmasked          $0..1 = a, b
+   34. bitwise_or_int                 $0 |= $1
+   35. copy_slot_unmasked             a_or_b = $0
+   36. copy_slot_unmasked             $0 = b
+   37. copy_slot_unmasked             $1 = a
+   38. bitwise_or_int                 $0 |= $1
+   39. copy_slot_unmasked             b_or_a = $0
+   40. copy_slot_unmasked             $0 = ok
+   41. copy_2_slots_unmasked          $1..2 = a_or_b, b_or_a
+   42. cmpeq_int                      $1 = equal($1, $2)
+   43. bitwise_and_int                $0 &= $1
+   44. copy_slot_unmasked             ok = $0
+   45. copy_2_slots_unmasked          $0..1 = a, b
+   46. bitwise_xor_int                $0 ^= $1
+   47. copy_slot_unmasked             a_xor_b = $0
+   48. copy_slot_unmasked             $0 = b
+   49. copy_slot_unmasked             $1 = a
+   50. bitwise_xor_int                $0 ^= $1
+   51. copy_slot_unmasked             b_xor_a = $0
+   52. copy_slot_unmasked             $0 = ok
+   53. copy_2_slots_unmasked          $1..2 = a_xor_b, b_xor_a
+   54. cmpeq_int                      $1 = equal($1, $2)
+   55. bitwise_and_int                $0 &= $1
+   56. copy_slot_unmasked             ok = $0
+   57. copy_2_slots_unmasked          $0..1 = a, b
+   58. cmpeq_int                      $0 = equal($0, $1)
+   59. copy_slot_unmasked             a_eq_b = $0
+   60. copy_slot_unmasked             $0 = b
+   61. copy_slot_unmasked             $1 = a
+   62. cmpeq_int                      $0 = equal($0, $1)
+   63. copy_slot_unmasked             b_eq_a = $0
+   64. copy_slot_unmasked             $0 = ok
+   65. copy_2_slots_unmasked          $1..2 = a_eq_b, b_eq_a
+   66. cmpeq_int                      $1 = equal($1, $2)
+   67. bitwise_and_int                $0 &= $1
+   68. copy_slot_unmasked             ok = $0
+   69. copy_2_slots_unmasked          $0..1 = a, b
+   70. cmpne_int                      $0 = notEqual($0, $1)
+   71. copy_slot_unmasked             a_neq_b = $0
+   72. copy_slot_unmasked             $0 = b
+   73. copy_slot_unmasked             $1 = a
+   74. cmpne_int                      $0 = notEqual($0, $1)
+   75. copy_slot_unmasked             b_neq_a = $0
+   76. copy_slot_unmasked             $0 = ok
+   77. copy_2_slots_unmasked          $1..2 = a_neq_b, b_neq_a
+   78. cmpeq_int                      $1 = equal($1, $2)
+   79. bitwise_and_int                $0 &= $1
+   80. copy_slot_unmasked             ok = $0
+   81. copy_2_slots_unmasked          $0..1 = a, b
+   82. add_int                        $0 += $1
+   83. copy_slot_unmasked             a_add_b = $0
+   84. copy_slot_unmasked             $0 = b
+   85. copy_slot_unmasked             $1 = a
+   86. add_int                        $0 += $1
+   87. copy_slot_unmasked             b_add_a = $0
+   88. copy_slot_unmasked             $0 = ok
+   89. copy_2_slots_unmasked          $1..2 = a_add_b, b_add_a
+   90. cmpeq_int                      $1 = equal($1, $2)
+   91. bitwise_and_int                $0 &= $1
+   92. copy_slot_unmasked             ok = $0
+   93. copy_2_slots_unmasked          $0..1 = c, d
+   94. add_float                      $0 += $1
+   95. copy_slot_unmasked             c_add_d = $0
+   96. copy_slot_unmasked             $0 = d
+   97. copy_slot_unmasked             $1 = c
+   98. add_float                      $0 += $1
+   99. copy_slot_unmasked             d_add_c = $0
+  100. copy_slot_unmasked             $0 = ok
+  101. copy_2_slots_unmasked          $1..2 = c_add_d, d_add_c
+  102. cmpeq_float                    $1 = equal($1, $2)
+  103. bitwise_and_int                $0 &= $1
+  104. copy_slot_unmasked             ok = $0
+  105. copy_2_slots_unmasked          $0..1 = a, b
+  106. mul_int                        $0 *= $1
+  107. copy_slot_unmasked             a_mul_b = $0
+  108. copy_slot_unmasked             $0 = b
+  109. copy_slot_unmasked             $1 = a
+  110. mul_int                        $0 *= $1
+  111. copy_slot_unmasked             b_mul_a = $0
+  112. copy_slot_unmasked             $0 = ok
+  113. copy_2_slots_unmasked          $1..2 = a_mul_b, b_mul_a
+  114. cmpeq_int                      $1 = equal($1, $2)
+  115. bitwise_and_int                $0 &= $1
+  116. copy_slot_unmasked             ok = $0
+  117. copy_2_slots_unmasked          $0..1 = c, d
+  118. mul_float                      $0 *= $1
+  119. copy_slot_unmasked             c_mul_d = $0
+  120. copy_slot_unmasked             $0 = d
+  121. copy_slot_unmasked             $1 = c
+  122. mul_float                      $0 *= $1
+  123. copy_slot_unmasked             d_mul_c = $0
+  124. copy_slot_unmasked             $0 = ok
+  125. copy_2_slots_unmasked          $1..2 = c_mul_d, d_mul_c
+  126. cmpeq_float                    $1 = equal($1, $2)
+  127. bitwise_and_int                $0 &= $1
+  128. copy_slot_unmasked             ok = $0
+  129. swizzle_4                      $0..3 = ($0..3).xxxx
+  130. copy_4_constants               $4..7 = colorRed
+  131. copy_4_constants               $8..11 = colorGreen
+  132. mix_4_ints                     $0..3 = mix($4..7, $8..11, $0..3)
+  133. load_src                       src.rgba = $0..3
diff --git a/tests/sksl/runtime/GlobalVariables.skrp b/tests/sksl/runtime/GlobalVariables.skrp
index d513305..427c2a5 100644
--- a/tests/sksl/runtime/GlobalVariables.skrp
+++ b/tests/sksl/runtime/GlobalVariables.skrp
@@ -1,13 +1,18 @@
-    1. store_src_rg                   xy = src.rg
-    2. init_lane_masks                CondMask = LoopMask = RetMask = true
-    3. copy_constant                  gInitialized = 0xBF800000 (-1.0)
-    4. copy_constant                  gInitializedFromOther = 0x3F800000 (1.0)
-    5. zero_slot_unmasked             gUninitialized = 0
-    6. copy_constant                  $0 = 0x3F800000 (1.0)
-    7. copy_slot_unmasked             gUninitialized = $0
-    8. label                          label 0x00000000
-    9. zero_slot_unmasked             $0 = 0
-   10. copy_slot_unmasked             $1 = gInitializedFromOther
-   11. zero_slot_unmasked             $2 = 0
-   12. copy_slot_unmasked             $3 = gUninitialized
-   13. load_src                       src.rgba = $0..3
+    1. store_src                      $4..7 = src.rgba
+    2. copy_constant                  $6 = 0x3F000000 (0.5)
+    3. copy_constant                  $7 = 0x3F000000 (0.5)
+    4. cmpeq_2_floats                 $4..5 = equal($4..5, $6..7)
+    5. bitwise_and_int                $4 &= $5
+    6. store_src_rg                   xy = src.rg
+    7. init_lane_masks                CondMask = LoopMask = RetMask = true
+    8. copy_constant                  gInitialized = 0xBF800000 (-1.0)
+    9. copy_constant                  gInitializedFromOther = 0x3F800000 (1.0)
+   10. zero_slot_unmasked             gUninitialized = 0
+   11. copy_constant                  $0 = 0x3F800000 (1.0)
+   12. copy_slot_unmasked             gUninitialized = $0
+   13. label                          label 0x00000000
+   14. zero_slot_unmasked             $0 = 0
+   15. copy_slot_unmasked             $1 = gInitializedFromOther
+   16. zero_slot_unmasked             $2 = 0
+   17. copy_slot_unmasked             $3 = gUninitialized
+   18. load_src                       src.rgba = $0..3
diff --git a/tests/sksl/runtime/LoopFloat.skrp b/tests/sksl/runtime/LoopFloat.skrp
index 7f42156..ced06f5 100644
--- a/tests/sksl/runtime/LoopFloat.skrp
+++ b/tests/sksl/runtime/LoopFloat.skrp
@@ -1,361 +1,366 @@
-    1. store_src_rg                   pos = src.rg
-    2. init_lane_masks                CondMask = LoopMask = RetMask = true
-    3. zero_slot_unmasked             kZero = 0
-    4. copy_constant                  kTen = 0x41200000 (10.0)
-    5. copy_slot_unmasked             $2 = pos(0)
-    6. copy_constant                  $3 = colorGreen(1)
-    7. max_float                      $2 = max($2, $3)
-    8. copy_constant                  $3 = colorGreen(3)
-    9. min_float                      $2 = min($2, $3)
-   10. copy_constant                  $3 = 0x40A00000 (5.0)
-   11. mul_float                      $2 *= $3
-   12. copy_slot_unmasked             five = $2
-   13. store_condition_mask           $26 = CondMask
-   14. store_condition_mask           $46 = CondMask
-   15. store_condition_mask           $56 = CondMask
-   16. store_condition_mask           $36 = CondMask
-   17. store_condition_mask           $76 = CondMask
-   18. store_condition_mask           $66 = CondMask
-   19. store_condition_mask           $20 = CondMask
-   20. store_condition_mask           $70 = CondMask
-   21. store_condition_mask           $14 = CondMask
-   22. branch_if_no_lanes_active      branch_if_no_lanes_active +33 (label 10 at #55)
-   23. store_return_mask              $15 = RetMask
-   24. copy_slot_unmasked             five₁ = five
-   25. copy_slot_unmasked             i = kZero
-   26. store_loop_mask                $16 = LoopMask
-   27. jump                           jump +15 (label 12 at #42)
-   28. label                          label 0x0000000D
-   29. store_condition_mask           $17 = CondMask
-   30. copy_slot_unmasked             $18 = i
-   31. copy_slot_unmasked             $19 = five₁
-   32. cmpeq_float                    $18 = equal($18, $19)
-   33. merge_condition_mask           CondMask = $17 & $18
-   34. copy_slot_unmasked             $19 = i
-   35. copy_slot_masked               [return_loop].result = Mask($19)
-   36. mask_off_return_mask           RetMask &= ~(CondMask & LoopMask & RetMask)
-   37. load_condition_mask            CondMask = $17
-   38. copy_slot_unmasked             $17 = i
-   39. copy_constant                  $18 = 0x3F800000 (1.0)
-   40. add_float                      $17 += $18
-   41. copy_slot_masked               i = Mask($17)
-   42. label                          label 0x0000000C
-   43. copy_slot_unmasked             $17 = i
-   44. copy_constant                  $18 = 0x41200000 (10.0)
-   45. cmplt_float                    $17 = lessThan($17, $18)
-   46. merge_loop_mask                LoopMask &= $17
-   47. stack_rewind
-   48. branch_if_any_lanes_active     branch_if_any_lanes_active -20 (label 13 at #28)
-   49. label                          label 0x0000000B
-   50. load_loop_mask                 LoopMask = $16
-   51. zero_slot_unmasked             $16 = 0
-   52. copy_slot_masked               [return_loop].result = Mask($16)
-   53. load_return_mask               RetMask = $15
-   54. copy_slot_unmasked             $15 = [return_loop].result
-   55. label                          label 0x0000000A
-   56. copy_constant                  $16 = 0x40A00000 (5.0)
-   57. cmpeq_float                    $15 = equal($15, $16)
-   58. zero_slot_unmasked             $71 = 0
-   59. merge_condition_mask           CondMask = $14 & $15
-   60. branch_if_no_lanes_active      branch_if_no_lanes_active +38 (label 9 at #98)
-   61. copy_slot_unmasked             five₂ = five
-   62. zero_2_slots_unmasked          sum, i₁ = 0
-   63. store_loop_mask                $72 = LoopMask
-   64. jump                           jump +20 (label 16 at #84)
-   65. label                          label 0x00000011
-   66. zero_slot_unmasked             $0 = 0
-   67. store_condition_mask           $73 = CondMask
-   68. copy_slot_unmasked             $74 = i₁
-   69. copy_slot_unmasked             $75 = five₂
-   70. cmplt_float                    $74 = lessThan($74, $75)
-   71. merge_condition_mask           CondMask = $73 & $74
-   72. copy_constant                  $1 = 0xFFFFFFFF
-   73. copy_slot_masked               $0 = Mask($1)
-   74. mask_off_loop_mask             LoopMask &= ~(CondMask & LoopMask & RetMask)
-   75. load_condition_mask            CondMask = $73
-   76. copy_2_slots_unmasked          $73..74 = sum, i₁
-   77. add_float                      $73 += $74
-   78. copy_slot_masked               sum = Mask($73)
-   79. reenable_loop_mask             LoopMask |= $0
-   80. copy_slot_unmasked             $73 = i₁
-   81. copy_constant                  $74 = 0x3F800000 (1.0)
-   82. add_float                      $73 += $74
-   83. copy_slot_masked               i₁ = Mask($73)
-   84. label                          label 0x00000010
-   85. copy_slot_unmasked             $73 = i₁
-   86. copy_slot_unmasked             $74 = kTen
-   87. cmplt_float                    $73 = lessThan($73, $74)
-   88. merge_loop_mask                LoopMask &= $73
-   89. stack_rewind
-   90. branch_if_any_lanes_active     branch_if_any_lanes_active -25 (label 17 at #65)
-   91. label                          label 0x0000000F
-   92. load_loop_mask                 LoopMask = $72
-   93. copy_slot_unmasked             $72 = sum
-   94. label                          label 0x0000000E
-   95. copy_constant                  $73 = 0x420C0000 (35.0)
-   96. cmpeq_float                    $72 = equal($72, $73)
-   97. copy_slot_masked               $71 = Mask($72)
-   98. label                          label 0x00000009
-   99. load_condition_mask            CondMask = $14
-  100. zero_slot_unmasked             $21 = 0
-  101. merge_condition_mask           CondMask = $70 & $71
-  102. branch_if_no_lanes_active      branch_if_no_lanes_active +38 (label 8 at #140)
-  103. copy_slot_unmasked             five₃ = five
-  104. zero_slot_unmasked             sum₁ = 0
-  105. copy_constant                  kOne = 0x3F800000 (1.0)
-  106. zero_slot_unmasked             i₂ = 0
-  107. store_loop_mask                $22 = LoopMask
-  108. jump                           jump +18 (label 20 at #126)
-  109. label                          label 0x00000015
-  110. store_condition_mask           $23 = CondMask
-  111. copy_slot_unmasked             $24 = five₃
-  112. copy_slot_unmasked             $25 = i₂
-  113. cmplt_float                    $24 = lessThan($24, $25)
-  114. merge_condition_mask           CondMask = $23 & $24
-  115. branch_if_all_lanes_active     branch_if_all_lanes_active +18 (label 19 at #133)
-  116. mask_off_loop_mask             LoopMask &= ~(CondMask & LoopMask & RetMask)
-  117. load_condition_mask            CondMask = $23
-  118. copy_slot_unmasked             $23 = sum₁
-  119. copy_slot_unmasked             $24 = i₂
-  120. add_float                      $23 += $24
-  121. copy_slot_masked               sum₁ = Mask($23)
-  122. copy_slot_unmasked             $23 = i₂
-  123. copy_slot_unmasked             $24 = kOne
-  124. add_float                      $23 += $24
-  125. copy_slot_masked               i₂ = Mask($23)
-  126. label                          label 0x00000014
-  127. copy_slot_unmasked             $23 = i₂
-  128. copy_constant                  $24 = 0x41200000 (10.0)
-  129. cmplt_float                    $23 = lessThan($23, $24)
-  130. merge_loop_mask                LoopMask &= $23
-  131. stack_rewind
-  132. branch_if_any_lanes_active     branch_if_any_lanes_active -23 (label 21 at #109)
-  133. label                          label 0x00000013
-  134. load_loop_mask                 LoopMask = $22
-  135. copy_slot_unmasked             $22 = sum₁
-  136. label                          label 0x00000012
-  137. copy_constant                  $23 = 0x41700000 (15.0)
-  138. cmpeq_float                    $22 = equal($22, $23)
-  139. copy_slot_masked               $21 = Mask($22)
-  140. label                          label 0x00000008
-  141. load_condition_mask            CondMask = $70
-  142. zero_slot_unmasked             $67 = 0
-  143. merge_condition_mask           CondMask = $20 & $21
-  144. branch_if_no_lanes_active      branch_if_no_lanes_active +25 (label 7 at #169)
-  145. zero_slot_unmasked             sum₂ = 0
-  146. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 23 at #160)
-  147. copy_constant                  i₃ = 0x3DFBE76D (0.123)
-  148. label                          label 0x00000018
-  149. copy_2_slots_unmasked          $68..69 = sum₂, i₃
-  150. add_float                      $68 += $69
-  151. copy_slot_masked               sum₂ = Mask($68)
-  152. copy_slot_unmasked             $68 = i₃
-  153. copy_constant                  $69 = 0x3DE353F8 (0.111)
-  154. add_float                      $68 += $69
-  155. copy_slot_masked               i₃ = Mask($68)
-  156. copy_constant                  $69 = 0x3F19999A (0.6)
-  157. cmplt_float                    $68 = lessThan($68, $69)
-  158. stack_rewind
-  159. branch_if_no_active_lanes_eq   branch -11 (label 24 at #148) if no lanes of $68 == 0x00000000 (0.0)
-  160. label                          label 0x00000017
-  161. copy_slot_unmasked             $68 = sum₂
-  162. copy_constant                  $69 = 0x3FDCCCCD (1.725)
-  163. sub_float                      $68 -= $69
-  164. label                          label 0x00000016
-  165. abs_float                      $68 = abs($68)
-  166. copy_constant                  $69 = 0x3CCCCCCD (0.025)
-  167. cmplt_float                    $68 = lessThan($68, $69)
-  168. copy_slot_masked               $67 = Mask($68)
-  169. label                          label 0x00000007
-  170. load_condition_mask            CondMask = $20
-  171. zero_slot_unmasked             $77 = 0
-  172. merge_condition_mask           CondMask = $66 & $67
-  173. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 6 at #201)
-  174. copy_constant                  $78 = 0x41100000 (9.0)
-  175. swizzle_4                      $78..81 = ($78..81).xxxx
-  176. copy_4_slots_unmasked          result = $78..81
-  177. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 26 at #190)
-  178. copy_constant                  i₄ = 0x3F800000 (1.0)
-  179. label                          label 0x0000001B
-  180. copy_4_slots_unmasked          $78..81 = result(1..3), i₄
-  181. copy_4_slots_masked            result = Mask($78..81)
-  182. copy_slot_unmasked             $78 = i₄
-  183. copy_constant                  $79 = 0x3F800000 (1.0)
-  184. add_float                      $78 += $79
-  185. copy_slot_masked               i₄ = Mask($78)
-  186. copy_constant                  $79 = 0x40400000 (3.0)
-  187. cmple_float                    $78 = lessThanEqual($78, $79)
-  188. stack_rewind
-  189. branch_if_no_active_lanes_eq   branch -10 (label 27 at #179) if no lanes of $78 == 0x00000000 (0.0)
-  190. label                          label 0x0000001A
-  191. copy_4_slots_unmasked          $78..81 = result
-  192. copy_constant                  $82 = 0x41100000 (9.0)
-  193. copy_constant                  $83 = 0x3F800000 (1.0)
-  194. copy_constant                  $84 = 0x40000000 (2.0)
-  195. copy_constant                  $85 = 0x40400000 (3.0)
-  196. cmpeq_4_floats                 $78..81 = equal($78..81, $82..85)
-  197. bitwise_and_2_ints             $78..79 &= $80..81
-  198. bitwise_and_int                $78 &= $79
-  199. label                          label 0x00000019
-  200. copy_slot_masked               $77 = Mask($78)
-  201. label                          label 0x00000006
-  202. load_condition_mask            CondMask = $66
-  203. zero_slot_unmasked             $37 = 0
-  204. merge_condition_mask           CondMask = $76 & $77
-  205. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 5 at #233)
-  206. copy_constant                  $38 = 0x41100000 (9.0)
-  207. swizzle_4                      $38..41 = ($38..41).xxxx
-  208. copy_4_slots_unmasked          result₁ = $38..41
-  209. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 29 at #222)
-  210. copy_constant                  i₅ = 0x3F800000 (1.0)
-  211. label                          label 0x0000001E
-  212. copy_4_slots_unmasked          $38..41 = result₁(1..3), i₅
-  213. copy_4_slots_masked            result₁ = Mask($38..41)
-  214. copy_slot_unmasked             $38 = i₅
-  215. copy_constant                  $39 = 0x3F800000 (1.0)
-  216. add_float                      $38 += $39
-  217. copy_slot_masked               i₅ = Mask($38)
-  218. copy_constant                  $39 = 0x40800000 (4.0)
-  219. cmplt_float                    $38 = lessThan($38, $39)
-  220. stack_rewind
-  221. branch_if_no_active_lanes_eq   branch -10 (label 30 at #211) if no lanes of $38 == 0x00000000 (0.0)
-  222. label                          label 0x0000001D
-  223. copy_4_slots_unmasked          $38..41 = result₁
-  224. copy_constant                  $42 = 0x41100000 (9.0)
-  225. copy_constant                  $43 = 0x3F800000 (1.0)
-  226. copy_constant                  $44 = 0x40000000 (2.0)
-  227. copy_constant                  $45 = 0x40400000 (3.0)
-  228. cmpeq_4_floats                 $38..41 = equal($38..41, $42..45)
-  229. bitwise_and_2_ints             $38..39 &= $40..41
-  230. bitwise_and_int                $38 &= $39
-  231. label                          label 0x0000001C
-  232. copy_slot_masked               $37 = Mask($38)
-  233. label                          label 0x00000005
-  234. load_condition_mask            CondMask = $76
-  235. zero_slot_unmasked             $57 = 0
-  236. merge_condition_mask           CondMask = $36 & $37
-  237. branch_if_no_lanes_active      branch_if_no_lanes_active +29 (label 4 at #266)
-  238. copy_constant                  $58 = 0x41100000 (9.0)
-  239. swizzle_4                      $58..61 = ($58..61).xxxx
-  240. copy_4_slots_unmasked          result₂ = $58..61
-  241. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 32 at #255)
-  242. copy_constant                  i₆ = 0x40400000 (3.0)
-  243. label                          label 0x00000021
-  244. copy_4_slots_unmasked          $58..61 = result₂(1..3), i₆
-  245. copy_4_slots_masked            result₂ = Mask($58..61)
-  246. copy_slot_unmasked             $58 = i₆
-  247. copy_constant                  $59 = 0x3F800000 (1.0)
-  248. sub_float                      $58 -= $59
-  249. copy_slot_masked               i₆ = Mask($58)
-  250. copy_constant                  $58 = 0x3F800000 (1.0)
-  251. copy_slot_unmasked             $59 = i₆
-  252. cmple_float                    $58 = lessThanEqual($58, $59)
-  253. stack_rewind
-  254. branch_if_no_active_lanes_eq   branch -11 (label 33 at #243) if no lanes of $58 == 0x00000000 (0.0)
-  255. label                          label 0x00000020
-  256. copy_4_slots_unmasked          $58..61 = result₂
-  257. copy_constant                  $62 = 0x41100000 (9.0)
-  258. copy_constant                  $63 = 0x40400000 (3.0)
-  259. copy_constant                  $64 = 0x40000000 (2.0)
-  260. copy_constant                  $65 = 0x3F800000 (1.0)
-  261. cmpeq_4_floats                 $58..61 = equal($58..61, $62..65)
-  262. bitwise_and_2_ints             $58..59 &= $60..61
-  263. bitwise_and_int                $58 &= $59
-  264. label                          label 0x0000001F
-  265. copy_slot_masked               $57 = Mask($58)
-  266. label                          label 0x00000004
-  267. load_condition_mask            CondMask = $36
-  268. zero_slot_unmasked             $47 = 0
-  269. merge_condition_mask           CondMask = $56 & $57
-  270. branch_if_no_lanes_active      branch_if_no_lanes_active +29 (label 3 at #299)
-  271. copy_constant                  $48 = 0x41100000 (9.0)
-  272. swizzle_4                      $48..51 = ($48..51).xxxx
-  273. copy_4_slots_unmasked          result₃ = $48..51
-  274. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 35 at #288)
-  275. copy_constant                  i₇ = 0x40400000 (3.0)
-  276. label                          label 0x00000024
-  277. copy_4_slots_unmasked          $48..51 = result₃(1..3), i₇
-  278. copy_4_slots_masked            result₃ = Mask($48..51)
-  279. copy_slot_unmasked             $48 = i₇
-  280. copy_constant                  $49 = 0x3F800000 (1.0)
-  281. sub_float                      $48 -= $49
-  282. copy_slot_masked               i₇ = Mask($48)
-  283. zero_slot_unmasked             $48 = 0
-  284. copy_slot_unmasked             $49 = i₇
-  285. cmplt_float                    $48 = lessThan($48, $49)
-  286. stack_rewind
-  287. branch_if_no_active_lanes_eq   branch -11 (label 36 at #276) if no lanes of $48 == 0x00000000 (0.0)
-  288. label                          label 0x00000023
-  289. copy_4_slots_unmasked          $48..51 = result₃
-  290. copy_constant                  $52 = 0x41100000 (9.0)
-  291. copy_constant                  $53 = 0x40400000 (3.0)
-  292. copy_constant                  $54 = 0x40000000 (2.0)
-  293. copy_constant                  $55 = 0x3F800000 (1.0)
-  294. cmpeq_4_floats                 $48..51 = equal($48..51, $52..55)
-  295. bitwise_and_2_ints             $48..49 &= $50..51
-  296. bitwise_and_int                $48 &= $49
-  297. label                          label 0x00000022
-  298. copy_slot_masked               $47 = Mask($48)
-  299. label                          label 0x00000003
-  300. load_condition_mask            CondMask = $56
-  301. zero_slot_unmasked             $27 = 0
-  302. merge_condition_mask           CondMask = $46 & $47
-  303. branch_if_no_lanes_active      branch_if_no_lanes_active +20 (label 2 at #323)
-  304. copy_constant                  $28 = 0x41100000 (9.0)
-  305. swizzle_4                      $28..31 = ($28..31).xxxx
-  306. copy_4_slots_unmasked          result₄ = $28..31
-  307. branch_if_no_lanes_active      branch_if_no_lanes_active +5 (label 38 at #312)
-  308. copy_constant                  i₈ = 0x3F800000 (1.0)
-  309. label                          label 0x00000027
-  310. copy_4_slots_unmasked          $28..31 = result₄(1..3), i₈
-  311. copy_4_slots_masked            result₄ = Mask($28..31)
-  312. label                          label 0x00000026
-  313. copy_4_slots_unmasked          $28..31 = result₄
-  314. copy_constant                  $32 = 0x41100000 (9.0)
-  315. copy_constant                  $33 = 0x41100000 (9.0)
-  316. copy_constant                  $34 = 0x41100000 (9.0)
-  317. copy_constant                  $35 = 0x3F800000 (1.0)
-  318. cmpeq_4_floats                 $28..31 = equal($28..31, $32..35)
-  319. bitwise_and_2_ints             $28..29 &= $30..31
-  320. bitwise_and_int                $28 &= $29
-  321. label                          label 0x00000025
-  322. copy_slot_masked               $27 = Mask($28)
-  323. label                          label 0x00000002
-  324. load_condition_mask            CondMask = $46
-  325. zero_slot_unmasked             $2 = 0
-  326. merge_condition_mask           CondMask = $26 & $27
-  327. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 1 at #355)
-  328. copy_constant                  $3 = 0x41100000 (9.0)
-  329. swizzle_4                      $3..6 = ($3..6).xxxx
-  330. copy_4_slots_unmasked          result₅ = $3..6
-  331. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 41 at #344)
-  332. copy_constant                  i₉ = 0x3F800000 (1.0)
-  333. label                          label 0x0000002A
-  334. copy_4_slots_unmasked          $3..6 = result₅(1..3), i₉
-  335. copy_4_slots_masked            result₅ = Mask($3..6)
-  336. copy_slot_unmasked             $3 = i₉
-  337. copy_constant                  $4 = 0x3F800000 (1.0)
-  338. add_float                      $3 += $4
-  339. copy_slot_masked               i₉ = Mask($3)
-  340. copy_constant                  $4 = 0x40800000 (4.0)
-  341. cmpne_float                    $3 = notEqual($3, $4)
-  342. stack_rewind
-  343. branch_if_no_active_lanes_eq   branch -10 (label 42 at #333) if no lanes of $3 == 0x00000000 (0.0)
-  344. label                          label 0x00000029
-  345. copy_4_slots_unmasked          $3..6 = result₅
-  346. copy_constant                  $7 = 0x41100000 (9.0)
-  347. copy_constant                  $8 = 0x3F800000 (1.0)
-  348. copy_constant                  $9 = 0x40000000 (2.0)
-  349. copy_constant                  $10 = 0x40400000 (3.0)
-  350. cmpeq_4_floats                 $3..6 = equal($3..6, $7..10)
-  351. bitwise_and_2_ints             $3..4 &= $5..6
-  352. bitwise_and_int                $3 &= $4
-  353. label                          label 0x00000028
-  354. copy_slot_masked               $2 = Mask($3)
-  355. label                          label 0x00000001
-  356. load_condition_mask            CondMask = $26
-  357. swizzle_4                      $2..5 = ($2..5).xxxx
-  358. copy_4_constants               $6..9 = colorRed
-  359. copy_4_constants               $10..13 = colorGreen
-  360. mix_4_ints                     $2..5 = mix($6..9, $10..13, $2..5)
-  361. load_src                       src.rgba = $2..5
+    1. store_src                      $28..31 = src.rgba
+    2. copy_constant                  $30 = 0x3F000000 (0.5)
+    3. copy_constant                  $31 = 0x3F000000 (0.5)
+    4. cmpeq_2_floats                 $28..29 = equal($28..29, $30..31)
+    5. bitwise_and_int                $28 &= $29
+    6. store_src_rg                   pos = src.rg
+    7. init_lane_masks                CondMask = LoopMask = RetMask = true
+    8. zero_slot_unmasked             kZero = 0
+    9. copy_constant                  kTen = 0x41200000 (10.0)
+   10. copy_slot_unmasked             $6 = pos(0)
+   11. copy_constant                  $7 = colorGreen(1)
+   12. max_float                      $6 = max($6, $7)
+   13. copy_constant                  $7 = colorGreen(3)
+   14. min_float                      $6 = min($6, $7)
+   15. copy_constant                  $7 = 0x40A00000 (5.0)
+   16. mul_float                      $6 *= $7
+   17. copy_slot_unmasked             five = $6
+   18. store_condition_mask           $42 = CondMask
+   19. store_condition_mask           $52 = CondMask
+   20. store_condition_mask           $32 = CondMask
+   21. store_condition_mask           $78 = CondMask
+   22. store_condition_mask           $62 = CondMask
+   23. store_condition_mask           $24 = CondMask
+   24. store_condition_mask           $72 = CondMask
+   25. store_condition_mask           $18 = CondMask
+   26. store_condition_mask           $0 = CondMask
+   27. branch_if_no_lanes_active      branch_if_no_lanes_active +33 (label 10 at #60)
+   28. store_return_mask              $1 = RetMask
+   29. copy_slot_unmasked             five₁ = five
+   30. copy_slot_unmasked             i = kZero
+   31. store_loop_mask                $2 = LoopMask
+   32. jump                           jump +15 (label 12 at #47)
+   33. label                          label 0x0000000D
+   34. store_condition_mask           $3 = CondMask
+   35. copy_slot_unmasked             $4 = i
+   36. copy_slot_unmasked             $5 = five₁
+   37. cmpeq_float                    $4 = equal($4, $5)
+   38. merge_condition_mask           CondMask = $3 & $4
+   39. copy_slot_unmasked             $5 = i
+   40. copy_slot_masked               [return_loop].result = Mask($5)
+   41. mask_off_return_mask           RetMask &= ~(CondMask & LoopMask & RetMask)
+   42. load_condition_mask            CondMask = $3
+   43. copy_slot_unmasked             $3 = i
+   44. copy_constant                  $4 = 0x3F800000 (1.0)
+   45. add_float                      $3 += $4
+   46. copy_slot_masked               i = Mask($3)
+   47. label                          label 0x0000000C
+   48. copy_slot_unmasked             $3 = i
+   49. copy_constant                  $4 = 0x41200000 (10.0)
+   50. cmplt_float                    $3 = lessThan($3, $4)
+   51. merge_loop_mask                LoopMask &= $3
+   52. stack_rewind
+   53. branch_if_any_lanes_active     branch_if_any_lanes_active -20 (label 13 at #33)
+   54. label                          label 0x0000000B
+   55. load_loop_mask                 LoopMask = $2
+   56. zero_slot_unmasked             $2 = 0
+   57. copy_slot_masked               [return_loop].result = Mask($2)
+   58. load_return_mask               RetMask = $1
+   59. copy_slot_unmasked             $1 = [return_loop].result
+   60. label                          label 0x0000000A
+   61. copy_constant                  $2 = 0x40A00000 (5.0)
+   62. cmpeq_float                    $1 = equal($1, $2)
+   63. zero_slot_unmasked             $19 = 0
+   64. merge_condition_mask           CondMask = $0 & $1
+   65. branch_if_no_lanes_active      branch_if_no_lanes_active +38 (label 9 at #103)
+   66. copy_slot_unmasked             five₂ = five
+   67. zero_2_slots_unmasked          sum, i₁ = 0
+   68. store_loop_mask                $20 = LoopMask
+   69. jump                           jump +20 (label 16 at #89)
+   70. label                          label 0x00000011
+   71. zero_slot_unmasked             $88 = 0
+   72. store_condition_mask           $21 = CondMask
+   73. copy_slot_unmasked             $22 = i₁
+   74. copy_slot_unmasked             $23 = five₂
+   75. cmplt_float                    $22 = lessThan($22, $23)
+   76. merge_condition_mask           CondMask = $21 & $22
+   77. copy_constant                  $89 = 0xFFFFFFFF
+   78. copy_slot_masked               $88 = Mask($89)
+   79. mask_off_loop_mask             LoopMask &= ~(CondMask & LoopMask & RetMask)
+   80. load_condition_mask            CondMask = $21
+   81. copy_2_slots_unmasked          $21..22 = sum, i₁
+   82. add_float                      $21 += $22
+   83. copy_slot_masked               sum = Mask($21)
+   84. reenable_loop_mask             LoopMask |= $88
+   85. copy_slot_unmasked             $21 = i₁
+   86. copy_constant                  $22 = 0x3F800000 (1.0)
+   87. add_float                      $21 += $22
+   88. copy_slot_masked               i₁ = Mask($21)
+   89. label                          label 0x00000010
+   90. copy_slot_unmasked             $21 = i₁
+   91. copy_slot_unmasked             $22 = kTen
+   92. cmplt_float                    $21 = lessThan($21, $22)
+   93. merge_loop_mask                LoopMask &= $21
+   94. stack_rewind
+   95. branch_if_any_lanes_active     branch_if_any_lanes_active -25 (label 17 at #70)
+   96. label                          label 0x0000000F
+   97. load_loop_mask                 LoopMask = $20
+   98. copy_slot_unmasked             $20 = sum
+   99. label                          label 0x0000000E
+  100. copy_constant                  $21 = 0x420C0000 (35.0)
+  101. cmpeq_float                    $20 = equal($20, $21)
+  102. copy_slot_masked               $19 = Mask($20)
+  103. label                          label 0x00000009
+  104. load_condition_mask            CondMask = $0
+  105. zero_slot_unmasked             $73 = 0
+  106. merge_condition_mask           CondMask = $18 & $19
+  107. branch_if_no_lanes_active      branch_if_no_lanes_active +38 (label 8 at #145)
+  108. copy_slot_unmasked             five₃ = five
+  109. zero_slot_unmasked             sum₁ = 0
+  110. copy_constant                  kOne = 0x3F800000 (1.0)
+  111. zero_slot_unmasked             i₂ = 0
+  112. store_loop_mask                $74 = LoopMask
+  113. jump                           jump +18 (label 20 at #131)
+  114. label                          label 0x00000015
+  115. store_condition_mask           $75 = CondMask
+  116. copy_slot_unmasked             $76 = five₃
+  117. copy_slot_unmasked             $77 = i₂
+  118. cmplt_float                    $76 = lessThan($76, $77)
+  119. merge_condition_mask           CondMask = $75 & $76
+  120. branch_if_all_lanes_active     branch_if_all_lanes_active +18 (label 19 at #138)
+  121. mask_off_loop_mask             LoopMask &= ~(CondMask & LoopMask & RetMask)
+  122. load_condition_mask            CondMask = $75
+  123. copy_slot_unmasked             $75 = sum₁
+  124. copy_slot_unmasked             $76 = i₂
+  125. add_float                      $75 += $76
+  126. copy_slot_masked               sum₁ = Mask($75)
+  127. copy_slot_unmasked             $75 = i₂
+  128. copy_slot_unmasked             $76 = kOne
+  129. add_float                      $75 += $76
+  130. copy_slot_masked               i₂ = Mask($75)
+  131. label                          label 0x00000014
+  132. copy_slot_unmasked             $75 = i₂
+  133. copy_constant                  $76 = 0x41200000 (10.0)
+  134. cmplt_float                    $75 = lessThan($75, $76)
+  135. merge_loop_mask                LoopMask &= $75
+  136. stack_rewind
+  137. branch_if_any_lanes_active     branch_if_any_lanes_active -23 (label 21 at #114)
+  138. label                          label 0x00000013
+  139. load_loop_mask                 LoopMask = $74
+  140. copy_slot_unmasked             $74 = sum₁
+  141. label                          label 0x00000012
+  142. copy_constant                  $75 = 0x41700000 (15.0)
+  143. cmpeq_float                    $74 = equal($74, $75)
+  144. copy_slot_masked               $73 = Mask($74)
+  145. label                          label 0x00000008
+  146. load_condition_mask            CondMask = $18
+  147. zero_slot_unmasked             $25 = 0
+  148. merge_condition_mask           CondMask = $72 & $73
+  149. branch_if_no_lanes_active      branch_if_no_lanes_active +25 (label 7 at #174)
+  150. zero_slot_unmasked             sum₂ = 0
+  151. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 23 at #165)
+  152. copy_constant                  i₃ = 0x3DFBE76D (0.123)
+  153. label                          label 0x00000018
+  154. copy_2_slots_unmasked          $26..27 = sum₂, i₃
+  155. add_float                      $26 += $27
+  156. copy_slot_masked               sum₂ = Mask($26)
+  157. copy_slot_unmasked             $26 = i₃
+  158. copy_constant                  $27 = 0x3DE353F8 (0.111)
+  159. add_float                      $26 += $27
+  160. copy_slot_masked               i₃ = Mask($26)
+  161. copy_constant                  $27 = 0x3F19999A (0.6)
+  162. cmplt_float                    $26 = lessThan($26, $27)
+  163. stack_rewind
+  164. branch_if_no_active_lanes_eq   branch -11 (label 24 at #153) if no lanes of $26 == 0x00000000 (0.0)
+  165. label                          label 0x00000017
+  166. copy_slot_unmasked             $26 = sum₂
+  167. copy_constant                  $27 = 0x3FDCCCCD (1.725)
+  168. sub_float                      $26 -= $27
+  169. label                          label 0x00000016
+  170. abs_float                      $26 = abs($26)
+  171. copy_constant                  $27 = 0x3CCCCCCD (0.025)
+  172. cmplt_float                    $26 = lessThan($26, $27)
+  173. copy_slot_masked               $25 = Mask($26)
+  174. label                          label 0x00000007
+  175. load_condition_mask            CondMask = $72
+  176. zero_slot_unmasked             $63 = 0
+  177. merge_condition_mask           CondMask = $24 & $25
+  178. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 6 at #206)
+  179. copy_constant                  $64 = 0x41100000 (9.0)
+  180. swizzle_4                      $64..67 = ($64..67).xxxx
+  181. copy_4_slots_unmasked          result = $64..67
+  182. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 26 at #195)
+  183. copy_constant                  i₄ = 0x3F800000 (1.0)
+  184. label                          label 0x0000001B
+  185. copy_4_slots_unmasked          $64..67 = result(1..3), i₄
+  186. copy_4_slots_masked            result = Mask($64..67)
+  187. copy_slot_unmasked             $64 = i₄
+  188. copy_constant                  $65 = 0x3F800000 (1.0)
+  189. add_float                      $64 += $65
+  190. copy_slot_masked               i₄ = Mask($64)
+  191. copy_constant                  $65 = 0x40400000 (3.0)
+  192. cmple_float                    $64 = lessThanEqual($64, $65)
+  193. stack_rewind
+  194. branch_if_no_active_lanes_eq   branch -10 (label 27 at #184) if no lanes of $64 == 0x00000000 (0.0)
+  195. label                          label 0x0000001A
+  196. copy_4_slots_unmasked          $64..67 = result
+  197. copy_constant                  $68 = 0x41100000 (9.0)
+  198. copy_constant                  $69 = 0x3F800000 (1.0)
+  199. copy_constant                  $70 = 0x40000000 (2.0)
+  200. copy_constant                  $71 = 0x40400000 (3.0)
+  201. cmpeq_4_floats                 $64..67 = equal($64..67, $68..71)
+  202. bitwise_and_2_ints             $64..65 &= $66..67
+  203. bitwise_and_int                $64 &= $65
+  204. label                          label 0x00000019
+  205. copy_slot_masked               $63 = Mask($64)
+  206. label                          label 0x00000006
+  207. load_condition_mask            CondMask = $24
+  208. zero_slot_unmasked             $79 = 0
+  209. merge_condition_mask           CondMask = $62 & $63
+  210. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 5 at #238)
+  211. copy_constant                  $80 = 0x41100000 (9.0)
+  212. swizzle_4                      $80..83 = ($80..83).xxxx
+  213. copy_4_slots_unmasked          result₁ = $80..83
+  214. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 29 at #227)
+  215. copy_constant                  i₅ = 0x3F800000 (1.0)
+  216. label                          label 0x0000001E
+  217. copy_4_slots_unmasked          $80..83 = result₁(1..3), i₅
+  218. copy_4_slots_masked            result₁ = Mask($80..83)
+  219. copy_slot_unmasked             $80 = i₅
+  220. copy_constant                  $81 = 0x3F800000 (1.0)
+  221. add_float                      $80 += $81
+  222. copy_slot_masked               i₅ = Mask($80)
+  223. copy_constant                  $81 = 0x40800000 (4.0)
+  224. cmplt_float                    $80 = lessThan($80, $81)
+  225. stack_rewind
+  226. branch_if_no_active_lanes_eq   branch -10 (label 30 at #216) if no lanes of $80 == 0x00000000 (0.0)
+  227. label                          label 0x0000001D
+  228. copy_4_slots_unmasked          $80..83 = result₁
+  229. copy_constant                  $84 = 0x41100000 (9.0)
+  230. copy_constant                  $85 = 0x3F800000 (1.0)
+  231. copy_constant                  $86 = 0x40000000 (2.0)
+  232. copy_constant                  $87 = 0x40400000 (3.0)
+  233. cmpeq_4_floats                 $80..83 = equal($80..83, $84..87)
+  234. bitwise_and_2_ints             $80..81 &= $82..83
+  235. bitwise_and_int                $80 &= $81
+  236. label                          label 0x0000001C
+  237. copy_slot_masked               $79 = Mask($80)
+  238. label                          label 0x00000005
+  239. load_condition_mask            CondMask = $62
+  240. zero_slot_unmasked             $33 = 0
+  241. merge_condition_mask           CondMask = $78 & $79
+  242. branch_if_no_lanes_active      branch_if_no_lanes_active +29 (label 4 at #271)
+  243. copy_constant                  $34 = 0x41100000 (9.0)
+  244. swizzle_4                      $34..37 = ($34..37).xxxx
+  245. copy_4_slots_unmasked          result₂ = $34..37
+  246. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 32 at #260)
+  247. copy_constant                  i₆ = 0x40400000 (3.0)
+  248. label                          label 0x00000021
+  249. copy_4_slots_unmasked          $34..37 = result₂(1..3), i₆
+  250. copy_4_slots_masked            result₂ = Mask($34..37)
+  251. copy_slot_unmasked             $34 = i₆
+  252. copy_constant                  $35 = 0x3F800000 (1.0)
+  253. sub_float                      $34 -= $35
+  254. copy_slot_masked               i₆ = Mask($34)
+  255. copy_constant                  $34 = 0x3F800000 (1.0)
+  256. copy_slot_unmasked             $35 = i₆
+  257. cmple_float                    $34 = lessThanEqual($34, $35)
+  258. stack_rewind
+  259. branch_if_no_active_lanes_eq   branch -11 (label 33 at #248) if no lanes of $34 == 0x00000000 (0.0)
+  260. label                          label 0x00000020
+  261. copy_4_slots_unmasked          $34..37 = result₂
+  262. copy_constant                  $38 = 0x41100000 (9.0)
+  263. copy_constant                  $39 = 0x40400000 (3.0)
+  264. copy_constant                  $40 = 0x40000000 (2.0)
+  265. copy_constant                  $41 = 0x3F800000 (1.0)
+  266. cmpeq_4_floats                 $34..37 = equal($34..37, $38..41)
+  267. bitwise_and_2_ints             $34..35 &= $36..37
+  268. bitwise_and_int                $34 &= $35
+  269. label                          label 0x0000001F
+  270. copy_slot_masked               $33 = Mask($34)
+  271. label                          label 0x00000004
+  272. load_condition_mask            CondMask = $78
+  273. zero_slot_unmasked             $53 = 0
+  274. merge_condition_mask           CondMask = $32 & $33
+  275. branch_if_no_lanes_active      branch_if_no_lanes_active +29 (label 3 at #304)
+  276. copy_constant                  $54 = 0x41100000 (9.0)
+  277. swizzle_4                      $54..57 = ($54..57).xxxx
+  278. copy_4_slots_unmasked          result₃ = $54..57
+  279. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 35 at #293)
+  280. copy_constant                  i₇ = 0x40400000 (3.0)
+  281. label                          label 0x00000024
+  282. copy_4_slots_unmasked          $54..57 = result₃(1..3), i₇
+  283. copy_4_slots_masked            result₃ = Mask($54..57)
+  284. copy_slot_unmasked             $54 = i₇
+  285. copy_constant                  $55 = 0x3F800000 (1.0)
+  286. sub_float                      $54 -= $55
+  287. copy_slot_masked               i₇ = Mask($54)
+  288. zero_slot_unmasked             $54 = 0
+  289. copy_slot_unmasked             $55 = i₇
+  290. cmplt_float                    $54 = lessThan($54, $55)
+  291. stack_rewind
+  292. branch_if_no_active_lanes_eq   branch -11 (label 36 at #281) if no lanes of $54 == 0x00000000 (0.0)
+  293. label                          label 0x00000023
+  294. copy_4_slots_unmasked          $54..57 = result₃
+  295. copy_constant                  $58 = 0x41100000 (9.0)
+  296. copy_constant                  $59 = 0x40400000 (3.0)
+  297. copy_constant                  $60 = 0x40000000 (2.0)
+  298. copy_constant                  $61 = 0x3F800000 (1.0)
+  299. cmpeq_4_floats                 $54..57 = equal($54..57, $58..61)
+  300. bitwise_and_2_ints             $54..55 &= $56..57
+  301. bitwise_and_int                $54 &= $55
+  302. label                          label 0x00000022
+  303. copy_slot_masked               $53 = Mask($54)
+  304. label                          label 0x00000003
+  305. load_condition_mask            CondMask = $32
+  306. zero_slot_unmasked             $43 = 0
+  307. merge_condition_mask           CondMask = $52 & $53
+  308. branch_if_no_lanes_active      branch_if_no_lanes_active +20 (label 2 at #328)
+  309. copy_constant                  $44 = 0x41100000 (9.0)
+  310. swizzle_4                      $44..47 = ($44..47).xxxx
+  311. copy_4_slots_unmasked          result₄ = $44..47
+  312. branch_if_no_lanes_active      branch_if_no_lanes_active +5 (label 38 at #317)
+  313. copy_constant                  i₈ = 0x3F800000 (1.0)
+  314. label                          label 0x00000027
+  315. copy_4_slots_unmasked          $44..47 = result₄(1..3), i₈
+  316. copy_4_slots_masked            result₄ = Mask($44..47)
+  317. label                          label 0x00000026
+  318. copy_4_slots_unmasked          $44..47 = result₄
+  319. copy_constant                  $48 = 0x41100000 (9.0)
+  320. copy_constant                  $49 = 0x41100000 (9.0)
+  321. copy_constant                  $50 = 0x41100000 (9.0)
+  322. copy_constant                  $51 = 0x3F800000 (1.0)
+  323. cmpeq_4_floats                 $44..47 = equal($44..47, $48..51)
+  324. bitwise_and_2_ints             $44..45 &= $46..47
+  325. bitwise_and_int                $44 &= $45
+  326. label                          label 0x00000025
+  327. copy_slot_masked               $43 = Mask($44)
+  328. label                          label 0x00000002
+  329. load_condition_mask            CondMask = $52
+  330. zero_slot_unmasked             $6 = 0
+  331. merge_condition_mask           CondMask = $42 & $43
+  332. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 1 at #360)
+  333. copy_constant                  $7 = 0x41100000 (9.0)
+  334. swizzle_4                      $7..10 = ($7..10).xxxx
+  335. copy_4_slots_unmasked          result₅ = $7..10
+  336. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 41 at #349)
+  337. copy_constant                  i₉ = 0x3F800000 (1.0)
+  338. label                          label 0x0000002A
+  339. copy_4_slots_unmasked          $7..10 = result₅(1..3), i₉
+  340. copy_4_slots_masked            result₅ = Mask($7..10)
+  341. copy_slot_unmasked             $7 = i₉
+  342. copy_constant                  $8 = 0x3F800000 (1.0)
+  343. add_float                      $7 += $8
+  344. copy_slot_masked               i₉ = Mask($7)
+  345. copy_constant                  $8 = 0x40800000 (4.0)
+  346. cmpne_float                    $7 = notEqual($7, $8)
+  347. stack_rewind
+  348. branch_if_no_active_lanes_eq   branch -10 (label 42 at #338) if no lanes of $7 == 0x00000000 (0.0)
+  349. label                          label 0x00000029
+  350. copy_4_slots_unmasked          $7..10 = result₅
+  351. copy_constant                  $11 = 0x41100000 (9.0)
+  352. copy_constant                  $12 = 0x3F800000 (1.0)
+  353. copy_constant                  $13 = 0x40000000 (2.0)
+  354. copy_constant                  $14 = 0x40400000 (3.0)
+  355. cmpeq_4_floats                 $7..10 = equal($7..10, $11..14)
+  356. bitwise_and_2_ints             $7..8 &= $9..10
+  357. bitwise_and_int                $7 &= $8
+  358. label                          label 0x00000028
+  359. copy_slot_masked               $6 = Mask($7)
+  360. label                          label 0x00000001
+  361. load_condition_mask            CondMask = $42
+  362. swizzle_4                      $6..9 = ($6..9).xxxx
+  363. copy_4_constants               $10..13 = colorRed
+  364. copy_4_constants               $14..17 = colorGreen
+  365. mix_4_ints                     $6..9 = mix($10..13, $14..17, $6..9)
+  366. load_src                       src.rgba = $6..9
diff --git a/tests/sksl/runtime/LoopInt.skrp b/tests/sksl/runtime/LoopInt.skrp
index 4f01ceb..a1236f7 100644
--- a/tests/sksl/runtime/LoopInt.skrp
+++ b/tests/sksl/runtime/LoopInt.skrp
@@ -1,332 +1,337 @@
-    1. store_src_rg                   pos = src.rg
-    2. init_lane_masks                CondMask = LoopMask = RetMask = true
-    3. zero_slot_unmasked             kZero = 0
-    4. copy_constant                  kTen = 0x0000000A (1.401298e-44)
-    5. copy_slot_unmasked             $0 = pos(0)
-    6. copy_constant                  $1 = colorGreen(1)
-    7. max_float                      $0 = max($0, $1)
-    8. copy_constant                  $1 = colorGreen(3)
-    9. min_float                      $0 = min($0, $1)
-   10. cast_to_int_from_float         $0 = FloatToInt($0)
-   11. copy_constant                  $1 = 0x00000005 (7.006492e-45)
-   12. mul_int                        $0 *= $1
-   13. copy_slot_unmasked             five = $0
-   14. store_condition_mask           $20 = CondMask
-   15. store_condition_mask           $40 = CondMask
-   16. store_condition_mask           $50 = CondMask
-   17. store_condition_mask           $30 = CondMask
-   18. store_condition_mask           $72 = CondMask
-   19. store_condition_mask           $60 = CondMask
-   20. store_condition_mask           $14 = CondMask
-   21. store_condition_mask           $66 = CondMask
-   22. branch_if_no_lanes_active      branch_if_no_lanes_active +33 (label 9 at #55)
-   23. store_return_mask              $67 = RetMask
-   24. copy_slot_unmasked             five₁ = five
-   25. copy_slot_unmasked             i = kZero
-   26. store_loop_mask                $68 = LoopMask
-   27. jump                           jump +15 (label 11 at #42)
-   28. label                          label 0x0000000C
-   29. store_condition_mask           $69 = CondMask
-   30. copy_slot_unmasked             $70 = i
-   31. copy_slot_unmasked             $71 = five₁
-   32. cmpeq_int                      $70 = equal($70, $71)
-   33. merge_condition_mask           CondMask = $69 & $70
-   34. copy_slot_unmasked             $71 = i
-   35. copy_slot_masked               [return_loop].result = Mask($71)
-   36. mask_off_return_mask           RetMask &= ~(CondMask & LoopMask & RetMask)
-   37. load_condition_mask            CondMask = $69
-   38. copy_slot_unmasked             $69 = i
-   39. copy_constant                  $70 = 0x00000001 (1.401298e-45)
-   40. add_int                        $69 += $70
-   41. copy_slot_masked               i = Mask($69)
-   42. label                          label 0x0000000B
-   43. copy_slot_unmasked             $69 = i
-   44. copy_constant                  $70 = 0x0000000A (1.401298e-44)
-   45. cmplt_int                      $69 = lessThan($69, $70)
-   46. merge_loop_mask                LoopMask &= $69
-   47. stack_rewind
-   48. branch_if_any_lanes_active     branch_if_any_lanes_active -20 (label 12 at #28)
-   49. label                          label 0x0000000A
-   50. load_loop_mask                 LoopMask = $68
-   51. zero_slot_unmasked             $68 = 0
-   52. copy_slot_masked               [return_loop].result = Mask($68)
-   53. load_return_mask               RetMask = $67
-   54. copy_slot_unmasked             $67 = [return_loop].result
-   55. label                          label 0x00000009
-   56. copy_constant                  $68 = 0x00000005 (7.006492e-45)
-   57. cmpeq_int                      $67 = equal($67, $68)
-   58. zero_slot_unmasked             $15 = 0
-   59. merge_condition_mask           CondMask = $66 & $67
-   60. branch_if_no_lanes_active      branch_if_no_lanes_active +38 (label 8 at #98)
-   61. copy_slot_unmasked             five₂ = five
-   62. zero_2_slots_unmasked          sum, i₁ = 0
-   63. store_loop_mask                $16 = LoopMask
-   64. jump                           jump +20 (label 15 at #84)
-   65. label                          label 0x00000010
-   66. zero_slot_unmasked             $12 = 0
-   67. store_condition_mask           $17 = CondMask
-   68. copy_slot_unmasked             $18 = i₁
-   69. copy_slot_unmasked             $19 = five₂
-   70. cmplt_int                      $18 = lessThan($18, $19)
-   71. merge_condition_mask           CondMask = $17 & $18
-   72. copy_constant                  $13 = 0xFFFFFFFF
-   73. copy_slot_masked               $12 = Mask($13)
-   74. mask_off_loop_mask             LoopMask &= ~(CondMask & LoopMask & RetMask)
-   75. load_condition_mask            CondMask = $17
-   76. copy_2_slots_unmasked          $17..18 = sum, i₁
-   77. add_int                        $17 += $18
-   78. copy_slot_masked               sum = Mask($17)
-   79. reenable_loop_mask             LoopMask |= $12
-   80. copy_slot_unmasked             $17 = i₁
-   81. copy_constant                  $18 = 0x00000001 (1.401298e-45)
-   82. add_int                        $17 += $18
-   83. copy_slot_masked               i₁ = Mask($17)
-   84. label                          label 0x0000000F
-   85. copy_slot_unmasked             $17 = i₁
-   86. copy_slot_unmasked             $18 = kTen
-   87. cmplt_int                      $17 = lessThan($17, $18)
-   88. merge_loop_mask                LoopMask &= $17
-   89. stack_rewind
-   90. branch_if_any_lanes_active     branch_if_any_lanes_active -25 (label 16 at #65)
-   91. label                          label 0x0000000E
-   92. load_loop_mask                 LoopMask = $16
-   93. copy_slot_unmasked             $16 = sum
-   94. label                          label 0x0000000D
-   95. copy_constant                  $17 = 0x00000023 (4.904545e-44)
-   96. cmpeq_int                      $16 = equal($16, $17)
-   97. copy_slot_masked               $15 = Mask($16)
-   98. label                          label 0x00000008
-   99. load_condition_mask            CondMask = $66
-  100. zero_slot_unmasked             $61 = 0
-  101. merge_condition_mask           CondMask = $14 & $15
-  102. branch_if_no_lanes_active      branch_if_no_lanes_active +38 (label 7 at #140)
-  103. copy_constant                  five₃ = 0x00000005 (7.006492e-45)
-  104. zero_slot_unmasked             sum₁ = 0
-  105. copy_constant                  kOne = 0x00000001 (1.401298e-45)
-  106. zero_slot_unmasked             i₂ = 0
-  107. store_loop_mask                $62 = LoopMask
-  108. jump                           jump +18 (label 19 at #126)
-  109. label                          label 0x00000014
-  110. store_condition_mask           $63 = CondMask
-  111. copy_slot_unmasked             $64 = five₃
-  112. copy_slot_unmasked             $65 = i₂
-  113. cmplt_int                      $64 = lessThan($64, $65)
-  114. merge_condition_mask           CondMask = $63 & $64
-  115. branch_if_all_lanes_active     branch_if_all_lanes_active +18 (label 18 at #133)
-  116. mask_off_loop_mask             LoopMask &= ~(CondMask & LoopMask & RetMask)
-  117. load_condition_mask            CondMask = $63
-  118. copy_slot_unmasked             $63 = sum₁
-  119. copy_slot_unmasked             $64 = i₂
-  120. add_int                        $63 += $64
-  121. copy_slot_masked               sum₁ = Mask($63)
-  122. copy_slot_unmasked             $63 = i₂
-  123. copy_slot_unmasked             $64 = kOne
-  124. add_int                        $63 += $64
-  125. copy_slot_masked               i₂ = Mask($63)
-  126. label                          label 0x00000013
-  127. copy_slot_unmasked             $63 = i₂
-  128. copy_constant                  $64 = 0x0000000A (1.401298e-44)
-  129. cmplt_int                      $63 = lessThan($63, $64)
-  130. merge_loop_mask                LoopMask &= $63
-  131. stack_rewind
-  132. branch_if_any_lanes_active     branch_if_any_lanes_active -23 (label 20 at #109)
-  133. label                          label 0x00000012
-  134. load_loop_mask                 LoopMask = $62
-  135. copy_slot_unmasked             $62 = sum₁
-  136. label                          label 0x00000011
-  137. copy_constant                  $63 = 0x0000000F (2.101948e-44)
-  138. cmpeq_int                      $62 = equal($62, $63)
-  139. copy_slot_masked               $61 = Mask($62)
-  140. label                          label 0x00000007
-  141. load_condition_mask            CondMask = $14
-  142. zero_slot_unmasked             $73 = 0
-  143. merge_condition_mask           CondMask = $60 & $61
-  144. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 6 at #172)
-  145. copy_constant                  $74 = 0x00000009 (1.261169e-44)
-  146. swizzle_4                      $74..77 = ($74..77).xxxx
-  147. copy_4_slots_unmasked          result = $74..77
-  148. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 22 at #161)
-  149. copy_constant                  i₃ = 0x00000001 (1.401298e-45)
-  150. label                          label 0x00000017
-  151. copy_4_slots_unmasked          $74..77 = result(1..3), i₃
-  152. copy_4_slots_masked            result = Mask($74..77)
-  153. copy_slot_unmasked             $74 = i₃
-  154. copy_constant                  $75 = 0x00000001 (1.401298e-45)
-  155. add_int                        $74 += $75
-  156. copy_slot_masked               i₃ = Mask($74)
-  157. copy_constant                  $75 = 0x00000003 (4.203895e-45)
-  158. cmple_int                      $74 = lessThanEqual($74, $75)
-  159. stack_rewind
-  160. branch_if_no_active_lanes_eq   branch -10 (label 23 at #150) if no lanes of $74 == 0x00000000 (0.0)
-  161. label                          label 0x00000016
-  162. copy_4_slots_unmasked          $74..77 = result
-  163. copy_constant                  $78 = 0x00000009 (1.261169e-44)
-  164. copy_constant                  $79 = 0x00000001 (1.401298e-45)
-  165. copy_constant                  $80 = 0x00000002 (2.802597e-45)
-  166. copy_constant                  $81 = 0x00000003 (4.203895e-45)
-  167. cmpeq_4_ints                   $74..77 = equal($74..77, $78..81)
-  168. bitwise_and_2_ints             $74..75 &= $76..77
-  169. bitwise_and_int                $74 &= $75
-  170. label                          label 0x00000015
-  171. copy_slot_masked               $73 = Mask($74)
-  172. label                          label 0x00000006
-  173. load_condition_mask            CondMask = $60
-  174. zero_slot_unmasked             $31 = 0
-  175. merge_condition_mask           CondMask = $72 & $73
-  176. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 5 at #204)
-  177. copy_constant                  $32 = 0x00000009 (1.261169e-44)
-  178. swizzle_4                      $32..35 = ($32..35).xxxx
-  179. copy_4_slots_unmasked          result₁ = $32..35
-  180. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 25 at #193)
-  181. copy_constant                  i₄ = 0x00000001 (1.401298e-45)
-  182. label                          label 0x0000001A
-  183. copy_4_slots_unmasked          $32..35 = result₁(1..3), i₄
-  184. copy_4_slots_masked            result₁ = Mask($32..35)
-  185. copy_slot_unmasked             $32 = i₄
-  186. copy_constant                  $33 = 0x00000001 (1.401298e-45)
-  187. add_int                        $32 += $33
-  188. copy_slot_masked               i₄ = Mask($32)
-  189. copy_constant                  $33 = 0x00000004 (5.605194e-45)
-  190. cmplt_int                      $32 = lessThan($32, $33)
-  191. stack_rewind
-  192. branch_if_no_active_lanes_eq   branch -10 (label 26 at #182) if no lanes of $32 == 0x00000000 (0.0)
-  193. label                          label 0x00000019
-  194. copy_4_slots_unmasked          $32..35 = result₁
-  195. copy_constant                  $36 = 0x00000009 (1.261169e-44)
-  196. copy_constant                  $37 = 0x00000001 (1.401298e-45)
-  197. copy_constant                  $38 = 0x00000002 (2.802597e-45)
-  198. copy_constant                  $39 = 0x00000003 (4.203895e-45)
-  199. cmpeq_4_ints                   $32..35 = equal($32..35, $36..39)
-  200. bitwise_and_2_ints             $32..33 &= $34..35
-  201. bitwise_and_int                $32 &= $33
-  202. label                          label 0x00000018
-  203. copy_slot_masked               $31 = Mask($32)
-  204. label                          label 0x00000005
-  205. load_condition_mask            CondMask = $72
-  206. zero_slot_unmasked             $51 = 0
-  207. merge_condition_mask           CondMask = $30 & $31
-  208. branch_if_no_lanes_active      branch_if_no_lanes_active +29 (label 4 at #237)
-  209. copy_constant                  $52 = 0x00000009 (1.261169e-44)
-  210. swizzle_4                      $52..55 = ($52..55).xxxx
-  211. copy_4_slots_unmasked          result₂ = $52..55
-  212. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 28 at #226)
-  213. copy_constant                  i₅ = 0x00000003 (4.203895e-45)
-  214. label                          label 0x0000001D
-  215. copy_4_slots_unmasked          $52..55 = result₂(1..3), i₅
-  216. copy_4_slots_masked            result₂ = Mask($52..55)
-  217. copy_slot_unmasked             $52 = i₅
-  218. copy_constant                  $53 = 0x00000001 (1.401298e-45)
-  219. sub_int                        $52 -= $53
-  220. copy_slot_masked               i₅ = Mask($52)
-  221. copy_constant                  $52 = 0x00000001 (1.401298e-45)
-  222. copy_slot_unmasked             $53 = i₅
-  223. cmple_int                      $52 = lessThanEqual($52, $53)
-  224. stack_rewind
-  225. branch_if_no_active_lanes_eq   branch -11 (label 29 at #214) if no lanes of $52 == 0x00000000 (0.0)
-  226. label                          label 0x0000001C
-  227. copy_4_slots_unmasked          $52..55 = result₂
-  228. copy_constant                  $56 = 0x00000009 (1.261169e-44)
-  229. copy_constant                  $57 = 0x00000003 (4.203895e-45)
-  230. copy_constant                  $58 = 0x00000002 (2.802597e-45)
-  231. copy_constant                  $59 = 0x00000001 (1.401298e-45)
-  232. cmpeq_4_ints                   $52..55 = equal($52..55, $56..59)
-  233. bitwise_and_2_ints             $52..53 &= $54..55
-  234. bitwise_and_int                $52 &= $53
-  235. label                          label 0x0000001B
-  236. copy_slot_masked               $51 = Mask($52)
-  237. label                          label 0x00000004
-  238. load_condition_mask            CondMask = $30
-  239. zero_slot_unmasked             $41 = 0
-  240. merge_condition_mask           CondMask = $50 & $51
-  241. branch_if_no_lanes_active      branch_if_no_lanes_active +29 (label 3 at #270)
-  242. copy_constant                  $42 = 0x00000009 (1.261169e-44)
-  243. swizzle_4                      $42..45 = ($42..45).xxxx
-  244. copy_4_slots_unmasked          result₃ = $42..45
-  245. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 31 at #259)
-  246. copy_constant                  i₆ = 0x00000003 (4.203895e-45)
-  247. label                          label 0x00000020
-  248. copy_4_slots_unmasked          $42..45 = result₃(1..3), i₆
-  249. copy_4_slots_masked            result₃ = Mask($42..45)
-  250. copy_slot_unmasked             $42 = i₆
-  251. copy_constant                  $43 = 0x00000001 (1.401298e-45)
-  252. sub_int                        $42 -= $43
-  253. copy_slot_masked               i₆ = Mask($42)
-  254. zero_slot_unmasked             $42 = 0
-  255. copy_slot_unmasked             $43 = i₆
-  256. cmplt_int                      $42 = lessThan($42, $43)
-  257. stack_rewind
-  258. branch_if_no_active_lanes_eq   branch -11 (label 32 at #247) if no lanes of $42 == 0x00000000 (0.0)
-  259. label                          label 0x0000001F
-  260. copy_4_slots_unmasked          $42..45 = result₃
-  261. copy_constant                  $46 = 0x00000009 (1.261169e-44)
-  262. copy_constant                  $47 = 0x00000003 (4.203895e-45)
-  263. copy_constant                  $48 = 0x00000002 (2.802597e-45)
-  264. copy_constant                  $49 = 0x00000001 (1.401298e-45)
-  265. cmpeq_4_ints                   $42..45 = equal($42..45, $46..49)
-  266. bitwise_and_2_ints             $42..43 &= $44..45
-  267. bitwise_and_int                $42 &= $43
-  268. label                          label 0x0000001E
-  269. copy_slot_masked               $41 = Mask($42)
-  270. label                          label 0x00000003
-  271. load_condition_mask            CondMask = $50
-  272. zero_slot_unmasked             $21 = 0
-  273. merge_condition_mask           CondMask = $40 & $41
-  274. branch_if_no_lanes_active      branch_if_no_lanes_active +20 (label 2 at #294)
-  275. copy_constant                  $22 = 0x00000009 (1.261169e-44)
-  276. swizzle_4                      $22..25 = ($22..25).xxxx
-  277. copy_4_slots_unmasked          result₄ = $22..25
-  278. branch_if_no_lanes_active      branch_if_no_lanes_active +5 (label 34 at #283)
-  279. copy_constant                  i₇ = 0x00000001 (1.401298e-45)
-  280. label                          label 0x00000023
-  281. copy_4_slots_unmasked          $22..25 = result₄(1..3), i₇
-  282. copy_4_slots_masked            result₄ = Mask($22..25)
-  283. label                          label 0x00000022
-  284. copy_4_slots_unmasked          $22..25 = result₄
-  285. copy_constant                  $26 = 0x00000009 (1.261169e-44)
-  286. copy_constant                  $27 = 0x00000009 (1.261169e-44)
-  287. copy_constant                  $28 = 0x00000009 (1.261169e-44)
-  288. copy_constant                  $29 = 0x00000001 (1.401298e-45)
-  289. cmpeq_4_ints                   $22..25 = equal($22..25, $26..29)
-  290. bitwise_and_2_ints             $22..23 &= $24..25
-  291. bitwise_and_int                $22 &= $23
-  292. label                          label 0x00000021
-  293. copy_slot_masked               $21 = Mask($22)
-  294. label                          label 0x00000002
-  295. load_condition_mask            CondMask = $40
-  296. zero_slot_unmasked             $0 = 0
-  297. merge_condition_mask           CondMask = $20 & $21
-  298. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 1 at #326)
-  299. copy_constant                  $1 = 0x00000009 (1.261169e-44)
-  300. swizzle_4                      $1..4 = ($1..4).xxxx
-  301. copy_4_slots_unmasked          result₅ = $1..4
-  302. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 37 at #315)
-  303. copy_constant                  i₈ = 0x00000001 (1.401298e-45)
-  304. label                          label 0x00000026
-  305. copy_4_slots_unmasked          $1..4 = result₅(1..3), i₈
-  306. copy_4_slots_masked            result₅ = Mask($1..4)
-  307. copy_slot_unmasked             $1 = i₈
-  308. copy_constant                  $2 = 0x00000001 (1.401298e-45)
-  309. add_int                        $1 += $2
-  310. copy_slot_masked               i₈ = Mask($1)
-  311. copy_constant                  $2 = 0x00000004 (5.605194e-45)
-  312. cmpne_int                      $1 = notEqual($1, $2)
-  313. stack_rewind
-  314. branch_if_no_active_lanes_eq   branch -10 (label 38 at #304) if no lanes of $1 == 0x00000000 (0.0)
-  315. label                          label 0x00000025
-  316. copy_4_slots_unmasked          $1..4 = result₅
-  317. copy_constant                  $5 = 0x00000009 (1.261169e-44)
-  318. copy_constant                  $6 = 0x00000001 (1.401298e-45)
-  319. copy_constant                  $7 = 0x00000002 (2.802597e-45)
-  320. copy_constant                  $8 = 0x00000003 (4.203895e-45)
-  321. cmpeq_4_ints                   $1..4 = equal($1..4, $5..8)
-  322. bitwise_and_2_ints             $1..2 &= $3..4
-  323. bitwise_and_int                $1 &= $2
-  324. label                          label 0x00000024
-  325. copy_slot_masked               $0 = Mask($1)
-  326. label                          label 0x00000001
-  327. load_condition_mask            CondMask = $20
-  328. swizzle_4                      $0..3 = ($0..3).xxxx
-  329. copy_4_constants               $4..7 = colorRed
-  330. copy_4_constants               $8..11 = colorGreen
-  331. mix_4_ints                     $0..3 = mix($4..7, $8..11, $0..3)
-  332. load_src                       src.rgba = $0..3
+    1. store_src                      $26..29 = src.rgba
+    2. copy_constant                  $28 = 0x3F000000 (0.5)
+    3. copy_constant                  $29 = 0x3F000000 (0.5)
+    4. cmpeq_2_floats                 $26..27 = equal($26..27, $28..29)
+    5. bitwise_and_int                $26 &= $27
+    6. store_src_rg                   pos = src.rg
+    7. init_lane_masks                CondMask = LoopMask = RetMask = true
+    8. zero_slot_unmasked             kZero = 0
+    9. copy_constant                  kTen = 0x0000000A (1.401298e-44)
+   10. copy_slot_unmasked             $2 = pos(0)
+   11. copy_constant                  $3 = colorGreen(1)
+   12. max_float                      $2 = max($2, $3)
+   13. copy_constant                  $3 = colorGreen(3)
+   14. min_float                      $2 = min($2, $3)
+   15. cast_to_int_from_float         $2 = FloatToInt($2)
+   16. copy_constant                  $3 = 0x00000005 (7.006492e-45)
+   17. mul_int                        $2 *= $3
+   18. copy_slot_unmasked             five = $2
+   19. store_condition_mask           $40 = CondMask
+   20. store_condition_mask           $50 = CondMask
+   21. store_condition_mask           $30 = CondMask
+   22. store_condition_mask           $76 = CondMask
+   23. store_condition_mask           $60 = CondMask
+   24. store_condition_mask           $20 = CondMask
+   25. store_condition_mask           $70 = CondMask
+   26. store_condition_mask           $14 = CondMask
+   27. branch_if_no_lanes_active      branch_if_no_lanes_active +33 (label 9 at #60)
+   28. store_return_mask              $15 = RetMask
+   29. copy_slot_unmasked             five₁ = five
+   30. copy_slot_unmasked             i = kZero
+   31. store_loop_mask                $16 = LoopMask
+   32. jump                           jump +15 (label 11 at #47)
+   33. label                          label 0x0000000C
+   34. store_condition_mask           $17 = CondMask
+   35. copy_slot_unmasked             $18 = i
+   36. copy_slot_unmasked             $19 = five₁
+   37. cmpeq_int                      $18 = equal($18, $19)
+   38. merge_condition_mask           CondMask = $17 & $18
+   39. copy_slot_unmasked             $19 = i
+   40. copy_slot_masked               [return_loop].result = Mask($19)
+   41. mask_off_return_mask           RetMask &= ~(CondMask & LoopMask & RetMask)
+   42. load_condition_mask            CondMask = $17
+   43. copy_slot_unmasked             $17 = i
+   44. copy_constant                  $18 = 0x00000001 (1.401298e-45)
+   45. add_int                        $17 += $18
+   46. copy_slot_masked               i = Mask($17)
+   47. label                          label 0x0000000B
+   48. copy_slot_unmasked             $17 = i
+   49. copy_constant                  $18 = 0x0000000A (1.401298e-44)
+   50. cmplt_int                      $17 = lessThan($17, $18)
+   51. merge_loop_mask                LoopMask &= $17
+   52. stack_rewind
+   53. branch_if_any_lanes_active     branch_if_any_lanes_active -20 (label 12 at #33)
+   54. label                          label 0x0000000A
+   55. load_loop_mask                 LoopMask = $16
+   56. zero_slot_unmasked             $16 = 0
+   57. copy_slot_masked               [return_loop].result = Mask($16)
+   58. load_return_mask               RetMask = $15
+   59. copy_slot_unmasked             $15 = [return_loop].result
+   60. label                          label 0x00000009
+   61. copy_constant                  $16 = 0x00000005 (7.006492e-45)
+   62. cmpeq_int                      $15 = equal($15, $16)
+   63. zero_slot_unmasked             $71 = 0
+   64. merge_condition_mask           CondMask = $14 & $15
+   65. branch_if_no_lanes_active      branch_if_no_lanes_active +38 (label 8 at #103)
+   66. copy_slot_unmasked             five₂ = five
+   67. zero_2_slots_unmasked          sum, i₁ = 0
+   68. store_loop_mask                $72 = LoopMask
+   69. jump                           jump +20 (label 15 at #89)
+   70. label                          label 0x00000010
+   71. zero_slot_unmasked             $0 = 0
+   72. store_condition_mask           $73 = CondMask
+   73. copy_slot_unmasked             $74 = i₁
+   74. copy_slot_unmasked             $75 = five₂
+   75. cmplt_int                      $74 = lessThan($74, $75)
+   76. merge_condition_mask           CondMask = $73 & $74
+   77. copy_constant                  $1 = 0xFFFFFFFF
+   78. copy_slot_masked               $0 = Mask($1)
+   79. mask_off_loop_mask             LoopMask &= ~(CondMask & LoopMask & RetMask)
+   80. load_condition_mask            CondMask = $73
+   81. copy_2_slots_unmasked          $73..74 = sum, i₁
+   82. add_int                        $73 += $74
+   83. copy_slot_masked               sum = Mask($73)
+   84. reenable_loop_mask             LoopMask |= $0
+   85. copy_slot_unmasked             $73 = i₁
+   86. copy_constant                  $74 = 0x00000001 (1.401298e-45)
+   87. add_int                        $73 += $74
+   88. copy_slot_masked               i₁ = Mask($73)
+   89. label                          label 0x0000000F
+   90. copy_slot_unmasked             $73 = i₁
+   91. copy_slot_unmasked             $74 = kTen
+   92. cmplt_int                      $73 = lessThan($73, $74)
+   93. merge_loop_mask                LoopMask &= $73
+   94. stack_rewind
+   95. branch_if_any_lanes_active     branch_if_any_lanes_active -25 (label 16 at #70)
+   96. label                          label 0x0000000E
+   97. load_loop_mask                 LoopMask = $72
+   98. copy_slot_unmasked             $72 = sum
+   99. label                          label 0x0000000D
+  100. copy_constant                  $73 = 0x00000023 (4.904545e-44)
+  101. cmpeq_int                      $72 = equal($72, $73)
+  102. copy_slot_masked               $71 = Mask($72)
+  103. label                          label 0x00000008
+  104. load_condition_mask            CondMask = $14
+  105. zero_slot_unmasked             $21 = 0
+  106. merge_condition_mask           CondMask = $70 & $71
+  107. branch_if_no_lanes_active      branch_if_no_lanes_active +38 (label 7 at #145)
+  108. copy_constant                  five₃ = 0x00000005 (7.006492e-45)
+  109. zero_slot_unmasked             sum₁ = 0
+  110. copy_constant                  kOne = 0x00000001 (1.401298e-45)
+  111. zero_slot_unmasked             i₂ = 0
+  112. store_loop_mask                $22 = LoopMask
+  113. jump                           jump +18 (label 19 at #131)
+  114. label                          label 0x00000014
+  115. store_condition_mask           $23 = CondMask
+  116. copy_slot_unmasked             $24 = five₃
+  117. copy_slot_unmasked             $25 = i₂
+  118. cmplt_int                      $24 = lessThan($24, $25)
+  119. merge_condition_mask           CondMask = $23 & $24
+  120. branch_if_all_lanes_active     branch_if_all_lanes_active +18 (label 18 at #138)
+  121. mask_off_loop_mask             LoopMask &= ~(CondMask & LoopMask & RetMask)
+  122. load_condition_mask            CondMask = $23
+  123. copy_slot_unmasked             $23 = sum₁
+  124. copy_slot_unmasked             $24 = i₂
+  125. add_int                        $23 += $24
+  126. copy_slot_masked               sum₁ = Mask($23)
+  127. copy_slot_unmasked             $23 = i₂
+  128. copy_slot_unmasked             $24 = kOne
+  129. add_int                        $23 += $24
+  130. copy_slot_masked               i₂ = Mask($23)
+  131. label                          label 0x00000013
+  132. copy_slot_unmasked             $23 = i₂
+  133. copy_constant                  $24 = 0x0000000A (1.401298e-44)
+  134. cmplt_int                      $23 = lessThan($23, $24)
+  135. merge_loop_mask                LoopMask &= $23
+  136. stack_rewind
+  137. branch_if_any_lanes_active     branch_if_any_lanes_active -23 (label 20 at #114)
+  138. label                          label 0x00000012
+  139. load_loop_mask                 LoopMask = $22
+  140. copy_slot_unmasked             $22 = sum₁
+  141. label                          label 0x00000011
+  142. copy_constant                  $23 = 0x0000000F (2.101948e-44)
+  143. cmpeq_int                      $22 = equal($22, $23)
+  144. copy_slot_masked               $21 = Mask($22)
+  145. label                          label 0x00000007
+  146. load_condition_mask            CondMask = $70
+  147. zero_slot_unmasked             $61 = 0
+  148. merge_condition_mask           CondMask = $20 & $21
+  149. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 6 at #177)
+  150. copy_constant                  $62 = 0x00000009 (1.261169e-44)
+  151. swizzle_4                      $62..65 = ($62..65).xxxx
+  152. copy_4_slots_unmasked          result = $62..65
+  153. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 22 at #166)
+  154. copy_constant                  i₃ = 0x00000001 (1.401298e-45)
+  155. label                          label 0x00000017
+  156. copy_4_slots_unmasked          $62..65 = result(1..3), i₃
+  157. copy_4_slots_masked            result = Mask($62..65)
+  158. copy_slot_unmasked             $62 = i₃
+  159. copy_constant                  $63 = 0x00000001 (1.401298e-45)
+  160. add_int                        $62 += $63
+  161. copy_slot_masked               i₃ = Mask($62)
+  162. copy_constant                  $63 = 0x00000003 (4.203895e-45)
+  163. cmple_int                      $62 = lessThanEqual($62, $63)
+  164. stack_rewind
+  165. branch_if_no_active_lanes_eq   branch -10 (label 23 at #155) if no lanes of $62 == 0x00000000 (0.0)
+  166. label                          label 0x00000016
+  167. copy_4_slots_unmasked          $62..65 = result
+  168. copy_constant                  $66 = 0x00000009 (1.261169e-44)
+  169. copy_constant                  $67 = 0x00000001 (1.401298e-45)
+  170. copy_constant                  $68 = 0x00000002 (2.802597e-45)
+  171. copy_constant                  $69 = 0x00000003 (4.203895e-45)
+  172. cmpeq_4_ints                   $62..65 = equal($62..65, $66..69)
+  173. bitwise_and_2_ints             $62..63 &= $64..65
+  174. bitwise_and_int                $62 &= $63
+  175. label                          label 0x00000015
+  176. copy_slot_masked               $61 = Mask($62)
+  177. label                          label 0x00000006
+  178. load_condition_mask            CondMask = $20
+  179. zero_slot_unmasked             $77 = 0
+  180. merge_condition_mask           CondMask = $60 & $61
+  181. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 5 at #209)
+  182. copy_constant                  $78 = 0x00000009 (1.261169e-44)
+  183. swizzle_4                      $78..81 = ($78..81).xxxx
+  184. copy_4_slots_unmasked          result₁ = $78..81
+  185. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 25 at #198)
+  186. copy_constant                  i₄ = 0x00000001 (1.401298e-45)
+  187. label                          label 0x0000001A
+  188. copy_4_slots_unmasked          $78..81 = result₁(1..3), i₄
+  189. copy_4_slots_masked            result₁ = Mask($78..81)
+  190. copy_slot_unmasked             $78 = i₄
+  191. copy_constant                  $79 = 0x00000001 (1.401298e-45)
+  192. add_int                        $78 += $79
+  193. copy_slot_masked               i₄ = Mask($78)
+  194. copy_constant                  $79 = 0x00000004 (5.605194e-45)
+  195. cmplt_int                      $78 = lessThan($78, $79)
+  196. stack_rewind
+  197. branch_if_no_active_lanes_eq   branch -10 (label 26 at #187) if no lanes of $78 == 0x00000000 (0.0)
+  198. label                          label 0x00000019
+  199. copy_4_slots_unmasked          $78..81 = result₁
+  200. copy_constant                  $82 = 0x00000009 (1.261169e-44)
+  201. copy_constant                  $83 = 0x00000001 (1.401298e-45)
+  202. copy_constant                  $84 = 0x00000002 (2.802597e-45)
+  203. copy_constant                  $85 = 0x00000003 (4.203895e-45)
+  204. cmpeq_4_ints                   $78..81 = equal($78..81, $82..85)
+  205. bitwise_and_2_ints             $78..79 &= $80..81
+  206. bitwise_and_int                $78 &= $79
+  207. label                          label 0x00000018
+  208. copy_slot_masked               $77 = Mask($78)
+  209. label                          label 0x00000005
+  210. load_condition_mask            CondMask = $60
+  211. zero_slot_unmasked             $31 = 0
+  212. merge_condition_mask           CondMask = $76 & $77
+  213. branch_if_no_lanes_active      branch_if_no_lanes_active +29 (label 4 at #242)
+  214. copy_constant                  $32 = 0x00000009 (1.261169e-44)
+  215. swizzle_4                      $32..35 = ($32..35).xxxx
+  216. copy_4_slots_unmasked          result₂ = $32..35
+  217. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 28 at #231)
+  218. copy_constant                  i₅ = 0x00000003 (4.203895e-45)
+  219. label                          label 0x0000001D
+  220. copy_4_slots_unmasked          $32..35 = result₂(1..3), i₅
+  221. copy_4_slots_masked            result₂ = Mask($32..35)
+  222. copy_slot_unmasked             $32 = i₅
+  223. copy_constant                  $33 = 0x00000001 (1.401298e-45)
+  224. sub_int                        $32 -= $33
+  225. copy_slot_masked               i₅ = Mask($32)
+  226. copy_constant                  $32 = 0x00000001 (1.401298e-45)
+  227. copy_slot_unmasked             $33 = i₅
+  228. cmple_int                      $32 = lessThanEqual($32, $33)
+  229. stack_rewind
+  230. branch_if_no_active_lanes_eq   branch -11 (label 29 at #219) if no lanes of $32 == 0x00000000 (0.0)
+  231. label                          label 0x0000001C
+  232. copy_4_slots_unmasked          $32..35 = result₂
+  233. copy_constant                  $36 = 0x00000009 (1.261169e-44)
+  234. copy_constant                  $37 = 0x00000003 (4.203895e-45)
+  235. copy_constant                  $38 = 0x00000002 (2.802597e-45)
+  236. copy_constant                  $39 = 0x00000001 (1.401298e-45)
+  237. cmpeq_4_ints                   $32..35 = equal($32..35, $36..39)
+  238. bitwise_and_2_ints             $32..33 &= $34..35
+  239. bitwise_and_int                $32 &= $33
+  240. label                          label 0x0000001B
+  241. copy_slot_masked               $31 = Mask($32)
+  242. label                          label 0x00000004
+  243. load_condition_mask            CondMask = $76
+  244. zero_slot_unmasked             $51 = 0
+  245. merge_condition_mask           CondMask = $30 & $31
+  246. branch_if_no_lanes_active      branch_if_no_lanes_active +29 (label 3 at #275)
+  247. copy_constant                  $52 = 0x00000009 (1.261169e-44)
+  248. swizzle_4                      $52..55 = ($52..55).xxxx
+  249. copy_4_slots_unmasked          result₃ = $52..55
+  250. branch_if_no_lanes_active      branch_if_no_lanes_active +14 (label 31 at #264)
+  251. copy_constant                  i₆ = 0x00000003 (4.203895e-45)
+  252. label                          label 0x00000020
+  253. copy_4_slots_unmasked          $52..55 = result₃(1..3), i₆
+  254. copy_4_slots_masked            result₃ = Mask($52..55)
+  255. copy_slot_unmasked             $52 = i₆
+  256. copy_constant                  $53 = 0x00000001 (1.401298e-45)
+  257. sub_int                        $52 -= $53
+  258. copy_slot_masked               i₆ = Mask($52)
+  259. zero_slot_unmasked             $52 = 0
+  260. copy_slot_unmasked             $53 = i₆
+  261. cmplt_int                      $52 = lessThan($52, $53)
+  262. stack_rewind
+  263. branch_if_no_active_lanes_eq   branch -11 (label 32 at #252) if no lanes of $52 == 0x00000000 (0.0)
+  264. label                          label 0x0000001F
+  265. copy_4_slots_unmasked          $52..55 = result₃
+  266. copy_constant                  $56 = 0x00000009 (1.261169e-44)
+  267. copy_constant                  $57 = 0x00000003 (4.203895e-45)
+  268. copy_constant                  $58 = 0x00000002 (2.802597e-45)
+  269. copy_constant                  $59 = 0x00000001 (1.401298e-45)
+  270. cmpeq_4_ints                   $52..55 = equal($52..55, $56..59)
+  271. bitwise_and_2_ints             $52..53 &= $54..55
+  272. bitwise_and_int                $52 &= $53
+  273. label                          label 0x0000001E
+  274. copy_slot_masked               $51 = Mask($52)
+  275. label                          label 0x00000003
+  276. load_condition_mask            CondMask = $30
+  277. zero_slot_unmasked             $41 = 0
+  278. merge_condition_mask           CondMask = $50 & $51
+  279. branch_if_no_lanes_active      branch_if_no_lanes_active +20 (label 2 at #299)
+  280. copy_constant                  $42 = 0x00000009 (1.261169e-44)
+  281. swizzle_4                      $42..45 = ($42..45).xxxx
+  282. copy_4_slots_unmasked          result₄ = $42..45
+  283. branch_if_no_lanes_active      branch_if_no_lanes_active +5 (label 34 at #288)
+  284. copy_constant                  i₇ = 0x00000001 (1.401298e-45)
+  285. label                          label 0x00000023
+  286. copy_4_slots_unmasked          $42..45 = result₄(1..3), i₇
+  287. copy_4_slots_masked            result₄ = Mask($42..45)
+  288. label                          label 0x00000022
+  289. copy_4_slots_unmasked          $42..45 = result₄
+  290. copy_constant                  $46 = 0x00000009 (1.261169e-44)
+  291. copy_constant                  $47 = 0x00000009 (1.261169e-44)
+  292. copy_constant                  $48 = 0x00000009 (1.261169e-44)
+  293. copy_constant                  $49 = 0x00000001 (1.401298e-45)
+  294. cmpeq_4_ints                   $42..45 = equal($42..45, $46..49)
+  295. bitwise_and_2_ints             $42..43 &= $44..45
+  296. bitwise_and_int                $42 &= $43
+  297. label                          label 0x00000021
+  298. copy_slot_masked               $41 = Mask($42)
+  299. label                          label 0x00000002
+  300. load_condition_mask            CondMask = $50
+  301. zero_slot_unmasked             $2 = 0
+  302. merge_condition_mask           CondMask = $40 & $41
+  303. branch_if_no_lanes_active      branch_if_no_lanes_active +28 (label 1 at #331)
+  304. copy_constant                  $3 = 0x00000009 (1.261169e-44)
+  305. swizzle_4                      $3..6 = ($3..6).xxxx
+  306. copy_4_slots_unmasked          result₅ = $3..6
+  307. branch_if_no_lanes_active      branch_if_no_lanes_active +13 (label 37 at #320)
+  308. copy_constant                  i₈ = 0x00000001 (1.401298e-45)
+  309. label                          label 0x00000026
+  310. copy_4_slots_unmasked          $3..6 = result₅(1..3), i₈
+  311. copy_4_slots_masked            result₅ = Mask($3..6)
+  312. copy_slot_unmasked             $3 = i₈
+  313. copy_constant                  $4 = 0x00000001 (1.401298e-45)
+  314. add_int                        $3 += $4
+  315. copy_slot_masked               i₈ = Mask($3)
+  316. copy_constant                  $4 = 0x00000004 (5.605194e-45)
+  317. cmpne_int                      $3 = notEqual($3, $4)
+  318. stack_rewind
+  319. branch_if_no_active_lanes_eq   branch -10 (label 38 at #309) if no lanes of $3 == 0x00000000 (0.0)
+  320. label                          label 0x00000025
+  321. copy_4_slots_unmasked          $3..6 = result₅
+  322. copy_constant                  $7 = 0x00000009 (1.261169e-44)
+  323. copy_constant                  $8 = 0x00000001 (1.401298e-45)
+  324. copy_constant                  $9 = 0x00000002 (2.802597e-45)
+  325. copy_constant                  $10 = 0x00000003 (4.203895e-45)
+  326. cmpeq_4_ints                   $3..6 = equal($3..6, $7..10)
+  327. bitwise_and_2_ints             $3..4 &= $5..6
+  328. bitwise_and_int                $3 &= $4
+  329. label                          label 0x00000024
+  330. copy_slot_masked               $2 = Mask($3)
+  331. label                          label 0x00000001
+  332. load_condition_mask            CondMask = $40
+  333. swizzle_4                      $2..5 = ($2..5).xxxx
+  334. copy_4_constants               $6..9 = colorRed
+  335. copy_4_constants               $10..13 = colorGreen
+  336. mix_4_ints                     $2..5 = mix($6..9, $10..13, $2..5)
+  337. load_src                       src.rgba = $2..5
diff --git a/tests/sksl/runtime/MultipleCallsInOneStatement.skrp b/tests/sksl/runtime/MultipleCallsInOneStatement.skrp
index bea0f70..d14a52e 100644
--- a/tests/sksl/runtime/MultipleCallsInOneStatement.skrp
+++ b/tests/sksl/runtime/MultipleCallsInOneStatement.skrp
@@ -1,21 +1,26 @@
-    1. store_src_rg                   pos = src.rg
-    2. init_lane_masks                CondMask = LoopMask = RetMask = true
-    3. copy_constant                  x = 0x00000005 (7.006492e-45)
-    4. copy_slot_unmasked             $0 = x
-    5. label                          label 0x00000000
-    6. copy_constant                  x = 0x00000003 (4.203895e-45)
-    7. copy_slot_unmasked             $1 = x
-    8. label                          label 0x00000001
-    9. add_int                        $0 += $1
-   10. copy_constant                  x = 0x00000002 (2.802597e-45)
-   11. copy_slot_unmasked             $1 = x
-   12. label                          label 0x00000002
-   13. add_int                        $0 += $1
-   14. copy_slot_unmasked             ten = $0
-   15. copy_constant                  $1 = 0x0000000A (1.401298e-44)
-   16. cmpeq_int                      $0 = equal($0, $1)
-   17. swizzle_4                      $0..3 = ($0..3).xxxx
-   18. copy_4_constants               $4..7 = colorRed
-   19. copy_4_constants               $8..11 = colorGreen
-   20. mix_4_ints                     $0..3 = mix($4..7, $8..11, $0..3)
-   21. load_src                       src.rgba = $0..3
+    1. store_src                      $12..15 = src.rgba
+    2. copy_constant                  $14 = 0x3F000000 (0.5)
+    3. copy_constant                  $15 = 0x3F000000 (0.5)
+    4. cmpeq_2_floats                 $12..13 = equal($12..13, $14..15)
+    5. bitwise_and_int                $12 &= $13
+    6. store_src_rg                   pos = src.rg
+    7. init_lane_masks                CondMask = LoopMask = RetMask = true
+    8. copy_constant                  x = 0x00000005 (7.006492e-45)
+    9. copy_slot_unmasked             $0 = x
+   10. label                          label 0x00000000
+   11. copy_constant                  x = 0x00000003 (4.203895e-45)
+   12. copy_slot_unmasked             $1 = x
+   13. label                          label 0x00000001
+   14. add_int                        $0 += $1
+   15. copy_constant                  x = 0x00000002 (2.802597e-45)
+   16. copy_slot_unmasked             $1 = x
+   17. label                          label 0x00000002
+   18. add_int                        $0 += $1
+   19. copy_slot_unmasked             ten = $0
+   20. copy_constant                  $1 = 0x0000000A (1.401298e-44)
+   21. cmpeq_int                      $0 = equal($0, $1)
+   22. swizzle_4                      $0..3 = ($0..3).xxxx
+   23. copy_4_constants               $4..7 = colorRed
+   24. copy_4_constants               $8..11 = colorGreen
+   25. mix_4_ints                     $0..3 = mix($4..7, $8..11, $0..3)
+   26. load_src                       src.rgba = $0..3
diff --git a/tests/sksl/runtime/PrecisionQualifiers.skrp b/tests/sksl/runtime/PrecisionQualifiers.skrp
index f867df42..0d17d47 100644
--- a/tests/sksl/runtime/PrecisionQualifiers.skrp
+++ b/tests/sksl/runtime/PrecisionQualifiers.skrp
@@ -1,241 +1,246 @@
-    1. store_src_rg                   coords = src.rg
-    2. init_lane_masks                CondMask = LoopMask = RetMask = true
-    3. zero_4_slots_unmasked          zero = 0
-    4. copy_constant                  $0 = 0x3F800000 (1.0)
-    5. swizzle_4                      $0..3 = ($0..3).xxxx
-    6. copy_4_slots_unmasked          one = $0..3
-    7. copy_4_constants               $0..3 = colorGreen
-    8. copy_4_slots_unmasked          green = $0..3
-    9. copy_4_slots_unmasked          $4..7 = one
-   10. mul_4_floats                   $0..3 *= $4..7
-   11. copy_4_slots_unmasked          $4..7 = zero
-   12. add_4_floats                   $0..3 += $4..7
+    1. store_src                      $16..19 = src.rgba
+    2. copy_constant                  $18 = 0x3F000000 (0.5)
+    3. copy_constant                  $19 = 0x3F000000 (0.5)
+    4. cmpeq_2_floats                 $16..17 = equal($16..17, $18..19)
+    5. bitwise_and_int                $16 &= $17
+    6. store_src_rg                   coords = src.rg
+    7. init_lane_masks                CondMask = LoopMask = RetMask = true
+    8. zero_4_slots_unmasked          zero = 0
+    9. copy_constant                  $0 = 0x3F800000 (1.0)
+   10. swizzle_4                      $0..3 = ($0..3).xxxx
+   11. copy_4_slots_unmasked          one = $0..3
+   12. copy_4_constants               $0..3 = colorGreen
    13. copy_4_slots_unmasked          green = $0..3
-   14. copy_4_constants               $0..3 = colorRed
-   15. copy_4_slots_unmasked          red = $0..3
+   14. copy_4_slots_unmasked          $4..7 = one
+   15. mul_4_floats                   $0..3 *= $4..7
    16. copy_4_slots_unmasked          $4..7 = zero
    17. add_4_floats                   $0..3 += $4..7
-   18. copy_4_slots_unmasked          $4..7 = one
-   19. mul_4_floats                   $0..3 *= $4..7
+   18. copy_4_slots_unmasked          green = $0..3
+   19. copy_4_constants               $0..3 = colorRed
    20. copy_4_slots_unmasked          red = $0..3
-   21. store_condition_mask           $12 = CondMask
-   22. store_condition_mask           $51 = CondMask
-   23. store_condition_mask           $55 = CondMask
-   24. store_condition_mask           $16 = CondMask
-   25. store_condition_mask           $66 = CondMask
-   26. store_condition_mask           $62 = CondMask
-   27. branch_if_no_lanes_active      branch_if_no_lanes_active +10 (label 7 at #37)
-   28. copy_constant                  mp = 0x3F000000 (0.5)
-   29. copy_slot_unmasked             hp = mp
-   30. copy_constant                  ihp = 0x00000002 (2.802597e-45)
-   31. copy_slot_unmasked             imp = ihp
-   32. copy_2_slots_unmasked          $63..64 = mp, hp
-   33. cmpeq_float                    $63 = equal($63, $64)
-   34. copy_2_slots_unmasked          $64..65 = ihp, imp
-   35. cmpeq_int                      $64 = equal($64, $65)
-   36. bitwise_and_int                $63 &= $64
-   37. label                          label 0x00000007
-   38. zero_slot_unmasked             $67 = 0
-   39. merge_condition_mask           CondMask = $62 & $63
-   40. branch_if_no_lanes_active      branch_if_no_lanes_active +59 (label 6 at #99)
-   41. copy_constant                  $68 = 0x40000000 (2.0)
-   42. copy_slot_unmasked             $69 = $68
-   43. copy_2_slots_unmasked          mp2 = $68..69
-   44. copy_2_slots_unmasked          hp2 = $68..69
-   45. copy_constant                  $68 = 0x40400000 (3.0)
-   46. swizzle_3                      $68..70 = ($68..70).xxx
-   47. copy_3_slots_unmasked          mp3 = $68..70
-   48. copy_3_slots_unmasked          hp3 = $68..70
-   49. copy_constant                  $68 = 0x40800000 (4.0)
-   50. swizzle_4                      $68..71 = ($68..71).xxxx
-   51. copy_4_slots_unmasked          mp4 = $68..71
-   52. copy_4_slots_unmasked          hp4 = $68..71
-   53. copy_constant                  $68 = 0x00000002 (2.802597e-45)
-   54. copy_slot_unmasked             $69 = $68
-   55. copy_2_slots_unmasked          ihp2 = $68..69
-   56. copy_2_slots_unmasked          imp2 = $68..69
-   57. copy_constant                  $68 = 0x00000003 (4.203895e-45)
-   58. swizzle_3                      $68..70 = ($68..70).xxx
-   59. copy_3_slots_unmasked          ihp3 = $68..70
-   60. copy_3_slots_unmasked          imp3 = $68..70
-   61. copy_constant                  $68 = 0x00000004 (5.605194e-45)
-   62. swizzle_4                      $68..71 = ($68..71).xxxx
-   63. copy_4_slots_unmasked          ihp4 = $68..71
-   64. copy_4_slots_unmasked          imp4 = $68..71
-   65. copy_4_slots_unmasked          $68..71 = mp2, hp2
-   66. cmpeq_2_floats                 $68..69 = equal($68..69, $70..71)
-   67. bitwise_and_int                $68 &= $69
-   68. copy_3_slots_unmasked          $69..71 = hp3
-   69. copy_3_slots_unmasked          $72..74 = mp3
-   70. cmpeq_3_floats                 $69..71 = equal($69..71, $72..74)
-   71. bitwise_and_int                $70 &= $71
-   72. bitwise_and_int                $69 &= $70
-   73. bitwise_and_int                $68 &= $69
-   74. copy_4_slots_unmasked          $69..72 = mp4
-   75. copy_4_slots_unmasked          $73..76 = hp4
-   76. cmpeq_4_floats                 $69..72 = equal($69..72, $73..76)
-   77. bitwise_and_2_ints             $69..70 &= $71..72
-   78. bitwise_and_int                $69 &= $70
-   79. bitwise_and_int                $68 &= $69
-   80. copy_2_slots_unmasked          $69..70 = imp2
-   81. copy_2_slots_unmasked          $71..72 = ihp2
-   82. cmpeq_2_ints                   $69..70 = equal($69..70, $71..72)
-   83. bitwise_and_int                $69 &= $70
-   84. bitwise_and_int                $68 &= $69
-   85. copy_4_slots_unmasked          $69..72 = ihp3, imp3(0)
-   86. copy_2_slots_unmasked          $73..74 = imp3(1..2)
-   87. cmpeq_3_ints                   $69..71 = equal($69..71, $72..74)
-   88. bitwise_and_int                $70 &= $71
-   89. bitwise_and_int                $69 &= $70
-   90. bitwise_and_int                $68 &= $69
-   91. copy_4_slots_unmasked          $69..72 = imp4
-   92. copy_4_slots_unmasked          $73..76 = ihp4
-   93. cmpeq_4_ints                   $69..72 = equal($69..72, $73..76)
-   94. bitwise_and_2_ints             $69..70 &= $71..72
-   95. bitwise_and_int                $69 &= $70
-   96. bitwise_and_int                $68 &= $69
-   97. label                          label 0x00000008
-   98. copy_slot_masked               $67 = Mask($68)
-   99. label                          label 0x00000006
-  100. load_condition_mask            CondMask = $62
-  101. zero_slot_unmasked             $17 = 0
-  102. merge_condition_mask           CondMask = $66 & $67
-  103. branch_if_no_lanes_active      branch_if_no_lanes_active +60 (label 5 at #163)
-  104. zero_slot_unmasked             $18 = 0
-  105. copy_constant                  $19 = 0x40000000 (2.0)
-  106. swizzle_4                      $18..21 = ($18..21).yxxy
-  107. copy_4_slots_unmasked          mp2₁ = $18..21
-  108. copy_4_slots_unmasked          hp2₁ = $18..21
-  109. zero_slot_unmasked             $18 = 0
-  110. copy_constant                  $19 = 0x40400000 (3.0)
-  111. shuffle                        $18..26 = ($18..26)[1 0 0 0 1 0 0 0 1]
-  112. copy_4_slots_unmasked          mp3₁(0..3) = $18..21
-  113. copy_4_slots_unmasked          mp3₁(4..7) = $22..25
-  114. copy_slot_unmasked             mp3₁(8) = $26
-  115. copy_4_slots_unmasked          hp3₁(0..3) = $18..21
-  116. copy_4_slots_unmasked          hp3₁(4..7) = $22..25
-  117. copy_slot_unmasked             hp3₁(8) = $26
-  118. zero_slot_unmasked             $18 = 0
-  119. copy_constant                  $19 = 0x40800000 (4.0)
-  120. shuffle                        $18..33 = ($18..33)[1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1]
-  121. copy_4_slots_unmasked          mp4₁(0..3) = $18..21
-  122. copy_4_slots_unmasked          mp4₁(4..7) = $22..25
-  123. copy_4_slots_unmasked          mp4₁(8..11) = $26..29
-  124. copy_4_slots_unmasked          mp4₁(12..15) = $30..33
-  125. copy_4_slots_unmasked          hp4₁(0..3) = $18..21
-  126. copy_4_slots_unmasked          hp4₁(4..7) = $22..25
-  127. copy_4_slots_unmasked          hp4₁(8..11) = $26..29
-  128. copy_4_slots_unmasked          hp4₁(12..15) = $30..33
-  129. copy_4_slots_unmasked          $18..21 = mp2₁
-  130. copy_4_slots_unmasked          $22..25 = hp2₁
-  131. cmpeq_4_floats                 $18..21 = equal($18..21, $22..25)
-  132. bitwise_and_2_ints             $18..19 &= $20..21
-  133. bitwise_and_int                $18 &= $19
-  134. copy_4_slots_unmasked          $19..22 = hp3₁(0..3)
-  135. copy_4_slots_unmasked          $23..26 = hp3₁(4..7)
-  136. copy_slot_unmasked             $27 = hp3₁(8)
-  137. copy_4_slots_unmasked          $28..31 = mp3₁(0..3)
-  138. copy_4_slots_unmasked          $32..35 = mp3₁(4..7)
-  139. copy_slot_unmasked             $36 = mp3₁(8)
-  140. cmpeq_n_floats                 $19..27 = equal($19..27, $28..36)
-  141. bitwise_and_4_ints             $20..23 &= $24..27
-  142. bitwise_and_2_ints             $20..21 &= $22..23
-  143. bitwise_and_int                $20 &= $21
-  144. bitwise_and_int                $19 &= $20
-  145. bitwise_and_int                $18 &= $19
-  146. copy_4_slots_unmasked          $19..22 = mp4₁(0..3)
-  147. copy_4_slots_unmasked          $23..26 = mp4₁(4..7)
-  148. copy_4_slots_unmasked          $27..30 = mp4₁(8..11)
-  149. copy_4_slots_unmasked          $31..34 = mp4₁(12..15)
-  150. copy_4_slots_unmasked          $35..38 = hp4₁(0..3)
-  151. copy_4_slots_unmasked          $39..42 = hp4₁(4..7)
-  152. copy_4_slots_unmasked          $43..46 = hp4₁(8..11)
-  153. copy_4_slots_unmasked          $47..50 = hp4₁(12..15)
-  154. cmpeq_n_floats                 $19..34 = equal($19..34, $35..50)
-  155. bitwise_and_4_ints             $27..30 &= $31..34
-  156. bitwise_and_4_ints             $23..26 &= $27..30
-  157. bitwise_and_4_ints             $19..22 &= $23..26
-  158. bitwise_and_2_ints             $19..20 &= $21..22
-  159. bitwise_and_int                $19 &= $20
-  160. bitwise_and_int                $18 &= $19
-  161. label                          label 0x00000009
-  162. copy_slot_masked               $17 = Mask($18)
-  163. label                          label 0x00000005
-  164. load_condition_mask            CondMask = $66
-  165. zero_slot_unmasked             $56 = 0
-  166. merge_condition_mask           CondMask = $16 & $17
-  167. branch_if_no_lanes_active      branch_if_no_lanes_active +35 (label 4 at #202)
-  168. zero_slot_unmasked             mf[0] = 0
-  169. copy_constant                  $57 = 0x3F800000 (1.0)
-  170. copy_slot_masked               mf[0] = Mask($57)
-  171. zero_slot_unmasked             hf[0] = 0
-  172. copy_constant                  $57 = 0x3F800000 (1.0)
-  173. copy_slot_masked               hf[0] = Mask($57)
-  174. zero_4_slots_unmasked          mv[0], mv[1] = 0
-  175. zero_slot_unmasked             $57 = 0
-  176. copy_constant                  $58 = 0x3F800000 (1.0)
-  177. copy_2_slots_masked            mv[0] = Mask($57..58)
-  178. copy_constant                  $57 = 0x40000000 (2.0)
-  179. copy_constant                  $58 = 0x40400000 (3.0)
-  180. copy_2_slots_masked            mv[1] = Mask($57..58)
-  181. zero_4_slots_unmasked          hv[0], hv[1] = 0
-  182. zero_slot_unmasked             $57 = 0
-  183. copy_constant                  $58 = 0x3F800000 (1.0)
-  184. copy_2_slots_masked            hv[0] = Mask($57..58)
-  185. copy_constant                  $57 = 0x40000000 (2.0)
-  186. copy_constant                  $58 = 0x40400000 (3.0)
-  187. copy_2_slots_masked            hv[1] = Mask($57..58)
-  188. copy_2_slots_unmasked          $57..58 = mf[0], hf[0]
-  189. cmpeq_float                    $57 = equal($57, $58)
-  190. copy_2_slots_unmasked          $58..59 = hv[0]
-  191. copy_2_slots_unmasked          $60..61 = mv[0]
-  192. cmpeq_2_floats                 $58..59 = equal($58..59, $60..61)
-  193. bitwise_and_int                $58 &= $59
-  194. bitwise_and_int                $57 &= $58
-  195. copy_2_slots_unmasked          $58..59 = mv[1]
-  196. copy_2_slots_unmasked          $60..61 = hv[1]
-  197. cmpeq_2_floats                 $58..59 = equal($58..59, $60..61)
-  198. bitwise_and_int                $58 &= $59
-  199. bitwise_and_int                $57 &= $58
-  200. label                          label 0x0000000A
-  201. copy_slot_masked               $56 = Mask($57)
-  202. label                          label 0x00000004
-  203. load_condition_mask            CondMask = $16
-  204. zero_slot_unmasked             $52 = 0
-  205. merge_condition_mask           CondMask = $55 & $56
-  206. branch_if_no_lanes_active      branch_if_no_lanes_active +7 (label 3 at #213)
-  207. copy_constant                  value = 0x3F800000 (1.0)
-  208. copy_slot_unmasked             $53 = value
-  209. copy_constant                  $54 = 0x3F800000 (1.0)
-  210. cmpeq_float                    $53 = equal($53, $54)
-  211. label                          label 0x0000000B
-  212. copy_slot_masked               $52 = Mask($53)
-  213. label                          label 0x00000003
-  214. load_condition_mask            CondMask = $55
-  215. zero_slot_unmasked             $13 = 0
-  216. merge_condition_mask           CondMask = $51 & $52
-  217. branch_if_no_lanes_active      branch_if_no_lanes_active +7 (label 2 at #224)
-  218. copy_constant                  value₁ = 0x40000000 (2.0)
-  219. copy_slot_unmasked             $14 = value₁
-  220. copy_constant                  $15 = 0x40000000 (2.0)
-  221. cmpeq_float                    $14 = equal($14, $15)
-  222. label                          label 0x0000000C
-  223. copy_slot_masked               $13 = Mask($14)
-  224. label                          label 0x00000002
-  225. load_condition_mask            CondMask = $51
-  226. zero_slot_unmasked             $0 = 0
-  227. merge_condition_mask           CondMask = $12 & $13
-  228. branch_if_no_lanes_active      branch_if_no_lanes_active +7 (label 1 at #235)
-  229. copy_constant                  value₂ = 0x40400000 (3.0)
-  230. copy_slot_unmasked             $1 = value₂
-  231. copy_constant                  $2 = 0x40400000 (3.0)
-  232. cmpeq_float                    $1 = equal($1, $2)
-  233. label                          label 0x0000000D
-  234. copy_slot_masked               $0 = Mask($1)
-  235. label                          label 0x00000001
-  236. load_condition_mask            CondMask = $12
-  237. swizzle_4                      $0..3 = ($0..3).xxxx
-  238. copy_4_slots_unmasked          $4..7 = red
-  239. copy_4_slots_unmasked          $8..11 = green
-  240. mix_4_ints                     $0..3 = mix($4..7, $8..11, $0..3)
-  241. load_src                       src.rgba = $0..3
+   21. copy_4_slots_unmasked          $4..7 = zero
+   22. add_4_floats                   $0..3 += $4..7
+   23. copy_4_slots_unmasked          $4..7 = one
+   24. mul_4_floats                   $0..3 *= $4..7
+   25. copy_4_slots_unmasked          red = $0..3
+   26. store_condition_mask           $27 = CondMask
+   27. store_condition_mask           $31 = CondMask
+   28. store_condition_mask           $20 = CondMask
+   29. store_condition_mask           $46 = CondMask
+   30. store_condition_mask           $35 = CondMask
+   31. store_condition_mask           $12 = CondMask
+   32. branch_if_no_lanes_active      branch_if_no_lanes_active +10 (label 7 at #42)
+   33. copy_constant                  mp = 0x3F000000 (0.5)
+   34. copy_slot_unmasked             hp = mp
+   35. copy_constant                  ihp = 0x00000002 (2.802597e-45)
+   36. copy_slot_unmasked             imp = ihp
+   37. copy_2_slots_unmasked          $13..14 = mp, hp
+   38. cmpeq_float                    $13 = equal($13, $14)
+   39. copy_2_slots_unmasked          $14..15 = ihp, imp
+   40. cmpeq_int                      $14 = equal($14, $15)
+   41. bitwise_and_int                $13 &= $14
+   42. label                          label 0x00000007
+   43. zero_slot_unmasked             $36 = 0
+   44. merge_condition_mask           CondMask = $12 & $13
+   45. branch_if_no_lanes_active      branch_if_no_lanes_active +59 (label 6 at #104)
+   46. copy_constant                  $37 = 0x40000000 (2.0)
+   47. copy_slot_unmasked             $38 = $37
+   48. copy_2_slots_unmasked          mp2 = $37..38
+   49. copy_2_slots_unmasked          hp2 = $37..38
+   50. copy_constant                  $37 = 0x40400000 (3.0)
+   51. swizzle_3                      $37..39 = ($37..39).xxx
+   52. copy_3_slots_unmasked          mp3 = $37..39
+   53. copy_3_slots_unmasked          hp3 = $37..39
+   54. copy_constant                  $37 = 0x40800000 (4.0)
+   55. swizzle_4                      $37..40 = ($37..40).xxxx
+   56. copy_4_slots_unmasked          mp4 = $37..40
+   57. copy_4_slots_unmasked          hp4 = $37..40
+   58. copy_constant                  $37 = 0x00000002 (2.802597e-45)
+   59. copy_slot_unmasked             $38 = $37
+   60. copy_2_slots_unmasked          ihp2 = $37..38
+   61. copy_2_slots_unmasked          imp2 = $37..38
+   62. copy_constant                  $37 = 0x00000003 (4.203895e-45)
+   63. swizzle_3                      $37..39 = ($37..39).xxx
+   64. copy_3_slots_unmasked          ihp3 = $37..39
+   65. copy_3_slots_unmasked          imp3 = $37..39
+   66. copy_constant                  $37 = 0x00000004 (5.605194e-45)
+   67. swizzle_4                      $37..40 = ($37..40).xxxx
+   68. copy_4_slots_unmasked          ihp4 = $37..40
+   69. copy_4_slots_unmasked          imp4 = $37..40
+   70. copy_4_slots_unmasked          $37..40 = mp2, hp2
+   71. cmpeq_2_floats                 $37..38 = equal($37..38, $39..40)
+   72. bitwise_and_int                $37 &= $38
+   73. copy_3_slots_unmasked          $38..40 = hp3
+   74. copy_3_slots_unmasked          $41..43 = mp3
+   75. cmpeq_3_floats                 $38..40 = equal($38..40, $41..43)
+   76. bitwise_and_int                $39 &= $40
+   77. bitwise_and_int                $38 &= $39
+   78. bitwise_and_int                $37 &= $38
+   79. copy_4_slots_unmasked          $38..41 = mp4
+   80. copy_4_slots_unmasked          $42..45 = hp4
+   81. cmpeq_4_floats                 $38..41 = equal($38..41, $42..45)
+   82. bitwise_and_2_ints             $38..39 &= $40..41
+   83. bitwise_and_int                $38 &= $39
+   84. bitwise_and_int                $37 &= $38
+   85. copy_2_slots_unmasked          $38..39 = imp2
+   86. copy_2_slots_unmasked          $40..41 = ihp2
+   87. cmpeq_2_ints                   $38..39 = equal($38..39, $40..41)
+   88. bitwise_and_int                $38 &= $39
+   89. bitwise_and_int                $37 &= $38
+   90. copy_4_slots_unmasked          $38..41 = ihp3, imp3(0)
+   91. copy_2_slots_unmasked          $42..43 = imp3(1..2)
+   92. cmpeq_3_ints                   $38..40 = equal($38..40, $41..43)
+   93. bitwise_and_int                $39 &= $40
+   94. bitwise_and_int                $38 &= $39
+   95. bitwise_and_int                $37 &= $38
+   96. copy_4_slots_unmasked          $38..41 = imp4
+   97. copy_4_slots_unmasked          $42..45 = ihp4
+   98. cmpeq_4_ints                   $38..41 = equal($38..41, $42..45)
+   99. bitwise_and_2_ints             $38..39 &= $40..41
+  100. bitwise_and_int                $38 &= $39
+  101. bitwise_and_int                $37 &= $38
+  102. label                          label 0x00000008
+  103. copy_slot_masked               $36 = Mask($37)
+  104. label                          label 0x00000006
+  105. load_condition_mask            CondMask = $12
+  106. zero_slot_unmasked             $47 = 0
+  107. merge_condition_mask           CondMask = $35 & $36
+  108. branch_if_no_lanes_active      branch_if_no_lanes_active +60 (label 5 at #168)
+  109. zero_slot_unmasked             $48 = 0
+  110. copy_constant                  $49 = 0x40000000 (2.0)
+  111. swizzle_4                      $48..51 = ($48..51).yxxy
+  112. copy_4_slots_unmasked          mp2₁ = $48..51
+  113. copy_4_slots_unmasked          hp2₁ = $48..51
+  114. zero_slot_unmasked             $48 = 0
+  115. copy_constant                  $49 = 0x40400000 (3.0)
+  116. shuffle                        $48..56 = ($48..56)[1 0 0 0 1 0 0 0 1]
+  117. copy_4_slots_unmasked          mp3₁(0..3) = $48..51
+  118. copy_4_slots_unmasked          mp3₁(4..7) = $52..55
+  119. copy_slot_unmasked             mp3₁(8) = $56
+  120. copy_4_slots_unmasked          hp3₁(0..3) = $48..51
+  121. copy_4_slots_unmasked          hp3₁(4..7) = $52..55
+  122. copy_slot_unmasked             hp3₁(8) = $56
+  123. zero_slot_unmasked             $48 = 0
+  124. copy_constant                  $49 = 0x40800000 (4.0)
+  125. shuffle                        $48..63 = ($48..63)[1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1]
+  126. copy_4_slots_unmasked          mp4₁(0..3) = $48..51
+  127. copy_4_slots_unmasked          mp4₁(4..7) = $52..55
+  128. copy_4_slots_unmasked          mp4₁(8..11) = $56..59
+  129. copy_4_slots_unmasked          mp4₁(12..15) = $60..63
+  130. copy_4_slots_unmasked          hp4₁(0..3) = $48..51
+  131. copy_4_slots_unmasked          hp4₁(4..7) = $52..55
+  132. copy_4_slots_unmasked          hp4₁(8..11) = $56..59
+  133. copy_4_slots_unmasked          hp4₁(12..15) = $60..63
+  134. copy_4_slots_unmasked          $48..51 = mp2₁
+  135. copy_4_slots_unmasked          $52..55 = hp2₁
+  136. cmpeq_4_floats                 $48..51 = equal($48..51, $52..55)
+  137. bitwise_and_2_ints             $48..49 &= $50..51
+  138. bitwise_and_int                $48 &= $49
+  139. copy_4_slots_unmasked          $49..52 = hp3₁(0..3)
+  140. copy_4_slots_unmasked          $53..56 = hp3₁(4..7)
+  141. copy_slot_unmasked             $57 = hp3₁(8)
+  142. copy_4_slots_unmasked          $58..61 = mp3₁(0..3)
+  143. copy_4_slots_unmasked          $62..65 = mp3₁(4..7)
+  144. copy_slot_unmasked             $66 = mp3₁(8)
+  145. cmpeq_n_floats                 $49..57 = equal($49..57, $58..66)
+  146. bitwise_and_4_ints             $50..53 &= $54..57
+  147. bitwise_and_2_ints             $50..51 &= $52..53
+  148. bitwise_and_int                $50 &= $51
+  149. bitwise_and_int                $49 &= $50
+  150. bitwise_and_int                $48 &= $49
+  151. copy_4_slots_unmasked          $49..52 = mp4₁(0..3)
+  152. copy_4_slots_unmasked          $53..56 = mp4₁(4..7)
+  153. copy_4_slots_unmasked          $57..60 = mp4₁(8..11)
+  154. copy_4_slots_unmasked          $61..64 = mp4₁(12..15)
+  155. copy_4_slots_unmasked          $65..68 = hp4₁(0..3)
+  156. copy_4_slots_unmasked          $69..72 = hp4₁(4..7)
+  157. copy_4_slots_unmasked          $73..76 = hp4₁(8..11)
+  158. copy_4_slots_unmasked          $77..80 = hp4₁(12..15)
+  159. cmpeq_n_floats                 $49..64 = equal($49..64, $65..80)
+  160. bitwise_and_4_ints             $57..60 &= $61..64
+  161. bitwise_and_4_ints             $53..56 &= $57..60
+  162. bitwise_and_4_ints             $49..52 &= $53..56
+  163. bitwise_and_2_ints             $49..50 &= $51..52
+  164. bitwise_and_int                $49 &= $50
+  165. bitwise_and_int                $48 &= $49
+  166. label                          label 0x00000009
+  167. copy_slot_masked               $47 = Mask($48)
+  168. label                          label 0x00000005
+  169. load_condition_mask            CondMask = $35
+  170. zero_slot_unmasked             $21 = 0
+  171. merge_condition_mask           CondMask = $46 & $47
+  172. branch_if_no_lanes_active      branch_if_no_lanes_active +35 (label 4 at #207)
+  173. zero_slot_unmasked             mf[0] = 0
+  174. copy_constant                  $22 = 0x3F800000 (1.0)
+  175. copy_slot_masked               mf[0] = Mask($22)
+  176. zero_slot_unmasked             hf[0] = 0
+  177. copy_constant                  $22 = 0x3F800000 (1.0)
+  178. copy_slot_masked               hf[0] = Mask($22)
+  179. zero_4_slots_unmasked          mv[0], mv[1] = 0
+  180. zero_slot_unmasked             $22 = 0
+  181. copy_constant                  $23 = 0x3F800000 (1.0)
+  182. copy_2_slots_masked            mv[0] = Mask($22..23)
+  183. copy_constant                  $22 = 0x40000000 (2.0)
+  184. copy_constant                  $23 = 0x40400000 (3.0)
+  185. copy_2_slots_masked            mv[1] = Mask($22..23)
+  186. zero_4_slots_unmasked          hv[0], hv[1] = 0
+  187. zero_slot_unmasked             $22 = 0
+  188. copy_constant                  $23 = 0x3F800000 (1.0)
+  189. copy_2_slots_masked            hv[0] = Mask($22..23)
+  190. copy_constant                  $22 = 0x40000000 (2.0)
+  191. copy_constant                  $23 = 0x40400000 (3.0)
+  192. copy_2_slots_masked            hv[1] = Mask($22..23)
+  193. copy_2_slots_unmasked          $22..23 = mf[0], hf[0]
+  194. cmpeq_float                    $22 = equal($22, $23)
+  195. copy_2_slots_unmasked          $23..24 = hv[0]
+  196. copy_2_slots_unmasked          $25..26 = mv[0]
+  197. cmpeq_2_floats                 $23..24 = equal($23..24, $25..26)
+  198. bitwise_and_int                $23 &= $24
+  199. bitwise_and_int                $22 &= $23
+  200. copy_2_slots_unmasked          $23..24 = mv[1]
+  201. copy_2_slots_unmasked          $25..26 = hv[1]
+  202. cmpeq_2_floats                 $23..24 = equal($23..24, $25..26)
+  203. bitwise_and_int                $23 &= $24
+  204. bitwise_and_int                $22 &= $23
+  205. label                          label 0x0000000A
+  206. copy_slot_masked               $21 = Mask($22)
+  207. label                          label 0x00000004
+  208. load_condition_mask            CondMask = $46
+  209. zero_slot_unmasked             $32 = 0
+  210. merge_condition_mask           CondMask = $20 & $21
+  211. branch_if_no_lanes_active      branch_if_no_lanes_active +7 (label 3 at #218)
+  212. copy_constant                  value = 0x3F800000 (1.0)
+  213. copy_slot_unmasked             $33 = value
+  214. copy_constant                  $34 = 0x3F800000 (1.0)
+  215. cmpeq_float                    $33 = equal($33, $34)
+  216. label                          label 0x0000000B
+  217. copy_slot_masked               $32 = Mask($33)
+  218. label                          label 0x00000003
+  219. load_condition_mask            CondMask = $20
+  220. zero_slot_unmasked             $28 = 0
+  221. merge_condition_mask           CondMask = $31 & $32
+  222. branch_if_no_lanes_active      branch_if_no_lanes_active +7 (label 2 at #229)
+  223. copy_constant                  value₁ = 0x40000000 (2.0)
+  224. copy_slot_unmasked             $29 = value₁
+  225. copy_constant                  $30 = 0x40000000 (2.0)
+  226. cmpeq_float                    $29 = equal($29, $30)
+  227. label                          label 0x0000000C
+  228. copy_slot_masked               $28 = Mask($29)
+  229. label                          label 0x00000002
+  230. load_condition_mask            CondMask = $31
+  231. zero_slot_unmasked             $0 = 0
+  232. merge_condition_mask           CondMask = $27 & $28
+  233. branch_if_no_lanes_active      branch_if_no_lanes_active +7 (label 1 at #240)
+  234. copy_constant                  value₂ = 0x40400000 (3.0)
+  235. copy_slot_unmasked             $1 = value₂
+  236. copy_constant                  $2 = 0x40400000 (3.0)
+  237. cmpeq_float                    $1 = equal($1, $2)
+  238. label                          label 0x0000000D
+  239. copy_slot_masked               $0 = Mask($1)
+  240. label                          label 0x00000001
+  241. load_condition_mask            CondMask = $27
+  242. swizzle_4                      $0..3 = ($0..3).xxxx
+  243. copy_4_slots_unmasked          $4..7 = red
+  244. copy_4_slots_unmasked          $8..11 = green
+  245. mix_4_ints                     $0..3 = mix($4..7, $8..11, $0..3)
+  246. load_src                       src.rgba = $0..3
diff --git a/tools/skslc/Main.cpp b/tools/skslc/Main.cpp
index 5e81552..edba074 100644
--- a/tools/skslc/Main.cpp
+++ b/tools/skslc/Main.cpp
@@ -669,8 +669,9 @@
                         compiler.errorReporter().error({}, "code has no entrypoint");
                         return false;
                     }
+                    bool wantTraceOps = (debugTrace != nullptr);
                     std::unique_ptr<SkSL::RP::Program> rasterProg = SkSL::MakeRasterPipelineProgram(
-                            program, *main->definition(), &skrpDebugTrace);
+                            program, *main->definition(), &skrpDebugTrace, wantTraceOps);
                     if (!rasterProg) {
                         compiler.errorReporter().error({}, "code is not supported");
                         return false;