Revert "Revert "Complete rewrite of the SkSL interpreter""

This reverts commit 99c54f0290bbd55fba5394a6e4344205d2244063.
diff --git a/bench/SkSLInterpreterBench.cpp b/bench/SkSLInterpreterBench.cpp
index 22afd28..5684955 100644
--- a/bench/SkSLInterpreterBench.cpp
+++ b/bench/SkSLInterpreterBench.cpp
@@ -9,6 +9,7 @@
 #include "include/utils/SkRandom.h"
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLCompiler.h"
+#include "src/sksl/SkSLInterpreter.h"
 
 // Without this build flag, this bench isn't runnable.
 #if defined(SK_ENABLE_SKSL_INTERPRETER)
@@ -22,6 +23,8 @@
         , fCount(pixels) {}
 
 protected:
+    static constexpr int VecWidth = 16;
+
     const char* onGetName() override {
         return fName.c_str();
     }
@@ -35,9 +38,10 @@
         SkSL::Program::Settings settings;
         auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, fSrc, settings);
         SkASSERT(compiler.errorCount() == 0);
-        fByteCode = compiler.toByteCode(*program);
+        std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
+        fMain = byteCode->getFunction("main");
+        fInterpreter.reset(new SkSL::Interpreter<VecWidth>(std::move(byteCode)));
         SkASSERT(compiler.errorCount() == 0);
-        fMain = fByteCode->getFunction("main");
 
         SkRandom rnd;
         fPixels.resize(fCount * 4);
@@ -55,14 +59,14 @@
                 fPixels.data() + 3 * fCount,
             };
 
-            SkAssertResult(fByteCode->runStriped(fMain, fCount, args, 4, nullptr, 0, nullptr, 0));
+            SkAssertResult(fInterpreter->runStriped(fMain, fCount, (float**) args));
         }
     }
 
 private:
     SkString fName;
     SkSL::String fSrc;
-    std::unique_ptr<SkSL::ByteCode> fByteCode;
+    std::unique_ptr<SkSL::Interpreter<VecWidth>> fInterpreter;
     const SkSL::ByteCodeFunction* fMain;
 
     int fCount;
diff --git a/gn/sksl.gni b/gn/sksl.gni
index acc3242..182d312 100644
--- a/gn/sksl.gni
+++ b/gn/sksl.gni
@@ -8,7 +8,6 @@
 
 skia_sksl_sources = [
   "$_src/sksl/SkSLASTNode.cpp",
-  "$_src/sksl/SkSLByteCode.cpp",
   "$_src/sksl/SkSLByteCodeGenerator.cpp",
   "$_src/sksl/SkSLCFGGenerator.cpp",
   "$_src/sksl/SkSLCompiler.cpp",
diff --git a/modules/particles/include/SkParticleEffect.h b/modules/particles/include/SkParticleEffect.h
index b19ce2f..b28eac8 100644
--- a/modules/particles/include/SkParticleEffect.h
+++ b/modules/particles/include/SkParticleEffect.h
@@ -16,6 +16,7 @@
 #include "include/private/SkTemplates.h"
 #include "include/utils/SkRandom.h"
 #include "modules/particles/include/SkParticleData.h"
+#include "src/sksl/SkSLInterpreter.h"
 
 #include <memory>
 
@@ -25,6 +26,8 @@
 class SkParticleDrawable;
 class SkParticleExternalValue;
 
+static constexpr int INTERPRETER_WIDTH = 8;
+
 namespace skresources {
     class ResourceProvider;
 }
@@ -122,13 +125,16 @@
     friend class SkParticleEffect;
 
     // Cached
+    template<int width>
     struct Program {
-        std::unique_ptr<SkSL::ByteCode> fByteCode;
+        std::unique_ptr<SkSL::Interpreter<width>> fInterpreter;
         SkTArray<std::unique_ptr<SkParticleExternalValue>> fExternalValues;
     };
 
-    Program fEffectProgram;
-    Program fParticleProgram;
+    // For performance it would be better to run the effect program with a Program<1> (it only
+    // processes one item at a time), but for code-size reasons we stick to INTERPRETER_WIDTH.
+    Program<INTERPRETER_WIDTH> fEffectProgram;
+    Program<INTERPRETER_WIDTH> fParticleProgram;
 };
 
 class SkParticleEffect : public SkRefCnt {
@@ -183,8 +189,17 @@
     void setFrame   (float     f) { fState.fFrame    = f; }
     void setFlags   (uint32_t  f) { fState.fFlags    = f; }
 
-    const SkSL::ByteCode* effectCode() const { return fParams->fEffectProgram.fByteCode.get(); }
-    const SkSL::ByteCode* particleCode() const { return fParams->fParticleProgram.fByteCode.get(); }
+    const SkSL::ByteCode* effectCode() const {
+        // Without an interpreter there is no byte code to expose, so report nullptr.
+        const auto& interpreter = fParams->fEffectProgram.fInterpreter;
+        return interpreter ? &interpreter->getCode() : nullptr;
+    }
+
+    const SkSL::ByteCode* particleCode() const {
+        // Without an interpreter there is no byte code to expose, so report nullptr.
+        const auto& interpreter = fParams->fParticleProgram.fInterpreter;
+        return interpreter ? &interpreter->getCode() : nullptr;
+    }
 
     float* effectUniforms() { return fEffectUniforms.data(); }
     float* particleUniforms() { return fParticleUniforms.data(); }
diff --git a/modules/particles/src/SkParticleEffect.cpp b/modules/particles/src/SkParticleEffect.cpp
index a1d39d0..ab4e4e1 100644
--- a/modules/particles/src/SkParticleEffect.cpp
+++ b/modules/particles/src/SkParticleEffect.cpp
@@ -119,7 +119,9 @@
         fDrawable->prepare(resourceProvider);
     }
 
-    auto buildProgram = [this](const SkSL::String& code, Program* p) {
+    auto buildProgram = [this](const SkSL::String& code) ->
+                                     std::pair<std::unique_ptr<SkSL::ByteCode>,
+                                               SkTArray<std::unique_ptr<SkParticleExternalValue>>> {
         SkSL::Compiler compiler;
         SkSL::Program::Settings settings;
 
@@ -140,17 +142,15 @@
         auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, code, settings);
         if (!program) {
             SkDebugf("%s\n", compiler.errorText().c_str());
-            return;
+            return std::make_pair(nullptr, std::move(externalValues));
         }
 
         auto byteCode = compiler.toByteCode(*program);
         if (!byteCode) {
             SkDebugf("%s\n", compiler.errorText().c_str());
-            return;
+            return std::make_pair(nullptr, std::move(externalValues));
         }
-
-        p->fByteCode = std::move(byteCode);
-        p->fExternalValues.swap(externalValues);
+        return std::make_pair(std::move(byteCode), std::move(externalValues));
     };
 
     SkSL::String effectCode(kCommonHeader);
@@ -160,8 +160,15 @@
     particleCode.append(kParticleHeader);
     particleCode.append(fParticleCode.c_str());
 
-    buildProgram(effectCode, &fEffectProgram);
-    buildProgram(particleCode, &fParticleProgram);
+    auto effectProgram = buildProgram(effectCode);
+    fEffectProgram.fInterpreter.reset(new SkSL::Interpreter<INTERPRETER_WIDTH>(
+                                                                   std::move(effectProgram.first)));
+    fEffectProgram.fExternalValues.swap(effectProgram.second);
+
+    auto particleProgram = buildProgram(particleCode);
+    fParticleProgram.fInterpreter.reset(new SkSL::Interpreter<INTERPRETER_WIDTH>(
+                                                                 std::move(particleProgram.first)));
+    fParticleProgram.fExternalValues.swap(particleProgram.second);
 }
 
 SkParticleEffect::SkParticleEffect(sk_sp<SkParticleEffectParams> params, const SkRandom& random)
@@ -222,15 +229,22 @@
 }
 
 void SkParticleEffect::runEffectScript(double now, const char* entry) {
-    if (const auto& byteCode = fParams->fEffectProgram.fByteCode) {
-        if (auto fun = byteCode->getFunction(entry)) {
+    SkSL::Interpreter<INTERPRETER_WIDTH>* interpreter = fParams->fEffectProgram.fInterpreter.get();
+    if (interpreter) {
+        const auto& byteCode = interpreter->getCode();
+        if (auto fun = byteCode.getFunction(entry)) {
             for (const auto& value : fParams->fEffectProgram.fExternalValues) {
                 value->setRandom(&fRandom);
                 value->setEffect(this);
             }
-            SkAssertResult(byteCode->run(fun, &fState.fAge, sizeof(EffectState) / sizeof(float),
-                                         nullptr, 0,
-                                         fEffectUniforms.data(), fEffectUniforms.count()));
+            interpreter->setUniforms(fEffectUniforms.data());
+            static constexpr int numChannels = sizeof(EffectState) / sizeof(float);
+            SkASSERT(numChannels == fun->getParameterSlotCount());
+            float* args[numChannels];
+            for (int i = 0; i < numChannels; ++i) {
+                args[i] = &fState.fAge + i;
+            }
+            SkAssertResult(interpreter->runStriped(fun, 1, args));
             this->processEffectSpawnRequests(now);
         }
     }
@@ -263,8 +277,11 @@
 }
 
 void SkParticleEffect::runParticleScript(double now, const char* entry, int start, int count) {
-    if (const auto& byteCode = fParams->fParticleProgram.fByteCode) {
-        if (auto fun = byteCode->getFunction(entry)) {
+    SkSL::Interpreter<INTERPRETER_WIDTH>* interpreter =
+                                                       fParams->fParticleProgram.fInterpreter.get();
+    if (interpreter) {
+        const auto& byteCode = interpreter->getCode();
+        if (auto fun = byteCode.getFunction(entry)) {
             float* args[SkParticles::kNumChannels];
             for (int i = 0; i < SkParticles::kNumChannels; ++i) {
                 args[i] = fParticles.fData[i].get() + start;
@@ -275,10 +292,8 @@
                 value->setEffect(this);
             }
             memcpy(&fParticleUniforms[1], &fState.fAge, sizeof(EffectState));
-            SkAssertResult(byteCode->runStriped(fun, count, args, SkParticles::kNumChannels,
-                                                nullptr, 0,
-                                                fParticleUniforms.data(),
-                                                fParticleUniforms.count()));
+            interpreter->setUniforms(fParticleUniforms.data());
+            SkAssertResult(interpreter->runStriped(fun, count, (float**) args));
             this->processParticleSpawnRequests(now, start);
         }
     }
diff --git a/src/core/SkColorFilter.cpp b/src/core/SkColorFilter.cpp
index 26836d4..dfd34d2 100644
--- a/src/core/SkColorFilter.cpp
+++ b/src/core/SkColorFilter.cpp
@@ -19,6 +19,7 @@
 #include "src/core/SkReadBuffer.h"
 #include "src/core/SkVM.h"
 #include "src/core/SkWriteBuffer.h"
+#include "src/sksl/SkSLInterpreter.h"
 
 #if SK_SUPPORT_GPU
 #include "src/gpu/GrFragmentProcessor.h"
@@ -420,17 +421,20 @@
         ctx->ninputs = fEffect->uniformSize() / 4;
         ctx->shaderConvention = false;
 
-        SkAutoMutexExclusive ama(fByteCodeMutex);
-        if (!fByteCode) {
+        SkAutoMutexExclusive ama(fInterpreterMutex);
+        if (!fInterpreter) {
             auto [byteCode, errorText] = fEffect->toByteCode(fInputs->data());
             if (!byteCode) {
                 SkDebugf("%s\n", errorText.c_str());
                 return false;
             }
-            fByteCode = std::move(byteCode);
+            fMain = byteCode->getFunction("main");
+            fInterpreter.reset(
+                           new SkSL::Interpreter<SkRasterPipeline_InterpreterCtx::VECTOR_WIDTH>(
+                                                                          std::move(byteCode)));
         }
-        ctx->byteCode = fByteCode.get();
-        ctx->fn = ctx->byteCode->getFunction("main");
+        ctx->fn = fMain;
+        ctx->interpreter = fInterpreter.get();
         rec.fPipeline->append(SkRasterPipeline::interpreter, ctx);
         return true;
     }
@@ -453,8 +457,10 @@
     sk_sp<SkRuntimeEffect> fEffect;
     sk_sp<SkData> fInputs;
 
-    mutable SkMutex fByteCodeMutex;
-    mutable std::unique_ptr<SkSL::ByteCode> fByteCode;
+    mutable SkMutex fInterpreterMutex;
+    mutable std::unique_ptr<SkSL::Interpreter<SkRasterPipeline_InterpreterCtx::VECTOR_WIDTH>>
+                                                                                       fInterpreter;
+    mutable const SkSL::ByteCodeFunction* fMain;
 
     friend class SkColorFilter;
 
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 98b009a..1a6e582 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -161,12 +161,15 @@
 };
 
 namespace SkSL {
-class ByteCode;
 class ByteCodeFunction;
+
+template<int width>
+class Interpreter;
 }
 
 struct SkRasterPipeline_InterpreterCtx {
-    const SkSL::ByteCode*         byteCode;
+    static constexpr int VECTOR_WIDTH = 8;
+    SkSL::Interpreter<VECTOR_WIDTH>* interpreter;
     const SkSL::ByteCodeFunction* fn;
 
     SkColor4f   paintColor;
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 00e2b67..0f75d54 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -10,7 +10,7 @@
 
 #include "include/core/SkTypes.h"
 #include "src/core/SkUtils.h"  // unaligned_{load,store}
-#include "src/sksl/SkSLByteCode.h"
+#include "src/sksl/SkSLInterpreter.h"
 
 // Every function in this file should be marked static and inline using SI.
 #if defined(__clang__)
@@ -2711,7 +2711,6 @@
 
     float*  args[]  = { xx, yy, rr, gg, bb, aa };
     float** in_args = args;
-    int     in_count = 6;
 
     if (c->shaderConvention) {
         // our caller must have called seed_shader to set these
@@ -2723,15 +2722,14 @@
         sk_unaligned_store(aa, F(c->paintColor.fA));
     } else {
         in_args += 2;   // skip x,y
-        in_count = 4;
         sk_unaligned_store(rr, r);
         sk_unaligned_store(gg, g);
         sk_unaligned_store(bb, b);
         sk_unaligned_store(aa, a);
     }
 
-    SkAssertResult(c->byteCode->runStriped(c->fn, tail ? tail : N, in_args, in_count,
-                                           nullptr, 0, (const float*)c->inputs, c->ninputs));
+    c->interpreter->setUniforms((float*) c->inputs);
+    SkAssertResult(c->interpreter->runStriped(c->fn, tail ? tail : N, (float**) in_args));
 
     r = sk_unaligned_load<F>(rr);
     g = sk_unaligned_load<F>(gg);
diff --git a/src/shaders/SkRTShader.cpp b/src/shaders/SkRTShader.cpp
index ce823eb..90aeb1f 100644
--- a/src/shaders/SkRTShader.cpp
+++ b/src/shaders/SkRTShader.cpp
@@ -14,6 +14,8 @@
 #include "src/shaders/SkRTShader.h"
 
 #include "src/sksl/SkSLByteCode.h"
+#include "src/sksl/SkSLCompiler.h"
+#include "src/sksl/SkSLInterpreter.h"
 
 #if SK_SUPPORT_GPU
 #include "src/gpu/GrColorInfo.h"
@@ -45,17 +47,19 @@
     ctx->ninputs = fEffect->uniformSize() / 4;
     ctx->shaderConvention = true;
 
-    SkAutoMutexExclusive ama(fByteCodeMutex);
-    if (!fByteCode) {
+    SkAutoMutexExclusive ama(fInterpreterMutex);
+    if (!fInterpreter) {
         auto [byteCode, errorText] = fEffect->toByteCode(fInputs->data());
         if (!byteCode) {
             SkDebugf("%s\n", errorText.c_str());
             return false;
         }
-        fByteCode = std::move(byteCode);
+        fMain = byteCode->getFunction("main");
+        fInterpreter.reset(new SkSL::Interpreter<SkRasterPipeline_InterpreterCtx::VECTOR_WIDTH>(
+                                                                      std::move(byteCode)));
     }
-    ctx->byteCode = fByteCode.get();
-    ctx->fn = ctx->byteCode->getFunction("main");
+    ctx->fn = fMain;
+    ctx->interpreter = fInterpreter.get();
 
     rec.fPipeline->append(SkRasterPipeline::seed_shader);
     rec.fPipeline->append_matrix(rec.fAlloc, inverse);
diff --git a/src/shaders/SkRTShader.h b/src/shaders/SkRTShader.h
index 5d44840..1ac56ec 100644
--- a/src/shaders/SkRTShader.h
+++ b/src/shaders/SkRTShader.h
@@ -18,7 +18,12 @@
 class SkMatrix;
 class SkRuntimeEffect;
 
-namespace SkSL { class ByteCode; }
+namespace SkSL {
+    class ByteCodeFunction;
+
+    template<int width>
+    class Interpreter;
+}
 
 class SkRTShader : public SkShaderBase {
 public:
@@ -37,6 +42,8 @@
     bool onAppendStages(const SkStageRec& rec) const override;
 
 private:
+    static constexpr int VECTOR_WIDTH = 8;
+
     SK_FLATTENABLE_HOOKS(SkRTShader)
 
     sk_sp<SkRuntimeEffect> fEffect;
@@ -45,8 +52,9 @@
     sk_sp<SkData> fInputs;
     std::vector<sk_sp<SkShader>> fChildren;
 
-    mutable SkMutex fByteCodeMutex;
-    mutable std::unique_ptr<SkSL::ByteCode> fByteCode;
+    mutable SkMutex fInterpreterMutex;
+    mutable std::unique_ptr<SkSL::Interpreter<VECTOR_WIDTH>> fInterpreter;
+    mutable const SkSL::ByteCodeFunction* fMain;
 
     typedef SkShaderBase INHERITED;
 };
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
deleted file mode 100644
index a9c3480..0000000
--- a/src/sksl/SkSLByteCode.cpp
+++ /dev/null
@@ -1,1760 +0,0 @@
-/*
- * Copyright 2018 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SKSL_STANDALONE
-
-#include "include/core/SkPoint3.h"
-#include "include/private/SkVx.h"
-#include "src/core/SkUtils.h"   // sk_unaligned_load
-#include "src/sksl/SkSLByteCode.h"
-#include "src/sksl/SkSLByteCodeGenerator.h"
-#include "src/sksl/SkSLExternalValue.h"
-
-#include <vector>
-
-namespace SkSL {
-
-#if defined(SK_ENABLE_SKSL_INTERPRETER)
-
-constexpr int VecWidth = ByteCode::kVecWidth;
-
-struct Interpreter {
-
-using F32 = skvx::Vec<VecWidth, float>;
-using I32 = skvx::Vec<VecWidth, int32_t>;
-using U32 = skvx::Vec<VecWidth, uint32_t>;
-
-#define READ8() (*(ip++))
-#define READ16() (ip += 2, sk_unaligned_load<uint16_t>(ip - 2))
-#define READ32() (ip += 4, sk_unaligned_load<uint32_t>(ip - 4))
-#define READ_INST() (ip += sizeof(instruction), \
-                     sk_unaligned_load<instruction>(ip - sizeof(instruction)))
-
-#define VECTOR_DISASSEMBLE(op, text)                                \
-    case ByteCodeInstruction::op: printf(text); ++ip; break;        \
-    case ByteCodeInstruction::op##2: printf(text "2"); ++ip; break; \
-    case ByteCodeInstruction::op##3: printf(text "3"); ++ip; break; \
-    case ByteCodeInstruction::op##4: printf(text "4"); ++ip; break;
-
-#define VECTOR_DISASSEMBLE_NO_COUNT(op, text)                 \
-    case ByteCodeInstruction::op: printf(text); break;        \
-    case ByteCodeInstruction::op##2: printf(text "2"); break; \
-    case ByteCodeInstruction::op##3: printf(text "3"); break; \
-    case ByteCodeInstruction::op##4: printf(text "4"); break;
-
-#define VECTOR_MATRIX_DISASSEMBLE(op, text) \
-    VECTOR_DISASSEMBLE(op, text)            \
-    case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
-
-#define VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(op, text) \
-    VECTOR_DISASSEMBLE_NO_COUNT(op, text)            \
-    case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
-
-static const uint8_t* DisassembleInstruction(const uint8_t* ip) {
-    switch ((ByteCodeInstruction) (intptr_t) READ_INST()) {
-        VECTOR_MATRIX_DISASSEMBLE(kAddF, "addf")
-        VECTOR_DISASSEMBLE(kAddI, "addi")
-        case ByteCodeInstruction::kAndB: printf("andb"); break;
-        case ByteCodeInstruction::kBranch: printf("branch %d", READ16()); break;
-        case ByteCodeInstruction::kCall: printf("call %d", READ8()); break;
-        case ByteCodeInstruction::kCallExternal: {
-            int argumentCount = READ8();
-            int returnCount = READ8();
-            int externalValue = READ8();
-            printf("callexternal %d, %d, %d", argumentCount, returnCount, externalValue);
-            break;
-        }
-        case ByteCodeInstruction::kClampIndex: printf("clampindex %d", READ8()); break;
-        VECTOR_DISASSEMBLE(kCompareIEQ, "compareieq")
-        VECTOR_DISASSEMBLE(kCompareINEQ, "compareineq")
-        VECTOR_MATRIX_DISASSEMBLE(kCompareFEQ, "comparefeq")
-        VECTOR_MATRIX_DISASSEMBLE(kCompareFNEQ, "comparefneq")
-        VECTOR_DISASSEMBLE(kCompareFGT, "comparefgt")
-        VECTOR_DISASSEMBLE(kCompareFGTEQ, "comparefgteq")
-        VECTOR_DISASSEMBLE(kCompareFLT, "compareflt")
-        VECTOR_DISASSEMBLE(kCompareFLTEQ, "compareflteq")
-        VECTOR_DISASSEMBLE(kCompareSGT, "comparesgt")
-        VECTOR_DISASSEMBLE(kCompareSGTEQ, "comparesgteq")
-        VECTOR_DISASSEMBLE(kCompareSLT, "compareslt")
-        VECTOR_DISASSEMBLE(kCompareSLTEQ, "compareslteq")
-        VECTOR_DISASSEMBLE(kCompareUGT, "compareugt")
-        VECTOR_DISASSEMBLE(kCompareUGTEQ, "compareugteq")
-        VECTOR_DISASSEMBLE(kCompareULT, "compareult")
-        VECTOR_DISASSEMBLE(kCompareULTEQ, "compareulteq")
-        VECTOR_DISASSEMBLE_NO_COUNT(kConvertFtoI, "convertftoi")
-        VECTOR_DISASSEMBLE_NO_COUNT(kConvertStoF, "convertstof")
-        VECTOR_DISASSEMBLE_NO_COUNT(kConvertUtoF, "convertutof")
-        VECTOR_DISASSEMBLE(kCos, "cos")
-        VECTOR_MATRIX_DISASSEMBLE(kDivideF, "dividef")
-        VECTOR_DISASSEMBLE(kDivideS, "divideS")
-        VECTOR_DISASSEMBLE(kDivideU, "divideu")
-        VECTOR_MATRIX_DISASSEMBLE(kDup, "dup")
-        case ByteCodeInstruction::kInverse2x2: printf("inverse2x2"); break;
-        case ByteCodeInstruction::kInverse3x3: printf("inverse3x3"); break;
-        case ByteCodeInstruction::kInverse4x4: printf("inverse4x4"); break;
-        case ByteCodeInstruction::kLoad: printf("load %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoad2: printf("load2 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoad3: printf("load3 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoad4: printf("load4 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadGlobal: printf("loadglobal %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadGlobal2: printf("loadglobal2 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadGlobal3: printf("loadglobal3 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadGlobal4: printf("loadglobal4 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadUniform: printf("loaduniform %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadUniform2: printf("loaduniform2 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadUniform3: printf("loaduniform3 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadUniform4: printf("loaduniform4 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kLoadSwizzle: {
-            int target = READ8();
-            int count = READ8();
-            printf("loadswizzle %d %d", target, count);
-            for (int i = 0; i < count; ++i) {
-                printf(", %d", READ8());
-            }
-            break;
-        }
-        case ByteCodeInstruction::kLoadSwizzleGlobal: {
-            int target = READ8();
-            int count = READ8();
-            printf("loadswizzleglobal %d %d", target, count);
-            for (int i = 0; i < count; ++i) {
-                printf(", %d", READ8());
-            }
-            break;
-        }
-        case ByteCodeInstruction::kLoadSwizzleUniform: {
-            int target = READ8();
-            int count = READ8();
-            printf("loadswizzleuniform %d %d", target, count);
-            for (int i = 0; i < count; ++i) {
-                printf(", %d", READ8());
-            }
-            break;
-        }
-        case ByteCodeInstruction::kLoadExtended: printf("loadextended %d", READ8()); break;
-        case ByteCodeInstruction::kLoadExtendedGlobal: printf("loadextendedglobal %d", READ8());
-            break;
-        case ByteCodeInstruction::kLoadExtendedUniform: printf("loadextendeduniform %d", READ8());
-            break;
-        case ByteCodeInstruction::kMatrixToMatrix: {
-            int srcCols = READ8();
-            int srcRows = READ8();
-            int dstCols = READ8();
-            int dstRows = READ8();
-            printf("matrixtomatrix %dx%d %dx%d", srcCols, srcRows, dstCols, dstRows);
-            break;
-        }
-        case ByteCodeInstruction::kMatrixMultiply: {
-            int lCols = READ8();
-            int lRows = READ8();
-            int rCols = READ8();
-            printf("matrixmultiply %dx%d %dx%d", lCols, lRows, rCols, lCols);
-            break;
-        }
-        VECTOR_MATRIX_DISASSEMBLE(kMultiplyF, "multiplyf")
-        VECTOR_DISASSEMBLE(kMultiplyI, "multiplyi")
-        VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(kNegateF, "negatef")
-        VECTOR_DISASSEMBLE_NO_COUNT(kNegateI, "negatei")
-        case ByteCodeInstruction::kNotB: printf("notb"); break;
-        case ByteCodeInstruction::kOrB: printf("orb"); break;
-        VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(kPop, "pop")
-        case ByteCodeInstruction::kPushImmediate: {
-            uint32_t v = READ32();
-            union { uint32_t u; float f; } pun = { v };
-            printf("pushimmediate %s", (to_string(v) + "(" + to_string(pun.f) + ")").c_str());
-            break;
-        }
-        case ByteCodeInstruction::kReadExternal: printf("readexternal %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kReadExternal2: printf("readexternal2 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kReadExternal3: printf("readexternal3 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kReadExternal4: printf("readexternal4 %d", READ16() >> 8); break;
-        VECTOR_DISASSEMBLE(kRemainderF, "remainderf")
-        VECTOR_DISASSEMBLE(kRemainderS, "remainders")
-        VECTOR_DISASSEMBLE(kRemainderU, "remainderu")
-        case ByteCodeInstruction::kReserve: printf("reserve %d", READ8()); break;
-        case ByteCodeInstruction::kReturn: printf("return %d", READ8()); break;
-        case ByteCodeInstruction::kScalarToMatrix: {
-            int cols = READ8();
-            int rows = READ8();
-            printf("scalartomatrix %dx%d", cols, rows);
-            break;
-        }
-        case ByteCodeInstruction::kShiftLeft: printf("shl %d", READ8()); break;
-        case ByteCodeInstruction::kShiftRightS: printf("shrs %d", READ8()); break;
-        case ByteCodeInstruction::kShiftRightU: printf("shru %d", READ8()); break;
-        VECTOR_DISASSEMBLE(kSin, "sin")
-        VECTOR_DISASSEMBLE_NO_COUNT(kSqrt, "sqrt")
-        case ByteCodeInstruction::kStore: printf("store %d", READ8()); break;
-        case ByteCodeInstruction::kStore2: printf("store2 %d", READ8()); break;
-        case ByteCodeInstruction::kStore3: printf("store3 %d", READ8()); break;
-        case ByteCodeInstruction::kStore4: printf("store4 %d", READ8()); break;
-        case ByteCodeInstruction::kStoreGlobal: printf("storeglobal %d", READ8()); break;
-        case ByteCodeInstruction::kStoreGlobal2: printf("storeglobal2 %d", READ8()); break;
-        case ByteCodeInstruction::kStoreGlobal3: printf("storeglobal3 %d", READ8()); break;
-        case ByteCodeInstruction::kStoreGlobal4: printf("storeglobal4 %d", READ8()); break;
-        case ByteCodeInstruction::kStoreSwizzle: {
-            int target = READ8();
-            int count = READ8();
-            printf("storeswizzle %d %d", target, count);
-            for (int i = 0; i < count; ++i) {
-                printf(", %d", READ8());
-            }
-            break;
-        }
-        case ByteCodeInstruction::kStoreSwizzleGlobal: {
-            int target = READ8();
-            int count = READ8();
-            printf("storeswizzleglobal %d %d", target, count);
-            for (int i = 0; i < count; ++i) {
-                printf(", %d", READ8());
-            }
-            break;
-        }
-        case ByteCodeInstruction::kStoreSwizzleIndirect: {
-            int count = READ8();
-            printf("storeswizzleindirect %d", count);
-            for (int i = 0; i < count; ++i) {
-                printf(", %d", READ8());
-            }
-            break;
-        }
-        case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
-            int count = READ8();
-            printf("storeswizzleindirectglobal %d", count);
-            for (int i = 0; i < count; ++i) {
-                printf(", %d", READ8());
-            }
-            break;
-        }
-        case ByteCodeInstruction::kStoreExtended: printf("storeextended %d", READ8()); break;
-        case ByteCodeInstruction::kStoreExtendedGlobal: printf("storeextendedglobal %d", READ8());
-            break;
-        VECTOR_MATRIX_DISASSEMBLE(kSubtractF, "subtractf")
-        VECTOR_DISASSEMBLE(kSubtractI, "subtracti")
-        case ByteCodeInstruction::kSwizzle: {
-            printf("swizzle %d, ", READ8());
-            int count = READ8();
-            printf("%d", count);
-            for (int i = 0; i < count; ++i) {
-                printf(", %d", READ8());
-            }
-            break;
-        }
-        VECTOR_DISASSEMBLE(kTan, "tan")
-        case ByteCodeInstruction::kWriteExternal: printf("writeexternal %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kWriteExternal2: printf("writeexternal2 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kWriteExternal3: printf("writeexternal3 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kWriteExternal4: printf("writeexternal4 %d", READ16() >> 8); break;
-        case ByteCodeInstruction::kXorB: printf("xorb"); break;
-        case ByteCodeInstruction::kMaskPush: printf("maskpush"); break;
-        case ByteCodeInstruction::kMaskPop: printf("maskpop"); break;
-        case ByteCodeInstruction::kMaskNegate: printf("masknegate"); break;
-        case ByteCodeInstruction::kMaskBlend: printf("maskblend %d", READ8()); break;
-        case ByteCodeInstruction::kBranchIfAllFalse:
-            printf("branchifallfalse %d", READ16());
-            break;
-        case ByteCodeInstruction::kLoopBegin: printf("loopbegin"); break;
-        case ByteCodeInstruction::kLoopNext: printf("loopnext"); break;
-        case ByteCodeInstruction::kLoopMask: printf("loopmask"); break;
-        case ByteCodeInstruction::kLoopEnd: printf("loopend"); break;
-        case ByteCodeInstruction::kLoopContinue: printf("loopcontinue"); break;
-        case ByteCodeInstruction::kLoopBreak: printf("loopbreak"); break;
-        default:
-            ip -= sizeof(instruction);
-            printf("unknown(%d)\n", (int) (intptr_t) READ_INST());
-            SkASSERT(false);
-    }
-    return ip;
-}
-
-#ifdef SKSLC_THREADED_CODE
-    #define LABEL(name) name:
-    #ifdef TRACE
-        #define NEXT() goto next
-    #else
-        #define NEXT() goto *READ_INST()
-    #endif
-#else
-    #define LABEL(name) case ByteCodeInstruction::name:
-    #define NEXT() continue
-#endif
-
-#define VECTOR_BINARY_OP(base, field, op)             \
-    LABEL(base ## 4)                                  \
-        sp[-4] = sp[-4].field op sp[0].field;         \
-        POP();                                        \
-        /* fall through */                            \
-    LABEL(base ## 3) {                                \
-        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; \
-        POP();                                        \
-    }   /* fall through */                            \
-    LABEL(base ## 2) {                                \
-        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; \
-        POP();                                        \
-    }   /* fall through */                            \
-    LABEL(base) {                                     \
-        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; \
-        POP();                                        \
-        ++ip;                                         \
-        NEXT();                                       \
-    }
-
-// A naive implementation of / or % using skvx operations will likely crash with a divide by zero
-// in inactive vector lanesm, so we need to be sure to avoid masked-off lanes.
-#define VECTOR_BINARY_MASKED_OP(base, field, op)            \
-    LABEL(base ## 4)                                        \
-        for (int i = 0; i < VecWidth; ++i) {                \
-            if (mask()[i]) {                                \
-                sp[-4].field[i] op ## = sp[0].field[i];     \
-            }                                               \
-        }                                                   \
-        POP();                                              \
-        /* fall through */                                  \
-    LABEL(base ## 3) {                                      \
-        for (int i = 0; i < VecWidth; ++i) {                \
-            if (mask()[i]) {                                \
-                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
-            }                                               \
-        }                                                   \
-        POP();                                              \
-    }   /* fall through */                                  \
-    LABEL(base ## 2) {                                      \
-        for (int i = 0; i < VecWidth; ++i) {                \
-            if (mask()[i]) {                                \
-                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
-            }                                               \
-        }                                                   \
-        POP();                                              \
-    }   /* fall through */                                  \
-    LABEL(base) {                                           \
-        for (int i = 0; i < VecWidth; ++i) {                \
-            if (mask()[i]) {                                \
-                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
-            }                                               \
-        }                                                   \
-        POP();                                              \
-        ++ip;                                               \
-        NEXT();                                             \
-    }
-
-
-#define VECTOR_MATRIX_BINARY_OP(base, field, op)          \
-    VECTOR_BINARY_OP(base, field, op)                     \
-    LABEL(base ## N) {                                    \
-        int count = READ8();                              \
-        for (int i = count; i > 0; --i) {                 \
-            sp[-count] = sp[-count].field op sp[0].field; \
-            POP();                                        \
-        }                                                 \
-        NEXT();                                           \
-    }
-
-#define VECTOR_BINARY_FN(base, field, fn)               \
-    LABEL(base ## 4)                                    \
-        sp[-4] = fn(sp[-4].field, sp[0].field);         \
-        POP();                                          \
-        /* fall through */                              \
-    LABEL(base ## 3) {                                  \
-        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
-        POP();                                          \
-    }   /* fall through */                              \
-    LABEL(base ## 2) {                                  \
-        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
-        POP();                                          \
-    }   /* fall through */                              \
-    LABEL(base) {                                       \
-        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
-        POP();                                          \
-        ++ip;                                           \
-        NEXT();                                         \
-    }
-
-#define VECTOR_UNARY_FN(base, fn, field)         \
-    LABEL(base ## 4)  sp[-3] = fn(sp[-3].field); \
-    LABEL(base ## 3)  sp[-2] = fn(sp[-2].field); \
-    LABEL(base ## 2)  sp[-1] = fn(sp[-1].field); \
-    LABEL(base)       sp[ 0] = fn(sp[ 0].field); \
-                      NEXT();
-
-#define VECTOR_UNARY_FN_VEC(base, fn)                     \
-    LABEL(base ## 4)                                      \
-    LABEL(base ## 3)                                      \
-    LABEL(base ## 2)                                      \
-    LABEL(base) {                                         \
-        int count = READ8();                              \
-        float* v = (float*)sp - count + 1;                \
-        for (int i = VecWidth * count; i > 0; --i, ++v) { \
-            *v = fn(*v);                                  \
-        }                                                 \
-        NEXT();                                           \
-    }
-
-#define VECTOR_LABELS(base) \
-    &&base ## 4,            \
-    &&base ## 3,            \
-    &&base ## 2,            \
-    &&base
-
-#define VECTOR_MATRIX_LABELS(base) \
-    VECTOR_LABELS(base),           \
-    &&base ## N
-
-// If you trip this assert, it means that the order of the opcodes listed in ByteCodeInstruction
-// does not match the order of the opcodes listed in the 'labels' array in innerRun().
-#define CHECK_LABEL(name) \
-    SkASSERT(labels[(int) ByteCodeInstruction::name] == &&name)
-
-#define CHECK_VECTOR_LABELS(name) \
-    CHECK_LABEL(name ## 4);       \
-    CHECK_LABEL(name ## 3);       \
-    CHECK_LABEL(name ## 2);       \
-    CHECK_LABEL(name)
-
-#define CHECK_VECTOR_MATRIX_LABELS(name) \
-    CHECK_VECTOR_LABELS(name);           \
-    CHECK_LABEL(name ## N)
-
-union VValue {
-    VValue() {}
-    VValue(F32 f) : fFloat(f) {}
-    VValue(I32 s) : fSigned(s) {}
-    VValue(U32 u) : fUnsigned(u) {}
-
-    F32 fFloat;
-    I32 fSigned;
-    U32 fUnsigned;
-};
-
-struct StackFrame {
-    const uint8_t* fCode;
-    const uint8_t* fIP;
-    VValue* fStack;
-    int fParameterCount;
-};
-
-static F32 VecMod(F32 a, F32 b) {
-    return a - skvx::trunc(a / b) * b;
-}
-
-#define spf(index)  sp[index].fFloat
-
-static void CallExternal(const ByteCode* byteCode, const uint8_t*& ip, VValue*& sp,
-                          int baseIndex, I32 mask) {
-    int argumentCount = READ8();
-    int returnCount = READ8();
-    int target = READ8();
-    ExternalValue* v = byteCode->fExternalValues[target];
-    sp -= argumentCount - 1;
-
-    float tmpArgs[4];
-    float tmpReturn[4];
-    SkASSERT(argumentCount <= (int)SK_ARRAY_COUNT(tmpArgs));
-    SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmpReturn));
-
-    for (int i = 0; i < VecWidth; ++i) {
-        if (mask[i]) {
-            for (int j = 0; j < argumentCount; ++j) {
-                tmpArgs[j] = sp[j].fFloat[i];
-            }
-            v->call(baseIndex + i, tmpArgs, tmpReturn);
-            for (int j = 0; j < returnCount; ++j) {
-                sp[j].fFloat[i] = tmpReturn[j];
-            }
-        }
-    }
-    sp += returnCount - 1;
-}
-
-static void Inverse2x2(VValue* sp) {
-    F32 a = sp[-3].fFloat,
-        b = sp[-2].fFloat,
-        c = sp[-1].fFloat,
-        d = sp[ 0].fFloat;
-    F32 idet = F32(1) / (a*d - b*c);
-    sp[-3].fFloat = d * idet;
-    sp[-2].fFloat = -b * idet;
-    sp[-1].fFloat = -c * idet;
-    sp[ 0].fFloat = a * idet;
-}
-
-static void Inverse3x3(VValue* sp) {
-    F32 a11 = sp[-8].fFloat, a12 = sp[-5].fFloat, a13 = sp[-2].fFloat,
-        a21 = sp[-7].fFloat, a22 = sp[-4].fFloat, a23 = sp[-1].fFloat,
-        a31 = sp[-6].fFloat, a32 = sp[-3].fFloat, a33 = sp[ 0].fFloat;
-    F32 idet = F32(1) / (a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
-                         a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31);
-    sp[-8].fFloat = (a22 * a33 - a23 * a32) * idet;
-    sp[-7].fFloat = (a23 * a31 - a21 * a33) * idet;
-    sp[-6].fFloat = (a21 * a32 - a22 * a31) * idet;
-    sp[-5].fFloat = (a13 * a32 - a12 * a33) * idet;
-    sp[-4].fFloat = (a11 * a33 - a13 * a31) * idet;
-    sp[-3].fFloat = (a12 * a31 - a11 * a32) * idet;
-    sp[-2].fFloat = (a12 * a23 - a13 * a22) * idet;
-    sp[-1].fFloat = (a13 * a21 - a11 * a23) * idet;
-    sp[ 0].fFloat = (a11 * a22 - a12 * a21) * idet;
-}
-
-static void Inverse4x4(VValue* sp) {
-    F32 a00 = spf(-15), a10 = spf(-11), a20 = spf( -7), a30 = spf( -3),
-        a01 = spf(-14), a11 = spf(-10), a21 = spf( -6), a31 = spf( -2),
-        a02 = spf(-13), a12 = spf( -9), a22 = spf( -5), a32 = spf( -1),
-        a03 = spf(-12), a13 = spf( -8), a23 = spf( -4), a33 = spf(  0);
-
-    F32 b00 = a00 * a11 - a01 * a10,
-        b01 = a00 * a12 - a02 * a10,
-        b02 = a00 * a13 - a03 * a10,
-        b03 = a01 * a12 - a02 * a11,
-        b04 = a01 * a13 - a03 * a11,
-        b05 = a02 * a13 - a03 * a12,
-        b06 = a20 * a31 - a21 * a30,
-        b07 = a20 * a32 - a22 * a30,
-        b08 = a20 * a33 - a23 * a30,
-        b09 = a21 * a32 - a22 * a31,
-        b10 = a21 * a33 - a23 * a31,
-        b11 = a22 * a33 - a23 * a32;
-
-    F32 idet = F32(1) /
-               (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
-
-    b00 *= idet;
-    b01 *= idet;
-    b02 *= idet;
-    b03 *= idet;
-    b04 *= idet;
-    b05 *= idet;
-    b06 *= idet;
-    b07 *= idet;
-    b08 *= idet;
-    b09 *= idet;
-    b10 *= idet;
-    b11 *= idet;
-
-    spf(-15) = a11 * b11 - a12 * b10 + a13 * b09;
-    spf(-14) = a02 * b10 - a01 * b11 - a03 * b09;
-    spf(-13) = a31 * b05 - a32 * b04 + a33 * b03;
-    spf(-12) = a22 * b04 - a21 * b05 - a23 * b03;
-    spf(-11) = a12 * b08 - a10 * b11 - a13 * b07;
-    spf(-10) = a00 * b11 - a02 * b08 + a03 * b07;
-    spf( -9) = a32 * b02 - a30 * b05 - a33 * b01;
-    spf( -8) = a20 * b05 - a22 * b02 + a23 * b01;
-    spf( -7) = a10 * b10 - a11 * b08 + a13 * b06;
-    spf( -6) = a01 * b08 - a00 * b10 - a03 * b06;
-    spf( -5) = a30 * b04 - a31 * b02 + a33 * b00;
-    spf( -4) = a21 * b02 - a20 * b04 - a23 * b00;
-    spf( -3) = a11 * b07 - a10 * b09 - a12 * b06;
-    spf( -2) = a00 * b09 - a01 * b07 + a02 * b06;
-    spf( -1) = a31 * b01 - a30 * b03 - a32 * b00;
-    spf(  0) = a20 * b03 - a21 * b01 + a22 * b00;
-}
-
-static bool InnerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack,
-                     float* outReturn[], VValue globals[], const float uniforms[],
-                     bool stripedOutput, int N, int baseIndex) {
-#ifdef SKSLC_THREADED_CODE
-    static const void* labels[] = {
-        // If you aren't familiar with it, the &&label syntax is the GCC / Clang "labels as values"
-        // extension. If you add anything to this array, be sure to add the corresponding
-        // CHECK_LABEL() or CHECK_*_LABELS() assert below.
-        VECTOR_MATRIX_LABELS(kAddF),
-        VECTOR_LABELS(kAddI),
-        &&kAndB,
-        &&kBranch,
-        &&kCall,
-        &&kCallExternal,
-        &&kClampIndex,
-        VECTOR_LABELS(kCompareIEQ),
-        VECTOR_LABELS(kCompareINEQ),
-        VECTOR_MATRIX_LABELS(kCompareFEQ),
-        VECTOR_MATRIX_LABELS(kCompareFNEQ),
-        VECTOR_LABELS(kCompareFGT),
-        VECTOR_LABELS(kCompareFGTEQ),
-        VECTOR_LABELS(kCompareFLT),
-        VECTOR_LABELS(kCompareFLTEQ),
-        VECTOR_LABELS(kCompareSGT),
-        VECTOR_LABELS(kCompareSGTEQ),
-        VECTOR_LABELS(kCompareSLT),
-        VECTOR_LABELS(kCompareSLTEQ),
-        VECTOR_LABELS(kCompareUGT),
-        VECTOR_LABELS(kCompareUGTEQ),
-        VECTOR_LABELS(kCompareULT),
-        VECTOR_LABELS(kCompareULTEQ),
-        VECTOR_LABELS(kConvertFtoI),
-        VECTOR_LABELS(kConvertStoF),
-        VECTOR_LABELS(kConvertUtoF),
-        VECTOR_LABELS(kCos),
-        VECTOR_MATRIX_LABELS(kDivideF),
-        VECTOR_LABELS(kDivideS),
-        VECTOR_LABELS(kDivideU),
-        VECTOR_MATRIX_LABELS(kDup),
-        &&kInverse2x2,
-        &&kInverse3x3,
-        &&kInverse4x4,
-        VECTOR_LABELS(kLoad),
-        VECTOR_LABELS(kLoadGlobal),
-        VECTOR_LABELS(kLoadUniform),
-        &&kLoadSwizzle,
-        &&kLoadSwizzleGlobal,
-        &&kLoadSwizzleUniform,
-        &&kLoadExtended,
-        &&kLoadExtendedGlobal,
-        &&kLoadExtendedUniform,
-        &&kMatrixToMatrix,
-        &&kMatrixMultiply,
-        VECTOR_MATRIX_LABELS(kNegateF),
-        VECTOR_LABELS(kNegateI),
-        VECTOR_MATRIX_LABELS(kMultiplyF),
-        VECTOR_LABELS(kMultiplyI),
-        &&kNotB,
-        &&kOrB,
-        VECTOR_MATRIX_LABELS(kPop),
-        &&kPushImmediate,
-        VECTOR_LABELS(kReadExternal),
-        VECTOR_LABELS(kRemainderF),
-        VECTOR_LABELS(kRemainderS),
-        VECTOR_LABELS(kRemainderU),
-        &&kReserve,
-        &&kReturn,
-        &&kScalarToMatrix,
-        &&kShiftLeft,
-        &&kShiftRightS,
-        &&kShiftRightU,
-        VECTOR_LABELS(kSin),
-        VECTOR_LABELS(kSqrt),
-        VECTOR_LABELS(kStore),
-        VECTOR_LABELS(kStoreGlobal),
-        &&kStoreExtended,
-        &&kStoreExtendedGlobal,
-        &&kStoreSwizzle,
-        &&kStoreSwizzleGlobal,
-        &&kStoreSwizzleIndirect,
-        &&kStoreSwizzleIndirectGlobal,
-        &&kSwizzle,
-        VECTOR_MATRIX_LABELS(kSubtractF),
-        VECTOR_LABELS(kSubtractI),
-        VECTOR_LABELS(kTan),
-        VECTOR_LABELS(kWriteExternal),
-        &&kXorB,
-
-        &&kMaskPush,
-        &&kMaskPop,
-        &&kMaskNegate,
-        &&kMaskBlend,
-        &&kBranchIfAllFalse,
-
-        &&kLoopBegin,
-        &&kLoopNext,
-        &&kLoopMask,
-        &&kLoopEnd,
-        &&kLoopBreak,
-        &&kLoopContinue,
-    };
-    // Verify that the order of the labels array matches the order of the ByteCodeInstruction enum.
-    CHECK_VECTOR_MATRIX_LABELS(kAddF);
-    CHECK_VECTOR_LABELS(kAddI);
-    CHECK_LABEL(kAndB);
-    CHECK_LABEL(kBranch);
-    CHECK_LABEL(kCall);
-    CHECK_LABEL(kCallExternal);
-    CHECK_LABEL(kClampIndex);
-    CHECK_VECTOR_LABELS(kCompareIEQ);
-    CHECK_VECTOR_LABELS(kCompareINEQ);
-    CHECK_VECTOR_MATRIX_LABELS(kCompareFEQ);
-    CHECK_VECTOR_MATRIX_LABELS(kCompareFNEQ);
-    CHECK_VECTOR_LABELS(kCompareFGT);
-    CHECK_VECTOR_LABELS(kCompareFGTEQ);
-    CHECK_VECTOR_LABELS(kCompareFLT);
-    CHECK_VECTOR_LABELS(kCompareFLTEQ);
-    CHECK_VECTOR_LABELS(kCompareSGT);
-    CHECK_VECTOR_LABELS(kCompareSGTEQ);
-    CHECK_VECTOR_LABELS(kCompareSLT);
-    CHECK_VECTOR_LABELS(kCompareSLTEQ);
-    CHECK_VECTOR_LABELS(kCompareUGT);
-    CHECK_VECTOR_LABELS(kCompareUGTEQ);
-    CHECK_VECTOR_LABELS(kCompareULT);
-    CHECK_VECTOR_LABELS(kCompareULTEQ);
-    CHECK_VECTOR_LABELS(kConvertFtoI);
-    CHECK_VECTOR_LABELS(kConvertStoF);
-    CHECK_VECTOR_LABELS(kConvertUtoF);
-    CHECK_VECTOR_LABELS(kCos);
-    CHECK_VECTOR_MATRIX_LABELS(kDivideF);
-    CHECK_VECTOR_LABELS(kDivideS);
-    CHECK_VECTOR_LABELS(kDivideU);
-    CHECK_VECTOR_MATRIX_LABELS(kDup);
-    CHECK_LABEL(kInverse2x2);
-    CHECK_LABEL(kInverse3x3);
-    CHECK_LABEL(kInverse4x4);
-    CHECK_VECTOR_LABELS(kLoad);
-    CHECK_VECTOR_LABELS(kLoadGlobal);
-    CHECK_VECTOR_LABELS(kLoadUniform);
-    CHECK_LABEL(kLoadSwizzle);
-    CHECK_LABEL(kLoadSwizzleGlobal);
-    CHECK_LABEL(kLoadSwizzleUniform);
-    CHECK_LABEL(kLoadExtended);
-    CHECK_LABEL(kLoadExtendedGlobal);
-    CHECK_LABEL(kLoadExtendedUniform);
-    CHECK_LABEL(kMatrixToMatrix);
-    CHECK_LABEL(kMatrixMultiply);
-    CHECK_VECTOR_MATRIX_LABELS(kNegateF);
-    CHECK_VECTOR_LABELS(kNegateI);
-    CHECK_VECTOR_MATRIX_LABELS(kMultiplyF);
-    CHECK_VECTOR_LABELS(kMultiplyI);
-    CHECK_LABEL(kNotB);
-    CHECK_LABEL(kOrB);
-    CHECK_VECTOR_MATRIX_LABELS(kPop);
-    CHECK_LABEL(kPushImmediate);
-    CHECK_VECTOR_LABELS(kReadExternal);
-    CHECK_VECTOR_LABELS(kRemainderF);
-    CHECK_VECTOR_LABELS(kRemainderS);
-    CHECK_VECTOR_LABELS(kRemainderU);
-    CHECK_LABEL(kReserve);
-    CHECK_LABEL(kReturn);
-    CHECK_LABEL(kScalarToMatrix);
-    CHECK_LABEL(kShiftLeft);
-    CHECK_LABEL(kShiftRightS);
-    CHECK_LABEL(kShiftRightU);
-    CHECK_VECTOR_LABELS(kSin);
-    CHECK_VECTOR_LABELS(kSqrt);
-    CHECK_VECTOR_LABELS(kStore);
-    CHECK_VECTOR_LABELS(kStoreGlobal);
-    CHECK_LABEL(kStoreExtended);
-    CHECK_LABEL(kStoreExtendedGlobal);
-    CHECK_LABEL(kStoreSwizzle);
-    CHECK_LABEL(kStoreSwizzleGlobal);
-    CHECK_LABEL(kStoreSwizzleIndirect);
-    CHECK_LABEL(kStoreSwizzleIndirectGlobal);
-    CHECK_LABEL(kSwizzle);
-    CHECK_VECTOR_MATRIX_LABELS(kSubtractF);
-    CHECK_VECTOR_LABELS(kSubtractI);
-    CHECK_VECTOR_LABELS(kTan);
-    CHECK_VECTOR_LABELS(kWriteExternal);
-    CHECK_LABEL(kXorB);
-    CHECK_LABEL(kMaskPush);
-    CHECK_LABEL(kMaskPop);
-    CHECK_LABEL(kMaskNegate);
-    CHECK_LABEL(kMaskBlend);
-    CHECK_LABEL(kBranchIfAllFalse);
-    CHECK_LABEL(kLoopBegin);
-    CHECK_LABEL(kLoopNext);
-    CHECK_LABEL(kLoopMask);
-    CHECK_LABEL(kLoopEnd);
-    CHECK_LABEL(kLoopBreak);
-    CHECK_LABEL(kLoopContinue);
-    f->fPreprocessOnce([f] { ((ByteCodeFunction*)f)->preprocess(labels); });
-#endif
-
-    // Needs to be the first N non-negative integers, at least as large as VecWidth
-    static const Interpreter::I32 gLanes = {
-        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
-    };
-
-    VValue* sp = stack + f->fParameterCount + f->fLocalCount - 1;
-
-    #define POP() (*(sp--))
-    #define PUSH(v) (sp[1] = v, ++sp)
-
-    const uint8_t* code = f->fCode.data();
-    const uint8_t* ip = code;
-    std::vector<StackFrame> frames;
-
-    I32 condStack[16];  // Independent condition masks
-    I32 maskStack[16];  // Combined masks (eg maskStack[0] & maskStack[1] & ...)
-    I32 contStack[16];  // Continue flags for loops
-    I32 loopStack[16];  // Loop execution masks
-    condStack[0] = maskStack[0] = (gLanes < N);
-    contStack[0] = I32( 0);
-    loopStack[0] = I32(~0);
-    I32* condPtr = condStack;
-    I32* maskPtr = maskStack;
-    I32* contPtr = contStack;
-    I32* loopPtr = loopStack;
-
-    if (f->fConditionCount + 1 > (int)SK_ARRAY_COUNT(condStack) ||
-        f->fLoopCount + 1 > (int)SK_ARRAY_COUNT(loopStack)) {
-        return false;
-    }
-
-    auto mask = [&]() { return *maskPtr & *loopPtr; };
-
-#ifdef SKSLC_THREADED_CODE
-    // If the "labels as values" extension is available, we implement this using threaded code.
-    // Instead of opcodes, the code directly contains the addresses of the labels to jump to. Then
-    // the code for each opcode simply grabs the address of the next opcode and uses a goto to jump
-    // there.
-    NEXT();
-#else
-    // Otherwise, we have to use a switch statement and a loop to execute the right label.
-    for (;;) {
-        #ifdef TRACE
-            printf("at %3d ", (int) (ip - code));
-            disassemble_instruction(ip);
-            printf(" (stack: %d)\n", (int) (sp - stack) + 1);
-        #endif
-        switch ((ByteCodeInstruction) READ16()) {
-#endif
-
-    VECTOR_MATRIX_BINARY_OP(kAddF, fFloat, +)
-    VECTOR_BINARY_OP(kAddI, fSigned, +)
-
-    // Booleans are integer masks: 0/~0 for false/true. So bitwise ops do what we want:
-    LABEL(kAndB)
-        sp[-1] = sp[-1].fSigned & sp[0].fSigned;
-        POP();
-        NEXT();
-    LABEL(kNotB)
-        sp[0] = ~sp[0].fSigned;
-        NEXT();
-    LABEL(kOrB)
-        sp[-1] = sp[-1].fSigned | sp[0].fSigned;
-        POP();
-        NEXT();
-    LABEL(kXorB)
-        sp[-1] = sp[-1].fSigned ^ sp[0].fSigned;
-        POP();
-        NEXT();
-
-    LABEL(kBranch)
-        ip = code + READ16();
-        NEXT();
-
-    LABEL(kCall) {
-        // Precursor code reserved space for the return value, and pushed all parameters to
-        // the stack. Update our bottom of stack to point at the first parameter, and our
-        // sp to point past those parameters (plus space for locals).
-        int target = READ8();
-        const ByteCodeFunction* fun = byteCode->fFunctions[target].get();
-#ifdef SKSLC_THREADED_CODE
-        fun->fPreprocessOnce([fun] { ((ByteCodeFunction*)fun)->preprocess(labels); });
-#endif
-        if (skvx::any(mask())) {
-            frames.push_back({ code, ip, stack, fun->fParameterCount });
-            ip = code = fun->fCode.data();
-            stack = sp - fun->fParameterCount + 1;
-            sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
-        }
-        NEXT();
-    }
-
-    LABEL(kCallExternal) {
-        CallExternal(byteCode, ip, sp, baseIndex, mask());
-        NEXT();
-    }
-
-    LABEL(kClampIndex) {
-        int length = READ8();
-        if (skvx::any(mask() & ((sp[0].fSigned < 0) | (sp[0].fSigned >= length)))) {
-            return false;
-        }
-        NEXT();
-    }
-
-    VECTOR_BINARY_OP(kCompareIEQ, fSigned, ==)
-    VECTOR_MATRIX_BINARY_OP(kCompareFEQ, fFloat, ==)
-    VECTOR_BINARY_OP(kCompareINEQ, fSigned, !=)
-    VECTOR_MATRIX_BINARY_OP(kCompareFNEQ, fFloat, !=)
-    VECTOR_BINARY_OP(kCompareSGT, fSigned, >)
-    VECTOR_BINARY_OP(kCompareUGT, fUnsigned, >)
-    VECTOR_BINARY_OP(kCompareFGT, fFloat, >)
-    VECTOR_BINARY_OP(kCompareSGTEQ, fSigned, >=)
-    VECTOR_BINARY_OP(kCompareUGTEQ, fUnsigned, >=)
-    VECTOR_BINARY_OP(kCompareFGTEQ, fFloat, >=)
-    VECTOR_BINARY_OP(kCompareSLT, fSigned, <)
-    VECTOR_BINARY_OP(kCompareULT, fUnsigned, <)
-    VECTOR_BINARY_OP(kCompareFLT, fFloat, <)
-    VECTOR_BINARY_OP(kCompareSLTEQ, fSigned, <=)
-    VECTOR_BINARY_OP(kCompareULTEQ, fUnsigned, <=)
-    VECTOR_BINARY_OP(kCompareFLTEQ, fFloat, <=)
-
-    LABEL(kConvertFtoI4) sp[-3] = skvx::cast<int>(sp[-3].fFloat);
-    LABEL(kConvertFtoI3) sp[-2] = skvx::cast<int>(sp[-2].fFloat);
-    LABEL(kConvertFtoI2) sp[-1] = skvx::cast<int>(sp[-1].fFloat);
-    LABEL(kConvertFtoI)  sp[ 0] = skvx::cast<int>(sp[ 0].fFloat);
-                         NEXT();
-
-    LABEL(kConvertStoF4) sp[-3] = skvx::cast<float>(sp[-3].fSigned);
-    LABEL(kConvertStoF3) sp[-2] = skvx::cast<float>(sp[-2].fSigned);
-    LABEL(kConvertStoF2) sp[-1] = skvx::cast<float>(sp[-1].fSigned);
-    LABEL(kConvertStoF)  sp[ 0] = skvx::cast<float>(sp[ 0].fSigned);
-                         NEXT();
-
-    LABEL(kConvertUtoF4) sp[-3] = skvx::cast<float>(sp[-3].fUnsigned);
-    LABEL(kConvertUtoF3) sp[-2] = skvx::cast<float>(sp[-2].fUnsigned);
-    LABEL(kConvertUtoF2) sp[-1] = skvx::cast<float>(sp[-1].fUnsigned);
-    LABEL(kConvertUtoF)  sp[ 0] = skvx::cast<float>(sp[ 0].fUnsigned);
-                         NEXT();
-
-    VECTOR_UNARY_FN_VEC(kCos, cosf)
-
-    VECTOR_BINARY_MASKED_OP(kDivideS, fSigned, /)
-    VECTOR_BINARY_MASKED_OP(kDivideU, fUnsigned, /)
-    VECTOR_MATRIX_BINARY_OP(kDivideF, fFloat, /)
-
-    LABEL(kDup4) PUSH(sp[1 - ip[0]]);
-    LABEL(kDup3) PUSH(sp[1 - ip[0]]);
-    LABEL(kDup2) PUSH(sp[1 - ip[0]]);
-    LABEL(kDup)  PUSH(sp[1 - ip[0]]);
-                 ++ip;
-                 NEXT();
-
-    LABEL(kDupN) {
-        int count = READ8();
-        memcpy(sp + 1, sp - count + 1, count * sizeof(VValue));
-        sp += count;
-        NEXT();
-    }
-
-    LABEL(kInverse2x2) {
-        Inverse2x2(sp);
-        NEXT();
-    }
-    LABEL(kInverse3x3) {
-        Inverse3x3(sp);
-        NEXT();
-    }
-    LABEL(kInverse4x4) {
-        Inverse4x4(sp);
-        NEXT();
-    }
-
-    LABEL(kLoad4) sp[4] = stack[ip[1] + 3];
-    LABEL(kLoad3) sp[3] = stack[ip[1] + 2];
-    LABEL(kLoad2) sp[2] = stack[ip[1] + 1];
-    LABEL(kLoad)  sp[1] = stack[ip[1] + 0];
-                  sp += ip[0];
-                  ip += 2;
-                  NEXT();
-
-    LABEL(kLoadGlobal4) sp[4] = globals[ip[1] + 3];
-    LABEL(kLoadGlobal3) sp[3] = globals[ip[1] + 2];
-    LABEL(kLoadGlobal2) sp[2] = globals[ip[1] + 1];
-    LABEL(kLoadGlobal)  sp[1] = globals[ip[1] + 0];
-                        sp += ip[0];
-                        ip += 2;
-                        NEXT();
-
-    LABEL(kLoadUniform4) sp[4].fFloat = uniforms[ip[1] + 3];
-    LABEL(kLoadUniform3) sp[3].fFloat = uniforms[ip[1] + 2];
-    LABEL(kLoadUniform2) sp[2].fFloat = uniforms[ip[1] + 1];
-    LABEL(kLoadUniform)  sp[1].fFloat = uniforms[ip[1] + 0];
-                        sp += ip[0];
-                        ip += 2;
-                        NEXT();
-
-    LABEL(kLoadExtended) {
-        int count = READ8();
-        I32 src = POP().fSigned;
-        I32 m = mask();
-        for (int i = 0; i < count; ++i) {
-            for (int j = 0; j < VecWidth; ++j) {
-                if (m[j]) {
-                    sp[i + 1].fSigned[j] = stack[src[j] + i].fSigned[j];
-                }
-            }
-        }
-        sp += count;
-        NEXT();
-    }
-
-    LABEL(kLoadExtendedGlobal) {
-        int count = READ8();
-        I32 src = POP().fSigned;
-        I32 m = mask();
-        for (int i = 0; i < count; ++i) {
-            for (int j = 0; j < VecWidth; ++j) {
-                if (m[j]) {
-                    sp[i + 1].fSigned[j] = globals[src[j] + i].fSigned[j];
-                }
-            }
-        }
-        sp += count;
-        NEXT();
-    }
-
-    LABEL(kLoadExtendedUniform) {
-        int count = READ8();
-        I32 src = POP().fSigned;
-        I32 m = mask();
-        for (int i = 0; i < count; ++i) {
-            for (int j = 0; j < VecWidth; ++j) {
-                if (m[j]) {
-                    sp[i + 1].fFloat[j] = uniforms[src[j] + i];
-                }
-            }
-        }
-        sp += count;
-        NEXT();
-    }
-
-    LABEL(kLoadSwizzle) {
-        int src = READ8();
-        int count = READ8();
-        for (int i = 0; i < count; ++i) {
-            PUSH(stack[src + *(ip + i)]);
-        }
-        ip += count;
-        NEXT();
-    }
-
-    LABEL(kLoadSwizzleGlobal) {
-        int src = READ8();
-        int count = READ8();
-        for (int i = 0; i < count; ++i) {
-            PUSH(globals[src + *(ip + i)]);
-        }
-        ip += count;
-        NEXT();
-    }
-
-    LABEL(kLoadSwizzleUniform) {
-        int src = READ8();
-        int count = READ8();
-        for (int i = 0; i < count; ++i) {
-            PUSH(F32(uniforms[src + *(ip + i)]));
-        }
-        ip += count;
-        NEXT();
-    }
-
-    LABEL(kMatrixToMatrix) {
-        int srcCols = READ8();
-        int srcRows = READ8();
-        int dstCols = READ8();
-        int dstRows = READ8();
-        SkASSERT(srcCols >= 2 && srcCols <= 4);
-        SkASSERT(srcRows >= 2 && srcRows <= 4);
-        SkASSERT(dstCols >= 2 && dstCols <= 4);
-        SkASSERT(dstRows >= 2 && dstRows <= 4);
-        F32 tmp[16];
-        memset(tmp, 0, sizeof(tmp));
-        tmp[0] = tmp[5] = tmp[10] = tmp[15] = F32(1.0f);
-        for (int c = srcCols - 1; c >= 0; --c) {
-            for (int r = srcRows - 1; r >= 0; --r) {
-                tmp[c*4 + r] = POP().fFloat;
-            }
-        }
-        for (int c = 0; c < dstCols; ++c) {
-            for (int r = 0; r < dstRows; ++r) {
-                PUSH(tmp[c*4 + r]);
-            }
-        }
-        NEXT();
-    }
-
-    LABEL(kMatrixMultiply) {
-        int lCols = READ8();
-        int lRows = READ8();
-        int rCols = READ8();
-        int rRows = lCols;
-        F32 tmp[16] = { 0.0f };
-        F32* B = &(sp - (rCols * rRows) + 1)->fFloat;
-        F32* A = B - (lCols * lRows);
-        for (int c = 0; c < rCols; ++c) {
-            for (int r = 0; r < lRows; ++r) {
-                for (int j = 0; j < lCols; ++j) {
-                    tmp[c*lRows + r] += A[j*lRows + r] * B[c*rRows + j];
-                }
-            }
-        }
-        sp -= (lCols * lRows) + (rCols * rRows);
-        memcpy(sp + 1, tmp, rCols * lRows * sizeof(VValue));
-        sp += (rCols * lRows);
-        NEXT();
-    }
-
-    VECTOR_BINARY_OP(kMultiplyI, fSigned, *)
-    VECTOR_MATRIX_BINARY_OP(kMultiplyF, fFloat, *)
-
-    LABEL(kNegateF4) sp[-3] = -sp[-3].fFloat;
-    LABEL(kNegateF3) sp[-2] = -sp[-2].fFloat;
-    LABEL(kNegateF2) sp[-1] = -sp[-1].fFloat;
-    LABEL(kNegateF)  sp[ 0] = -sp[ 0].fFloat;
-                     NEXT();
-
-    LABEL(kNegateFN) {
-        int count = READ8();
-        for (int i = count - 1; i >= 0; --i) {
-            sp[-i] = -sp[-i].fFloat;
-        }
-        NEXT();
-    }
-
-    LABEL(kNegateI4) sp[-3] = -sp[-3].fSigned;
-    LABEL(kNegateI3) sp[-2] = -sp[-2].fSigned;
-    LABEL(kNegateI2) sp[-1] = -sp[-1].fSigned;
-    LABEL(kNegateI)  sp[ 0] = -sp[ 0].fSigned;
-                     NEXT();
-
-    LABEL(kPop4) POP();
-    LABEL(kPop3) POP();
-    LABEL(kPop2) POP();
-    LABEL(kPop)  POP();
-                 NEXT();
-
-    LABEL(kPopN)
-        sp -= READ8();
-        NEXT();
-
-    LABEL(kPushImmediate)
-        PUSH(U32(READ32()));
-        NEXT();
-
-    LABEL(kReadExternal)
-    LABEL(kReadExternal2)
-    LABEL(kReadExternal3)
-    LABEL(kReadExternal4) {
-        int count = READ8();
-        int src = READ8();
-        float tmp[4];
-        I32 m = mask();
-        for (int i = 0; i < VecWidth; ++i) {
-            if (m[i]) {
-                byteCode->fExternalValues[src]->read(baseIndex + i, tmp);
-                for (int j = 0; j < count; ++j) {
-                    sp[j + 1].fFloat[i] = tmp[j];
-                }
-            }
-        }
-        sp += count;
-        NEXT();
-    }
-
-    VECTOR_BINARY_FN(kRemainderF, fFloat, VecMod)
-    VECTOR_BINARY_MASKED_OP(kRemainderS, fSigned, %)
-    VECTOR_BINARY_MASKED_OP(kRemainderU, fUnsigned, %)
-
-    LABEL(kReserve)
-        sp += READ8();
-        NEXT();
-
-    LABEL(kReturn) {
-        int count = READ8();
-        if (frames.empty()) {
-            if (outReturn) {
-                VValue* src = sp - count + 1;
-                if (stripedOutput) {
-                    for (int i = 0; i < count; ++i) {
-                        memcpy(outReturn[i], &src->fFloat, N * sizeof(float));
-                        ++src;
-                    }
-                } else {
-                    float* outPtr = outReturn[0];
-                    for (int i = 0; i < count; ++i) {
-                        for (int j = 0; j < N; ++j) {
-                            outPtr[count * j] = src->fFloat[j];
-                        }
-                        ++outPtr;
-                        ++src;
-                    }
-                }
-            }
-            return true;
-        } else {
-            // When we were called, the caller reserved stack space for their copy of our
-            // return value, then 'stack' was positioned after that, where our parameters
-            // were placed. Copy our return values to their reserved area.
-            memcpy(stack - count, sp - count + 1, count * sizeof(VValue));
-
-            // Now move the stack pointer to the end of the passed-in parameters. This odd
-            // calling convention requires the caller to pop the arguments after calling,
-            // but allows them to store any out-parameters back during that unwinding.
-            // After that sequence finishes, the return value will be the top of the stack.
-            const StackFrame& frame(frames.back());
-            sp = stack + frame.fParameterCount - 1;
-            stack = frame.fStack;
-            code = frame.fCode;
-            ip = frame.fIP;
-            frames.pop_back();
-            NEXT();
-        }
-    }
-
-    LABEL(kScalarToMatrix) {
-        int cols = READ8();
-        int rows = READ8();
-        VValue v = POP();
-        for (int c = 0; c < cols; ++c) {
-            for (int r = 0; r < rows; ++r) {
-                PUSH(c == r ? v : F32(0.0f));
-            }
-        }
-        NEXT();
-    }
-
-    LABEL(kShiftLeft)
-        sp[0] = sp[0].fSigned << READ8();
-        NEXT();
-    LABEL(kShiftRightS)
-        sp[0] = sp[0].fSigned >> READ8();
-        NEXT();
-    LABEL(kShiftRightU)
-        sp[0] = sp[0].fUnsigned >> READ8();
-        NEXT();
-
-    VECTOR_UNARY_FN_VEC(kSin, sinf)
-    VECTOR_UNARY_FN(kSqrt, skvx::sqrt, fFloat)
-
-    LABEL(kStore4)
-        stack[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+3].fFloat);
-    LABEL(kStore3)
-        stack[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+2].fFloat);
-    LABEL(kStore2)
-        stack[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+1].fFloat);
-    LABEL(kStore)
-        stack[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+0].fFloat);
-        ++ip;
-        NEXT();
-
-    LABEL(kStoreGlobal4)
-        globals[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+3].fFloat);
-    LABEL(kStoreGlobal3)
-        globals[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+2].fFloat);
-    LABEL(kStoreGlobal2)
-        globals[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+1].fFloat);
-    LABEL(kStoreGlobal)
-        globals[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+0].fFloat);
-        ++ip;
-        NEXT();
-
-    LABEL(kStoreExtended) {
-        int count = READ8();
-        I32 target = POP().fSigned;
-        VValue* src = sp - count + 1;
-        I32 m = mask();
-        for (int i = 0; i < count; ++i) {
-            for (int j = 0; j < VecWidth; ++j) {
-                if (m[j]) {
-                    stack[target[j] + i].fSigned[j] = src[i].fSigned[j];
-                }
-            }
-        }
-        sp -= count;
-        NEXT();
-    }
-    LABEL(kStoreExtendedGlobal) {
-        int count = READ8();
-        I32 target = POP().fSigned;
-        VValue* src = sp - count + 1;
-        I32 m = mask();
-        for (int i = 0; i < count; ++i) {
-            for (int j = 0; j < VecWidth; ++j) {
-                if (m[j]) {
-                    globals[target[j] + i].fSigned[j] = src[i].fSigned[j];
-                }
-            }
-        }
-        sp -= count;
-        NEXT();
-    }
-
-    LABEL(kStoreSwizzle) {
-        int target = READ8();
-        int count = READ8();
-        for (int i = count - 1; i >= 0; --i) {
-            stack[target + *(ip + i)] = skvx::if_then_else(
-                    mask(), POP().fFloat, stack[target + *(ip + i)].fFloat);
-        }
-        ip += count;
-        NEXT();
-    }
-
-    LABEL(kStoreSwizzleGlobal) {
-        int target = READ8();
-        int count = READ8();
-        for (int i = count - 1; i >= 0; --i) {
-            globals[target + *(ip + i)] = skvx::if_then_else(
-                    mask(), POP().fFloat, globals[target + *(ip + i)].fFloat);
-        }
-        ip += count;
-        NEXT();
-    }
-
-    LABEL(kStoreSwizzleIndirect) {
-        int count = READ8();
-        I32 target = POP().fSigned;
-        I32 m = mask();
-        for (int i = count - 1; i >= 0; --i) {
-            I32 v = POP().fSigned;
-            for (int j = 0; j < VecWidth; ++j) {
-                if (m[j]) {
-                    stack[target[j] + *(ip + i)].fSigned[j] = v[j];
-                }
-            }
-        }
-        ip += count;
-        NEXT();
-    }
-
-    LABEL(kStoreSwizzleIndirectGlobal) {
-        int count = READ8();
-        I32 target = POP().fSigned;
-        I32 m = mask();
-        for (int i = count - 1; i >= 0; --i) {
-            I32 v = POP().fSigned;
-            for (int j = 0; j < VecWidth; ++j) {
-                if (m[j]) {
-                    globals[target[j] + *(ip + i)].fSigned[j] = v[j];
-                }
-            }
-        }
-        ip += count;
-        NEXT();
-    }
-
-    VECTOR_BINARY_OP(kSubtractI, fSigned, -)
-    VECTOR_MATRIX_BINARY_OP(kSubtractF, fFloat, -)
-
-    LABEL(kSwizzle) {
-        VValue tmp[4];
-        for (int i = READ8() - 1; i >= 0; --i) {
-            tmp[i] = POP();
-        }
-        for (int i = READ8() - 1; i >= 0; --i) {
-            PUSH(tmp[READ8()]);
-        }
-        NEXT();
-    }
-
-    VECTOR_UNARY_FN_VEC(kTan, tanf)
-
-    LABEL(kWriteExternal4)
-    LABEL(kWriteExternal3)
-    LABEL(kWriteExternal2)
-    LABEL(kWriteExternal) {
-        int count = READ8();
-        int target = READ8();
-        float tmp[4];
-        I32 m = mask();
-        sp -= count;
-        for (int i = 0; i < VecWidth; ++i) {
-            if (m[i]) {
-                for (int j = 0; j < count; ++j) {
-                    tmp[j] = sp[j + 1].fFloat[i];
-                }
-                byteCode->fExternalValues[target]->write(baseIndex + i, tmp);
-            }
-        }
-        NEXT();
-    }
-
-    LABEL(kMaskPush)
-        condPtr[1] = POP().fSigned;
-        maskPtr[1] = maskPtr[0] & condPtr[1];
-        ++condPtr; ++maskPtr;
-        NEXT();
-    LABEL(kMaskPop)
-        --condPtr; --maskPtr;
-        NEXT();
-    LABEL(kMaskNegate)
-        maskPtr[0] = maskPtr[-1] & ~condPtr[0];
-        NEXT();
-    LABEL(kMaskBlend) {
-        int count = READ8();
-        I32 m = condPtr[0];
-        --condPtr; --maskPtr;
-        for (int i = 0; i < count; ++i) {
-            sp[-count] = skvx::if_then_else(m, sp[-count].fFloat, sp[0].fFloat);
-            --sp;
-        }
-        NEXT();
-    }
-    LABEL(kBranchIfAllFalse) {
-        int target = READ16();
-        if (!skvx::any(mask())) {
-            ip = code + target;
-        }
-        NEXT();
-    }
-
-    LABEL(kLoopBegin)
-        contPtr[1] = 0;
-        loopPtr[1] = loopPtr[0];
-        ++contPtr; ++loopPtr;
-        NEXT();
-    LABEL(kLoopNext)
-        *loopPtr |= *contPtr;
-        *contPtr = 0;
-        NEXT();
-    LABEL(kLoopMask)
-        *loopPtr &= POP().fSigned;
-        NEXT();
-    LABEL(kLoopEnd)
-        --contPtr; --loopPtr;
-        NEXT();
-    LABEL(kLoopBreak)
-        *loopPtr &= ~mask();
-        NEXT();
-    LABEL(kLoopContinue) {
-        I32 m = mask();
-        *contPtr |=  m;
-        *loopPtr &= ~m;
-        NEXT();
-    }
-#ifdef SKSLC_THREADED_CODE
-    #ifdef TRACE
-        next:
-            printf("at %3d (stack: %d) (disable threaded code for disassembly)\n",
-                   (int) (ip - code), (int) (sp - stack) + 1);
-            goto *READ_INST();
-    #endif
-#else
-        }
-    }
-#endif
-}
-
-}; // class Interpreter
-
-#endif // SK_ENABLE_SKSL_INTERPRETER
-
-#undef spf
-
-void ByteCodeFunction::disassemble() const {
-#if defined(SK_ENABLE_SKSL_INTERPRETER)
-    const uint8_t* ip = fCode.data();
-    while (ip < fCode.data() + fCode.size()) {
-        printf("%d: ", (int)(ip - fCode.data()));
-        ip = Interpreter::DisassembleInstruction(ip);
-        printf("\n");
-    }
-#endif
-}
-
-#define VECTOR_PREPROCESS(base)          \
-    case ByteCodeInstruction::base ## 4: \
-    case ByteCodeInstruction::base ## 3: \
-    case ByteCodeInstruction::base ## 2: \
-    case ByteCodeInstruction::base: READ8(); break;
-
-#define VECTOR_PREPROCESS_NO_COUNT(base) \
-    case ByteCodeInstruction::base ## 4: \
-    case ByteCodeInstruction::base ## 3: \
-    case ByteCodeInstruction::base ## 2: \
-    case ByteCodeInstruction::base: break;
-
-#define VECTOR_MATRIX_PREPROCESS(base) \
-    VECTOR_PREPROCESS(base)            \
-    case ByteCodeInstruction::base ## N: READ8(); break;
-
-#define VECTOR_MATRIX_PREPROCESS_NO_COUNT(base) \
-    VECTOR_PREPROCESS_NO_COUNT(base)            \
-    case ByteCodeInstruction::base ## N: READ8(); break;
-
-void ByteCodeFunction::preprocess(const void* labels[]) {
-#if defined(SK_ENABLE_SKSL_INTERPRETER)
-#ifdef TRACE
-    this->disassemble();
-#endif
-    uint8_t* ip = fCode.data();
-    while (ip < fCode.data() + fCode.size()) {
-        ByteCodeInstruction inst = (ByteCodeInstruction) (intptr_t) READ_INST();
-        const void* label = labels[(int) inst];
-        memcpy(ip - sizeof(instruction), &label, sizeof(label));
-        switch (inst) {
-            VECTOR_MATRIX_PREPROCESS(kAddF)
-            VECTOR_PREPROCESS(kAddI)
-            case ByteCodeInstruction::kAndB: break;
-            case ByteCodeInstruction::kBranch: READ16(); break;
-            case ByteCodeInstruction::kCall: READ8(); break;
-            case ByteCodeInstruction::kCallExternal: {
-                READ8();
-                READ8();
-                READ8();
-                break;
-            }
-            case ByteCodeInstruction::kClampIndex: READ8(); break;
-            VECTOR_PREPROCESS(kCompareIEQ)
-            VECTOR_PREPROCESS(kCompareINEQ)
-            VECTOR_MATRIX_PREPROCESS(kCompareFEQ)
-            VECTOR_MATRIX_PREPROCESS(kCompareFNEQ)
-            VECTOR_PREPROCESS(kCompareFGT)
-            VECTOR_PREPROCESS(kCompareFGTEQ)
-            VECTOR_PREPROCESS(kCompareFLT)
-            VECTOR_PREPROCESS(kCompareFLTEQ)
-            VECTOR_PREPROCESS(kCompareSGT)
-            VECTOR_PREPROCESS(kCompareSGTEQ)
-            VECTOR_PREPROCESS(kCompareSLT)
-            VECTOR_PREPROCESS(kCompareSLTEQ)
-            VECTOR_PREPROCESS(kCompareUGT)
-            VECTOR_PREPROCESS(kCompareUGTEQ)
-            VECTOR_PREPROCESS(kCompareULT)
-            VECTOR_PREPROCESS(kCompareULTEQ)
-            VECTOR_PREPROCESS_NO_COUNT(kConvertFtoI)
-            VECTOR_PREPROCESS_NO_COUNT(kConvertStoF)
-            VECTOR_PREPROCESS_NO_COUNT(kConvertUtoF)
-            VECTOR_PREPROCESS(kCos)
-            VECTOR_MATRIX_PREPROCESS(kDivideF)
-            VECTOR_PREPROCESS(kDivideS)
-            VECTOR_PREPROCESS(kDivideU)
-            VECTOR_MATRIX_PREPROCESS(kDup)
-
-            case ByteCodeInstruction::kInverse2x2:
-            case ByteCodeInstruction::kInverse3x3:
-            case ByteCodeInstruction::kInverse4x4: break;
-
-            case ByteCodeInstruction::kLoad:
-            case ByteCodeInstruction::kLoad2:
-            case ByteCodeInstruction::kLoad3:
-            case ByteCodeInstruction::kLoad4:
-            case ByteCodeInstruction::kLoadGlobal:
-            case ByteCodeInstruction::kLoadGlobal2:
-            case ByteCodeInstruction::kLoadGlobal3:
-            case ByteCodeInstruction::kLoadGlobal4:
-            case ByteCodeInstruction::kLoadUniform:
-            case ByteCodeInstruction::kLoadUniform2:
-            case ByteCodeInstruction::kLoadUniform3:
-            case ByteCodeInstruction::kLoadUniform4: READ16(); break;
-
-            case ByteCodeInstruction::kLoadSwizzle:
-            case ByteCodeInstruction::kLoadSwizzleGlobal:
-            case ByteCodeInstruction::kLoadSwizzleUniform: {
-                READ8();
-                int count = READ8();
-                ip += count;
-                break;
-            }
-
-            case ByteCodeInstruction::kLoadExtended:
-            case ByteCodeInstruction::kLoadExtendedGlobal:
-            case ByteCodeInstruction::kLoadExtendedUniform:
-                READ8();
-                break;
-
-            case ByteCodeInstruction::kMatrixToMatrix: {
-                READ8();
-                READ8();
-                READ8();
-                READ8();
-                break;
-            }
-            case ByteCodeInstruction::kMatrixMultiply: {
-                READ8();
-                READ8();
-                READ8();
-                break;
-            }
-            VECTOR_MATRIX_PREPROCESS(kMultiplyF)
-            VECTOR_PREPROCESS(kMultiplyI)
-            VECTOR_MATRIX_PREPROCESS_NO_COUNT(kNegateF)
-            VECTOR_PREPROCESS_NO_COUNT(kNegateI)
-            case ByteCodeInstruction::kNotB: break;
-            case ByteCodeInstruction::kOrB: break;
-            VECTOR_MATRIX_PREPROCESS_NO_COUNT(kPop)
-            case ByteCodeInstruction::kPushImmediate: READ32(); break;
-
-            case ByteCodeInstruction::kReadExternal:
-            case ByteCodeInstruction::kReadExternal2:
-            case ByteCodeInstruction::kReadExternal3:
-            case ByteCodeInstruction::kReadExternal4: READ16(); break;
-
-            VECTOR_PREPROCESS(kRemainderF)
-            VECTOR_PREPROCESS(kRemainderS)
-            VECTOR_PREPROCESS(kRemainderU)
-            case ByteCodeInstruction::kReserve: READ8(); break;
-            case ByteCodeInstruction::kReturn: READ8(); break;
-            case ByteCodeInstruction::kScalarToMatrix: READ8(); READ8(); break;
-            case ByteCodeInstruction::kShiftLeft: READ8(); break;
-            case ByteCodeInstruction::kShiftRightS: READ8(); break;
-            case ByteCodeInstruction::kShiftRightU: READ8(); break;
-            VECTOR_PREPROCESS(kSin)
-            VECTOR_PREPROCESS_NO_COUNT(kSqrt)
-
-            case ByteCodeInstruction::kStore:
-            case ByteCodeInstruction::kStore2:
-            case ByteCodeInstruction::kStore3:
-            case ByteCodeInstruction::kStore4:
-            case ByteCodeInstruction::kStoreGlobal:
-            case ByteCodeInstruction::kStoreGlobal2:
-            case ByteCodeInstruction::kStoreGlobal3:
-            case ByteCodeInstruction::kStoreGlobal4: READ8(); break;
-
-            case ByteCodeInstruction::kStoreSwizzle:
-            case ByteCodeInstruction::kStoreSwizzleGlobal: {
-                READ8();
-                int count = READ8();
-                ip += count;
-                break;
-            }
-
-            case ByteCodeInstruction::kStoreSwizzleIndirect:
-            case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
-                int count = READ8();
-                ip += count;
-                break;
-            }
-
-            case ByteCodeInstruction::kStoreExtended: READ8(); break;
-            case ByteCodeInstruction::kStoreExtendedGlobal: READ8(); break;
-
-            VECTOR_MATRIX_PREPROCESS(kSubtractF)
-            VECTOR_PREPROCESS(kSubtractI)
-
-            case ByteCodeInstruction::kSwizzle: {
-                READ8();
-                int count = READ8();
-                ip += count;
-                break;
-            }
-            VECTOR_PREPROCESS(kTan)
-            case ByteCodeInstruction::kWriteExternal:
-            case ByteCodeInstruction::kWriteExternal2:
-            case ByteCodeInstruction::kWriteExternal3:
-            case ByteCodeInstruction::kWriteExternal4: READ16(); break;
-
-            case ByteCodeInstruction::kXorB: break;
-            case ByteCodeInstruction::kMaskPush: break;
-            case ByteCodeInstruction::kMaskPop: break;
-            case ByteCodeInstruction::kMaskNegate: break;
-            case ByteCodeInstruction::kMaskBlend: READ8(); break;
-            case ByteCodeInstruction::kBranchIfAllFalse: READ16(); break;
-            case ByteCodeInstruction::kLoopBegin: break;
-            case ByteCodeInstruction::kLoopNext: break;
-            case ByteCodeInstruction::kLoopMask: break;
-            case ByteCodeInstruction::kLoopEnd: break;
-            case ByteCodeInstruction::kLoopContinue:  break;
-            case ByteCodeInstruction::kLoopBreak: break;
-            default:
-                ip -= 2;
-                printf("unknown(%d)\n", READ16());
-                SkASSERT(false);
-        }
-    }
-#endif
-}
-
-bool ByteCode::run(const ByteCodeFunction* f,
-                   float* args, int argCount,
-                   float* outReturn, int returnCount,
-                   const float* uniforms, int uniformCount) const {
-#if defined(SK_ENABLE_SKSL_INTERPRETER)
-    Interpreter::VValue stack[128];
-    int stackNeeded = f->fParameterCount + f->fLocalCount + f->fStackCount;
-    if (stackNeeded > (int)SK_ARRAY_COUNT(stack)) {
-        return false;
-    }
-
-    if (argCount != f->fParameterCount ||
-        returnCount != f->fReturnCount ||
-        uniformCount != fUniformSlotCount) {
-        return false;
-    }
-
-    Interpreter::VValue globals[32];
-    if (fGlobalSlotCount > (int)SK_ARRAY_COUNT(globals)) {
-        return false;
-    }
-
-    // Transpose args into stack
-    {
-        float* src = args;
-        float* dst = (float*)stack;
-        for (int i = 0; i < argCount; ++i) {
-            *dst = *src++;
-            dst += VecWidth;
-        }
-    }
-
-    bool stripedOutput = false;
-    float** outArray = outReturn ? &outReturn : nullptr;
-    if (!Interpreter::InnerRun(this, f, stack, outArray, globals, uniforms, stripedOutput, 1, 0)) {
-        return false;
-    }
-
-    // Transpose out parameters back
-    {
-        float* dst = args;
-        float* src = (float*)stack;
-        for (const auto& p : f->fParameters) {
-            if (p.fIsOutParameter) {
-                for (int i = p.fSlotCount; i > 0; --i) {
-                    *dst++ = *src;
-                    src += VecWidth;
-                }
-            } else {
-                dst += p.fSlotCount;
-                src += p.fSlotCount * VecWidth;
-            }
-        }
-    }
-
-    return true;
-#else
-    SkDEBUGFAIL("ByteCode interpreter not enabled");
-    return false;
-#endif
-}
-
-bool ByteCode::runStriped(const ByteCodeFunction* f, int N,
-                          float* args[], int argCount,
-                          float* outReturn[], int returnCount,
-                          const float* uniforms, int uniformCount) const {
-#if defined(SK_ENABLE_SKSL_INTERPRETER)
-    Interpreter::VValue stack[128];
-    int stackNeeded = f->fParameterCount + f->fLocalCount + f->fStackCount;
-    if (stackNeeded > (int)SK_ARRAY_COUNT(stack)) {
-        return false;
-    }
-
-    if (argCount != f->fParameterCount ||
-        returnCount != f->fReturnCount ||
-        uniformCount != fUniformSlotCount) {
-        return false;
-    }
-
-    Interpreter::VValue globals[32];
-    if (fGlobalSlotCount > (int)SK_ARRAY_COUNT(globals)) {
-        return false;
-    }
-
-    // innerRun just takes outArgs, so clear it if the count is zero
-    if (returnCount == 0) {
-        outReturn = nullptr;
-    }
-
-    int baseIndex = 0;
-
-    while (N) {
-        int w = std::min(N, VecWidth);
-
-        // Copy args into stack
-        for (int i = 0; i < argCount; ++i) {
-            memcpy((void*)(stack + i), args[i], w * sizeof(float));
-        }
-
-        bool stripedOutput = true;
-        if (!Interpreter::InnerRun(this, f, stack, outReturn, globals, uniforms, stripedOutput, w,
-                                   baseIndex)) {
-            return false;
-        }
-
-        // Copy out parameters back
-        int slot = 0;
-        for (const auto& p : f->fParameters) {
-            if (p.fIsOutParameter) {
-                for (int i = slot; i < slot + p.fSlotCount; ++i) {
-                    memcpy(args[i], stack + i, w * sizeof(float));
-                }
-            }
-            slot += p.fSlotCount;
-        }
-
-        // Step each argument pointer ahead
-        for (int i = 0; i < argCount; ++i) {
-            args[i] += w;
-        }
-        N -= w;
-        baseIndex += w;
-    }
-
-    return true;
-#else
-    SkDEBUGFAIL("ByteCode interpreter not enabled");
-    return false;
-#endif
-}
-
-} // namespace SkSL
-
-#endif
diff --git a/src/sksl/SkSLByteCode.h b/src/sksl/SkSLByteCode.h
index f917eec..adc4a0b 100644
--- a/src/sksl/SkSLByteCode.h
+++ b/src/sksl/SkSLByteCode.h
@@ -9,206 +9,59 @@
 #define SKSL_BYTECODE
 
 #include "include/private/SkOnce.h"
+#include "include/private/SkVx.h"
 #include "src/sksl/SkSLString.h"
+#include "src/sksl/ir/SkSLFunctionDeclaration.h"
 
 #include <memory>
 #include <vector>
 
 namespace SkSL {
 
-class  ExternalValue;
-struct FunctionDeclaration;
-
-// GCC and Clang support the "labels as values" extension which we need to implement the interpreter
-// using threaded code. Otherwise, we fall back to using a switch statement in a for loop.
-#if defined(__GNUC__) || defined(__clang__)
-    #define SKSLC_THREADED_CODE
-    using instruction = void*;
-#else
-    using instruction = uint16_t;
-#endif
-
-#define VECTOR(name) name ## 4, name ## 3, name ## 2, name
-#define VECTOR_MATRIX(name) name ## 4, name ## 3, name ## 2, name, name ## N
-
-enum class ByteCodeInstruction : uint16_t {
-    // B = bool, F = float, I = int, S = signed, U = unsigned
-    // All binary VECTOR instructions (kAddF, KSubtractI, kCompareIEQ, etc.) are followed by a byte
-    // indicating the count, even though it is redundant due to the count appearing in the opcode.
-    // This is because the original opcodes are lost after we preprocess it into threaded code, and
-    // we need to still be able to access the count so as to permit the implementation to use opcode
-    // fallthrough.
-    VECTOR_MATRIX(kAddF),
-    VECTOR(kAddI),
-    kAndB,
-    kBranch,
-    // Followed by a byte indicating the index of the function to call
-    kCall,
-    // Followed by three bytes indicating: the number of argument slots, the number of return slots,
-    // and the index of the external value to call
-    kCallExternal,
-    // For dynamic array access: Followed by byte indicating length of array
-    kClampIndex,
-    VECTOR(kCompareIEQ),
-    VECTOR(kCompareINEQ),
-    VECTOR_MATRIX(kCompareFEQ),
-    VECTOR_MATRIX(kCompareFNEQ),
-    VECTOR(kCompareFGT),
-    VECTOR(kCompareFGTEQ),
-    VECTOR(kCompareFLT),
-    VECTOR(kCompareFLTEQ),
-    VECTOR(kCompareSGT),
-    VECTOR(kCompareSGTEQ),
-    VECTOR(kCompareSLT),
-    VECTOR(kCompareSLTEQ),
-    VECTOR(kCompareUGT),
-    VECTOR(kCompareUGTEQ),
-    VECTOR(kCompareULT),
-    VECTOR(kCompareULTEQ),
-    VECTOR(kConvertFtoI),
-    VECTOR(kConvertStoF),
-    VECTOR(kConvertUtoF),
-    // Followed by a (redundant) byte indicating the count
-    VECTOR(kCos),
-    VECTOR_MATRIX(kDivideF),
-    VECTOR(kDivideS),
-    VECTOR(kDivideU),
-    // Duplicates the top stack value. Followed by a (redundant) byte indicating the count.
-    VECTOR_MATRIX(kDup),
-    kInverse2x2,
-    kInverse3x3,
-    kInverse4x4,
-    // kLoad/kLoadGlobal are followed by a byte indicating the count, and a byte indicating the
-    // local/global slot to load
-    VECTOR(kLoad),
-    VECTOR(kLoadGlobal),
-    VECTOR(kLoadUniform),
-    // As kLoad/kLoadGlobal, then a count byte (1-4), and then one byte per swizzle component (0-3).
-    kLoadSwizzle,
-    kLoadSwizzleGlobal,
-    kLoadSwizzleUniform,
-    // kLoadExtended* are fallback load ops when we lack a specialization. They are followed by a
-    // count byte, and get the slot to load from the top of the stack.
-    kLoadExtended,
-    kLoadExtendedGlobal,
-    kLoadExtendedUniform,
-    // Followed by four bytes: srcCols, srcRows, dstCols, dstRows. Consumes the src matrix from the
-    // stack, and replaces it with the dst matrix. Per GLSL rules, there are no restrictions on
-    // dimensions. Any overlapping values are copied, and any other values are filled in with the
-    // identity matrix.
-    kMatrixToMatrix,
-    // Followed by three bytes: leftCols (== rightRows), leftRows, rightCols
-    kMatrixMultiply,
-    VECTOR_MATRIX(kNegateF),
-    VECTOR(kNegateI),
-    VECTOR_MATRIX(kMultiplyF),
-    VECTOR(kMultiplyI),
-    kNotB,
-    kOrB,
-    VECTOR_MATRIX(kPop),
-    // Followed by a 32 bit value containing the value to push
-    kPushImmediate,
-    // Followed by a byte indicating external value to read
-    VECTOR(kReadExternal),
-    VECTOR(kRemainderF),
-    VECTOR(kRemainderS),
-    VECTOR(kRemainderU),
-    // Followed by a byte indicating the number of slots to reserve on the stack (for later return)
-    kReserve,
-    // Followed by a byte indicating the number of slots being returned
-    kReturn,
-    // Followed by two bytes indicating columns and rows of matrix (2, 3, or 4 each).
-    // Takes a single value from the top of the stack, and converts to a CxR matrix with that value
-    // replicated along the diagonal (and zero elsewhere), per the GLSL matrix construction rules.
-    kScalarToMatrix,
-    // Followed by a byte indicating the number of bits to shift
-    kShiftLeft,
-    kShiftRightS,
-    kShiftRightU,
-    // Followed by a (redundant) byte indicating the count
-    VECTOR(kSin),
-    VECTOR(kSqrt),
-    // kStore/kStoreGlobal are followed by a byte indicating the local/global slot to store
-    VECTOR(kStore),
-    VECTOR(kStoreGlobal),
-    // Fallback stores. Followed by count byte, and get the slot to store from the top of the stack
-    kStoreExtended,
-    kStoreExtendedGlobal,
-    // As kStore/kStoreGlobal, then a count byte (1-4), then one byte per swizzle component (0-3).
-    // Expects the stack to look like: ... v1 v2 v3 v4, where the number of 'v's is equal to the
-    // number of swizzle components. After the store, all v's are popped from the stack.
-    kStoreSwizzle,
-    kStoreSwizzleGlobal,
-    // As above, but gets the store slot from the top of the stack (before values to be stored)
-    kStoreSwizzleIndirect,
-    kStoreSwizzleIndirectGlobal,
-    // Followed by two count bytes (1-4), and then one byte per swizzle component (0-3). The first
-    // count byte provides the current vector size (the vector is the top n stack elements), and the
-    // second count byte provides the swizzle component count.
-    kSwizzle,
-    VECTOR_MATRIX(kSubtractF),
-    VECTOR(kSubtractI),
-    // Followed by a (redundant) byte indicating the count
-    VECTOR(kTan),
-    // Followed by a byte indicating external value to write
-    VECTOR(kWriteExternal),
-    kXorB,
-
-    kMaskPush,
-    kMaskPop,
-    kMaskNegate,
-    // Followed by count byte
-    kMaskBlend,
-    // Followed by address
-    kBranchIfAllFalse,
-
-    kLoopBegin,
-    kLoopNext,
-    kLoopMask,
-    kLoopEnd,
-    kLoopBreak,
-    kLoopContinue,
-};
-#undef VECTOR
+class ByteCode;
+class ExternalValue;
 
 class ByteCodeFunction {
 public:
-    int getParameterCount() const { return fParameterCount; }
-    int getReturnCount() const { return fReturnCount; }
-
-    /**
-     * Print bytecode disassembly to stdout.
-     */
-    void disassemble() const;
-
-private:
-    ByteCodeFunction(const FunctionDeclaration* declaration);
-
-    friend class ByteCode;
-    friend class ByteCodeGenerator;
-    friend struct Interpreter;
-
+    // all counts are of 32-bit values, so a float4 counts as 4 parameter or return slots
     struct Parameter {
         int fSlotCount;
         bool fIsOutParameter;
     };
 
-    SkSL::String fName;
-    std::vector<Parameter> fParameters;
-    int fParameterCount;
-    int fReturnCount = 0;
+    /**
+     * Note that this is the actual number of parameters, not the number of parameter slots.
+     */
+    int getParameterCount() const { return (int) fParameters.size(); }
 
-    int fLocalCount = 0;
-    int fStackCount = 0;
-    int fConditionCount = 0;
-    int fLoopCount = 0;
-    mutable SkOnce fPreprocessOnce;
+    Parameter getParameter(int idx) const { return fParameters[idx]; }
+
+    int getParameterSlotCount() const { return fParameterSlotCount; }
+
+    int getReturnSlotCount() const { return fReturnSlotCount; }
+
+    void disassemble() const { }
+
+private:
+    ByteCodeFunction(const FunctionDeclaration* declaration)
+        : fName(declaration->fName) {}
+
+    String fName;
+
+    std::vector<Parameter> fParameters;
+    // Slot counts start at zero so the getters never read an indeterminate value.
+    int fParameterSlotCount = 0;
+
+    int fReturnSlotCount = 0;
+
+    int fStackSlotCount = 0;
+
     std::vector<uint8_t> fCode;
 
-    /**
-     * Replace each opcode with the corresponding entry from the labels array.
-     */
-    void preprocess(const void* labels[]);
+    friend class ByteCode;
+    friend class ByteCodeGenerator;
+    template<int width>
+    friend class Interpreter;
 };
 
 enum class TypeCategory {
@@ -220,9 +73,260 @@
 
 class SK_API ByteCode {
 public:
-    static constexpr int kVecWidth = 8;
+    template<int width>
+    union Vector {
+        skvx::Vec<width, float> fFloat;
+        skvx::Vec<width, int32_t> fInt;
+        skvx::Vec<width, uint32_t> fUInt;
 
-    ByteCode() = default;
+        Vector() = default;
+
+        Vector(skvx::Vec<width, float> f)
+            : fFloat(f) {}
+
+        Vector(skvx::Vec<width, int32_t> i)
+            : fInt(i) {}
+
+        Vector(skvx::Vec<width, uint32_t> u)
+            : fUInt(u) {}
+    };
+
+    enum class Instruction : uint8_t {
+        // no parameters
+        kNop,
+        // no parameters
+        kAbort,
+        // Register target, Register src1, Register src2
+        kAddF,
+        // Register target, Register src1, Register src2
+        kAddI,
+        // Register target, Register src1, Register src2
+        kAnd,
+        // Register index, int arrayLength
+        kBoundsCheck,
+        // Pointer target
+        kBranch,
+        // Pointer target
+        kBranchIfAllFalse,
+        // no parameters
+        kBreak,
+        // Register target, uint8_t functionIndex, Register parameters
+        kCall,
+        // Register target, uint8_t externalValueIndex, uint8_t targetSize, Register arguments,
+        // uint8_t argumentSize
+        kCallExternal,
+        // Register target, Register src1, Register src2
+        kCompareEQF,
+        // Register target, Register src1, Register src2
+        kCompareEQI,
+        // Register target, Register src1, Register src2
+        kCompareNEQF,
+        // Register target, Register src1, Register src2
+        kCompareNEQI,
+        // Register target, Register src1, Register src2
+        kCompareGTF,
+        // Register target, Register src1, Register src2
+        kCompareGTS,
+        // Register target, Register src1, Register src2
+        kCompareGTU,
+        // Register target, Register src1, Register src2
+        kCompareGTEQF,
+        // Register target, Register src1, Register src2
+        kCompareGTEQS,
+        // Register target, Register src1, Register src2
+        kCompareGTEQU,
+        // Register target, Register src1, Register src2
+        kCompareLTF,
+        // Register target, Register src1, Register src2
+        kCompareLTS,
+        // Register target, Register src1, Register src2
+        kCompareLTU,
+        // Register target, Register src1, Register src2
+        kCompareLTEQF,
+        // Register target, Register src1, Register src2
+        kCompareLTEQS,
+        // Register target, Register src1, Register src2
+        kCompareLTEQU,
+        // no parameters
+        kContinue,
+        // Register target, Register src
+        kCopy,
+        // Register target, Register src
+        kCos,
+        // Register target, Register src1, Register src2
+        kDivideF,
+        // Register target, Register src1, Register src2
+        kDivideS,
+        // Register target, Register src1, Register src2
+        kDivideU,
+        // Register target, Register src
+        kFloatToSigned,
+        // Register target, Register src
+        kFloatToUnsigned,
+        // Load a constant into a register
+        // Register target, Immediate value
+        kImmediate,
+        // Register target, Register src
+        kInverse2x2,
+        // Register target, Register src
+        kInverse3x3,
+        // Register target, Register src
+        kInverse4x4,
+        // Load the memory cell pointed to by srcPtr into a register
+        // Register target, Register srcPtr
+        kLoad,
+        // Load the memory cell pointed to by src into a register
+        // Register target, Pointer src
+        kLoadDirect,
+        // Load the parameter slot pointed to by srcPtr into a register
+        // Register target, Register srcPtr
+        kLoadParameter,
+        // Load the parameter slot pointed to by src into a register
+        // Register target, Pointer src
+        kLoadParameterDirect,
+        // Load the stack cell pointed to by srcPtr + sp into a register
+        // Register target, Register srcPtr
+        kLoadStack,
+        // Load the stack cell pointed to by src + sp into a register
+        // Register target, Pointer src
+        kLoadStackDirect,
+        // Pushes a new loop onto the loop and continue stacks
+        // no parameters
+        kLoopBegin,
+        // Pops the loop and continue stacks
+        // no parameters
+        kLoopEnd,
+        // Register mask
+        kLoopMask,
+        // no parameters
+        kLoopNext,
+        // no parameters
+        kMaskNegate,
+        // no parameters
+        kMaskPop,
+        // Register mask
+        kMaskPush,
+        // Register target, Register left, Register right, uint8_t leftColsAndRightRows,
+        // uint8_t leftRows, uint8_t rightCols
+        kMatrixMultiply,
+        // Register target, Register src, uint8_t srcColumns, uint8_t srcRows, uint8_t dstColumns,
+        // uint8_t dstRows
+        kMatrixToMatrix,
+        // Register target, Register src1, Register src2
+        kMultiplyF,
+        // Register target, Register src1, Register src2
+        kMultiplyI,
+        // Register target, Register src
+        kNegateF,
+        // Register target, Register src
+        kNegateS,
+        // Register target, Register src
+        kNot,
+        // Register target, Register src1, Register src2
+        kOr,
+        // Register src
+        kPrint,
+        // Register target, uint8_t count, uint8_t index
+        kReadExternal,
+        // Register target, Register src1, Register src2
+        kRemainderF,
+        // Register target, Register src1, Register src2
+        kRemainderS,
+        // Register target, Register src1, Register src2
+        kRemainderU,
+        // no parameters
+        kReturn,
+        // Register value
+        kReturnValue,
+        // Register target, Register src, uint8_t columns, uint8_t rows
+        kScalarToMatrix,
+        // Register target, Register test, Register ifTrue, Register ifFalse
+        kSelect,
+        // Register target, Register src, uint8_t count
+        kShiftLeft,
+        // Register target, Register src, uint8_t count
+        kShiftRightS,
+        // Register target, Register src, uint8_t count
+        kShiftRightU,
+        // Register target, Register src
+        kSignedToFloat,
+        // Register target, Register src
+        kSin,
+        // Register target, Register src
+        kSqrt,
+        // Store to the memory cell pointed to by dstPtr
+        // Register dstPtr, Register src
+        kStore,
+        // Store to the memory cell pointed to by dst
+        // Pointer dst, Register src
+        kStoreDirect,
+        // Store to the parameter slot pointed to by dstPtr
+        // Register dstPtr, Register src
+        kStoreParameter,
+        // Store to the parameter slot pointed to by dst
+        // Pointer dst, Register src
+        kStoreParameterDirect,
+        // Stores a register into the stack cell pointed to by dstPtr + sp
+        // Register dstPtr, Register src
+        kStoreStack,
+        // Stores a register into the stack cell pointed to by dst + sp
+        // Pointer dst, Register src
+        kStoreStackDirect,
+        // Register target, Register src1, Register src2
+        kSubtractF,
+        // Register target, Register src1, Register src2
+        kSubtractI,
+        // Register target, Register src
+        kTan,
+        // Register target, Register src
+        kUnsignedToFloat,
+        // uint8_t index, uint8_t count, Register src
+        kWriteExternal,
+        // Register target, Register src1, Register src2
+        kXor,
+    };
+
+
+    // Compound values like vectors span multiple Registers or Pointer addresses. We always refer to
+    // them by the address of their first slot, so for instance if you add two float4's together,
+    // the resulting Register contains the first channel of the result, with the other three
+    // channels following in the next three Registers.
+
+    struct Register {
+        uint16_t fIndex;
+
+        Register operator+(uint16_t offset) const {
+            return Register{(uint16_t) (fIndex + offset)};
+        }
+    };
+
+    struct Pointer {
+        uint16_t fAddress;
+
+        Pointer operator+(uint16_t offset) const {
+            return Pointer{(uint16_t) (fAddress + offset)};
+        }
+    };
+
+    union Immediate {
+        float fFloat;
+        int32_t fInt;
+        uint32_t fUInt;
+
+        Immediate() {}
+
+        Immediate(float f)
+            : fFloat(f) {}
+
+        Immediate(int32_t i)
+            : fInt(i) {}
+
+        Immediate(uint32_t u)
+            : fUInt(u) {}
+    };
+
+    static constexpr int kPointerMax = 65535;
+    static constexpr int kRegisterMax = 65535;
 
     const ByteCodeFunction* getFunction(const char* name) const {
         for (const auto& f : fFunctions) {
@@ -233,36 +337,9 @@
         return nullptr;
     }
 
-    /**
-     * Invokes the specified function once, with the given arguments.
-     * 'args', 'outReturn', and 'uniforms' are collections of 32-bit values (typically floats,
-     * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
-     * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
-     * The return value is stored in 'outReturn' (may be null, to discard the return value).
-     * 'uniforms' are mapped to 'uniform' globals, in order.
-     */
-    bool SKSL_WARN_UNUSED_RESULT run(const ByteCodeFunction*,
-                                     float* args, int argCount,
-                                     float* outReturn, int returnCount,
-                                     const float* uniforms, int uniformCount) const;
-
-    /**
-     * Invokes the specified function with the given arguments, 'N' times. 'args' and 'outReturn'
-     * are accepted and returned in structure-of-arrays form:
-     *   args[0] points to an array of N values, the first argument for each invocation
-     *   ...
-     *   args[argCount - 1] points to an array of N values, the last argument for each invocation
-     *
-     * All values in 'args', 'outReturn', and 'uniforms' are 32-bit values (typically floats,
-     * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
-     * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
-     * The return value is stored in 'outReturn' (may be null, to discard the return value).
-     * 'uniforms' are mapped to 'uniform' globals, in order.
-     */
-    bool SKSL_WARN_UNUSED_RESULT runStriped(const ByteCodeFunction*, int N,
-                                            float* args[], int argCount,
-                                            float* outReturn[], int returnCount,
-                                            const float* uniforms, int uniformCount) const;
+    int getGlobalSlotCount() const {
+        return fGlobalSlotCount;
+    }
 
     struct Uniform {
         SkSL::String fName;
@@ -285,20 +362,19 @@
     const Uniform& getUniform(int i) const { return fUniforms[i]; }
 
 private:
-    ByteCode(const ByteCode&) = delete;
-    ByteCode& operator=(const ByteCode&) = delete;
+    std::vector<std::unique_ptr<ByteCodeFunction>> fFunctions;
+    std::vector<ExternalValue*> fExternalValues;
 
-    friend class ByteCodeGenerator;
-    friend struct Interpreter;
+    int fGlobalSlotCount;
 
-    int fGlobalSlotCount = 0;
     int fUniformSlotCount = 0;
     std::vector<Uniform> fUniforms;
 
-    std::vector<std::unique_ptr<ByteCodeFunction>> fFunctions;
-    std::vector<ExternalValue*> fExternalValues;
+    friend class ByteCodeGenerator;
+    template<int width>
+    friend class Interpreter;
 };
 
-}
+} // namespace
 
 #endif
diff --git a/src/sksl/SkSLByteCodeGenerator.cpp b/src/sksl/SkSLByteCodeGenerator.cpp
index 36a1338..75b3d22 100644
--- a/src/sksl/SkSLByteCodeGenerator.cpp
+++ b/src/sksl/SkSLByteCodeGenerator.cpp
@@ -7,50 +7,21 @@
 
 #include "src/sksl/SkSLByteCodeGenerator.h"
 
-#include <algorithm>
-
 namespace SkSL {
 
-static TypeCategory type_category(const Type& type) {
-    switch (type.kind()) {
-        case Type::Kind::kVector_Kind:
-        case Type::Kind::kMatrix_Kind:
-            return type_category(type.componentType());
-        default:
-            if (type.fName == "bool") {
-                return TypeCategory::kBool;
-            } else if (type.fName == "int" ||
-                       type.fName == "short" ||
-                       type.fName == "$intLiteral") {
-                return TypeCategory::kSigned;
-            } else if (type.fName == "uint" ||
-                       type.fName == "ushort") {
-                return TypeCategory::kUnsigned;
-            } else {
-                SkASSERT(type.fName == "float" ||
-                         type.fName == "half" ||
-                         type.fName == "$floatLiteral");
-                return TypeCategory::kFloat;
-            }
-            ABORT("unsupported type: %s\n", type.displayName().c_str());
-    }
-}
-
-
-ByteCodeGenerator::ByteCodeGenerator(const Context* context, const Program* program, ErrorReporter* errors,
-                  ByteCode* output)
+ByteCodeGenerator::ByteCodeGenerator(const Program* program, ErrorReporter* errors,
+                                     ByteCode* output)
     : INHERITED(program, errors, nullptr)
-    , fContext(*context)
     , fOutput(output)
     , fIntrinsics {
-        { "cos",     ByteCodeInstruction::kCos },
+        { "cos",     ByteCode::Instruction::kCos },
         { "dot",     SpecialIntrinsic::kDot },
-        { "inverse", ByteCodeInstruction::kInverse2x2 },
-        { "sin",     ByteCodeInstruction::kSin },
-        { "sqrt",    ByteCodeInstruction::kSqrt },
-        { "tan",     ByteCodeInstruction::kTan },
-      } {}
-
+        { "inverse", SpecialIntrinsic::kInverse },
+        { "print",   ByteCode::Instruction::kPrint },
+        { "sin",     ByteCode::Instruction::kSin },
+        { "sqrt",    ByteCode::Instruction::kSqrt },
+        { "tan",     ByteCode::Instruction::kTan },
+    } {}
 
 int ByteCodeGenerator::SlotCount(const Type& type) {
     if (type.kind() == Type::kOther_Kind) {
@@ -80,89 +51,73 @@
 static inline bool is_in(const SkSL::Variable& var) {
     return var.fModifiers.fFlags & Modifiers::kIn_Flag;
 }
-
-void ByteCodeGenerator::gatherUniforms(const Type& type, const String& name) {
-    if (type.kind() == Type::kOther_Kind) {
-        return;
-    } else if (type.kind() == Type::kStruct_Kind) {
-        for (const auto& f : type.fields()) {
-            this->gatherUniforms(*f.fType, name + "." + f.fName);
-        }
-    } else if (type.kind() == Type::kArray_Kind) {
-        for (int i = 0; i < type.columns(); ++i) {
-            this->gatherUniforms(type.componentType(), String::printf("%s[%d]", name.c_str(), i));
-        }
-    } else {
-        fOutput->fUniforms.push_back({ name, type_category(type), type.rows(), type.columns(),
-                                       fOutput->fUniformSlotCount });
-        fOutput->fUniformSlotCount += type.columns() * type.rows();
-    }
-}
-
-bool ByteCodeGenerator::generateCode() {
-    for (const auto& e : fProgram) {
-        switch (e.fKind) {
-            case ProgramElement::kFunction_Kind: {
-                std::unique_ptr<ByteCodeFunction> f = this->writeFunction((FunctionDefinition&) e);
-                if (!f) {
-                    return false;
+ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Variable& var) {
+    // given that we seldom have more than a couple of variables, linear search is probably the most
+    // efficient way to handle lookups
+    switch (var.fStorage) {
+        case Variable::kLocal_Storage: {
+            for (int i = fLocals.size() - 1; i >= 0; --i) {
+                if (fLocals[i] == &var) {
+                    return ByteCode::Pointer{(uint16_t) (i + fParameterCount)};
                 }
-                fOutput->fFunctions.push_back(std::move(f));
-                fFunctions.push_back(&(FunctionDefinition&)e);
-                break;
             }
-            case ProgramElement::kVar_Kind: {
-                VarDeclarations& decl = (VarDeclarations&) e;
-                for (const auto& v : decl.fVars) {
-                    const Variable* declVar = ((VarDeclaration&) *v).fVar;
-                    if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
-                        continue;
-                    }
-                    if (is_uniform(*declVar)) {
-                        this->gatherUniforms(declVar->fType, declVar->fName);
-                    } else {
-                        fOutput->fGlobalSlotCount += SlotCount(declVar->fType);
+            int result = fLocals.size() + fParameterCount;
+            fLocals.push_back(&var);
+            for (int i = 0; i < SlotCount(var.fType) - 1; ++i) {
+                fLocals.push_back(nullptr);
+            }
+            SkASSERT(result <= ByteCode::kPointerMax);
+            return ByteCode::Pointer{(uint16_t) result};
+        }
+        case Variable::kParameter_Storage: {
+            int offset = 0;
+            for (const auto& p : fFunction->fDeclaration.fParameters) {
+                if (p == &var) {
+                    SkASSERT(offset <= ByteCode::kPointerMax);
+                    return ByteCode::Pointer{(uint16_t) offset};
+                }
+                offset += SlotCount(p->fType);
+            }
+            SkASSERT(false);
+            return ByteCode::Pointer{0};
+        }
+        case Variable::kGlobal_Storage: {
+            if (is_in(var)) {
+                // If you trip this assert, it means the program is using raw 'in' variables. You
+                // should either specialize the program (Compiler::specialize) to bake in the final
+                // values of the 'in' variables, or not use 'in' variables (maybe you meant to use
+                // 'uniform' instead?).
+                SkASSERT(false);
+                return ByteCode::Pointer{0};
+            }
+            bool isUniform = is_uniform(var);
+            int offset = isUniform ? fOutput->getGlobalSlotCount() : 0;
+            for (const auto& e : fProgram) {
+                if (e.fKind == ProgramElement::kVar_Kind) {
+                    VarDeclarations& decl = (VarDeclarations&) e;
+                    for (const auto& v : decl.fVars) {
+                        const Variable* declVar = ((VarDeclaration&) *v).fVar;
+                        if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
+                            continue;
+                        }
+                        if (isUniform != is_uniform(*declVar)) {
+                            continue;
+                        }
+                        if (declVar == &var) {
+                            SkASSERT(offset <= ByteCode::kPointerMax);
+                            return ByteCode::Pointer{(uint16_t) offset};
+                        }
+                        offset += SlotCount(declVar->fType);
                     }
                 }
-                break;
             }
-            default:
-                ; // ignore
+            SkASSERT(false);
+            return ByteCode::Pointer{0};
         }
+        default:
+            SkASSERT(false);
+            return ByteCode::Pointer{0};
     }
-    return 0 == fErrors.errorCount();
-}
-
-std::unique_ptr<ByteCodeFunction> ByteCodeGenerator::writeFunction(const FunctionDefinition& f) {
-    fFunction = &f;
-    std::unique_ptr<ByteCodeFunction> result(new ByteCodeFunction(&f.fDeclaration));
-    fParameterCount = result->fParameterCount;
-    fLoopCount = fMaxLoopCount = 0;
-    fConditionCount = fMaxConditionCount = 0;
-    fStackCount = fMaxStackCount = 0;
-    fCode = &result->fCode;
-
-    this->writeStatement(*f.fBody);
-    if (0 == fErrors.errorCount()) {
-        SkASSERT(fLoopCount == 0);
-        SkASSERT(fConditionCount == 0);
-        SkASSERT(fStackCount == 0);
-    }
-    this->write(ByteCodeInstruction::kReturn, 0);
-    this->write8(0);
-
-    result->fLocalCount     = fLocals.size();
-    result->fConditionCount = fMaxConditionCount;
-    result->fLoopCount      = fMaxLoopCount;
-    result->fStackCount     = fMaxStackCount;
-
-    const Type& returnType = f.fDeclaration.fReturnType;
-    if (returnType != *fContext.fVoid_Type) {
-        result->fReturnCount = SlotCount(returnType);
-    }
-    fLocals.clear();
-    fFunction = nullptr;
-    return result;
 }
 
 // A "simple" Swizzle is based on a variable (or a compound variable like a struct or array), and
@@ -186,1273 +141,240 @@
     return true;
 }
 
-int ByteCodeGenerator::StackUsage(ByteCodeInstruction inst, int count_) {
-    // Ensures that we use count iff we're passed a non-default value. Most instructions have an
-    // implicit count, so the caller shouldn't need to worry about it (or count makes no sense).
-    // The asserts avoids callers thinking they're supplying useful information in that scenario,
-    // or failing to supply necessary information for the ops that need a count.
-    struct CountValue {
-        operator int() {
-            SkASSERT(val != ByteCodeGenerator::kUnusedStackCount);
-            SkDEBUGCODE(used = true);
-            return val;
-        }
-        ~CountValue() {
-            SkASSERT(used || val == ByteCodeGenerator::kUnusedStackCount);
-        }
-        int val;
-        SkDEBUGCODE(bool used = false;)
-    } count = { count_ };
-
-    switch (inst) {
-        // Unary functions/operators that don't change stack depth at all:
-#define VECTOR_UNARY_OP(base)                \
-        case ByteCodeInstruction::base:      \
-        case ByteCodeInstruction::base ## 2: \
-        case ByteCodeInstruction::base ## 3: \
-        case ByteCodeInstruction::base ## 4: \
-            return 0;
-
-        VECTOR_UNARY_OP(kConvertFtoI)
-        VECTOR_UNARY_OP(kConvertStoF)
-        VECTOR_UNARY_OP(kConvertUtoF)
-
-        VECTOR_UNARY_OP(kCos)
-        VECTOR_UNARY_OP(kSin)
-        VECTOR_UNARY_OP(kSqrt)
-        VECTOR_UNARY_OP(kTan)
-
-        VECTOR_UNARY_OP(kNegateF)
-        VECTOR_UNARY_OP(kNegateI)
-
-        case ByteCodeInstruction::kInverse2x2:
-        case ByteCodeInstruction::kInverse3x3:
-        case ByteCodeInstruction::kInverse4x4: return 0;
-
-        case ByteCodeInstruction::kClampIndex: return 0;
-        case ByteCodeInstruction::kNotB: return 0;
-        case ByteCodeInstruction::kNegateFN: return 0;
-        case ByteCodeInstruction::kShiftLeft: return 0;
-        case ByteCodeInstruction::kShiftRightS: return 0;
-        case ByteCodeInstruction::kShiftRightU: return 0;
-
-#undef VECTOR_UNARY_OP
-
-        // Binary functions/operators that do a 2 -> 1 reduction (possibly N times)
-#define VECTOR_BINARY_OP(base)                          \
-        case ByteCodeInstruction::base:      return -1; \
-        case ByteCodeInstruction::base ## 2: return -2; \
-        case ByteCodeInstruction::base ## 3: return -3; \
-        case ByteCodeInstruction::base ## 4: return -4;
-
-#define VECTOR_MATRIX_BINARY_OP(base)                   \
-        VECTOR_BINARY_OP(base)                          \
-        case ByteCodeInstruction::base ## N: return -count;
-
-        case ByteCodeInstruction::kAndB: return -1;
-        case ByteCodeInstruction::kOrB:  return -1;
-        case ByteCodeInstruction::kXorB: return -1;
-
-        VECTOR_BINARY_OP(kAddI)
-        VECTOR_MATRIX_BINARY_OP(kAddF)
-
-        VECTOR_BINARY_OP(kCompareIEQ)
-        VECTOR_MATRIX_BINARY_OP(kCompareFEQ)
-        VECTOR_BINARY_OP(kCompareINEQ)
-        VECTOR_MATRIX_BINARY_OP(kCompareFNEQ)
-        VECTOR_BINARY_OP(kCompareSGT)
-        VECTOR_BINARY_OP(kCompareUGT)
-        VECTOR_BINARY_OP(kCompareFGT)
-        VECTOR_BINARY_OP(kCompareSGTEQ)
-        VECTOR_BINARY_OP(kCompareUGTEQ)
-        VECTOR_BINARY_OP(kCompareFGTEQ)
-        VECTOR_BINARY_OP(kCompareSLT)
-        VECTOR_BINARY_OP(kCompareULT)
-        VECTOR_BINARY_OP(kCompareFLT)
-        VECTOR_BINARY_OP(kCompareSLTEQ)
-        VECTOR_BINARY_OP(kCompareULTEQ)
-        VECTOR_BINARY_OP(kCompareFLTEQ)
-
-        VECTOR_BINARY_OP(kDivideS)
-        VECTOR_BINARY_OP(kDivideU)
-        VECTOR_MATRIX_BINARY_OP(kDivideF)
-        VECTOR_BINARY_OP(kMultiplyI)
-        VECTOR_MATRIX_BINARY_OP(kMultiplyF)
-        VECTOR_BINARY_OP(kRemainderF)
-        VECTOR_BINARY_OP(kRemainderS)
-        VECTOR_BINARY_OP(kRemainderU)
-        VECTOR_BINARY_OP(kSubtractI)
-        VECTOR_MATRIX_BINARY_OP(kSubtractF)
-
-#undef VECTOR_BINARY_OP
-#undef VECTOR_MATRIX_BINARY_OP
-
-        // Ops that push or load data to grow the stack:
-        case ByteCodeInstruction::kDup:
-        case ByteCodeInstruction::kLoad:
-        case ByteCodeInstruction::kLoadGlobal:
-        case ByteCodeInstruction::kLoadUniform:
-        case ByteCodeInstruction::kReadExternal:
-        case ByteCodeInstruction::kPushImmediate:
-            return 1;
-
-        case ByteCodeInstruction::kDup2:
-        case ByteCodeInstruction::kLoad2:
-        case ByteCodeInstruction::kLoadGlobal2:
-        case ByteCodeInstruction::kLoadUniform2:
-        case ByteCodeInstruction::kReadExternal2:
-            return 2;
-
-        case ByteCodeInstruction::kDup3:
-        case ByteCodeInstruction::kLoad3:
-        case ByteCodeInstruction::kLoadGlobal3:
-        case ByteCodeInstruction::kLoadUniform3:
-        case ByteCodeInstruction::kReadExternal3:
-            return 3;
-
-        case ByteCodeInstruction::kDup4:
-        case ByteCodeInstruction::kLoad4:
-        case ByteCodeInstruction::kLoadGlobal4:
-        case ByteCodeInstruction::kLoadUniform4:
-        case ByteCodeInstruction::kReadExternal4:
-            return 4;
-
-        case ByteCodeInstruction::kDupN:
-        case ByteCodeInstruction::kLoadSwizzle:
-        case ByteCodeInstruction::kLoadSwizzleGlobal:
-        case ByteCodeInstruction::kLoadSwizzleUniform:
-            return count;
-
-        // Pushes 'count' values, minus one for the 'address' that's consumed first
-        case ByteCodeInstruction::kLoadExtended:
-        case ByteCodeInstruction::kLoadExtendedGlobal:
-        case ByteCodeInstruction::kLoadExtendedUniform:
-            return count - 1;
-
-        // Ops that pop or store data to shrink the stack:
-        case ByteCodeInstruction::kPop:
-        case ByteCodeInstruction::kStore:
-        case ByteCodeInstruction::kStoreGlobal:
-        case ByteCodeInstruction::kWriteExternal:
-            return -1;
-
-        case ByteCodeInstruction::kPop2:
-        case ByteCodeInstruction::kStore2:
-        case ByteCodeInstruction::kStoreGlobal2:
-        case ByteCodeInstruction::kWriteExternal2:
-            return -2;
-
-        case ByteCodeInstruction::kPop3:
-        case ByteCodeInstruction::kStore3:
-        case ByteCodeInstruction::kStoreGlobal3:
-        case ByteCodeInstruction::kWriteExternal3:
-            return -3;
-
-        case ByteCodeInstruction::kPop4:
-        case ByteCodeInstruction::kStore4:
-        case ByteCodeInstruction::kStoreGlobal4:
-        case ByteCodeInstruction::kWriteExternal4:
-            return -4;
-
-        case ByteCodeInstruction::kPopN:
-        case ByteCodeInstruction::kStoreSwizzle:
-        case ByteCodeInstruction::kStoreSwizzleGlobal:
-            return -count;
-
-        // Consumes 'count' values, plus one for the 'address'
-        case ByteCodeInstruction::kStoreExtended:
-        case ByteCodeInstruction::kStoreExtendedGlobal:
-        case ByteCodeInstruction::kStoreSwizzleIndirect:
-        case ByteCodeInstruction::kStoreSwizzleIndirectGlobal:
-            return -count - 1;
-
-        // Strange ops where the caller computes the delta for us:
-        case ByteCodeInstruction::kCallExternal:
-        case ByteCodeInstruction::kMatrixToMatrix:
-        case ByteCodeInstruction::kMatrixMultiply:
-        case ByteCodeInstruction::kReserve:
-        case ByteCodeInstruction::kReturn:
-        case ByteCodeInstruction::kScalarToMatrix:
-        case ByteCodeInstruction::kSwizzle:
-            return count;
-
-        // Miscellaneous
-
-        // kCall is net-zero. Max stack depth is adjusted in writeFunctionCall.
-        case ByteCodeInstruction::kCall:             return 0;
-        case ByteCodeInstruction::kBranch:           return 0;
-        case ByteCodeInstruction::kBranchIfAllFalse: return 0;
-
-        case ByteCodeInstruction::kMaskPush:         return -1;
-        case ByteCodeInstruction::kMaskPop:          return 0;
-        case ByteCodeInstruction::kMaskNegate:       return 0;
-        case ByteCodeInstruction::kMaskBlend:        return -count;
-
-        case ByteCodeInstruction::kLoopBegin:        return 0;
-        case ByteCodeInstruction::kLoopNext:         return 0;
-        case ByteCodeInstruction::kLoopMask:         return -1;
-        case ByteCodeInstruction::kLoopEnd:          return 0;
-        case ByteCodeInstruction::kLoopBreak:        return 0;
-        case ByteCodeInstruction::kLoopContinue:     return 0;
-
-        default:
-            ABORT("unsupported instruction %d\n", (int)inst);
-            return 0;
-    }
-}
-
-ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Variable& var) {
-    // given that we seldom have more than a couple of variables, linear search is probably the most
-    // efficient way to handle lookups
-    switch (var.fStorage) {
-        case Variable::kLocal_Storage: {
-            for (int i = fLocals.size() - 1; i >= 0; --i) {
-                if (fLocals[i] == &var) {
-                    SkASSERT(fParameterCount + i <= 255);
-                    return { fParameterCount + i, Storage::kLocal };
-                }
-            }
-            int result = fParameterCount + fLocals.size();
-            fLocals.push_back(&var);
-            for (int i = 0; i < SlotCount(var.fType) - 1; ++i) {
-                fLocals.push_back(nullptr);
-            }
-            SkASSERT(result <= 255);
-            return { result, Storage::kLocal };
-        }
-        case Variable::kParameter_Storage: {
-            int offset = 0;
-            for (const auto& p : fFunction->fDeclaration.fParameters) {
-                if (p == &var) {
-                    SkASSERT(offset <= 255);
-                    return { offset, Storage::kLocal };
-                }
-                offset += SlotCount(p->fType);
-            }
-            SkASSERT(false);
-            return Location::MakeInvalid();
-        }
-        case Variable::kGlobal_Storage: {
-            if (is_in(var)) {
-                // If you trip this assert, it means the program is using raw 'in' variables. You
-                // should either specialize the program (Compiler::specialize) to bake in the final
-                // values of the 'in' variables, or not use 'in' variables (maybe you meant to use
-                // 'uniform' instead?).
-                SkASSERT(false);
-                return Location::MakeInvalid();
-            }
-            int offset = 0;
-            bool isUniform = is_uniform(var);
-            for (const auto& e : fProgram) {
-                if (e.fKind == ProgramElement::kVar_Kind) {
-                    VarDeclarations& decl = (VarDeclarations&) e;
-                    for (const auto& v : decl.fVars) {
-                        const Variable* declVar = ((VarDeclaration&) *v).fVar;
-                        if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
-                            continue;
-                        }
-                        if (isUniform != is_uniform(*declVar)) {
-                            continue;
-                        }
-                        if (declVar == &var) {
-                            SkASSERT(offset <= 255);
-                            return  { offset, isUniform ? Storage::kUniform : Storage::kGlobal };
-                        }
-                        offset += SlotCount(declVar->fType);
-                    }
-                }
-            }
-            SkASSERT(false);
-            return Location::MakeInvalid();
-        }
-        default:
-            SkASSERT(false);
-            return Location::MakeInvalid();
-    }
-}
-
 ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Expression& expr) {
     switch (expr.fKind) {
         case Expression::kFieldAccess_Kind: {
-            const FieldAccess& f = (const FieldAccess&)expr;
-            Location baseLoc = this->getLocation(*f.fBase);
+            const FieldAccess& f = (const FieldAccess&) expr;
+            Location result = this->getLocation(*f.fBase);
             int offset = 0;
             for (int i = 0; i < f.fFieldIndex; ++i) {
                 offset += SlotCount(*f.fBase->fType.fields()[i].fType);
             }
-            if (baseLoc.isOnStack()) {
-                if (offset != 0) {
-                    this->write(ByteCodeInstruction::kPushImmediate);
-                    this->write32(offset);
-                    this->write(ByteCodeInstruction::kAddI);
-                    this->write8(1);
-                }
-                return baseLoc;
-            } else {
-                return baseLoc + offset;
-            }
+            return result.offset(*this, offset);
         }
         case Expression::kIndex_Kind: {
-            const IndexExpression& i = (const IndexExpression&)expr;
-            int stride = SlotCount(i.fType);
-            int length = i.fBase->fType.columns();
-            SkASSERT(length <= 255);
-            int offset = -1;
-            if (i.fIndex->isConstant()) {
-                int64_t index = i.fIndex->getConstantInt();
+            const IndexExpression& idx = (const IndexExpression&) expr;
+            int stride = SlotCount(idx.fType);
+            int length = idx.fBase->fType.columns();
+            Location result = this->getLocation(*idx.fBase);
+            if (idx.fIndex->isConstant()) {
+                int64_t index = idx.fIndex->getConstantInt();
                 if (index < 0 || index >= length) {
-                    fErrors.error(i.fIndex->fOffset, "Array index out of bounds.");
-                    return Location::MakeInvalid();
+                    fErrors.error(idx.fIndex->fOffset, "Array index out of bounds");
+                    return result;
                 }
-                offset = index * stride;
+                return result.offset(*this, index * stride);
             } else {
-                if (i.fIndex->hasSideEffects()) {
-                    // Having a side-effect in an indexer is technically safe for an rvalue,
-                    // but with lvalues we have to evaluate the indexer twice, so make it an error.
-                    fErrors.error(i.fIndex->fOffset,
-                            "Index expressions with side-effects not supported in byte code.");
-                    return Location::MakeInvalid();
-                }
-                this->writeExpression(*i.fIndex);
-                this->write(ByteCodeInstruction::kClampIndex);
-                this->write8(length);
-                if (stride != 1) {
-                    this->write(ByteCodeInstruction::kPushImmediate);
-                    this->write32(stride);
-                    this->write(ByteCodeInstruction::kMultiplyI);
-                    this->write8(1);
-                }
+                ByteCode::Register index = this->next(1);
+                this->writeExpression(*idx.fIndex, index);
+                this->write(ByteCode::Instruction::kBoundsCheck);
+                this->write(index);
+                this->write(length);
+                ByteCode::Register imm = this->next(1);
+                this->write(ByteCode::Instruction::kImmediate);
+                this->write(imm);
+                this->write(ByteCode::Immediate{stride});
+                ByteCode::Register offset = this->next(1);
+                this->write(ByteCode::Instruction::kMultiplyI);
+                this->write(offset);
+                this->write(index);
+                this->write(imm);
+                return result.offset(*this, offset);
             }
-            Location baseLoc = this->getLocation(*i.fBase);
-
-            // Are both components known statically?
-            if (!baseLoc.isOnStack() && offset >= 0) {
-                return baseLoc + offset;
-            }
-
-            // At least one component is dynamic (and on the stack).
-
-            // If the other component is zero, we're done
-            if (baseLoc.fSlot == 0 || offset == 0) {
-                return baseLoc.makeOnStack();
-            }
-
-            // Push the non-dynamic component (if any) to the stack, then add the two
-            if (!baseLoc.isOnStack()) {
-                this->write(ByteCodeInstruction::kPushImmediate);
-                this->write32(baseLoc.fSlot);
-            }
-            if (offset >= 0) {
-                this->write(ByteCodeInstruction::kPushImmediate);
-                this->write32(offset);
-            }
-            this->write(ByteCodeInstruction::kAddI);
-            this->write8(1);
-            return baseLoc.makeOnStack();
         }
         case Expression::kSwizzle_Kind: {
-            const Swizzle& s = (const Swizzle&)expr;
+            const Swizzle& s = (const Swizzle&) expr;
             SkASSERT(swizzle_is_simple(s));
-            Location baseLoc = this->getLocation(*s.fBase);
-            int offset = s.fComponents[0];
-            if (baseLoc.isOnStack()) {
-                if (offset != 0) {
-                    this->write(ByteCodeInstruction::kPushImmediate);
-                    this->write32(offset);
-                    this->write(ByteCodeInstruction::kAddI);
-                    this->write8(1);
-                }
-                return baseLoc;
-            } else {
-                return baseLoc + offset;
-            }
+            return this->getLocation(*s.fBase).offset(*this, s.fComponents[0]);
         }
         case Expression::kVariableReference_Kind: {
-            const Variable& var = ((const VariableReference&)expr).fVariable;
+            const Variable& var = ((const VariableReference&) expr).fVariable;
             return this->getLocation(var);
         }
         default:
             SkASSERT(false);
-            return Location::MakeInvalid();
+            return ByteCode::Pointer{0};
     }
 }
 
-void ByteCodeGenerator::write8(uint8_t b) {
-    fCode->push_back(b);
-}
-
-void ByteCodeGenerator::write16(uint16_t i) {
-    size_t n = fCode->size();
-    fCode->resize(n+2);
-    memcpy(fCode->data() + n, &i, 2);
-}
-
-void ByteCodeGenerator::write32(uint32_t i) {
-    size_t n = fCode->size();
-    fCode->resize(n+4);
-    memcpy(fCode->data() + n, &i, 4);
-}
-
-void ByteCodeGenerator::write(ByteCodeInstruction i, int count) {
-    switch (i) {
-        case ByteCodeInstruction::kLoopBegin: this->enterLoop();      break;
-        case ByteCodeInstruction::kLoopEnd:   this->exitLoop();       break;
-
-        case ByteCodeInstruction::kMaskPush:  this->enterCondition(); break;
-        case ByteCodeInstruction::kMaskPop:
-        case ByteCodeInstruction::kMaskBlend: this->exitCondition();  break;
-        default: /* Do nothing */ break;
-    }
-    instruction val = (instruction) i;
-    size_t n = fCode->size();
-    fCode->resize(n + sizeof(val));
-    memcpy(fCode->data() + n, &val, sizeof(val));
-    fStackCount += StackUsage(i, count);
-    fMaxStackCount = std::max(fMaxStackCount, fStackCount);
-}
-
-static ByteCodeInstruction vector_instruction(ByteCodeInstruction base, int count) {
-    SkASSERT(count >= 1 && count <= 4);
-    return ((ByteCodeInstruction) ((int) base + 1 - count));
-}
-
-void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCodeInstruction s,
-                                              ByteCodeInstruction u, ByteCodeInstruction f,
-                                              int count, bool writeCount) {
-    switch (type_category(type)) {
-        case TypeCategory::kSigned:
-            this->write(vector_instruction(s, count));
-            break;
-        case TypeCategory::kUnsigned:
-            this->write(vector_instruction(u, count));
-            break;
-        case TypeCategory::kFloat: {
-            if (count > 4) {
-                this->write((ByteCodeInstruction)((int)f + 1), count);
-            } else {
-                this->write(vector_instruction(f, count));
-            }
-            break;
+Variable::Storage ByteCodeGenerator::getStorage(const Expression& expr) {
+    switch (expr.fKind) {
+        case Expression::kFieldAccess_Kind: {
+            const FieldAccess& f = (const FieldAccess&) expr;
+            return this->getStorage(*f.fBase);
         }
-        default:
-            SkASSERT(false);
-    }
-    if (writeCount) {
-        this->write8(count);
-    }
-}
-
-bool ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b, bool discard) {
-    if (b.fOperator == Token::Kind::EQ) {
-        std::unique_ptr<LValue> lvalue = this->getLValue(*b.fLeft);
-        this->writeExpression(*b.fRight);
-        lvalue->store(discard);
-        discard = false;
-        return discard;
-    }
-    const Type& lType = b.fLeft->fType;
-    const Type& rType = b.fRight->fType;
-    bool lVecOrMtx = (lType.kind() == Type::kVector_Kind || lType.kind() == Type::kMatrix_Kind);
-    bool rVecOrMtx = (rType.kind() == Type::kVector_Kind || rType.kind() == Type::kMatrix_Kind);
-    Token::Kind op;
-    std::unique_ptr<LValue> lvalue;
-    if (is_assignment(b.fOperator)) {
-        lvalue = this->getLValue(*b.fLeft);
-        lvalue->load();
-        op = remove_assignment(b.fOperator);
-    } else {
-        this->writeExpression(*b.fLeft);
-        op = b.fOperator;
-        if (!lVecOrMtx && rVecOrMtx) {
-            for (int i = SlotCount(rType); i > 1; --i) {
-                this->write(ByteCodeInstruction::kDup);
-                this->write8(1);
-            }
+        case Expression::kIndex_Kind: {
+            const IndexExpression& idx = (const IndexExpression&) expr;
+            return this->getStorage(*idx.fBase);
         }
-    }
-    int count = std::max(SlotCount(lType), SlotCount(rType));
-    SkDEBUGCODE(TypeCategory tc = type_category(lType));
-    switch (op) {
-        case Token::Kind::LOGICALAND: {
-            SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
-            this->write(ByteCodeInstruction::kDup);
-            this->write8(1);
-            this->write(ByteCodeInstruction::kMaskPush);
-            this->write(ByteCodeInstruction::kBranchIfAllFalse);
-            DeferredLocation falseLocation(this);
-            this->writeExpression(*b.fRight);
-            this->write(ByteCodeInstruction::kAndB);
-            falseLocation.set();
-            this->write(ByteCodeInstruction::kMaskPop);
-            return false;
+        case Expression::kSwizzle_Kind: {
+            const Swizzle& s = (const Swizzle&) expr;
+            return this->getStorage(*s.fBase);
         }
-        case Token::Kind::LOGICALOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
-            this->write(ByteCodeInstruction::kDup);
-            this->write8(1);
-            this->write(ByteCodeInstruction::kNotB);
-            this->write(ByteCodeInstruction::kMaskPush);
-            this->write(ByteCodeInstruction::kBranchIfAllFalse);
-            DeferredLocation falseLocation(this);
-            this->writeExpression(*b.fRight);
-            this->write(ByteCodeInstruction::kOrB);
-            falseLocation.set();
-            this->write(ByteCodeInstruction::kMaskPop);
-            return false;
-        }
-        case Token::Kind::SHL:
-        case Token::Kind::SHR: {
-            SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
-                                    tc == SkSL::TypeCategory::kUnsigned));
-            if (!b.fRight->isConstant()) {
-                fErrors.error(b.fRight->fOffset, "Shift amounts must be constant");
-                return false;
-            }
-            int64_t shift = b.fRight->getConstantInt();
-            if (shift < 0 || shift > 31) {
-                fErrors.error(b.fRight->fOffset, "Shift amount out of range");
-                return false;
-            }
-
-            if (op == Token::Kind::SHL) {
-                this->write(ByteCodeInstruction::kShiftLeft);
-            } else {
-                this->write(type_category(lType) == TypeCategory::kSigned
-                                ? ByteCodeInstruction::kShiftRightS
-                                : ByteCodeInstruction::kShiftRightU);
-            }
-            this->write8(shift);
-            return false;
-        }
-
-        default:
-            break;
-    }
-    this->writeExpression(*b.fRight);
-    if (lVecOrMtx && !rVecOrMtx) {
-        for (int i = SlotCount(lType); i > 1; --i) {
-            this->write(ByteCodeInstruction::kDup);
-            this->write8(1);
-        }
-    }
-    // Special case for M*V, V*M, M*M (but not V*V!)
-    if (op == Token::Kind::STAR && lVecOrMtx && rVecOrMtx &&
-        !(lType.kind() == Type::kVector_Kind && rType.kind() == Type::kVector_Kind)) {
-        this->write(ByteCodeInstruction::kMatrixMultiply,
-                    SlotCount(b.fType) - (SlotCount(lType) + SlotCount(rType)));
-        int rCols = rType.columns(),
-            rRows = rType.rows(),
-            lCols = lType.columns(),
-            lRows = lType.rows();
-        // M*V treats the vector as a column
-        if (rType.kind() == Type::kVector_Kind) {
-            std::swap(rCols, rRows);
-        }
-        SkASSERT(lCols == rRows);
-        SkASSERT(SlotCount(b.fType) == lRows * rCols);
-        this->write8(lCols);
-        this->write8(lRows);
-        this->write8(rCols);
-    } else {
-        switch (op) {
-            case Token::Kind::EQEQ:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareIEQ,
-                                            ByteCodeInstruction::kCompareIEQ,
-                                            ByteCodeInstruction::kCompareFEQ,
-                                            count);
-                // Collapse to a single bool
-                for (int i = count; i > 1; --i) {
-                    this->write(ByteCodeInstruction::kAndB);
-                }
-                break;
-            case Token::Kind::GT:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSGT,
-                                            ByteCodeInstruction::kCompareUGT,
-                                            ByteCodeInstruction::kCompareFGT,
-                                            count);
-                break;
-            case Token::Kind::GTEQ:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSGTEQ,
-                                            ByteCodeInstruction::kCompareUGTEQ,
-                                            ByteCodeInstruction::kCompareFGTEQ,
-                                            count);
-                break;
-            case Token::Kind::LT:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSLT,
-                                            ByteCodeInstruction::kCompareULT,
-                                            ByteCodeInstruction::kCompareFLT,
-                                            count);
-                break;
-            case Token::Kind::LTEQ:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSLTEQ,
-                                            ByteCodeInstruction::kCompareULTEQ,
-                                            ByteCodeInstruction::kCompareFLTEQ,
-                                            count);
-                break;
-            case Token::Kind::MINUS:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kSubtractI,
-                                            ByteCodeInstruction::kSubtractI,
-                                            ByteCodeInstruction::kSubtractF,
-                                            count);
-                break;
-            case Token::Kind::NEQ:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareINEQ,
-                                            ByteCodeInstruction::kCompareINEQ,
-                                            ByteCodeInstruction::kCompareFNEQ,
-                                            count);
-                // Collapse to a single bool
-                for (int i = count; i > 1; --i) {
-                    this->write(ByteCodeInstruction::kOrB);
-                }
-                break;
-            case Token::Kind::PERCENT:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kRemainderS,
-                                            ByteCodeInstruction::kRemainderU,
-                                            ByteCodeInstruction::kRemainderF,
-                                            count);
-                break;
-            case Token::Kind::PLUS:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kAddI,
-                                            ByteCodeInstruction::kAddI,
-                                            ByteCodeInstruction::kAddF,
-                                            count);
-                break;
-            case Token::Kind::SLASH:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kDivideS,
-                                            ByteCodeInstruction::kDivideU,
-                                            ByteCodeInstruction::kDivideF,
-                                            count);
-                break;
-            case Token::Kind::STAR:
-                this->writeTypedInstruction(lType, ByteCodeInstruction::kMultiplyI,
-                                            ByteCodeInstruction::kMultiplyI,
-                                            ByteCodeInstruction::kMultiplyF,
-                                            count);
-                break;
-
-            case Token::Kind::LOGICALXOR:
-                SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
-                this->write(ByteCodeInstruction::kXorB);
-                break;
-
-            case Token::Kind::BITWISEAND:
-                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
-                                        tc == SkSL::TypeCategory::kUnsigned));
-                this->write(ByteCodeInstruction::kAndB);
-                break;
-            case Token::Kind::BITWISEOR:
-                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
-                                        tc == SkSL::TypeCategory::kUnsigned));
-                this->write(ByteCodeInstruction::kOrB);
-                break;
-            case Token::Kind::BITWISEXOR:
-                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
-                                        tc == SkSL::TypeCategory::kUnsigned));
-                this->write(ByteCodeInstruction::kXorB);
-                break;
-
-            default:
-                fErrors.error(b.fOffset, SkSL::String::printf("Unsupported binary operator '%s'",
-                                                              Compiler::OperatorName(op)));
-                break;
-        }
-    }
-    if (lvalue) {
-        lvalue->store(discard);
-        discard = false;
-    }
-    return discard;
-}
-
-void ByteCodeGenerator::writeBoolLiteral(const BoolLiteral& b) {
-    this->write(ByteCodeInstruction::kPushImmediate);
-    this->write32(b.fValue ? ~0 : 0);
-}
-
-void ByteCodeGenerator::writeConstructor(const Constructor& c) {
-    for (const auto& arg : c.fArguments) {
-        this->writeExpression(*arg);
-    }
-    if (c.fArguments.size() == 1) {
-        const Type& inType = c.fArguments[0]->fType;
-        const Type& outType = c.fType;
-        TypeCategory inCategory = type_category(inType);
-        TypeCategory outCategory = type_category(outType);
-        int inCount = SlotCount(inType);
-        int outCount = SlotCount(outType);
-        if (inCategory != outCategory) {
-            SkASSERT(inCount == outCount);
-            if (inCategory == TypeCategory::kFloat) {
-                SkASSERT(outCategory == TypeCategory::kSigned ||
-                         outCategory == TypeCategory::kUnsigned);
-                this->write(vector_instruction(ByteCodeInstruction::kConvertFtoI, outCount));
-            } else if (outCategory == TypeCategory::kFloat) {
-                if (inCategory == TypeCategory::kSigned) {
-                    this->write(vector_instruction(ByteCodeInstruction::kConvertStoF, outCount));
-                } else {
-                    SkASSERT(inCategory == TypeCategory::kUnsigned);
-                    this->write(vector_instruction(ByteCodeInstruction::kConvertUtoF, outCount));
-                }
-            } else {
-                SkASSERT(false);
-            }
-        }
-        if (inType.kind() == Type::kMatrix_Kind && outType.kind() == Type::kMatrix_Kind) {
-            this->write(ByteCodeInstruction::kMatrixToMatrix,
-                        SlotCount(outType) - SlotCount(inType));
-            this->write8(inType.columns());
-            this->write8(inType.rows());
-            this->write8(outType.columns());
-            this->write8(outType.rows());
-        } else if (inCount != outCount) {
-            SkASSERT(inCount == 1);
-            if (outType.kind() == Type::kMatrix_Kind) {
-                this->write(ByteCodeInstruction::kScalarToMatrix, SlotCount(outType) - 1);
-                this->write8(outType.columns());
-                this->write8(outType.rows());
-            } else {
-                SkASSERT(outType.kind() == Type::kVector_Kind);
-                for (; inCount != outCount; ++inCount) {
-                    this->write(ByteCodeInstruction::kDup);
-                    this->write8(1);
-                }
-            }
-        }
-    }
-}
-
-void ByteCodeGenerator::writeExternalFunctionCall(const ExternalFunctionCall& f) {
-    int argumentCount = 0;
-    for (const auto& arg : f.fArguments) {
-        this->writeExpression(*arg);
-        argumentCount += SlotCount(arg->fType);
-    }
-    this->write(ByteCodeInstruction::kCallExternal, SlotCount(f.fType) - argumentCount);
-    SkASSERT(argumentCount <= 255);
-    this->write8(argumentCount);
-    this->write8(SlotCount(f.fType));
-    int index = fOutput->fExternalValues.size();
-    fOutput->fExternalValues.push_back(f.fFunction);
-    SkASSERT(index <= 255);
-    this->write8(index);
-}
-
-void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e) {
-    int count = SlotCount(e.fValue->type());
-    this->write(vector_instruction(ByteCodeInstruction::kReadExternal, count));
-    this->write8(count);
-    int index = fOutput->fExternalValues.size();
-    fOutput->fExternalValues.push_back(e.fValue);
-    SkASSERT(index <= 255);
-    this->write8(index);
-}
-
-void ByteCodeGenerator::writeVariableExpression(const Expression& expr) {
-    Location location = this->getLocation(expr);
-    int count = SlotCount(expr.fType);
-    if (location.isOnStack() || count > 4) {
-        if (!location.isOnStack()) {
-            this->write(ByteCodeInstruction::kPushImmediate);
-            this->write32(location.fSlot);
-        }
-        this->write(location.selectLoad(ByteCodeInstruction::kLoadExtended,
-                                        ByteCodeInstruction::kLoadExtendedGlobal,
-                                        ByteCodeInstruction::kLoadExtendedUniform),
-                    count);
-        this->write8(count);
-    } else {
-        this->write(vector_instruction(location.selectLoad(ByteCodeInstruction::kLoad,
-                                                           ByteCodeInstruction::kLoadGlobal,
-                                                           ByteCodeInstruction::kLoadUniform),
-                                       count));
-        this->write8(count);
-        this->write8(location.fSlot);
-    }
-}
-
-static inline uint32_t float_to_bits(float x) {
-    uint32_t u;
-    memcpy(&u, &x, sizeof(uint32_t));
-    return u;
-}
-
-void ByteCodeGenerator::writeFloatLiteral(const FloatLiteral& f) {
-    this->write(ByteCodeInstruction::kPushImmediate);
-    this->write32(float_to_bits(f.fValue));
-}
-
-void ByteCodeGenerator::writeIntrinsicCall(const FunctionCall& c) {
-    auto found = fIntrinsics.find(c.fFunction.fName);
-    if (found == fIntrinsics.end()) {
-        fErrors.error(c.fOffset, String::printf("Unsupported intrinsic: '%s'",
-                                                String(c.fFunction.fName).c_str()));
-        return;
-    }
-    int count = SlotCount(c.fArguments[0]->fType);
-    if (found->second.fIsSpecial) {
-        SpecialIntrinsic special = found->second.fValue.fSpecial;
-        switch (special) {
-            case SpecialIntrinsic::kDot: {
-                SkASSERT(c.fArguments.size() == 2);
-                SkASSERT(count == SlotCount(c.fArguments[1]->fType));
-                this->write(vector_instruction(ByteCodeInstruction::kMultiplyF, count));
-                this->write8(count);
-                for (int i = count; i > 1; --i) {
-                    this->write(ByteCodeInstruction::kAddF);
-                    this->write8(1);
-                }
-                break;
-            }
-            default:
-                SkASSERT(false);
-        }
-    } else {
-        switch (found->second.fValue.fInstruction) {
-            case ByteCodeInstruction::kCos:
-            case ByteCodeInstruction::kSin:
-            case ByteCodeInstruction::kTan:
-                SkASSERT(c.fArguments.size() > 0);
-                this->write(vector_instruction(found->second.fValue.fInstruction, count));
-                this->write8(count);
-                break;
-            case ByteCodeInstruction::kSqrt:
-                SkASSERT(c.fArguments.size() > 0);
-                this->write(vector_instruction(found->second.fValue.fInstruction, count));
-                break;
-            case ByteCodeInstruction::kInverse2x2: {
-                SkASSERT(c.fArguments.size() > 0);
-                auto op = ByteCodeInstruction::kInverse2x2;
-                switch (count) {
-                    case 4: break;  // float2x2
-                    case 9:  op = ByteCodeInstruction::kInverse3x3; break;
-                    case 16: op = ByteCodeInstruction::kInverse4x4; break;
-                    default: SkASSERT(false);
-                }
-                this->write(op);
-                break;
-            }
-            default:
-                SkASSERT(false);
-        }
-    }
-}
-
-void ByteCodeGenerator::writeFunctionCall(const FunctionCall& f) {
-    // Find the index of the function we're calling. We explicitly do not allow calls to functions
-    // before they're defined. This is an easy-to-understand rule that prevents recursion.
-    int idx = -1;
-    for (size_t i = 0; i < fFunctions.size(); ++i) {
-        if (f.fFunction.matches(fFunctions[i]->fDeclaration)) {
-            idx = i;
-            break;
-        }
-    }
-    if (idx == -1) {
-        for (const auto& arg : f.fArguments) {
-            this->writeExpression(*arg);
-        }
-        this->writeIntrinsicCall(f);
-        return;
-    }
-
-
-    if (idx > 255) {
-        fErrors.error(f.fOffset, "Function count limit exceeded");
-        return;
-    } else if (idx >= (int) fFunctions.size()) {
-        fErrors.error(f.fOffset, "Call to undefined function");
-        return;
-    }
-
-    // We may need to deal with out parameters, so the sequence is tricky
-    if (int returnCount = SlotCount(f.fType)) {
-        this->write(ByteCodeInstruction::kReserve, returnCount);
-        this->write8(returnCount);
-    }
-
-    int argCount = f.fArguments.size();
-    std::vector<std::unique_ptr<LValue>> lvalues;
-    for (int i = 0; i < argCount; ++i) {
-        const auto& param = f.fFunction.fParameters[i];
-        const auto& arg = f.fArguments[i];
-        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
-            lvalues.emplace_back(this->getLValue(*arg));
-            lvalues.back()->load();
-        } else {
-            this->writeExpression(*arg);
-        }
-    }
-
-    // The space used by the call is based on the callee, but it also unwinds all of that before
-    // we continue execution. We adjust our max stack depths below.
-    this->write(ByteCodeInstruction::kCall);
-    this->write8(idx);
-
-    const ByteCodeFunction* callee = fOutput->fFunctions[idx].get();
-    fMaxLoopCount      = std::max(fMaxLoopCount,      fLoopCount      + callee->fLoopCount);
-    fMaxConditionCount = std::max(fMaxConditionCount, fConditionCount + callee->fConditionCount);
-    fMaxStackCount     = std::max(fMaxStackCount,     fStackCount     + callee->fLocalCount
-                                                                      + callee->fStackCount);
-
-    // After the called function returns, the stack will still contain our arguments. We have to
-    // pop them (storing any out parameters back to their lvalues as we go). We glob together slot
-    // counts for all parameters that aren't out-params, so we can pop them in one big chunk.
-    int popCount = 0;
-    auto pop = [&]() {
-        if (popCount > 4) {
-            this->write(ByteCodeInstruction::kPopN, popCount);
-            this->write8(popCount);
-        } else if (popCount > 0) {
-            this->write(vector_instruction(ByteCodeInstruction::kPop, popCount));
-        }
-        popCount = 0;
-    };
-
-    for (int i = argCount - 1; i >= 0; --i) {
-        const auto& param = f.fFunction.fParameters[i];
-        const auto& arg = f.fArguments[i];
-        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
-            pop();
-            lvalues.back()->store(true);
-            lvalues.pop_back();
-        } else {
-            popCount += SlotCount(arg->fType);
-        }
-    }
-    pop();
-}
-
-void ByteCodeGenerator::writeIntLiteral(const IntLiteral& i) {
-    this->write(ByteCodeInstruction::kPushImmediate);
-    this->write32(i.fValue);
-}
-
-void ByteCodeGenerator::writeNullLiteral(const NullLiteral& n) {
-    // not yet implemented
-    abort();
-}
-
-bool ByteCodeGenerator::writePrefixExpression(const PrefixExpression& p, bool discard) {
-    switch (p.fOperator) {
-        case Token::Kind::PLUSPLUS: // fall through
-        case Token::Kind::MINUSMINUS: {
-            SkASSERT(SlotCount(p.fOperand->fType) == 1);
-            std::unique_ptr<LValue> lvalue = this->getLValue(*p.fOperand);
-            lvalue->load();
-            this->write(ByteCodeInstruction::kPushImmediate);
-            this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
-            if (p.fOperator == Token::Kind::PLUSPLUS) {
-                this->writeTypedInstruction(p.fType,
-                                            ByteCodeInstruction::kAddI,
-                                            ByteCodeInstruction::kAddI,
-                                            ByteCodeInstruction::kAddF,
-                                            1);
-            } else {
-                this->writeTypedInstruction(p.fType,
-                                            ByteCodeInstruction::kSubtractI,
-                                            ByteCodeInstruction::kSubtractI,
-                                            ByteCodeInstruction::kSubtractF,
-                                            1);
-            }
-            lvalue->store(discard);
-            discard = false;
-            break;
-        }
-        case Token::Kind::MINUS: {
-            this->writeExpression(*p.fOperand);
-            this->writeTypedInstruction(p.fType,
-                                        ByteCodeInstruction::kNegateI,
-                                        ByteCodeInstruction::kNegateI,
-                                        ByteCodeInstruction::kNegateF,
-                                        SlotCount(p.fOperand->fType),
-                                        false);
-            break;
-        }
-        case Token::Kind::LOGICALNOT:
-        case Token::Kind::BITWISENOT: {
-            SkASSERT(SlotCount(p.fOperand->fType) == 1);
-            SkDEBUGCODE(TypeCategory tc = type_category(p.fOperand->fType));
-            SkASSERT((p.fOperator == Token::Kind::LOGICALNOT && tc == TypeCategory::kBool) ||
-                     (p.fOperator == Token::Kind::BITWISENOT && (tc == TypeCategory::kSigned ||
-                                                                 tc == TypeCategory::kUnsigned)));
-            this->writeExpression(*p.fOperand);
-            this->write(ByteCodeInstruction::kNotB);
-            break;
-        }
-        default:
-            SkASSERT(false);
-    }
-    return discard;
-}
-
-bool ByteCodeGenerator::writePostfixExpression(const PostfixExpression& p, bool discard) {
-    switch (p.fOperator) {
-        case Token::Kind::PLUSPLUS: // fall through
-        case Token::Kind::MINUSMINUS: {
-            SkASSERT(SlotCount(p.fOperand->fType) == 1);
-            std::unique_ptr<LValue> lvalue = this->getLValue(*p.fOperand);
-            lvalue->load();
-            // If we're not supposed to discard the result, then make a copy *before* the +/-
-            if (!discard) {
-                this->write(ByteCodeInstruction::kDup);
-                this->write8(1);
-            }
-            this->write(ByteCodeInstruction::kPushImmediate);
-            this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
-            if (p.fOperator == Token::Kind::PLUSPLUS) {
-                this->writeTypedInstruction(p.fType,
-                                            ByteCodeInstruction::kAddI,
-                                            ByteCodeInstruction::kAddI,
-                                            ByteCodeInstruction::kAddF,
-                                            1);
-            } else {
-                this->writeTypedInstruction(p.fType,
-                                            ByteCodeInstruction::kSubtractI,
-                                            ByteCodeInstruction::kSubtractI,
-                                            ByteCodeInstruction::kSubtractF,
-                                            1);
-            }
-            // Always consume the result as part of the store
-            lvalue->store(true);
-            discard = false;
-            break;
-        }
-        default:
-            SkASSERT(false);
-    }
-    return discard;
-}
-
-void ByteCodeGenerator::writeSwizzle(const Swizzle& s) {
-    if (swizzle_is_simple(s)) {
-        this->writeVariableExpression(s);
-        return;
-    }
-
-    switch (s.fBase->fKind) {
         case Expression::kVariableReference_Kind: {
-            Location location = this->getLocation(*s.fBase);
-            this->write(location.selectLoad(ByteCodeInstruction::kLoadSwizzle,
-                                            ByteCodeInstruction::kLoadSwizzleGlobal,
-                                            ByteCodeInstruction::kLoadSwizzleUniform),
-                        s.fComponents.size());
-            this->write8(location.fSlot);
-            this->write8(s.fComponents.size());
-            for (int c : s.fComponents) {
-                this->write8(c);
-            }
-            break;
+            const Variable& var = ((const VariableReference&) expr).fVariable;
+            return var.fStorage;
         }
         default:
-            this->writeExpression(*s.fBase);
-            this->write(ByteCodeInstruction::kSwizzle,
-                        s.fComponents.size() - s.fBase->fType.columns());
-            this->write8(s.fBase->fType.columns());
-            this->write8(s.fComponents.size());
-            for (int c : s.fComponents) {
-                this->write8(c);
-            }
-    }
-}
-
-void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t) {
-    int count = SlotCount(t.fType);
-    SkASSERT(count == SlotCount(t.fIfTrue->fType));
-    SkASSERT(count == SlotCount(t.fIfFalse->fType));
-
-    this->writeExpression(*t.fTest);
-    this->write(ByteCodeInstruction::kMaskPush);
-    this->writeExpression(*t.fIfTrue);
-    this->write(ByteCodeInstruction::kMaskNegate);
-    this->writeExpression(*t.fIfFalse);
-    this->write(ByteCodeInstruction::kMaskBlend, count);
-    this->write8(count);
-}
-
-void ByteCodeGenerator::writeExpression(const Expression& e, bool discard) {
-    switch (e.fKind) {
-        case Expression::kBinary_Kind:
-            discard = this->writeBinaryExpression((BinaryExpression&) e, discard);
-            break;
-        case Expression::kBoolLiteral_Kind:
-            this->writeBoolLiteral((BoolLiteral&) e);
-            break;
-        case Expression::kConstructor_Kind:
-            this->writeConstructor((Constructor&) e);
-            break;
-        case Expression::kExternalFunctionCall_Kind:
-            this->writeExternalFunctionCall((ExternalFunctionCall&) e);
-            break;
-        case Expression::kExternalValue_Kind:
-            this->writeExternalValue((ExternalValueReference&) e);
-            break;
-        case Expression::kFieldAccess_Kind:
-        case Expression::kIndex_Kind:
-        case Expression::kVariableReference_Kind:
-            this->writeVariableExpression(e);
-            break;
-        case Expression::kFloatLiteral_Kind:
-            this->writeFloatLiteral((FloatLiteral&) e);
-            break;
-        case Expression::kFunctionCall_Kind:
-            this->writeFunctionCall((FunctionCall&) e);
-            break;
-        case Expression::kIntLiteral_Kind:
-            this->writeIntLiteral((IntLiteral&) e);
-            break;
-        case Expression::kNullLiteral_Kind:
-            this->writeNullLiteral((NullLiteral&) e);
-            break;
-        case Expression::kPrefix_Kind:
-            discard = this->writePrefixExpression((PrefixExpression&) e, discard);
-            break;
-        case Expression::kPostfix_Kind:
-            discard = this->writePostfixExpression((PostfixExpression&) e, discard);
-            break;
-        case Expression::kSwizzle_Kind:
-            this->writeSwizzle((Swizzle&) e);
-            break;
-        case Expression::kTernary_Kind:
-            this->writeTernaryExpression((TernaryExpression&) e);
-            break;
-        default:
-#ifdef SK_DEBUG
-            printf("unsupported expression %s\n", e.description().c_str());
-#endif
             SkASSERT(false);
-    }
-    if (discard) {
-        int count = SlotCount(e.fType);
-        if (count > 4) {
-            this->write(ByteCodeInstruction::kPopN, count);
-            this->write8(count);
-        } else if (count != 0) {
-            this->write(vector_instruction(ByteCodeInstruction::kPop, count));
-        }
-        discard = false;
+            return Variable::kLocal_Storage;
     }
 }
 
-class ByteCodeExternalValueLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeExternalValueLValue(ByteCodeGenerator* generator, ExternalValue& value, int index)
-        : INHERITED(*generator)
-        , fCount(ByteCodeGenerator::SlotCount(value.type()))
-        , fIndex(index) {}
+ByteCode::Instruction ByteCodeGenerator::getLoadInstruction(ByteCodeGenerator::Location location,
+                                                            Variable::Storage storage) {
+    switch (storage) {  // Map (storage class, location kind) to a load opcode.
+        case Variable::kGlobal_Storage:  // globals: kLoadDirect / kLoad
+            switch (location.fKind) {  // pointer -> *Direct opcode, register -> plain opcode
+                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadDirect;
+                case Location::kRegister_Kind: return ByteCode::Instruction::kLoad;
+            }  // no break: an fKind matching neither case falls through to `default`
+        case Variable::kParameter_Storage:  // parameters: kLoadParameterDirect / kLoadParameter
+            switch (location.fKind) {
+                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadParameterDirect;
+                case Location::kRegister_Kind: return ByteCode::Instruction::kLoadParameter;
+            }  // no break here either; see the note on the global case
+        case Variable::kLocal_Storage:  // locals: kLoadStackDirect / kLoadStack
+            switch (location.fKind) {
+                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadStackDirect;
+                case Location::kRegister_Kind: return ByteCode::Instruction::kLoadStack;
+            }
+        default:  // any other storage class: no opcode is selected
+            break;
+    }
+    SkASSERT(false);  // only reached when no (storage, kind) pair returned above
+    return ByteCode::Instruction::kNop;  // value returned if execution continues past the assert
+}
 
-    void load() override {
-        fGenerator.write(vector_instruction(ByteCodeInstruction::kReadExternal, fCount));
-        fGenerator.write8(fCount);
-        fGenerator.write8(fIndex);
+ByteCode::Instruction ByteCodeGenerator::getStoreInstruction(ByteCodeGenerator::Location location,
+                                                             Variable::Storage storage) {
+    // Picks the store opcode for a storage class: the *Direct variant when the location is a
+    // pointer, the plain variant when it is a register. Anything else asserts and yields kNop.
+    const bool isPointer = location.fKind == Location::kPointer_Kind;
+    const bool isRegister = location.fKind == Location::kRegister_Kind;
+    if (isPointer || isRegister) {
+        switch (storage) {
+            case Variable::kGlobal_Storage:
+                return isPointer ? ByteCode::Instruction::kStoreDirect
+                                 : ByteCode::Instruction::kStore;
+            case Variable::kParameter_Storage:
+                return isPointer ? ByteCode::Instruction::kStoreParameterDirect
+                                 : ByteCode::Instruction::kStoreParameter;
+            case Variable::kLocal_Storage:
+                return isPointer ? ByteCode::Instruction::kStoreStackDirect
+                                 : ByteCode::Instruction::kStoreStack;
+            default:
+                break;
+        }
+    }
+    SkASSERT(false);
+    return ByteCode::Instruction::kNop;
+}
+
+class ByteCodeSimpleLValue : public ByteCodeGenerator::LValue {  // slot-by-slot load/store lvalue
+public:
+    ByteCodeSimpleLValue(ByteCodeGenerator* generator, ByteCodeGenerator::Location location,
+                         int count, ByteCode::Instruction load, ByteCode::Instruction store)
+        : INHERITED(*generator)
+        , fLocation(location)
+        , fCount(count)
+        , fLoad(load)
+        , fStore(store) {}
+
+    void load(ByteCode::Register result) override {  // per slot: fLoad, result + i, location
+        for (int i = 0; i < fCount; ++i) {
+            ByteCodeGenerator::Location final = fLocation.offset(fGenerator, i);  // slot i
+            fGenerator.write(fLoad);
+            fGenerator.write(result + i);
+            fGenerator.write(final);
+        }
     }
 
-    void store(bool discard) override {
-        if (!discard) {
-            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, fCount));
-            fGenerator.write8(fCount);
+    void store(ByteCode::Register src) override {  // per slot: fStore, location, src + i
+        for (int i = 0; i < fCount; ++i) {
+            ByteCodeGenerator::Location final = fLocation.offset(fGenerator, i);  // slot i
+            fGenerator.write(fStore);
+            fGenerator.write(final);  // note: location precedes the register here, unlike load()
+            fGenerator.write(src + i);
         }
-        fGenerator.write(vector_instruction(ByteCodeInstruction::kWriteExternal, fCount));
-        fGenerator.write8(fCount);
-        fGenerator.write8(fIndex);
     }
 
 private:
-    typedef LValue INHERITED;
+    ByteCodeGenerator::Location fLocation;  // base location; slot i is fLocation.offset(gen, i)
 
     int fCount;
 
-    int fIndex;
+    ByteCode::Instruction fLoad;  // opcode emitted by load()
+
+    ByteCode::Instruction fStore;  // opcode emitted by store()
+
+    typedef ByteCodeGenerator::LValue INHERITED;
 };
 
 class ByteCodeSwizzleLValue : public ByteCodeGenerator::LValue {
 public:
-    ByteCodeSwizzleLValue(ByteCodeGenerator* generator, const Swizzle& swizzle)
+    ByteCodeSwizzleLValue(ByteCodeGenerator* generator, const Swizzle* swizzle)
         : INHERITED(*generator)
-        , fSwizzle(swizzle) {}
+        , fSwizzle(*swizzle) {}  // keeps a reference: *swizzle must outlive this lvalue
 
-    void load() override {
-        fGenerator.writeSwizzle(fSwizzle);
+    void load(ByteCode::Register result) override {  // delegates to the generator's writeSwizzle
+        fGenerator.writeSwizzle(fSwizzle, result);
     }
 
-    void store(bool discard) override {
-        int count = fSwizzle.fComponents.size();
-        if (!discard) {
-            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
-            fGenerator.write8(count);
-        }
-        ByteCodeGenerator::Location location = fGenerator.getLocation(*fSwizzle.fBase);
-        if (location.isOnStack()) {
-            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreSwizzleIndirect,
-                                                  ByteCodeInstruction::kStoreSwizzleIndirectGlobal),
-                             count);
-        } else {
-            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreSwizzle,
-                                                  ByteCodeInstruction::kStoreSwizzleGlobal),
-                             count);
-            fGenerator.write8(location.fSlot);
-        }
-        fGenerator.write8(count);
-        for (int c : fSwizzle.fComponents) {
-            fGenerator.write8(c);
+    void store(ByteCode::Register src) override {  // one store per swizzle component
+        ByteCodeGenerator::Location target = fGenerator.getLocation(*fSwizzle.fBase);
+        ByteCode::Instruction inst = fGenerator.getStoreInstruction(
+                                                            target,
+                                                            fGenerator.getStorage(*fSwizzle.fBase));
+        for (size_t i = 0; i < fSwizzle.fComponents.size(); ++i) {  // src + i -> fComponents[i]
+            ByteCodeGenerator::Location final = target.offset(fGenerator, fSwizzle.fComponents[i]);
+            fGenerator.write(inst);
+            fGenerator.write(final);
+            fGenerator.write(src + i);
         }
     }
 
 private:
     const Swizzle& fSwizzle;
 
-    typedef LValue INHERITED;
+    typedef ByteCodeGenerator::LValue INHERITED;
 };
 
-class ByteCodeExpressionLValue : public ByteCodeGenerator::LValue {
+class ByteCodeExternalValueLValue : public ByteCodeGenerator::LValue {  // external value lvalue
 public:
-    ByteCodeExpressionLValue(ByteCodeGenerator* generator, const Expression& expr)
+    ByteCodeExternalValueLValue(ByteCodeGenerator* generator, ExternalValue& value, int index)
         : INHERITED(*generator)
-        , fExpression(expr) {}
-
-    void load() override {
-        fGenerator.writeVariableExpression(fExpression);
+        , fIndex(index)
+        , fSlotCount(ByteCodeGenerator::SlotCount(value.type())) {
+        SkASSERT(fSlotCount <= 4);  // NOTE(review): reason for the 4-slot cap is not visible here
     }
 
-    void store(bool discard) override {
-        int count = ByteCodeGenerator::SlotCount(fExpression.fType);
-        if (!discard) {
-            if (count > 4) {
-                fGenerator.write(ByteCodeInstruction::kDupN, count);
-                fGenerator.write8(count);
-            } else {
-                fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
-                fGenerator.write8(count);
-            }
-        }
-        ByteCodeGenerator::Location location = fGenerator.getLocation(fExpression);
-        if (location.isOnStack() || count > 4) {
-            if (!location.isOnStack()) {
-                fGenerator.write(ByteCodeInstruction::kPushImmediate);
-                fGenerator.write32(location.fSlot);
-            }
-            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreExtended,
-                                                  ByteCodeInstruction::kStoreExtendedGlobal),
-                             count);
-            fGenerator.write8(count);
-        } else {
-            fGenerator.write(
-                    vector_instruction(location.selectStore(ByteCodeInstruction::kStore,
-                                                            ByteCodeInstruction::kStoreGlobal),
-                                       count));
-            fGenerator.write8(location.fSlot);
-        }
+    void load(ByteCode::Register result) override {  // kReadExternal, result, slots, index
+        fGenerator.write(ByteCode::Instruction::kReadExternal);
+        fGenerator.write(result);
+        fGenerator.write((uint8_t) fSlotCount);
+        fGenerator.write((uint8_t) fIndex);
+    }
+
+    void store(ByteCode::Register src) override {  // kWriteExternal, index, slots, src
+        fGenerator.write(ByteCode::Instruction::kWriteExternal);
+        fGenerator.write((uint8_t) fIndex);  // operand order differs from load(): index first
+        fGenerator.write((uint8_t) fSlotCount);
+        fGenerator.write(src);
     }
 
 private:
     typedef LValue INHERITED;
 
-    const Expression& fExpression;
+    int fIndex;  // emitted as a uint8_t operand by load() and store()
+
+    int fSlotCount;  // SlotCount(value.type()), captured at construction
 };
 
-std::unique_ptr<ByteCodeGenerator::LValue> ByteCodeGenerator::getLValue(const Expression& e) {
-    switch (e.fKind) {
+std::unique_ptr<ByteCodeGenerator::LValue> ByteCodeGenerator::getLValue(const Expression& expr) {
+    switch (expr.fKind) {
         case Expression::kExternalValue_Kind: {
-            ExternalValue* value = ((ExternalValueReference&) e).fValue;
+            ExternalValue* value = ((ExternalValueReference&) expr).fValue;
             int index = fOutput->fExternalValues.size();
             fOutput->fExternalValues.push_back(value);
             SkASSERT(index <= 255);
@@ -1460,169 +382,954 @@
         }
         case Expression::kFieldAccess_Kind:
         case Expression::kIndex_Kind:
-        case Expression::kVariableReference_Kind:
-            return std::unique_ptr<LValue>(new ByteCodeExpressionLValue(this, e));
-        case Expression::kSwizzle_Kind: {
-            const Swizzle& s = (const Swizzle&) e;
-            return swizzle_is_simple(s)
-                    ? std::unique_ptr<LValue>(new ByteCodeExpressionLValue(this, e))
-                    : std::unique_ptr<LValue>(new ByteCodeSwizzleLValue(this, s));
+        case Expression::kVariableReference_Kind: {
+            Location location = this->getLocation(expr);
+            Variable::Storage storage = this->getStorage(expr);
+            ByteCode::Instruction loadInst = this->getLoadInstruction(location, storage);
+            ByteCode::Instruction storeInst = this->getStoreInstruction(location, storage);
+            return std::unique_ptr<LValue>(new ByteCodeSimpleLValue(this, location,
+                                                                    SlotCount(expr.fType),
+                                                                    loadInst, storeInst));
         }
+        case Expression::kSwizzle_Kind:
+            return std::unique_ptr<LValue>(new ByteCodeSwizzleLValue(this, &(Swizzle&) expr));
+        default:
+            ABORT("unsupported lvalue\n");
+    }
+}
+
+ByteCode::Register ByteCodeGenerator::next(int count) {  // reserves `count` consecutive registers
+    SkASSERT(fNextRegister + count <= ByteCode::kRegisterMax);  // only checked via SkASSERT
+    fNextRegister += count;
+    return ByteCode::Register{(uint16_t) (fNextRegister - count)};  // first register of the run
+}
+
+// Classifies `type` as bool / signed / unsigned / float by its display name. Vectors and
+// matrices are classified by their component type.
+static TypeCategory type_category(const Type& type) {
+    if (type.kind() == Type::Kind::kVector_Kind || type.kind() == Type::Kind::kMatrix_Kind) {
+        return type_category(type.componentType());
+    }
+    String name = type.displayName();
+    if (name == "bool") {
+        return TypeCategory::kBool;
+    } else if (name == "int" || name == "short") {
+        return TypeCategory::kSigned;
+    } else if (name == "uint" || name == "ushort") {
+        return TypeCategory::kUnsigned;
+    } else if (name == "float" || name == "half") {
+        return TypeCategory::kFloat;
+    }
+    // Deliberately no catch-all else: an unrecognized type must reach the diagnostic below
+    // instead of being silently classified as float.
+    ABORT("unsupported type: %s\n", name.c_str());
+}
+
+// Emits exactly one of the three candidate instructions, selected by the category of
+// `type`: s for signed, u for unsigned, f for float. Any other category (e.g. kBool) hits
+// SkASSERT(false) and emits nothing. Only the opcode is written; operands are the caller's
+// responsibility.
+void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCode::Instruction s,
+                                              ByteCode::Instruction u, ByteCode::Instruction f) {
+    const TypeCategory category = type_category(type);
+    if (category == TypeCategory::kSigned) {
+        this->write(s);
+    } else if (category == TypeCategory::kUnsigned) {
+        this->write(u);
+    } else if (category == TypeCategory::kFloat) {
+        this->write(f);
+    } else {
+        SkASSERT(false);
+    }
+}
+
+// Emits one typed instruction per slot of operandType, each followed by the operands
+// result + i, left + i, right + i (in that order).
+void ByteCodeGenerator::writeBinaryInstruction(const Type& operandType,
+                                               ByteCode::Register left, ByteCode::Register right,
+                                               ByteCode::Instruction s, ByteCode::Instruction u,
+                                               ByteCode::Instruction f, ByteCode::Register result) {
+    const int slotCount = SlotCount(operandType);
+    for (int slot = 0; slot < slotCount; ++slot) {
+        this->writeTypedInstruction(operandType, s, u, f);
+        this->write(result + slot);
+        this->write(left + slot);
+        this->write(right + slot);
+    }
+}
+
+// Writes code that evaluates `b` into `result`. Plain and compound assignments go through an
+// LValue; every compound form, including the early-returning &&=, ||=, <<=, >>= and matrix
+// *= paths, stores `result` back to the left-hand side before returning.
+void ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b,
+                                              ByteCode::Register result) {
+    if (b.fOperator == Token::Kind::EQ) {
+        std::unique_ptr<LValue> lvalue = this->getLValue(*b.fLeft);
+        this->writeExpression(*b.fRight, result);
+        lvalue->store(result);
+        return;
+    }
+    const Type& lType = b.fLeft->fType;
+    const Type& rType = b.fRight->fType;
+    bool lVecOrMtx = (lType.kind() == Type::kVector_Kind || lType.kind() == Type::kMatrix_Kind);
+    bool rVecOrMtx = (rType.kind() == Type::kVector_Kind || rType.kind() == Type::kMatrix_Kind);
+    const Type* operandType = (!lVecOrMtx && rVecOrMtx) ? &rType : &lType;  // scalar OP vec/mtx
+    Token::Kind op;
+    std::unique_ptr<LValue> lvalue;
+    ByteCode::Register left;
+    switch (b.fOperator) {
+        case Token::Kind::LOGICALAND:
+        case Token::Kind::LOGICALANDEQ:
+        case Token::Kind::LOGICALOR:
+        case Token::Kind::LOGICALOREQ:
+            left = result;  // short-circuit operators evaluate the left side in place
+            break;
+        default:
+            left = this->next(SlotCount(*operandType));
+    }
+    if (is_assignment(b.fOperator)) {
+        lvalue = this->getLValue(*b.fLeft);
+        lvalue->load(left);
+        op = remove_assignment(b.fOperator);
+    } else {
+        this->writeExpression(*b.fLeft, left);
+        op = b.fOperator;
+        if (!lVecOrMtx && rVecOrMtx) {
+            for (int i = 1; i < SlotCount(rType); ++i) {  // replicate the scalar into each slot
+                this->write(ByteCode::Instruction::kCopy);
+                this->write(left + i);
+                this->write(left);
+            }
+        }
+    }
+    SkDEBUGCODE(TypeCategory tc = type_category(lType));
+    int count = std::max(SlotCount(lType), SlotCount(rType));
+    switch (op) {
+        case Token::Kind::LOGICALAND: {
+            SkASSERT(left.fIndex == result.fIndex);
+            this->write(ByteCode::Instruction::kMaskPush);
+            ++fConditionCount;
+            this->write(left);
+            this->write(ByteCode::Instruction::kBranchIfAllFalse);
+            DeferredLocation falseLocation(this);
+            SkASSERT(SlotCount(b.fRight->fType) == 1);
+            ByteCode::Register right = this->next(1);
+            this->writeExpression(*b.fRight, right);
+            this->write(ByteCode::Instruction::kAnd);
+            this->write(result);
+            this->write(left);
+            this->write(right);
+            falseLocation.set();
+            --fConditionCount;
+            this->write(ByteCode::Instruction::kMaskPop);
+            if (lvalue) { lvalue->store(result); }  // &&= must write the result back
+            return;
+        }
+        case Token::Kind::LOGICALOR: {
+            SkASSERT(left.fIndex == result.fIndex);
+            ByteCode::Register mask = this->next(1);
+            this->write(ByteCode::Instruction::kNot);
+            this->write(mask);
+            this->write(left);
+            this->write(ByteCode::Instruction::kMaskPush);
+            ++fConditionCount;
+            this->write(mask);
+            this->write(ByteCode::Instruction::kBranchIfAllFalse);
+            DeferredLocation falseLocation(this);
+            SkASSERT(SlotCount(b.fRight->fType) == 1);
+            ByteCode::Register right = this->next(1);
+            this->writeExpression(*b.fRight, right);
+            this->write(ByteCode::Instruction::kOr);
+            this->write(result);
+            this->write(left);
+            this->write(right);
+            falseLocation.set();
+            --fConditionCount;
+            this->write(ByteCode::Instruction::kMaskPop);
+            if (lvalue) { lvalue->store(result); }  // ||= must write the result back
+            return;
+        }
+        case Token::Kind::SHL:
+        case Token::Kind::SHR: {
+            SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                    tc == SkSL::TypeCategory::kUnsigned));
+            if (!b.fRight->isConstant()) {
+                fErrors.error(b.fRight->fOffset, "Shift amounts must be constant");
+                return;
+            }
+            int64_t shift = b.fRight->getConstantInt();
+            if (shift < 0 || shift > 31) {
+                fErrors.error(b.fRight->fOffset, "Shift amount out of range");
+                return;
+            }
+
+            if (op == Token::Kind::SHL) {
+                this->write(ByteCode::Instruction::kShiftLeft);
+            } else {
+                this->write(type_category(lType) == TypeCategory::kSigned
+                                ? ByteCode::Instruction::kShiftRightS
+                                : ByteCode::Instruction::kShiftRightU);
+            }
+            this->write(result);
+            this->write(left);
+            this->write((uint8_t) shift);
+            if (lvalue) { lvalue->store(result); }  // <<= and >>= must write the result back
+            return;
+        }
+        case Token::Kind::STAR:
+            // Special case for M*V, V*M, M*M (but not V*V!)
+            if (lType.columns() > 1 && rType.columns() > 1 &&
+                (lType.rows() > 1 || rType.rows() > 1)) {
+                ByteCode::Register right = this->next(SlotCount(rType));
+                this->writeExpression(*b.fRight, right);
+                int rCols = rType.columns(), rRows = rType.rows();
+                int lCols = lType.columns(), lRows = lType.rows();
+                // M*V treats the vector as a column
+                if (rType.kind() == Type::kVector_Kind) {
+                    std::swap(rCols, rRows);
+                }
+                SkASSERT(lCols == rRows);
+                SkASSERT(SlotCount(b.fType) == lRows * rCols);
+                this->write(ByteCode::Instruction::kMatrixMultiply);
+                this->write(result);
+                this->write(left);
+                this->write(right);
+                this->write((uint8_t) lCols);
+                this->write((uint8_t) lRows);
+                this->write((uint8_t) rCols);
+                if (lvalue) { lvalue->store(result); }  // matrix *= must write the result back
+                return;
+            }
+            // Not a matrix product: fall through to the component-wise multiply below.
+        default:
+            break;
+    }
+    ByteCode::Register right = this->next(SlotCount(*operandType));
+    this->writeExpression(*b.fRight, right);
+    if (lVecOrMtx && !rVecOrMtx) {
+        for (int i = 1; i < SlotCount(*operandType); ++i) {
+            this->write(ByteCode::Instruction::kCopy);
+            this->write(right + i);
+            this->write(right);
+        }
+    }
+    switch (op) {
+        case Token::Kind::EQEQ:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kCompareEQI,
+                                         ByteCode::Instruction::kCompareEQI,
+                                         ByteCode::Instruction::kCompareEQF,
+                                         result);
+            // Collapse to a single bool
+            for (int i = 1; i < count; ++i) {
+                this->write(ByteCode::Instruction::kAnd);
+                this->write(result);
+                this->write(result);
+                this->write(result + i);
+            }
+            break;
+        case Token::Kind::GT:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kCompareGTS,
+                                         ByteCode::Instruction::kCompareGTU,
+                                         ByteCode::Instruction::kCompareGTF,
+                                         result);
+            break;
+        case Token::Kind::GTEQ:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kCompareGTEQS,
+                                         ByteCode::Instruction::kCompareGTEQU,
+                                         ByteCode::Instruction::kCompareGTEQF,
+                                         result);
+            break;
+        case Token::Kind::LT:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kCompareLTS,
+                                         ByteCode::Instruction::kCompareLTU,
+                                         ByteCode::Instruction::kCompareLTF,
+                                         result);
+            break;
+        case Token::Kind::LTEQ:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kCompareLTEQS,
+                                         ByteCode::Instruction::kCompareLTEQU,
+                                         ByteCode::Instruction::kCompareLTEQF,
+                                         result);
+            break;
+        case Token::Kind::MINUS:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kSubtractI,
+                                         ByteCode::Instruction::kSubtractI,
+                                         ByteCode::Instruction::kSubtractF,
+                                         result);
+            break;
+        case Token::Kind::NEQ:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kCompareNEQI,
+                                         ByteCode::Instruction::kCompareNEQI,
+                                         ByteCode::Instruction::kCompareNEQF,
+                                         result);
+            // Collapse to a single bool
+            for (int i = 1; i < count; ++i) {
+                this->write(ByteCode::Instruction::kOr);
+                this->write(result);
+                this->write(result);
+                this->write(result + i);
+            }
+            break;
+        case Token::Kind::PERCENT:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kRemainderS,
+                                         ByteCode::Instruction::kRemainderU,
+                                         ByteCode::Instruction::kRemainderF,
+                                         result);
+            break;
+        case Token::Kind::PLUS:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kAddI,
+                                         ByteCode::Instruction::kAddI,
+                                         ByteCode::Instruction::kAddF,
+                                         result);
+            break;
+        case Token::Kind::SLASH:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kDivideS,
+                                         ByteCode::Instruction::kDivideU,
+                                         ByteCode::Instruction::kDivideF,
+                                         result);
+            break;
+        case Token::Kind::STAR:
+            this->writeBinaryInstruction(*operandType, left, right,
+                                         ByteCode::Instruction::kMultiplyI,
+                                         ByteCode::Instruction::kMultiplyI,
+                                         ByteCode::Instruction::kMultiplyF,
+                                         result);
+            break;
+        case Token::Kind::LOGICALXOR: {
+            SkASSERT(tc == SkSL::TypeCategory::kBool);
+            this->write(ByteCode::Instruction::kXor);
+            this->write(result);
+            this->write(left);
+            this->write(right);
+            break;
+        }
+        case Token::Kind::BITWISEAND: {
+            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
+            this->write(ByteCode::Instruction::kAnd);
+            this->write(result);
+            this->write(left);
+            this->write(right);
+            break;
+        }
+        case Token::Kind::BITWISEOR: {
+            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
+            this->write(ByteCode::Instruction::kOr);
+            this->write(result);
+            this->write(left);
+            this->write(right);
+            break;
+        }
+        case Token::Kind::BITWISEXOR: {
+            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
+            this->write(ByteCode::Instruction::kXor);
+            this->write(result);
+            this->write(left);
+            this->write(right);
+            break;
+        }
+        default:
+            fErrors.error(b.fOffset, SkSL::String::printf("Unsupported binary operator '%s'",
+                                                          Compiler::OperatorName(op)));
+            break;
+    }
+    if (lvalue) {
+        lvalue->store(result);
+    }
+}
+
+// Writes a constructor call into `result`. Matrix targets, scalar-to-vector splats and
+// constructors that need no numeric conversion write their arguments straight into `result`.
+// Constructors that change numeric category (int/uint <-> float, judged from the first
+// argument) evaluate the arguments into scratch registers and convert them slot by slot.
+void ByteCodeGenerator::writeConstructor(const Constructor& c, ByteCode::Register result) {
+    if (c.fType.rows() > 1) {
+        if (c.fArguments.size() == 1) {
+            if (SlotCount(c.fArguments[0]->fType) == 1) {
+                ByteCode::Register v = this->next(1);
+                this->writeExpression(*c.fArguments[0], v);
+                this->write(ByteCode::Instruction::kScalarToMatrix);
+                this->write(result);
+                this->write(v);
+                this->write((uint8_t) c.fType.columns());
+                this->write((uint8_t) c.fType.rows());
+                return;
+            } else if (c.fArguments[0]->fType.rows() > 1) {
+                ByteCode::Register v = this->next(SlotCount(c.fArguments[0]->fType));
+                this->writeExpression(*c.fArguments[0], v);
+                this->write(ByteCode::Instruction::kMatrixToMatrix);
+                this->write(result);
+                this->write(v);
+                this->write((uint8_t) c.fArguments[0]->fType.columns());
+                this->write((uint8_t) c.fArguments[0]->fType.rows());
+                this->write((uint8_t) c.fType.columns());
+                this->write((uint8_t) c.fType.rows());
+                return;
+            }
+        }
+        int offset = 0;
+        for (const auto& arg : c.fArguments) {
+            this->writeExpression(*arg, ByteCode::Register{(uint16_t) (result.fIndex + offset)});
+            offset += SlotCount(arg->fType);
+        }
+        return;
+    }
+    if (c.fArguments.size() == 1 && c.fArguments[0]->fType.columns() == 1 &&
+        c.fType.columns() > 1) {
+        SkASSERT(SlotCount(c.fArguments[0]->fType) == 1);
+        ByteCode::Register v = result;
+        this->writeExpression(*c.fArguments[0], v);
+        for (int i = 1; i < c.fType.columns(); ++i) {
+            this->write(ByteCode::Instruction::kCopy);
+            this->write(v + i);
+            this->write(v);
+        }
+        return;
+    }
+    ByteCode::Instruction inst;
+    switch (type_category(c.fArguments[0]->fType)) {
+        case TypeCategory::kSigned:
+            if (type_category(c.fType) == TypeCategory::kFloat) {
+                inst = ByteCode::Instruction::kSignedToFloat;
+            } else {
+                inst = ByteCode::Instruction::kNop;
+            }
+            break;
+        case TypeCategory::kUnsigned:
+            if (type_category(c.fType) == TypeCategory::kFloat) {
+                inst = ByteCode::Instruction::kUnsignedToFloat;
+            } else {
+                inst = ByteCode::Instruction::kNop;
+            }
+            break;
+        case TypeCategory::kFloat:
+            if (type_category(c.fType) == TypeCategory::kSigned) {
+                inst = ByteCode::Instruction::kFloatToSigned;
+            } else if (type_category(c.fType) == TypeCategory::kUnsigned) {
+                inst = ByteCode::Instruction::kFloatToUnsigned;
+            } else {
+                inst = ByteCode::Instruction::kNop;
+            }
+            break;
+        default:
+            SkASSERT(false);
+            return;
+    }
+    const bool convert = inst != ByteCode::Instruction::kNop;
+    ByteCode::Register values = convert ? this->next(SlotCount(c.fType)) : result;
+    ByteCode::Register v = values;
+    for (size_t i = 0; i < c.fArguments.size(); ++i) {
+        this->writeExpression(*c.fArguments[i], v);
+        v.fIndex += SlotCount(c.fArguments[i]->fType);
+    }
+    if (convert) {
+        // The arguments were written back to back starting at `values`, so the source index
+        // must keep advancing across arguments, in step with the destination index.
+        int slot = 0;
+        for (size_t i = 0; i < c.fArguments.size(); ++i) {
+            const int count = SlotCount(c.fArguments[i]->fType);
+            for (int j = 0; j < count; ++j, ++slot) {
+                this->write(inst);
+                this->write(result + slot);
+                this->write(values + slot);
+            }
+        }
+    }
+}
+
+// Evaluates the arguments into one contiguous register run, then emits kCallExternal with:
+// result register, external-value index, return slot count, first argument register, arg slots.
+void ByteCodeGenerator::writeExternalFunctionCall(const ExternalFunctionCall& f,
+                                                  ByteCode::Register result) {
+    int totalSlots = 0;
+    for (const auto& arg : f.fArguments) {
+        totalSlots += SlotCount(arg->fType);
+    }
+    ByteCode::Register firstArg = this->next(totalSlots);
+    int filled = 0;
+    for (const auto& arg : f.fArguments) {
+        this->writeExpression(*arg, firstArg + filled);
+        filled += SlotCount(arg->fType);
+    }
+    const int externalIndex = fOutput->fExternalValues.size();
+    fOutput->fExternalValues.push_back(f.fFunction);
+    SkASSERT(externalIndex <= 255 && SlotCount(f.fType) <= 255 && totalSlots <= 255);
+    this->write(ByteCode::Instruction::kCallExternal);
+    this->write(result);
+    this->write((uint8_t) externalIndex);
+    this->write((uint8_t) SlotCount(f.fType));
+    this->write(firstArg);
+    this->write((uint8_t) totalSlots);
+}
+
+void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e,
+                                           ByteCode::Register result) {
+    const int externalIndex = fOutput->fExternalValues.size();  // slot the value is appended at
+    fOutput->fExternalValues.push_back(e.fValue);
+    SkASSERT(externalIndex <= 255);
+    this->write(ByteCode::Instruction::kReadExternal);  // then: result, slot count, index
+    this->write(result);
+    this->write((uint8_t) SlotCount(e.fValue->type()));
+    this->write((uint8_t) externalIndex);
+}
+
+// Writes an intrinsic call into `result`. Special intrinsics (dot, inverse) are expanded
+// here; all others emit intrinsic.fValue.fInstruction followed by the result register
+// (omitted when the call's type is named "void") and one register per argument.
+void ByteCodeGenerator::writeIntrinsicCall(const FunctionCall& c, Intrinsic intrinsic,
+                                           ByteCode::Register result) {
+    if (intrinsic.fIsSpecial) {
+        switch (intrinsic.fValue.fSpecial) {
+            case SpecialIntrinsic::kDot: {
+                SkASSERT(c.fArguments.size() == 2);
+                int count = SlotCount(c.fArguments[0]->fType);
+                ByteCode::Register left = this->next(count);
+                this->writeExpression(*c.fArguments[0], left);
+                ByteCode::Register right = this->next(count);
+                this->writeExpression(*c.fArguments[1], right);
+                // One-slot operands have nothing to sum, so the product itself is the result.
+                ByteCode::Register product = count == 1 ? result : this->next(count);
+                for (int i = 0; i < count; ++i) {
+                    this->writeTypedInstruction(
+                            c.fType, ByteCode::Instruction::kMultiplyI,
+                            ByteCode::Instruction::kMultiplyI, ByteCode::Instruction::kMultiplyF);
+                    this->write(product + i);
+                    this->write(left + i);
+                    this->write(right + i);
+                }
+                ByteCode::Register total = product;
+                for (int i = 1; i < count; ++i) {
+                    this->writeTypedInstruction(
+                            c.fType, ByteCode::Instruction::kAddI,
+                            ByteCode::Instruction::kAddI, ByteCode::Instruction::kAddF);
+                    ByteCode::Register sum = i == count - 1 ? result : this->next(1);
+                    this->write(sum);
+                    this->write(total);
+                    this->write(product + i);
+                    total = sum;
+                }
+                break;
+            }
+            case SpecialIntrinsic::kInverse: {
+                SkASSERT(c.fArguments.size() == 1);
+                ByteCode::Register arg = this->next(SlotCount(c.fArguments[0]->fType));
+                this->writeExpression(*c.fArguments[0], arg);
+                switch (SlotCount(c.fArguments[0]->fType)) {
+                    case 4:  this->write(ByteCode::Instruction::kInverse2x2); break;
+                    case 9:  this->write(ByteCode::Instruction::kInverse3x3); break;
+                    case 16: this->write(ByteCode::Instruction::kInverse4x4); break;
+                    default: SkASSERT(false);
+                }
+                this->write(result);
+                this->write(arg);
+                break;
+            }
+        }
+    } else {
+        std::vector<ByteCode::Register> argRegs;
+        for (const auto& expr : c.fArguments) {
+            argRegs.push_back(this->next(SlotCount(expr->fType)));
+            this->writeExpression(*expr, argRegs.back());
+        }
+        this->write(intrinsic.fValue.fInstruction);
+        if (c.fType.fName != "void") {
+            this->write(result);
+        }
+        for (ByteCode::Register arg : argRegs) {
+            this->write(arg);
+        }
+    }
+}
+
+void ByteCodeGenerator::writeFunctionCall(const FunctionCall& c, ByteCode::Register result) {  // Emits an intrinsic expansion or a kCall to an already-emitted function.
+    auto found = fIntrinsics.find(c.fFunction.fName);
+    if (found != fIntrinsics.end()) {
+        return this->writeIntrinsicCall(c, found->second, result);  // Names present in fIntrinsics never reach the kCall path.
+    }
+    int argCount = c.fArguments.size();
+    std::vector<std::unique_ptr<LValue>> lvalues;  // One entry per out-flagged parameter, in argument order.
+    int parameterSlotCount = 0;
+    for (const auto& p : c.fFunction.fParameters) {
+        parameterSlotCount += SlotCount(p->fType);
+    }
+    ByteCode::Register argStart = this->next(parameterSlotCount);  // Registers for every parameter slot, addressed from argStart.
+    ByteCode::Register nextArg = argStart;
+    for (int i = 0; i < argCount; ++i) {
+        const auto& param = c.fFunction.fParameters[i];
+        const auto& arg = c.fArguments[i];
+        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
+            lvalues.emplace_back(this->getLValue(*arg));  // Kept so the value can be stored back after the call.
+            lvalues.back()->load(nextArg);
+        } else {
+            this->writeExpression(*arg, nextArg);
+        }
+        nextArg.fIndex += SlotCount(arg->fType);  // NOTE(review): advances by the argument's slot count; the write-back loop below uses the parameter's. Confirm they always agree.
+    }
+    // Find the index of the function we're calling. We explicitly do not allow calls to functions
+    // before they're defined. This is an easy-to-understand rule that prevents recursion.
+    size_t idx;
+    for (idx = 0; idx < fFunctions.size(); ++idx) {
+        if (c.fFunction.matches(fFunctions[idx]->fDeclaration)) {
+            break;
+        }
+    }
+    if (idx > 255) {  // The function index operand is written as a single byte.
+        fErrors.error(c.fOffset, "Function count limit exceeded");
+        return;
+    } else if (idx >= fOutput->fFunctions.size()) {  // No match, or the match has not been added to fOutput->fFunctions yet.
+        fErrors.error(c.fOffset, "Call to undefined function");
+        return;
+    }
+
+    this->write(ByteCode::Instruction::kCall);  // Operands: result register, function index, first argument register.
+    this->write(result);
+    this->write((uint8_t) idx);
+    this->write(argStart);
+    nextArg = argStart;
+    auto lvalue = lvalues.begin();
+    for (int i = 0; i < argCount; ++i) {  // Store each out-flagged argument register back through its saved lvalue.
+        const auto& param = c.fFunction.fParameters[i];
+        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
+            (*(lvalue++))->store(nextArg);
+        }
+        nextArg.fIndex += SlotCount(param->fType);
+    }
+}
+
+void ByteCodeGenerator::incOrDec(Token::Kind op, Expression& operand, bool prefix,
+                                 ByteCode::Register result) {  // Shared lowering for ++/--; 'prefix' selects whether 'result' gets the new or the old value.
+    SkASSERT(op == Token::Kind::PLUSPLUS || op == Token::Kind::MINUSMINUS);
+    std::unique_ptr<LValue> lvalue = this->getLValue(operand);
+    SkASSERT(SlotCount(operand.fType) == 1);  // Only single-slot operands are handled here.
+    ByteCode::Register value;
+    if (prefix) {
+        value = this->next(1);  // Scratch register: 'result' will receive the updated value instead.
+    } else {
+        value = result;  // Postfix: 'result' holds the value loaded before the update.
+    }
+    lvalue->load(value);
+    ByteCode::Register one = this->next(1);
+    this->write(ByteCode::Instruction::kImmediate);  // Materialize the constant 1 in the operand's type category.
+    this->write(one);
+    if (type_category(operand.fType) == TypeCategory::kFloat) {
+        this->write(ByteCode::Immediate(1.0f));
+    } else {
+        this->write(ByteCode::Immediate((int32_t) 1));
+    }
+    if (op == Token::Kind::PLUSPLUS) {
+        this->writeTypedInstruction(operand.fType,
+                                    ByteCode::Instruction::kAddI,
+                                    ByteCode::Instruction::kAddI,
+                                    ByteCode::Instruction::kAddF);
+    } else {
+        this->writeTypedInstruction(operand.fType,
+                                    ByteCode::Instruction::kSubtractI,
+                                    ByteCode::Instruction::kSubtractI,
+                                    ByteCode::Instruction::kSubtractF);
+    }
+    if (prefix) {  // result = value +/- one, then store 'result' through the lvalue.
+        this->write(result);
+        this->write(value);
+        this->write(one);
+        lvalue->store(result);
+    } else {  // temp = value +/- one is stored through the lvalue; 'result' is left holding the loaded value.
+        ByteCode::Register temp = this->next(1);
+        this->write(temp);
+        this->write(value);
+        this->write(one);
+        lvalue->store(temp);
+    }
+}
+
+void ByteCodeGenerator::writePostfixExpression(const PostfixExpression& p,
+                                               ByteCode::Register result) {  // Forwards to incOrDec with prefix == false.
+    this->incOrDec(p.fOperator, *p.fOperand, false, result);
+}
+
+void ByteCodeGenerator::writePrefixExpression(const PrefixExpression& p,
+                                              ByteCode::Register result) {  // Lowers ++x, --x, -x, !x and ~x into 'result'.
+    switch (p.fOperator) {
+        case Token::Kind::PLUSPLUS:
+        case Token::Kind::MINUSMINUS: {
+            return this->incOrDec(p.fOperator, *p.fOperand, true, result);  // prefix == true
+        }
+        case Token::Kind::MINUS: {
+            ByteCode::Register src = this->next(SlotCount(p.fType));
+            this->writeExpression(*p.fOperand, src);
+            for (int i = 0; i < SlotCount(p.fType); ++i) {  // One typed negate instruction per slot.
+                this->writeTypedInstruction(p.fType,
+                                            ByteCode::Instruction::kNegateS,
+                                            ByteCode::Instruction::kNegateS,
+                                            ByteCode::Instruction::kNegateF);
+                this->write(result + i);
+                this->write(src + i);
+            }
+            break;
+        }
+        case Token::Kind::LOGICALNOT:
+        case Token::Kind::BITWISENOT: {  // Both operators emit the same kNot instruction.
+            ByteCode::Register src = this->next(SlotCount(p.fType));
+            this->writeExpression(*p.fOperand, src);
+            for (int i = 0; i < SlotCount(p.fType); ++i) {
+                this->write(ByteCode::Instruction::kNot);
+                this->write(result + i);
+                this->write(src + i);
+            }
+            break;
+        }
+        default:
+            SkASSERT(false);  // Any other operator only trips this debug assert; nothing is emitted.
+    }
+}
+
+void ByteCodeGenerator::writeSwizzle(const Swizzle& s, ByteCode::Register result) {  // Writes the swizzled components of s.fBase into consecutive 'result' registers.
+    if (swizzle_is_simple(s)) {  // NOTE(review): presumably a swizzle that can be addressed as a location; confirm against swizzle_is_simple.
+        this->writeVariableExpression(s, result);
+        return;
+    }
+    ByteCode::Register base = this->writeExpression(*s.fBase);  // Evaluate the whole base into fresh registers.
+    for (int i = 0; i < (int) s.fComponents.size(); ++i) {
+        this->write(ByteCode::Instruction::kCopy);
+        this->write(result + i);
+        this->write(base + s.fComponents[i]);  // The component value is used as an offset into the base registers.
+    }
+}
+
+void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t,
+                                               ByteCode::Register result) {  // Evaluates both arms between mask push/pop, then emits one kSelect per slot.
+    int count = SlotCount(t.fType);
+    SkASSERT(count == SlotCount(t.fIfTrue->fType));
+    SkASSERT(count == SlotCount(t.fIfFalse->fType));
+
+    ByteCode::Register test = this->writeExpression(*t.fTest);
+    this->write(ByteCode::Instruction::kMaskPush);
+    ++fConditionCount;  // writeReturn reports an error while this counter is non-zero.
+    this->write(test);  // Operand of kMaskPush.
+    ByteCode::Register ifTrue = this->writeExpression(*t.fIfTrue);
+    this->write(ByteCode::Instruction::kMaskNegate);
+    ByteCode::Register ifFalse = this->writeExpression(*t.fIfFalse);
+    --fConditionCount;
+    this->write(ByteCode::Instruction::kMaskPop);
+    for (int i = 0; i < count; ++i) {  // kSelect operands: destination, test, true value, false value.
+        this->write(ByteCode::Instruction::kSelect);
+        this->write(result + i);
+        this->write(test);
+        this->write(ifTrue + i);
+        this->write(ifFalse + i);
+    }
+}
+
+void ByteCodeGenerator::writeVariableExpression(const Expression& expr,
+                                                ByteCode::Register result) {  // Loads each slot of the location named by 'expr' into consecutive 'result' registers.
+    ByteCodeGenerator::Location location = this->getLocation(expr);
+    int count = SlotCount(expr.fType);
+    for (int i = 0; i < count; ++i) {
+        ByteCodeGenerator::Location final = location.offset(*this, i);  // NOTE(review): offset() receives the generator, so it may emit code itself; keep it ahead of the load opcode.
+        this->write(this->getLoadInstruction(location, this->getStorage(expr)));  // Opcode is chosen from the base location and the expression's storage.
+        this->write(result + i);
+        this->write(final);
+    }
+}
+
+void ByteCodeGenerator::writeExpression(const Expression& expr, ByteCode::Register result) {  // Dispatches on expr.fKind and emits code that leaves the value in 'result'.
+    switch (expr.fKind) {
+        case Expression::kBoolLiteral_Kind: {
+            this->write(ByteCode::Instruction::kImmediate);
+            this->write(result);
+            this->write(ByteCode::Immediate((int32_t) (((BoolLiteral&) expr).fValue ? -1 : 0)));  // true is encoded as -1, false as 0.
+            break;
+        }
+        case Expression::kBinary_Kind: {
+            this->writeBinaryExpression((BinaryExpression&) expr, result);
+            break;
+        }
+        case Expression::kConstructor_Kind: {
+            this->writeConstructor((Constructor&) expr, result);
+            break;
+        }
+        case Expression::kExternalFunctionCall_Kind:
+            this->writeExternalFunctionCall((ExternalFunctionCall&) expr, result);
+            break;
+        case Expression::kExternalValue_Kind:
+            this->writeExternalValue((ExternalValueReference&) expr, result);
+            break;
+        case Expression::kFloatLiteral_Kind: {
+            this->write(ByteCode::Instruction::kImmediate);
+            this->write(result);
+            this->write(ByteCode::Immediate((float) ((FloatLiteral&) expr).fValue));  // Literal is narrowed to float.
+            break;
+        }
+        case Expression::kFunctionCall_Kind: {
+            this->writeFunctionCall((FunctionCall&) expr, result);
+            break;
+        }
+        case Expression::kIntLiteral_Kind: {
+            this->write(ByteCode::Instruction::kImmediate);
+            this->write(result);
+            this->write(ByteCode::Immediate((int32_t) ((IntLiteral&) expr).fValue));  // Literal is narrowed to int32_t.
+            break;
+        }
+        case Expression::kPostfix_Kind:
+            this->writePostfixExpression((PostfixExpression&) expr, result);
+            break;
+        case Expression::kPrefix_Kind:
+            this->writePrefixExpression((PrefixExpression&) expr, result);
+            break;
+        case Expression::kSwizzle_Kind:
+            this->writeSwizzle((Swizzle&) expr, result);
+            break;
         case Expression::kTernary_Kind:
+            this->writeTernaryExpression((TernaryExpression&) expr, result);
+            break;
+        case Expression::kFieldAccess_Kind:
+        case Expression::kIndex_Kind:
+        case Expression::kVariableReference_Kind:
+            this->writeVariableExpression(expr, result);  // All three kinds are read through a location.
+            break;
         default:
 #ifdef SK_DEBUG
-            ABORT("unsupported lvalue %s\n", e.description().c_str());
+            ABORT("unsupported expression %s\n", expr.description().c_str());
 #endif
-            return nullptr;
+            break;  // Outside SK_DEBUG, unsupported kinds emit nothing.
     }
 }
 
+ByteCode::Register ByteCodeGenerator::writeExpression(const Expression& expr) {  // Convenience overload: obtains registers for the value, emits it, returns the first register.
+    ByteCode::Register result = this->next(SlotCount(expr.fType));  // Sized by the expression's slot count.
+    this->writeExpression(expr, result);
+    return result;
+}
+
 void ByteCodeGenerator::writeBlock(const Block& b) {
     for (const auto& s : b.fStatements) {
         this->writeStatement(*s);
     }
 }
 
-void ByteCodeGenerator::setBreakTargets() {
-    std::vector<DeferredLocation>& breaks = fBreakTargets.top();
-    for (DeferredLocation& b : breaks) {
-        b.set();
-    }
-    fBreakTargets.pop();
-}
-
-void ByteCodeGenerator::setContinueTargets() {
-    std::vector<DeferredLocation>& continues = fContinueTargets.top();
-    for (DeferredLocation& c : continues) {
-        c.set();
-    }
-    fContinueTargets.pop();
-}
-
-void ByteCodeGenerator::writeBreakStatement(const BreakStatement& b) {
-    // TODO: Include BranchIfAllFalse to top-most LoopNext
-    this->write(ByteCodeInstruction::kLoopBreak);
-}
-
-void ByteCodeGenerator::writeContinueStatement(const ContinueStatement& c) {
-    // TODO: Include BranchIfAllFalse to top-most LoopNext
-    this->write(ByteCodeInstruction::kLoopContinue);
-}
-
 void ByteCodeGenerator::writeDoStatement(const DoStatement& d) {
-    this->write(ByteCodeInstruction::kLoopBegin);
-    size_t start = fCode->size();
+    this->write(ByteCode::Instruction::kLoopBegin);
+    ++fConditionCount;  // writeReturn reports an error while this counter is non-zero.
+    SkASSERT(fCode->size() < ByteCode::kPointerMax);  // Checked before the offset is narrowed to 16 bits.
+    ByteCode::Pointer start{(uint16_t) fCode->size()};  // Back-edge target: the first instruction of the body.
     this->writeStatement(*d.fStatement);
-    this->write(ByteCodeInstruction::kLoopNext);
-    this->writeExpression(*d.fTest);
-    this->write(ByteCodeInstruction::kLoopMask);
-    // TODO: Could shorten this with kBranchIfAnyTrue
-    this->write(ByteCodeInstruction::kBranchIfAllFalse);
+    ByteCode::Register test = this->writeExpression(*d.fTest);  // The test is evaluated after the body and before kLoopNext.
+    this->write(ByteCode::Instruction::kLoopNext);
+    this->write(ByteCode::Instruction::kLoopMask);
+    this->write(test);  // Operand of kLoopMask.
+    this->write(ByteCode::Instruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
-    this->write(ByteCodeInstruction::kBranch);
-    this->write16(start);
+    this->write(ByteCode::Instruction::kBranch);  // Jump back to 'start'.
+    this->write(start);
     endLocation.set();
-    this->write(ByteCodeInstruction::kLoopEnd);
+    --fConditionCount;
+    this->write(ByteCode::Instruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeForStatement(const ForStatement& f) {
-    fContinueTargets.emplace();
-    fBreakTargets.emplace();
     if (f.fInitializer) {
         this->writeStatement(*f.fInitializer);
     }
-    this->write(ByteCodeInstruction::kLoopBegin);
-    size_t start = fCode->size();
+    this->write(ByteCode::Instruction::kLoopBegin);
+    ++fConditionCount;  // writeReturn reports an error while this counter is non-zero.
+    ByteCode::Pointer start{(uint16_t) fCode->size()};  // NOTE(review): the do/while versions SkASSERT size < ByteCode::kPointerMax before this narrowing; this one does not.
     if (f.fTest) {
-        this->writeExpression(*f.fTest);
-        this->write(ByteCodeInstruction::kLoopMask);
+        ByteCode::Register test = this->writeExpression(*f.fTest);
+        this->write(ByteCode::Instruction::kLoopMask);
+        this->write(test);  // Operand of kLoopMask.
     }
-    this->write(ByteCodeInstruction::kBranchIfAllFalse);
+    this->write(ByteCode::Instruction::kBranchIfAllFalse);  // Written even when there is no test expression.
     DeferredLocation endLocation(this);
     this->writeStatement(*f.fStatement);
-    this->write(ByteCodeInstruction::kLoopNext);
+    this->write(ByteCode::Instruction::kLoopNext);
     if (f.fNext) {
-        this->writeExpression(*f.fNext, true);
+        this->writeExpression(*f.fNext);  // The returned register is not used.
     }
-    this->write(ByteCodeInstruction::kBranch);
-    this->write16(start);
+    this->write(ByteCode::Instruction::kBranch);  // Jump back to 'start', where the test is re-evaluated.
+    this->write(start);
     endLocation.set();
-    this->write(ByteCodeInstruction::kLoopEnd);
+    --fConditionCount;
+    this->write(ByteCode::Instruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeIfStatement(const IfStatement& i) {
-    this->writeExpression(*i.fTest);
-    this->write(ByteCodeInstruction::kMaskPush);
-    this->write(ByteCodeInstruction::kBranchIfAllFalse);
+    ByteCode::Register test = this->writeExpression(*i.fTest);
+    this->write(ByteCode::Instruction::kMaskPush);
+    ++fConditionCount;  // writeReturn reports an error while this counter is non-zero.
+    this->write(test);  // Operand of kMaskPush.
+    this->write(ByteCode::Instruction::kBranchIfAllFalse);  // Its target is filled in by falseLocation.set() after the true branch.
     DeferredLocation falseLocation(this);
     this->writeStatement(*i.fIfTrue);
     falseLocation.set();
     if (i.fIfFalse) {
-        this->write(ByteCodeInstruction::kMaskNegate);
-        this->write(ByteCodeInstruction::kBranchIfAllFalse);
+        this->write(ByteCode::Instruction::kMaskNegate);
+        this->write(ByteCode::Instruction::kBranchIfAllFalse);  // Its target is filled in by endLocation.set() after the else branch.
         DeferredLocation endLocation(this);
         this->writeStatement(*i.fIfFalse);
         endLocation.set();
     }
-    this->write(ByteCodeInstruction::kMaskPop);
+    --fConditionCount;
+    this->write(ByteCode::Instruction::kMaskPop);
 }
 
-void ByteCodeGenerator::writeReturnStatement(const ReturnStatement& r) {
-    if (fLoopCount || fConditionCount) {
+void ByteCodeGenerator::writeReturn(const ReturnStatement& r) {  // Emits kReturnValue <register> or a bare kReturn.
+    if (fConditionCount) {  // Non-zero while inside an if, ternary or loop being emitted.
         fErrors.error(r.fOffset, "return not allowed inside conditional or loop");
         return;
     }
-    int count = SlotCount(r.fExpression->fType);
-    this->writeExpression(*r.fExpression);
-
-    // Technically, the kReturn also pops fOutput->fLocalCount values from the stack, too, but we
-    // haven't counted pushing those (they're outside the scope of our stack tracking). Instead,
-    // we account for those in writeFunction().
-
-    // This is all fine because we don't allow conditional returns, so we only return once anyway.
-    this->write(ByteCodeInstruction::kReturn, -count);
-    this->write8(count);
-}
-
-void ByteCodeGenerator::writeSwitchStatement(const SwitchStatement& r) {
-    // not yet implemented
-    abort();
+    if (r.fExpression) {
+        ByteCode::Register value = this->writeExpression(*r.fExpression);
+        this->write(ByteCode::Instruction::kReturnValue);
+        this->write(value);  // Only the first register is written; no slot count follows.
+    }
+    else {
+        this->write(ByteCode::Instruction::kReturn);  // Return statement without an expression.
+    }
 }
 
 void ByteCodeGenerator::writeVarDeclarations(const VarDeclarations& v) {
     for (const auto& declStatement : v.fVars) {
         const VarDeclaration& decl = (VarDeclaration&) *declStatement;
-        // we need to grab the location even if we don't use it, to ensure it has been allocated
-        Location location = this->getLocation(*decl.fVar);
+        // we need to grab the location even if we don't use it, to ensure it
+        // has been allocated
+        ByteCodeGenerator::Location location = this->getLocation(*decl.fVar);
         if (decl.fValue) {
-            this->writeExpression(*decl.fValue);
-            int count = SlotCount(decl.fValue->fType);
-            if (count > 4) {
-                this->write(ByteCodeInstruction::kPushImmediate);
-                this->write32(location.fSlot);
-                this->write(ByteCodeInstruction::kStoreExtended, count);
-                this->write8(count);
-            } else {
-                this->write(vector_instruction(ByteCodeInstruction::kStore, count));
-                this->write8(location.fSlot);
+            ByteCode::Register src = this->writeExpression(*decl.fValue);  // Initializer value, one register per slot.
+            for (int i = 0; i < SlotCount(decl.fVar->fType); ++i) {  // Iterates over the variable's slots, not the initializer's.
+                ByteCodeGenerator::Location final = location.offset(*this, i);
+                this->write(ByteCode::Instruction::kStoreStackDirect);  // NOTE(review): presumably valid because declared variables live in stack slots; confirm getLocation's result kind.
+                this->write(final);
+                this->write(src + i);
             }
         }
     }
 }
 
 void ByteCodeGenerator::writeWhileStatement(const WhileStatement& w) {
-    this->write(ByteCodeInstruction::kLoopBegin);
-    size_t cond = fCode->size();
-    this->writeExpression(*w.fTest);
-    this->write(ByteCodeInstruction::kLoopMask);
-    this->write(ByteCodeInstruction::kBranchIfAllFalse);
+    this->write(ByteCode::Instruction::kLoopBegin);
+    ++fConditionCount;  // writeReturn reports an error while this counter is non-zero.
+    SkASSERT(fCode->size() < ByteCode::kPointerMax);  // Checked before the offset is narrowed to 16 bits.
+    ByteCode::Pointer start{(uint16_t) fCode->size()};  // Back-edge target: the test is re-evaluated on every iteration.
+    ByteCode::Register test = this->writeExpression(*w.fTest);
+    this->write(ByteCode::Instruction::kLoopMask);
+    this->write(test);  // Operand of kLoopMask.
+    this->write(ByteCode::Instruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
     this->writeStatement(*w.fStatement);
-    this->write(ByteCodeInstruction::kLoopNext);
-    this->write(ByteCodeInstruction::kBranch);
-    this->write16(cond);
+    this->write(ByteCode::Instruction::kLoopNext);
+    this->write(ByteCode::Instruction::kBranch);  // Jump back to 'start'.
+    this->write(start);
     endLocation.set();
-    this->write(ByteCodeInstruction::kLoopEnd);
+    --fConditionCount;
+    this->write(ByteCode::Instruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeStatement(const Statement& s) {
@@ -1631,19 +1338,16 @@
             this->writeBlock((Block&) s);
             break;
         case Statement::kBreak_Kind:
-            this->writeBreakStatement((BreakStatement&) s);
+            this->write(ByteCode::Instruction::kBreak);
             break;
         case Statement::kContinue_Kind:
-            this->writeContinueStatement((ContinueStatement&) s);
+            this->write(ByteCode::Instruction::kContinue);
             break;
-        case Statement::kDiscard_Kind:
-            // not yet implemented
-            abort();
         case Statement::kDo_Kind:
             this->writeDoStatement((DoStatement&) s);
             break;
         case Statement::kExpression_Kind:
-            this->writeExpression(*((ExpressionStatement&) s).fExpression, true);
+            this->writeExpression(*((ExpressionStatement&) s).fExpression);
             break;
         case Statement::kFor_Kind:
             this->writeForStatement((ForStatement&) s);
@@ -1654,10 +1358,7 @@
         case Statement::kNop_Kind:
             break;
         case Statement::kReturn_Kind:
-            this->writeReturnStatement((ReturnStatement&) s);
-            break;
-        case Statement::kSwitch_Kind:
-            this->writeSwitchStatement((SwitchStatement&) s);
+            this->writeReturn((ReturnStatement&) s);
             break;
         case Statement::kVarDeclarations_Kind:
             this->writeVarDeclarations(*((VarDeclarationsStatement&) s).fDeclaration);
@@ -1666,18 +1367,80 @@
             this->writeWhileStatement((WhileStatement&) s);
             break;
         default:
-            SkASSERT(false);
+            ABORT("unsupported statement\n");
     }
 }
 
-ByteCodeFunction::ByteCodeFunction(const FunctionDeclaration* declaration)
-        : fName(declaration->fName) {
+void ByteCodeGenerator::writeFunction(const FunctionDefinition& f) {  // Builds a ByteCodeFunction for 'f' and appends it to fOutput->fFunctions.
+    fFunction = &f;
+    std::unique_ptr<ByteCodeFunction> result(new ByteCodeFunction(&f.fDeclaration));
+    result->fReturnSlotCount = SlotCount(f.fDeclaration.fReturnType);
     fParameterCount = 0;
-    for (const auto& p : declaration->fParameters) {
-        int slots = ByteCodeGenerator::SlotCount(p->fType);
-        fParameters.push_back({ slots, (bool)(p->fModifiers.fFlags & Modifiers::kOut_Flag) });
-        fParameterCount += slots;
+    fConditionCount = 0;  // Reset per function; asserted to be zero again at the end.
+    for (const auto& p : f.fDeclaration.fParameters) {
+        int count = SlotCount(p->fType);
+        bool isOut = ((p->fModifiers.fFlags & Modifiers::kOut_Flag) != 0);
+        result->fParameters.push_back(ByteCodeFunction::Parameter{count, isOut});
+        fParameterCount += count;  // Running total of parameter slots.
+    }
+    result->fParameterSlotCount = fParameterCount;
+    fCode = &result->fCode;  // Every write() from here on appends to this function's code.
+    this->writeStatement(*f.fBody);
+    result->fStackSlotCount = fLocals.size();  // Read after the body has been emitted.
+    if (f.fDeclaration.fReturnType.fName == "void") {
+        this->write(ByteCode::Instruction::kReturn);  // Trailing return for void functions.
+    } else {
+        this->write(ByteCode::Instruction::kAbort);  // NOTE(review): presumably reached only if a non-void body ends without returning; confirm kAbort semantics.
+    }
+    fOutput->fFunctions.push_back(std::move(result));
+    SkASSERT(fConditionCount == 0);  // Every increment made while emitting the body must have been matched.
+}
+
+void ByteCodeGenerator::gatherUniforms(const Type& type, const String& name) {  // Recursively flattens 'type' into fOutput->fUniforms entries named from 'name'.
+    if (type.kind() == Type::kOther_Kind) {  // kOther_Kind types are skipped entirely.
+        return;
+    } else if (type.kind() == Type::kStruct_Kind) {
+        for (const auto& f : type.fields()) {
+            this->gatherUniforms(*f.fType, name + "." + f.fName);  // Fields are named "<name>.<field>".
+        }
+    } else if (type.kind() == Type::kArray_Kind) {
+        for (int i = 0; i < type.columns(); ++i) {  // columns() is used as the element count here.
+            this->gatherUniforms(type.componentType(), String::printf("%s[%d]", name.c_str(), i));
+        }
+    } else {
+        fOutput->fUniforms.push_back({ name, type_category(type), type.rows(), type.columns(),
+                                       fOutput->fUniformSlotCount });  // The last field is the slot count before this uniform is added.
+        fOutput->fUniformSlotCount += type.columns() * type.rows();
     }
 }
 
+bool ByteCodeGenerator::generateCode() {  // Pass 1 sizes globals and gathers uniforms; pass 2 emits every function. Returns true when no errors were reported.
+    fOutput->fGlobalSlotCount = 0;
+    fOutput->fUniformSlotCount = 0;
+    for (const auto& pe : fProgram) {
+        if (pe.fKind == ProgramElement::kVar_Kind) {
+            VarDeclarations& decl = (VarDeclarations&) pe;
+            for (const auto& v : decl.fVars) {
+                const Variable* declVar = ((VarDeclaration&) *v).fVar;
+                if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {  // Builtins and variables matched by is_in() get no global or uniform slots.
+                    continue;
+                }
+                if (is_uniform(*declVar)) {
+                    this->gatherUniforms(declVar->fType, declVar->fName);
+                } else {
+                    fOutput->fGlobalSlotCount += SlotCount(declVar->fType);
+                }
+            }
+        }
+    }
+    for (const auto& pe : fProgram) {
+        if (pe.fKind == ProgramElement::kFunction_Kind) {
+            FunctionDefinition& f = (FunctionDefinition&) pe;
+            fFunctions.push_back(&f);  // Registered before emission; writeFunctionCall searches fFunctions for its callee.
+            this->writeFunction(f);
+        }
+    }
+    return fErrors.errorCount() == 0;
 }
+
+} // namespace
diff --git a/src/sksl/SkSLByteCodeGenerator.h b/src/sksl/SkSLByteCodeGenerator.h
index 4e3accd..ab232c4 100644
--- a/src/sksl/SkSLByteCodeGenerator.h
+++ b/src/sksl/SkSLByteCodeGenerator.h
@@ -54,95 +54,19 @@
 
 class ByteCodeGenerator : public CodeGenerator {
 public:
-    class LValue {
-    public:
-        LValue(ByteCodeGenerator& generator)
-            : fGenerator(generator) {}
-
-        virtual ~LValue() {}
-
-        /**
-         * Stack before call: ... lvalue
-         * Stack after call: ... lvalue load
-         */
-        virtual void load() = 0;
-
-        /**
-         * Stack before call: ... lvalue value
-         * Stack after call: ...
-         */
-        virtual void store(bool discard) = 0;
-
-    protected:
-        ByteCodeGenerator& fGenerator;
-    };
-
-    ByteCodeGenerator(const Context* context, const Program* program, ErrorReporter* errors,
-                      ByteCode* output);
+    ByteCodeGenerator(const Program* program, ErrorReporter* errors, ByteCode* output);
 
     bool generateCode() override;
 
-    void write8(uint8_t b);
-
-    void write16(uint16_t b);
-
-    void write32(uint32_t b);
-
-    void write(ByteCodeInstruction inst, int count = kUnusedStackCount);
-
-    /**
-     * Based on 'type', writes the s (signed), u (unsigned), or f (float) instruction.
-     */
-    void writeTypedInstruction(const Type& type, ByteCodeInstruction s, ByteCodeInstruction u,
-                               ByteCodeInstruction f, int count, bool writeCount = true);
-
-    static int SlotCount(const Type& type);
-
 private:
-    static constexpr int kUnusedStackCount = INT32_MAX;
-    static int StackUsage(ByteCodeInstruction, int count);
-
-    // reserves 16 bits in the output code, to be filled in later with an address once we determine
-    // it
-    class DeferredLocation {
-    public:
-        DeferredLocation(ByteCodeGenerator* generator)
-            : fGenerator(*generator)
-            , fOffset(generator->fCode->size()) {
-            generator->write16(0);
-        }
-
-#ifdef SK_DEBUG
-        ~DeferredLocation() {
-            SkASSERT(fSet);
-        }
-#endif
-
-        void set() {
-            int target = fGenerator.fCode->size();
-            SkASSERT(target <= 65535);
-            (*fGenerator.fCode)[fOffset] = target;
-            (*fGenerator.fCode)[fOffset + 1] = target >> 8;
-#ifdef SK_DEBUG
-            fSet = true;
-#endif
-        }
-
-    private:
-        ByteCodeGenerator& fGenerator;
-        size_t fOffset;
-#ifdef SK_DEBUG
-        bool fSet = false;
-#endif
-    };
-
     // Intrinsics which do not simply map to a single opcode
     enum class SpecialIntrinsic {
         kDot,
+        kInverse,
     };
 
     struct Intrinsic {
-        Intrinsic(ByteCodeInstruction instruction)
+        Intrinsic(ByteCode::Instruction instruction)
             : fIsSpecial(false)
             , fValue(instruction) {}
 
@@ -153,201 +77,250 @@
         bool fIsSpecial;
 
         union Value {
-            Value(ByteCodeInstruction instruction)
+            Value(ByteCode::Instruction instruction)
                 : fInstruction(instruction) {}
 
             Value(SpecialIntrinsic special)
                 : fSpecial(special) {}
 
-            ByteCodeInstruction fInstruction;
+            ByteCode::Instruction fInstruction;
             SpecialIntrinsic fSpecial;
         } fValue;
     };
 
+    class LValue {
+    public:
+        LValue(ByteCodeGenerator& generator)
+            : fGenerator(generator) {}
 
-    // Similar to Variable::Storage, but locals and parameters are grouped together, and globals
-    // are further subidivided into uniforms and other (writable) globals.
-    enum class Storage {
-        kLocal,    // include parameters
-        kGlobal,   // non-uniform globals
-        kUniform,  // uniform globals
+        virtual ~LValue() {}
+
+        virtual void load(ByteCode::Register result) = 0;
+
+        virtual void store(ByteCode::Register src) = 0;
+
+    protected:
+        ByteCodeGenerator& fGenerator;
     };
 
     struct Location {
-        int     fSlot;
-        Storage fStorage;
+        enum {
+            kPointer_Kind,
+            kRegister_Kind
+        } fKind;
 
-        // Not really invalid, but a "safe" placeholder to be more explicit at call-sites
-        static Location MakeInvalid() { return { 0, Storage::kLocal }; }
+        union {
+            ByteCode::Pointer fPointer;
+            ByteCode::Register fRegister;
+        };
 
-        Location makeOnStack() { return { -1, fStorage }; }
-        bool isOnStack() const { return fSlot < 0; }
+        Location(ByteCode::Pointer p)
+            : fKind(kPointer_Kind)
+            , fPointer(p) {}
 
-        Location operator+(int offset) {
-            SkASSERT(fSlot >= 0);
-            return { fSlot + offset, fStorage };
+        Location(ByteCode::Register r)
+            : fKind(kRegister_Kind)
+            , fRegister(r) {}
+
+        /**
+         * Returns this location offset by 'offset' bytes. For pointers, this is a compile-time
+         * operation, while for registers there will be CPU instructions output to handle the
+         * runtime calculation of the address.
+         */
+        Location offset(ByteCodeGenerator& generator, int offset) {
+            if (!offset) {
+                return *this;
+            }
+            if (fKind == kPointer_Kind) {
+                return Location(fPointer + offset);
+            }
+            ByteCode::Register a = generator.next(1);
+            generator.write(ByteCode::Instruction::kImmediate);
+            generator.write(a);
+            generator.write(ByteCode::Immediate{offset});
+            ByteCode::Register result = generator.next(1);
+            generator.write(ByteCode::Instruction::kAddI);
+            generator.write(result);
+            generator.write(fRegister);
+            generator.write(a);
+            return result;
         }
 
-        ByteCodeInstruction selectLoad(ByteCodeInstruction local,
-                                       ByteCodeInstruction global,
-                                       ByteCodeInstruction uniform) const {
-            switch (fStorage) {
-                case Storage::kLocal:   return local;
-                case Storage::kGlobal:  return global;
-                case Storage::kUniform: return uniform;
+        /**
+         * Returns this location offset by the number of bytes stored in the 'offset' register. This
+         * will output the necessary CPU instructions to perform the math and return a new register
+         * location.
+         */
+        Location offset(ByteCodeGenerator& generator, ByteCode::Register offset) {
+            ByteCode::Register current;
+            switch (fKind) {
+                case kPointer_Kind:
+                    current = generator.next(1);
+                    generator.write(ByteCode::Instruction::kImmediate);
+                    generator.write(current);
+                    generator.write(ByteCode::Immediate{fPointer.fAddress});
+                    break;
+                case kRegister_Kind:
+                    current = fRegister;
             }
-            SkUNREACHABLE;
-        }
-
-        ByteCodeInstruction selectStore(ByteCodeInstruction local,
-                                        ByteCodeInstruction global) const {
-            switch (fStorage) {
-                case Storage::kLocal:   return local;
-                case Storage::kGlobal:  return global;
-                case Storage::kUniform: ABORT("Trying to store to a uniform"); break;
-            }
-            return local;
+            ByteCode::Register result = generator.next(1);
+            generator.write(ByteCode::Instruction::kAddI);
+            generator.write(result);
+            generator.write(current);
+            generator.write(offset);
+            return result;
         }
     };
 
+    // Reserves 16 bits in the output code, initially holding the placeholder 65535. set() later
+    // replaces the placeholder with the code size at the time of the call (the current address).
+    class DeferredLocation {
+    public:
+        explicit DeferredLocation(ByteCodeGenerator* generator)
+            : fGenerator(*generator)
+            , fOffset(generator->fCode->size()) {  // byte offset at which the placeholder lands
+            generator->write(ByteCode::Pointer{65535});
+        }
+
+        void set() {
+            SkASSERT(fGenerator.fCode->size() <= ByteCode::kPointerMax);
+            static_assert(sizeof(ByteCode::Pointer) == 2,
+                          "failed assumption that ByteCode::Pointer is uint16_t");
+            void* dst = &(*fGenerator.fCode)[fOffset];
+            // ensure that the placeholder value 65535 hasn't been modified yet
+            SkASSERT(((uint8_t*) dst)[0] == 255 && ((uint8_t*) dst)[1] == 255);
+            ByteCode::Pointer target{(uint16_t) fGenerator.fCode->size()};
+            memcpy(dst, &target, sizeof(target));  // byte copy: fOffset need not be aligned
+        }
+
+    private:
+        ByteCodeGenerator& fGenerator;
+        size_t fOffset;
+    };
+
+    // Appends the object representation of 'value' (sizeof(T) bytes) to the end of fCode.
+    template<typename T>
+    void write(T value) {
+        const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&value);
+        fCode->insert(fCode->end(), bytes, bytes + sizeof(value));
+    }
+
+    ByteCode::Register next(int slotCount);
+
     /**
-     * Returns the local slot into which var should be stored, allocating a new slot if it has not
-     * already been assigned one. Compound variables (e.g. vectors) will consume more than one local
-     * slot, with the getLocation return value indicating where the first element should be stored.
+     * Based on 'type', writes the s (signed), u (unsigned), or f (float) instruction.
      */
+    void writeTypedInstruction(const Type& type, ByteCode::Instruction s, ByteCode::Instruction u,
+                               ByteCode::Instruction f);
+
+    ByteCode::Instruction getLoadInstruction(Location location, Variable::Storage storage);
+
+    ByteCode::Instruction getStoreInstruction(Location location, Variable::Storage storage);
+
+    static int SlotCount(const Type& type);
+
     Location getLocation(const Variable& var);
 
-    /**
-     * As above, but computes the (possibly dynamic) address of an expression involving indexing &
-     * field access. If the address is known, it's returned. If not, -1 is returned, and the
-     * location will be left on the top of the stack.
-     */
     Location getLocation(const Expression& expr);
 
-    void gatherUniforms(const Type& type, const String& name);
+    Variable::Storage getStorage(const Expression& expr);
 
-    std::unique_ptr<ByteCodeFunction> writeFunction(const FunctionDefinition& f);
-
-    void writeVarDeclarations(const VarDeclarations& decl);
-
-    void writeVariableExpression(const Expression& expr);
-
-    void writeExpression(const Expression& expr, bool discard = false);
-
-    /**
-     * Pushes whatever values are required by the lvalue onto the stack, and returns an LValue
-     * permitting loads and stores to it.
-     */
     std::unique_ptr<LValue> getLValue(const Expression& expr);
 
-    void writeIntrinsicCall(const FunctionCall& c);
+    void writeFunction(const FunctionDefinition& f);
 
-    void writeFunctionCall(const FunctionCall& c);
+    // For compound values, the result argument specifies the first component. Subsequent components
+    // will be in subsequent registers.
 
-    void writeConstructor(const Constructor& c);
+    void writeBinaryInstruction(const Type& operandType, ByteCode::Register left,
+                                ByteCode::Register right, ByteCode::Instruction s,
+                                ByteCode::Instruction u, ByteCode::Instruction f,
+                                ByteCode::Register result);
 
-    void writeExternalFunctionCall(const ExternalFunctionCall& c);
+    void writeBinaryExpression(const BinaryExpression& expr, ByteCode::Register result);
 
-    void writeExternalValue(const ExternalValueReference& r);
+    void writeConstructor(const Constructor& c, ByteCode::Register result);
 
-    void writeSwizzle(const Swizzle& swizzle);
+    void writeExternalFunctionCall(const ExternalFunctionCall& f, ByteCode::Register result);
 
-    bool writeBinaryExpression(const BinaryExpression& b, bool discard);
+    void writeExternalValue(const ExternalValueReference& e, ByteCode::Register result);
 
-    void writeTernaryExpression(const TernaryExpression& t);
+    void writeIntrinsicCall(const FunctionCall& c, Intrinsic intrinsic, ByteCode::Register result);
 
-    void writeNullLiteral(const NullLiteral& n);
+    void writeFunctionCall(const FunctionCall& c, ByteCode::Register result);
 
-    bool writePrefixExpression(const PrefixExpression& p, bool discard);
+    void incOrDec(Token::Kind op, Expression& operand, bool prefix, ByteCode::Register result);
 
-    bool writePostfixExpression(const PostfixExpression& p, bool discard);
+    void writePostfixExpression(const PostfixExpression& p, ByteCode::Register result);
 
-    void writeBoolLiteral(const BoolLiteral& b);
+    void writePrefixExpression(const PrefixExpression& p, ByteCode::Register result);
 
-    void writeIntLiteral(const IntLiteral& i);
+    void writeSwizzle(const Swizzle& s, ByteCode::Register result);
 
-    void writeFloatLiteral(const FloatLiteral& f);
+    void writeTernaryExpression(const TernaryExpression& t, ByteCode::Register result);
 
-    void writeStatement(const Statement& s);
+    void writeVariableExpression(const Expression& e, ByteCode::Register result);
+
+    void writeExpression(const Expression& expr, ByteCode::Register result);
+
+    ByteCode::Register writeExpression(const Expression& expr);
 
     void writeBlock(const Block& b);
 
-    void writeBreakStatement(const BreakStatement& b);
-
-    void writeContinueStatement(const ContinueStatement& c);
-
-    void writeIfStatement(const IfStatement& stmt);
+    void writeDoStatement(const DoStatement& d);
 
     void writeForStatement(const ForStatement& f);
 
+    void writeIfStatement(const IfStatement& i);
+
+    void writeReturn(const ReturnStatement& r);
+
+    void writeVarDeclarations(const VarDeclarations& v);
+
     void writeWhileStatement(const WhileStatement& w);
 
-    void writeDoStatement(const DoStatement& d);
+    void writeStatement(const Statement& s);
 
-    void writeSwitchStatement(const SwitchStatement& s);
-
-    void writeReturnStatement(const ReturnStatement& r);
-
-    // updates the current set of breaks to branch to the current location
-    void setBreakTargets();
-
-    // updates the current set of continues to branch to the current location
-    void setContinueTargets();
-
-    void enterLoop() {
-        fLoopCount++;
-        fMaxLoopCount = std::max(fMaxLoopCount, fLoopCount);
-    }
-
-    void exitLoop() {
-        SkASSERT(fLoopCount > 0);
-        fLoopCount--;
-    }
-
-    void enterCondition() {
-        fConditionCount++;
-        fMaxConditionCount = std::max(fMaxConditionCount, fConditionCount);
-    }
-
-    void exitCondition() {
-        SkASSERT(fConditionCount > 0);
-        fConditionCount--;
-    }
-
-    const Context& fContext;
+    void gatherUniforms(const Type& type, const String& name);
 
     ByteCode* fOutput;
 
+    int fNextRegister = 0;
+
     const FunctionDefinition* fFunction;
 
+    std::vector<const FunctionDefinition*> fFunctions;
+
     std::vector<uint8_t>* fCode;
 
     std::vector<const Variable*> fLocals;
 
-    std::stack<std::vector<DeferredLocation>> fContinueTargets;
-
-    std::stack<std::vector<DeferredLocation>> fBreakTargets;
-
-    std::vector<const FunctionDefinition*> fFunctions;
-
     int fParameterCount;
-    int fStackCount;
-    int fMaxStackCount;
 
-    int fLoopCount;
-    int fMaxLoopCount;
     int fConditionCount;
-    int fMaxConditionCount;
 
     const std::unordered_map<String, Intrinsic> fIntrinsics;
 
     friend class DeferredLocation;
-    friend class ByteCodeExpressionLValue;
+    friend class ByteCodeExternalValueLValue;
+    friend class ByteCodeSimpleLValue;
     friend class ByteCodeSwizzleLValue;
 
     typedef CodeGenerator INHERITED;
 };
 
+// Specialization for Location: serializes whichever member of its union is active, i.e. the
+// Pointer for kPointer_Kind or the Register for kRegister_Kind. Any other kind (there are none
+// today) writes nothing.
+template<>
+inline void ByteCodeGenerator::write(ByteCodeGenerator::Location loc) {
+    if (loc.fKind == Location::kPointer_Kind) {
+        this->write(loc.fPointer);
+    } else if (loc.fKind == Location::kRegister_Kind) {
+        this->write(loc.fRegister);
+    }
+}
+
 }
 
 #endif
diff --git a/src/sksl/SkSLCompiler.cpp b/src/sksl/SkSLCompiler.cpp
index 7bfdce1..6e84b98 100644
--- a/src/sksl/SkSLCompiler.cpp
+++ b/src/sksl/SkSLCompiler.cpp
@@ -77,14 +77,17 @@
 namespace SkSL {
 
 static void grab_intrinsics(std::vector<std::unique_ptr<ProgramElement>>* src,
-               std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>>* target) {
-    for (auto& element : *src) {
+               std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>>* target) {
+    for (auto iter = src->begin(); iter != src->end(); ) {
+        std::unique_ptr<ProgramElement>& element = *iter;
         switch (element->fKind) {
             case ProgramElement::kFunction_Kind: {
                 FunctionDefinition& f = (FunctionDefinition&) *element;
-                StringFragment name = f.fDeclaration.fName;
-                SkASSERT(target->find(name) == target->end());
-                (*target)[name] = std::make_pair(std::move(element), false);
+                SkASSERT(f.fDeclaration.fBuiltin);
+                String key = f.fDeclaration.declaration();
+                SkASSERT(target->find(key) == target->end());
+                (*target)[key] = std::make_pair(std::move(element), false);
+                iter = src->erase(iter);
                 break;
             }
             case ProgramElement::kEnum_Kind: {
@@ -92,6 +95,7 @@
                 StringFragment name = e.fTypeName;
                 SkASSERT(target->find(name) == target->end());
                 (*target)[name] = std::make_pair(std::move(element), false);
+                iter = src->erase(iter);
                 break;
             }
             default:
@@ -278,11 +282,13 @@
     this->processIncludeFile(Program::kPipelineStage_Kind, SKSL_PIPELINE_INCLUDE,
                              strlen(SKSL_PIPELINE_INCLUDE), fGpuSymbolTable, &fPipelineInclude,
                              &fPipelineSymbolTable);
-    std::vector<std::unique_ptr<ProgramElement>> interpIntrinsics;
     this->processIncludeFile(Program::kGeneric_Kind, SKSL_INTERP_INCLUDE,
                              strlen(SKSL_INTERP_INCLUDE), symbols, &fInterpreterInclude,
                              &fInterpreterSymbolTable);
-    grab_intrinsics(&interpIntrinsics, &fInterpreterIntrinsics);
+    grab_intrinsics(&fInterpreterInclude, &fInterpreterIntrinsics);
+    // need to hang on to the source so that FunctionDefinition.fSource pointers in this file
+    // remain valid
+    fInterpreterIncludeSource = std::move(fIRGenerator->fFile);
 }
 
 Compiler::~Compiler() {
@@ -1624,7 +1630,7 @@
     }
     fSource = program.fSource.get();
     std::unique_ptr<ByteCode> result(new ByteCode());
-    ByteCodeGenerator cg(fContext.get(), &program, this, result.get());
+    ByteCodeGenerator cg(&program, this, result.get());
     bool success = cg.generateCode();
     fSource = nullptr;
     if (success) {
diff --git a/src/sksl/SkSLCompiler.h b/src/sksl/SkSLCompiler.h
index fb4b4fb..25762e8 100644
--- a/src/sksl/SkSLCompiler.h
+++ b/src/sksl/SkSLCompiler.h
@@ -215,8 +215,8 @@
 
     Position position(int offset);
 
-    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>> fGPUIntrinsics;
-    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>> fInterpreterIntrinsics;
+    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>> fGPUIntrinsics;
+    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>> fInterpreterIntrinsics;
     std::unique_ptr<ASTFile> fGpuIncludeSource;
     std::shared_ptr<SymbolTable> fGpuSymbolTable;
     std::vector<std::unique_ptr<ProgramElement>> fVertexInclude;
@@ -227,6 +227,7 @@
     std::shared_ptr<SymbolTable> fGeometrySymbolTable;
     std::vector<std::unique_ptr<ProgramElement>> fPipelineInclude;
     std::shared_ptr<SymbolTable> fPipelineSymbolTable;
+    std::unique_ptr<ASTFile> fInterpreterIncludeSource;
     std::vector<std::unique_ptr<ProgramElement>> fInterpreterInclude;
     std::shared_ptr<SymbolTable> fInterpreterSymbolTable;
 
diff --git a/src/sksl/SkSLIRGenerator.cpp b/src/sksl/SkSLIRGenerator.cpp
index 44b4200..8d9d42d 100644
--- a/src/sksl/SkSLIRGenerator.cpp
+++ b/src/sksl/SkSLIRGenerator.cpp
@@ -1776,7 +1776,7 @@
                                               const FunctionDeclaration& function,
                                               std::vector<std::unique_ptr<Expression>> arguments) {
     if (function.fBuiltin) {
-        auto found = fIntrinsics->find(function.fName);
+        auto found = fIntrinsics->find(function.declaration());
         if (found != fIntrinsics->end() && !found->second.second) {
             found->second.second = true;
             const FunctionDeclaration* old = fCurrentFunction;
@@ -2186,7 +2186,7 @@
         }
     }
     fErrors.error(base->fOffset, "type '" + base->fType.displayName() + "' does not have a "
-                                 "field named '" + field + "");
+                                 "field named '" + field + "'");
     return nullptr;
 }
 
diff --git a/src/sksl/SkSLIRGenerator.h b/src/sksl/SkSLIRGenerator.h
index a088444..14ea097 100644
--- a/src/sksl/SkSLIRGenerator.h
+++ b/src/sksl/SkSLIRGenerator.h
@@ -159,7 +159,7 @@
     std::shared_ptr<SymbolTable> fSymbolTable;
     // Symbols which have definitions in the include files. The bool tells us whether this
     // intrinsic has been included already.
-    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>>* fIntrinsics = nullptr;
+    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>>* fIntrinsics = nullptr;
     // holds extra temp variable declarations needed for the current function
     std::vector<std::unique_ptr<Statement>> fExtraVars;
     int fLoopLevel;
diff --git a/src/sksl/SkSLInterpreter.h b/src/sksl/SkSLInterpreter.h
new file mode 100644
index 0000000..398a9b7
--- /dev/null
+++ b/src/sksl/SkSLInterpreter.h
@@ -0,0 +1,1353 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "include/private/GrTypesPriv.h" // GrAlignTo
+#include "src/core/SkUtils.h" // sk_unaligned_load
+#include "src/sksl/SkSLByteCode.h"
+#include "src/sksl/SkSLExternalValue.h"
+
+#include <stack>
+
+#ifndef SKSL_INTERPRETER
+#define SKSL_INTERPRETER
+
+namespace SkSL {
+
+// GCC and Clang support the "labels as values" extension which we need to implement the interpreter
+// using threaded code. Otherwise, we fall back to using a switch statement in a for loop.
+#if defined(__GNUC__) || defined(__clang__)
+    #define SKSL_THREADED_CODE
+#endif
+
+#ifdef SKSL_THREADED_CODE
+    using instruction = void*;
+    #define LABEL(name) name:
+    #ifdef TRACE
+        #define NEXT()                                   \
+            {                                            \
+                const uint8_t* trace_ip = ip;            \
+                printf("%d: ", (int) (trace_ip - code)); \
+                disassemble(&trace_ip);                  \
+            }                                            \
+            goto *labels[(int) read<ByteCode::Instruction>(&ip)]
+    #else
+        #define NEXT() goto *labels[(int) read<ByteCode::Instruction>(&ip)]
+    #endif
+#else
+    using instruction = uint16_t;
+    #define LABEL(name) case ByteCode::Instruction::name:
+    #define NEXT() continue
+#endif
+
+// If you trip this assert, it means that the order of the opcodes listed in ByteCode::Instruction
+// does not match the order of the opcodes listed in the 'labels' array in innerRun().
+#define CHECK_LABEL(name) \
+    SkASSERT(labels[(int) ByteCode::Instruction::name] == &&name)
+
+template<typename T> static T read(const uint8_t** ip) {
+    T value = sk_unaligned_load<T>(*ip);  // decode a T at the cursor; no alignment is assumed
+    *ip += sizeof(T);                     // then step the cursor past the bytes just consumed
+    return value;
+}
+
+#define BINARY_OP(inst, src, result, op)                                  \
+    LABEL(inst) {                                                         \
+        ByteCode::Register target = read<ByteCode::Register>(&ip);        \
+        ByteCode::Register src1 = read<ByteCode::Register>(&ip);          \
+        ByteCode::Register src2 = read<ByteCode::Register>(&ip);          \
+        fRegisters[target.fIndex].result = fRegisters[src1.fIndex].src op \
+                                           fRegisters[src2.fIndex].src;   \
+        NEXT();                                                           \
+    }
+
+#define MASKED_BINARY_OP(inst, src, result, op)                                         \
+    LABEL(inst) {                                                                       \
+        ByteCode::Register target = read<ByteCode::Register>(&ip);                      \
+        ByteCode::Register src1 = read<ByteCode::Register>(&ip);                        \
+        ByteCode::Register src2 = read<ByteCode::Register>(&ip);                        \
+        auto m = mask();                                                                \
+        for (int i = 0; i < width; ++i) {                                               \
+            if (m[i]) {                                                                 \
+                fRegisters[target.fIndex].result[i] = fRegisters[src1.fIndex].src[i] op \
+                                                   fRegisters[src2.fIndex].src[i];      \
+            }                                                                           \
+        }                                                                               \
+        NEXT();                                                                         \
+    }
+
+#define VECTOR_UNARY_FN(inst, fn)                                                       \
+    LABEL(inst) {                                                                       \
+        ByteCode::Register target = read<ByteCode::Register>(&ip);                      \
+        ByteCode::Register src = read<ByteCode::Register>(&ip);                         \
+        for (int i = 0; i < width; ++ i) {                                              \
+            fRegisters[target.fIndex].fFloat[i] = fn(fRegisters[src.fIndex].fFloat[i]); \
+        }                                                                               \
+        NEXT();                                                                         \
+    }
+
+#define DISASSEMBLE_0(inst, name) \
+    case ByteCode::Instruction::inst: printf(name "\n"); break;
+
+#define DISASSEMBLE_1(inst, name)                                   \
+    case ByteCode::Instruction::inst:                               \
+        printf(name " $%d\n", read<ByteCode::Register>(ip).fIndex); \
+        break;
+
+#define DISASSEMBLE_UNARY(inst, name)                             \
+    case ByteCode::Instruction::inst: {                           \
+        ByteCode::Register target = read<ByteCode::Register>(ip); \
+        ByteCode::Register src = read<ByteCode::Register>(ip);    \
+        printf(name " $%d -> $%d\n", src.fIndex, target.fIndex);  \
+        break;                                                    \
+    }
+
+#define DISASSEMBLE_BINARY(inst, name)                                              \
+    case ByteCode::Instruction::inst: {                                             \
+        ByteCode::Register target = read<ByteCode::Register>(ip);                   \
+        ByteCode::Register src1 = read<ByteCode::Register>(ip);                     \
+        ByteCode::Register src2 = read<ByteCode::Register>(ip);                     \
+        printf(name " $%d, $%d -> $%d\n", src1.fIndex, src2.fIndex, target.fIndex); \
+        break;                                                                      \
+    }
+
+/**
+ * Operates on vectors of the specified width, so creating an Interpreter<16> means that all inputs,
+ * outputs, and internal calculations will be 16-wide vectors.
+ */
+template<int width>
+class Interpreter {
+public:
+    using Vector = ByteCode::Vector<width>;
+    using VectorI = skvx::Vec<width, int32_t>;
+    using VectorF = skvx::Vec<width, float>;
+
+    Interpreter(std::unique_ptr<ByteCode> code)
+        : fCode(std::move(code)) {
+        // C++ doesn't guarantee proper alignment of naively-allocated vectors, so registers and
+        // memory can't simply be fields of this object without burdening every allocation site.
+        // Instead they share one separately allocated block, over-allocated by a spare Vector so
+        // that we can align the start ourselves; clients never need to be aware of alignment.
+        // Ideally, we could use std::aligned_alloc here, but as of this writing it is not available
+        // on some compilers despite claiming to support C++17.
+        fBackingStore = calloc(MEMORY_SIZE + REGISTER_COUNT + 1, sizeof(Vector));
+        SkASSERT(fBackingStore);
+        fMemory = (Vector*) GrAlignTo((size_t) fBackingStore, alignof(Vector));
+        fRegisters = fMemory + MEMORY_SIZE;
+    }
+
+    ~Interpreter() {
+        free(fBackingStore);  // releases the single block that backs both fMemory and fRegisters
+    }
+
+    void setUniforms(const float uniforms[]) {  // reads getUniformSlotCount() floats
+        for (int i = 0; i < fCode->getUniformSlotCount(); ++i) {  // slots follow the globals
+            fMemory[fCode->getGlobalSlotCount() + i].fFloat = VectorF(uniforms[i]);
+        }
+    }
+
+    /** Runs 'f' once; 'args' holds one Vector per parameter slot and receives out parameters back.
+     * Returns true on success and stores a pointer to the first slot of the result into outResult.
+     * This pointer is only guaranteed to be valid until the next run() call.
+     */
+     bool run(const ByteCodeFunction* f, Vector args[], Vector** outResult) {
+        SkASSERT(f);
+        VectorI condStack[MASK_STACK_SIZE];
+        memset(condStack, 255, sizeof(VectorI));  // only entry [0] is initialized (all bits set)
+        VectorI maskStack[MASK_STACK_SIZE];
+        memset(maskStack, 255, sizeof(VectorI));
+        VectorI loopStack[LOOP_STACK_SIZE];
+        memset(loopStack, 255, sizeof(VectorI));
+        VectorI continueStack[LOOP_STACK_SIZE];
+        memset(continueStack, 0, sizeof(VectorI));
+        Vector* stack = fMemory + MEMORY_SIZE;  // the frame is carved from the top of fMemory
+        int stackCount = f->fStackSlotCount + f->fParameterSlotCount;
+        stack -= stackCount;
+        memcpy(stack, args, f->fParameterSlotCount * sizeof(Vector));  // params fill the first slots
+        Context context(fMemory, stack, condStack, maskStack, loopStack, continueStack);
+        if (this->innerRun(f, context, 0, outResult)) {
+            int slot = 0;
+            for (const auto& p : f->fParameters) {
+                if (p.fIsOutParameter) {  // copy the final value back into the caller's args
+                    memcpy(&args[slot], &stack[slot], p.fSlotCount * sizeof(Vector));
+                }
+                slot += p.fSlotCount;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Invokes the specified function with the given arguments, 'count' times, processing up to
+     * 'width' invocations per pass. 'args' is accepted in structure-of-arrays form:
+     *   args[0] points to an array of 'count' values, the first argument slot for each invocation
+     *   ...
+     *   args[f->fParameterSlotCount - 1] points to 'count' values for the last argument slot
+     *
+     * All values in 'args' are 32-bit values (typically floats, but possibly int32_t or uint32_t,
+     * depending on the types used in the SkSL). Any 'out' or 'inout' parameters will result in the
+     * 'args' arrays being modified. Returns false as soon as a pass fails; no result is collected.
+     */
+    bool runStriped(const ByteCodeFunction* f, int count, float* args[]) {
+        SkASSERT(f);
+        Vector* stack = fMemory + MEMORY_SIZE;
+        int stackCount = f->fStackSlotCount + f->fParameterSlotCount;
+        stack -= stackCount;
+        VectorI condStack[MASK_STACK_SIZE];  // NOTE(review): not initialized here, unlike run()
+        VectorI maskStack[MASK_STACK_SIZE];
+        VectorI loopStack[LOOP_STACK_SIZE];
+        VectorI continueStack[LOOP_STACK_SIZE];
+        Context context(fMemory, stack, condStack, maskStack, loopStack, continueStack);
+        for (int i = 0; i < count; i += width) {  // one pass per group of 'width' invocations
+            int lanes = std::min(width, count - i);  // the final pass may be partial
+            size_t size = lanes * sizeof(float);
+            memset(maskStack, 255, sizeof(VectorI));
+            memset(loopStack, 255, sizeof(VectorI));
+            for (int j = lanes; j < width; ++j) {  // zero the mask entries of lanes past 'count'
+                maskStack[0][j] = 0;
+                loopStack[0][j] = 0;
+            }
+            memset(continueStack, 0, sizeof(VectorI));
+            for (int j = 0; j < f->fParameterSlotCount; ++j) {
+                memcpy(stack + j, &args[j][i], size);  // only the first 'lanes' values are copied
+            }
+            if (!this->innerRun(f, context, i, nullptr)) {
+                return false;
+            }
+            int slot = 0;
+            for (const auto& p : f->fParameters) {
+                if (p.fIsOutParameter) {
+                    for (int j = 0; j < p.fSlotCount; ++j) {
+                        memcpy(&args[slot + j][i], stack + slot + j, size);
+                    }
+                }
+                slot += p.fSlotCount;
+            }
+        }
+        return true;
+    }
+
+    const ByteCode& getCode() {
+        return *fCode;  // borrowed reference; the ByteCode stays owned by this interpreter
+    }
+
+private:
+    static constexpr size_t REGISTER_COUNT = 1024;
+
+    static constexpr size_t MEMORY_SIZE = 1024;
+
+    static constexpr size_t MASK_STACK_SIZE = 64;
+
+    static constexpr size_t LOOP_STACK_SIZE = 16;
+
+    struct StackFrame {  // element type of Context::fCallStack
+        StackFrame(const ByteCodeFunction* function, const uint8_t* ip, const int stackSlotCount,
+                   Vector* parameters, Vector* returnValue)
+            : fFunction(function)
+            , fIP(ip)
+            , fStackSlotCount(stackSlotCount)
+            , fParameters(parameters)
+            , fReturnValue(returnValue) {}
+
+        const ByteCodeFunction* fFunction;
+        const uint8_t* fIP;  // NOTE(review): presumably the bytecode position to resume at
+        const int fStackSlotCount;
+        Vector* fParameters;
+        Vector* fReturnValue;
+    };
+
+    struct Context {  // bundles the memory/stack pointers and mask stacks handed to innerRun()
+        Context(Vector* memory, Vector* stack, VectorI* condStack, VectorI* maskStack,
+                VectorI* loopStack,VectorI* continueStack)
+            : fMemory(memory)
+            , fStack(stack)
+            , fCondStack(condStack)
+            , fMaskStack(maskStack)
+            , fLoopStack(loopStack)
+            , fContinueStack(continueStack) {}
+
+        Vector* fMemory;
+        Vector* fStack;
+        VectorI* fCondStack;  // the four stack pointers are not owned; callers pass local arrays
+        VectorI* fMaskStack;
+        VectorI* fLoopStack;
+        VectorI* fContinueStack;
+        std::stack<StackFrame> fCallStack;  // default-constructed, so it starts out empty
+    };
+
+    /**
+     * Prints a human-readable form of the instruction at *ip to stdout, consuming the opcode and
+     * its operands from *ip via read<>(). innerRun calls this on a copy of its instruction
+     * pointer when TRACE is defined.
+     *
+     * Opcodes without a case here print "unsupported: <opcode>" and trip an SkASSERT.
+     *
+     * Operand notation used in the output:
+     */
+    // $x = register
+    // @x = memory cell
+    // &x = parameter
+    void disassemble(const uint8_t** ip) {
+        ByteCode::Instruction inst = read<ByteCode::Instruction>(ip);
+        switch (inst) {
+            DISASSEMBLE_BINARY(kAddF, "addF")
+            DISASSEMBLE_BINARY(kAddI, "addI")
+            DISASSEMBLE_BINARY(kAnd, "and")
+            DISASSEMBLE_BINARY(kCompareEQF, "compare eqF")
+            DISASSEMBLE_BINARY(kCompareEQI, "compare eqI")
+            DISASSEMBLE_BINARY(kCompareNEQF, "compare neqF")
+            DISASSEMBLE_BINARY(kCompareNEQI, "compare neqI")
+            DISASSEMBLE_BINARY(kCompareGTF, "compare gtF")
+            DISASSEMBLE_BINARY(kCompareGTS, "compare gtS")
+            DISASSEMBLE_BINARY(kCompareGTU, "compare gtU")
+            DISASSEMBLE_BINARY(kCompareGTEQF, "compare gteqF")
+            DISASSEMBLE_BINARY(kCompareGTEQS, "compare gteqS")
+            DISASSEMBLE_BINARY(kCompareGTEQU, "compare gteqU")
+            DISASSEMBLE_BINARY(kCompareLTF, "compare ltF")
+            DISASSEMBLE_BINARY(kCompareLTS, "compare ltS")
+            DISASSEMBLE_BINARY(kCompareLTU, "compare ltU")
+            DISASSEMBLE_BINARY(kCompareLTEQF, "compare lteqF")
+            DISASSEMBLE_BINARY(kCompareLTEQS, "compare lteqS")
+            DISASSEMBLE_BINARY(kCompareLTEQU, "compare lteqU")
+            DISASSEMBLE_BINARY(kSubtractF, "subF")
+            DISASSEMBLE_BINARY(kSubtractI, "subI")
+            DISASSEMBLE_BINARY(kDivideF, "divF")
+            DISASSEMBLE_BINARY(kDivideS, "divS")
+            DISASSEMBLE_BINARY(kDivideU, "divU")
+            DISASSEMBLE_BINARY(kRemainderS, "remS")
+            DISASSEMBLE_BINARY(kRemainderU, "remU")
+            DISASSEMBLE_BINARY(kMultiplyF, "mulF")
+            DISASSEMBLE_BINARY(kMultiplyI, "mulI")
+            DISASSEMBLE_BINARY(kOr, "or")
+            DISASSEMBLE_BINARY(kXor, "xor")
+            DISASSEMBLE_0(kNop, "nop")
+            DISASSEMBLE_BINARY(kRemainderF, "remF")
+            case ByteCode::Instruction::kBoundsCheck: {
+                ByteCode::Register r = read<ByteCode::Register>(ip);
+                int length = read<int>(ip);
+                printf("boundsCheck 0 <= $%d < %d\n", r.fIndex, length);
+                break;
+            }
+            case ByteCode::Instruction::kBranch:
+                printf("branch %d\n", read<ByteCode::Pointer>(ip).fAddress);
+                break;
+            case ByteCode::Instruction::kBranchIfAllFalse:
+                printf("branchIfAllFalse %d\n", read<ByteCode::Pointer>(ip).fAddress);
+                break;
+            DISASSEMBLE_0(kBreak, "break")
+            case ByteCode::Instruction::kCall: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                uint8_t idx = read<uint8_t>(ip);
+                ByteCode::Register args = read<ByteCode::Register>(ip);
+                ByteCodeFunction* f = fCode->fFunctions[idx].get();
+                printf("call %s($%d...) -> $%d", f->fName.c_str(), args.fIndex, target.fIndex);
+                printf("\n");
+                break;
+            }
+            case ByteCode::Instruction::kCallExternal: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                uint8_t idx = read<uint8_t>(ip);
+                uint8_t targetCount = read<uint8_t>(ip);
+                ByteCode::Register args = read<ByteCode::Register>(ip);
+                uint8_t argCount = read<uint8_t>(ip);
+                ExternalValue* ev = fCode->fExternalValues[idx];
+                printf("callExternal %s($%d(%d)...) -> $%d(%d)", String(ev->fName).c_str(),
+                        args.fIndex, argCount, target.fIndex, targetCount);
+                printf("\n");
+                break;
+            }
+            DISASSEMBLE_0(kContinue, "continue")
+            DISASSEMBLE_UNARY(kCopy, "copy")
+            DISASSEMBLE_UNARY(kCos, "cos")
+            DISASSEMBLE_UNARY(kFloatToSigned, "FtoS")
+            DISASSEMBLE_UNARY(kFloatToUnsigned, "FtoU")
+            case ByteCode::Instruction::kImmediate: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                ByteCode::Immediate src = read<ByteCode::Immediate>(ip);
+                // The immediate is untyped here, so show both its integer and float readings.
+                printf("immediate (%d | %f) -> $%d\n", src.fInt, src.fFloat, target.fIndex);
+                break;
+            }
+            DISASSEMBLE_UNARY(kInverse2x2, "inverse2x2")
+            DISASSEMBLE_UNARY(kInverse3x3, "inverse3x3")
+            DISASSEMBLE_UNARY(kInverse4x4, "inverse4x4")
+            DISASSEMBLE_UNARY(kLoad, "load")
+            case ByteCode::Instruction::kLoadDirect: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
+                printf("loadDirect @%d -> $%d\n", src.fAddress, target.fIndex);
+                break;
+            }
+            DISASSEMBLE_UNARY(kLoadParameter, "loadParameter")
+            case ByteCode::Instruction::kLoadParameterDirect: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
+                printf("loadParameterDirect &%d -> $%d\n", src.fAddress, target.fIndex);
+                break;
+            }
+            DISASSEMBLE_UNARY(kLoadStack, "loadStack")
+            case ByteCode::Instruction::kLoadStackDirect: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
+                printf("loadStackDirect @%d -> $%d\n", src.fAddress, target.fIndex);
+                break;
+            }
+            DISASSEMBLE_0(kLoopBegin, "loopBegin")
+            DISASSEMBLE_0(kLoopEnd, "loopEnd")
+            DISASSEMBLE_1(kLoopMask, "loopMask")
+            DISASSEMBLE_0(kLoopNext, "loopNext")
+            DISASSEMBLE_0(kMaskNegate, "maskNegate")
+            DISASSEMBLE_0(kMaskPop, "maskPop")
+            DISASSEMBLE_1(kMaskPush, "maskPush")
+            case ByteCode::Instruction::kMatrixMultiply: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                ByteCode::Register left = read<ByteCode::Register>(ip);
+                ByteCode::Register right = read<ByteCode::Register>(ip);
+                uint8_t leftColsAndRightRows = read<uint8_t>(ip);
+                uint8_t leftRows = read<uint8_t>(ip);
+                uint8_t rightColumns = read<uint8_t>(ip);
+                printf("matrixMultiply $%d, $%d, %d, %d, %d -> $%d\n", left.fIndex, right.fIndex,
+                       leftColsAndRightRows, leftRows, rightColumns, target.fIndex);
+                break;
+            }
+            case ByteCode::Instruction::kMatrixToMatrix: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                ByteCode::Register src = read<ByteCode::Register>(ip);
+                uint8_t srcColumns = read<uint8_t>(ip);
+                uint8_t srcRows = read<uint8_t>(ip);
+                uint8_t dstColumns = read<uint8_t>(ip);
+                uint8_t dstRows = read<uint8_t>(ip);
+                printf("matrixToMatrix $%d, %dx%d to %dx%d -> $%d\n", src.fIndex, srcColumns,
+                       srcRows, dstColumns, dstRows, target.fIndex);
+                break;
+            }
+            DISASSEMBLE_UNARY(kNegateF, "negateF")
+            DISASSEMBLE_UNARY(kNegateS, "negateS")
+            DISASSEMBLE_UNARY(kNot, "not")
+            case ByteCode::Instruction::kReadExternal: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                uint8_t count = read<uint8_t>(ip);
+                uint8_t index = read<uint8_t>(ip);
+                printf("readExternal %d, %d -> $%d\n", count, index, target.fIndex);
+                break;
+            }
+            DISASSEMBLE_1(kPrint, "print")
+            DISASSEMBLE_0(kReturn, "return")
+            DISASSEMBLE_1(kReturnValue, "returnValue")
+            case ByteCode::Instruction::kScalarToMatrix: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                ByteCode::Register src = read<ByteCode::Register>(ip);
+                uint8_t columns = read<uint8_t>(ip);
+                uint8_t rows = read<uint8_t>(ip);
+                printf("scalarToMatrix $%d, %dx%d -> $%d\n", src.fIndex, columns, rows,
+                       target.fIndex);
+                break;
+            }
+            case ByteCode::Instruction::kSelect: {
+                ByteCode::Register target = read<ByteCode::Register>(ip);
+                ByteCode::Register test = read<ByteCode::Register>(ip);
+                ByteCode::Register src1 = read<ByteCode::Register>(ip);
+                ByteCode::Register src2 = read<ByteCode::Register>(ip);
+                // The target is a register, so it carries the '$' sigil like every other one.
+                printf("select $%d, $%d, $%d -> $%d\n", test.fIndex, src1.fIndex, src2.fIndex,
+                       target.fIndex);
+                break;
+            }
+            DISASSEMBLE_BINARY(kShiftLeft, "shiftLeft")
+            DISASSEMBLE_BINARY(kShiftRightS, "shiftRightS")
+            DISASSEMBLE_BINARY(kShiftRightU, "shiftRightU")
+            DISASSEMBLE_UNARY(kSignedToFloat, "signedToFloat")
+            DISASSEMBLE_UNARY(kSin, "sin")
+            DISASSEMBLE_UNARY(kSqrt, "sqrt")
+            DISASSEMBLE_UNARY(kStore, "store")
+            case ByteCode::Instruction::kStoreDirect: {
+                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
+                ByteCode::Register src = read<ByteCode::Register>(ip);
+                // Named after the opcode so it cannot be confused with the indirect kStore.
+                printf("storeDirect $%d -> @%d\n", src.fIndex, target.fAddress);
+                break;
+            }
+            DISASSEMBLE_UNARY(kStoreParameter, "storeParameter")
+            case ByteCode::Instruction::kStoreParameterDirect: {
+                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
+                ByteCode::Register src = read<ByteCode::Register>(ip);
+                // Named after the opcode so it cannot be confused with kStoreParameter.
+                printf("storeParameterDirect $%d -> &%d\n", src.fIndex, target.fAddress);
+                break;
+            }
+            DISASSEMBLE_UNARY(kStoreStack, "storeStack")
+            case ByteCode::Instruction::kStoreStackDirect: {
+                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
+                ByteCode::Register src = read<ByteCode::Register>(ip);
+                printf("storeStackDirect $%d -> @%d\n", src.fIndex, target.fAddress);
+                break;
+            }
+            DISASSEMBLE_UNARY(kTan, "tan")
+            DISASSEMBLE_UNARY(kUnsignedToFloat, "unsignedToFloat")
+            case ByteCode::Instruction::kWriteExternal: {
+                uint8_t index = read<uint8_t>(ip);
+                uint8_t count = read<uint8_t>(ip);
+                ByteCode::Register src = read<ByteCode::Register>(ip);
+                printf("writeExternal $%d, %d -> %d\n", src.fIndex, count, index);
+                break;
+            }
+            default:
+                printf("unsupported: %d\n", (int) inst);
+                SkASSERT(false);
+        }
+    }
+
+    /**
+     * Per-lane floating point remainder of x by y, computed as x - trunc(x / y) * y on the
+     * fFloat members of both operands. Lanes where y is zero are not special-cased.
+     */
+    static Vector VecMod(Vector x, Vector y) {
+        const auto wholeQuotient = skvx::trunc(x.fFloat / y.fFloat);
+        return Vector(x.fFloat - wholeQuotient * y.fFloat);
+    }
+
+    // Asserts that the stack slot at context.fStack + address lies inside fMemory. Expects a
+    // variable named `context` to be in scope at the point of use.
+    //
+    // The upper bound is exclusive: the checked address is subsequently dereferenced, and
+    // fMemory + MEMORY_SIZE is one slot past the end (assuming fMemory holds MEMORY_SIZE Vectors --
+    // TODO confirm against its declaration). The argument is parenthesized so that expression
+    // arguments keep their intended precedence.
+    #define CHECK_STACK_BOUNDS(address)                              \
+        SkASSERT(context.fStack + (address) >= fMemory &&            \
+                 context.fStack + (address) < fMemory + MEMORY_SIZE)
+
+    /**
+     * Writes the inverse of the 2x2 matrix stored in in[0..3] to out[0..3], independently for
+     * each vector lane.
+     *
+     * All four inputs are copied into locals before the first store to out. A zero determinant
+     * is not special-cased; the reciprocal is simply whatever the division yields.
+     */
+    static void Inverse2x2(Vector* in, Vector* out) {
+        const VectorF m0 = in[0].fFloat;
+        const VectorF m1 = in[1].fFloat;
+        const VectorF m2 = in[2].fFloat;
+        const VectorF m3 = in[3].fFloat;
+
+        // Reciprocal of the determinant, shared by all four outputs.
+        const VectorF invDet = VectorF(1) / (m0*m3 - m1*m2);
+
+        // Swap the diagonal, negate the off-diagonal, scale by 1/det.
+        out[0].fFloat = m3 * invDet;
+        out[1].fFloat = -m1 * invDet;
+        out[2].fFloat = -m2 * invDet;
+        out[3].fFloat = m0 * invDet;
+    }
+
+    /**
+     * Writes the inverse of the 3x3 matrix stored in in[0..8] to out[0..8], independently for
+     * each vector lane.
+     *
+     * Locals are named a<i><j> with a<i><j> = in[3 * (j - 1) + (i - 1)]. All nine inputs are
+     * copied into locals before the first store to out. A zero determinant is not special-cased.
+     */
+    static void Inverse3x3(Vector* in, Vector* out) {
+        // in[0..2]
+        const VectorF a11 = in[0].fFloat;
+        const VectorF a21 = in[1].fFloat;
+        const VectorF a31 = in[2].fFloat;
+        // in[3..5]
+        const VectorF a12 = in[3].fFloat;
+        const VectorF a22 = in[4].fFloat;
+        const VectorF a32 = in[5].fFloat;
+        // in[6..8]
+        const VectorF a13 = in[6].fFloat;
+        const VectorF a23 = in[7].fFloat;
+        const VectorF a33 = in[8].fFloat;
+
+        // Rule-of-Sarrus expansion of the determinant.
+        const VectorF det = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
+                            a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
+        const VectorF idet = VectorF(1) / det;
+
+        // Each output is one 2x2 sub-determinant of the input scaled by 1/det.
+        out[0].fFloat = (a22 * a33 - a23 * a32) * idet;
+        out[1].fFloat = (a23 * a31 - a21 * a33) * idet;
+        out[2].fFloat = (a21 * a32 - a22 * a31) * idet;
+        out[3].fFloat = (a13 * a32 - a12 * a33) * idet;
+        out[4].fFloat = (a11 * a33 - a13 * a31) * idet;
+        out[5].fFloat = (a12 * a31 - a11 * a32) * idet;
+        out[6].fFloat = (a12 * a23 - a13 * a22) * idet;
+        out[7].fFloat = (a13 * a21 - a11 * a23) * idet;
+        out[8].fFloat = (a11 * a22 - a12 * a21) * idet;
+    }
+
+
+    /**
+     * Writes the inverse of the 4x4 matrix stored in in[0..15] to out[0..15], independently for
+     * each vector lane.
+     *
+     * All sixteen inputs are copied into locals before the first store to out. A zero determinant
+     * is not special-cased.
+     */
+    static void Inverse4x4(Vector* in, Vector* out) {
+        // Shorthand for the float view of an input / output slot; undefined again at the end.
+        #define inf(index)  in[index].fFloat
+        #define outf(index) out[index].fFloat
+        // Locals are named a<i><j> with a<i><j> = in[4 * i + j].
+        VectorF a00 = inf(0), a10 = inf(4), a20 = inf( 8), a30 = inf(12),
+                a01 = inf(1), a11 = inf(5), a21 = inf( 9), a31 = inf(13),
+                a02 = inf(2), a12 = inf(6), a22 = inf(10), a32 = inf(14),
+                a03 = inf(3), a13 = inf(7), a23 = inf(11), a33 = inf(15);
+
+        // 2x2 sub-determinants: b00..b05 pair up the a0* and a1* entries, b06..b11 pair up the
+        // a2* and a3* entries.
+        VectorF b00 = a00 * a11 - a01 * a10,
+                b01 = a00 * a12 - a02 * a10,
+                b02 = a00 * a13 - a03 * a10,
+                b03 = a01 * a12 - a02 * a11,
+                b04 = a01 * a13 - a03 * a11,
+                b05 = a02 * a13 - a03 * a12,
+                b06 = a20 * a31 - a21 * a30,
+                b07 = a20 * a32 - a22 * a30,
+                b08 = a20 * a33 - a23 * a30,
+                b09 = a21 * a32 - a22 * a31,
+                b10 = a21 * a33 - a23 * a31,
+                b11 = a22 * a33 - a23 * a32;
+
+        // Reciprocal of the full determinant, assembled from the sub-determinants.
+        VectorF idet = VectorF(1) /
+                            (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
+
+        // Fold 1/det into the sub-determinants once so the sixteen outputs below need no further
+        // scaling.
+        b00 *= idet;
+        b01 *= idet;
+        b02 *= idet;
+        b03 *= idet;
+        b04 *= idet;
+        b05 *= idet;
+        b06 *= idet;
+        b07 *= idet;
+        b08 *= idet;
+        b09 *= idet;
+        b10 *= idet;
+        b11 *= idet;
+
+        // Each output combines three input entries with three pre-scaled sub-determinants.
+        outf( 0) = a11 * b11 - a12 * b10 + a13 * b09;
+        outf( 1) = a02 * b10 - a01 * b11 - a03 * b09;
+        outf( 2) = a31 * b05 - a32 * b04 + a33 * b03;
+        outf( 3) = a22 * b04 - a21 * b05 - a23 * b03;
+        outf( 4) = a12 * b08 - a10 * b11 - a13 * b07;
+        outf( 5) = a00 * b11 - a02 * b08 + a03 * b07;
+        outf( 6) = a32 * b02 - a30 * b05 - a33 * b01;
+        outf( 7) = a20 * b05 - a22 * b02 + a23 * b01;
+        outf( 8) = a10 * b10 - a11 * b08 + a13 * b06;
+        outf( 9) = a01 * b08 - a00 * b10 - a03 * b06;
+        outf(10) = a30 * b04 - a31 * b02 + a33 * b00;
+        outf(11) = a21 * b02 - a20 * b04 - a23 * b00;
+        outf(12) = a11 * b07 - a10 * b09 - a12 * b06;
+        outf(13) = a00 * b09 - a01 * b07 + a02 * b06;
+        outf(14) = a31 * b01 - a30 * b03 - a32 * b00;
+        outf(15) = a20 * b03 - a21 * b01 + a22 * b00;
+        #undef inf
+        #undef outf
+    }
+
+    bool innerRun(const ByteCodeFunction* f, Context context, int baseIndex, Vector** outResult) {
+#ifdef SKSL_THREADED_CODE
+        static const void* labels[] = {
+            // If you aren't familiar with it, the &&label syntax is the GCC / Clang "labels as
+            // values" extension. If you add anything to this array, be sure to add the
+            // corresponding CHECK_LABEL() assert below.
+            &&kNop,
+            &&kAbort,
+            &&kAddF,
+            &&kAddI,
+            &&kAnd,
+            &&kBoundsCheck,
+            &&kBranch,
+            &&kBranchIfAllFalse,
+            &&kBreak,
+            &&kCall,
+            &&kCallExternal,
+            &&kCompareEQF,
+            &&kCompareEQI,
+            &&kCompareNEQF,
+            &&kCompareNEQI,
+            &&kCompareGTF,
+            &&kCompareGTS,
+            &&kCompareGTU,
+            &&kCompareGTEQF,
+            &&kCompareGTEQS,
+            &&kCompareGTEQU,
+            &&kCompareLTF,
+            &&kCompareLTS,
+            &&kCompareLTU,
+            &&kCompareLTEQF,
+            &&kCompareLTEQS,
+            &&kCompareLTEQU,
+            &&kContinue,
+            &&kCopy,
+            &&kCos,
+            &&kDivideF,
+            &&kDivideS,
+            &&kDivideU,
+            &&kFloatToSigned,
+            &&kFloatToUnsigned,
+            &&kImmediate,
+            &&kInverse2x2,
+            &&kInverse3x3,
+            &&kInverse4x4,
+            &&kLoad,
+            &&kLoadDirect,
+            &&kLoadParameter,
+            &&kLoadParameterDirect,
+            &&kLoadStack,
+            &&kLoadStackDirect,
+            &&kLoopBegin,
+            &&kLoopEnd,
+            &&kLoopMask,
+            &&kLoopNext,
+            &&kMaskNegate,
+            &&kMaskPop,
+            &&kMaskPush,
+            &&kMatrixMultiply,
+            &&kMatrixToMatrix,
+            &&kMultiplyF,
+            &&kMultiplyI,
+            &&kNegateF,
+            &&kNegateS,
+            &&kNot,
+            &&kOr,
+            &&kPrint,
+            &&kReadExternal,
+            &&kRemainderF,
+            &&kRemainderS,
+            &&kRemainderU,
+            &&kReturn,
+            &&kReturnValue,
+            &&kScalarToMatrix,
+            &&kSelect,
+            &&kShiftLeft,
+            &&kShiftRightS,
+            &&kShiftRightU,
+            &&kSignedToFloat,
+            &&kSin,
+            &&kSqrt,
+            &&kStore,
+            &&kStoreDirect,
+            &&kStoreParameter,
+            &&kStoreParameterDirect,
+            &&kStoreStack,
+            &&kStoreStackDirect,
+            &&kSubtractF,
+            &&kSubtractI,
+            &&kTan,
+            &&kUnsignedToFloat,
+            &&kWriteExternal,
+            &&kXor
+        };
+        CHECK_LABEL(kNop);
+        CHECK_LABEL(kAbort);
+        CHECK_LABEL(kAddF);
+        CHECK_LABEL(kAddI);
+        CHECK_LABEL(kAnd);
+        CHECK_LABEL(kBoundsCheck);
+        CHECK_LABEL(kBranch);
+        CHECK_LABEL(kBranchIfAllFalse);
+        CHECK_LABEL(kBreak);
+        CHECK_LABEL(kCall);
+        CHECK_LABEL(kCallExternal);
+        CHECK_LABEL(kCompareEQF);
+        CHECK_LABEL(kCompareEQI);
+        CHECK_LABEL(kCompareNEQF);
+        CHECK_LABEL(kCompareNEQI);
+        CHECK_LABEL(kCompareGTF);
+        CHECK_LABEL(kCompareGTS);
+        CHECK_LABEL(kCompareGTU);
+        CHECK_LABEL(kCompareGTEQF);
+        CHECK_LABEL(kCompareGTEQS);
+        CHECK_LABEL(kCompareGTEQU);
+        CHECK_LABEL(kCompareLTF);
+        CHECK_LABEL(kCompareLTS);
+        CHECK_LABEL(kCompareLTU);
+        CHECK_LABEL(kCompareLTEQF);
+        CHECK_LABEL(kCompareLTEQS);
+        CHECK_LABEL(kCompareLTEQU);
+        CHECK_LABEL(kContinue);
+        CHECK_LABEL(kCopy);
+        CHECK_LABEL(kCos);
+        CHECK_LABEL(kDivideF);
+        CHECK_LABEL(kDivideS);
+        CHECK_LABEL(kDivideU);
+        CHECK_LABEL(kFloatToSigned);
+        CHECK_LABEL(kFloatToUnsigned);
+        CHECK_LABEL(kImmediate);
+        CHECK_LABEL(kInverse2x2);
+        CHECK_LABEL(kInverse3x3);
+        CHECK_LABEL(kInverse4x4);
+        CHECK_LABEL(kLoad);
+        CHECK_LABEL(kLoadDirect);
+        CHECK_LABEL(kLoadParameter);
+        CHECK_LABEL(kLoadParameterDirect);
+        CHECK_LABEL(kLoadStack);
+        CHECK_LABEL(kLoadStackDirect);
+        CHECK_LABEL(kLoopBegin);
+        CHECK_LABEL(kLoopEnd);
+        CHECK_LABEL(kLoopMask);
+        CHECK_LABEL(kLoopNext);
+        CHECK_LABEL(kMaskNegate);
+        CHECK_LABEL(kMaskPop);
+        CHECK_LABEL(kMaskPush);
+        CHECK_LABEL(kMatrixMultiply);
+        CHECK_LABEL(kMatrixToMatrix);
+        CHECK_LABEL(kMultiplyF);
+        CHECK_LABEL(kMultiplyI);
+        CHECK_LABEL(kNegateF);
+        CHECK_LABEL(kNegateS);
+        CHECK_LABEL(kNot);
+        CHECK_LABEL(kOr);
+        CHECK_LABEL(kPrint);
+        CHECK_LABEL(kReadExternal);
+        CHECK_LABEL(kRemainderF);
+        CHECK_LABEL(kRemainderS);
+        CHECK_LABEL(kRemainderU);
+        CHECK_LABEL(kReturn);
+        CHECK_LABEL(kReturnValue);
+        CHECK_LABEL(kScalarToMatrix);
+        CHECK_LABEL(kSelect);
+        CHECK_LABEL(kShiftLeft);
+        CHECK_LABEL(kShiftRightS);
+        CHECK_LABEL(kShiftRightU);
+        CHECK_LABEL(kSignedToFloat);
+        CHECK_LABEL(kSin);
+        CHECK_LABEL(kSqrt);
+        CHECK_LABEL(kStore);
+        CHECK_LABEL(kStoreDirect);
+        CHECK_LABEL(kStoreParameter);
+        CHECK_LABEL(kStoreParameterDirect);
+        CHECK_LABEL(kStoreStack);
+        CHECK_LABEL(kStoreStackDirect);
+        CHECK_LABEL(kSubtractF);
+        CHECK_LABEL(kSubtractI);
+        CHECK_LABEL(kTan);
+        CHECK_LABEL(kUnsignedToFloat);
+        CHECK_LABEL(kWriteExternal);
+        CHECK_LABEL(kXor);
+#endif
+        auto mask = [&]() { return *context.fMaskStack & *context.fLoopStack; };
+        auto parameterBase = [&]() {
+            return context.fCallStack.empty() ? context.fStack
+                                              : context.fCallStack.top().fParameters;
+        };
+        const uint8_t* code = f->fCode.data();
+        const uint8_t* ip = code;
+#ifdef SKSL_THREADED_CODE
+        #ifdef TRACE
+            const uint8_t* trace_ip = ip;
+            printf("0: ");
+            disassemble(&trace_ip);
+        #endif
+        goto *labels[(int) read<ByteCode::Instruction>(&ip)];
+#else
+        for (;;) {
+            #ifdef TRACE
+                const uint8_t* trace_ip = ip;
+                disassemble(&trace_ip);
+            #endif
+            ByteCode::Instruction inst = read<ByteCode::Instruction>(&ip);
+            switch (inst) {
+#endif
+                BINARY_OP(kAddF, fFloat, fFloat, +)
+                BINARY_OP(kAddI, fInt, fInt, +)
+                BINARY_OP(kAnd, fInt, fInt, &)
+                BINARY_OP(kCompareEQF, fFloat, fInt, ==)
+                BINARY_OP(kCompareEQI, fInt, fInt, ==)
+                BINARY_OP(kCompareNEQF, fFloat, fInt, !=)
+                BINARY_OP(kCompareNEQI, fInt, fInt, !=)
+                BINARY_OP(kCompareGTF, fFloat, fInt, >)
+                BINARY_OP(kCompareGTS, fInt, fInt, >)
+                BINARY_OP(kCompareGTU, fUInt, fUInt, >)
+                BINARY_OP(kCompareGTEQF, fFloat, fInt, >=)
+                BINARY_OP(kCompareGTEQS, fInt, fInt, >=)
+                BINARY_OP(kCompareGTEQU, fUInt, fUInt, >=)
+                BINARY_OP(kCompareLTF, fFloat, fInt, <)
+                BINARY_OP(kCompareLTS, fInt, fInt, <)
+                BINARY_OP(kCompareLTU, fUInt, fUInt, <)
+                BINARY_OP(kCompareLTEQF, fFloat, fInt, <=)
+                BINARY_OP(kCompareLTEQS, fInt, fInt, <=)
+                BINARY_OP(kCompareLTEQU, fUInt, fUInt, <=)
+                BINARY_OP(kSubtractF, fFloat, fFloat, -)
+                BINARY_OP(kSubtractI, fInt, fInt, -)
+                BINARY_OP(kDivideF, fFloat, fFloat, /)
+                MASKED_BINARY_OP(kDivideS, fInt, fInt, /)
+                MASKED_BINARY_OP(kDivideU, fUInt, fUInt, /)
+                MASKED_BINARY_OP(kRemainderS, fInt, fInt, %)
+                MASKED_BINARY_OP(kRemainderU, fUInt, fUInt, %)
+                BINARY_OP(kMultiplyF, fFloat, fFloat, *)
+                BINARY_OP(kMultiplyI, fInt, fInt, *)
+                BINARY_OP(kOr, fInt, fInt, |)
+                BINARY_OP(kXor, fInt, fInt, ^)
+                LABEL(kAbort)
+                    SkASSERT(false);
+                    return false;
+                LABEL(kBoundsCheck) {
+                    ByteCode::Register r = read<ByteCode::Register>(&ip);
+                    int length = read<int>(&ip);
+                    if (skvx::any(mask() & ((fRegisters[r.fIndex].fInt < 0) |
+                                            (fRegisters[r.fIndex].fInt >= length)))) {
+                        return false;
+                    }
+                    NEXT();
+                }
+                LABEL(kBranch) {
+                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
+                    ip = code + target.fAddress;
+                    NEXT();
+                }
+                LABEL(kBranchIfAllFalse) {
+                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
+                    if (!skvx::any(mask())) {
+                        ip = code + target.fAddress;
+                    }
+                    NEXT();
+                }
+                LABEL(kBreak)
+                    *context.fLoopStack &= ~mask();
+                    NEXT();
+                LABEL(kCall) {
+                    ByteCode::Register returnValue = read<ByteCode::Register>(&ip);
+                    uint8_t idx = read<uint8_t>(&ip);
+                    ByteCode::Register args = read<ByteCode::Register>(&ip);
+                    const ByteCodeFunction* target = fCode->fFunctions[idx].get();
+                    int stackSlotCount = target->fStackSlotCount + target->fParameterSlotCount;
+                    context.fCallStack.push(StackFrame(f, ip, stackSlotCount,
+                                                       &fRegisters[args.fIndex],
+                                                       &fRegisters[returnValue.fIndex]));
+                    f = target;
+                    code = f->fCode.data();
+                    ip = code;
+                    context.fStack -= stackSlotCount;
+                    memcpy(context.fStack, &fRegisters[args.fIndex],
+                           f->fParameterSlotCount * sizeof(Vector));
+                    NEXT();
+                }
+                LABEL(kCallExternal) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    uint8_t index = read<uint8_t>(&ip);
+                    uint8_t targetSize = read<uint8_t>(&ip);
+                    ByteCode::Register arguments = read<ByteCode::Register>(&ip);
+                    uint8_t argumentSize = read<uint8_t>(&ip);
+                    ExternalValue* v = fCode->fExternalValues[index];
+                    float tmpReturn[64];
+                    SkASSERT(targetSize < 64);
+                    float tmpArgs[64];
+                    SkASSERT(argumentSize < 64);
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            for (int j = 0; j < argumentSize; j++) {
+                                tmpArgs[j] = fRegisters[arguments.fIndex + j].fFloat[i];
+                            }
+                            v->call(baseIndex + i, tmpArgs, tmpReturn);
+                            for (int j = 0; j < targetSize; j++) {
+                                fRegisters[target.fIndex + j].fFloat[i] = tmpReturn[j];
+                            }
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kContinue) {
+                    VectorI m = mask();
+                    *context.fContinueStack |= m;
+                    *context.fLoopStack &= ~m;
+                    NEXT();
+                }
+                LABEL(kCopy) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt;
+                    NEXT();
+                }
+                VECTOR_UNARY_FN(kCos, cosf)
+                LABEL(kFloatToSigned) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex] = Vector(skvx::cast<int32_t>(
+                                                       fRegisters[src.fIndex].fFloat));
+                    NEXT();
+                }
+                LABEL(kFloatToUnsigned) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex] = Vector(skvx::cast<uint32_t>(
+                                                       fRegisters[src.fIndex].fFloat));
+                    NEXT();
+                }
+                LABEL(kImmediate) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Immediate src = read<ByteCode::Immediate>(&ip);
+                    fRegisters[target.fIndex].fInt = src.fInt;
+                    NEXT();
+                }
+                LABEL(kInverse2x2) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    Inverse2x2(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
+                    NEXT();
+                }
+                LABEL(kInverse3x3) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    Inverse3x3(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
+                    NEXT();
+                }
+                LABEL(kInverse4x4) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    Inverse4x4(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
+                    NEXT();
+                }
+                LABEL(kLoad) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            fRegisters[target.fIndex].fInt[i] =
+                                                    fMemory[fRegisters[src.fIndex].fInt[i]].fInt[i];
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kLoadDirect) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
+                    fRegisters[target.fIndex].fInt = fMemory[src.fAddress].fInt;
+                    NEXT();
+                }
+                LABEL(kLoadParameter) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    Vector* base = parameterBase();
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            fRegisters[target.fIndex].fInt[i] =
+                                                       base[fRegisters[src.fIndex].fInt[i]].fInt[i];
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kLoadParameterDirect) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
+                    Vector* base = parameterBase();
+                    fRegisters[target.fIndex].fInt = base[src.fAddress].fInt;
+                    NEXT();
+                }
+                LABEL(kLoadStack) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            fRegisters[target.fIndex].fInt[i] =
+                                             context.fStack[fRegisters[src.fIndex].fInt[i]].fInt[i];
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kLoadStackDirect) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
+                    CHECK_STACK_BOUNDS(src.fAddress);
+                    fRegisters[target.fIndex].fInt = context.fStack[src.fAddress].fInt;
+                    NEXT();
+                }
+                LABEL(kLoopBegin) {
+                    context.fLoopStack[1] = context.fLoopStack[0];
+                    ++context.fLoopStack;
+                    context.fContinueStack[1] = 0;
+                    ++context.fContinueStack;
+                    NEXT();
+                }
+                LABEL(kLoopEnd) {
+                    --context.fLoopStack;
+                    --context.fContinueStack;
+                    NEXT();
+                }
+                LABEL(kLoopMask) {
+                    ByteCode::Register value = read<ByteCode::Register>(&ip);
+                    *context.fLoopStack &= fRegisters[value.fIndex].fInt;
+                    NEXT();
+                }
+                LABEL(kLoopNext) {
+                    *context.fLoopStack |= *context.fContinueStack;
+                    *context.fContinueStack = 0;
+                    NEXT();
+                }
+                LABEL(kMaskNegate) {
+                    *context.fMaskStack = context.fMaskStack[-1] & ~context.fCondStack[0];
+                    NEXT();
+                }
+                LABEL(kMaskPop) {
+                    --context.fMaskStack;
+                    --context.fCondStack;
+                    NEXT();
+                }
+                LABEL(kMaskPush) {
+                    ByteCode::Register value = read<ByteCode::Register>(&ip);
+                    context.fCondStack[1] = fRegisters[value.fIndex].fInt;
+                    context.fMaskStack[1] = context.fMaskStack[0] & context.fCondStack[1];
+                    ++context.fCondStack;
+                    ++context.fMaskStack;
+                    NEXT();
+                }
+                LABEL(kMatrixMultiply) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register left = read<ByteCode::Register>(&ip);
+                    ByteCode::Register right = read<ByteCode::Register>(&ip);
+                    uint8_t lCols = read<uint8_t>(&ip);
+                    uint8_t lRows = read<uint8_t>(&ip);
+                    uint8_t rCols = read<uint8_t>(&ip);
+                    uint8_t rRows = lCols;
+                    memset(&fRegisters[target.fIndex], 0, sizeof(Vector) * rCols * lRows);
+                    for (int c = 0; c < rCols; ++c) {
+                        for (int r = 0; r < lRows; ++r) {
+                            for (int j = 0; j < lCols; ++j) {
+                                fRegisters[target.fIndex + c * lRows + r].fFloat +=
+                                        fRegisters[left.fIndex + j * lRows + r].fFloat *
+                                        fRegisters[right.fIndex + c * rRows + j].fFloat;
+                            }
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kMatrixToMatrix) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    uint8_t srcColumns = read<uint8_t>(&ip);
+                    uint8_t srcRows = read<uint8_t>(&ip);
+                    uint8_t dstColumns = read<uint8_t>(&ip);
+                    uint8_t dstRows = read<uint8_t>(&ip);
+                    int offset = 0;
+                    for (int i = 0; i < dstColumns; ++i) {
+                        for (int j = 0; j < dstRows; ++j) {
+                            if (i < srcColumns && j < srcRows) {
+                                fRegisters[target.fIndex + offset] =
+                                                         fRegisters[src.fIndex + (srcRows * i) + j];
+                            } else {
+                                if (i == j) {
+                                    fRegisters[target.fIndex + offset].fFloat = 1;
+                                } else {
+                                    fRegisters[target.fIndex + offset].fFloat = 0;
+                                }
+                            }
+                            ++offset;
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kNegateF) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex].fFloat = -fRegisters[src.fIndex].fFloat;
+                    NEXT();
+                }
+                LABEL(kNegateS) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex].fInt = -fRegisters[src.fIndex].fInt;
+                    NEXT();
+                }
+                LABEL(kNop)
+                    NEXT();
+                LABEL(kNot) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex].fInt = ~fRegisters[src.fIndex].fInt;
+                    NEXT();
+                }
+                LABEL(kPrint) {
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    if (skvx::any(mask())) {
+                        printf("[");
+                        const char* separator = "";
+                        for (int i = 0; i < width; ++i) {
+                            if (mask()[i]) {
+                                printf("%s%f", separator, fRegisters[src.fIndex].fFloat[i]);
+                            }
+                            else {
+                                printf("%s-", separator);
+                            }
+                            separator = ", ";
+                        }
+                        printf("]\n");
+                    }
+                    NEXT();
+                }
+                LABEL(kReadExternal) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    uint8_t count = read<uint8_t>(&ip);
+                    uint8_t index = read<uint8_t>(&ip);
+                    SkASSERT(count <= 4);
+                    SkASSERT(fCode->fExternalValues.size() > index);
+                    float tmp[4];
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            fCode->fExternalValues[index]->read(baseIndex + i, tmp);
+                            for (int j = 0; j < count; ++j) {
+                                fRegisters[target.fIndex + j].fFloat[i] = tmp[j];
+                            }
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kRemainderF) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src1 = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src2 = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex] = VecMod(fRegisters[src1.fIndex],
+                                                       fRegisters[src2.fIndex]);
+                    NEXT();
+                }
+                LABEL(kReturn) {
+                    if (context.fCallStack.empty()) {
+                        return true;
+                    }
+                    StackFrame frame = context.fCallStack.top();
+                    f = frame.fFunction;
+                    code = f->fCode.data();
+                    ip = frame.fIP;
+                    context.fStack += frame.fStackSlotCount;
+                    context.fCallStack.pop();
+                    NEXT();
+                }
+                LABEL(kReturnValue) {
+                    ByteCode::Register returnValue = read<ByteCode::Register>(&ip);
+                    if (context.fCallStack.empty()) {
+                        if (outResult) {
+                            *outResult = &fRegisters[returnValue.fIndex];
+                        }
+                        return true;
+                    }
+                    StackFrame frame = context.fCallStack.top();
+                    ip = frame.fIP;
+                    context.fStack += frame.fStackSlotCount;
+                    memcpy(frame.fReturnValue, &fRegisters[returnValue.fIndex],
+                           sizeof(Vector) * f->fReturnSlotCount);
+                    f = frame.fFunction;
+                    code = f->fCode.data();
+                    context.fCallStack.pop();
+                    NEXT();
+                }
+                LABEL(kScalarToMatrix) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    uint8_t columns = read<uint8_t>(&ip);
+                    uint8_t rows = read<uint8_t>(&ip);
+                    int offset = 0;
+                    for (int i = 0; i < columns; ++i) {
+                        for (int j = 0; j < rows; ++j) {
+                            if (i == j) {
+                                fRegisters[target.fIndex + offset] = fRegisters[src.fIndex];
+                            } else {
+                                fRegisters[target.fIndex + offset].fFloat = 0;
+                            }
+                            ++offset;
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kSelect) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register test = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src1 = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src2 = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex] = skvx::if_then_else(fRegisters[test.fIndex].fInt,
+                                                                   fRegisters[src1.fIndex].fFloat,
+                                                                   fRegisters[src2.fIndex].fFloat);
+                    NEXT();
+                }
+                LABEL(kShiftLeft) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    uint8_t count = read<uint8_t>(&ip);
+                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt << count;
+                    NEXT();
+                }
+                LABEL(kShiftRightS) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    int8_t count = read<int8_t>(&ip);
+                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt >> count;
+                    NEXT();
+                }
+                LABEL(kShiftRightU) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    uint8_t count = read<uint8_t>(&ip);
+                    fRegisters[target.fIndex].fUInt = fRegisters[src.fIndex].fUInt >> count;
+                    NEXT();
+                }
+                LABEL(kSignedToFloat) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex] = Vector(skvx::cast<float>(
+                                                                      fRegisters[src.fIndex].fInt));
+                    NEXT();
+                }
+                VECTOR_UNARY_FN(kSin, sinf)
+                LABEL(kSqrt) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex].fFloat = skvx::sqrt(fRegisters[src.fIndex].fFloat);
+                    NEXT();
+                }
+                LABEL(kStore) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            fMemory[fRegisters[target.fIndex].fInt[i]].fInt[i] =
+                                                                     fRegisters[src.fIndex].fInt[i];
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kStoreDirect) {
+                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fMemory[target.fAddress] = skvx::if_then_else(mask(),
+                                                                  fRegisters[src.fIndex].fFloat,
+                                                                  fMemory[target.fAddress].fFloat);
+                    NEXT();
+                }
+                LABEL(kStoreParameter) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    Vector* base = parameterBase();
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            base[fRegisters[target.fIndex].fInt[i]].fInt[i] =
+                                                                     fRegisters[src.fIndex].fInt[i];
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kStoreParameterDirect) {
+                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    Vector* base = parameterBase();
+                    base[target.fAddress] = skvx::if_then_else(mask(),
+                                                               fRegisters[src.fIndex].fFloat,
+                                                               base[target.fAddress].fFloat);
+                    NEXT();
+                }
+                LABEL(kStoreStack) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            context.fStack[fRegisters[target.fIndex].fInt[i]].fInt[i] =
+                                                                     fRegisters[src.fIndex].fInt[i];
+                        }
+                    }
+                    NEXT();
+                }
+                LABEL(kStoreStackDirect) {
+                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
+                    CHECK_STACK_BOUNDS(target.fAddress);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    context.fStack[target.fAddress] = skvx::if_then_else(
+                                                            mask(),
+                                                            fRegisters[src.fIndex].fFloat,
+                                                            context.fStack[target.fAddress].fFloat);
+                    NEXT();
+                }
+                VECTOR_UNARY_FN(kTan, tanf)
+                LABEL(kUnsignedToFloat) {
+                    ByteCode::Register target = read<ByteCode::Register>(&ip);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    fRegisters[target.fIndex] = Vector(skvx::cast<float>(
+                                                                     fRegisters[src.fIndex].fUInt));
+                    NEXT();
+                }
+                LABEL(kWriteExternal) {
+                    uint8_t index = read<uint8_t>(&ip);
+                    uint8_t count = read<uint8_t>(&ip);
+                    SkASSERT(count <= 4);
+                    SkASSERT(fCode->fExternalValues.size() > index);
+                    ByteCode::Register src = read<ByteCode::Register>(&ip);
+                    float tmp[4];
+                    VectorI m = mask();
+                    for (int i = 0; i < width; ++i) {
+                        if (m[i]) {
+                            for (int j = 0; j < count; ++j) {
+                                tmp[j] = fRegisters[src.fIndex + j].fFloat[i];
+                            }
+                            fCode->fExternalValues[index]->write(baseIndex + i, tmp);
+                        }
+                    }
+                    NEXT();
+                }
+#ifndef SKSL_THREADED_CODE
+            }
+        }
+#endif
+    }
+
+    const std::unique_ptr<ByteCode> fCode;
+
+    void* fBackingStore;
+
+    Vector* fRegisters;
+
+    Vector* fMemory;
+
+    friend class ByteCode;
+
+    friend class ByteCodeGenerator;
+};
+
+#undef BINARY_OP
+#undef CHECK_STACK_BOUNDS
+
+} // namespace
+
+#endif
diff --git a/src/sksl/ir/SkSLFunctionDeclaration.h b/src/sksl/ir/SkSLFunctionDeclaration.h
index 11b04a5..f7ce904 100644
--- a/src/sksl/ir/SkSLFunctionDeclaration.h
+++ b/src/sksl/ir/SkSLFunctionDeclaration.h
@@ -36,7 +36,7 @@
         for (auto p : fParameters) {
             result += separator;
             separator = ", ";
-            result += p->fName;
+            result += p->fType.displayName();
         }
         result += ")";
         return result;
diff --git a/src/sksl/ir/SkSLSymbolTable.cpp b/src/sksl/ir/SkSLSymbolTable.cpp
index ed2cb4d..bbf001d 100644
--- a/src/sksl/ir/SkSLSymbolTable.cpp
+++ b/src/sksl/ir/SkSLSymbolTable.cpp
@@ -114,9 +114,7 @@
                 break;
             case Symbol::kUnresolvedFunction_Kind:
                 for (auto& f : ((UnresolvedFunction&) *pair.second).fFunctions) {
-                    if (!((FunctionDeclaration*)f)->fDefined) {
-                        ((FunctionDeclaration*)f)->fBuiltin = true;
-                    }
+                    ((FunctionDeclaration*)f)->fBuiltin = true;
                 }
                 break;
             default:
diff --git a/src/sksl/sksl_interp.inc b/src/sksl/sksl_interp.inc
index f43f05f..e576f9f 100644
--- a/src/sksl/sksl_interp.inc
+++ b/src/sksl/sksl_interp.inc
@@ -1,5 +1,7 @@
 STRINGIFY(
 
+sk_has_side_effects void print(float f);
+
 $genType cos($genType y);
 $genHType cos($genHType y);
 float dot($genType x, $genType y);
diff --git a/tests/SkSLInterpreterTest.cpp b/tests/SkSLInterpreterTest.cpp
index 8251269..9b2ab2c 100644
--- a/tests/SkSLInterpreterTest.cpp
+++ b/tests/SkSLInterpreterTest.cpp
@@ -9,19 +9,11 @@
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLCompiler.h"
 #include "src/sksl/SkSLExternalValue.h"
+#include "src/sksl/SkSLInterpreter.h"
 #include "src/utils/SkJSON.h"
 
 #include "tests/Test.h"
 
-static bool nearly_equal(const float a[], const float b[], int count) {
-    for (int i = 0; i < count; ++i) {
-        if (!SkScalarNearlyEqual(a[i], b[i])) {
-            return false;
-        }
-    }
-    return true;
-}
-
 void test(skiatest::Reporter* r, const char* src, float* in, float* expected,
           bool exactCompare = true) {
     SkSL::Compiler compiler;
@@ -39,30 +31,11 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        int returnCount = main->getReturnCount();
-        std::unique_ptr<float[]> out = std::unique_ptr<float[]>(new float[returnCount]);
-        SkAssertResult(byteCode->run(main, in, main->getParameterCount(), out.get(), returnCount,
-                                     nullptr, 0));
-        bool valid = exactCompare ? !memcmp(out.get(), expected, sizeof(float) * returnCount)
-                                  : nearly_equal(out.get(), expected, returnCount);
-        if (!valid) {
-            printf("for program: %s\n", src);
-            printf("    expected (");
-            const char* separator = "";
-            for (int i = 0; i < returnCount; ++i) {
-                printf("%s%f", separator, expected[i]);
-                separator = ", ";
-            }
-            printf("), but received (");
-            separator = "";
-            for (int i = 0; i < returnCount; ++i) {
-                printf("%s%f", separator, out.get()[i]);
-                separator = ", ";
-            }
-            printf(")\n");
-            main->disassemble();
-        }
-        REPORTER_ASSERT(r, valid);
+        SkSL::Interpreter<1> interpreter(std::move(byteCode));
+        SkSL::ByteCode::Vector<1>* result;
+        bool success = interpreter.run(main, (SkSL::ByteCode::Vector<1>*) in, &result);
+        REPORTER_ASSERT(r, success);
+        REPORTER_ASSERT(r, result->fFloat[0] == expected[0]);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }
@@ -83,7 +56,8 @@
         return;
     }
 
-    const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
+    const SkSL::ByteCodeFunction* main1 = byteCode->getFunction("main");
+    SkSL::Interpreter<1> interpreter1(std::move(byteCode));
 
     // Test on four different vectors (with varying orderings to get divergent control flow)
     const float input[16] = { 1, 2, 3, 4,
@@ -97,9 +71,16 @@
 
     // First run in scalar mode to determine the expected output
     for (int i = 0; i < 4; ++i) {
-        SkAssertResult(byteCode->run(main, out_s + i * 4, 4, nullptr, 0, nullptr, 0));
+        SkAssertResult(interpreter1.run(main1, (SkSL::ByteCode::Vector<1>*) (out_s + i * 4),
+                       nullptr));
     }
 
+    byteCode = compiler.toByteCode(*program);
+    SkASSERT(compiler.errorCount() == 0);
+
+    const SkSL::ByteCodeFunction* main4 = byteCode->getFunction("main");
+    SkSL::Interpreter<4> interpreter4(std::move(byteCode));
+
     // Need to transpose input vectors for striped execution
     auto transpose = [](float* v) {
         for (int r = 0; r < 4; ++r)
@@ -112,7 +93,7 @@
     float* args[] = { out_v, out_v + 4, out_v + 8, out_v + 12 };
 
     // Now run in parallel and compare results
-    SkAssertResult(byteCode->runStriped(main, 4, args, 4, nullptr, 0, nullptr, 0));
+    SkAssertResult(interpreter4.runStriped(main4, 4, (float**) args));
 
     // Transpose striped outputs back
     transpose(out_v);
@@ -125,7 +106,7 @@
                     out_v[4*i + 0], out_v[4*i + 1], out_v[4*i + 2], out_v[4*i + 3],
                     out_s[4*i + 0], out_s[4*i + 1], out_s[4*i + 2], out_s[4*i + 3]);
         }
-        main->disassemble();
+        main4->disassemble();
         REPORT_FAILURE(r, "VecInterpreter mismatch", SkString());
     }
 }
@@ -147,20 +128,26 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        float inoutColor[4] = { inR, inG, inB, inA };
-        SkAssertResult(byteCode->run(main, inoutColor, 4, nullptr, 0, nullptr, 0));
-        if (inoutColor[0] != expectedR || inoutColor[1] != expectedG ||
-            inoutColor[2] != expectedB || inoutColor[3] != expectedA) {
+        SkSL::ByteCode::Vector<1> inoutColor[4];
+        inoutColor[0].fFloat[0] = inR;
+        inoutColor[1].fFloat[0] = inG;
+        inoutColor[2].fFloat[0] = inB;
+        inoutColor[3].fFloat[0] = inA;
+        SkSL::Interpreter<1> interpreter(std::move(byteCode));
+        bool success = interpreter.run(main, inoutColor, nullptr);
+        REPORTER_ASSERT(r, success);
+        if (inoutColor[0].fFloat[0] != expectedR || inoutColor[1].fFloat[0] != expectedG ||
+            inoutColor[2].fFloat[0] != expectedB || inoutColor[3].fFloat[0] != expectedA) {
             printf("for program: %s\n", src);
             printf("    expected (%f, %f, %f, %f), but received (%f, %f, %f, %f)\n", expectedR,
-                   expectedG, expectedB, expectedA, inoutColor[0], inoutColor[1], inoutColor[2],
-                   inoutColor[3]);
+                   expectedG, expectedB, expectedA, inoutColor[0].fFloat[0],
+                   inoutColor[1].fFloat[0], inoutColor[2].fFloat[0], inoutColor[3].fFloat[0]);
             main->disassemble();
         }
-        REPORTER_ASSERT(r, inoutColor[0] == expectedR);
-        REPORTER_ASSERT(r, inoutColor[1] == expectedG);
-        REPORTER_ASSERT(r, inoutColor[2] == expectedB);
-        REPORTER_ASSERT(r, inoutColor[3] == expectedA);
+        REPORTER_ASSERT(r, inoutColor[0].fFloat[0] == expectedR);
+        REPORTER_ASSERT(r, inoutColor[1].fFloat[0] == expectedG);
+        REPORTER_ASSERT(r, inoutColor[2].fFloat[0] == expectedB);
+        REPORTER_ASSERT(r, inoutColor[3].fFloat[0] == expectedA);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }
@@ -177,6 +164,10 @@
          0.5, 1, 1.5, 2);
     test(r, "void main(inout half4 color) { color.r = int(color.r) + int(color.g); }", 1, 3, 0, 0,
          4, 3, 0, 0);
+    test(r, "void main(inout half4 color) { color.rg = color.r + color.gb; }", 1, 2, 3, 4,
+         3, 4, 3, 4);
+    test(r, "void main(inout half4 color) { color.rg = color.rg + color.b; }", 1, 2, 3, 4,
+         4, 5, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterSubtract, r) {
@@ -189,6 +180,10 @@
     test(r, "void main(inout half4 color) { color = -color; }", 4, 3, 2, 1, -4, -3, -2, -1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) - int(color.g); }", 3, 1, 0, 0,
          2, 1, 0, 0);
+    test(r, "void main(inout half4 color) { color.rg = color.r - color.gb; }", 1, 2, 3, 4,
+         -1, -2, 3, 4);
+    test(r, "void main(inout half4 color) { color.rg = color.rg - color.b; }", 1, 2, 3, 4,
+         -2, -1, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterMultiply, r) {
@@ -200,6 +195,10 @@
          16, 9, 4, 1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) * int(color.g); }", 3, -2, 0, 0,
          -6, -2, 0, 0);
+    test(r, "void main(inout half4 color) { color.rg = color.r * color.gb; }", 5, 2, 3, 4,
+         10, 15, 3, 4);
+    test(r, "void main(inout half4 color) { color.rg = color.rg * color.b; }", 1, 2, 3, 4,
+         3, 6, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterDivide, r) {
@@ -211,6 +210,10 @@
          1, 1, 1, 1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) / int(color.g); }", 8, -2, 0, 0,
          -4, -2, 0, 0);
+    test(r, "void main(inout half4 color) { color.rg = color.r / color.gb; }", 12, 2, 3, 4,
+         6, 4, 3, 4);
+    test(r, "void main(inout half4 color) { color.rg = color.rg / color.b; }", 6, 3, 3, 4,
+         2, 1, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterRemainder, r) {
@@ -222,6 +225,14 @@
          2, 3, 0, 0);
     test(r, "void main(inout half4 color) { color.rg = half2(int2(int(color.r), int(color.g)) % "
                 "int(color.b)); }", 8, 10, 6, 0, 2, 4, 6, 0);
+    test(r, "void main(inout half4 color) { color.rg = color.r + color.gb; }", 1, 2, 3, 4,
+         3, 4, 3, 4);
+    test(r, "void main(inout half4 color) { color.rg = color.rg + color.b; }", 1, 2, 3, 4,
+         4, 5, 3, 4);
+    test(r, "void main(inout half4 color) { color.rg = color.r % color.gb; }", 10, 2, 3, 4,
+         0, 1, 3, 4);
+    test(r, "void main(inout half4 color) { color.rg = color.rg % color.b; }", 6, 3, 4, 4,
+         2, 3, 4, 4);
 }
 
 DEF_TEST(SkSLInterpreterAnd, r) {
@@ -278,7 +289,7 @@
     unsigned out;
 
     out = 0x00000088;
-    test(r, "int  main(int  x) { return x << 3; }", (float*)&in, (float*)&out);
+    test(r, "int main(int x) { return x << 3; }", (float*)&in, (float*)&out);
 
     out = 0xF0000002;
     test(r, "int main(int x) { return x >> 3; }", (float*)&in, (float*)&out);
@@ -474,11 +485,15 @@
     test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
          1, 2, 1, 2, 1, 2, 1, 1);
     test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
+         1, 2, 1, 3, 1, 2, 1, 3);
+    test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
          1, 2, 3, 2, 1, 2, 3, 2);
     test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
          1, 2, 1, 2, 1, 2, 1, 2);
     test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
          1, 2, 3, 2, 1, 2, 3, 1);
+    test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
+         1, 2, 1, 3, 1, 2, 1, 1);
 }
 
 DEF_TEST(SkSLInterpreterWhile, r) {
@@ -641,51 +656,67 @@
     SkIRect gRects[4] = { { 1,2,3,4 }, { 5,6,7,8 }, { 9,10,11,12 }, { 13,14,15,16 } };
     const float* fRects = (const float*)gRects;
 
+    SkSL::Interpreter<1> interpreter(std::move(byteCode));
+    auto geti = [](SkSL::Interpreter<1>::Vector* v) { return v->fInt[0]; };
+    auto getf = [](SkSL::Interpreter<1>::Vector* v) { return v->fFloat[0]; };
+
     {
         SkIRect in = SkIRect::MakeXYWH(10, 10, 20, 30);
-        int out = 0;
-        SkAssertResult(byteCode->run(rect_height, (float*)&in, 4, (float*)&out, 1, fRects, 16));
-        REPORTER_ASSERT(r, out == 30);
+        SkSL::Interpreter<1>::Vector* out;
+        bool success = interpreter.run(rect_height, (SkSL::Interpreter<1>::Vector*) &in, &out);
+        REPORTER_ASSERT(r, success);
+        REPORTER_ASSERT(r, geti(out) == 30);
     }
 
     {
         int in[2] = { 15, 25 };
-        RectAndColor out;
-        SkAssertResult(byteCode->run(make_blue_rect, (float*)in, 2, (float*)&out, 8, fRects, 16));
-        REPORTER_ASSERT(r, out.fRect.width() == 15);
-        REPORTER_ASSERT(r, out.fRect.height() == 25);
+        SkSL::Interpreter<1>::Vector* out;
+        bool success = interpreter.run(make_blue_rect, (SkSL::Interpreter<1>::Vector*) in, &out);
+        REPORTER_ASSERT(r, success);
+        RectAndColor result{ { geti(out), geti(out + 1), geti(out + 2), geti(out + 3) },
+                             { getf(out + 4), getf(out + 5), getf(out + 6), getf(out + 7) } };
+        REPORTER_ASSERT(r, result.fRect.width() == 15);
+        REPORTER_ASSERT(r, result.fRect.height() == 25);
         SkColor4f blue = { 0.0f, 1.0f, 0.0f, 1.0f };
-        REPORTER_ASSERT(r, out.fColor == blue);
+        REPORTER_ASSERT(r, result.fColor == blue);
     }
 
     {
         int in[15] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-        int out = 0;
-        SkAssertResult(byteCode->run(median, (float*)in, 15, (float*)&out, 1, fRects, 16));
-        REPORTER_ASSERT(r, out == 8);
+        SkSL::Interpreter<1>::Vector* out;
+        bool success = interpreter.run(median, (SkSL::Interpreter<1>::Vector*) in, &out);
+        REPORTER_ASSERT(r, success);
+        REPORTER_ASSERT(r, geti(out) == 8);
     }
 
     {
         float in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
-        float out[8] = { 0 };
-        SkAssertResult(byteCode->run(sums, in, 8, out, 8, fRects, 16));
+        SkSL::Interpreter<1>::Vector* out;
+        bool success = interpreter.run(sums, (SkSL::Interpreter<1>::Vector*) in, &out);
+        REPORTER_ASSERT(r, success);
         for (int i = 0; i < 8; ++i) {
-            REPORTER_ASSERT(r, out[i] == static_cast<float>((i + 1) * (i + 2) / 2));
+            REPORTER_ASSERT(r, getf(out + i) == static_cast<float>((i + 1) * (i + 2) / 2));
         }
     }
 
     {
         int in = 2;
-        SkIRect out = SkIRect::MakeEmpty();
-        SkAssertResult(byteCode->run(get_rect, (float*)&in, 1, (float*)&out, 4, fRects, 16));
-        REPORTER_ASSERT(r, out == gRects[2]);
+        interpreter.setUniforms(fRects);
+        SkSL::Interpreter<1>::Vector* out;
+        bool success = interpreter.run(get_rect, (SkSL::Interpreter<1>::Vector*) &in, &out);
+        REPORTER_ASSERT(r, success);
+        REPORTER_ASSERT(r, geti(out) == gRects[2].fLeft);
+        REPORTER_ASSERT(r, geti(out + 1) == gRects[2].fTop);
+        REPORTER_ASSERT(r, geti(out + 2) == gRects[2].fRight);
+        REPORTER_ASSERT(r, geti(out + 3) == gRects[2].fBottom);
     }
 
     {
         ManyRects in;
         memset(&in, 0, sizeof(in));
         in.fNumRects = 2;
-        SkAssertResult(byteCode->run(fill_rects, (float*)&in, 33, nullptr, 0, fRects, 16));
+        bool success = interpreter.run(fill_rects, (SkSL::Interpreter<1>::Vector*) &in, nullptr);
+        REPORTER_ASSERT(r, success);
         ManyRects expected;
         memset(&expected, 0, sizeof(expected));
         expected.fNumRects = 2;
@@ -718,9 +749,11 @@
     auto byteCode = compiler.toByteCode(*program);
     REPORTER_ASSERT(r, byteCode);
 
-    auto fun = byteCode->getFunction("main");
-    bool result = byteCode->run(fun, in, fun->getParameterCount(), nullptr, 0, nullptr, 0);
-    REPORTER_ASSERT(r, !result);
+    auto main = byteCode->getFunction("main");
+    SkSL::Interpreter<1> interpreter(std::move(byteCode));
+    SkSL::ByteCode::Vector<1>* result;
+    bool success = interpreter.run(main, (SkSL::ByteCode::Vector<1>*) in, &result);
+    REPORTER_ASSERT(r, !success);
 }
 
 DEF_TEST(SkSLInterpreterRestrictFunctionCalls, r) {
@@ -786,16 +819,21 @@
     REPORTER_ASSERT(r, dot3);
     REPORTER_ASSERT(r, dot2);
 
-    float out = 0.0f;
+    SkSL::Interpreter<1> interpreter(std::move(byteCode));
     float in = 3.0f;
-    SkAssertResult(byteCode->run(main, &in, 1, &out, 1, nullptr, 0));
-    REPORTER_ASSERT(r, out = 6.0f);
 
-    SkAssertResult(byteCode->run(dot3, &in, 1, &out, 1, nullptr, 0));
-    REPORTER_ASSERT(r, out = 9.0f);
+    SkSL::Interpreter<1>::Vector* out;
+    bool success = interpreter.run(main, (SkSL::Interpreter<1>::Vector*) &in, &out);
+    REPORTER_ASSERT(r, success);
+    REPORTER_ASSERT(r, out->fFloat[0] == 6.0f);
 
-    SkAssertResult(byteCode->run(dot2, &in, 1, &out, 1, nullptr, 0));
-    REPORTER_ASSERT(r, out = -1.0f);
+    success = interpreter.run(dot3, (SkSL::Interpreter<1>::Vector*) &in, &out);
+    REPORTER_ASSERT(r, success);
+    REPORTER_ASSERT(r, out->fFloat[0] == 9.0f);
+
+    success = interpreter.run(dot2, (SkSL::Interpreter<1>::Vector*) &in, &out);
+    REPORTER_ASSERT(r, success);
+    REPORTER_ASSERT(r, out->fFloat[0] == -1.0f);
 }
 
 DEF_TEST(SkSLInterpreterOutParams, r) {
@@ -804,15 +842,18 @@
          "void main(inout half4 color) { oneAlpha(color); }",
          0, 0, 0, 0, 0, 0, 0, 1);
     test(r,
-         "half2 tricky(half x, half y, inout half2 color, half z) {"
+         "half2 tricky(half x, half y, inout half2 color, half z, out half w) {"
          "    color.xy = color.yx;"
+         "    w = 47;"
          "    return half2(x + y, z);"
          "}"
          "void main(inout half4 color) {"
-         "    half2 t = tricky(1, 2, color.rb, 5);"
+         "    half w;"
+         "    half2 t = tricky(1, 2, color.rb, 5, w);"
+         "    color.r += w;"
          "    color.ga = t;"
          "}",
-         1, 2, 3, 4, 3, 3, 1, 5);
+         1, 2, 3, 4, 50, 3, 1, 5);
 }
 
 DEF_TEST(SkSLInterpreterMathFunctions, r) {
@@ -1029,9 +1070,11 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        float out;
-        SkAssertResult(byteCode->run(main, nullptr, 0, &out, 1, nullptr, 0));
-        REPORTER_ASSERT(r, out == 66.0);
+        SkSL::Interpreter<1> interpreter(std::move(byteCode));
+        SkSL::ByteCode::Vector<1>* result;
+        bool success = interpreter.run(main, nullptr, &result);
+        REPORTER_ASSERT(r, success);
+        REPORTER_ASSERT(r, result->fFloat[0] == 66.0);
         REPORTER_ASSERT(r, outValue == 152);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
@@ -1062,7 +1105,9 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkAssertResult(byteCode->run(main, nullptr, 0, nullptr, 0, nullptr, 0));
+        SkSL::Interpreter<1> interpreter(std::move(byteCode));
+        bool success = interpreter.run(main, nullptr, nullptr);
+        REPORTER_ASSERT(r, success);
         REPORTER_ASSERT(r, value[0] == 2);
         REPORTER_ASSERT(r, value[1] == 4);
         REPORTER_ASSERT(r, value[2] == 6);
@@ -1127,9 +1172,11 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        float out;
-        SkAssertResult(byteCode->run(main, nullptr, 0, &out, 1, nullptr, 0));
-        REPORTER_ASSERT(r, out == 5.0);
+        SkSL::Interpreter<1> interpreter(std::move(byteCode));
+        SkSL::ByteCode::Vector<1>* result;
+        bool success = interpreter.run(main, nullptr, &result);
+        REPORTER_ASSERT(r, success);
+        REPORTER_ASSERT(r, result->fFloat[0] == 5.0);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }
@@ -1142,32 +1189,23 @@
         : INHERITED(name, *compiler.context().fFloat4_Type)
         , fCompiler(compiler)
         , fFunction(function) {}
-
     bool canCall() const override {
         return true;
     }
-
     int callParameterCount() const override {
         return 1;
     }
-
     void getCallParameterTypes(const SkSL::Type** outTypes) const override {
         outTypes[0] = fCompiler.context().fFloat4_Type.get();
     }
-
     void call(int /*unusedIndex*/, float* arguments, float* outReturn) override {
         fFunction(arguments, outReturn);
     }
-
 private:
     SkSL::Compiler& fCompiler;
-
     void (*fFunction)(float[4], float[4]);
-
     typedef SkSL::ExternalValue INHERITED;
 };
-
-
 DEF_TEST(SkSLInterpreterExternalValuesVectorCall, r) {
     SkSL::Compiler compiler;
     SkSL::Program::Settings settings;
@@ -1195,12 +1233,14 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        float out[4];
-        SkAssertResult(byteCode->run(main, nullptr, 0, out, 4, nullptr, 0));
-        REPORTER_ASSERT(r, out[0] == 1.0);
-        REPORTER_ASSERT(r, out[1] == 2.0);
-        REPORTER_ASSERT(r, out[2] == 3.0);
-        REPORTER_ASSERT(r, out[3] == 4.0);
+        SkSL::Interpreter<1> interpreter(std::move(byteCode));
+        SkSL::ByteCode::Vector<1>* result;
+        bool success = interpreter.run(main, nullptr, &result);
+        REPORTER_ASSERT(r, success);
+        REPORTER_ASSERT(r, result[0].fFloat[0] == 1.0);
+        REPORTER_ASSERT(r, result[1].fFloat[0] == 2.0);
+        REPORTER_ASSERT(r, result[2].fFloat[0] == 3.0);
+        REPORTER_ASSERT(r, result[3].fFloat[0] == 4.0);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }