Revert "Revert "Revert "Complete rewrite of the SkSL interpreter"""

This reverts commit 7deb1c26baaba66622994fb46c396e2f9d134359.

Revert "maybe fixed?"

This reverts commit 7ad3f229c767da2235ae8a5a5e77a1f6952feffc.

Revert "removed extraneous change"

This reverts commit 682f299aa8c766d48e3bb9eae3383541a392ecd5.

Revert "test change"

This reverts commit 5f40986cefe52989c002dbef3c2ac5eac47d9a50.

Revert "derp"

This reverts commit 4f830b8df32072debb5ea5c8540d892391723f5a.

Revert "let's see what happens"

This reverts commit d5290563f053cd7b06b8a676f498235b97b520b5.

Change-Id: Ib3c13c2a6ade9fc42382509d036e212c7fe50cc6
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/265979
Reviewed-by: Ben Wagner aka dogben <benjaminwagner@google.com>
diff --git a/bench/SkSLInterpreterBench.cpp b/bench/SkSLInterpreterBench.cpp
index 5684955..22afd28 100644
--- a/bench/SkSLInterpreterBench.cpp
+++ b/bench/SkSLInterpreterBench.cpp
@@ -9,7 +9,6 @@
 #include "include/utils/SkRandom.h"
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLCompiler.h"
-#include "src/sksl/SkSLInterpreter.h"
 
 // Without this build flag, this bench isn't runnable.
 #if defined(SK_ENABLE_SKSL_INTERPRETER)
@@ -23,8 +22,6 @@
         , fCount(pixels) {}
 
 protected:
-    static constexpr int VecWidth = 16;
-
     const char* onGetName() override {
         return fName.c_str();
     }
@@ -38,10 +35,9 @@
         SkSL::Program::Settings settings;
         auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, fSrc, settings);
         SkASSERT(compiler.errorCount() == 0);
-        std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
-        fMain = byteCode->getFunction("main");
-        fInterpreter.reset(new SkSL::Interpreter<VecWidth>(std::move(byteCode)));
+        fByteCode = compiler.toByteCode(*program);
         SkASSERT(compiler.errorCount() == 0);
+        fMain = fByteCode->getFunction("main");
 
         SkRandom rnd;
         fPixels.resize(fCount * 4);
@@ -59,14 +55,14 @@
                 fPixels.data() + 3 * fCount,
             };
 
-            fInterpreter->runStriped(fMain, fCount, (float**) args);
+            SkAssertResult(fByteCode->runStriped(fMain, fCount, args, 4, nullptr, 0, nullptr, 0));
         }
     }
 
 private:
     SkString fName;
     SkSL::String fSrc;
-    std::unique_ptr<SkSL::Interpreter<VecWidth>> fInterpreter;
+    std::unique_ptr<SkSL::ByteCode> fByteCode;
     const SkSL::ByteCodeFunction* fMain;
 
     int fCount;
diff --git a/gn/sksl.gni b/gn/sksl.gni
index 182d312..acc3242 100644
--- a/gn/sksl.gni
+++ b/gn/sksl.gni
@@ -8,6 +8,7 @@
 
 skia_sksl_sources = [
   "$_src/sksl/SkSLASTNode.cpp",
+  "$_src/sksl/SkSLByteCode.cpp",
   "$_src/sksl/SkSLByteCodeGenerator.cpp",
   "$_src/sksl/SkSLCFGGenerator.cpp",
   "$_src/sksl/SkSLCompiler.cpp",
diff --git a/modules/particles/include/SkParticleEffect.h b/modules/particles/include/SkParticleEffect.h
index b28eac8..b19ce2f 100644
--- a/modules/particles/include/SkParticleEffect.h
+++ b/modules/particles/include/SkParticleEffect.h
@@ -16,7 +16,6 @@
 #include "include/private/SkTemplates.h"
 #include "include/utils/SkRandom.h"
 #include "modules/particles/include/SkParticleData.h"
-#include "src/sksl/SkSLInterpreter.h"
 
 #include <memory>
 
@@ -26,8 +25,6 @@
 class SkParticleDrawable;
 class SkParticleExternalValue;
 
-static constexpr int INTERPRETER_WIDTH = 8;
-
 namespace skresources {
     class ResourceProvider;
 }
@@ -125,16 +122,13 @@
     friend class SkParticleEffect;
 
     // Cached
-    template<int width>
     struct Program {
-        std::unique_ptr<SkSL::Interpreter<width>> fInterpreter;
+        std::unique_ptr<SkSL::ByteCode> fByteCode;
         SkTArray<std::unique_ptr<SkParticleExternalValue>> fExternalValues;
     };
 
-    // for performance it would be better to run this with a Program<1>, but for code-size reasons
-    // we stick to INTERPRETER_WIDTH
-    Program<INTERPRETER_WIDTH> fEffectProgram;
-    Program<INTERPRETER_WIDTH> fParticleProgram;
+    Program fEffectProgram;
+    Program fParticleProgram;
 };
 
 class SkParticleEffect : public SkRefCnt {
@@ -189,17 +183,8 @@
     void setFrame   (float     f) { fState.fFrame    = f; }
     void setFlags   (uint32_t  f) { fState.fFlags    = f; }
 
-    const SkSL::ByteCode* effectCode() const {
-        return fParams->fEffectProgram.fInterpreter ?
-               &fParams->fEffectProgram.fInterpreter->getCode() :
-               nullptr;
-    }
-
-    const SkSL::ByteCode* particleCode() const {
-        return fParams->fParticleProgram.fInterpreter ?
-               &fParams->fParticleProgram.fInterpreter->getCode() :
-               nullptr;
-    }
+    const SkSL::ByteCode* effectCode() const { return fParams->fEffectProgram.fByteCode.get(); }
+    const SkSL::ByteCode* particleCode() const { return fParams->fParticleProgram.fByteCode.get(); }
 
     float* effectUniforms() { return fEffectUniforms.data(); }
     float* particleUniforms() { return fParticleUniforms.data(); }
diff --git a/modules/particles/src/SkParticleEffect.cpp b/modules/particles/src/SkParticleEffect.cpp
index ab4e4e1..a1d39d0 100644
--- a/modules/particles/src/SkParticleEffect.cpp
+++ b/modules/particles/src/SkParticleEffect.cpp
@@ -119,9 +119,7 @@
         fDrawable->prepare(resourceProvider);
     }
 
-    auto buildProgram = [this](const SkSL::String& code) ->
-                                     std::pair<std::unique_ptr<SkSL::ByteCode>,
-                                               SkTArray<std::unique_ptr<SkParticleExternalValue>>> {
+    auto buildProgram = [this](const SkSL::String& code, Program* p) {
         SkSL::Compiler compiler;
         SkSL::Program::Settings settings;
 
@@ -142,15 +140,17 @@
         auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, code, settings);
         if (!program) {
             SkDebugf("%s\n", compiler.errorText().c_str());
-            return std::make_pair(nullptr, std::move(externalValues));
+            return;
         }
 
         auto byteCode = compiler.toByteCode(*program);
         if (!byteCode) {
             SkDebugf("%s\n", compiler.errorText().c_str());
-            return std::make_pair(nullptr, std::move(externalValues));
+            return;
         }
-        return std::make_pair(std::move(byteCode), std::move(externalValues));
+
+        p->fByteCode = std::move(byteCode);
+        p->fExternalValues.swap(externalValues);
     };
 
     SkSL::String effectCode(kCommonHeader);
@@ -160,15 +160,8 @@
     particleCode.append(kParticleHeader);
     particleCode.append(fParticleCode.c_str());
 
-    auto effectProgram = buildProgram(effectCode);
-    fEffectProgram.fInterpreter.reset(new SkSL::Interpreter<INTERPRETER_WIDTH>(
-                                                                   std::move(effectProgram.first)));
-    fEffectProgram.fExternalValues.swap(effectProgram.second);
-
-    auto particleProgram = buildProgram(particleCode);
-    fParticleProgram.fInterpreter.reset(new SkSL::Interpreter<INTERPRETER_WIDTH>(
-                                                                 std::move(particleProgram.first)));
-    fParticleProgram.fExternalValues.swap(particleProgram.second);
+    buildProgram(effectCode, &fEffectProgram);
+    buildProgram(particleCode, &fParticleProgram);
 }
 
 SkParticleEffect::SkParticleEffect(sk_sp<SkParticleEffectParams> params, const SkRandom& random)
@@ -229,22 +222,15 @@
 }
 
 void SkParticleEffect::runEffectScript(double now, const char* entry) {
-    SkSL::Interpreter<INTERPRETER_WIDTH>* interpreter = fParams->fEffectProgram.fInterpreter.get();
-    if (interpreter) {
-        const auto& byteCode = interpreter->getCode();
-        if (auto fun = byteCode.getFunction(entry)) {
+    if (const auto& byteCode = fParams->fEffectProgram.fByteCode) {
+        if (auto fun = byteCode->getFunction(entry)) {
             for (const auto& value : fParams->fEffectProgram.fExternalValues) {
                 value->setRandom(&fRandom);
                 value->setEffect(this);
             }
-            interpreter->setUniforms(fEffectUniforms.data());
-            static constexpr int numChannels = sizeof(EffectState) / sizeof(float);
-            SkASSERT(numChannels == fun->getParameterSlotCount());
-            float* args[numChannels];
-            for (int i = 0; i < numChannels; ++i) {
-                args[i] = &fState.fAge + i;
-            }
-            SkAssertResult(interpreter->runStriped(fun, 1, args));
+            SkAssertResult(byteCode->run(fun, &fState.fAge, sizeof(EffectState) / sizeof(float),
+                                         nullptr, 0,
+                                         fEffectUniforms.data(), fEffectUniforms.count()));
             this->processEffectSpawnRequests(now);
         }
     }
@@ -277,11 +263,8 @@
 }
 
 void SkParticleEffect::runParticleScript(double now, const char* entry, int start, int count) {
-    SkSL::Interpreter<INTERPRETER_WIDTH>* interpreter =
-                                                       fParams->fParticleProgram.fInterpreter.get();
-    if (interpreter) {
-        const auto& byteCode = interpreter->getCode();
-        if (auto fun = byteCode.getFunction(entry)) {
+    if (const auto& byteCode = fParams->fParticleProgram.fByteCode) {
+        if (auto fun = byteCode->getFunction(entry)) {
             float* args[SkParticles::kNumChannels];
             for (int i = 0; i < SkParticles::kNumChannels; ++i) {
                 args[i] = fParticles.fData[i].get() + start;
@@ -292,8 +275,10 @@
                 value->setEffect(this);
             }
             memcpy(&fParticleUniforms[1], &fState.fAge, sizeof(EffectState));
-            interpreter->setUniforms(fParticleUniforms.data());
-            SkAssertResult(interpreter->runStriped(fun, count, (float**) args));
+            SkAssertResult(byteCode->runStriped(fun, count, args, SkParticles::kNumChannels,
+                                                nullptr, 0,
+                                                fParticleUniforms.data(),
+                                                fParticleUniforms.count()));
             this->processParticleSpawnRequests(now, start);
         }
     }
diff --git a/src/core/SkColorFilter.cpp b/src/core/SkColorFilter.cpp
index dfd34d2..26836d4 100644
--- a/src/core/SkColorFilter.cpp
+++ b/src/core/SkColorFilter.cpp
@@ -19,7 +19,6 @@
 #include "src/core/SkReadBuffer.h"
 #include "src/core/SkVM.h"
 #include "src/core/SkWriteBuffer.h"
-#include "src/sksl/SkSLInterpreter.h"
 
 #if SK_SUPPORT_GPU
 #include "src/gpu/GrFragmentProcessor.h"
@@ -421,20 +420,17 @@
         ctx->ninputs = fEffect->uniformSize() / 4;
         ctx->shaderConvention = false;
 
-        SkAutoMutexExclusive ama(fInterpreterMutex);
-        if (!fInterpreter) {
+        SkAutoMutexExclusive ama(fByteCodeMutex);
+        if (!fByteCode) {
             auto [byteCode, errorText] = fEffect->toByteCode(fInputs->data());
             if (!byteCode) {
                 SkDebugf("%s\n", errorText.c_str());
                 return false;
             }
-            fMain = byteCode->getFunction("main");
-            fInterpreter.reset(
-                           new SkSL::Interpreter<SkRasterPipeline_InterpreterCtx::VECTOR_WIDTH>(
-                                                                          std::move(byteCode)));
+            fByteCode = std::move(byteCode);
         }
-        ctx->fn = fMain;
-        ctx->interpreter = fInterpreter.get();
+        ctx->byteCode = fByteCode.get();
+        ctx->fn = ctx->byteCode->getFunction("main");
         rec.fPipeline->append(SkRasterPipeline::interpreter, ctx);
         return true;
     }
@@ -457,10 +453,8 @@
     sk_sp<SkRuntimeEffect> fEffect;
     sk_sp<SkData> fInputs;
 
-    mutable SkMutex fInterpreterMutex;
-    mutable std::unique_ptr<SkSL::Interpreter<SkRasterPipeline_InterpreterCtx::VECTOR_WIDTH>>
-                                                                                       fInterpreter;
-    mutable const SkSL::ByteCodeFunction* fMain;
+    mutable SkMutex fByteCodeMutex;
+    mutable std::unique_ptr<SkSL::ByteCode> fByteCode;
 
     friend class SkColorFilter;
 
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 1a6e582..98b009a 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -161,15 +161,12 @@
 };
 
 namespace SkSL {
+class ByteCode;
 class ByteCodeFunction;
-
-template<int width>
-class Interpreter;
 }
 
 struct SkRasterPipeline_InterpreterCtx {
-    static constexpr int VECTOR_WIDTH = 8;
-    SkSL::Interpreter<VECTOR_WIDTH>* interpreter;
+    const SkSL::ByteCode*         byteCode;
     const SkSL::ByteCodeFunction* fn;
 
     SkColor4f   paintColor;
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 0f75d54..00e2b67 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -10,7 +10,7 @@
 
 #include "include/core/SkTypes.h"
 #include "src/core/SkUtils.h"  // unaligned_{load,store}
-#include "src/sksl/SkSLInterpreter.h"
+#include "src/sksl/SkSLByteCode.h"
 
 // Every function in this file should be marked static and inline using SI.
 #if defined(__clang__)
@@ -2711,6 +2711,7 @@
 
     float*  args[]  = { xx, yy, rr, gg, bb, aa };
     float** in_args = args;
+    int     in_count = 6;
 
     if (c->shaderConvention) {
         // our caller must have called seed_shader to set these
@@ -2722,14 +2723,15 @@
         sk_unaligned_store(aa, F(c->paintColor.fA));
     } else {
         in_args += 2;   // skip x,y
+        in_count = 4;
         sk_unaligned_store(rr, r);
         sk_unaligned_store(gg, g);
         sk_unaligned_store(bb, b);
         sk_unaligned_store(aa, a);
     }
 
-    c->interpreter->setUniforms((float*) c->inputs);
-    SkAssertResult(c->interpreter->runStriped(c->fn, tail ? tail : N, (float**) in_args));
+    SkAssertResult(c->byteCode->runStriped(c->fn, tail ? tail : N, in_args, in_count,
+                                           nullptr, 0, (const float*)c->inputs, c->ninputs));
 
     r = sk_unaligned_load<F>(rr);
     g = sk_unaligned_load<F>(gg);
diff --git a/src/shaders/SkRTShader.cpp b/src/shaders/SkRTShader.cpp
index 90aeb1f..ce823eb 100644
--- a/src/shaders/SkRTShader.cpp
+++ b/src/shaders/SkRTShader.cpp
@@ -14,8 +14,6 @@
 #include "src/shaders/SkRTShader.h"
 
 #include "src/sksl/SkSLByteCode.h"
-#include "src/sksl/SkSLCompiler.h"
-#include "src/sksl/SkSLInterpreter.h"
 
 #if SK_SUPPORT_GPU
 #include "src/gpu/GrColorInfo.h"
@@ -47,19 +45,17 @@
     ctx->ninputs = fEffect->uniformSize() / 4;
     ctx->shaderConvention = true;
 
-    SkAutoMutexExclusive ama(fInterpreterMutex);
-    if (!fInterpreter) {
+    SkAutoMutexExclusive ama(fByteCodeMutex);
+    if (!fByteCode) {
         auto [byteCode, errorText] = fEffect->toByteCode(fInputs->data());
         if (!byteCode) {
             SkDebugf("%s\n", errorText.c_str());
             return false;
         }
-        fMain = byteCode->getFunction("main");
-        fInterpreter.reset(new SkSL::Interpreter<SkRasterPipeline_InterpreterCtx::VECTOR_WIDTH>(
-                                                                      std::move(byteCode)));
+        fByteCode = std::move(byteCode);
     }
-    ctx->fn = fMain;
-    ctx->interpreter = fInterpreter.get();
+    ctx->byteCode = fByteCode.get();
+    ctx->fn = ctx->byteCode->getFunction("main");
 
     rec.fPipeline->append(SkRasterPipeline::seed_shader);
     rec.fPipeline->append_matrix(rec.fAlloc, inverse);
diff --git a/src/shaders/SkRTShader.h b/src/shaders/SkRTShader.h
index 1ac56ec..5d44840 100644
--- a/src/shaders/SkRTShader.h
+++ b/src/shaders/SkRTShader.h
@@ -18,12 +18,7 @@
 class SkMatrix;
 class SkRuntimeEffect;
 
-namespace SkSL {
-    class ByteCodeFunction;
-
-    template<int width>
-    class Interpreter;
-}
+namespace SkSL { class ByteCode; }
 
 class SkRTShader : public SkShaderBase {
 public:
@@ -42,8 +37,6 @@
     bool onAppendStages(const SkStageRec& rec) const override;
 
 private:
-    static constexpr int VECTOR_WIDTH = 8;
-
     SK_FLATTENABLE_HOOKS(SkRTShader)
 
     sk_sp<SkRuntimeEffect> fEffect;
@@ -52,9 +45,8 @@
     sk_sp<SkData> fInputs;
     std::vector<sk_sp<SkShader>> fChildren;
 
-    mutable SkMutex fInterpreterMutex;
-    mutable std::unique_ptr<SkSL::Interpreter<VECTOR_WIDTH>> fInterpreter;
-    mutable const SkSL::ByteCodeFunction* fMain;
+    mutable SkMutex fByteCodeMutex;
+    mutable std::unique_ptr<SkSL::ByteCode> fByteCode;
 
     typedef SkShaderBase INHERITED;
 };
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
new file mode 100644
index 0000000..a9c3480
--- /dev/null
+++ b/src/sksl/SkSLByteCode.cpp
@@ -0,0 +1,1760 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_STANDALONE
+
+#include "include/core/SkPoint3.h"
+#include "include/private/SkVx.h"
+#include "src/core/SkUtils.h"   // sk_unaligned_load
+#include "src/sksl/SkSLByteCode.h"
+#include "src/sksl/SkSLByteCodeGenerator.h"
+#include "src/sksl/SkSLExternalValue.h"
+
+#include <vector>
+
+namespace SkSL {
+
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+
+constexpr int VecWidth = ByteCode::kVecWidth;
+
+struct Interpreter {
+
+using F32 = skvx::Vec<VecWidth, float>;
+using I32 = skvx::Vec<VecWidth, int32_t>;
+using U32 = skvx::Vec<VecWidth, uint32_t>;
+
+#define READ8() (*(ip++))  // fetch one operand byte; ip advances by 1
+#define READ16() (ip += 2, sk_unaligned_load<uint16_t>(ip - 2))  // unaligned 16-bit fetch; ip += 2
+#define READ32() (ip += 4, sk_unaligned_load<uint32_t>(ip - 4))  // unaligned 32-bit fetch; ip += 4
+#define READ_INST() (ip += sizeof(instruction), \
+                     sk_unaligned_load<instruction>(ip - sizeof(instruction)))  // next opcode
+
+#define VECTOR_DISASSEMBLE(op, text)  /* op, op2..op4: print, skip 1 byte */ \
+    case ByteCodeInstruction::op: printf(text); ++ip; break;        \
+    case ByteCodeInstruction::op##2: printf(text "2"); ++ip; break; \
+    case ByteCodeInstruction::op##3: printf(text "3"); ++ip; break; \
+    case ByteCodeInstruction::op##4: printf(text "4"); ++ip; break;
+
+#define VECTOR_DISASSEMBLE_NO_COUNT(op, text)  /* same, ip untouched */ \
+    case ByteCodeInstruction::op: printf(text); break;        \
+    case ByteCodeInstruction::op##2: printf(text "2"); break; \
+    case ByteCodeInstruction::op##3: printf(text "3"); break; \
+    case ByteCodeInstruction::op##4: printf(text "4"); break;
+
+#define VECTOR_MATRIX_DISASSEMBLE(op, text)  /* plus opN with a READ8 operand */ \
+    VECTOR_DISASSEMBLE(op, text)            \
+    case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
+
+#define VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(op, text)  /* plus opN with a READ8 operand */ \
+    VECTOR_DISASSEMBLE_NO_COUNT(op, text)            \
+    case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
+
+static const uint8_t* DisassembleInstruction(const uint8_t* ip) {  // print one op; return next ip
+    switch ((ByteCodeInstruction) (intptr_t) READ_INST()) {
+        VECTOR_MATRIX_DISASSEMBLE(kAddF, "addf")
+        VECTOR_DISASSEMBLE(kAddI, "addi")
+        case ByteCodeInstruction::kAndB: printf("andb"); break;
+        case ByteCodeInstruction::kBranch: printf("branch %d", READ16()); break;
+        case ByteCodeInstruction::kCall: printf("call %d", READ8()); break;
+        case ByteCodeInstruction::kCallExternal: {
+            int argumentCount = READ8();
+            int returnCount = READ8();
+            int externalValue = READ8();
+            printf("callexternal %d, %d, %d", argumentCount, returnCount, externalValue);
+            break;
+        }
+        case ByteCodeInstruction::kClampIndex: printf("clampindex %d", READ8()); break;
+        VECTOR_DISASSEMBLE(kCompareIEQ, "compareieq")
+        VECTOR_DISASSEMBLE(kCompareINEQ, "compareineq")
+        VECTOR_MATRIX_DISASSEMBLE(kCompareFEQ, "comparefeq")
+        VECTOR_MATRIX_DISASSEMBLE(kCompareFNEQ, "comparefneq")
+        VECTOR_DISASSEMBLE(kCompareFGT, "comparefgt")
+        VECTOR_DISASSEMBLE(kCompareFGTEQ, "comparefgteq")
+        VECTOR_DISASSEMBLE(kCompareFLT, "compareflt")
+        VECTOR_DISASSEMBLE(kCompareFLTEQ, "compareflteq")
+        VECTOR_DISASSEMBLE(kCompareSGT, "comparesgt")
+        VECTOR_DISASSEMBLE(kCompareSGTEQ, "comparesgteq")
+        VECTOR_DISASSEMBLE(kCompareSLT, "compareslt")
+        VECTOR_DISASSEMBLE(kCompareSLTEQ, "compareslteq")
+        VECTOR_DISASSEMBLE(kCompareUGT, "compareugt")
+        VECTOR_DISASSEMBLE(kCompareUGTEQ, "compareugteq")
+        VECTOR_DISASSEMBLE(kCompareULT, "compareult")
+        VECTOR_DISASSEMBLE(kCompareULTEQ, "compareulteq")
+        VECTOR_DISASSEMBLE_NO_COUNT(kConvertFtoI, "convertftoi")
+        VECTOR_DISASSEMBLE_NO_COUNT(kConvertStoF, "convertstof")
+        VECTOR_DISASSEMBLE_NO_COUNT(kConvertUtoF, "convertutof")
+        VECTOR_DISASSEMBLE(kCos, "cos")
+        VECTOR_MATRIX_DISASSEMBLE(kDivideF, "dividef")
+        VECTOR_DISASSEMBLE(kDivideS, "divides")
+        VECTOR_DISASSEMBLE(kDivideU, "divideu")
+        VECTOR_MATRIX_DISASSEMBLE(kDup, "dup")
+        case ByteCodeInstruction::kInverse2x2: printf("inverse2x2"); break;
+        case ByteCodeInstruction::kInverse3x3: printf("inverse3x3"); break;
+        case ByteCodeInstruction::kInverse4x4: printf("inverse4x4"); break;
+        case ByteCodeInstruction::kLoad: printf("load %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoad2: printf("load2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoad3: printf("load3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoad4: printf("load4 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadGlobal: printf("loadglobal %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadGlobal2: printf("loadglobal2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadGlobal3: printf("loadglobal3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadGlobal4: printf("loadglobal4 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadUniform: printf("loaduniform %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadUniform2: printf("loaduniform2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadUniform3: printf("loaduniform3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadUniform4: printf("loaduniform4 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadSwizzle: {
+            int target = READ8();
+            int count = READ8();
+            printf("loadswizzle %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kLoadSwizzleGlobal: {
+            int target = READ8();
+            int count = READ8();
+            printf("loadswizzleglobal %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kLoadSwizzleUniform: {
+            int target = READ8();
+            int count = READ8();
+            printf("loadswizzleuniform %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kLoadExtended: printf("loadextended %d", READ8()); break;
+        case ByteCodeInstruction::kLoadExtendedGlobal: printf("loadextendedglobal %d", READ8());
+            break;
+        case ByteCodeInstruction::kLoadExtendedUniform: printf("loadextendeduniform %d", READ8());
+            break;
+        case ByteCodeInstruction::kMatrixToMatrix: {
+            int srcCols = READ8();
+            int srcRows = READ8();
+            int dstCols = READ8();
+            int dstRows = READ8();
+            printf("matrixtomatrix %dx%d %dx%d", srcCols, srcRows, dstCols, dstRows);
+            break;
+        }
+        case ByteCodeInstruction::kMatrixMultiply: {
+            int lCols = READ8();
+            int lRows = READ8();
+            int rCols = READ8();
+            printf("matrixmultiply %dx%d %dx%d", lCols, lRows, rCols, lCols);
+            break;
+        }
+        VECTOR_MATRIX_DISASSEMBLE(kMultiplyF, "multiplyf")
+        VECTOR_DISASSEMBLE(kMultiplyI, "multiplyi")
+        VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(kNegateF, "negatef")
+        VECTOR_DISASSEMBLE_NO_COUNT(kNegateI, "negatei")
+        case ByteCodeInstruction::kNotB: printf("notb"); break;
+        case ByteCodeInstruction::kOrB: printf("orb"); break;
+        VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(kPop, "pop")
+        case ByteCodeInstruction::kPushImmediate: {
+            uint32_t v = READ32();
+            union { uint32_t u; float f; } pun = { v };
+            printf("pushimmediate %s", (to_string(v) + "(" + to_string(pun.f) + ")").c_str());
+            break;
+        }
+        case ByteCodeInstruction::kReadExternal: printf("readexternal %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kReadExternal2: printf("readexternal2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kReadExternal3: printf("readexternal3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kReadExternal4: printf("readexternal4 %d", READ16() >> 8); break;
+        VECTOR_DISASSEMBLE(kRemainderF, "remainderf")
+        VECTOR_DISASSEMBLE(kRemainderS, "remainders")
+        VECTOR_DISASSEMBLE(kRemainderU, "remainderu")
+        case ByteCodeInstruction::kReserve: printf("reserve %d", READ8()); break;
+        case ByteCodeInstruction::kReturn: printf("return %d", READ8()); break;
+        case ByteCodeInstruction::kScalarToMatrix: {
+            int cols = READ8();
+            int rows = READ8();
+            printf("scalartomatrix %dx%d", cols, rows);
+            break;
+        }
+        case ByteCodeInstruction::kShiftLeft: printf("shl %d", READ8()); break;
+        case ByteCodeInstruction::kShiftRightS: printf("shrs %d", READ8()); break;
+        case ByteCodeInstruction::kShiftRightU: printf("shru %d", READ8()); break;
+        VECTOR_DISASSEMBLE(kSin, "sin")
+        VECTOR_DISASSEMBLE_NO_COUNT(kSqrt, "sqrt")
+        case ByteCodeInstruction::kStore: printf("store %d", READ8()); break;
+        case ByteCodeInstruction::kStore2: printf("store2 %d", READ8()); break;
+        case ByteCodeInstruction::kStore3: printf("store3 %d", READ8()); break;
+        case ByteCodeInstruction::kStore4: printf("store4 %d", READ8()); break;
+        case ByteCodeInstruction::kStoreGlobal: printf("storeglobal %d", READ8()); break;
+        case ByteCodeInstruction::kStoreGlobal2: printf("storeglobal2 %d", READ8()); break;
+        case ByteCodeInstruction::kStoreGlobal3: printf("storeglobal3 %d", READ8()); break;
+        case ByteCodeInstruction::kStoreGlobal4: printf("storeglobal4 %d", READ8()); break;
+        case ByteCodeInstruction::kStoreSwizzle: {
+            int target = READ8();
+            int count = READ8();
+            printf("storeswizzle %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kStoreSwizzleGlobal: {
+            int target = READ8();
+            int count = READ8();
+            printf("storeswizzleglobal %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kStoreSwizzleIndirect: {
+            int count = READ8();
+            printf("storeswizzleindirect %d", count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
+            int count = READ8();
+            printf("storeswizzleindirectglobal %d", count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kStoreExtended: printf("storeextended %d", READ8()); break;
+        case ByteCodeInstruction::kStoreExtendedGlobal: printf("storeextendedglobal %d", READ8());
+            break;
+        VECTOR_MATRIX_DISASSEMBLE(kSubtractF, "subtractf")
+        VECTOR_DISASSEMBLE(kSubtractI, "subtracti")
+        case ByteCodeInstruction::kSwizzle: {
+            printf("swizzle %d, ", READ8());
+            int count = READ8();
+            printf("%d", count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        VECTOR_DISASSEMBLE(kTan, "tan")
+        case ByteCodeInstruction::kWriteExternal: printf("writeexternal %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kWriteExternal2: printf("writeexternal2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kWriteExternal3: printf("writeexternal3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kWriteExternal4: printf("writeexternal4 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kXorB: printf("xorb"); break;
+        case ByteCodeInstruction::kMaskPush: printf("maskpush"); break;
+        case ByteCodeInstruction::kMaskPop: printf("maskpop"); break;
+        case ByteCodeInstruction::kMaskNegate: printf("masknegate"); break;
+        case ByteCodeInstruction::kMaskBlend: printf("maskblend %d", READ8()); break;
+        case ByteCodeInstruction::kBranchIfAllFalse:
+            printf("branchifallfalse %d", READ16());
+            break;
+        case ByteCodeInstruction::kLoopBegin: printf("loopbegin"); break;
+        case ByteCodeInstruction::kLoopNext: printf("loopnext"); break;
+        case ByteCodeInstruction::kLoopMask: printf("loopmask"); break;
+        case ByteCodeInstruction::kLoopEnd: printf("loopend"); break;
+        case ByteCodeInstruction::kLoopContinue: printf("loopcontinue"); break;
+        case ByteCodeInstruction::kLoopBreak: printf("loopbreak"); break;
+        default:
+            ip -= sizeof(instruction);  // rewind so READ_INST below re-reads the unknown opcode
+            printf("unknown(%d)\n", (int) (intptr_t) READ_INST());
+            SkASSERT(false);
+    }
+    return ip;
+}
+
+#ifdef SKSLC_THREADED_CODE
+    #define LABEL(name) name:  // each opcode handler is a goto label
+    #ifdef TRACE
+        #define NEXT() goto next  // jump to the `next` label (defined outside this view)
+    #else
+        #define NEXT() goto *READ_INST()  // jump to the label address fetched from the stream
+    #endif
+#else  // !SKSLC_THREADED_CODE
+    #define LABEL(name) case ByteCodeInstruction::name:  // each opcode handler is a switch case
+    #define NEXT() continue  // resume the enclosing loop
+#endif  // SKSLC_THREADED_CODE
+
+#define VECTOR_BINARY_OP(base, field, op)             \
+    LABEL(base ## 4)                                  \
+        sp[-4] = sp[-4].field op sp[0].field;         \
+        POP();                                        \
+        /* fall through: 3 more element ops below */  \
+    LABEL(base ## 3) {                                \
+        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; \
+        POP();                                        \
+    }   /* fall through: 2 more element ops below */  \
+    LABEL(base ## 2) {                                \
+        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; \
+        POP();                                        \
+    }   /* fall through: 1 more element op below */   \
+    LABEL(base) {                                     \
+        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; \
+        POP();                                        \
+        ++ip;  /* step over the ip[0] operand byte */ \
+        NEXT();                                       \
+    }
+
+// A naive implementation of / or % using skvx operations will likely crash with a divide by zero
+// in inactive vector lanes, so we need to be sure to avoid masked-off lanes.
+#define VECTOR_BINARY_MASKED_OP(base, field, op)            \
+    LABEL(base ## 4)                                        \
+        for (int i = 0; i < VecWidth; ++i) {                \
+            if (mask()[i]) {                                \
+                sp[-4].field[i] op ## = sp[0].field[i];     \
+            }                                               \
+        }                                                   \
+        POP();                                              \
+        /* fall through: 3 more element ops below */        \
+    LABEL(base ## 3) {                                      \
+        for (int i = 0; i < VecWidth; ++i) {                \
+            if (mask()[i]) {                                \
+                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
+            }                                               \
+        }                                                   \
+        POP();                                              \
+    }   /* fall through: 2 more element ops below */        \
+    LABEL(base ## 2) {                                      \
+        for (int i = 0; i < VecWidth; ++i) {                \
+            if (mask()[i]) {                                \
+                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
+            }                                               \
+        }                                                   \
+        POP();                                              \
+    }   /* fall through: 1 more element op below */         \
+    LABEL(base) {                                           \
+        for (int i = 0; i < VecWidth; ++i) {                \
+            if (mask()[i]) {                                \
+                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
+            }                                               \
+        }                                                   \
+        POP();                                              \
+        ++ip;  /* step over the ip[0] operand byte */       \
+        NEXT();                                             \
+    }
+
+
+// Extends VECTOR_BINARY_OP with a fifth handler, baseN, for operands of arbitrary size.
+// baseN reads the component count with READ8() and then performs `count` combine-and-pop
+// steps; because each step pops one value, the left-hand slot stays at sp[-count] throughout.
+#define VECTOR_MATRIX_BINARY_OP(base, field, op)          \
+    VECTOR_BINARY_OP(base, field, op)                     \
+    LABEL(base ## N) {                                    \
+        int count = READ8();                              \
+        for (int i = count; i > 0; --i) {                 \
+            sp[-count] = sp[-count].field op sp[0].field; \
+            POP();                                        \
+        }                                                 \
+        NEXT();                                           \
+    }
+
+// Function-call flavor of VECTOR_BINARY_OP: each step stores fn(lhs, rhs) into the left-hand
+// slot and pops the right-hand value. The label ladder (base4 -> base3 -> base2 -> base), the
+// use of ip[0] as the slot offset, and the final ++ip are the same as in VECTOR_BINARY_OP.
+#define VECTOR_BINARY_FN(base, field, fn)               \
+    LABEL(base ## 4)                                    \
+        sp[-4] = fn(sp[-4].field, sp[0].field);         \
+        POP();                                          \
+        /* fall through */                              \
+    LABEL(base ## 3) {                                  \
+        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
+        POP();                                          \
+    }   /* fall through */                              \
+    LABEL(base ## 2) {                                  \
+        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
+        POP();                                          \
+    }   /* fall through */                              \
+    LABEL(base) {                                       \
+        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
+        POP();                                          \
+        ++ip;                                           \
+        NEXT();                                         \
+    }
+
+// Emits the handlers for a unary vector function applied in place to the top one to four stack
+// slots. The labels fall through, so entering at baseK rewrites sp[-(K-1)] .. sp[0]. The stack
+// depth is unchanged and this macro itself does not advance ip.
+#define VECTOR_UNARY_FN(base, fn, field)         \
+    LABEL(base ## 4)  sp[-3] = fn(sp[-3].field); \
+    LABEL(base ## 3)  sp[-2] = fn(sp[-2].field); \
+    LABEL(base ## 2)  sp[-1] = fn(sp[-1].field); \
+    LABEL(base)       sp[ 0] = fn(sp[ 0].field); \
+                      NEXT();
+
+// Emits the handlers for a unary *scalar* function (e.g. cosf) applied to every lane of the top
+// `count` stack slots. All four labels share one body; the component count is taken from the
+// operand byte read with READ8(). The slots are walked as one flat run of VecWidth * count
+// floats, which assumes each VValue holds exactly VecWidth contiguous floats.
+#define VECTOR_UNARY_FN_VEC(base, fn)                     \
+    LABEL(base ## 4)                                      \
+    LABEL(base ## 3)                                      \
+    LABEL(base ## 2)                                      \
+    LABEL(base) {                                         \
+        int count = READ8();                              \
+        /* Step back in whole VValue slots first, and  */ \
+        /* only then view the storage as raw floats;   */ \
+        /* offsetting a float* would move by lanes.    */ \
+        float* v = (float*)(sp - count + 1);              \
+        for (int i = VecWidth * count; i > 0; --i, ++v) { \
+            *v = fn(*v);                                  \
+        }                                                 \
+        NEXT();                                           \
+    }
+
+// Expands to the addresses of the four handlers base4, base3, base2, base (in that order) as
+// a comma-separated list, for use inside a label-address table.
+#define VECTOR_LABELS(base) \
+    &&base ## 4,            \
+    &&base ## 3,            \
+    &&base ## 2,            \
+    &&base
+
+// Same as VECTOR_LABELS, followed by the address of the baseN handler.
+#define VECTOR_MATRIX_LABELS(base) \
+    VECTOR_LABELS(base),           \
+    &&base ## N
+
+// If you trip this assert, it means that the order of the opcodes listed in ByteCodeInstruction
+// does not match the order of the opcodes listed in the 'labels' array in InnerRun().
+#define CHECK_LABEL(name) \
+    SkASSERT(labels[(int) ByteCodeInstruction::name] == &&name)
+
+// Checks the four entries emitted by VECTOR_LABELS(name): name4, name3, name2, name.
+#define CHECK_VECTOR_LABELS(name) \
+    CHECK_LABEL(name ## 4);       \
+    CHECK_LABEL(name ## 3);       \
+    CHECK_LABEL(name ## 2);       \
+    CHECK_LABEL(name)
+
+// Checks the five entries emitted by VECTOR_MATRIX_LABELS(name), i.e. the four above plus nameN.
+#define CHECK_VECTOR_MATRIX_LABELS(name) \
+    CHECK_VECTOR_LABELS(name);           \
+    CHECK_LABEL(name ## N)
+
+// One interpreter stack slot. The same storage can be viewed as float, signed, or unsigned
+// lanes; there is no tag recording which view was written last, so each opcode simply uses
+// the field it needs.
+union VValue {
+    F32 fFloat;
+    I32 fSigned;
+    U32 fUnsigned;
+
+    // Does not initialize any member.
+    VValue() {}
+
+    // Implicit conversions, so a vector expression can be assigned directly into a slot.
+    VValue(U32 u) : fUnsigned(u) {}
+    VValue(I32 s) : fSigned(s) {}
+    VValue(F32 f) : fFloat(f) {}
+};
+
+// Caller state that kCall saves before entering a function and kReturn restores afterwards.
+// The member order matters: frames are created with positional aggregate initialization.
+struct StackFrame {
+    // Start of the caller's bytecode.
+    const uint8_t* fCode;
+    // Caller's instruction pointer at the time of the call (where execution resumes).
+    const uint8_t* fIP;
+    // Caller's stack base.
+    VValue* fStack;
+    // Parameter count of the function being called; kReturn uses it to position sp at the
+    // end of the passed-in parameters.
+    int fParameterCount;
+};
+
+// Lane-wise floating-point remainder: subtracts from `a` the largest whole multiple of `b`
+// (quotient truncated toward zero), so the result takes the sign of `a`.
+static F32 VecMod(F32 a, F32 b) {
+    F32 wholeQuotient = skvx::trunc(a / b);
+    return a - wholeQuotient * b;
+}
+
+// Shorthand for the float view of the stack slot at offset `index` from a variable named `sp`
+// in the enclosing scope.
+#define spf(index)  sp[index].fFloat
+
+// Invokes an external function once for every active lane.
+//
+// Three operand bytes are read from the instruction stream via READ8() (defined elsewhere in
+// this file; presumably it advances `ip`): the argument count, the return count, and the index
+// into byteCode->fExternalValues. The arguments occupy the top `argumentCount` stack slots; on
+// exit `sp` points at the last of `returnCount` result slots, which start where the first
+// argument was. Lanes whose `mask` entry is zero are neither read nor written.
+static void CallExternal(const ByteCode* byteCode, const uint8_t*& ip, VValue*& sp,
+                          int baseIndex, I32 mask) {
+    int argumentCount = READ8();
+    int returnCount = READ8();
+    int target = READ8();
+    ExternalValue* external = byteCode->fExternalValues[target];
+
+    // Scalar scratch space for a single lane's arguments and results.
+    float laneArgs[4];
+    float laneResults[4];
+    SkASSERT(argumentCount <= (int)SK_ARRAY_COUNT(laneArgs));
+    SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(laneResults));
+
+    // Rewind to the first argument; results are written back starting at the same slot.
+    sp -= argumentCount - 1;
+
+    for (int lane = 0; lane < VecWidth; ++lane) {
+        if (!mask[lane]) {
+            continue;
+        }
+        for (int arg = 0; arg < argumentCount; ++arg) {
+            laneArgs[arg] = sp[arg].fFloat[lane];
+        }
+        external->call(baseIndex + lane, laneArgs, laneResults);
+        for (int ret = 0; ret < returnCount; ++ret) {
+            sp[ret].fFloat[lane] = laneResults[ret];
+        }
+    }
+
+    // Leave sp on the last result slot.
+    sp += returnCount - 1;
+}
+
+// Replaces the 2x2 matrix held in the top four stack slots (sp[-3] .. sp[0]) with its inverse,
+// computed independently for every lane. The determinant is not checked for zero.
+static void Inverse2x2(VValue* sp) {
+    VValue* m = sp - 3;
+    F32 m0 = m[0].fFloat;
+    F32 m1 = m[1].fFloat;
+    F32 m2 = m[2].fFloat;
+    F32 m3 = m[3].fFloat;
+
+    F32 invDet = F32(1) / (m0*m3 - m1*m2);
+
+    // Adjugate scaled by 1/det: swap the diagonal entries, negate the off-diagonal ones.
+    m[0].fFloat = m3 * invDet;
+    m[1].fFloat = -m1 * invDet;
+    m[2].fFloat = -m2 * invDet;
+    m[3].fFloat = m0 * invDet;
+}
+
+// Replaces the 3x3 matrix held in the top nine stack slots (sp[-8] .. sp[0]) with its inverse,
+// computed independently for every lane. With aRC read as row R / column C, consecutive slots
+// step through the rows of one column before moving to the next column. The determinant is
+// not checked for zero.
+static void Inverse3x3(VValue* sp) {
+    // Snapshot all nine entries first, since the results are written back into the same slots.
+    F32 a11 = sp[-8].fFloat, a12 = sp[-5].fFloat, a13 = sp[-2].fFloat,
+        a21 = sp[-7].fFloat, a22 = sp[-4].fFloat, a23 = sp[-1].fFloat,
+        a31 = sp[-6].fFloat, a32 = sp[-3].fFloat, a33 = sp[ 0].fFloat;
+    // Reciprocal of the determinant (rule of Sarrus expansion).
+    F32 idet = F32(1) / (a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
+                         a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31);
+    // Each output entry is the matching entry of the adjugate scaled by 1/det.
+    sp[-8].fFloat = (a22 * a33 - a23 * a32) * idet;
+    sp[-7].fFloat = (a23 * a31 - a21 * a33) * idet;
+    sp[-6].fFloat = (a21 * a32 - a22 * a31) * idet;
+    sp[-5].fFloat = (a13 * a32 - a12 * a33) * idet;
+    sp[-4].fFloat = (a11 * a33 - a13 * a31) * idet;
+    sp[-3].fFloat = (a12 * a31 - a11 * a32) * idet;
+    sp[-2].fFloat = (a12 * a23 - a13 * a22) * idet;
+    sp[-1].fFloat = (a13 * a21 - a11 * a23) * idet;
+    sp[ 0].fFloat = (a11 * a22 - a12 * a21) * idet;
+}
+
+// Replaces the 4x4 matrix held in the top sixteen stack slots (sp[-15] .. sp[0]) with its
+// inverse, computed independently for every lane. The determinant is not checked for zero.
+static void Inverse4x4(VValue* sp) {
+    // Snapshot all sixteen entries; aIJ is the slot at offset -15 + 4*I + J from sp.
+    F32 a00 = spf(-15), a10 = spf(-11), a20 = spf( -7), a30 = spf( -3),
+        a01 = spf(-14), a11 = spf(-10), a21 = spf( -6), a31 = spf( -2),
+        a02 = spf(-13), a12 = spf( -9), a22 = spf( -5), a32 = spf( -1),
+        a03 = spf(-12), a13 = spf( -8), a23 = spf( -4), a33 = spf(  0);
+
+    // 2x2 sub-determinants: b00..b05 pair entries from the a0* and a1* groups, b06..b11 pair
+    // entries from the a2* and a3* groups. They are shared by the determinant and every cofactor.
+    F32 b00 = a00 * a11 - a01 * a10,
+        b01 = a00 * a12 - a02 * a10,
+        b02 = a00 * a13 - a03 * a10,
+        b03 = a01 * a12 - a02 * a11,
+        b04 = a01 * a13 - a03 * a11,
+        b05 = a02 * a13 - a03 * a12,
+        b06 = a20 * a31 - a21 * a30,
+        b07 = a20 * a32 - a22 * a30,
+        b08 = a20 * a33 - a23 * a30,
+        b09 = a21 * a32 - a22 * a31,
+        b10 = a21 * a33 - a23 * a31,
+        b11 = a22 * a33 - a23 * a32;
+
+    // Reciprocal of the full determinant, assembled from the sub-determinants.
+    F32 idet = F32(1) /
+               (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
+
+    // Fold 1/det into the sub-determinants once, so the sixteen results below need no
+    // further scaling.
+    b00 *= idet;
+    b01 *= idet;
+    b02 *= idet;
+    b03 *= idet;
+    b04 *= idet;
+    b05 *= idet;
+    b06 *= idet;
+    b07 *= idet;
+    b08 *= idet;
+    b09 *= idet;
+    b10 *= idet;
+    b11 *= idet;
+
+    // Write the inverse back into the same slots, in slot order.
+    spf(-15) = a11 * b11 - a12 * b10 + a13 * b09;
+    spf(-14) = a02 * b10 - a01 * b11 - a03 * b09;
+    spf(-13) = a31 * b05 - a32 * b04 + a33 * b03;
+    spf(-12) = a22 * b04 - a21 * b05 - a23 * b03;
+    spf(-11) = a12 * b08 - a10 * b11 - a13 * b07;
+    spf(-10) = a00 * b11 - a02 * b08 + a03 * b07;
+    spf( -9) = a32 * b02 - a30 * b05 - a33 * b01;
+    spf( -8) = a20 * b05 - a22 * b02 + a23 * b01;
+    spf( -7) = a10 * b10 - a11 * b08 + a13 * b06;
+    spf( -6) = a01 * b08 - a00 * b10 - a03 * b06;
+    spf( -5) = a30 * b04 - a31 * b02 + a33 * b00;
+    spf( -4) = a21 * b02 - a20 * b04 - a23 * b00;
+    spf( -3) = a11 * b07 - a10 * b09 - a12 * b06;
+    spf( -2) = a00 * b09 - a01 * b07 + a02 * b06;
+    spf( -1) = a31 * b01 - a30 * b03 - a32 * b00;
+    spf(  0) = a20 * b03 - a21 * b01 + a22 * b00;
+}
+
+static bool InnerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack,
+                     float* outReturn[], VValue globals[], const float uniforms[],
+                     bool stripedOutput, int N, int baseIndex) {
+#ifdef SKSLC_THREADED_CODE
+    static const void* labels[] = {
+        // If you aren't familiar with it, the &&label syntax is the GCC / Clang "labels as values"
+        // extension. If you add anything to this array, be sure to add the corresponding
+        // CHECK_LABEL() or CHECK_*_LABELS() assert below.
+        VECTOR_MATRIX_LABELS(kAddF),
+        VECTOR_LABELS(kAddI),
+        &&kAndB,
+        &&kBranch,
+        &&kCall,
+        &&kCallExternal,
+        &&kClampIndex,
+        VECTOR_LABELS(kCompareIEQ),
+        VECTOR_LABELS(kCompareINEQ),
+        VECTOR_MATRIX_LABELS(kCompareFEQ),
+        VECTOR_MATRIX_LABELS(kCompareFNEQ),
+        VECTOR_LABELS(kCompareFGT),
+        VECTOR_LABELS(kCompareFGTEQ),
+        VECTOR_LABELS(kCompareFLT),
+        VECTOR_LABELS(kCompareFLTEQ),
+        VECTOR_LABELS(kCompareSGT),
+        VECTOR_LABELS(kCompareSGTEQ),
+        VECTOR_LABELS(kCompareSLT),
+        VECTOR_LABELS(kCompareSLTEQ),
+        VECTOR_LABELS(kCompareUGT),
+        VECTOR_LABELS(kCompareUGTEQ),
+        VECTOR_LABELS(kCompareULT),
+        VECTOR_LABELS(kCompareULTEQ),
+        VECTOR_LABELS(kConvertFtoI),
+        VECTOR_LABELS(kConvertStoF),
+        VECTOR_LABELS(kConvertUtoF),
+        VECTOR_LABELS(kCos),
+        VECTOR_MATRIX_LABELS(kDivideF),
+        VECTOR_LABELS(kDivideS),
+        VECTOR_LABELS(kDivideU),
+        VECTOR_MATRIX_LABELS(kDup),
+        &&kInverse2x2,
+        &&kInverse3x3,
+        &&kInverse4x4,
+        VECTOR_LABELS(kLoad),
+        VECTOR_LABELS(kLoadGlobal),
+        VECTOR_LABELS(kLoadUniform),
+        &&kLoadSwizzle,
+        &&kLoadSwizzleGlobal,
+        &&kLoadSwizzleUniform,
+        &&kLoadExtended,
+        &&kLoadExtendedGlobal,
+        &&kLoadExtendedUniform,
+        &&kMatrixToMatrix,
+        &&kMatrixMultiply,
+        VECTOR_MATRIX_LABELS(kNegateF),
+        VECTOR_LABELS(kNegateI),
+        VECTOR_MATRIX_LABELS(kMultiplyF),
+        VECTOR_LABELS(kMultiplyI),
+        &&kNotB,
+        &&kOrB,
+        VECTOR_MATRIX_LABELS(kPop),
+        &&kPushImmediate,
+        VECTOR_LABELS(kReadExternal),
+        VECTOR_LABELS(kRemainderF),
+        VECTOR_LABELS(kRemainderS),
+        VECTOR_LABELS(kRemainderU),
+        &&kReserve,
+        &&kReturn,
+        &&kScalarToMatrix,
+        &&kShiftLeft,
+        &&kShiftRightS,
+        &&kShiftRightU,
+        VECTOR_LABELS(kSin),
+        VECTOR_LABELS(kSqrt),
+        VECTOR_LABELS(kStore),
+        VECTOR_LABELS(kStoreGlobal),
+        &&kStoreExtended,
+        &&kStoreExtendedGlobal,
+        &&kStoreSwizzle,
+        &&kStoreSwizzleGlobal,
+        &&kStoreSwizzleIndirect,
+        &&kStoreSwizzleIndirectGlobal,
+        &&kSwizzle,
+        VECTOR_MATRIX_LABELS(kSubtractF),
+        VECTOR_LABELS(kSubtractI),
+        VECTOR_LABELS(kTan),
+        VECTOR_LABELS(kWriteExternal),
+        &&kXorB,
+
+        &&kMaskPush,
+        &&kMaskPop,
+        &&kMaskNegate,
+        &&kMaskBlend,
+        &&kBranchIfAllFalse,
+
+        &&kLoopBegin,
+        &&kLoopNext,
+        &&kLoopMask,
+        &&kLoopEnd,
+        &&kLoopBreak,
+        &&kLoopContinue,
+    };
+    // Verify that the order of the labels array matches the order of the ByteCodeInstruction enum.
+    CHECK_VECTOR_MATRIX_LABELS(kAddF);
+    CHECK_VECTOR_LABELS(kAddI);
+    CHECK_LABEL(kAndB);
+    CHECK_LABEL(kBranch);
+    CHECK_LABEL(kCall);
+    CHECK_LABEL(kCallExternal);
+    CHECK_LABEL(kClampIndex);
+    CHECK_VECTOR_LABELS(kCompareIEQ);
+    CHECK_VECTOR_LABELS(kCompareINEQ);
+    CHECK_VECTOR_MATRIX_LABELS(kCompareFEQ);
+    CHECK_VECTOR_MATRIX_LABELS(kCompareFNEQ);
+    CHECK_VECTOR_LABELS(kCompareFGT);
+    CHECK_VECTOR_LABELS(kCompareFGTEQ);
+    CHECK_VECTOR_LABELS(kCompareFLT);
+    CHECK_VECTOR_LABELS(kCompareFLTEQ);
+    CHECK_VECTOR_LABELS(kCompareSGT);
+    CHECK_VECTOR_LABELS(kCompareSGTEQ);
+    CHECK_VECTOR_LABELS(kCompareSLT);
+    CHECK_VECTOR_LABELS(kCompareSLTEQ);
+    CHECK_VECTOR_LABELS(kCompareUGT);
+    CHECK_VECTOR_LABELS(kCompareUGTEQ);
+    CHECK_VECTOR_LABELS(kCompareULT);
+    CHECK_VECTOR_LABELS(kCompareULTEQ);
+    CHECK_VECTOR_LABELS(kConvertFtoI);
+    CHECK_VECTOR_LABELS(kConvertStoF);
+    CHECK_VECTOR_LABELS(kConvertUtoF);
+    CHECK_VECTOR_LABELS(kCos);
+    CHECK_VECTOR_MATRIX_LABELS(kDivideF);
+    CHECK_VECTOR_LABELS(kDivideS);
+    CHECK_VECTOR_LABELS(kDivideU);
+    CHECK_VECTOR_MATRIX_LABELS(kDup);
+    CHECK_LABEL(kInverse2x2);
+    CHECK_LABEL(kInverse3x3);
+    CHECK_LABEL(kInverse4x4);
+    CHECK_VECTOR_LABELS(kLoad);
+    CHECK_VECTOR_LABELS(kLoadGlobal);
+    CHECK_VECTOR_LABELS(kLoadUniform);
+    CHECK_LABEL(kLoadSwizzle);
+    CHECK_LABEL(kLoadSwizzleGlobal);
+    CHECK_LABEL(kLoadSwizzleUniform);
+    CHECK_LABEL(kLoadExtended);
+    CHECK_LABEL(kLoadExtendedGlobal);
+    CHECK_LABEL(kLoadExtendedUniform);
+    CHECK_LABEL(kMatrixToMatrix);
+    CHECK_LABEL(kMatrixMultiply);
+    CHECK_VECTOR_MATRIX_LABELS(kNegateF);
+    CHECK_VECTOR_LABELS(kNegateI);
+    CHECK_VECTOR_MATRIX_LABELS(kMultiplyF);
+    CHECK_VECTOR_LABELS(kMultiplyI);
+    CHECK_LABEL(kNotB);
+    CHECK_LABEL(kOrB);
+    CHECK_VECTOR_MATRIX_LABELS(kPop);
+    CHECK_LABEL(kPushImmediate);
+    CHECK_VECTOR_LABELS(kReadExternal);
+    CHECK_VECTOR_LABELS(kRemainderF);
+    CHECK_VECTOR_LABELS(kRemainderS);
+    CHECK_VECTOR_LABELS(kRemainderU);
+    CHECK_LABEL(kReserve);
+    CHECK_LABEL(kReturn);
+    CHECK_LABEL(kScalarToMatrix);
+    CHECK_LABEL(kShiftLeft);
+    CHECK_LABEL(kShiftRightS);
+    CHECK_LABEL(kShiftRightU);
+    CHECK_VECTOR_LABELS(kSin);
+    CHECK_VECTOR_LABELS(kSqrt);
+    CHECK_VECTOR_LABELS(kStore);
+    CHECK_VECTOR_LABELS(kStoreGlobal);
+    CHECK_LABEL(kStoreExtended);
+    CHECK_LABEL(kStoreExtendedGlobal);
+    CHECK_LABEL(kStoreSwizzle);
+    CHECK_LABEL(kStoreSwizzleGlobal);
+    CHECK_LABEL(kStoreSwizzleIndirect);
+    CHECK_LABEL(kStoreSwizzleIndirectGlobal);
+    CHECK_LABEL(kSwizzle);
+    CHECK_VECTOR_MATRIX_LABELS(kSubtractF);
+    CHECK_VECTOR_LABELS(kSubtractI);
+    CHECK_VECTOR_LABELS(kTan);
+    CHECK_VECTOR_LABELS(kWriteExternal);
+    CHECK_LABEL(kXorB);
+    CHECK_LABEL(kMaskPush);
+    CHECK_LABEL(kMaskPop);
+    CHECK_LABEL(kMaskNegate);
+    CHECK_LABEL(kMaskBlend);
+    CHECK_LABEL(kBranchIfAllFalse);
+    CHECK_LABEL(kLoopBegin);
+    CHECK_LABEL(kLoopNext);
+    CHECK_LABEL(kLoopMask);
+    CHECK_LABEL(kLoopEnd);
+    CHECK_LABEL(kLoopBreak);
+    CHECK_LABEL(kLoopContinue);
+    f->fPreprocessOnce([f] { ((ByteCodeFunction*)f)->preprocess(labels); });
+#endif
+
+    // Needs to be the first N non-negative integers, at least as large as VecWidth
+    static const Interpreter::I32 gLanes = {
+        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+    };
+
+    VValue* sp = stack + f->fParameterCount + f->fLocalCount - 1;
+
+    #define POP() (*(sp--))
+    #define PUSH(v) (sp[1] = v, ++sp)
+
+    const uint8_t* code = f->fCode.data();
+    const uint8_t* ip = code;
+    std::vector<StackFrame> frames;
+
+    I32 condStack[16];  // Independent condition masks
+    I32 maskStack[16];  // Combined masks (eg maskStack[0] & maskStack[1] & ...)
+    I32 contStack[16];  // Continue flags for loops
+    I32 loopStack[16];  // Loop execution masks
+    condStack[0] = maskStack[0] = (gLanes < N);
+    contStack[0] = I32( 0);
+    loopStack[0] = I32(~0);
+    I32* condPtr = condStack;
+    I32* maskPtr = maskStack;
+    I32* contPtr = contStack;
+    I32* loopPtr = loopStack;
+
+    if (f->fConditionCount + 1 > (int)SK_ARRAY_COUNT(condStack) ||
+        f->fLoopCount + 1 > (int)SK_ARRAY_COUNT(loopStack)) {
+        return false;
+    }
+
+    auto mask = [&]() { return *maskPtr & *loopPtr; };
+
+#ifdef SKSLC_THREADED_CODE
+    // If the "labels as values" extension is available, we implement this using threaded code.
+    // Instead of opcodes, the code directly contains the addresses of the labels to jump to. Then
+    // the code for each opcode simply grabs the address of the next opcode and uses a goto to jump
+    // there.
+    NEXT();
+#else
+    // Otherwise, we have to use a switch statement and a loop to execute the right label.
+    for (;;) {
+        #ifdef TRACE
+            printf("at %3d ", (int) (ip - code));
+            disassemble_instruction(ip);
+            printf(" (stack: %d)\n", (int) (sp - stack) + 1);
+        #endif
+        switch ((ByteCodeInstruction) READ16()) {
+#endif
+
+    VECTOR_MATRIX_BINARY_OP(kAddF, fFloat, +)
+    VECTOR_BINARY_OP(kAddI, fSigned, +)
+
+    // Booleans are integer masks: 0/~0 for false/true. So bitwise ops do what we want:
+    LABEL(kAndB)
+        sp[-1] = sp[-1].fSigned & sp[0].fSigned;
+        POP();
+        NEXT();
+    LABEL(kNotB)
+        sp[0] = ~sp[0].fSigned;
+        NEXT();
+    LABEL(kOrB)
+        sp[-1] = sp[-1].fSigned | sp[0].fSigned;
+        POP();
+        NEXT();
+    LABEL(kXorB)
+        sp[-1] = sp[-1].fSigned ^ sp[0].fSigned;
+        POP();
+        NEXT();
+
+    LABEL(kBranch)
+        ip = code + READ16();
+        NEXT();
+
+    LABEL(kCall) {
+        // Precursor code reserved space for the return value, and pushed all parameters to
+        // the stack. Update our bottom of stack to point at the first parameter, and our
+        // sp to point past those parameters (plus space for locals).
+        int target = READ8();
+        const ByteCodeFunction* fun = byteCode->fFunctions[target].get();
+#ifdef SKSLC_THREADED_CODE
+        fun->fPreprocessOnce([fun] { ((ByteCodeFunction*)fun)->preprocess(labels); });
+#endif
+        if (skvx::any(mask())) {
+            frames.push_back({ code, ip, stack, fun->fParameterCount });
+            ip = code = fun->fCode.data();
+            stack = sp - fun->fParameterCount + 1;
+            sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
+        }
+        NEXT();
+    }
+
+    LABEL(kCallExternal) {
+        CallExternal(byteCode, ip, sp, baseIndex, mask());
+        NEXT();
+    }
+
+    LABEL(kClampIndex) {
+        int length = READ8();
+        if (skvx::any(mask() & ((sp[0].fSigned < 0) | (sp[0].fSigned >= length)))) {
+            return false;
+        }
+        NEXT();
+    }
+
+    VECTOR_BINARY_OP(kCompareIEQ, fSigned, ==)
+    VECTOR_MATRIX_BINARY_OP(kCompareFEQ, fFloat, ==)
+    VECTOR_BINARY_OP(kCompareINEQ, fSigned, !=)
+    VECTOR_MATRIX_BINARY_OP(kCompareFNEQ, fFloat, !=)
+    VECTOR_BINARY_OP(kCompareSGT, fSigned, >)
+    VECTOR_BINARY_OP(kCompareUGT, fUnsigned, >)
+    VECTOR_BINARY_OP(kCompareFGT, fFloat, >)
+    VECTOR_BINARY_OP(kCompareSGTEQ, fSigned, >=)
+    VECTOR_BINARY_OP(kCompareUGTEQ, fUnsigned, >=)
+    VECTOR_BINARY_OP(kCompareFGTEQ, fFloat, >=)
+    VECTOR_BINARY_OP(kCompareSLT, fSigned, <)
+    VECTOR_BINARY_OP(kCompareULT, fUnsigned, <)
+    VECTOR_BINARY_OP(kCompareFLT, fFloat, <)
+    VECTOR_BINARY_OP(kCompareSLTEQ, fSigned, <=)
+    VECTOR_BINARY_OP(kCompareULTEQ, fUnsigned, <=)
+    VECTOR_BINARY_OP(kCompareFLTEQ, fFloat, <=)
+
+    LABEL(kConvertFtoI4) sp[-3] = skvx::cast<int>(sp[-3].fFloat);
+    LABEL(kConvertFtoI3) sp[-2] = skvx::cast<int>(sp[-2].fFloat);
+    LABEL(kConvertFtoI2) sp[-1] = skvx::cast<int>(sp[-1].fFloat);
+    LABEL(kConvertFtoI)  sp[ 0] = skvx::cast<int>(sp[ 0].fFloat);
+                         NEXT();
+
+    LABEL(kConvertStoF4) sp[-3] = skvx::cast<float>(sp[-3].fSigned);
+    LABEL(kConvertStoF3) sp[-2] = skvx::cast<float>(sp[-2].fSigned);
+    LABEL(kConvertStoF2) sp[-1] = skvx::cast<float>(sp[-1].fSigned);
+    LABEL(kConvertStoF)  sp[ 0] = skvx::cast<float>(sp[ 0].fSigned);
+                         NEXT();
+
+    LABEL(kConvertUtoF4) sp[-3] = skvx::cast<float>(sp[-3].fUnsigned);
+    LABEL(kConvertUtoF3) sp[-2] = skvx::cast<float>(sp[-2].fUnsigned);
+    LABEL(kConvertUtoF2) sp[-1] = skvx::cast<float>(sp[-1].fUnsigned);
+    LABEL(kConvertUtoF)  sp[ 0] = skvx::cast<float>(sp[ 0].fUnsigned);
+                         NEXT();
+
+    VECTOR_UNARY_FN_VEC(kCos, cosf)
+
+    VECTOR_BINARY_MASKED_OP(kDivideS, fSigned, /)
+    VECTOR_BINARY_MASKED_OP(kDivideU, fUnsigned, /)
+    VECTOR_MATRIX_BINARY_OP(kDivideF, fFloat, /)
+
+    LABEL(kDup4) PUSH(sp[1 - ip[0]]);
+    LABEL(kDup3) PUSH(sp[1 - ip[0]]);
+    LABEL(kDup2) PUSH(sp[1 - ip[0]]);
+    LABEL(kDup)  PUSH(sp[1 - ip[0]]);
+                 ++ip;
+                 NEXT();
+
+    LABEL(kDupN) {
+        int count = READ8();
+        memcpy(sp + 1, sp - count + 1, count * sizeof(VValue));
+        sp += count;
+        NEXT();
+    }
+
+    LABEL(kInverse2x2) {
+        Inverse2x2(sp);
+        NEXT();
+    }
+    LABEL(kInverse3x3) {
+        Inverse3x3(sp);
+        NEXT();
+    }
+    LABEL(kInverse4x4) {
+        Inverse4x4(sp);
+        NEXT();
+    }
+
+    LABEL(kLoad4) sp[4] = stack[ip[1] + 3];
+    LABEL(kLoad3) sp[3] = stack[ip[1] + 2];
+    LABEL(kLoad2) sp[2] = stack[ip[1] + 1];
+    LABEL(kLoad)  sp[1] = stack[ip[1] + 0];
+                  sp += ip[0];
+                  ip += 2;
+                  NEXT();
+
+    LABEL(kLoadGlobal4) sp[4] = globals[ip[1] + 3];
+    LABEL(kLoadGlobal3) sp[3] = globals[ip[1] + 2];
+    LABEL(kLoadGlobal2) sp[2] = globals[ip[1] + 1];
+    LABEL(kLoadGlobal)  sp[1] = globals[ip[1] + 0];
+                        sp += ip[0];
+                        ip += 2;
+                        NEXT();
+
+    LABEL(kLoadUniform4) sp[4].fFloat = uniforms[ip[1] + 3];
+    LABEL(kLoadUniform3) sp[3].fFloat = uniforms[ip[1] + 2];
+    LABEL(kLoadUniform2) sp[2].fFloat = uniforms[ip[1] + 1];
+    LABEL(kLoadUniform)  sp[1].fFloat = uniforms[ip[1] + 0];
+                        sp += ip[0];
+                        ip += 2;
+                        NEXT();
+
+    LABEL(kLoadExtended) {
+        int count = READ8();
+        I32 src = POP().fSigned;
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    sp[i + 1].fSigned[j] = stack[src[j] + i].fSigned[j];
+                }
+            }
+        }
+        sp += count;
+        NEXT();
+    }
+
+    LABEL(kLoadExtendedGlobal) {
+        int count = READ8();
+        I32 src = POP().fSigned;
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    sp[i + 1].fSigned[j] = globals[src[j] + i].fSigned[j];
+                }
+            }
+        }
+        sp += count;
+        NEXT();
+    }
+
+    LABEL(kLoadExtendedUniform) {
+        int count = READ8();
+        I32 src = POP().fSigned;
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    sp[i + 1].fFloat[j] = uniforms[src[j] + i];
+                }
+            }
+        }
+        sp += count;
+        NEXT();
+    }
+
+    LABEL(kLoadSwizzle) {
+        int src = READ8();
+        int count = READ8();
+        for (int i = 0; i < count; ++i) {
+            PUSH(stack[src + *(ip + i)]);
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kLoadSwizzleGlobal) {
+        int src = READ8();
+        int count = READ8();
+        for (int i = 0; i < count; ++i) {
+            PUSH(globals[src + *(ip + i)]);
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kLoadSwizzleUniform) {
+        int src = READ8();
+        int count = READ8();
+        for (int i = 0; i < count; ++i) {
+            PUSH(F32(uniforms[src + *(ip + i)]));
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kMatrixToMatrix) {
+        int srcCols = READ8();
+        int srcRows = READ8();
+        int dstCols = READ8();
+        int dstRows = READ8();
+        SkASSERT(srcCols >= 2 && srcCols <= 4);
+        SkASSERT(srcRows >= 2 && srcRows <= 4);
+        SkASSERT(dstCols >= 2 && dstCols <= 4);
+        SkASSERT(dstRows >= 2 && dstRows <= 4);
+        F32 tmp[16];
+        memset(tmp, 0, sizeof(tmp));
+        tmp[0] = tmp[5] = tmp[10] = tmp[15] = F32(1.0f);
+        for (int c = srcCols - 1; c >= 0; --c) {
+            for (int r = srcRows - 1; r >= 0; --r) {
+                tmp[c*4 + r] = POP().fFloat;
+            }
+        }
+        for (int c = 0; c < dstCols; ++c) {
+            for (int r = 0; r < dstRows; ++r) {
+                PUSH(tmp[c*4 + r]);
+            }
+        }
+        NEXT();
+    }
+
+    LABEL(kMatrixMultiply) {
+        int lCols = READ8();
+        int lRows = READ8();
+        int rCols = READ8();
+        int rRows = lCols;
+        F32 tmp[16] = { 0.0f };
+        F32* B = &(sp - (rCols * rRows) + 1)->fFloat;
+        F32* A = B - (lCols * lRows);
+        for (int c = 0; c < rCols; ++c) {
+            for (int r = 0; r < lRows; ++r) {
+                for (int j = 0; j < lCols; ++j) {
+                    tmp[c*lRows + r] += A[j*lRows + r] * B[c*rRows + j];
+                }
+            }
+        }
+        sp -= (lCols * lRows) + (rCols * rRows);
+        memcpy(sp + 1, tmp, rCols * lRows * sizeof(VValue));
+        sp += (rCols * lRows);
+        NEXT();
+    }
+
+    VECTOR_BINARY_OP(kMultiplyI, fSigned, *)
+    VECTOR_MATRIX_BINARY_OP(kMultiplyF, fFloat, *)
+
+    LABEL(kNegateF4) sp[-3] = -sp[-3].fFloat;
+    LABEL(kNegateF3) sp[-2] = -sp[-2].fFloat;
+    LABEL(kNegateF2) sp[-1] = -sp[-1].fFloat;
+    LABEL(kNegateF)  sp[ 0] = -sp[ 0].fFloat;
+                     NEXT();
+
+    LABEL(kNegateFN) {
+        int count = READ8();
+        for (int i = count - 1; i >= 0; --i) {
+            sp[-i] = -sp[-i].fFloat;
+        }
+        NEXT();
+    }
+
+    LABEL(kNegateI4) sp[-3] = -sp[-3].fSigned;
+    LABEL(kNegateI3) sp[-2] = -sp[-2].fSigned;
+    LABEL(kNegateI2) sp[-1] = -sp[-1].fSigned;
+    LABEL(kNegateI)  sp[ 0] = -sp[ 0].fSigned;
+                     NEXT();
+
+    LABEL(kPop4) POP();
+    LABEL(kPop3) POP();
+    LABEL(kPop2) POP();
+    LABEL(kPop)  POP();
+                 NEXT();
+
+    LABEL(kPopN)
+        sp -= READ8();
+        NEXT();
+
+    LABEL(kPushImmediate)
+        PUSH(U32(READ32()));
+        NEXT();
+
+    LABEL(kReadExternal)
+    LABEL(kReadExternal2)
+    LABEL(kReadExternal3)
+    LABEL(kReadExternal4) {
+        int count = READ8();
+        int src = READ8();
+        float tmp[4];
+        I32 m = mask();
+        for (int i = 0; i < VecWidth; ++i) {
+            if (m[i]) {
+                byteCode->fExternalValues[src]->read(baseIndex + i, tmp);
+                for (int j = 0; j < count; ++j) {
+                    sp[j + 1].fFloat[i] = tmp[j];
+                }
+            }
+        }
+        sp += count;
+        NEXT();
+    }
+
+    VECTOR_BINARY_FN(kRemainderF, fFloat, VecMod)
+    VECTOR_BINARY_MASKED_OP(kRemainderS, fSigned, %)
+    VECTOR_BINARY_MASKED_OP(kRemainderU, fUnsigned, %)
+
+    LABEL(kReserve)
+        sp += READ8();
+        NEXT();
+
+    LABEL(kReturn) {
+        int count = READ8();
+        if (frames.empty()) {
+            if (outReturn) {
+                VValue* src = sp - count + 1;
+                if (stripedOutput) {
+                    for (int i = 0; i < count; ++i) {
+                        memcpy(outReturn[i], &src->fFloat, N * sizeof(float));
+                        ++src;
+                    }
+                } else {
+                    float* outPtr = outReturn[0];
+                    for (int i = 0; i < count; ++i) {
+                        for (int j = 0; j < N; ++j) {
+                            outPtr[count * j] = src->fFloat[j];
+                        }
+                        ++outPtr;
+                        ++src;
+                    }
+                }
+            }
+            return true;
+        } else {
+            // When we were called, the caller reserved stack space for their copy of our
+            // return value, then 'stack' was positioned after that, where our parameters
+            // were placed. Copy our return values to their reserved area.
+            memcpy(stack - count, sp - count + 1, count * sizeof(VValue));
+
+            // Now move the stack pointer to the end of the passed-in parameters. This odd
+            // calling convention requires the caller to pop the arguments after calling,
+            // but allows them to store any out-parameters back during that unwinding.
+            // After that sequence finishes, the return value will be the top of the stack.
+            const StackFrame& frame(frames.back());
+            sp = stack + frame.fParameterCount - 1;
+            stack = frame.fStack;
+            code = frame.fCode;
+            ip = frame.fIP;
+            frames.pop_back();
+            NEXT();
+        }
+    }
+
+    LABEL(kScalarToMatrix) {
+        int cols = READ8();
+        int rows = READ8();
+        VValue v = POP();
+        for (int c = 0; c < cols; ++c) {
+            for (int r = 0; r < rows; ++r) {
+                PUSH(c == r ? v : F32(0.0f));
+            }
+        }
+        NEXT();
+    }
+
+    LABEL(kShiftLeft)
+        sp[0] = sp[0].fSigned << READ8();
+        NEXT();
+    LABEL(kShiftRightS)
+        sp[0] = sp[0].fSigned >> READ8();
+        NEXT();
+    LABEL(kShiftRightU)
+        sp[0] = sp[0].fUnsigned >> READ8();
+        NEXT();
+
+    VECTOR_UNARY_FN_VEC(kSin, sinf)
+    VECTOR_UNARY_FN(kSqrt, skvx::sqrt, fFloat)
+
+    LABEL(kStore4)
+        stack[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+3].fFloat);
+    LABEL(kStore3)
+        stack[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+2].fFloat);
+    LABEL(kStore2)
+        stack[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+1].fFloat);
+    LABEL(kStore)
+        stack[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+0].fFloat);
+        ++ip;
+        NEXT();
+
+    LABEL(kStoreGlobal4)  // same fallthrough scheme as kStore4..kStore, targeting globals[]
+        globals[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+3].fFloat);
+    LABEL(kStoreGlobal3)
+        globals[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+2].fFloat);
+    LABEL(kStoreGlobal2)
+        globals[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+1].fFloat);
+    LABEL(kStoreGlobal)  // lanes not selected by mask() keep the global's previous value
+        globals[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+0].fFloat);
+        ++ip;  // consume the base-slot byte only once, after every component has used it
+        NEXT();
+
+    LABEL(kStoreExtended) {  // store 'count' slots to a per-lane stack slot taken from the stack
+        int count = READ8();
+        I32 target = POP().fSigned;  // destination base slot, one value per lane
+        VValue* src = sp - count + 1;  // first of the 'count' values still on the stack
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {  // only lanes selected by the mask are written
+                    stack[target[j] + i].fSigned[j] = src[i].fSigned[j];
+                }
+            }
+        }
+        sp -= count;  // pop the stored values
+        NEXT();
+    }
+    LABEL(kStoreExtendedGlobal) {  // as kStoreExtended, but the destination is globals[]
+        int count = READ8();
+        I32 target = POP().fSigned;  // destination base slot, one value per lane
+        VValue* src = sp - count + 1;  // first of the 'count' values still on the stack
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {  // only lanes selected by the mask are written
+                    globals[target[j] + i].fSigned[j] = src[i].fSigned[j];
+                }
+            }
+        }
+        sp -= count;  // pop the stored values
+        NEXT();
+    }
+
+    LABEL(kStoreSwizzle) {  // operands: base slot, component count, then one byte per component
+        int target = READ8();
+        int count = READ8();
+        for (int i = count - 1; i >= 0; --i) {  // last component is on top of the stack
+            stack[target + *(ip + i)] = skvx::if_then_else(
+                    mask(), POP().fFloat, stack[target + *(ip + i)].fFloat);
+        }
+        ip += count;  // skip the component bytes that were read in place above
+        NEXT();
+    }
+
+    LABEL(kStoreSwizzleGlobal) {  // as kStoreSwizzle, but the destination is globals[]
+        int target = READ8();
+        int count = READ8();
+        for (int i = count - 1; i >= 0; --i) {  // last component is on top of the stack
+            globals[target + *(ip + i)] = skvx::if_then_else(
+                    mask(), POP().fFloat, globals[target + *(ip + i)].fFloat);
+        }
+        ip += count;  // skip the component bytes that were read in place above
+        NEXT();
+    }
+
+    LABEL(kStoreSwizzleIndirect) {  // operands: component count, then one byte per component
+        int count = READ8();
+        I32 target = POP().fSigned;  // per-lane base slot, popped before the values
+        I32 m = mask();
+        for (int i = count - 1; i >= 0; --i) {  // last component is on top of the stack
+            I32 v = POP().fSigned;
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {  // only lanes selected by the mask are written
+                    stack[target[j] + *(ip + i)].fSigned[j] = v[j];
+                }
+            }
+        }
+        ip += count;  // skip the component bytes that were read in place above
+        NEXT();
+    }
+
+    LABEL(kStoreSwizzleIndirectGlobal) {  // as kStoreSwizzleIndirect, but writes globals[]
+        int count = READ8();
+        I32 target = POP().fSigned;  // per-lane base slot, popped before the values
+        I32 m = mask();
+        for (int i = count - 1; i >= 0; --i) {  // last component is on top of the stack
+            I32 v = POP().fSigned;
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {  // only lanes selected by the mask are written
+                    globals[target[j] + *(ip + i)].fSigned[j] = v[j];
+                }
+            }
+        }
+        ip += count;  // skip the component bytes that were read in place above
+        NEXT();
+    }
+
+    VECTOR_BINARY_OP(kSubtractI, fSigned, -)
+    VECTOR_MATRIX_BINARY_OP(kSubtractF, fFloat, -)
+
+    LABEL(kSwizzle) {  // operands: source vector size, swizzle count, then the component bytes
+        VValue tmp[4];  // holds the popped source vector; sized for at most 4 components
+        for (int i = READ8() - 1; i >= 0; --i) {  // first READ8() is the source vector size
+            tmp[i] = POP();
+        }
+        for (int i = READ8() - 1; i >= 0; --i) {  // second READ8() is the swizzle count
+            PUSH(tmp[READ8()]);  // each following byte selects a source component
+        }
+        NEXT();
+    }
+
+    VECTOR_UNARY_FN_VEC(kTan, tanf)
+
+    LABEL(kWriteExternal4)  // all four variants share one body; the count comes from the operand
+    LABEL(kWriteExternal3)
+    LABEL(kWriteExternal2)
+    LABEL(kWriteExternal) {
+        int count = READ8();
+        int target = READ8();  // index into byteCode->fExternalValues
+        float tmp[4];  // one lane's values, gathered before the write() call
+        I32 m = mask();
+        sp -= count;  // pop the values up front; they are still read below via sp[j + 1]
+        for (int i = 0; i < VecWidth; ++i) {
+            if (m[i]) {  // write() is invoked once per lane selected by the mask
+                for (int j = 0; j < count; ++j) {
+                    tmp[j] = sp[j + 1].fFloat[i];
+                }
+                byteCode->fExternalValues[target]->write(baseIndex + i, tmp);
+            }
+        }
+        NEXT();
+    }
+
+    LABEL(kMaskPush)  // push a condition and the mask narrowed by it
+        condPtr[1] = POP().fSigned;
+        maskPtr[1] = maskPtr[0] & condPtr[1];  // new mask = enclosing mask AND condition
+        ++condPtr; ++maskPtr;
+        NEXT();
+    LABEL(kMaskPop)  // drop the innermost condition/mask pair
+        --condPtr; --maskPtr;
+        NEXT();
+    LABEL(kMaskNegate)  // recompute the current mask from the inverted condition
+        maskPtr[0] = maskPtr[-1] & ~condPtr[0];  // enclosing mask AND NOT condition
+        NEXT();
+    LABEL(kMaskBlend) {  // merges two 'count'-slot values on the stack using the top condition
+        int count = READ8();
+        I32 m = condPtr[0];  // condition captured before the pair is popped
+        --condPtr; --maskPtr;
+        for (int i = 0; i < count; ++i) {  // each step blends one slot pair and pops one slot
+            sp[-count] = skvx::if_then_else(m, sp[-count].fFloat, sp[0].fFloat);
+            --sp;
+        }
+        NEXT();
+    }
+    LABEL(kBranchIfAllFalse) {  // operand: 16-bit target, an offset from 'code'
+        int target = READ16();
+        if (!skvx::any(mask())) {  // jump only when no lane of mask() is set
+            ip = code + target;
+        }
+        NEXT();
+    }
+
+    LABEL(kLoopBegin)  // push a new loop level
+        contPtr[1] = 0;  // no lanes have hit 'continue' yet
+        loopPtr[1] = loopPtr[0];  // the new level starts with the enclosing loop mask
+        ++contPtr; ++loopPtr;
+        NEXT();
+    LABEL(kLoopNext)  // fold the lanes recorded by kLoopContinue back into the loop mask
+        *loopPtr |= *contPtr;
+        *contPtr = 0;
+        NEXT();
+    LABEL(kLoopMask)  // narrow the loop mask by a value popped from the stack
+        *loopPtr &= POP().fSigned;
+        NEXT();
+    LABEL(kLoopEnd)  // pop the loop level pushed by kLoopBegin
+        --contPtr; --loopPtr;
+        NEXT();
+    LABEL(kLoopBreak)  // remove the lanes selected by mask() from the loop mask
+        *loopPtr &= ~mask();
+        NEXT();
+    LABEL(kLoopContinue) {  // as kLoopBreak, but the lanes are remembered for kLoopNext
+        I32 m = mask();  // sampled once so both updates see the same lanes
+        *contPtr |=  m;
+        *loopPtr &= ~m;
+        NEXT();
+    }
+#ifdef SKSLC_THREADED_CODE
+    #ifdef TRACE
+        next:
+            printf("at %3d (stack: %d) (disable threaded code for disassembly)\n",
+                   (int) (ip - code), (int) (sp - stack) + 1);
+            goto *READ_INST();
+    #endif
+#else
+        }
+    }
+#endif
+}
+
+}; // class Interpreter
+
+#endif // SK_ENABLE_SKSL_INTERPRETER
+
+#undef spf
+
+void ByteCodeFunction::disassemble() const {  // no-op unless the interpreter is compiled in
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+    const uint8_t* ip = fCode.data();
+    while (ip < fCode.data() + fCode.size()) {
+        printf("%d: ", (int)(ip - fCode.data()));  // byte offset of the instruction
+        ip = Interpreter::DisassembleInstruction(ip);  // presumably returns the next instruction
+        printf("\n");
+    }
+#endif
+}
+
+#define VECTOR_PREPROCESS(base)          \
+    case ByteCodeInstruction::base ## 4: \
+    case ByteCodeInstruction::base ## 3: \
+    case ByteCodeInstruction::base ## 2: \
+    case ByteCodeInstruction::base: READ8(); break;
+
+#define VECTOR_PREPROCESS_NO_COUNT(base) \
+    case ByteCodeInstruction::base ## 4: \
+    case ByteCodeInstruction::base ## 3: \
+    case ByteCodeInstruction::base ## 2: \
+    case ByteCodeInstruction::base: break;
+
+#define VECTOR_MATRIX_PREPROCESS(base) \
+    VECTOR_PREPROCESS(base)            \
+    case ByteCodeInstruction::base ## N: READ8(); break;
+
+#define VECTOR_MATRIX_PREPROCESS_NO_COUNT(base) \
+    VECTOR_PREPROCESS_NO_COUNT(base)            \
+    case ByteCodeInstruction::base ## N: READ8(); break;
+
+void ByteCodeFunction::preprocess(const void* labels[]) {  // rewrites fCode in place
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+#ifdef TRACE
+    this->disassemble();
+#endif
+    uint8_t* ip = fCode.data();
+    while (ip < fCode.data() + fCode.size()) {  // one iteration per instruction
+        ByteCodeInstruction inst = (ByteCodeInstruction) (intptr_t) READ_INST();
+        const void* label = labels[(int) inst];  // NOTE(review): inst is not range-checked here
+        memcpy(ip - sizeof(instruction), &label, sizeof(label));  // opcode -> label address
+        switch (inst) {  // the cases below only step ip past each instruction's operands
+            VECTOR_MATRIX_PREPROCESS(kAddF)
+            VECTOR_PREPROCESS(kAddI)
+            case ByteCodeInstruction::kAndB: break;
+            case ByteCodeInstruction::kBranch: READ16(); break;
+            case ByteCodeInstruction::kCall: READ8(); break;
+            case ByteCodeInstruction::kCallExternal: {
+                READ8();
+                READ8();
+                READ8();
+                break;
+            }
+            case ByteCodeInstruction::kClampIndex: READ8(); break;
+            VECTOR_PREPROCESS(kCompareIEQ)
+            VECTOR_PREPROCESS(kCompareINEQ)
+            VECTOR_MATRIX_PREPROCESS(kCompareFEQ)
+            VECTOR_MATRIX_PREPROCESS(kCompareFNEQ)
+            VECTOR_PREPROCESS(kCompareFGT)
+            VECTOR_PREPROCESS(kCompareFGTEQ)
+            VECTOR_PREPROCESS(kCompareFLT)
+            VECTOR_PREPROCESS(kCompareFLTEQ)
+            VECTOR_PREPROCESS(kCompareSGT)
+            VECTOR_PREPROCESS(kCompareSGTEQ)
+            VECTOR_PREPROCESS(kCompareSLT)
+            VECTOR_PREPROCESS(kCompareSLTEQ)
+            VECTOR_PREPROCESS(kCompareUGT)
+            VECTOR_PREPROCESS(kCompareUGTEQ)
+            VECTOR_PREPROCESS(kCompareULT)
+            VECTOR_PREPROCESS(kCompareULTEQ)
+            VECTOR_PREPROCESS_NO_COUNT(kConvertFtoI)
+            VECTOR_PREPROCESS_NO_COUNT(kConvertStoF)
+            VECTOR_PREPROCESS_NO_COUNT(kConvertUtoF)
+            VECTOR_PREPROCESS(kCos)
+            VECTOR_MATRIX_PREPROCESS(kDivideF)
+            VECTOR_PREPROCESS(kDivideS)
+            VECTOR_PREPROCESS(kDivideU)
+            VECTOR_MATRIX_PREPROCESS(kDup)
+
+            case ByteCodeInstruction::kInverse2x2:
+            case ByteCodeInstruction::kInverse3x3:
+            case ByteCodeInstruction::kInverse4x4: break;
+
+            case ByteCodeInstruction::kLoad:
+            case ByteCodeInstruction::kLoad2:
+            case ByteCodeInstruction::kLoad3:
+            case ByteCodeInstruction::kLoad4:
+            case ByteCodeInstruction::kLoadGlobal:
+            case ByteCodeInstruction::kLoadGlobal2:
+            case ByteCodeInstruction::kLoadGlobal3:
+            case ByteCodeInstruction::kLoadGlobal4:
+            case ByteCodeInstruction::kLoadUniform:
+            case ByteCodeInstruction::kLoadUniform2:
+            case ByteCodeInstruction::kLoadUniform3:
+            case ByteCodeInstruction::kLoadUniform4: READ16(); break;
+
+            case ByteCodeInstruction::kLoadSwizzle:
+            case ByteCodeInstruction::kLoadSwizzleGlobal:
+            case ByteCodeInstruction::kLoadSwizzleUniform: {
+                READ8();
+                int count = READ8();
+                ip += count;  // one byte per swizzle component
+                break;
+            }
+
+            case ByteCodeInstruction::kLoadExtended:
+            case ByteCodeInstruction::kLoadExtendedGlobal:
+            case ByteCodeInstruction::kLoadExtendedUniform:
+                READ8();
+                break;
+
+            case ByteCodeInstruction::kMatrixToMatrix: {
+                READ8();
+                READ8();
+                READ8();
+                READ8();
+                break;
+            }
+            case ByteCodeInstruction::kMatrixMultiply: {
+                READ8();
+                READ8();
+                READ8();
+                break;
+            }
+            VECTOR_MATRIX_PREPROCESS(kMultiplyF)
+            VECTOR_PREPROCESS(kMultiplyI)
+            VECTOR_MATRIX_PREPROCESS_NO_COUNT(kNegateF)
+            VECTOR_PREPROCESS_NO_COUNT(kNegateI)
+            case ByteCodeInstruction::kNotB: break;
+            case ByteCodeInstruction::kOrB: break;
+            VECTOR_MATRIX_PREPROCESS_NO_COUNT(kPop)
+            case ByteCodeInstruction::kPushImmediate: READ32(); break;
+
+            case ByteCodeInstruction::kReadExternal:
+            case ByteCodeInstruction::kReadExternal2:
+            case ByteCodeInstruction::kReadExternal3:
+            case ByteCodeInstruction::kReadExternal4: READ16(); break;
+
+            VECTOR_PREPROCESS(kRemainderF)
+            VECTOR_PREPROCESS(kRemainderS)
+            VECTOR_PREPROCESS(kRemainderU)
+            case ByteCodeInstruction::kReserve: READ8(); break;
+            case ByteCodeInstruction::kReturn: READ8(); break;
+            case ByteCodeInstruction::kScalarToMatrix: READ8(); READ8(); break;
+            case ByteCodeInstruction::kShiftLeft: READ8(); break;
+            case ByteCodeInstruction::kShiftRightS: READ8(); break;
+            case ByteCodeInstruction::kShiftRightU: READ8(); break;
+            VECTOR_PREPROCESS(kSin)
+            VECTOR_PREPROCESS_NO_COUNT(kSqrt)
+
+            case ByteCodeInstruction::kStore:
+            case ByteCodeInstruction::kStore2:
+            case ByteCodeInstruction::kStore3:
+            case ByteCodeInstruction::kStore4:
+            case ByteCodeInstruction::kStoreGlobal:
+            case ByteCodeInstruction::kStoreGlobal2:
+            case ByteCodeInstruction::kStoreGlobal3:
+            case ByteCodeInstruction::kStoreGlobal4: READ8(); break;
+
+            case ByteCodeInstruction::kStoreSwizzle:
+            case ByteCodeInstruction::kStoreSwizzleGlobal: {
+                READ8();
+                int count = READ8();
+                ip += count;  // one byte per swizzle component
+                break;
+            }
+
+            case ByteCodeInstruction::kStoreSwizzleIndirect:
+            case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
+                int count = READ8();
+                ip += count;  // one byte per swizzle component
+                break;
+            }
+
+            case ByteCodeInstruction::kStoreExtended: READ8(); break;
+            case ByteCodeInstruction::kStoreExtendedGlobal: READ8(); break;
+
+            VECTOR_MATRIX_PREPROCESS(kSubtractF)
+            VECTOR_PREPROCESS(kSubtractI)
+
+            case ByteCodeInstruction::kSwizzle: {
+                READ8();
+                int count = READ8();
+                ip += count;  // one byte per swizzle component
+                break;
+            }
+            VECTOR_PREPROCESS(kTan)
+            case ByteCodeInstruction::kWriteExternal:
+            case ByteCodeInstruction::kWriteExternal2:
+            case ByteCodeInstruction::kWriteExternal3:
+            case ByteCodeInstruction::kWriteExternal4: READ16(); break;
+
+            case ByteCodeInstruction::kXorB: break;
+            case ByteCodeInstruction::kMaskPush: break;
+            case ByteCodeInstruction::kMaskPop: break;
+            case ByteCodeInstruction::kMaskNegate: break;
+            case ByteCodeInstruction::kMaskBlend: READ8(); break;
+            case ByteCodeInstruction::kBranchIfAllFalse: READ16(); break;
+            case ByteCodeInstruction::kLoopBegin: break;
+            case ByteCodeInstruction::kLoopNext: break;
+            case ByteCodeInstruction::kLoopMask: break;
+            case ByteCodeInstruction::kLoopEnd: break;
+            case ByteCodeInstruction::kLoopContinue:  break;
+            case ByteCodeInstruction::kLoopBreak: break;
+            default:
+                // The opcode bytes now hold the label written above, so report 'inst' itself.
+                printf("unknown(%d)\n", (int) inst);
+                SkASSERT(false);
+        }
+    }
+#endif
+}
+
+bool ByteCode::run(const ByteCodeFunction* f,
+                   float* args, int argCount,
+                   float* outReturn, int returnCount,
+                   const float* uniforms, int uniformCount) const {
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+    Interpreter::VValue stack[128];  // fixed capacity; functions needing more are rejected below
+    int stackNeeded = f->fParameterCount + f->fLocalCount + f->fStackCount;
+    if (stackNeeded > (int)SK_ARRAY_COUNT(stack)) {
+        return false;
+    }
+
+    if (argCount != f->fParameterCount ||  // every caller-supplied count must match exactly
+        returnCount != f->fReturnCount ||
+        uniformCount != fUniformSlotCount) {
+        return false;
+    }
+
+    Interpreter::VValue globals[32];  // fixed capacity; checked against fGlobalSlotCount below
+    if (fGlobalSlotCount > (int)SK_ARRAY_COUNT(globals)) {
+        return false;
+    }
+
+    // Transpose args into stack: arg i goes to the first float of slot i (stride VecWidth floats)
+    {
+        float* src = args;
+        float* dst = (float*)stack;
+        for (int i = 0; i < argCount; ++i) {
+            *dst = *src++;
+            dst += VecWidth;
+        }
+    }
+
+    bool stripedOutput = false;
+    float** outArray = outReturn ? &outReturn : nullptr;  // InnerRun takes a float**, or null
+    if (!Interpreter::InnerRun(this, f, stack, outArray, globals, uniforms, stripedOutput, 1, 0)) {
+        return false;
+    }
+
+    // Transpose out parameters back: copy the first float of each out-parameter slot into args
+    {
+        float* dst = args;
+        float* src = (float*)stack;
+        for (const auto& p : f->fParameters) {
+            if (p.fIsOutParameter) {
+                for (int i = p.fSlotCount; i > 0; --i) {
+                    *dst++ = *src;
+                    src += VecWidth;
+                }
+            } else {  // plain 'in' parameter: skip its slots in both arrays
+                dst += p.fSlotCount;
+                src += p.fSlotCount * VecWidth;
+            }
+        }
+    }
+
+    return true;
+#else
+    SkDEBUGFAIL("ByteCode interpreter not enabled");
+    return false;
+#endif
+}
+
+bool ByteCode::runStriped(const ByteCodeFunction* f, int N,
+                          float* args[], int argCount,
+                          float* outReturn[], int returnCount,
+                          const float* uniforms, int uniformCount) const {
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+    Interpreter::VValue stack[128];  // fixed capacity; functions needing more are rejected below
+    int stackNeeded = f->fParameterCount + f->fLocalCount + f->fStackCount;
+    if (stackNeeded > (int)SK_ARRAY_COUNT(stack)) {
+        return false;
+    }
+
+    if (argCount != f->fParameterCount ||  // every caller-supplied count must match exactly
+        returnCount != f->fReturnCount ||
+        uniformCount != fUniformSlotCount) {
+        return false;
+    }
+
+    Interpreter::VValue globals[32];  // fixed capacity; checked against fGlobalSlotCount below
+    if (fGlobalSlotCount > (int)SK_ARRAY_COUNT(globals)) {
+        return false;
+    }
+
+    // innerRun just takes outArgs, so clear it if the count is zero
+    if (returnCount == 0) {
+        outReturn = nullptr;
+    }
+
+    int baseIndex = 0;  // index of the first invocation in the current batch
+
+    while (N > 0) {  // '> 0' so a negative N cannot become a negative memcpy length below
+        int w = std::min(N, VecWidth);  // invocations handled by this batch
+
+        // Copy args into stack
+        for (int i = 0; i < argCount; ++i) {
+            memcpy((void*)(stack + i), args[i], w * sizeof(float));
+        }
+
+        bool stripedOutput = true;
+        if (!Interpreter::InnerRun(this, f, stack, outReturn, globals, uniforms, stripedOutput, w,
+                                   baseIndex)) {
+            return false;
+        }
+
+        // Copy out parameters back
+        int slot = 0;  // first stack slot of the parameter being visited
+        for (const auto& p : f->fParameters) {
+            if (p.fIsOutParameter) {
+                for (int i = slot; i < slot + p.fSlotCount; ++i) {
+                    memcpy(args[i], stack + i, w * sizeof(float));
+                }
+            }
+            slot += p.fSlotCount;
+        }
+
+        // Step each argument pointer ahead (this modifies the caller's args[] array)
+        for (int i = 0; i < argCount; ++i) {
+            args[i] += w;
+        }
+        N -= w;
+        baseIndex += w;
+    }
+
+    return true;
+#else
+    SkDEBUGFAIL("ByteCode interpreter not enabled");
+    return false;
+#endif
+}
+
+} // namespace SkSL
+
+#endif
diff --git a/src/sksl/SkSLByteCode.h b/src/sksl/SkSLByteCode.h
index 5b8a970..f917eec 100644
--- a/src/sksl/SkSLByteCode.h
+++ b/src/sksl/SkSLByteCode.h
@@ -9,59 +9,206 @@
 #define SKSL_BYTECODE
 
 #include "include/private/SkOnce.h"
-#include "include/private/SkVx.h"
 #include "src/sksl/SkSLString.h"
-#include "src/sksl/ir/SkSLFunctionDeclaration.h"
 
 #include <memory>
 #include <vector>
 
 namespace SkSL {
 
-class ByteCode;
-class ExternalValue;
+class  ExternalValue;
+struct FunctionDeclaration;
 
-class SK_API ByteCodeFunction {
+// GCC and Clang support the "labels as values" extension which we need to implement the interpreter
+// using threaded code. Otherwise, we fall back to using a switch statement in a for loop.
+#if defined(__GNUC__) || defined(__clang__)
+    #define SKSLC_THREADED_CODE
+    using instruction = void*;
+#else
+    using instruction = uint16_t;
+#endif
+
+#define VECTOR(name) name ## 4, name ## 3, name ## 2, name
+#define VECTOR_MATRIX(name) name ## 4, name ## 3, name ## 2, name, name ## N
+
+enum class ByteCodeInstruction : uint16_t {
+    // B = bool, F = float, I = int, S = signed, U = unsigned
+    // All binary VECTOR instructions (kAddF, kSubtractI, kCompareIEQ, etc.) are followed by a byte
+    // indicating the count, even though it is redundant due to the count appearing in the opcode.
+    // This is because the original opcodes are lost after we preprocess it into threaded code, and
+    // we need to still be able to access the count so as to permit the implementation to use opcode
+    // fallthrough.
+    VECTOR_MATRIX(kAddF),
+    VECTOR(kAddI),
+    kAndB,
+    kBranch,
+    // Followed by a byte indicating the index of the function to call
+    kCall,
+    // Followed by three bytes indicating: the number of argument slots, the number of return slots,
+    // and the index of the external value to call
+    kCallExternal,
+    // For dynamic array access: Followed by byte indicating length of array
+    kClampIndex,
+    VECTOR(kCompareIEQ),
+    VECTOR(kCompareINEQ),
+    VECTOR_MATRIX(kCompareFEQ),
+    VECTOR_MATRIX(kCompareFNEQ),
+    VECTOR(kCompareFGT),
+    VECTOR(kCompareFGTEQ),
+    VECTOR(kCompareFLT),
+    VECTOR(kCompareFLTEQ),
+    VECTOR(kCompareSGT),
+    VECTOR(kCompareSGTEQ),
+    VECTOR(kCompareSLT),
+    VECTOR(kCompareSLTEQ),
+    VECTOR(kCompareUGT),
+    VECTOR(kCompareUGTEQ),
+    VECTOR(kCompareULT),
+    VECTOR(kCompareULTEQ),
+    VECTOR(kConvertFtoI),
+    VECTOR(kConvertStoF),
+    VECTOR(kConvertUtoF),
+    // Followed by a (redundant) byte indicating the count
+    VECTOR(kCos),
+    VECTOR_MATRIX(kDivideF),
+    VECTOR(kDivideS),
+    VECTOR(kDivideU),
+    // Duplicates the top stack value. Followed by a (redundant) byte indicating the count.
+    VECTOR_MATRIX(kDup),
+    kInverse2x2,
+    kInverse3x3,
+    kInverse4x4,
+    // kLoad/kLoadGlobal are followed by a byte indicating the count, and a byte indicating the
+    // local/global slot to load
+    VECTOR(kLoad),
+    VECTOR(kLoadGlobal),
+    VECTOR(kLoadUniform),
+    // As kLoad/kLoadGlobal, then a count byte (1-4), and then one byte per swizzle component (0-3).
+    kLoadSwizzle,
+    kLoadSwizzleGlobal,
+    kLoadSwizzleUniform,
+    // kLoadExtended* are fallback load ops when we lack a specialization. They are followed by a
+    // count byte, and get the slot to load from the top of the stack.
+    kLoadExtended,
+    kLoadExtendedGlobal,
+    kLoadExtendedUniform,
+    // Followed by four bytes: srcCols, srcRows, dstCols, dstRows. Consumes the src matrix from the
+    // stack, and replaces it with the dst matrix. Per GLSL rules, there are no restrictions on
+    // dimensions. Any overlapping values are copied, and any other values are filled in with the
+    // identity matrix.
+    kMatrixToMatrix,
+    // Followed by three bytes: leftCols (== rightRows), leftRows, rightCols
+    kMatrixMultiply,
+    VECTOR_MATRIX(kNegateF),
+    VECTOR(kNegateI),
+    VECTOR_MATRIX(kMultiplyF),
+    VECTOR(kMultiplyI),
+    kNotB,
+    kOrB,
+    VECTOR_MATRIX(kPop),
+    // Followed by a 32 bit value containing the value to push
+    kPushImmediate,
+    // Followed by two bytes (presumably, as for kWriteExternal: count, then external value to read)
+    VECTOR(kReadExternal),
+    VECTOR(kRemainderF),
+    VECTOR(kRemainderS),
+    VECTOR(kRemainderU),
+    // Followed by a byte indicating the number of slots to reserve on the stack (for later return)
+    kReserve,
+    // Followed by a byte indicating the number of slots being returned
+    kReturn,
+    // Followed by two bytes indicating columns and rows of matrix (2, 3, or 4 each).
+    // Takes a single value from the top of the stack, and converts to a CxR matrix with that value
+    // replicated along the diagonal (and zero elsewhere), per the GLSL matrix construction rules.
+    kScalarToMatrix,
+    // Followed by a byte indicating the number of bits to shift
+    kShiftLeft,
+    kShiftRightS,
+    kShiftRightU,
+    // Followed by a (redundant) byte indicating the count
+    VECTOR(kSin),
+    VECTOR(kSqrt),
+    // kStore/kStoreGlobal are followed by a byte indicating the local/global slot to store
+    VECTOR(kStore),
+    VECTOR(kStoreGlobal),
+    // Fallback stores. Followed by count byte, and get the slot to store from the top of the stack
+    kStoreExtended,
+    kStoreExtendedGlobal,
+    // As kStore/kStoreGlobal, then a count byte (1-4), then one byte per swizzle component (0-3).
+    // Expects the stack to look like: ... v1 v2 v3 v4, where the number of 'v's is equal to the
+    // number of swizzle components. After the store, all v's are popped from the stack.
+    kStoreSwizzle,
+    kStoreSwizzleGlobal,
+    // As above, but with no slot byte: the slot is popped off the stack first, then the values
+    kStoreSwizzleIndirect,
+    kStoreSwizzleIndirectGlobal,
+    // Followed by two count bytes (1-4), and then one byte per swizzle component (0-3). The first
+    // count byte provides the current vector size (the vector is the top n stack elements), and the
+    // second count byte provides the swizzle component count.
+    kSwizzle,
+    VECTOR_MATRIX(kSubtractF),
+    VECTOR(kSubtractI),
+    // Followed by a (redundant) byte indicating the count
+    VECTOR(kTan),
+    // Followed by a count byte, then a byte indicating the external value to write
+    VECTOR(kWriteExternal),
+    kXorB,
+
+    kMaskPush,
+    kMaskPop,
+    kMaskNegate,
+    // Followed by count byte
+    kMaskBlend,
+    // Followed by a 16-bit branch target (an offset from the start of the function's code)
+    kBranchIfAllFalse,
+
+    kLoopBegin,
+    kLoopNext,
+    kLoopMask,
+    kLoopEnd,
+    kLoopBreak,
+    kLoopContinue,
+};
+#undef VECTOR
+
+class ByteCodeFunction {
 public:
-    // all counts are of 32-bit values, so a float4 counts as 4 parameter or return slots
-    struct SK_API Parameter {
+    int getParameterCount() const { return fParameterCount; }
+    int getReturnCount() const { return fReturnCount; }
+
+    /**
+     * Print bytecode disassembly to stdout.
+     */
+    void disassemble() const;
+
+private:
+    ByteCodeFunction(const FunctionDeclaration* declaration);
+
+    friend class ByteCode;
+    friend class ByteCodeGenerator;
+    friend struct Interpreter;
+
+    struct Parameter {
         int fSlotCount;
         bool fIsOutParameter;
     };
 
-    /**
-     * Note that this is the actual number of parameters, not the number of parameter slots.
-     */
-    int getParameterCount() const { return fParameters.size(); }
-
-    Parameter getParameter(int idx) const { return fParameters[idx]; }
-
-    int getParameterSlotCount() const { return fParameterSlotCount; }
-
-    int getReturnSlotCount() const { return fReturnSlotCount; }
-
-    void disassemble() const { }
-
-private:
-    ByteCodeFunction(const FunctionDeclaration* declaration)
-        : fName(declaration->fName) {}
-
-    String fName;
-
+    SkSL::String fName;
     std::vector<Parameter> fParameters;
+    int fParameterCount;  // NOTE(review): no default initializer, unlike the counts below
+    int fReturnCount = 0;
 
-    int fParameterSlotCount;
-
-    int fReturnSlotCount;
-
-    int fStackSlotCount;
-
+    int fLocalCount = 0;  // summed with fParameterCount and fStackCount to size the run() stack
+    int fStackCount = 0;
+    int fConditionCount = 0;
+    int fLoopCount = 0;
+    mutable SkOnce fPreprocessOnce;  // presumably guards a one-time preprocess() call; confirm
     std::vector<uint8_t> fCode;
 
-    friend class ByteCode;
-    friend class ByteCodeGenerator;
-    template<int width>
-    friend class Interpreter;
+    /**
+     * Replace each opcode with the corresponding entry from the labels array.
+     */
+    void preprocess(const void* labels[]);
 };
 
 enum class TypeCategory {
@@ -73,260 +220,9 @@
 
 class SK_API ByteCode {
 public:
-    template<int width>
-    union Vector {
-        skvx::Vec<width, float> fFloat;
-        skvx::Vec<width, int32_t> fInt;
-        skvx::Vec<width, uint32_t> fUInt;
+    static constexpr int kVecWidth = 8;
 
-        Vector() = default;
-
-        Vector(skvx::Vec<width, float> f)
-            : fFloat(f) {}
-
-        Vector(skvx::Vec<width, int32_t> i)
-            : fInt(i) {}
-
-        Vector(skvx::Vec<width, uint32_t> u)
-            : fUInt(u) {}
-    };
-
-    enum class Instruction : uint8_t {
-        // no parameters
-        kNop,
-        // no parameters
-        kAbort,
-        // Register target, Register src1, Register src2
-        kAddF,
-        // Register target, Register src1, Register src2
-        kAddI,
-        // Register target, Register src1, Register src2
-        kAnd,
-        // Register index, int arrayLength
-        kBoundsCheck,
-        // Pointer target
-        kBranch,
-        // Pointer target
-        kBranchIfAllFalse,
-        // no parameters
-        kBreak,
-        // Register target, uint8_t functionIndex, Register parameters
-        kCall,
-        // Register target, uint8_t externalValueIndex, uint8_t targetSize, Register arguments,
-        // uint8_t argumentSize
-        kCallExternal,
-        // Register target, Register src1, Register src2
-        kCompareEQF,
-        // Register target, Register src1, Register src2
-        kCompareEQI,
-        // Register target, Register src1, Register src2
-        kCompareNEQF,
-        // Register target, Register src1, Register src2
-        kCompareNEQI,
-        // Register target, Register src1, Register src2
-        kCompareGTF,
-        // Register target, Register src1, Register src2
-        kCompareGTS,
-        // Register target, Register src1, Register src2
-        kCompareGTU,
-        // Register target, Register src1, Register src2
-        kCompareGTEQF,
-        // Register target, Register src1, Register src2
-        kCompareGTEQS,
-        // Register target, Register src1, Register src2
-        kCompareGTEQU,
-        // Register target, Register src1, Register src2
-        kCompareLTF,
-        // Register target, Register src1, Register src2
-        kCompareLTS,
-        // Register target, Register src1, Register src2
-        kCompareLTU,
-        // Register target, Register src1, Register src2
-        kCompareLTEQF,
-        // Register target, Register src1, Register src2
-        kCompareLTEQS,
-        // Register target, Register src1, Register src2
-        kCompareLTEQU,
-        // no parameters
-        kContinue,
-        // Register target, Register src
-        kCopy,
-        // Register target, Register src,
-        kCos,
-        // Register target, Register src1, Register src2
-        kDivideF,
-        // Register target, Register src1, Register src2
-        kDivideS,
-        // Register target, Register src1, Register src2
-        kDivideU,
-        // Register target, Register src
-        kFloatToSigned,
-        // Register target, Register src
-        kFloatToUnsigned,
-        // Load a constant into a register
-        // Register target, Immediate value
-        kImmediate,
-        // Register target, Register src
-        kInverse2x2,
-        // Register target, Register src
-        kInverse3x3,
-        // Register target, Register src
-        kInverse4x4,
-        // Load the memory cell pointed to by srcPtr into a register
-        // Register target, Register srcPtr
-        kLoad,
-        // Load the memory cell pointed to by src into a register
-        // Register target, Pointer src
-        kLoadDirect,
-        // Load the parameter slot pointed to by srcPtr into a register
-        // Register target, Register srcPtr
-        kLoadParameter,
-        // Load the parameter slot pointed to by src into a register
-        // Register target, Pointer src
-        kLoadParameterDirect,
-        // Load the stack cell pointed to by srcPtr + sp into a register
-        // Register target, Register srcPtr
-        kLoadStack,
-        // Load the stack cell pointed to by src + sp into a register
-        // Register target, Pointer src
-        kLoadStackDirect,
-        // Pushes a new loop onto the loop and continue stacks
-        // no parameters
-        kLoopBegin,
-        // Pops the loop and continue stacks
-        // no parameters
-        kLoopEnd,
-        // Register mask
-        kLoopMask,
-        // no parameters
-        kLoopNext,
-        // no parameters
-        kMaskNegate,
-        // no parameters
-        kMaskPop,
-        // Register mask
-        kMaskPush,
-        // Register target, Register left, Register right, uint8_t leftColsAndRightRows,
-        // uint8_t leftRows, uint8_t rightCols
-        kMatrixMultiply,
-        // Register target, Register src, uint8_t srcColumns, uint8_t srcRows, uint8_t dstColumns,
-        // uint8_t dstRows
-        kMatrixToMatrix,
-        // Register target, Register src1, Register src2
-        kMultiplyF,
-        // Register target, Register src1, Register src2
-        kMultiplyI,
-        // Register target, Register src
-        kNegateF,
-        // Register target, Register src
-        kNegateS,
-        // Register target, Register src
-        kNot,
-        // Register target, Register src1, Register src2
-        kOr,
-        // Register src
-        kPrint,
-        // Register target, uint8_t count, uint8_t index
-        kReadExternal,
-        // Register target, Register src1, Register src2
-        kRemainderF,
-        // Register target, Register src1, Register src2
-        kRemainderS,
-        // Register target, Register src1, Register src2
-        kRemainderU,
-        // no parameters
-        kReturn,
-        // Register value
-        kReturnValue,
-        // Register target, Register src, uint8_t columns, uint8_t rows
-        kScalarToMatrix,
-        // Register target, Register test, Register ifTrue, Register ifFalse
-        kSelect,
-        // Register target, Register src, uint8_t count
-        kShiftLeft,
-        // Register target, Register src, uint8_t count
-        kShiftRightS,
-        // Register target, Register src, uint8_t count
-        kShiftRightU,
-        // Register target, Register src
-        kSignedToFloat,
-        // Register target, Register src,
-        kSin,
-        // Register target, Register src,
-        kSqrt,
-        // Store to the memory cell pointed to by dstPtr
-        // Register dstPtr, Register src
-        kStore,
-        // Store to the memory cell pointed to by dst
-        // Pointer dst, Register src
-        kStoreDirect,
-        // Store to the parameter slot pointed to by dstPtr
-        // Register dstPtr, Register src
-        kStoreParameter,
-        // Store to the parameter slot pointed to by dst
-        // Pointer dst, Register src
-        kStoreParameterDirect,
-        // Stores a register into the stack cell pointed to by dst + sp
-        // Register dst, Register src
-        kStoreStack,
-        // Stores a register into the stack cell pointed to by dstPtr + sp
-        // Pointer dst, Register src
-        kStoreStackDirect,
-        // Register target, Register src1, Register src2
-        kSubtractF,
-        // Register target, Register src1, Register src2
-        kSubtractI,
-        // Register target, Register src,
-        kTan,
-        // Register target, Register src,
-        kUnsignedToFloat,
-        // uint8_t index, uint8_t count, Register src
-        kWriteExternal,
-        // Register target, Register src1, Register src2
-        kXor,
-    };
-
-
-    // Compound values like vectors span multiple Registers or Pointer addresses. We always refer to
-    // them by the address of their first slot, so for instance if you add two float4's together,
-    // the resulting Register contains the first channel of the result, with the other three
-    // channels following in the next three Registers.
-
-    struct Register {
-        uint16_t fIndex;
-
-        Register operator+(uint16_t offset) const {
-            return Register{(uint16_t) (fIndex + offset)};
-        }
-    };
-
-    struct Pointer {
-        uint16_t fAddress;
-
-        Pointer operator+(uint16_t offset) const {
-            return Pointer{(uint16_t) (fAddress + offset)};
-        }
-    };
-
-    union Immediate {
-        float fFloat;
-        int32_t fInt;
-        uint32_t fUInt;
-
-        Immediate() {}
-
-        Immediate(float f)
-            : fFloat(f) {}
-
-        Immediate(int32_t i)
-            : fInt(i) {}
-
-        Immediate(uint32_t u)
-            : fUInt(u) {}
-    };
-
-    static constexpr int kPointerMax = 65535;
-    static constexpr int kRegisterMax = 65535;
+    ByteCode() = default;
 
     const ByteCodeFunction* getFunction(const char* name) const {
         for (const auto& f : fFunctions) {
@@ -337,9 +233,36 @@
         return nullptr;
     }
 
-    int getGlobalSlotCount() const {
-        return fGlobalSlotCount;
-    }
+    /**
+     * Invokes the specified function once, with the given arguments.
+     * 'args', 'outReturn', and 'uniforms' are collections of 32-bit values (typically floats,
+     * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
+     * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
+     * The return value is stored in 'outReturn' (may be null, to discard the return value).
+     * 'uniforms' are mapped to 'uniform' globals, in order.
+     */
+    bool SKSL_WARN_UNUSED_RESULT run(const ByteCodeFunction*,
+                                     float* args, int argCount,
+                                     float* outReturn, int returnCount,
+                                     const float* uniforms, int uniformCount) const;
+
+    /**
+     * Invokes the specified function with the given arguments, 'N' times. 'args' and 'outReturn'
+     * are accepted and returned in structure-of-arrays form:
+     *   args[0] points to an array of N values, the first argument for each invocation
+     *   ...
+     *   args[argCount - 1] points to an array of N values, the last argument for each invocation
+     *
+     * All values in 'args', 'outReturn', and 'uniforms' are 32-bit values (typically floats,
+     * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
+     * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
+     * The return value is stored in 'outReturn' (may be null, to discard the return value).
+     * 'uniforms' are mapped to 'uniform' globals, in order.
+     */
+    bool SKSL_WARN_UNUSED_RESULT runStriped(const ByteCodeFunction*, int N,
+                                            float* args[], int argCount,
+                                            float* outReturn[], int returnCount,
+                                            const float* uniforms, int uniformCount) const;
 
     struct Uniform {
         SkSL::String fName;
@@ -362,19 +285,20 @@
     const Uniform& getUniform(int i) const { return fUniforms[i]; }
 
 private:
-    std::vector<std::unique_ptr<ByteCodeFunction>> fFunctions;
-    std::vector<ExternalValue*> fExternalValues;
+    ByteCode(const ByteCode&) = delete;
+    ByteCode& operator=(const ByteCode&) = delete;
 
-    int fGlobalSlotCount;
+    friend class ByteCodeGenerator;
+    friend struct Interpreter;
 
+    int fGlobalSlotCount = 0;
     int fUniformSlotCount = 0;
     std::vector<Uniform> fUniforms;
 
-    friend class ByteCodeGenerator;
-    template<int width>
-    friend class Interpreter;
+    std::vector<std::unique_ptr<ByteCodeFunction>> fFunctions;
+    std::vector<ExternalValue*> fExternalValues;
 };
 
-} // namespace
+}
 
 #endif
diff --git a/src/sksl/SkSLByteCodeGenerator.cpp b/src/sksl/SkSLByteCodeGenerator.cpp
index 75b3d22..36a1338 100644
--- a/src/sksl/SkSLByteCodeGenerator.cpp
+++ b/src/sksl/SkSLByteCodeGenerator.cpp
@@ -7,21 +7,50 @@
 
 #include "src/sksl/SkSLByteCodeGenerator.h"
 
+#include <algorithm>
+
 namespace SkSL {
 
-ByteCodeGenerator::ByteCodeGenerator(const Program* program, ErrorReporter* errors,
-                                     ByteCode* output)
+static TypeCategory type_category(const Type& type) {  // Maps a type to its TypeCategory.
+    switch (type.kind()) {
+        case Type::Kind::kVector_Kind:
+        case Type::Kind::kMatrix_Kind:
+            return type_category(type.componentType());  // classify by the component type
+        default:  // everything else is classified by its type name
+            if (type.fName == "bool") {
+                return TypeCategory::kBool;
+            } else if (type.fName == "int" ||
+                       type.fName == "short" ||
+                       type.fName == "$intLiteral") {
+                return TypeCategory::kSigned;
+            } else if (type.fName == "uint" ||
+                       type.fName == "ushort") {
+                return TypeCategory::kUnsigned;
+            } else {  // any other name is treated as float (asserted to be float-like below)
+                SkASSERT(type.fName == "float" ||
+                         type.fName == "half" ||
+                         type.fName == "$floatLiteral");
+                return TypeCategory::kFloat;
+            }  // NOTE(review): every branch above returns, so the ABORT below is unreachable
+            ABORT("unsupported type: %s\n", type.displayName().c_str());
+    }
+}
+
+
+ByteCodeGenerator::ByteCodeGenerator(const Context* context, const Program* program, ErrorReporter* errors,
+                  ByteCode* output)
     : INHERITED(program, errors, nullptr)
+    , fContext(*context)
     , fOutput(output)
     , fIntrinsics {
-        { "cos",     ByteCode::Instruction::kCos },
+        { "cos",     ByteCodeInstruction::kCos },
         { "dot",     SpecialIntrinsic::kDot },
-        { "inverse", SpecialIntrinsic::kInverse },
-        { "print",   ByteCode::Instruction::kPrint },
-        { "sin",     ByteCode::Instruction::kSin },
-        { "sqrt",    ByteCode::Instruction::kSqrt },
-        { "tan",     ByteCode::Instruction::kTan },
-    } {}
+        { "inverse", ByteCodeInstruction::kInverse2x2 },
+        { "sin",     ByteCodeInstruction::kSin },
+        { "sqrt",    ByteCodeInstruction::kSqrt },
+        { "tan",     ByteCodeInstruction::kTan },
+      } {}
+
 
 int ByteCodeGenerator::SlotCount(const Type& type) {
     if (type.kind() == Type::kOther_Kind) {
@@ -51,73 +80,89 @@
 static inline bool is_in(const SkSL::Variable& var) {
     return var.fModifiers.fFlags & Modifiers::kIn_Flag;
 }
-ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Variable& var) {
-    // given that we seldom have more than a couple of variables, linear search is probably the most
-    // efficient way to handle lookups
-    switch (var.fStorage) {
-        case Variable::kLocal_Storage: {
-            for (int i = fLocals.size() - 1; i >= 0; --i) {
-                if (fLocals[i] == &var) {
-                    return ByteCode::Pointer{(uint16_t) (i + fParameterCount)};
-                }
-            }
-            int result = fLocals.size() + fParameterCount;
-            fLocals.push_back(&var);
-            for (int i = 0; i < SlotCount(var.fType) - 1; ++i) {
-                fLocals.push_back(nullptr);
-            }
-            SkASSERT(result <= ByteCode::kPointerMax);
-            return ByteCode::Pointer{(uint16_t) result};
+
+void ByteCodeGenerator::gatherUniforms(const Type& type, const String& name) {
+    if (type.kind() == Type::kOther_Kind) {
+        return;
+    } else if (type.kind() == Type::kStruct_Kind) {
+        for (const auto& f : type.fields()) {
+            this->gatherUniforms(*f.fType, name + "." + f.fName);
         }
-        case Variable::kParameter_Storage: {
-            int offset = 0;
-            for (const auto& p : fFunction->fDeclaration.fParameters) {
-                if (p == &var) {
-                    SkASSERT(offset <= ByteCode::kPointerMax);
-                    return ByteCode::Pointer{(uint16_t) offset};
-                }
-                offset += SlotCount(p->fType);
-            }
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
+    } else if (type.kind() == Type::kArray_Kind) {
+        for (int i = 0; i < type.columns(); ++i) {
+            this->gatherUniforms(type.componentType(), String::printf("%s[%d]", name.c_str(), i));
         }
-        case Variable::kGlobal_Storage: {
-            if (is_in(var)) {
-                // If you trip this assert, it means the program is using raw 'in' variables. You
-                // should either specialize the program (Compiler::specialize) to bake in the final
-                // values of the 'in' variables, or not use 'in' variables (maybe you meant to use
-                // 'uniform' instead?).
-                SkASSERT(false);
-                return ByteCode::Pointer{0};
+    } else {
+        fOutput->fUniforms.push_back({ name, type_category(type), type.rows(), type.columns(),
+                                       fOutput->fUniformSlotCount });
+        fOutput->fUniformSlotCount += type.columns() * type.rows();
+    }
+}
+
+bool ByteCodeGenerator::generateCode() {
+    for (const auto& e : fProgram) {
+        switch (e.fKind) {
+            case ProgramElement::kFunction_Kind: {
+                std::unique_ptr<ByteCodeFunction> f = this->writeFunction((FunctionDefinition&) e);
+                if (!f) {
+                    return false;
+                }
+                fOutput->fFunctions.push_back(std::move(f));
+                fFunctions.push_back(&(FunctionDefinition&)e);
+                break;
             }
-            bool isUniform = is_uniform(var);
-            int offset = isUniform ? fOutput->getGlobalSlotCount() : 0;
-            for (const auto& e : fProgram) {
-                if (e.fKind == ProgramElement::kVar_Kind) {
-                    VarDeclarations& decl = (VarDeclarations&) e;
-                    for (const auto& v : decl.fVars) {
-                        const Variable* declVar = ((VarDeclaration&) *v).fVar;
-                        if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
-                            continue;
-                        }
-                        if (isUniform != is_uniform(*declVar)) {
-                            continue;
-                        }
-                        if (declVar == &var) {
-                            SkASSERT(offset <= ByteCode::kPointerMax);
-                            return ByteCode::Pointer{(uint16_t) offset};
-                        }
-                        offset += SlotCount(declVar->fType);
+            case ProgramElement::kVar_Kind: {
+                VarDeclarations& decl = (VarDeclarations&) e;
+                for (const auto& v : decl.fVars) {
+                    const Variable* declVar = ((VarDeclaration&) *v).fVar;
+                    if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
+                        continue;
+                    }
+                    if (is_uniform(*declVar)) {
+                        this->gatherUniforms(declVar->fType, declVar->fName);
+                    } else {
+                        fOutput->fGlobalSlotCount += SlotCount(declVar->fType);
                     }
                 }
+                break;
             }
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
+            default:
+                ; // ignore
         }
-        default:
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
     }
+    return 0 == fErrors.errorCount();
+}
+
+std::unique_ptr<ByteCodeFunction> ByteCodeGenerator::writeFunction(const FunctionDefinition& f) {
+    fFunction = &f;  // Builds the ByteCodeFunction for f; per-function state is reset here.
+    std::unique_ptr<ByteCodeFunction> result(new ByteCodeFunction(&f.fDeclaration));
+    fParameterCount = result->fParameterCount;
+    fLoopCount = fMaxLoopCount = 0;
+    fConditionCount = fMaxConditionCount = 0;
+    fStackCount = fMaxStackCount = 0;
+    fCode = &result->fCode;  // instructions are appended directly into the result
+
+    this->writeStatement(*f.fBody);
+    if (0 == fErrors.errorCount()) {  // balance is only checked when no error was reported
+        SkASSERT(fLoopCount == 0);
+        SkASSERT(fConditionCount == 0);
+        SkASSERT(fStackCount == 0);
+    }
+    this->write(ByteCodeInstruction::kReturn, 0);  // every function ends with a return
+    this->write8(0);  // NOTE(review): presumably kReturn's value count (none) - confirm
+
+    result->fLocalCount     = fLocals.size();  // fLocals holds one entry per local slot
+    result->fConditionCount = fMaxConditionCount;
+    result->fLoopCount      = fMaxLoopCount;
+    result->fStackCount     = fMaxStackCount;
+
+    const Type& returnType = f.fDeclaration.fReturnType;
+    if (returnType != *fContext.fVoid_Type) {  // fReturnCount is only assigned for non-void
+        result->fReturnCount = SlotCount(returnType);
+    }
+    fLocals.clear();  // locals do not carry over to the next function
+    fFunction = nullptr;
+    return result;
 }
 
 // A "simple" Swizzle is based on a variable (or a compound variable like a struct or array), and
@@ -141,413 +186,519 @@
     return true;
 }
 
+int ByteCodeGenerator::StackUsage(ByteCodeInstruction inst, int count_) {
+    // Returns the net change in stack depth caused by 'inst'. 'count' is consulted iff the caller
+    // passed a non-default value: most instructions have an implicit count (or count makes no
+    // sense), and the asserts below catch callers that supply a count that would be ignored, or
+    // that fail to supply one for the ops that need it.
+    struct CountValue {
+        operator int() {  // reading the count requires that one was supplied; marks it used
+            SkASSERT(val != ByteCodeGenerator::kUnusedStackCount);
+            SkDEBUGCODE(used = true);
+            return val;
+        }
+        ~CountValue() {  // a count that was never read must have been left at the default
+            SkASSERT(used || val == ByteCodeGenerator::kUnusedStackCount);
+        }
+        int val;
+        SkDEBUGCODE(bool used = false;)
+    } count = { count_ };
+
+    switch (inst) {
+        // Unary functions/operators that don't change stack depth at all:
+#define VECTOR_UNARY_OP(base)                \
+        case ByteCodeInstruction::base:      \
+        case ByteCodeInstruction::base ## 2: \
+        case ByteCodeInstruction::base ## 3: \
+        case ByteCodeInstruction::base ## 4: \
+            return 0;
+
+        VECTOR_UNARY_OP(kConvertFtoI)
+        VECTOR_UNARY_OP(kConvertStoF)
+        VECTOR_UNARY_OP(kConvertUtoF)
+
+        VECTOR_UNARY_OP(kCos)
+        VECTOR_UNARY_OP(kSin)
+        VECTOR_UNARY_OP(kSqrt)
+        VECTOR_UNARY_OP(kTan)
+
+        VECTOR_UNARY_OP(kNegateF)
+        VECTOR_UNARY_OP(kNegateI)
+
+        case ByteCodeInstruction::kInverse2x2:
+        case ByteCodeInstruction::kInverse3x3:
+        case ByteCodeInstruction::kInverse4x4: return 0;
+
+        case ByteCodeInstruction::kClampIndex: return 0;
+        case ByteCodeInstruction::kNotB: return 0;
+        case ByteCodeInstruction::kNegateFN: return 0;
+        case ByteCodeInstruction::kShiftLeft: return 0;
+        case ByteCodeInstruction::kShiftRightS: return 0;
+        case ByteCodeInstruction::kShiftRightU: return 0;
+
+#undef VECTOR_UNARY_OP
+
+        // Binary functions/operators that do a 2 -> 1 reduction (possibly N times)
+#define VECTOR_BINARY_OP(base)                          \
+        case ByteCodeInstruction::base:      return -1; \
+        case ByteCodeInstruction::base ## 2: return -2; \
+        case ByteCodeInstruction::base ## 3: return -3; \
+        case ByteCodeInstruction::base ## 4: return -4;
+
+#define VECTOR_MATRIX_BINARY_OP(base)                   \
+        VECTOR_BINARY_OP(base)                          \
+        case ByteCodeInstruction::base ## N: return -count;
+
+        case ByteCodeInstruction::kAndB: return -1;
+        case ByteCodeInstruction::kOrB:  return -1;
+        case ByteCodeInstruction::kXorB: return -1;
+
+        VECTOR_BINARY_OP(kAddI)
+        VECTOR_MATRIX_BINARY_OP(kAddF)
+
+        VECTOR_BINARY_OP(kCompareIEQ)
+        VECTOR_MATRIX_BINARY_OP(kCompareFEQ)
+        VECTOR_BINARY_OP(kCompareINEQ)
+        VECTOR_MATRIX_BINARY_OP(kCompareFNEQ)
+        VECTOR_BINARY_OP(kCompareSGT)
+        VECTOR_BINARY_OP(kCompareUGT)
+        VECTOR_BINARY_OP(kCompareFGT)
+        VECTOR_BINARY_OP(kCompareSGTEQ)
+        VECTOR_BINARY_OP(kCompareUGTEQ)
+        VECTOR_BINARY_OP(kCompareFGTEQ)
+        VECTOR_BINARY_OP(kCompareSLT)
+        VECTOR_BINARY_OP(kCompareULT)
+        VECTOR_BINARY_OP(kCompareFLT)
+        VECTOR_BINARY_OP(kCompareSLTEQ)
+        VECTOR_BINARY_OP(kCompareULTEQ)
+        VECTOR_BINARY_OP(kCompareFLTEQ)
+
+        VECTOR_BINARY_OP(kDivideS)
+        VECTOR_BINARY_OP(kDivideU)
+        VECTOR_MATRIX_BINARY_OP(kDivideF)
+        VECTOR_BINARY_OP(kMultiplyI)
+        VECTOR_MATRIX_BINARY_OP(kMultiplyF)
+        VECTOR_BINARY_OP(kRemainderF)
+        VECTOR_BINARY_OP(kRemainderS)
+        VECTOR_BINARY_OP(kRemainderU)
+        VECTOR_BINARY_OP(kSubtractI)
+        VECTOR_MATRIX_BINARY_OP(kSubtractF)
+
+#undef VECTOR_BINARY_OP
+#undef VECTOR_MATRIX_BINARY_OP
+
+        // Ops that push or load data to grow the stack:
+        case ByteCodeInstruction::kDup:
+        case ByteCodeInstruction::kLoad:
+        case ByteCodeInstruction::kLoadGlobal:
+        case ByteCodeInstruction::kLoadUniform:
+        case ByteCodeInstruction::kReadExternal:
+        case ByteCodeInstruction::kPushImmediate:
+            return 1;
+
+        case ByteCodeInstruction::kDup2:
+        case ByteCodeInstruction::kLoad2:
+        case ByteCodeInstruction::kLoadGlobal2:
+        case ByteCodeInstruction::kLoadUniform2:
+        case ByteCodeInstruction::kReadExternal2:
+            return 2;
+
+        case ByteCodeInstruction::kDup3:
+        case ByteCodeInstruction::kLoad3:
+        case ByteCodeInstruction::kLoadGlobal3:
+        case ByteCodeInstruction::kLoadUniform3:
+        case ByteCodeInstruction::kReadExternal3:
+            return 3;
+
+        case ByteCodeInstruction::kDup4:
+        case ByteCodeInstruction::kLoad4:
+        case ByteCodeInstruction::kLoadGlobal4:
+        case ByteCodeInstruction::kLoadUniform4:
+        case ByteCodeInstruction::kReadExternal4:
+            return 4;
+
+        case ByteCodeInstruction::kDupN:
+        case ByteCodeInstruction::kLoadSwizzle:
+        case ByteCodeInstruction::kLoadSwizzleGlobal:
+        case ByteCodeInstruction::kLoadSwizzleUniform:
+            return count;
+
+        // Pushes 'count' values, minus one for the 'address' that's consumed first
+        case ByteCodeInstruction::kLoadExtended:
+        case ByteCodeInstruction::kLoadExtendedGlobal:
+        case ByteCodeInstruction::kLoadExtendedUniform:
+            return count - 1;
+
+        // Ops that pop or store data to shrink the stack:
+        case ByteCodeInstruction::kPop:
+        case ByteCodeInstruction::kStore:
+        case ByteCodeInstruction::kStoreGlobal:
+        case ByteCodeInstruction::kWriteExternal:
+            return -1;
+
+        case ByteCodeInstruction::kPop2:
+        case ByteCodeInstruction::kStore2:
+        case ByteCodeInstruction::kStoreGlobal2:
+        case ByteCodeInstruction::kWriteExternal2:
+            return -2;
+
+        case ByteCodeInstruction::kPop3:
+        case ByteCodeInstruction::kStore3:
+        case ByteCodeInstruction::kStoreGlobal3:
+        case ByteCodeInstruction::kWriteExternal3:
+            return -3;
+
+        case ByteCodeInstruction::kPop4:
+        case ByteCodeInstruction::kStore4:
+        case ByteCodeInstruction::kStoreGlobal4:
+        case ByteCodeInstruction::kWriteExternal4:
+            return -4;
+
+        case ByteCodeInstruction::kPopN:
+        case ByteCodeInstruction::kStoreSwizzle:
+        case ByteCodeInstruction::kStoreSwizzleGlobal:
+            return -count;
+
+        // Consumes 'count' values, plus one for the 'address'
+        case ByteCodeInstruction::kStoreExtended:
+        case ByteCodeInstruction::kStoreExtendedGlobal:
+        case ByteCodeInstruction::kStoreSwizzleIndirect:
+        case ByteCodeInstruction::kStoreSwizzleIndirectGlobal:
+            return -count - 1;
+
+        // Strange ops where the caller computes the delta for us:
+        case ByteCodeInstruction::kCallExternal:
+        case ByteCodeInstruction::kMatrixToMatrix:
+        case ByteCodeInstruction::kMatrixMultiply:
+        case ByteCodeInstruction::kReserve:
+        case ByteCodeInstruction::kReturn:
+        case ByteCodeInstruction::kScalarToMatrix:
+        case ByteCodeInstruction::kSwizzle:
+            return count;
+
+        // Miscellaneous
+
+        // kCall is net-zero. Max stack depth is adjusted in writeFunctionCall.
+        case ByteCodeInstruction::kCall:             return 0;
+        case ByteCodeInstruction::kBranch:           return 0;
+        case ByteCodeInstruction::kBranchIfAllFalse: return 0;
+
+        case ByteCodeInstruction::kMaskPush:         return -1;
+        case ByteCodeInstruction::kMaskPop:          return 0;
+        case ByteCodeInstruction::kMaskNegate:       return 0;
+        case ByteCodeInstruction::kMaskBlend:        return -count;
+
+        case ByteCodeInstruction::kLoopBegin:        return 0;
+        case ByteCodeInstruction::kLoopNext:         return 0;
+        case ByteCodeInstruction::kLoopMask:         return -1;
+        case ByteCodeInstruction::kLoopEnd:          return 0;
+        case ByteCodeInstruction::kLoopBreak:        return 0;
+        case ByteCodeInstruction::kLoopContinue:     return 0;
+
+        default:
+            ABORT("unsupported instruction %d\n", (int)inst);
+            return 0;
+    }
+}
+
+ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Variable& var) {
+    // Maps var to its slot index and Storage kind. Given that we seldom have more than a couple
+    // of variables, linear search is probably the most efficient way to handle lookups.
+    switch (var.fStorage) {
+        case Variable::kLocal_Storage: {
+            for (int i = fLocals.size() - 1; i >= 0; --i) {
+                if (fLocals[i] == &var) {
+                    SkASSERT(fParameterCount + i <= 255);
+                    return { fParameterCount + i, Storage::kLocal };
+                }
+            }
+            int result = fParameterCount + fLocals.size();  // not seen yet: allocate new slots
+            fLocals.push_back(&var);
+            for (int i = 0; i < SlotCount(var.fType) - 1; ++i) {
+                fLocals.push_back(nullptr);  // placeholder for each additional slot of var
+            }
+            SkASSERT(result <= 255);  // NOTE(review): presumably slots are encoded in 8 bits
+            return { result, Storage::kLocal };
+        }
+        case Variable::kParameter_Storage: {
+            int offset = 0;
+            for (const auto& p : fFunction->fDeclaration.fParameters) {
+                if (p == &var) {
+                    SkASSERT(offset <= 255);
+                    return { offset, Storage::kLocal };  // parameters use the first local slots
+                }
+                offset += SlotCount(p->fType);
+            }
+            SkASSERT(false);
+            return Location::MakeInvalid();
+        }
+        case Variable::kGlobal_Storage: {
+            if (is_in(var)) {
+                // If you trip this assert, it means the program is using raw 'in' variables. You
+                // should either specialize the program (Compiler::specialize) to bake in the final
+                // values of the 'in' variables, or not use 'in' variables (maybe you meant to use
+                // 'uniform' instead?).
+                SkASSERT(false);
+                return Location::MakeInvalid();
+            }
+            int offset = 0;
+            bool isUniform = is_uniform(var);
+            for (const auto& e : fProgram) {
+                if (e.fKind == ProgramElement::kVar_Kind) {
+                    VarDeclarations& decl = (VarDeclarations&) e;
+                    for (const auto& v : decl.fVars) {
+                        const Variable* declVar = ((VarDeclaration&) *v).fVar;
+                        if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
+                            continue;  // builtins and 'in' variables are not counted
+                        }
+                        if (isUniform != is_uniform(*declVar)) {
+                            continue;  // uniforms and other globals are numbered separately
+                        }
+                        if (declVar == &var) {
+                            SkASSERT(offset <= 255);
+                            return  { offset, isUniform ? Storage::kUniform : Storage::kGlobal };
+                        }
+                        offset += SlotCount(declVar->fType);
+                    }
+                }
+            }
+            SkASSERT(false);
+            return Location::MakeInvalid();
+        }
+        default:
+            SkASSERT(false);
+            return Location::MakeInvalid();
+    }
+}
+
 ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Expression& expr) {
     switch (expr.fKind) {
         case Expression::kFieldAccess_Kind: {
-            const FieldAccess& f = (const FieldAccess&) expr;
-            Location result = this->getLocation(*f.fBase);
+            const FieldAccess& f = (const FieldAccess&)expr;
+            Location baseLoc = this->getLocation(*f.fBase);
             int offset = 0;
             for (int i = 0; i < f.fFieldIndex; ++i) {
                 offset += SlotCount(*f.fBase->fType.fields()[i].fType);
             }
-            return result.offset(*this, offset);
-        }
-        case Expression::kIndex_Kind: {
-            const IndexExpression& idx = (const IndexExpression&) expr;
-            int stride = SlotCount(idx.fType);
-            int length = idx.fBase->fType.columns();
-            Location result = this->getLocation(*idx.fBase);
-            if (idx.fIndex->isConstant()) {
-                int64_t index = idx.fIndex->getConstantInt();
-                if (index < 0 || index >= length) {
-                    fErrors.error(idx.fIndex->fOffset, "Array index out of bounds");
-                    return result;
+            if (baseLoc.isOnStack()) {
+                if (offset != 0) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(offset);
+                    this->write(ByteCodeInstruction::kAddI);
+                    this->write8(1);
                 }
-                return result.offset(*this, index * stride);
+                return baseLoc;
             } else {
-                ByteCode::Register index = this->next(1);
-                this->writeExpression(*idx.fIndex, index);
-                this->write(ByteCode::Instruction::kBoundsCheck);
-                this->write(index);
-                this->write(length);
-                ByteCode::Register imm = this->next(1);
-                this->write(ByteCode::Instruction::kImmediate);
-                this->write(imm);
-                this->write(ByteCode::Immediate{stride});
-                ByteCode::Register offset = this->next(1);
-                this->write(ByteCode::Instruction::kMultiplyI);
-                this->write(offset);
-                this->write(index);
-                this->write(imm);
-                return result.offset(*this, offset);
+                return baseLoc + offset;
             }
         }
+        case Expression::kIndex_Kind: {
+            const IndexExpression& i = (const IndexExpression&)expr;
+            int stride = SlotCount(i.fType);
+            int length = i.fBase->fType.columns();
+            SkASSERT(length <= 255);
+            int offset = -1;
+            if (i.fIndex->isConstant()) {
+                int64_t index = i.fIndex->getConstantInt();
+                if (index < 0 || index >= length) {
+                    fErrors.error(i.fIndex->fOffset, "Array index out of bounds.");
+                    return Location::MakeInvalid();
+                }
+                offset = index * stride;
+            } else {
+                if (i.fIndex->hasSideEffects()) {
+                    // Having a side-effect in an indexer is technically safe for an rvalue,
+                    // but with lvalues we have to evaluate the indexer twice, so make it an error.
+                    fErrors.error(i.fIndex->fOffset,
+                            "Index expressions with side-effects not supported in byte code.");
+                    return Location::MakeInvalid();
+                }
+                this->writeExpression(*i.fIndex);
+                this->write(ByteCodeInstruction::kClampIndex);
+                this->write8(length);
+                if (stride != 1) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(stride);
+                    this->write(ByteCodeInstruction::kMultiplyI);
+                    this->write8(1);
+                }
+            }
+            Location baseLoc = this->getLocation(*i.fBase);
+
+            // Are both components known statically?
+            if (!baseLoc.isOnStack() && offset >= 0) {
+                return baseLoc + offset;
+            }
+
+            // At least one component is dynamic (and on the stack).
+
+            // If the other component is zero, we're done
+            if (baseLoc.fSlot == 0 || offset == 0) {
+                return baseLoc.makeOnStack();
+            }
+
+            // Push the non-dynamic component (if any) to the stack, then add the two
+            if (!baseLoc.isOnStack()) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(baseLoc.fSlot);
+            }
+            if (offset >= 0) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(offset);
+            }
+            this->write(ByteCodeInstruction::kAddI);
+            this->write8(1);
+            return baseLoc.makeOnStack();
+        }
         case Expression::kSwizzle_Kind: {
-            const Swizzle& s = (const Swizzle&) expr;
+            const Swizzle& s = (const Swizzle&)expr;
             SkASSERT(swizzle_is_simple(s));
-            return this->getLocation(*s.fBase).offset(*this, s.fComponents[0]);
+            Location baseLoc = this->getLocation(*s.fBase);
+            int offset = s.fComponents[0];
+            if (baseLoc.isOnStack()) {
+                if (offset != 0) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(offset);
+                    this->write(ByteCodeInstruction::kAddI);
+                    this->write8(1);
+                }
+                return baseLoc;
+            } else {
+                return baseLoc + offset;
+            }
         }
         case Expression::kVariableReference_Kind: {
-            const Variable& var = ((const VariableReference&) expr).fVariable;
+            const Variable& var = ((const VariableReference&)expr).fVariable;
             return this->getLocation(var);
         }
         default:
             SkASSERT(false);
-            return ByteCode::Pointer{0};
+            return Location::MakeInvalid();
     }
 }
 
-Variable::Storage ByteCodeGenerator::getStorage(const Expression& expr) {
-    switch (expr.fKind) {
-        case Expression::kFieldAccess_Kind: {
-            const FieldAccess& f = (const FieldAccess&) expr;
-            return this->getStorage(*f.fBase);
-        }
-        case Expression::kIndex_Kind: {
-            const IndexExpression& idx = (const IndexExpression&) expr;
-            return this->getStorage(*idx.fBase);
-        }
-        case Expression::kSwizzle_Kind: {
-            const Swizzle& s = (const Swizzle&) expr;
-            return this->getStorage(*s.fBase);
-        }
-        case Expression::kVariableReference_Kind: {
-            const Variable& var = ((const VariableReference&) expr).fVariable;
-            return var.fStorage;
-        }
-        default:
-            SkASSERT(false);
-            return Variable::kLocal_Storage;
-    }
+void ByteCodeGenerator::write8(uint8_t b) {  // Appends a single byte to fCode.
+    fCode->push_back(b);
 }
 
-ByteCode::Instruction ByteCodeGenerator::getLoadInstruction(ByteCodeGenerator::Location location,
-                                                            Variable::Storage storage) {
-    switch (storage) {
-        case Variable::kGlobal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoad;
-            }
-        case Variable::kParameter_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadParameterDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoadParameter;
-            }
-        case Variable::kLocal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadStackDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoadStack;
-            }
-        default:
-            break;
-    }
-    SkASSERT(false);
-    return ByteCode::Instruction::kNop;
+void ByteCodeGenerator::write16(uint16_t i) {  // Appends i to fCode as two bytes.
+    size_t n = fCode->size();  // offset at which the new bytes start
+    fCode->resize(n+2);
+    memcpy(fCode->data() + n, &i, 2);  // copies i's in-memory representation as-is
 }
 
-ByteCode::Instruction ByteCodeGenerator::getStoreInstruction(ByteCodeGenerator::Location location,
-                                                             Variable::Storage storage) {
-    switch (storage) {
-        case Variable::kGlobal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStore;
-            }
-        case Variable::kParameter_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreParameterDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStoreParameter;
-            }
-        case Variable::kLocal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreStackDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStoreStack;
-            }
-        default:
-            break;
-    }
-    SkASSERT(false);
-    return ByteCode::Instruction::kNop;
+void ByteCodeGenerator::write32(uint32_t i) {
+    size_t n = fCode->size();
+    fCode->resize(n+4);
+    memcpy(fCode->data() + n, &i, 4);
 }
 
-class ByteCodeSimpleLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeSimpleLValue(ByteCodeGenerator* generator, ByteCodeGenerator::Location location,
-                         int count, ByteCode::Instruction load, ByteCode::Instruction store)
-        : INHERITED(*generator)
-        , fLocation(location)
-        , fCount(count)
-        , fLoad(load)
-        , fStore(store) {}
+void ByteCodeGenerator::write(ByteCodeInstruction i, int count) {
+    switch (i) {
+        case ByteCodeInstruction::kLoopBegin: this->enterLoop();      break;
+        case ByteCodeInstruction::kLoopEnd:   this->exitLoop();       break;
 
-    void load(ByteCode::Register result) override {
-        for (int i = 0; i < fCount; ++i) {
-            ByteCodeGenerator::Location final = fLocation.offset(fGenerator, i);
-            fGenerator.write(fLoad);
-            fGenerator.write(result + i);
-            fGenerator.write(final);
-        }
+        case ByteCodeInstruction::kMaskPush:  this->enterCondition(); break;
+        case ByteCodeInstruction::kMaskPop:
+        case ByteCodeInstruction::kMaskBlend: this->exitCondition();  break;
+        default: /* Do nothing */ break;
     }
-
-    void store(ByteCode::Register src) override {
-        for (int i = 0; i < fCount; ++i) {
-            ByteCodeGenerator::Location final = fLocation.offset(fGenerator, i);
-            fGenerator.write(fStore);
-            fGenerator.write(final);
-            fGenerator.write(src + i);
-        }
-    }
-
-private:
-    ByteCodeGenerator::Location fLocation;
-
-    int fCount;
-
-    ByteCode::Instruction fLoad;
-
-    ByteCode::Instruction fStore;
-
-    typedef ByteCodeGenerator::LValue INHERITED;
-};
-
-class ByteCodeSwizzleLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeSwizzleLValue(ByteCodeGenerator* generator, const Swizzle* swizzle)
-        : INHERITED(*generator)
-        , fSwizzle(*swizzle) {}
-
-    void load(ByteCode::Register result) override {
-        fGenerator.writeSwizzle(fSwizzle, result);
-    }
-
-    void store(ByteCode::Register src) override {
-        ByteCodeGenerator::Location target = fGenerator.getLocation(*fSwizzle.fBase);
-        ByteCode::Instruction inst = fGenerator.getStoreInstruction(
-                                                            target,
-                                                            fGenerator.getStorage(*fSwizzle.fBase));
-        for (size_t i = 0; i < fSwizzle.fComponents.size(); ++i) {
-            ByteCodeGenerator::Location final = target.offset(fGenerator, fSwizzle.fComponents[i]);
-            fGenerator.write(inst);
-            fGenerator.write(final);
-            fGenerator.write(src + i);
-        }
-    }
-
-private:
-    const Swizzle& fSwizzle;
-
-    typedef ByteCodeGenerator::LValue INHERITED;
-};
-
-class ByteCodeExternalValueLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeExternalValueLValue(ByteCodeGenerator* generator, ExternalValue& value, int index)
-        : INHERITED(*generator)
-        , fIndex(index)
-        , fSlotCount(ByteCodeGenerator::SlotCount(value.type())) {
-        SkASSERT(fSlotCount <= 4);
-    }
-
-    void load(ByteCode::Register result) override {
-        fGenerator.write(ByteCode::Instruction::kReadExternal);
-        fGenerator.write(result);
-        fGenerator.write((uint8_t) fSlotCount);
-        fGenerator.write((uint8_t) fIndex);
-    }
-
-    void store(ByteCode::Register src) override {
-        fGenerator.write(ByteCode::Instruction::kWriteExternal);
-        fGenerator.write((uint8_t) fIndex);
-        fGenerator.write((uint8_t) fSlotCount);
-        fGenerator.write(src);
-    }
-
-private:
-    typedef LValue INHERITED;
-
-    int fIndex;
-
-    int fSlotCount;
-};
-
-std::unique_ptr<ByteCodeGenerator::LValue> ByteCodeGenerator::getLValue(const Expression& expr) {
-    switch (expr.fKind) {
-        case Expression::kExternalValue_Kind: {
-            ExternalValue* value = ((ExternalValueReference&) expr).fValue;
-            int index = fOutput->fExternalValues.size();
-            fOutput->fExternalValues.push_back(value);
-            SkASSERT(index <= 255);
-            return std::unique_ptr<LValue>(new ByteCodeExternalValueLValue(this, *value, index));
-        }
-        case Expression::kFieldAccess_Kind:
-        case Expression::kIndex_Kind:
-        case Expression::kVariableReference_Kind: {
-            Location location = this->getLocation(expr);
-            Variable::Storage storage = this->getStorage(expr);
-            ByteCode::Instruction loadInst = this->getLoadInstruction(location, storage);
-            ByteCode::Instruction storeInst = this->getStoreInstruction(location, storage);
-            return std::unique_ptr<LValue>(new ByteCodeSimpleLValue(this, location,
-                                                                    SlotCount(expr.fType),
-                                                                    loadInst, storeInst));
-        }
-        case Expression::kSwizzle_Kind:
-            return std::unique_ptr<LValue>(new ByteCodeSwizzleLValue(this, &(Swizzle&) expr));
-        default:
-            ABORT("unsupported lvalue\n");
-    }
+    instruction val = (instruction) i;
+    size_t n = fCode->size();
+    fCode->resize(n + sizeof(val));
+    memcpy(fCode->data() + n, &val, sizeof(val));
+    fStackCount += StackUsage(i, count);
+    fMaxStackCount = std::max(fMaxStackCount, fStackCount);
 }
 
-ByteCode::Register ByteCodeGenerator::next(int count) {
-    SkASSERT(fNextRegister + count <= ByteCode::kRegisterMax);
-    fNextRegister += count;
-    return ByteCode::Register{(uint16_t) (fNextRegister - count)};
+static ByteCodeInstruction vector_instruction(ByteCodeInstruction base, int count) {
+    SkASSERT(count >= 1 && count <= 4);
+    return ((ByteCodeInstruction) ((int) base + 1 - count));
 }
 
-static TypeCategory type_category(const Type& type) {
-    switch (type.kind()) {
-        case Type::Kind::kVector_Kind:
-        case Type::Kind::kMatrix_Kind:
-            return type_category(type.componentType());
-        default:
-            String name = type.displayName();
-            if (name == "bool") {
-                return TypeCategory::kBool;
-            } else if (name == "int" || name == "short") {
-                return TypeCategory::kSigned;
-            } else if (name == "uint" || name == "ushort") {
-                return TypeCategory::kUnsigned;
-            } else {
-                SkASSERT(name == "float" || name == "half");
-                return TypeCategory::kFloat;
-            }
-            ABORT("unsupported type: %s\n", name.c_str());
-    }
-}
-
-void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCode::Instruction s,
-                                              ByteCode::Instruction u, ByteCode::Instruction f) {
+void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCodeInstruction s,
+                                              ByteCodeInstruction u, ByteCodeInstruction f,
+                                              int count, bool writeCount) {
     switch (type_category(type)) {
         case TypeCategory::kSigned:
-            this->write(s);
+            this->write(vector_instruction(s, count));
             break;
         case TypeCategory::kUnsigned:
-            this->write(u);
+            this->write(vector_instruction(u, count));
             break;
         case TypeCategory::kFloat: {
-            this->write(f);
+            if (count > 4) {
+                this->write((ByteCodeInstruction)((int)f + 1), count);
+            } else {
+                this->write(vector_instruction(f, count));
+            }
             break;
         }
         default:
             SkASSERT(false);
     }
-}
-
-void ByteCodeGenerator::writeBinaryInstruction(const Type& operandType,
-                                               ByteCode::Register left,
-                                               ByteCode::Register right,
-                                               ByteCode::Instruction s,
-                                               ByteCode::Instruction u,
-                                               ByteCode::Instruction f,
-                                               ByteCode::Register result) {
-    for (int i = 0; i < SlotCount(operandType); ++i) {
-        this->writeTypedInstruction(operandType, s, u, f);
-        this->write(result + i);
-        this->write(left + i);
-        this->write(right + i);
+    if (writeCount) {
+        this->write8(count);
     }
 }
 
-void ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b,
-                                              ByteCode::Register result) {
+bool ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b, bool discard) {
     if (b.fOperator == Token::Kind::EQ) {
         std::unique_ptr<LValue> lvalue = this->getLValue(*b.fLeft);
-        this->writeExpression(*b.fRight, result);
-        lvalue->store(result);
-        return;
+        this->writeExpression(*b.fRight);
+        lvalue->store(discard);
+        discard = false;
+        return discard;
     }
     const Type& lType = b.fLeft->fType;
     const Type& rType = b.fRight->fType;
     bool lVecOrMtx = (lType.kind() == Type::kVector_Kind || lType.kind() == Type::kMatrix_Kind);
     bool rVecOrMtx = (rType.kind() == Type::kVector_Kind || rType.kind() == Type::kMatrix_Kind);
-    const Type* operandType;
-    if (!lVecOrMtx && rVecOrMtx) {
-        operandType = &rType;
-    } else {
-        operandType = &lType;
-    }
     Token::Kind op;
     std::unique_ptr<LValue> lvalue;
-    ByteCode::Register left;
-    switch (b.fOperator) {
-        case Token::Kind::LOGICALAND:
-        case Token::Kind::LOGICALANDEQ:
-        case Token::Kind::LOGICALOR:
-        case Token::Kind::LOGICALOREQ:
-            left = result;
-            break;
-        default:
-            left = this->next(SlotCount(*operandType));
-    }
     if (is_assignment(b.fOperator)) {
         lvalue = this->getLValue(*b.fLeft);
-        lvalue->load(left);
+        lvalue->load();
         op = remove_assignment(b.fOperator);
     } else {
-        this->writeExpression(*b.fLeft, left);
+        this->writeExpression(*b.fLeft);
         op = b.fOperator;
         if (!lVecOrMtx && rVecOrMtx) {
-            for (int i = 1; i < SlotCount(rType); ++i) {
-                this->write(ByteCode::Instruction::kCopy);
-                this->write(left + i);
-                this->write(left);
+            for (int i = SlotCount(rType); i > 1; --i) {
+                this->write(ByteCodeInstruction::kDup);
+                this->write8(1);
             }
         }
     }
-    SkDEBUGCODE(TypeCategory tc = type_category(lType));
     int count = std::max(SlotCount(lType), SlotCount(rType));
+    SkDEBUGCODE(TypeCategory tc = type_category(lType));
     switch (op) {
         case Token::Kind::LOGICALAND: {
-            SkASSERT(left.fIndex == result.fIndex);
-            this->write(ByteCode::Instruction::kMaskPush);
-            ++fConditionCount;
-            this->write(left);
-            this->write(ByteCode::Instruction::kBranchIfAllFalse);
+            SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
+            this->write(ByteCodeInstruction::kMaskPush);
+            this->write(ByteCodeInstruction::kBranchIfAllFalse);
             DeferredLocation falseLocation(this);
-            SkASSERT(SlotCount(b.fRight->fType) == 1);
-            ByteCode::Register right = this->next(1);
-            this->writeExpression(*b.fRight, right);
-            this->write(ByteCode::Instruction::kAnd);
-            this->write(result);
-            this->write(left);
-            this->write(right);
+            this->writeExpression(*b.fRight);
+            this->write(ByteCodeInstruction::kAndB);
             falseLocation.set();
-            --fConditionCount;
-            this->write(ByteCode::Instruction::kMaskPop);
-            return;
+            this->write(ByteCodeInstruction::kMaskPop);
+            return false;
         }
         case Token::Kind::LOGICALOR: {
-            SkASSERT(left.fIndex == result.fIndex);
-            ByteCode::Register mask = this->next(1);
-            this->write(ByteCode::Instruction::kNot);
-            this->write(mask);
-            this->write(left);
-            this->write(ByteCode::Instruction::kMaskPush);
-            ++fConditionCount;
-            this->write(mask);
-            this->write(ByteCode::Instruction::kBranchIfAllFalse);
+            SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
+            this->write(ByteCodeInstruction::kNotB);
+            this->write(ByteCodeInstruction::kMaskPush);
+            this->write(ByteCodeInstruction::kBranchIfAllFalse);
             DeferredLocation falseLocation(this);
-            SkASSERT(SlotCount(b.fRight->fType) == 1);
-            ByteCode::Register right = this->next(1);
-            this->writeExpression(*b.fRight, right);
-            this->write(ByteCode::Instruction::kOr);
-            this->write(result);
-            this->write(left);
-            this->write(right);
+            this->writeExpression(*b.fRight);
+            this->write(ByteCodeInstruction::kOrB);
             falseLocation.set();
-            --fConditionCount;
-            this->write(ByteCode::Instruction::kMaskPop);
-            return;
+            this->write(ByteCodeInstruction::kMaskPop);
+            return false;
         }
         case Token::Kind::SHL:
         case Token::Kind::SHR: {
@@ -555,658 +706,775 @@
                                     tc == SkSL::TypeCategory::kUnsigned));
             if (!b.fRight->isConstant()) {
                 fErrors.error(b.fRight->fOffset, "Shift amounts must be constant");
-                return;
+                return false;
             }
             int64_t shift = b.fRight->getConstantInt();
             if (shift < 0 || shift > 31) {
                 fErrors.error(b.fRight->fOffset, "Shift amount out of range");
-                return;
+                return false;
             }
 
             if (op == Token::Kind::SHL) {
-                this->write(ByteCode::Instruction::kShiftLeft);
+                this->write(ByteCodeInstruction::kShiftLeft);
             } else {
                 this->write(type_category(lType) == TypeCategory::kSigned
-                                ? ByteCode::Instruction::kShiftRightS
-                                : ByteCode::Instruction::kShiftRightU);
+                                ? ByteCodeInstruction::kShiftRightS
+                                : ByteCodeInstruction::kShiftRightU);
             }
-            this->write(result);
-            this->write(left);
-            this->write((uint8_t) shift);
-            return;
+            this->write8(shift);
+            return false;
         }
-        case Token::Kind::STAR:
-            // Special case for M*V, V*M, M*M (but not V*V!)
-            if (lType.columns() > 1 && rType.columns() > 1 &&
-                (lType.rows() > 1 || rType.rows() > 1)) {
-                ByteCode::Register right = this->next(SlotCount(rType));
-                this->writeExpression(*b.fRight, right);
-                int rCols = rType.columns(),
-                    rRows = rType.rows(),
-                    lCols = lType.columns(),
-                    lRows = lType.rows();
-                // M*V treats the vector as a column
-                if (rType.kind() == Type::kVector_Kind) {
-                    std::swap(rCols, rRows);
-                }
-                SkASSERT(lCols == rRows);
-                SkASSERT(SlotCount(b.fType) == lRows * rCols);
-                this->write(ByteCode::Instruction::kMatrixMultiply);
-                this->write(result);
-                this->write(left);
-                this->write(right);
-                this->write((uint8_t) lCols);
-                this->write((uint8_t) lRows);
-                this->write((uint8_t) rCols);
-                return;
-            }
 
         default:
             break;
     }
-    ByteCode::Register right = this->next(SlotCount(*operandType));
-    this->writeExpression(*b.fRight, right);
+    this->writeExpression(*b.fRight);
     if (lVecOrMtx && !rVecOrMtx) {
-        for (int i = 1; i < SlotCount(*operandType); ++i) {
-            this->write(ByteCode::Instruction::kCopy);
-            this->write(right + i);
-            this->write(right);
+        for (int i = SlotCount(lType); i > 1; --i) {
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
         }
     }
-    switch (op) {
-        case Token::Kind::EQEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareEQI,
-                                         ByteCode::Instruction::kCompareEQI,
-                                         ByteCode::Instruction::kCompareEQF,
-                                         result);
-            // Collapse to a single bool
-            for (int i = 1; i < count; ++i) {
-                this->write(ByteCode::Instruction::kAnd);
-                this->write(result);
-                this->write(result);
-                this->write(result + i);
-            }
-            break;
-        case Token::Kind::GT:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareGTS,
-                                         ByteCode::Instruction::kCompareGTU,
-                                         ByteCode::Instruction::kCompareGTF,
-                                         result);
-            break;
-        case Token::Kind::GTEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareGTEQS,
-                                         ByteCode::Instruction::kCompareGTEQU,
-                                         ByteCode::Instruction::kCompareGTEQF,
-                                         result);
-            break;
-        case Token::Kind::LT:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareLTS,
-                                         ByteCode::Instruction::kCompareLTU,
-                                         ByteCode::Instruction::kCompareLTF,
-                                         result);
-            break;
-        case Token::Kind::LTEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareLTEQS,
-                                         ByteCode::Instruction::kCompareLTEQU,
-                                         ByteCode::Instruction::kCompareLTEQF,
-                                         result);
-            break;
-        case Token::Kind::MINUS:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kSubtractI,
-                                         ByteCode::Instruction::kSubtractI,
-                                         ByteCode::Instruction::kSubtractF,
-                                         result);
-            break;
-        case Token::Kind::NEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareNEQI,
-                                         ByteCode::Instruction::kCompareNEQI,
-                                         ByteCode::Instruction::kCompareNEQF,
-                                         result);
-            // Collapse to a single bool
-            for (int i = 1; i < count; ++i) {
-                this->write(ByteCode::Instruction::kOr);
-                this->write(result);
-                this->write(result);
-                this->write(result + i);
-            }
-            break;
-        case Token::Kind::PERCENT:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kRemainderS,
-                                         ByteCode::Instruction::kRemainderU,
-                                         ByteCode::Instruction::kRemainderF,
-                                         result);
-            break;
-        case Token::Kind::PLUS:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kAddI,
-                                         ByteCode::Instruction::kAddI,
-                                         ByteCode::Instruction::kAddF,
-                                         result);
-            break;
-        case Token::Kind::SLASH:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kDivideS,
-                                         ByteCode::Instruction::kDivideU,
-                                         ByteCode::Instruction::kDivideF,
-                                         result);
-            break;
-        case Token::Kind::STAR:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kMultiplyI,
-                                         ByteCode::Instruction::kMultiplyI,
-                                         ByteCode::Instruction::kMultiplyF,
-                                         result);
-            break;
-        case Token::Kind::LOGICALXOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kBool);
-            this->write(ByteCode::Instruction::kXor);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
+    // Special case for M*V, V*M, M*M (but not V*V!)
+    if (op == Token::Kind::STAR && lVecOrMtx && rVecOrMtx &&
+        !(lType.kind() == Type::kVector_Kind && rType.kind() == Type::kVector_Kind)) {
+        this->write(ByteCodeInstruction::kMatrixMultiply,
+                    SlotCount(b.fType) - (SlotCount(lType) + SlotCount(rType)));
+        int rCols = rType.columns(),
+            rRows = rType.rows(),
+            lCols = lType.columns(),
+            lRows = lType.rows();
+        // M*V treats the vector as a column
+        if (rType.kind() == Type::kVector_Kind) {
+            std::swap(rCols, rRows);
         }
-        case Token::Kind::BITWISEAND: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kAnd);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
+        SkASSERT(lCols == rRows);
+        SkASSERT(SlotCount(b.fType) == lRows * rCols);
+        this->write8(lCols);
+        this->write8(lRows);
+        this->write8(rCols);
+    } else {
+        switch (op) {
+            case Token::Kind::EQEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareIEQ,
+                                            ByteCodeInstruction::kCompareIEQ,
+                                            ByteCodeInstruction::kCompareFEQ,
+                                            count);
+                // Collapse to a single bool
+                for (int i = count; i > 1; --i) {
+                    this->write(ByteCodeInstruction::kAndB);
+                }
+                break;
+            case Token::Kind::GT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSGT,
+                                            ByteCodeInstruction::kCompareUGT,
+                                            ByteCodeInstruction::kCompareFGT,
+                                            count);
+                break;
+            case Token::Kind::GTEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSGTEQ,
+                                            ByteCodeInstruction::kCompareUGTEQ,
+                                            ByteCodeInstruction::kCompareFGTEQ,
+                                            count);
+                break;
+            case Token::Kind::LT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSLT,
+                                            ByteCodeInstruction::kCompareULT,
+                                            ByteCodeInstruction::kCompareFLT,
+                                            count);
+                break;
+            case Token::Kind::LTEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSLTEQ,
+                                            ByteCodeInstruction::kCompareULTEQ,
+                                            ByteCodeInstruction::kCompareFLTEQ,
+                                            count);
+                break;
+            case Token::Kind::MINUS:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            count);
+                break;
+            case Token::Kind::NEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareINEQ,
+                                            ByteCodeInstruction::kCompareINEQ,
+                                            ByteCodeInstruction::kCompareFNEQ,
+                                            count);
+                // Collapse to a single bool
+                for (int i = count; i > 1; --i) {
+                    this->write(ByteCodeInstruction::kOrB);
+                }
+                break;
+            case Token::Kind::PERCENT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kRemainderS,
+                                            ByteCodeInstruction::kRemainderU,
+                                            ByteCodeInstruction::kRemainderF,
+                                            count);
+                break;
+            case Token::Kind::PLUS:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            count);
+                break;
+            case Token::Kind::SLASH:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kDivideS,
+                                            ByteCodeInstruction::kDivideU,
+                                            ByteCodeInstruction::kDivideF,
+                                            count);
+                break;
+            case Token::Kind::STAR:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kMultiplyI,
+                                            ByteCodeInstruction::kMultiplyI,
+                                            ByteCodeInstruction::kMultiplyF,
+                                            count);
+                break;
+
+            case Token::Kind::LOGICALXOR:
+                SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+                this->write(ByteCodeInstruction::kXorB);
+                break;
+
+            case Token::Kind::BITWISEAND:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kAndB);
+                break;
+            case Token::Kind::BITWISEOR:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kOrB);
+                break;
+            case Token::Kind::BITWISEXOR:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kXorB);
+                break;
+
+            default:
+                fErrors.error(b.fOffset, SkSL::String::printf("Unsupported binary operator '%s'",
+                                                              Compiler::OperatorName(op)));
+                break;
         }
-        case Token::Kind::BITWISEOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kOr);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
-        }
-        case Token::Kind::BITWISEXOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kXor);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
-        }
-        default:
-            fErrors.error(b.fOffset, SkSL::String::printf("Unsupported binary operator '%s'",
-                                                          Compiler::OperatorName(op)));
-            break;
     }
     if (lvalue) {
-        lvalue->store(result);
+        lvalue->store(discard);
+        discard = false;
     }
+    return discard;
 }
 
-void ByteCodeGenerator::writeConstructor(const Constructor& c, ByteCode::Register result) {
-    if (c.fType.rows() > 1) {
-        if (c.fArguments.size() == 1) {
-            if (SlotCount(c.fArguments[0]->fType) == 1) {
-                ByteCode::Register v = this->next(1);
-                this->writeExpression(*c.fArguments[0], v);
-                this->write(ByteCode::Instruction::kScalarToMatrix);
-                this->write(result);
-                this->write(v);
-                this->write((uint8_t) c.fType.columns());
-                this->write((uint8_t) c.fType.rows());
-                return;
-            } else if (c.fArguments[0]->fType.rows() > 1) {
-                ByteCode::Register v = this->next(SlotCount(c.fArguments[0]->fType));
-                this->writeExpression(*c.fArguments[0], v);
-                this->write(ByteCode::Instruction::kMatrixToMatrix);
-                this->write(result);
-                this->write(v);
-                this->write((uint8_t) c.fArguments[0]->fType.columns());
-                this->write((uint8_t) c.fArguments[0]->fType.rows());
-                this->write((uint8_t) c.fType.columns());
-                this->write((uint8_t) c.fType.rows());
-                return;
+void ByteCodeGenerator::writeBoolLiteral(const BoolLiteral& b) {
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(b.fValue ? ~0 : 0);
+}
+
+void ByteCodeGenerator::writeConstructor(const Constructor& c) {
+    for (const auto& arg : c.fArguments) {
+        this->writeExpression(*arg);
+    }
+    if (c.fArguments.size() == 1) {
+        const Type& inType = c.fArguments[0]->fType;
+        const Type& outType = c.fType;
+        TypeCategory inCategory = type_category(inType);
+        TypeCategory outCategory = type_category(outType);
+        int inCount = SlotCount(inType);
+        int outCount = SlotCount(outType);
+        if (inCategory != outCategory) {
+            SkASSERT(inCount == outCount);
+            if (inCategory == TypeCategory::kFloat) {
+                SkASSERT(outCategory == TypeCategory::kSigned ||
+                         outCategory == TypeCategory::kUnsigned);
+                this->write(vector_instruction(ByteCodeInstruction::kConvertFtoI, outCount));
+            } else if (outCategory == TypeCategory::kFloat) {
+                if (inCategory == TypeCategory::kSigned) {
+                    this->write(vector_instruction(ByteCodeInstruction::kConvertStoF, outCount));
+                } else {
+                    SkASSERT(inCategory == TypeCategory::kUnsigned);
+                    this->write(vector_instruction(ByteCodeInstruction::kConvertUtoF, outCount));
+                }
+            } else {
+                SkASSERT(false);
             }
         }
-        int offset = 0;
-        for (const auto& arg : c.fArguments) {
-            this->writeExpression(*arg, ByteCode::Register{(uint16_t) (result.fIndex + offset)});
-            offset += SlotCount(arg->fType);
-        }
-        return;
-    }
-    if (c.fArguments.size() == 1 && c.fArguments[0]->fType.columns() == 1 &&
-        c.fType.columns() > 1) {
-        SkASSERT(SlotCount(c.fArguments[0]->fType) == 1);
-        ByteCode::Register v = result;
-        this->writeExpression(*c.fArguments[0], v);
-        for (int i = 1; i < c.fType.columns(); ++i) {
-            this->write(ByteCode::Instruction::kCopy);
-            this->write(v + i);
-            this->write(v);
-        }
-        return;
-    }
-    ByteCode::Instruction inst;
-    switch (type_category(c.fArguments[0]->fType)) {
-        case TypeCategory::kSigned:
-            if (type_category(c.fType) == TypeCategory::kFloat) {
-                inst = ByteCode::Instruction::kSignedToFloat;
+        if (inType.kind() == Type::kMatrix_Kind && outType.kind() == Type::kMatrix_Kind) {
+            this->write(ByteCodeInstruction::kMatrixToMatrix,
+                        SlotCount(outType) - SlotCount(inType));
+            this->write8(inType.columns());
+            this->write8(inType.rows());
+            this->write8(outType.columns());
+            this->write8(outType.rows());
+        } else if (inCount != outCount) {
+            SkASSERT(inCount == 1);
+            if (outType.kind() == Type::kMatrix_Kind) {
+                this->write(ByteCodeInstruction::kScalarToMatrix, SlotCount(outType) - 1);
+                this->write8(outType.columns());
+                this->write8(outType.rows());
             } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        case TypeCategory::kUnsigned:
-            if (type_category(c.fType) == TypeCategory::kFloat) {
-                inst = ByteCode::Instruction::kUnsignedToFloat;
-            } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        case TypeCategory::kFloat:
-            if (type_category(c.fType) == TypeCategory::kSigned) {
-                inst = ByteCode::Instruction::kFloatToSigned;
-            } else if (type_category(c.fType) == TypeCategory::kUnsigned) {
-                inst = ByteCode::Instruction::kFloatToUnsigned;
-            } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        default:
-            SkASSERT(false);
-            return;
-    }
-    ByteCode::Register values;
-    if (inst == ByteCode::Instruction::kNop) {
-        values = result;
-    } else {
-        values = this->next(SlotCount(c.fType));
-    }
-    ByteCode::Register v = values;
-    for (size_t i = 0; i < c.fArguments.size(); ++i) {
-        this->writeExpression(*c.fArguments[i], v);
-        v.fIndex += SlotCount(c.fArguments[i]->fType);
-    }
-    if (inst != ByteCode::Instruction::kNop) {
-        v = values;
-        ByteCode::Register target = result;
-        for (size_t i = 0; i < c.fArguments.size(); ++i) {
-            int count = SlotCount(c.fArguments[i]->fType);
-            for (int j = 0; j < count; ++j) {
-                this->write(inst);
-                this->write(target);
-                ++target.fIndex;
-                this->write(v + j);
+                SkASSERT(outType.kind() == Type::kVector_Kind);
+                for (; inCount != outCount; ++inCount) {
+                    this->write(ByteCodeInstruction::kDup);
+                    this->write8(1);
+                }
             }
         }
     }
 }
 
-void ByteCodeGenerator::writeExternalFunctionCall(const ExternalFunctionCall& f,
-                                                  ByteCode::Register result) {
+void ByteCodeGenerator::writeExternalFunctionCall(const ExternalFunctionCall& f) {
     int argumentCount = 0;
     for (const auto& arg : f.fArguments) {
+        this->writeExpression(*arg);
         argumentCount += SlotCount(arg->fType);
     }
-    ByteCode::Register args = this->next(argumentCount);
-    argumentCount = 0;
-    for (const auto& arg : f.fArguments) {
-        this->writeExpression(*arg, args + argumentCount);
-        argumentCount += SlotCount(arg->fType);
-    }
-    this->write(ByteCode::Instruction::kCallExternal);
-    this->write(result);
+    this->write(ByteCodeInstruction::kCallExternal, SlotCount(f.fType) - argumentCount);
+    SkASSERT(argumentCount <= 255);
+    this->write8(argumentCount);
+    this->write8(SlotCount(f.fType));
     int index = fOutput->fExternalValues.size();
     fOutput->fExternalValues.push_back(f.fFunction);
     SkASSERT(index <= 255);
-    this->write((uint8_t) index);
-    SkASSERT(SlotCount(f.fType) <= 255);
-    this->write((uint8_t) SlotCount(f.fType));
-    this->write(args);
-    SkASSERT(argumentCount <= 255);
-    this->write((uint8_t) argumentCount);
+    this->write8(index);
 }
 
-void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e,
-                                           ByteCode::Register result) {
-    this->write(ByteCode::Instruction::kReadExternal);
-    this->write(result);
-    this->write((uint8_t) SlotCount(e.fValue->type()));
+void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e) {
+    int count = SlotCount(e.fValue->type());
+    this->write(vector_instruction(ByteCodeInstruction::kReadExternal, count));
+    this->write8(count);
     int index = fOutput->fExternalValues.size();
     fOutput->fExternalValues.push_back(e.fValue);
     SkASSERT(index <= 255);
-    this->write((uint8_t) index);
+    this->write8(index);
 }
 
-void ByteCodeGenerator::writeIntrinsicCall(const FunctionCall& c, Intrinsic intrinsic,
-                                           ByteCode::Register result) {
-    if (intrinsic.fIsSpecial) {
-        switch (intrinsic.fValue.fSpecial) {
+void ByteCodeGenerator::writeVariableExpression(const Expression& expr) {
+    Location location = this->getLocation(expr);
+    int count = SlotCount(expr.fType);
+    if (location.isOnStack() || count > 4) {
+        if (!location.isOnStack()) {
+            this->write(ByteCodeInstruction::kPushImmediate);
+            this->write32(location.fSlot);
+        }
+        this->write(location.selectLoad(ByteCodeInstruction::kLoadExtended,
+                                        ByteCodeInstruction::kLoadExtendedGlobal,
+                                        ByteCodeInstruction::kLoadExtendedUniform),
+                    count);
+        this->write8(count);
+    } else {
+        this->write(vector_instruction(location.selectLoad(ByteCodeInstruction::kLoad,
+                                                           ByteCodeInstruction::kLoadGlobal,
+                                                           ByteCodeInstruction::kLoadUniform),
+                                       count));
+        this->write8(count);
+        this->write8(location.fSlot);
+    }
+}
+
+static inline uint32_t float_to_bits(float x) {
+    uint32_t u;
+    memcpy(&u, &x, sizeof(uint32_t));
+    return u;
+}
+
+void ByteCodeGenerator::writeFloatLiteral(const FloatLiteral& f) {
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(float_to_bits(f.fValue));
+}
+
+void ByteCodeGenerator::writeIntrinsicCall(const FunctionCall& c) {
+    auto found = fIntrinsics.find(c.fFunction.fName);
+    if (found == fIntrinsics.end()) {
+        fErrors.error(c.fOffset, String::printf("Unsupported intrinsic: '%s'",
+                                                String(c.fFunction.fName).c_str()));
+        return;
+    }
+    int count = SlotCount(c.fArguments[0]->fType);
+    if (found->second.fIsSpecial) {
+        SpecialIntrinsic special = found->second.fValue.fSpecial;
+        switch (special) {
             case SpecialIntrinsic::kDot: {
                 SkASSERT(c.fArguments.size() == 2);
-                int count = SlotCount(c.fArguments[0]->fType);
-                ByteCode::Register left = this->next(count);
-                this->writeExpression(*c.fArguments[0], left);
-                ByteCode::Register right = this->next(count);
-                this->writeExpression(*c.fArguments[1], right);
-                ByteCode::Register product = this->next(count);
-                for (int i = 0; i < count; ++i) {
-                    this->writeTypedInstruction(c.fType,
-                                                ByteCode::Instruction::kMultiplyI,
-                                                ByteCode::Instruction::kMultiplyI,
-                                                ByteCode::Instruction::kMultiplyF);
-                    this->write(product + i);
-                    this->write(left + i);
-                    this->write(right + i);
-                }
-                ByteCode::Register total = product;
-                for (int i = 1; i < count; ++i) {
-                    this->writeTypedInstruction(c.fType,
-                                                ByteCode::Instruction::kAddI,
-                                                ByteCode::Instruction::kAddI,
-                                                ByteCode::Instruction::kAddF);
-                    ByteCode::Register sum = i == count - 1 ? result : this->next(1);
-                    this->write(sum);
-                    this->write(total);
-                    this->write(product + i);
-                    total = sum;
+                SkASSERT(count == SlotCount(c.fArguments[1]->fType));
+                this->write(vector_instruction(ByteCodeInstruction::kMultiplyF, count));
+                this->write8(count);
+                for (int i = count; i > 1; --i) {
+                    this->write(ByteCodeInstruction::kAddF);
+                    this->write8(1);
                 }
                 break;
             }
-            case SpecialIntrinsic::kInverse: {
-                SkASSERT(c.fArguments.size() == 1);
-                int count = SlotCount(c.fArguments[0]->fType);
-                ByteCode::Register arg = this->next(count);
-                this->writeExpression(*c.fArguments[0], arg);
-                switch (SlotCount(c.fArguments[0]->fType)) {
-                    case 4:  this->write(ByteCode::Instruction::kInverse2x2); break;
-                    case 9:  this->write(ByteCode::Instruction::kInverse3x3); break;
-                    case 16: this->write(ByteCode::Instruction::kInverse4x4); break;
-                    default: SkASSERT(false);
-                }
-                this->write(result);
-                this->write(arg);
-                break;
-            }
+            default:
+                SkASSERT(false);
         }
     } else {
-        std::vector<ByteCode::Register> argRegs;
-        for (const auto& expr : c.fArguments) {
-            ByteCode::Register reg = this->next(SlotCount(expr->fType));
-            this->writeExpression(*expr, reg);
-            argRegs.push_back(reg);
-        }
-        this->write(intrinsic.fValue.fInstruction);
-        if (c.fType.fName != "void") {
-            this->write(result);
-        }
-        for (ByteCode::Register arg : argRegs) {
-            this->write(arg);
+        switch (found->second.fValue.fInstruction) {
+            case ByteCodeInstruction::kCos:
+            case ByteCodeInstruction::kSin:
+            case ByteCodeInstruction::kTan:
+                SkASSERT(c.fArguments.size() > 0);
+                this->write(vector_instruction(found->second.fValue.fInstruction, count));
+                this->write8(count);
+                break;
+            case ByteCodeInstruction::kSqrt:
+                SkASSERT(c.fArguments.size() > 0);
+                this->write(vector_instruction(found->second.fValue.fInstruction, count));
+                break;
+            case ByteCodeInstruction::kInverse2x2: {
+                SkASSERT(c.fArguments.size() > 0);
+                auto op = ByteCodeInstruction::kInverse2x2;
+                switch (count) {
+                    case 4: break;  // float2x2
+                    case 9:  op = ByteCodeInstruction::kInverse3x3; break;
+                    case 16: op = ByteCodeInstruction::kInverse4x4; break;
+                    default: SkASSERT(false);
+                }
+                this->write(op);
+                break;
+            }
+            default:
+                SkASSERT(false);
         }
     }
 }
 
-void ByteCodeGenerator::writeFunctionCall(const FunctionCall& c, ByteCode::Register result) {
-    auto found = fIntrinsics.find(c.fFunction.fName);
-    if (found != fIntrinsics.end()) {
-        return this->writeIntrinsicCall(c, found->second, result);
-    }
-    int argCount = c.fArguments.size();
-    std::vector<std::unique_ptr<LValue>> lvalues;
-    int parameterSlotCount = 0;
-    for (const auto& p : c.fFunction.fParameters) {
-        parameterSlotCount += SlotCount(p->fType);
-    }
-    ByteCode::Register argStart = this->next(parameterSlotCount);
-    ByteCode::Register nextArg = argStart;
-    for (int i = 0; i < argCount; ++i) {
-        const auto& param = c.fFunction.fParameters[i];
-        const auto& arg = c.fArguments[i];
-        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
-            lvalues.emplace_back(this->getLValue(*arg));
-            lvalues.back()->load(nextArg);
-        } else {
-            this->writeExpression(*arg, nextArg);
-        }
-        nextArg.fIndex += SlotCount(arg->fType);
-    }
+void ByteCodeGenerator::writeFunctionCall(const FunctionCall& f) {
     // Find the index of the function we're calling. We explicitly do not allow calls to functions
     // before they're defined. This is an easy-to-understand rule that prevents recursion.
-    size_t idx;
-    for (idx = 0; idx < fFunctions.size(); ++idx) {
-        if (c.fFunction.matches(fFunctions[idx]->fDeclaration)) {
+    int idx = -1;
+    for (size_t i = 0; i < fFunctions.size(); ++i) {
+        if (f.fFunction.matches(fFunctions[i]->fDeclaration)) {
+            idx = i;
             break;
         }
     }
-    if (idx > 255) {
-        fErrors.error(c.fOffset, "Function count limit exceeded");
-        return;
-    } else if (idx >= fOutput->fFunctions.size()) {
-        fErrors.error(c.fOffset, "Call to undefined function");
-        return;
-    }
-
-    this->write(ByteCode::Instruction::kCall);
-    this->write(result);
-    this->write((uint8_t) idx);
-    this->write(argStart);
-    nextArg = argStart;
-    auto lvalue = lvalues.begin();
-    for (int i = 0; i < argCount; ++i) {
-        const auto& param = c.fFunction.fParameters[i];
-        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
-            (*(lvalue++))->store(nextArg);
+    if (idx == -1) {
+        for (const auto& arg : f.fArguments) {
+            this->writeExpression(*arg);
         }
-        nextArg.fIndex += SlotCount(param->fType);
+        this->writeIntrinsicCall(f);
+        return;
     }
+
+
+    if (idx > 255) {
+        fErrors.error(f.fOffset, "Function count limit exceeded");
+        return;
+    } else if (idx >= (int) fFunctions.size()) {
+        fErrors.error(f.fOffset, "Call to undefined function");
+        return;
+    }
+
+    // Out parameters are loaded before the call and stored back after it, so ordering matters
+    if (int returnCount = SlotCount(f.fType)) {
+        this->write(ByteCodeInstruction::kReserve, returnCount);
+        this->write8(returnCount);
+    }
+
+    int argCount = f.fArguments.size();
+    std::vector<std::unique_ptr<LValue>> lvalues;
+    for (int i = 0; i < argCount; ++i) {
+        const auto& param = f.fFunction.fParameters[i];
+        const auto& arg = f.fArguments[i];
+        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
+            lvalues.emplace_back(this->getLValue(*arg));
+            lvalues.back()->load();
+        } else {
+            this->writeExpression(*arg);
+        }
+    }
+
+    // The space used by the call is based on the callee, but it also unwinds all of that before
+    // we continue execution. We adjust our max stack depths below.
+    this->write(ByteCodeInstruction::kCall);
+    this->write8(idx);
+
+    const ByteCodeFunction* callee = fOutput->fFunctions[idx].get();
+    fMaxLoopCount      = std::max(fMaxLoopCount,      fLoopCount      + callee->fLoopCount);
+    fMaxConditionCount = std::max(fMaxConditionCount, fConditionCount + callee->fConditionCount);
+    fMaxStackCount     = std::max(fMaxStackCount,     fStackCount     + callee->fLocalCount
+                                                                      + callee->fStackCount);
+
+    // After the called function returns, the stack will still contain our arguments. We have to
+    // pop them (storing any out parameters back to their lvalues as we go). We glob together slot
+    // counts for all parameters that aren't out-params, so we can pop them in one big chunk.
+    int popCount = 0;
+    auto pop = [&]() {
+        if (popCount > 4) {
+            this->write(ByteCodeInstruction::kPopN, popCount);
+            this->write8(popCount);
+        } else if (popCount > 0) {
+            this->write(vector_instruction(ByteCodeInstruction::kPop, popCount));
+        }
+        popCount = 0;
+    };
+
+    for (int i = argCount - 1; i >= 0; --i) {
+        const auto& param = f.fFunction.fParameters[i];
+        const auto& arg = f.fArguments[i];
+        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
+            pop();
+            lvalues.back()->store(true);
+            lvalues.pop_back();
+        } else {
+            popCount += SlotCount(arg->fType);
+        }
+    }
+    pop();
 }
 
-void ByteCodeGenerator::incOrDec(Token::Kind op, Expression& operand, bool prefix,
-                                 ByteCode::Register result) {
-    SkASSERT(op == Token::Kind::PLUSPLUS || op == Token::Kind::MINUSMINUS);
-    std::unique_ptr<LValue> lvalue = this->getLValue(operand);
-    SkASSERT(SlotCount(operand.fType) == 1);
-    ByteCode::Register value;
-    if (prefix) {
-        value = this->next(1);
-    } else {
-        value = result;
-    }
-    lvalue->load(value);
-    ByteCode::Register one = this->next(1);
-    this->write(ByteCode::Instruction::kImmediate);
-    this->write(one);
-    if (type_category(operand.fType) == TypeCategory::kFloat) {
-        this->write(ByteCode::Immediate(1.0f));
-    } else {
-        this->write(ByteCode::Immediate((int32_t) 1));
-    }
-    if (op == Token::Kind::PLUSPLUS) {
-        this->writeTypedInstruction(operand.fType,
-                                    ByteCode::Instruction::kAddI,
-                                    ByteCode::Instruction::kAddI,
-                                    ByteCode::Instruction::kAddF);
-    } else {
-        this->writeTypedInstruction(operand.fType,
-                                    ByteCode::Instruction::kSubtractI,
-                                    ByteCode::Instruction::kSubtractI,
-                                    ByteCode::Instruction::kSubtractF);
-    }
-    if (prefix) {
-        this->write(result);
-        this->write(value);
-        this->write(one);
-        lvalue->store(result);
-    } else {
-        ByteCode::Register temp = this->next(1);
-        this->write(temp);
-        this->write(value);
-        this->write(one);
-        lvalue->store(temp);
-    }
+void ByteCodeGenerator::writeIntLiteral(const IntLiteral& i) {
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(i.fValue);
 }
 
-void ByteCodeGenerator::writePostfixExpression(const PostfixExpression& p,
-                                               ByteCode::Register result) {
-    this->incOrDec(p.fOperator, *p.fOperand, false, result);
+void ByteCodeGenerator::writeNullLiteral(const NullLiteral& n) {
+    // not yet implemented
+    abort();
 }
 
-void ByteCodeGenerator::writePrefixExpression(const PrefixExpression& p,
-                                              ByteCode::Register result) {
+bool ByteCodeGenerator::writePrefixExpression(const PrefixExpression& p, bool discard) {
     switch (p.fOperator) {
-        case Token::Kind::PLUSPLUS:
+        case Token::Kind::PLUSPLUS: // fall through
         case Token::Kind::MINUSMINUS: {
-            return this->incOrDec(p.fOperator, *p.fOperand, true, result);
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);
+            std::unique_ptr<LValue> lvalue = this->getLValue(*p.fOperand);
+            lvalue->load();
+            this->write(ByteCodeInstruction::kPushImmediate);
+            this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
+            if (p.fOperator == Token::Kind::PLUSPLUS) {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            1);
+            } else {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            1);
+            }
+            lvalue->store(discard);
+            discard = false;
+            break;
         }
         case Token::Kind::MINUS: {
-            ByteCode::Register src = this->next(SlotCount(p.fType));
-            this->writeExpression(*p.fOperand, src);
-            for (int i = 0; i < SlotCount(p.fType); ++i) {
-                this->writeTypedInstruction(p.fType,
-                                            ByteCode::Instruction::kNegateS,
-                                            ByteCode::Instruction::kNegateS,
-                                            ByteCode::Instruction::kNegateF);
-                this->write(result + i);
-                this->write(src + i);
-            }
+            this->writeExpression(*p.fOperand);
+            this->writeTypedInstruction(p.fType,
+                                        ByteCodeInstruction::kNegateI,
+                                        ByteCodeInstruction::kNegateI,
+                                        ByteCodeInstruction::kNegateF,
+                                        SlotCount(p.fOperand->fType),
+                                        false);
             break;
         }
         case Token::Kind::LOGICALNOT:
         case Token::Kind::BITWISENOT: {
-            ByteCode::Register src = this->next(SlotCount(p.fType));
-            this->writeExpression(*p.fOperand, src);
-            for (int i = 0; i < SlotCount(p.fType); ++i) {
-                this->write(ByteCode::Instruction::kNot);
-                this->write(result + i);
-                this->write(src + i);
-            }
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);
+            SkDEBUGCODE(TypeCategory tc = type_category(p.fOperand->fType));
+            SkASSERT((p.fOperator == Token::Kind::LOGICALNOT && tc == TypeCategory::kBool) ||
+                     (p.fOperator == Token::Kind::BITWISENOT && (tc == TypeCategory::kSigned ||
+                                                                 tc == TypeCategory::kUnsigned)));
+            this->writeExpression(*p.fOperand);
+            this->write(ByteCodeInstruction::kNotB);
             break;
         }
         default:
             SkASSERT(false);
     }
+    return discard;
 }
 
-void ByteCodeGenerator::writeSwizzle(const Swizzle& s, ByteCode::Register result) {
+bool ByteCodeGenerator::writePostfixExpression(const PostfixExpression& p, bool discard) {
+    switch (p.fOperator) {
+        case Token::Kind::PLUSPLUS: // fall through
+        case Token::Kind::MINUSMINUS: {
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);
+            std::unique_ptr<LValue> lvalue = this->getLValue(*p.fOperand);
+            lvalue->load();
+            // If we're not supposed to discard the result, then make a copy *before* the +/-
+            if (!discard) {
+                this->write(ByteCodeInstruction::kDup);
+                this->write8(1);
+            }
+            this->write(ByteCodeInstruction::kPushImmediate);
+            this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
+            if (p.fOperator == Token::Kind::PLUSPLUS) {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            1);
+            } else {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            1);
+            }
+            // Always consume the result as part of the store
+            lvalue->store(true);
+            discard = false;
+            break;
+        }
+        default:
+            SkASSERT(false);
+    }
+    return discard;
+}
+
+void ByteCodeGenerator::writeSwizzle(const Swizzle& s) {
     if (swizzle_is_simple(s)) {
-        this->writeVariableExpression(s, result);
+        this->writeVariableExpression(s);
         return;
     }
-    ByteCode::Register base = this->writeExpression(*s.fBase);
-    for (int i = 0; i < (int) s.fComponents.size(); ++i) {
-        this->write(ByteCode::Instruction::kCopy);
-        this->write(result + i);
-        this->write(base + s.fComponents[i]);
+
+    switch (s.fBase->fKind) {
+        case Expression::kVariableReference_Kind: {
+            Location location = this->getLocation(*s.fBase);
+            this->write(location.selectLoad(ByteCodeInstruction::kLoadSwizzle,
+                                            ByteCodeInstruction::kLoadSwizzleGlobal,
+                                            ByteCodeInstruction::kLoadSwizzleUniform),
+                        s.fComponents.size());
+            this->write8(location.fSlot);
+            this->write8(s.fComponents.size());
+            for (int c : s.fComponents) {
+                this->write8(c);
+            }
+            break;
+        }
+        default:
+            this->writeExpression(*s.fBase);
+            this->write(ByteCodeInstruction::kSwizzle,
+                        s.fComponents.size() - s.fBase->fType.columns());
+            this->write8(s.fBase->fType.columns());
+            this->write8(s.fComponents.size());
+            for (int c : s.fComponents) {
+                this->write8(c);
+            }
     }
 }
 
-void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t,
-                                               ByteCode::Register result) {
+void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t) {
     int count = SlotCount(t.fType);
     SkASSERT(count == SlotCount(t.fIfTrue->fType));
     SkASSERT(count == SlotCount(t.fIfFalse->fType));
 
-    ByteCode::Register test = this->writeExpression(*t.fTest);
-    this->write(ByteCode::Instruction::kMaskPush);
-    ++fConditionCount;
-    this->write(test);
-    ByteCode::Register ifTrue = this->writeExpression(*t.fIfTrue);
-    this->write(ByteCode::Instruction::kMaskNegate);
-    ByteCode::Register ifFalse = this->writeExpression(*t.fIfFalse);
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kMaskPop);
-    for (int i = 0; i < count; ++i) {
-        this->write(ByteCode::Instruction::kSelect);
-        this->write(result + i);
-        this->write(test);
-        this->write(ifTrue + i);
-        this->write(ifFalse + i);
-    }
+    this->writeExpression(*t.fTest);
+    this->write(ByteCodeInstruction::kMaskPush);
+    this->writeExpression(*t.fIfTrue);
+    this->write(ByteCodeInstruction::kMaskNegate);
+    this->writeExpression(*t.fIfFalse);
+    this->write(ByteCodeInstruction::kMaskBlend, count);
+    this->write8(count);
 }
 
-void ByteCodeGenerator::writeVariableExpression(const Expression& expr,
-                                                ByteCode::Register result) {
-    ByteCodeGenerator::Location location = this->getLocation(expr);
-    int count = SlotCount(expr.fType);
-    for (int i = 0; i < count; ++i) {
-        ByteCodeGenerator::Location final = location.offset(*this, i);
-        this->write(this->getLoadInstruction(location, this->getStorage(expr)));
-        this->write(result + i);
-        this->write(final);
-    }
-}
-
-void ByteCodeGenerator::writeExpression(const Expression& expr, ByteCode::Register result) {
-    switch (expr.fKind) {
-        case Expression::kBoolLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((int32_t) (((BoolLiteral&) expr).fValue ? -1 : 0)));
+void ByteCodeGenerator::writeExpression(const Expression& e, bool discard) {
+    switch (e.fKind) {
+        case Expression::kBinary_Kind:
+            discard = this->writeBinaryExpression((BinaryExpression&) e, discard);
             break;
-        }
-        case Expression::kBinary_Kind: {
-            this->writeBinaryExpression((BinaryExpression&) expr, result);
+        case Expression::kBoolLiteral_Kind:
+            this->writeBoolLiteral((BoolLiteral&) e);
             break;
-        }
-        case Expression::kConstructor_Kind: {
-            this->writeConstructor((Constructor&) expr, result);
+        case Expression::kConstructor_Kind:
+            this->writeConstructor((Constructor&) e);
             break;
-        }
         case Expression::kExternalFunctionCall_Kind:
-            this->writeExternalFunctionCall((ExternalFunctionCall&) expr, result);
+            this->writeExternalFunctionCall((ExternalFunctionCall&) e);
             break;
         case Expression::kExternalValue_Kind:
-            this->writeExternalValue((ExternalValueReference&) expr, result);
-            break;
-        case Expression::kFloatLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((float) ((FloatLiteral&) expr).fValue));
-            break;
-        }
-        case Expression::kFunctionCall_Kind: {
-            this->writeFunctionCall((FunctionCall&) expr, result);
-            break;
-        }
-        case Expression::kIntLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((int32_t) ((IntLiteral&) expr).fValue));
-            break;
-        }
-        case Expression::kPostfix_Kind:
-            this->writePostfixExpression((PostfixExpression&) expr, result);
-            break;
-        case Expression::kPrefix_Kind:
-            this->writePrefixExpression((PrefixExpression&) expr, result);
-            break;
-        case Expression::kSwizzle_Kind:
-            this->writeSwizzle((Swizzle&) expr, result);
-            break;
-        case Expression::kTernary_Kind:
-            this->writeTernaryExpression((TernaryExpression&) expr, result);
+            this->writeExternalValue((ExternalValueReference&) e);
             break;
         case Expression::kFieldAccess_Kind:
         case Expression::kIndex_Kind:
         case Expression::kVariableReference_Kind:
-            this->writeVariableExpression(expr, result);
+            this->writeVariableExpression(e);
+            break;
+        case Expression::kFloatLiteral_Kind:
+            this->writeFloatLiteral((FloatLiteral&) e);
+            break;
+        case Expression::kFunctionCall_Kind:
+            this->writeFunctionCall((FunctionCall&) e);
+            break;
+        case Expression::kIntLiteral_Kind:
+            this->writeIntLiteral((IntLiteral&) e);
+            break;
+        case Expression::kNullLiteral_Kind:
+            this->writeNullLiteral((NullLiteral&) e);
+            break;
+        case Expression::kPrefix_Kind:
+            discard = this->writePrefixExpression((PrefixExpression&) e, discard);
+            break;
+        case Expression::kPostfix_Kind:
+            discard = this->writePostfixExpression((PostfixExpression&) e, discard);
+            break;
+        case Expression::kSwizzle_Kind:
+            this->writeSwizzle((Swizzle&) e);
+            break;
+        case Expression::kTernary_Kind:
+            this->writeTernaryExpression((TernaryExpression&) e);
             break;
         default:
 #ifdef SK_DEBUG
-            ABORT("unsupported lvalue %s\n", expr.description().c_str());
+            printf("unsupported expression %s\n", e.description().c_str());
 #endif
-            break;
+            SkASSERT(false);
+    }
+    if (discard) {
+        int count = SlotCount(e.fType);
+        if (count > 4) {
+            this->write(ByteCodeInstruction::kPopN, count);
+            this->write8(count);
+        } else if (count != 0) {
+            this->write(vector_instruction(ByteCodeInstruction::kPop, count));
+        }
+        discard = false;
     }
 }
 
-ByteCode::Register ByteCodeGenerator::writeExpression(const Expression& expr) {
-    ByteCode::Register result = this->next(SlotCount(expr.fType));
-    this->writeExpression(expr, result);
-    return result;
+class ByteCodeExternalValueLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeExternalValueLValue(ByteCodeGenerator* generator, ExternalValue& value, int index)
+        : INHERITED(*generator)
+        , fCount(ByteCodeGenerator::SlotCount(value.type()))
+        , fIndex(index) {}
+
+    void load() override {
+        fGenerator.write(vector_instruction(ByteCodeInstruction::kReadExternal, fCount));
+        fGenerator.write8(fCount);
+        fGenerator.write8(fIndex);
+    }
+
+    void store(bool discard) override {
+        if (!discard) {
+            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, fCount));
+            fGenerator.write8(fCount);
+        }
+        fGenerator.write(vector_instruction(ByteCodeInstruction::kWriteExternal, fCount));
+        fGenerator.write8(fCount);
+        fGenerator.write8(fIndex);
+    }
+
+private:
+    typedef LValue INHERITED;
+
+    int fCount;
+
+    int fIndex;
+};
+
+class ByteCodeSwizzleLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeSwizzleLValue(ByteCodeGenerator* generator, const Swizzle& swizzle)
+        : INHERITED(*generator)
+        , fSwizzle(swizzle) {}
+
+    void load() override {
+        fGenerator.writeSwizzle(fSwizzle);
+    }
+
+    void store(bool discard) override {
+        int count = fSwizzle.fComponents.size();
+        if (!discard) {
+            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
+            fGenerator.write8(count);
+        }
+        ByteCodeGenerator::Location location = fGenerator.getLocation(*fSwizzle.fBase);
+        if (location.isOnStack()) {
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreSwizzleIndirect,
+                                                  ByteCodeInstruction::kStoreSwizzleIndirectGlobal),
+                             count);
+        } else {
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreSwizzle,
+                                                  ByteCodeInstruction::kStoreSwizzleGlobal),
+                             count);
+            fGenerator.write8(location.fSlot);
+        }
+        fGenerator.write8(count);
+        for (int c : fSwizzle.fComponents) {
+            fGenerator.write8(c);
+        }
+    }
+
+private:
+    const Swizzle& fSwizzle;
+
+    typedef LValue INHERITED;
+};
+
+class ByteCodeExpressionLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeExpressionLValue(ByteCodeGenerator* generator, const Expression& expr)
+        : INHERITED(*generator)
+        , fExpression(expr) {}
+
+    void load() override {
+        fGenerator.writeVariableExpression(fExpression);
+    }
+
+    void store(bool discard) override {
+        int count = ByteCodeGenerator::SlotCount(fExpression.fType);
+        if (!discard) {
+            if (count > 4) {
+                fGenerator.write(ByteCodeInstruction::kDupN, count);
+                fGenerator.write8(count);
+            } else {
+                fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
+                fGenerator.write8(count);
+            }
+        }
+        ByteCodeGenerator::Location location = fGenerator.getLocation(fExpression);
+        if (location.isOnStack() || count > 4) {
+            if (!location.isOnStack()) {
+                fGenerator.write(ByteCodeInstruction::kPushImmediate);
+                fGenerator.write32(location.fSlot);
+            }
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreExtended,
+                                                  ByteCodeInstruction::kStoreExtendedGlobal),
+                             count);
+            fGenerator.write8(count);
+        } else {
+            fGenerator.write(
+                    vector_instruction(location.selectStore(ByteCodeInstruction::kStore,
+                                                            ByteCodeInstruction::kStoreGlobal),
+                                       count));
+            fGenerator.write8(location.fSlot);
+        }
+    }
+
+private:
+    typedef LValue INHERITED;
+
+    const Expression& fExpression;
+};
+
+std::unique_ptr<ByteCodeGenerator::LValue> ByteCodeGenerator::getLValue(const Expression& e) {
+    switch (e.fKind) {
+        case Expression::kExternalValue_Kind: {
+            ExternalValue* value = ((ExternalValueReference&) e).fValue;
+            int index = fOutput->fExternalValues.size();
+            fOutput->fExternalValues.push_back(value);
+            SkASSERT(index <= 255);
+            return std::unique_ptr<LValue>(new ByteCodeExternalValueLValue(this, *value, index));
+        }
+        case Expression::kFieldAccess_Kind:
+        case Expression::kIndex_Kind:
+        case Expression::kVariableReference_Kind:
+            return std::unique_ptr<LValue>(new ByteCodeExpressionLValue(this, e));
+        case Expression::kSwizzle_Kind: {
+            const Swizzle& s = (const Swizzle&) e;
+            return swizzle_is_simple(s)
+                    ? std::unique_ptr<LValue>(new ByteCodeExpressionLValue(this, e))
+                    : std::unique_ptr<LValue>(new ByteCodeSwizzleLValue(this, s));
+        }
+        case Expression::kTernary_Kind:
+        default:
+#ifdef SK_DEBUG
+            ABORT("unsupported lvalue %s\n", e.description().c_str());
+#endif
+            return nullptr;
+    }
 }
 
 void ByteCodeGenerator::writeBlock(const Block& b) {
@@ -1215,121 +1483,146 @@
     }
 }
 
+void ByteCodeGenerator::setBreakTargets() {
+    std::vector<DeferredLocation>& breaks = fBreakTargets.top();
+    for (DeferredLocation& b : breaks) {
+        b.set();
+    }
+    fBreakTargets.pop();
+}
+
+void ByteCodeGenerator::setContinueTargets() {
+    std::vector<DeferredLocation>& continues = fContinueTargets.top();
+    for (DeferredLocation& c : continues) {
+        c.set();
+    }
+    fContinueTargets.pop();
+}
+
+void ByteCodeGenerator::writeBreakStatement(const BreakStatement& b) {
+    // TODO: Include BranchIfAllFalse to top-most LoopNext
+    this->write(ByteCodeInstruction::kLoopBreak);
+}
+
+void ByteCodeGenerator::writeContinueStatement(const ContinueStatement& c) {
+    // TODO: Include BranchIfAllFalse to top-most LoopNext
+    this->write(ByteCodeInstruction::kLoopContinue);
+}
+
 void ByteCodeGenerator::writeDoStatement(const DoStatement& d) {
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    SkASSERT(fCode->size() < ByteCode::kPointerMax);
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t start = fCode->size();
     this->writeStatement(*d.fStatement);
-    ByteCode::Register test = this->writeExpression(*d.fTest);
-    this->write(ByteCode::Instruction::kLoopNext);
-    this->write(ByteCode::Instruction::kLoopMask);
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kLoopNext);
+    this->writeExpression(*d.fTest);
+    this->write(ByteCodeInstruction::kLoopMask);
+    // TODO: Could shorten this with kBranchIfAnyTrue
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(start);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeForStatement(const ForStatement& f) {
+    fContinueTargets.emplace();
+    fBreakTargets.emplace();
     if (f.fInitializer) {
         this->writeStatement(*f.fInitializer);
     }
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t start = fCode->size();
     if (f.fTest) {
-        ByteCode::Register test = this->writeExpression(*f.fTest);
-        this->write(ByteCode::Instruction::kLoopMask);
-        this->write(test);
+        this->writeExpression(*f.fTest);
+        this->write(ByteCodeInstruction::kLoopMask);
     }
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
     this->writeStatement(*f.fStatement);
-    this->write(ByteCode::Instruction::kLoopNext);
+    this->write(ByteCodeInstruction::kLoopNext);
     if (f.fNext) {
-        this->writeExpression(*f.fNext);
+        this->writeExpression(*f.fNext, true);
     }
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(start);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeIfStatement(const IfStatement& i) {
-    ByteCode::Register test = this->writeExpression(*i.fTest);
-    this->write(ByteCode::Instruction::kMaskPush);
-    ++fConditionCount;
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->writeExpression(*i.fTest);
+    this->write(ByteCodeInstruction::kMaskPush);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation falseLocation(this);
     this->writeStatement(*i.fIfTrue);
     falseLocation.set();
     if (i.fIfFalse) {
-        this->write(ByteCode::Instruction::kMaskNegate);
-        this->write(ByteCode::Instruction::kBranchIfAllFalse);
+        this->write(ByteCodeInstruction::kMaskNegate);
+        this->write(ByteCodeInstruction::kBranchIfAllFalse);
         DeferredLocation endLocation(this);
         this->writeStatement(*i.fIfFalse);
         endLocation.set();
     }
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kMaskPop);
+    this->write(ByteCodeInstruction::kMaskPop);
 }
 
-void ByteCodeGenerator::writeReturn(const ReturnStatement& r) {
-    if (fConditionCount) {
+void ByteCodeGenerator::writeReturnStatement(const ReturnStatement& r) {
+    if (fLoopCount || fConditionCount) {
         fErrors.error(r.fOffset, "return not allowed inside conditional or loop");
         return;
     }
-    if (r.fExpression) {
-        ByteCode::Register value = this->writeExpression(*r.fExpression);
-        this->write(ByteCode::Instruction::kReturnValue);
-        this->write(value);
-    }
-    else {
-        this->write(ByteCode::Instruction::kReturn);
-    }
+    int count = SlotCount(r.fExpression->fType);
+    this->writeExpression(*r.fExpression);
+
+    // Technically, the kReturn also pops fOutput->fLocalCount values from the stack, but we
+    // haven't counted pushing those (they're outside the scope of our stack tracking). Instead,
+    // we account for those in writeFunction().
+
+    // This is all fine because we don't allow conditional returns, so we only return once anyway.
+    this->write(ByteCodeInstruction::kReturn, -count);
+    this->write8(count);
+}
+
+void ByteCodeGenerator::writeSwitchStatement(const SwitchStatement& r) {
+    // not yet implemented
+    abort();
 }
 
 void ByteCodeGenerator::writeVarDeclarations(const VarDeclarations& v) {
     for (const auto& declStatement : v.fVars) {
         const VarDeclaration& decl = (VarDeclaration&) *declStatement;
-        // we need to grab the location even if we don't use it, to ensure it
-        // has been allocated
-        ByteCodeGenerator::Location location = this->getLocation(*decl.fVar);
+        // we need to grab the location even if we don't use it, to ensure it has been allocated
+        Location location = this->getLocation(*decl.fVar);
         if (decl.fValue) {
-            ByteCode::Register src = this->writeExpression(*decl.fValue);
-            for (int i = 0; i < SlotCount(decl.fVar->fType); ++i) {
-                ByteCodeGenerator::Location final = location.offset(*this, i);
-                this->write(ByteCode::Instruction::kStoreStackDirect);
-                this->write(final);
-                this->write(src + i);
+            this->writeExpression(*decl.fValue);
+            int count = SlotCount(decl.fValue->fType);
+            if (count > 4) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(location.fSlot);
+                this->write(ByteCodeInstruction::kStoreExtended, count);
+                this->write8(count);
+            } else {
+                this->write(vector_instruction(ByteCodeInstruction::kStore, count));
+                this->write8(location.fSlot);
             }
         }
     }
 }
 
 void ByteCodeGenerator::writeWhileStatement(const WhileStatement& w) {
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    SkASSERT(fCode->size() < ByteCode::kPointerMax);
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
-    ByteCode::Register test = this->writeExpression(*w.fTest);
-    this->write(ByteCode::Instruction::kLoopMask);
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t cond = fCode->size();
+    this->writeExpression(*w.fTest);
+    this->write(ByteCodeInstruction::kLoopMask);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
     this->writeStatement(*w.fStatement);
-    this->write(ByteCode::Instruction::kLoopNext);
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kLoopNext);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(cond);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeStatement(const Statement& s) {
@@ -1338,16 +1631,19 @@
             this->writeBlock((Block&) s);
             break;
         case Statement::kBreak_Kind:
-            this->write(ByteCode::Instruction::kBreak);
+            this->writeBreakStatement((BreakStatement&) s);
             break;
         case Statement::kContinue_Kind:
-            this->write(ByteCode::Instruction::kContinue);
+            this->writeContinueStatement((ContinueStatement&) s);
             break;
+        case Statement::kDiscard_Kind:
+            // not yet implemented
+            abort();
         case Statement::kDo_Kind:
             this->writeDoStatement((DoStatement&) s);
             break;
         case Statement::kExpression_Kind:
-            this->writeExpression(*((ExpressionStatement&) s).fExpression);
+            this->writeExpression(*((ExpressionStatement&) s).fExpression, true);
             break;
         case Statement::kFor_Kind:
             this->writeForStatement((ForStatement&) s);
@@ -1358,7 +1654,10 @@
         case Statement::kNop_Kind:
             break;
         case Statement::kReturn_Kind:
-            this->writeReturn((ReturnStatement&) s);
+            this->writeReturnStatement((ReturnStatement&) s);
+            break;
+        case Statement::kSwitch_Kind:
+            this->writeSwitchStatement((SwitchStatement&) s);
             break;
         case Statement::kVarDeclarations_Kind:
             this->writeVarDeclarations(*((VarDeclarationsStatement&) s).fDeclaration);
@@ -1367,80 +1666,18 @@
             this->writeWhileStatement((WhileStatement&) s);
             break;
         default:
-            ABORT("unsupported statement\n");
+            SkASSERT(false);
     }
 }
 
-void ByteCodeGenerator::writeFunction(const FunctionDefinition& f) {
-    fFunction = &f;
-    std::unique_ptr<ByteCodeFunction> result(new ByteCodeFunction(&f.fDeclaration));
-    result->fReturnSlotCount = SlotCount(f.fDeclaration.fReturnType);
+ByteCodeFunction::ByteCodeFunction(const FunctionDeclaration* declaration)
+        : fName(declaration->fName) {
     fParameterCount = 0;
-    fConditionCount = 0;
-    for (const auto& p : f.fDeclaration.fParameters) {
-        int count = SlotCount(p->fType);
-        bool isOut = ((p->fModifiers.fFlags & Modifiers::kOut_Flag) != 0);
-        result->fParameters.push_back(ByteCodeFunction::Parameter{count, isOut});
-        fParameterCount += count;
-    }
-    result->fParameterSlotCount = fParameterCount;
-    fCode = &result->fCode;
-    this->writeStatement(*f.fBody);
-    result->fStackSlotCount = fLocals.size();
-    if (f.fDeclaration.fReturnType.fName == "void") {
-        this->write(ByteCode::Instruction::kReturn);
-    } else {
-        this->write(ByteCode::Instruction::kAbort);
-    }
-    fOutput->fFunctions.push_back(std::move(result));
-    SkASSERT(fConditionCount == 0);
-}
-
-void ByteCodeGenerator::gatherUniforms(const Type& type, const String& name) {
-    if (type.kind() == Type::kOther_Kind) {
-        return;
-    } else if (type.kind() == Type::kStruct_Kind) {
-        for (const auto& f : type.fields()) {
-            this->gatherUniforms(*f.fType, name + "." + f.fName);
-        }
-    } else if (type.kind() == Type::kArray_Kind) {
-        for (int i = 0; i < type.columns(); ++i) {
-            this->gatherUniforms(type.componentType(), String::printf("%s[%d]", name.c_str(), i));
-        }
-    } else {
-        fOutput->fUniforms.push_back({ name, type_category(type), type.rows(), type.columns(),
-                                       fOutput->fUniformSlotCount });
-        fOutput->fUniformSlotCount += type.columns() * type.rows();
+    for (const auto& p : declaration->fParameters) {
+        int slots = ByteCodeGenerator::SlotCount(p->fType);
+        fParameters.push_back({ slots, (bool)(p->fModifiers.fFlags & Modifiers::kOut_Flag) });
+        fParameterCount += slots;
     }
 }
 
-bool ByteCodeGenerator::generateCode() {
-    fOutput->fGlobalSlotCount = 0;
-    fOutput->fUniformSlotCount = 0;
-    for (const auto& pe : fProgram) {
-        if (pe.fKind == ProgramElement::kVar_Kind) {
-            VarDeclarations& decl = (VarDeclarations&) pe;
-            for (const auto& v : decl.fVars) {
-                const Variable* declVar = ((VarDeclaration&) *v).fVar;
-                if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
-                    continue;
-                }
-                if (is_uniform(*declVar)) {
-                    this->gatherUniforms(declVar->fType, declVar->fName);
-                } else {
-                    fOutput->fGlobalSlotCount += SlotCount(declVar->fType);
-                }
-            }
-        }
-    }
-    for (const auto& pe : fProgram) {
-        if (pe.fKind == ProgramElement::kFunction_Kind) {
-            FunctionDefinition& f = (FunctionDefinition&) pe;
-            fFunctions.push_back(&f);
-            this->writeFunction(f);
-        }
-    }
-    return fErrors.errorCount() == 0;
 }
-
-} // namespace
diff --git a/src/sksl/SkSLByteCodeGenerator.h b/src/sksl/SkSLByteCodeGenerator.h
index ab232c4..4e3accd 100644
--- a/src/sksl/SkSLByteCodeGenerator.h
+++ b/src/sksl/SkSLByteCodeGenerator.h
@@ -54,19 +54,95 @@
 
 class ByteCodeGenerator : public CodeGenerator {
 public:
-    ByteCodeGenerator(const Program* program, ErrorReporter* errors, ByteCode* output);
+    class LValue {
+    public:
+        LValue(ByteCodeGenerator& generator)
+            : fGenerator(generator) {}
+
+        virtual ~LValue() {}
+
+        /**
+         * Stack before call: ... lvalue
+         * Stack after call: ... lvalue load
+         */
+        virtual void load() = 0;
+
+        /**
+         * Stack before call: ... lvalue value
+         * Stack after call: ...
+         */
+        virtual void store(bool discard) = 0;
+
+    protected:
+        ByteCodeGenerator& fGenerator;
+    };
+
+    ByteCodeGenerator(const Context* context, const Program* program, ErrorReporter* errors,
+                      ByteCode* output);
 
     bool generateCode() override;
 
+    void write8(uint8_t b);
+
+    void write16(uint16_t b);
+
+    void write32(uint32_t b);
+
+    void write(ByteCodeInstruction inst, int count = kUnusedStackCount);
+
+    /**
+     * Based on 'type', writes the s (signed), u (unsigned), or f (float) instruction.
+     */
+    void writeTypedInstruction(const Type& type, ByteCodeInstruction s, ByteCodeInstruction u,
+                               ByteCodeInstruction f, int count, bool writeCount = true);
+
+    static int SlotCount(const Type& type);
+
 private:
+    static constexpr int kUnusedStackCount = INT32_MAX;
+    static int StackUsage(ByteCodeInstruction, int count);
+
+    // reserves 16 bits in the output code, to be filled in later with an address once we determine
+    // it
+    class DeferredLocation {
+    public:
+        DeferredLocation(ByteCodeGenerator* generator)
+            : fGenerator(*generator)
+            , fOffset(generator->fCode->size()) {
+            generator->write16(0);
+        }
+
+#ifdef SK_DEBUG
+        ~DeferredLocation() {
+            SkASSERT(fSet);
+        }
+#endif
+
+        void set() {
+            int target = fGenerator.fCode->size();
+            SkASSERT(target <= 65535);
+            (*fGenerator.fCode)[fOffset] = target;
+            (*fGenerator.fCode)[fOffset + 1] = target >> 8;
+#ifdef SK_DEBUG
+            fSet = true;
+#endif
+        }
+
+    private:
+        ByteCodeGenerator& fGenerator;
+        size_t fOffset;
+#ifdef SK_DEBUG
+        bool fSet = false;
+#endif
+    };
+
     // Intrinsics which do not simply map to a single opcode
     enum class SpecialIntrinsic {
         kDot,
-        kInverse,
     };
 
     struct Intrinsic {
-        Intrinsic(ByteCode::Instruction instruction)
+        Intrinsic(ByteCodeInstruction instruction)
             : fIsSpecial(false)
             , fValue(instruction) {}
 
@@ -77,250 +153,201 @@
         bool fIsSpecial;
 
         union Value {
-            Value(ByteCode::Instruction instruction)
+            Value(ByteCodeInstruction instruction)
                 : fInstruction(instruction) {}
 
             Value(SpecialIntrinsic special)
                 : fSpecial(special) {}
 
-            ByteCode::Instruction fInstruction;
+            ByteCodeInstruction fInstruction;
             SpecialIntrinsic fSpecial;
         } fValue;
     };
 
-    class LValue {
-    public:
-        LValue(ByteCodeGenerator& generator)
-            : fGenerator(generator) {}
 
-        virtual ~LValue() {}
-
-        virtual void load(ByteCode::Register result) = 0;
-
-        virtual void store(ByteCode::Register src) = 0;
-
-    protected:
-        ByteCodeGenerator& fGenerator;
+    // Similar to Variable::Storage, but locals and parameters are grouped together, and globals
+    // are further subdivided into uniforms and other (writable) globals.
+    enum class Storage {
+        kLocal,    // includes parameters
+        kGlobal,   // non-uniform globals
+        kUniform,  // uniform globals
     };
 
     struct Location {
-        enum {
-            kPointer_Kind,
-            kRegister_Kind
-        } fKind;
+        int     fSlot;
+        Storage fStorage;
 
-        union {
-            ByteCode::Pointer fPointer;
-            ByteCode::Register fRegister;
-        };
+        // Not really invalid, but a "safe" placeholder to be more explicit at call-sites
+        static Location MakeInvalid() { return { 0, Storage::kLocal }; }
 
-        Location(ByteCode::Pointer p)
-            : fKind(kPointer_Kind)
-            , fPointer(p) {}
+        Location makeOnStack() { return { -1, fStorage }; }
+        bool isOnStack() const { return fSlot < 0; }
 
-        Location(ByteCode::Register r)
-            : fKind(kRegister_Kind)
-            , fRegister(r) {}
-
-        /**
-         * Returns this location offset by 'offset' bytes. For pointers, this is a compile-time
-         * operation, while for registers there will be CPU instructions output to handle the
-         * runtime calculation of the address.
-         */
-        Location offset(ByteCodeGenerator& generator, int offset) {
-            if (!offset) {
-                return *this;
-            }
-            if (fKind == kPointer_Kind) {
-                return Location(fPointer + offset);
-            }
-            ByteCode::Register a = generator.next(1);
-            generator.write(ByteCode::Instruction::kImmediate);
-            generator.write(a);
-            generator.write(ByteCode::Immediate{offset});
-            ByteCode::Register result = generator.next(1);
-            generator.write(ByteCode::Instruction::kAddI);
-            generator.write(result);
-            generator.write(fRegister);
-            generator.write(a);
-            return result;
+        Location operator+(int offset) {
+            SkASSERT(fSlot >= 0);
+            return { fSlot + offset, fStorage };
         }
 
-        /**
-         * Returns this location offset by the number of bytes stored in the 'offset' register. This
-         * will output the necessary CPU instructions to perform the math and return a new register
-         * location.
-         */
-        Location offset(ByteCodeGenerator& generator, ByteCode::Register offset) {
-            ByteCode::Register current;
-            switch (fKind) {
-                case kPointer_Kind:
-                    current = generator.next(1);
-                    generator.write(ByteCode::Instruction::kImmediate);
-                    generator.write(current);
-                    generator.write(ByteCode::Immediate{fPointer.fAddress});
-                    break;
-                case kRegister_Kind:
-                    current = fRegister;
+        ByteCodeInstruction selectLoad(ByteCodeInstruction local,
+                                       ByteCodeInstruction global,
+                                       ByteCodeInstruction uniform) const {
+            switch (fStorage) {
+                case Storage::kLocal:   return local;
+                case Storage::kGlobal:  return global;
+                case Storage::kUniform: return uniform;
             }
-            ByteCode::Register result = generator.next(1);
-            generator.write(ByteCode::Instruction::kAddI);
-            generator.write(result);
-            generator.write(current);
-            generator.write(offset);
-            return result;
+            SkUNREACHABLE;
+        }
+
+        ByteCodeInstruction selectStore(ByteCodeInstruction local,
+                                        ByteCodeInstruction global) const {
+            switch (fStorage) {
+                case Storage::kLocal:   return local;
+                case Storage::kGlobal:  return global;
+                case Storage::kUniform: ABORT("Trying to store to a uniform"); break;
+            }
+            return local;
         }
     };
 
-    // reserves 16 bits in the output code, to be filled in later with an address once we determine
-    // it
-    class DeferredLocation {
-    public:
-        explicit DeferredLocation(ByteCodeGenerator* generator)
-            : fGenerator(*generator)
-            , fOffset(generator->fCode->size()) {
-            generator->write(ByteCode::Pointer{65535});
-        }
-
-        void set() {
-            SkASSERT(fGenerator.fCode->size() <= ByteCode::kPointerMax);
-            static_assert(sizeof(ByteCode::Pointer) == 2,
-                          "failed assumption that ByteCode::Pointer is uint16_t");
-            void* dst = &(*fGenerator.fCode)[fOffset];
-            // ensure that the placeholder value 65535 hasn't been modified yet
-            SkASSERT(((uint8_t*) dst)[0] == 255 && ((uint8_t*) dst)[1] == 255);
-            ByteCode::Pointer target{(uint16_t) fGenerator.fCode->size()};
-            memcpy(dst, &target, sizeof(target));
-        }
-
-    private:
-        ByteCodeGenerator& fGenerator;
-        size_t fOffset;
-    };
-
-    template<typename T>
-    void write(T value) {
-        size_t n = fCode->size();
-        fCode->resize(n + sizeof(value));
-        memcpy(fCode->data() + n, &value, sizeof(value));
-    }
-
-    ByteCode::Register next(int slotCount);
-
     /**
-     * Based on 'type', writes the s (signed), u (unsigned), or f (float) instruction.
+     * Returns the local slot into which var should be stored, allocating a new slot if it has not
+     * already been assigned one. Compound variables (e.g. vectors) will consume more than one local
+     * slot, with the getLocation return value indicating where the first element should be stored.
      */
-    void writeTypedInstruction(const Type& type, ByteCode::Instruction s, ByteCode::Instruction u,
-                               ByteCode::Instruction f);
-
-    ByteCode::Instruction getLoadInstruction(Location location, Variable::Storage storage);
-
-    ByteCode::Instruction getStoreInstruction(Location location, Variable::Storage storage);
-
-    static int SlotCount(const Type& type);
-
     Location getLocation(const Variable& var);
 
+    /**
+     * As above, but computes the (possibly dynamic) address of an expression involving indexing &
+     * field access. If the address is known, it's returned. If not, -1 is returned, and the
+     * location will be left on the top of the stack.
+     */
     Location getLocation(const Expression& expr);
 
-    Variable::Storage getStorage(const Expression& expr);
-
-    std::unique_ptr<LValue> getLValue(const Expression& expr);
-
-    void writeFunction(const FunctionDefinition& f);
-
-    // For compound values, the result argument specifies the first component. Subsequent components
-    // will be in subsequent registers.
-
-    void writeBinaryInstruction(const Type& operandType, ByteCode::Register left,
-                                ByteCode::Register right, ByteCode::Instruction s,
-                                ByteCode::Instruction u, ByteCode::Instruction f,
-                                ByteCode::Register result);
-
-    void writeBinaryExpression(const BinaryExpression& expr, ByteCode::Register result);
-
-    void writeConstructor(const Constructor& c, ByteCode::Register result);
-
-    void writeExternalFunctionCall(const ExternalFunctionCall& f, ByteCode::Register result);
-
-    void writeExternalValue(const ExternalValueReference& e, ByteCode::Register result);
-
-    void writeIntrinsicCall(const FunctionCall& c, Intrinsic intrinsic, ByteCode::Register result);
-
-    void writeFunctionCall(const FunctionCall& c, ByteCode::Register result);
-
-    void incOrDec(Token::Kind op, Expression& operand, bool prefix, ByteCode::Register result);
-
-    void writePostfixExpression(const PostfixExpression& p, ByteCode::Register result);
-
-    void writePrefixExpression(const PrefixExpression& p, ByteCode::Register result);
-
-    void writeSwizzle(const Swizzle& s, ByteCode::Register result);
-
-    void writeTernaryExpression(const TernaryExpression& t, ByteCode::Register result);
-
-    void writeVariableExpression(const Expression& e, ByteCode::Register result);
-
-    void writeExpression(const Expression& expr, ByteCode::Register result);
-
-    ByteCode::Register writeExpression(const Expression& expr);
-
-    void writeBlock(const Block& b);
-
-    void writeDoStatement(const DoStatement& d);
-
-    void writeForStatement(const ForStatement& f);
-
-    void writeIfStatement(const IfStatement& i);
-
-    void writeReturn(const ReturnStatement& r);
-
-    void writeVarDeclarations(const VarDeclarations& v);
-
-    void writeWhileStatement(const WhileStatement& w);
-
-    void writeStatement(const Statement& s);
-
     void gatherUniforms(const Type& type, const String& name);
 
+    std::unique_ptr<ByteCodeFunction> writeFunction(const FunctionDefinition& f);
+
+    void writeVarDeclarations(const VarDeclarations& decl);
+
+    void writeVariableExpression(const Expression& expr);
+
+    void writeExpression(const Expression& expr, bool discard = false);
+
+    /**
+     * Pushes whatever values are required by the lvalue onto the stack, and returns an LValue
+     * permitting loads and stores to it.
+     */
+    std::unique_ptr<LValue> getLValue(const Expression& expr);
+
+    void writeIntrinsicCall(const FunctionCall& c);
+
+    void writeFunctionCall(const FunctionCall& c);
+
+    void writeConstructor(const Constructor& c);
+
+    void writeExternalFunctionCall(const ExternalFunctionCall& c);
+
+    void writeExternalValue(const ExternalValueReference& r);
+
+    void writeSwizzle(const Swizzle& swizzle);
+
+    bool writeBinaryExpression(const BinaryExpression& b, bool discard);
+
+    void writeTernaryExpression(const TernaryExpression& t);
+
+    void writeNullLiteral(const NullLiteral& n);
+
+    bool writePrefixExpression(const PrefixExpression& p, bool discard);
+
+    bool writePostfixExpression(const PostfixExpression& p, bool discard);
+
+    void writeBoolLiteral(const BoolLiteral& b);
+
+    void writeIntLiteral(const IntLiteral& i);
+
+    void writeFloatLiteral(const FloatLiteral& f);
+
+    void writeStatement(const Statement& s);
+
+    void writeBlock(const Block& b);
+
+    void writeBreakStatement(const BreakStatement& b);
+
+    void writeContinueStatement(const ContinueStatement& c);
+
+    void writeIfStatement(const IfStatement& stmt);
+
+    void writeForStatement(const ForStatement& f);
+
+    void writeWhileStatement(const WhileStatement& w);
+
+    void writeDoStatement(const DoStatement& d);
+
+    void writeSwitchStatement(const SwitchStatement& s);
+
+    void writeReturnStatement(const ReturnStatement& r);
+
+    // updates the current set of breaks to branch to the current location
+    void setBreakTargets();
+
+    // updates the current set of continues to branch to the current location
+    void setContinueTargets();
+
+    void enterLoop() {  // one loop level deeper; remember the deepest level reached
+        const int depth = ++fLoopCount;
+        fMaxLoopCount = std::max(fMaxLoopCount, depth);
+    }
+
+    void exitLoop() {  // undoes one enterLoop(); the count must be positive on entry
+        SkASSERT(fLoopCount > 0);
+        fLoopCount -= 1;
+    }
+
+    void enterCondition() {  // one condition level deeper; remember the deepest level reached
+        const int depth = ++fConditionCount;
+        fMaxConditionCount = std::max(fMaxConditionCount, depth);
+    }
+
+    void exitCondition() {  // undoes one enterCondition(); the count must be positive on entry
+        SkASSERT(fConditionCount > 0);
+        fConditionCount -= 1;
+    }
+
+    const Context& fContext;
+
     ByteCode* fOutput;
 
-    int fNextRegister = 0;
-
     const FunctionDefinition* fFunction;
 
-    std::vector<const FunctionDefinition*> fFunctions;
-
     std::vector<uint8_t>* fCode;
 
     std::vector<const Variable*> fLocals;
 
-    int fParameterCount;
+    std::stack<std::vector<DeferredLocation>> fContinueTargets;
 
+    std::stack<std::vector<DeferredLocation>> fBreakTargets;
+
+    std::vector<const FunctionDefinition*> fFunctions;
+
+    int fParameterCount;
+    int fStackCount;
+    int fMaxStackCount;
+
+    int fLoopCount;
+    int fMaxLoopCount;
     int fConditionCount;
+    int fMaxConditionCount;
 
     const std::unordered_map<String, Intrinsic> fIntrinsics;
 
     friend class DeferredLocation;
-    friend class ByteCodeExternalValueLValue;
-    friend class ByteCodeSimpleLValue;
+    friend class ByteCodeExpressionLValue;
     friend class ByteCodeSwizzleLValue;
 
     typedef CodeGenerator INHERITED;
 };
 
-template<>
-inline void ByteCodeGenerator::write(ByteCodeGenerator::Location loc) {
-    switch (loc.fKind) {
-        case ByteCodeGenerator::Location::kPointer_Kind:
-            this->write(loc.fPointer);
-            break;
-        case ByteCodeGenerator::Location::kRegister_Kind:
-            this->write(loc.fRegister);
-            break;
-    }
-}
-
 }
 
 #endif
diff --git a/src/sksl/SkSLCompiler.cpp b/src/sksl/SkSLCompiler.cpp
index 6e84b98..7bfdce1 100644
--- a/src/sksl/SkSLCompiler.cpp
+++ b/src/sksl/SkSLCompiler.cpp
@@ -77,17 +77,14 @@
 namespace SkSL {
 
 static void grab_intrinsics(std::vector<std::unique_ptr<ProgramElement>>* src,
-               std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>>* target) {
-    for (auto iter = src->begin(); iter != src->end(); ) {
-        std::unique_ptr<ProgramElement>& element = *iter;
+               std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>>* target) {
+    for (auto& element : *src) {
         switch (element->fKind) {
             case ProgramElement::kFunction_Kind: {
                 FunctionDefinition& f = (FunctionDefinition&) *element;
-                SkASSERT(f.fDeclaration.fBuiltin);
-                String key = f.fDeclaration.declaration();
-                SkASSERT(target->find(key) == target->end());
-                (*target)[key] = std::make_pair(std::move(element), false);
-                iter = src->erase(iter);
+                StringFragment name = f.fDeclaration.fName;
+                SkASSERT(target->find(name) == target->end());
+                (*target)[name] = std::make_pair(std::move(element), false);
                 break;
             }
             case ProgramElement::kEnum_Kind: {
@@ -95,7 +92,6 @@
                 StringFragment name = e.fTypeName;
                 SkASSERT(target->find(name) == target->end());
                 (*target)[name] = std::make_pair(std::move(element), false);
-                iter = src->erase(iter);
                 break;
             }
             default:
@@ -282,13 +278,11 @@
     this->processIncludeFile(Program::kPipelineStage_Kind, SKSL_PIPELINE_INCLUDE,
                              strlen(SKSL_PIPELINE_INCLUDE), fGpuSymbolTable, &fPipelineInclude,
                              &fPipelineSymbolTable);
+    std::vector<std::unique_ptr<ProgramElement>> interpIntrinsics;
     this->processIncludeFile(Program::kGeneric_Kind, SKSL_INTERP_INCLUDE,
-                             strlen(SKSL_INTERP_INCLUDE), symbols, &fInterpreterInclude,
+                             strlen(SKSL_INTERP_INCLUDE), symbols, &interpIntrinsics,
                              &fInterpreterSymbolTable);
-    grab_intrinsics(&fInterpreterInclude, &fInterpreterIntrinsics);
-    // need to hang on to the source so that FunctionDefinition.fSource pointers in this file
-    // remain valid
-    fInterpreterIncludeSource = std::move(fIRGenerator->fFile);
+    grab_intrinsics(&interpIntrinsics, &fInterpreterIntrinsics);  // moves the elements out
 }
 
 Compiler::~Compiler() {
@@ -1630,7 +1624,7 @@
     }
     fSource = program.fSource.get();
     std::unique_ptr<ByteCode> result(new ByteCode());
-    ByteCodeGenerator cg(&program, this, result.get());
+    ByteCodeGenerator cg(fContext.get(), &program, this, result.get());
     bool success = cg.generateCode();
     fSource = nullptr;
     if (success) {
diff --git a/src/sksl/SkSLCompiler.h b/src/sksl/SkSLCompiler.h
index 25762e8..fb4b4fb 100644
--- a/src/sksl/SkSLCompiler.h
+++ b/src/sksl/SkSLCompiler.h
@@ -215,8 +215,8 @@
 
     Position position(int offset);
 
-    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>> fGPUIntrinsics;
-    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>> fInterpreterIntrinsics;
+    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>> fGPUIntrinsics;
+    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>> fInterpreterIntrinsics;
     std::unique_ptr<ASTFile> fGpuIncludeSource;
     std::shared_ptr<SymbolTable> fGpuSymbolTable;
     std::vector<std::unique_ptr<ProgramElement>> fVertexInclude;
@@ -227,7 +227,6 @@
     std::shared_ptr<SymbolTable> fGeometrySymbolTable;
     std::vector<std::unique_ptr<ProgramElement>> fPipelineInclude;
     std::shared_ptr<SymbolTable> fPipelineSymbolTable;
-    std::unique_ptr<ASTFile> fInterpreterIncludeSource;
     std::vector<std::unique_ptr<ProgramElement>> fInterpreterInclude;
     std::shared_ptr<SymbolTable> fInterpreterSymbolTable;
 
diff --git a/src/sksl/SkSLIRGenerator.cpp b/src/sksl/SkSLIRGenerator.cpp
index 8d9d42d..44b4200 100644
--- a/src/sksl/SkSLIRGenerator.cpp
+++ b/src/sksl/SkSLIRGenerator.cpp
@@ -1776,7 +1776,7 @@
                                               const FunctionDeclaration& function,
                                               std::vector<std::unique_ptr<Expression>> arguments) {
     if (function.fBuiltin) {
-        auto found = fIntrinsics->find(function.declaration());
+        auto found = fIntrinsics->find(function.fName);
         if (found != fIntrinsics->end() && !found->second.second) {
             found->second.second = true;
             const FunctionDeclaration* old = fCurrentFunction;
@@ -2186,7 +2186,7 @@
         }
     }
     fErrors.error(base->fOffset, "type '" + base->fType.displayName() + "' does not have a "
-                                 "field named '" + field + "'");
+                                 "field named '" + field + "'");
     return nullptr;
 }
 
diff --git a/src/sksl/SkSLIRGenerator.h b/src/sksl/SkSLIRGenerator.h
index 14ea097..a088444 100644
--- a/src/sksl/SkSLIRGenerator.h
+++ b/src/sksl/SkSLIRGenerator.h
@@ -159,7 +159,7 @@
     std::shared_ptr<SymbolTable> fSymbolTable;
     // Symbols which have definitions in the include files. The bool tells us whether this
     // intrinsic has been included already.
-    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>>* fIntrinsics = nullptr;
+    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>>* fIntrinsics = nullptr;
     // holds extra temp variable declarations needed for the current function
     std::vector<std::unique_ptr<Statement>> fExtraVars;
     int fLoopLevel;
diff --git a/src/sksl/SkSLInterpreter.h b/src/sksl/SkSLInterpreter.h
deleted file mode 100644
index ba7a203..0000000
--- a/src/sksl/SkSLInterpreter.h
+++ /dev/null
@@ -1,1361 +0,0 @@
-/*
- * Copyright 2020 Google LLC
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "include/private/GrTypesPriv.h" // GrAlignTo
-#include "src/core/SkUtils.h" // sk_unaligned_load
-#include "src/sksl/SkSLByteCode.h"
-#include "src/sksl/SkSLExternalValue.h"
-
-#include <stack>
-
-#ifndef SKSL_INTERPRETER
-#define SKSL_INTERPRETER
-
-namespace SkSL {
-
-// GCC and Clang support the "labels as values" extension which we need to implement the interpreter
-// using threaded code. Otherwise, we fall back to using a switch statement in a for loop.
-#if defined(__GNUC__) || defined(__clang__)
-    #define SKSL_THREADED_CODE
-#endif
-
-#ifdef SKSL_THREADED_CODE
-    using instruction = void*;
-    #define LABEL(name) name:
-    #ifdef TRACE
-        #define NEXT()                                   \
-            {                                            \
-                const uint8_t* trace_ip = ip;            \
-                printf("%d: ", (int) (trace_ip - code)); \
-                disassemble(&trace_ip);                  \
-            }                                            \
-            goto *labels[(int) read<ByteCode::Instruction>(&ip)]
-    #else
-        #define NEXT() goto *labels[(int) read<ByteCode::Instruction>(&ip)]
-    #endif
-#else
-    using instruction = uint16_t;
-    #define LABEL(name) case ByteCode::Instruction::name:
-    #define NEXT() continue
-#endif
-
-// If you trip this assert, it means that the order of the opcodes listed in ByteCodeInstruction
-// does not match the order of the opcodes listed in the 'labels' array in innerRun().
-#define CHECK_LABEL(name) \
-    SkASSERT(labels[(int) ByteCode::Instruction::name] == &&name)
-
-template<typename T>
-static T read(const uint8_t** ip) {
-    *ip += sizeof(T);
-    return sk_unaligned_load<T>(*ip - sizeof(T));
-}
-
-#define BINARY_OP(inst, src, result, op)                                  \
-    LABEL(inst) {                                                         \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);        \
-        ByteCode::Register src1 = read<ByteCode::Register>(&ip);          \
-        ByteCode::Register src2 = read<ByteCode::Register>(&ip);          \
-        fRegisters[target.fIndex].result = fRegisters[src1.fIndex].src op \
-                                           fRegisters[src2.fIndex].src;   \
-        NEXT();                                                           \
-    }
-
-#define MASKED_BINARY_OP(inst, src, result, op)                                         \
-    LABEL(inst) {                                                                       \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);                      \
-        ByteCode::Register src1 = read<ByteCode::Register>(&ip);                        \
-        ByteCode::Register src2 = read<ByteCode::Register>(&ip);                        \
-        auto m = mask();                                                                \
-        for (int i = 0; i < width; ++i) {                                               \
-            if (m[i]) {                                                                 \
-                fRegisters[target.fIndex].result[i] = fRegisters[src1.fIndex].src[i] op \
-                                                   fRegisters[src2.fIndex].src[i];      \
-            }                                                                           \
-        }                                                                               \
-        NEXT();                                                                         \
-    }
-
-#define VECTOR_UNARY_FN(inst, fn)                                                       \
-    LABEL(inst) {                                                                       \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);                      \
-        ByteCode::Register src = read<ByteCode::Register>(&ip);                         \
-        for (int i = 0; i < width; ++ i) {                                              \
-            fRegisters[target.fIndex].fFloat[i] = fn(fRegisters[src.fIndex].fFloat[i]); \
-        }                                                                               \
-        NEXT();                                                                         \
-    }
-
-#define DISASSEMBLE_0(inst, name) \
-    case ByteCode::Instruction::inst: printf(name "\n"); break;
-
-#define DISASSEMBLE_1(inst, name)                                   \
-    case ByteCode::Instruction::inst:                               \
-        printf(name " $%d\n", read<ByteCode::Register>(ip).fIndex); \
-        break;
-
-#define DISASSEMBLE_UNARY(inst, name)                             \
-    case ByteCode::Instruction::inst: {                           \
-        ByteCode::Register target = read<ByteCode::Register>(ip); \
-        ByteCode::Register src = read<ByteCode::Register>(ip);    \
-        printf(name " $%d -> $%d\n", src.fIndex, target.fIndex);  \
-        break;                                                    \
-    }
-
-#define DISASSEMBLE_BINARY(inst, name)                                              \
-    case ByteCode::Instruction::inst: {                                             \
-        ByteCode::Register target = read<ByteCode::Register>(ip);                   \
-        ByteCode::Register src1 = read<ByteCode::Register>(ip);                     \
-        ByteCode::Register src2 = read<ByteCode::Register>(ip);                     \
-        printf(name " $%d, $%d -> $%d\n", src1.fIndex, src2.fIndex, target.fIndex); \
-        break;                                                                      \
-    }
-
-/**
- * Operates on vectors of the specified width, so creating an Interpreter<16> means that all inputs,
- * outputs, and internal calculations will be 16-wide vectors.
- */
-template<int width>
-class Interpreter {
-public:
-    using Vector = ByteCode::Vector<width>;
-    using VectorI = skvx::Vec<width, int32_t>;
-    using VectorF = skvx::Vec<width, float>;
-
-    Interpreter(std::unique_ptr<ByteCode> code)
-        : fCode(std::move(code)) {
-        // C++ doesn't guarantee proper alignment of naively-allocated vectors, so we can't have the
-        // registers and memory directly as fields of this object without jumping through some hoops
-        // during Interpreter allocation and deallocation. We simplify this by having the backing
-        // store be a separate allocation, jumping through the hoops ourselves rather than require
-        // Interpreter's clients to be aware of alignment.
-        // Ideally, we could use std::aligned_alloc here, but as of this writing it is not available
-        // on some compilers despite claiming to support C++17.
-        fBackingStore = calloc(sizeof(Vector), MEMORY_SIZE + REGISTER_COUNT + 1);
-        fMemory = (Vector*) GrAlignTo((size_t) fBackingStore, alignof(Vector));
-        fRegisters = fMemory + MEMORY_SIZE;
-    }
-
-    ~Interpreter() {
-        free(fBackingStore);
-    }
-
-    void setUniforms(const float uniforms[]) {
-        for (int i = 0; i < fCode->getUniformSlotCount(); ++i) {
-            fMemory[fCode->getGlobalSlotCount() + i].fFloat = VectorF(uniforms[i]);
-        }
-    }
-
-    /**
-     * Returns true on success and stores a pointer to the first slot of the result into outResult.
-     * This pointer is only guaranteed to be valid until the next run() call.
-     */
-     bool run(const ByteCodeFunction* f, Vector args[], Vector** outResult) {
-        SkASSERT(f);
-        VectorI condStack[MASK_STACK_SIZE];
-        memset(condStack, 255, sizeof(VectorI));
-        VectorI maskStack[MASK_STACK_SIZE];
-        memset(maskStack, 255, sizeof(VectorI));
-        VectorI loopStack[LOOP_STACK_SIZE];
-        memset(loopStack, 255, sizeof(VectorI));
-        VectorI continueStack[LOOP_STACK_SIZE];
-        memset(continueStack, 0, sizeof(VectorI));
-        Vector* stack = fMemory + MEMORY_SIZE;
-        int stackCount = f->fStackSlotCount + f->fParameterSlotCount;
-        stack -= stackCount;
-        if (f->fParameterSlotCount) {
-            memcpy(stack, args, f->fParameterSlotCount * sizeof(Vector));
-        }
-        Context context(fMemory, stack, condStack, maskStack, loopStack, continueStack);
-        if (this->innerRun(f, context, 0, outResult)) {
-            int slot = 0;
-            for (const auto& p : f->fParameters) {
-                if (p.fIsOutParameter) {
-                    printf("run copying out %d slots\n", p.fSlotCount);
-                    memcpy(&args[slot], &stack[slot], p.fSlotCount * sizeof(Vector));
-                    for (int i = 0; i < p.fSlotCount; ++i) {
-                        printf("    %d: %f\n", i, args[slot].fFloat[0]);
-                    }
-                }
-                slot += p.fSlotCount;
-            }
-            return true;
-        }
-        return false;
-    }
-
-    /**
-     * Invokes the specified function with the given arguments, 'count' times. 'args' and
-     * 'outResult' are accepted and returned in structure-of-arrays form:
-     *   args[0] points to an array of N values, the first argument for each invocation
-     *   ...
-     *   args[argCount - 1] points to an array of N values, the last argument for each invocation
-     *
-     * All values in 'args', 'outReturn', and 'uniforms' are 32-bit values (typically floats,
-     * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
-     * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
-     */
-    bool runStriped(const ByteCodeFunction* f, int count, float* args[]) {
-        SkASSERT(f);
-        Vector* stack = fMemory + MEMORY_SIZE;
-        int stackCount = f->fStackSlotCount + f->fParameterSlotCount;
-        stack -= stackCount;
-        VectorI condStack[MASK_STACK_SIZE];
-        VectorI maskStack[MASK_STACK_SIZE];
-        VectorI loopStack[LOOP_STACK_SIZE];
-        VectorI continueStack[LOOP_STACK_SIZE];
-        Context context(fMemory, stack, condStack, maskStack, loopStack, continueStack);
-        for (int i = 0; i < count; i += width) {
-            int lanes = std::min(width, count - i);
-            size_t size = lanes * sizeof(float);
-            memset(maskStack, 255, sizeof(VectorI));
-            memset(loopStack, 255, sizeof(VectorI));
-            for (int j = lanes; j < width; ++j) {
-                maskStack[0][j] = 0;
-                loopStack[0][j] = 0;
-            }
-            memset(continueStack, 0, sizeof(VectorI));
-            for (int j = 0; j < f->fParameterSlotCount; ++j) {
-                memcpy(stack + j, &args[j][i], size);
-            }
-            if (!this->innerRun(f, context, i, nullptr)) {
-                return false;
-            }
-            int slot = 0;
-            for (const auto& p : f->fParameters) {
-                if (p.fIsOutParameter) {
-                    for (int j = 0; j < p.fSlotCount; ++j) {
-                        memcpy(&args[slot + j][i], stack + slot + j, size);
-                    }
-                }
-                slot += p.fSlotCount;
-            }
-        }
-        return true;
-    }
-
-    const ByteCode& getCode() {
-        return *fCode;
-    }
-
-private:
-    static constexpr size_t REGISTER_COUNT = 1024;
-
-    static constexpr size_t MEMORY_SIZE = 1024;
-
-    static constexpr size_t MASK_STACK_SIZE = 64;
-
-    static constexpr size_t LOOP_STACK_SIZE = 16;
-
-    struct StackFrame {
-        StackFrame(const ByteCodeFunction* function, const uint8_t* ip, const int stackSlotCount,
-                   Vector* parameters, Vector* returnValue)
-            : fFunction(function)
-            , fIP(ip)
-            , fStackSlotCount(stackSlotCount)
-            , fParameters(parameters)
-            , fReturnValue(returnValue) {}
-
-        const ByteCodeFunction* fFunction;
-        const uint8_t* fIP;
-        const int fStackSlotCount;
-        Vector* fParameters;
-        Vector* fReturnValue;
-    };
-
-    struct Context {
-        Context(Vector* memory, Vector* stack, VectorI* condStack, VectorI* maskStack,
-                VectorI* loopStack,VectorI* continueStack)
-            : fMemory(memory)
-            , fStack(stack)
-            , fCondStack(condStack)
-            , fMaskStack(maskStack)
-            , fLoopStack(loopStack)
-            , fContinueStack(continueStack) {}
-
-        Vector* fMemory;
-        Vector* fStack;
-        VectorI* fCondStack;
-        VectorI* fMaskStack;
-        VectorI* fLoopStack;
-        VectorI* fContinueStack;
-        std::stack<StackFrame> fCallStack;
-    };
-
-    // $x = register
-    // @x = memory cell
-    // &x = parameter
-    void disassemble(const uint8_t** ip) {
-        ByteCode::Instruction inst = read<ByteCode::Instruction>(ip);
-        switch (inst) {
-            DISASSEMBLE_BINARY(kAddF, "addF")
-            DISASSEMBLE_BINARY(kAddI, "addI")
-            DISASSEMBLE_BINARY(kAnd, "and")
-            DISASSEMBLE_BINARY(kCompareEQF, "compare eqF")
-            DISASSEMBLE_BINARY(kCompareEQI, "compare eqI")
-            DISASSEMBLE_BINARY(kCompareNEQF, "compare neqF")
-            DISASSEMBLE_BINARY(kCompareNEQI, "compare neqI")
-            DISASSEMBLE_BINARY(kCompareGTF, "compare gtF")
-            DISASSEMBLE_BINARY(kCompareGTS, "compare gtS")
-            DISASSEMBLE_BINARY(kCompareGTU, "compare gtU")
-            DISASSEMBLE_BINARY(kCompareGTEQF, "compare gteqF")
-            DISASSEMBLE_BINARY(kCompareGTEQS, "compare gteqS")
-            DISASSEMBLE_BINARY(kCompareGTEQU, "compare gteqU")
-            DISASSEMBLE_BINARY(kCompareLTF, "compare ltF")
-            DISASSEMBLE_BINARY(kCompareLTS, "compare ltS")
-            DISASSEMBLE_BINARY(kCompareLTU, "compare ltU")
-            DISASSEMBLE_BINARY(kCompareLTEQF, "compare lteqF")
-            DISASSEMBLE_BINARY(kCompareLTEQS, "compare lteqS")
-            DISASSEMBLE_BINARY(kCompareLTEQU, "compare lteqU")
-            DISASSEMBLE_BINARY(kSubtractF, "subF")
-            DISASSEMBLE_BINARY(kSubtractI, "subI")
-            DISASSEMBLE_BINARY(kDivideF, "divF")
-            DISASSEMBLE_BINARY(kDivideS, "divS")
-            DISASSEMBLE_BINARY(kDivideU, "divU")
-            DISASSEMBLE_BINARY(kRemainderS, "remS")
-            DISASSEMBLE_BINARY(kRemainderU, "remU")
-            DISASSEMBLE_BINARY(kMultiplyF, "mulF")
-            DISASSEMBLE_BINARY(kMultiplyI, "mulI")
-            DISASSEMBLE_BINARY(kOr, "or")
-            DISASSEMBLE_BINARY(kXor, "xor")
-            DISASSEMBLE_0(kNop, "nop")
-            DISASSEMBLE_BINARY(kRemainderF, "remF")
-            case ByteCode::Instruction::kBoundsCheck: {
-                ByteCode::Register r = read<ByteCode::Register>(ip);
-                int length = read<int>(ip);
-                printf("boundsCheck 0 <= $%d < %d\n", r.fIndex, length);
-                break;
-            }
-            case ByteCode::Instruction::kBranch:
-                printf("branch %d\n", read<ByteCode::Pointer>(ip).fAddress);
-                break;
-            case ByteCode::Instruction::kBranchIfAllFalse:
-                printf("branchIfAllFalse %d\n", read<ByteCode::Pointer>(ip).fAddress);
-                break;
-            DISASSEMBLE_0(kBreak, "break")
-            case ByteCode::Instruction::kCall: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                uint8_t idx = read<uint8_t>(ip);
-                ByteCode::Register args = read<ByteCode::Register>(ip);
-                ByteCodeFunction* f = fCode->fFunctions[idx].get();
-                printf("call %s($%d...) -> $%d", f->fName.c_str(), args.fIndex, target.fIndex);
-                printf("\n");
-                break;
-            }
-            case ByteCode::Instruction::kCallExternal: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                uint8_t idx = read<uint8_t>(ip);
-                uint8_t targetCount = read<uint8_t>(ip);
-                ByteCode::Register args = read<ByteCode::Register>(ip);
-                uint8_t argCount = read<uint8_t>(ip);
-                ExternalValue* ev = fCode->fExternalValues[idx];
-                printf("callExternal %s($%d(%d)...) -> $%d(%d)", String(ev->fName).c_str(),
-                        args.fIndex, argCount, target.fIndex, targetCount);
-                printf("\n");
-                break;
-            }
-            DISASSEMBLE_0(kContinue, "continue")
-            DISASSEMBLE_UNARY(kCopy, "copy")
-            DISASSEMBLE_UNARY(kCos, "cos")
-            DISASSEMBLE_UNARY(kFloatToSigned, "FtoS")
-            DISASSEMBLE_UNARY(kFloatToUnsigned, "FtoU")
-            case ByteCode::Instruction::kImmediate: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Immediate src = read<ByteCode::Immediate>(ip);
-                printf("immediate (%d | %f) -> $%d\n", src.fInt, src.fFloat, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_UNARY(kInverse2x2, "inverse2x2")
-            DISASSEMBLE_UNARY(kInverse3x3, "inverse3x3")
-            DISASSEMBLE_UNARY(kInverse4x4, "inverse4x4")
-            DISASSEMBLE_UNARY(kLoad, "load")
-            case ByteCode::Instruction::kLoadDirect: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadDirect @%d -> $%d\n", src.fAddress, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_UNARY(kLoadParameter, "loadParameter")
-            case ByteCode::Instruction::kLoadParameterDirect: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadParameterDirect &%d -> $%d\n", src.fAddress, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_UNARY(kLoadStack, "loadStack")
-            case ByteCode::Instruction::kLoadStackDirect: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadStackDirect @%d -> $%d\n", src.fAddress, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_0(kLoopBegin, "loopBegin")
-            DISASSEMBLE_0(kLoopEnd, "loopEnd")
-            DISASSEMBLE_1(kLoopMask, "loopMask")
-            DISASSEMBLE_0(kLoopNext, "loopNext")
-            DISASSEMBLE_0(kMaskNegate, "maskNegate")
-            DISASSEMBLE_0(kMaskPop, "maskPop")
-            DISASSEMBLE_1(kMaskPush, "maskPush")
-            case ByteCode::Instruction::kMatrixMultiply: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Register left = read<ByteCode::Register>(ip);
-                ByteCode::Register right = read<ByteCode::Register>(ip);
-                uint8_t leftColsAndRightRows = read<uint8_t>(ip);
-                uint8_t leftRows = read<uint8_t>(ip);
-                uint8_t rightColumns = read<uint8_t>(ip);
-                printf("matrixMultiply $%d, $%d, %d, %d, %d -> $%d\n", left.fIndex, right.fIndex,
-                       leftColsAndRightRows, leftRows, rightColumns, target.fIndex);
-                break;
-            }
-            case ByteCode::Instruction::kMatrixToMatrix: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                uint8_t srcColumns = read<uint8_t>(ip);
-                uint8_t srcRows = read<uint8_t>(ip);
-                uint8_t dstColumns = read<uint8_t>(ip);
-                uint8_t dstRows = read<uint8_t>(ip);
-                printf("matrixToMatrix $%d, %dx%d to %dx%d -> $%d\n", src.fIndex, srcColumns,
-                       srcRows, dstColumns, dstRows, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_UNARY(kNegateF, "negateF")
-            DISASSEMBLE_UNARY(kNegateS, "negateS")
-            DISASSEMBLE_UNARY(kNot, "not")
-            case ByteCode::Instruction::kReadExternal: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                uint8_t count = read<uint8_t>(ip);
-                uint8_t index = read<uint8_t>(ip);
-                printf("readExternal %d, %d -> $%d\n", count, index, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_1(kPrint, "print")
-            DISASSEMBLE_0(kReturn, "return")
-            DISASSEMBLE_1(kReturnValue, "returnValue")
-            case ByteCode::Instruction::kScalarToMatrix: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                uint8_t columns = read<uint8_t>(ip);
-                uint8_t rows = read<uint8_t>(ip);
-                printf("scalarToMatrix $%d, %dx%d -> $%d\n", src.fIndex, columns, rows,
-                       target.fIndex);
-                break;
-            }
-            case ByteCode::Instruction::kSelect: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Register test = read<ByteCode::Register>(ip);
-                ByteCode::Register src1 = read<ByteCode::Register>(ip);
-                ByteCode::Register src2 = read<ByteCode::Register>(ip);
-                printf("select $%d, $%d, $%d -> %d\n", test.fIndex, src1.fIndex, src2.fIndex,
-                       target.fIndex);
-                break;
-            }
-            DISASSEMBLE_BINARY(kShiftLeft, "shiftLeft")
-            DISASSEMBLE_BINARY(kShiftRightS, "shiftRightS")
-            DISASSEMBLE_BINARY(kShiftRightU, "shiftRightU")
-            DISASSEMBLE_UNARY(kSignedToFloat, "signedToFloat")
-            DISASSEMBLE_UNARY(kSin, "sin")
-            DISASSEMBLE_UNARY(kSqrt, "sqrt")
-            DISASSEMBLE_UNARY(kStore, "store")
-            case ByteCode::Instruction::kStoreDirect: {
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("store $%d -> @%d\n", src.fIndex, target.fAddress);
-                break;
-            }
-            DISASSEMBLE_UNARY(kStoreParameter, "storeParameter")
-            case ByteCode::Instruction::kStoreParameterDirect: {
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("storeParameter $%d -> &%d\n", src.fIndex, target.fAddress);
-                break;
-            }
-            DISASSEMBLE_UNARY(kStoreStack, "storeStack")
-            case ByteCode::Instruction::kStoreStackDirect: {
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("storeStackDirect $%d -> @%d\n", src.fIndex, target.fAddress);
-                break;
-            }
-            DISASSEMBLE_UNARY(kTan, "tan")
-            DISASSEMBLE_UNARY(kUnsignedToFloat, "unsignedToFloat")
-            case ByteCode::Instruction::kWriteExternal: {
-                uint8_t index = read<uint8_t>(ip);
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("writeExternal $%d, %d -> %d\n", src.fIndex, count, index);
-                break;
-            }
-            default:
-                printf("unsupported: %d\n", (int) inst);
-                SkASSERT(false);
-        }
-    }
-
-    static Vector VecMod(Vector x, Vector y) {
-        return Vector(x.fFloat - skvx::trunc(x.fFloat / y.fFloat) * y.fFloat);
-    }
-
-    #define CHECK_STACK_BOUNDS(address)                              \
-        SkASSERT(context.fStack + address >= fMemory &&              \
-                 context.fStack + address <= fMemory + MEMORY_SIZE)
-
-    static void Inverse2x2(Vector* in, Vector* out) {
-        VectorF a = in[0].fFloat,
-                b = in[1].fFloat,
-                c = in[2].fFloat,
-                d = in[3].fFloat;
-        VectorF idet = VectorF(1) / (a*d - b*c);
-        printf("matrix in: %f, %f, %f, %f\n", a[0], b[0], c[0], d[0]);
-        out[0].fFloat = d * idet;
-        out[1].fFloat = -b * idet;
-        out[2].fFloat = -c * idet;
-        out[3].fFloat = a * idet;
-        printf("matrix out: %f, %f, %f, %f\n", out[0].fFloat[0], out[1].fFloat[0], out[2].fFloat[0], out[3].fFloat[0]);
-    }
-
-    static void Inverse3x3(Vector* in, Vector* out) {
-        VectorF a11 = in[0].fFloat, a12 = in[3].fFloat, a13 = in[6].fFloat,
-                a21 = in[1].fFloat, a22 = in[4].fFloat, a23 = in[7].fFloat,
-                a31 = in[2].fFloat, a32 = in[5].fFloat, a33 = in[8].fFloat;
-        VectorF idet = VectorF(1) / (a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
-                                     a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31);
-        out[0].fFloat = (a22 * a33 - a23 * a32) * idet;
-        out[1].fFloat = (a23 * a31 - a21 * a33) * idet;
-        out[2].fFloat = (a21 * a32 - a22 * a31) * idet;
-        out[3].fFloat = (a13 * a32 - a12 * a33) * idet;
-        out[4].fFloat = (a11 * a33 - a13 * a31) * idet;
-        out[5].fFloat = (a12 * a31 - a11 * a32) * idet;
-        out[6].fFloat = (a12 * a23 - a13 * a22) * idet;
-        out[7].fFloat = (a13 * a21 - a11 * a23) * idet;
-        out[8].fFloat = (a11 * a22 - a12 * a21) * idet;
-    }
-
-
-    static void Inverse4x4(Vector* in, Vector* out) {
-        #define inf(index)  in[index].fFloat
-        #define outf(index) out[index].fFloat
-        VectorF a00 = inf(0), a10 = inf(4), a20 = inf( 8), a30 = inf(12),
-                a01 = inf(1), a11 = inf(5), a21 = inf( 9), a31 = inf(13),
-                a02 = inf(2), a12 = inf(6), a22 = inf(10), a32 = inf(14),
-                a03 = inf(3), a13 = inf(7), a23 = inf(11), a33 = inf(15);
-
-        VectorF b00 = a00 * a11 - a01 * a10,
-                b01 = a00 * a12 - a02 * a10,
-                b02 = a00 * a13 - a03 * a10,
-                b03 = a01 * a12 - a02 * a11,
-                b04 = a01 * a13 - a03 * a11,
-                b05 = a02 * a13 - a03 * a12,
-                b06 = a20 * a31 - a21 * a30,
-                b07 = a20 * a32 - a22 * a30,
-                b08 = a20 * a33 - a23 * a30,
-                b09 = a21 * a32 - a22 * a31,
-                b10 = a21 * a33 - a23 * a31,
-                b11 = a22 * a33 - a23 * a32;
-
-        VectorF idet = VectorF(1) /
-                            (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
-
-        b00 *= idet;
-        b01 *= idet;
-        b02 *= idet;
-        b03 *= idet;
-        b04 *= idet;
-        b05 *= idet;
-        b06 *= idet;
-        b07 *= idet;
-        b08 *= idet;
-        b09 *= idet;
-        b10 *= idet;
-        b11 *= idet;
-
-        outf( 0) = a11 * b11 - a12 * b10 + a13 * b09;
-        outf( 1) = a02 * b10 - a01 * b11 - a03 * b09;
-        outf( 2) = a31 * b05 - a32 * b04 + a33 * b03;
-        outf( 3) = a22 * b04 - a21 * b05 - a23 * b03;
-        outf( 4) = a12 * b08 - a10 * b11 - a13 * b07;
-        outf( 5) = a00 * b11 - a02 * b08 + a03 * b07;
-        outf( 6) = a32 * b02 - a30 * b05 - a33 * b01;
-        outf( 7) = a20 * b05 - a22 * b02 + a23 * b01;
-        outf( 8) = a10 * b10 - a11 * b08 + a13 * b06;
-        outf( 9) = a01 * b08 - a00 * b10 - a03 * b06;
-        outf(10) = a30 * b04 - a31 * b02 + a33 * b00;
-        outf(11) = a21 * b02 - a20 * b04 - a23 * b00;
-        outf(12) = a11 * b07 - a10 * b09 - a12 * b06;
-        outf(13) = a00 * b09 - a01 * b07 + a02 * b06;
-        outf(14) = a31 * b01 - a30 * b03 - a32 * b00;
-        outf(15) = a20 * b03 - a21 * b01 + a22 * b00;
-        #undef inf
-        #undef outf
-    }
-
-    bool innerRun(const ByteCodeFunction* f, Context context, int baseIndex, Vector** outResult) {
-#ifdef SKSL_THREADED_CODE
-        static const void* labels[] = {
-            // If you aren't familiar with it, the &&label syntax is the GCC / Clang "labels as
-            // values" extension. If you add anything to this array, be sure to add the
-            // corresponding CHECK_LABEL() assert below.
-            &&kNop,
-            &&kAbort,
-            &&kAddF,
-            &&kAddI,
-            &&kAnd,
-            &&kBoundsCheck,
-            &&kBranch,
-            &&kBranchIfAllFalse,
-            &&kBreak,
-            &&kCall,
-            &&kCallExternal,
-            &&kCompareEQF,
-            &&kCompareEQI,
-            &&kCompareNEQF,
-            &&kCompareNEQI,
-            &&kCompareGTF,
-            &&kCompareGTS,
-            &&kCompareGTU,
-            &&kCompareGTEQF,
-            &&kCompareGTEQS,
-            &&kCompareGTEQU,
-            &&kCompareLTF,
-            &&kCompareLTS,
-            &&kCompareLTU,
-            &&kCompareLTEQF,
-            &&kCompareLTEQS,
-            &&kCompareLTEQU,
-            &&kContinue,
-            &&kCopy,
-            &&kCos,
-            &&kDivideF,
-            &&kDivideS,
-            &&kDivideU,
-            &&kFloatToSigned,
-            &&kFloatToUnsigned,
-            &&kImmediate,
-            &&kInverse2x2,
-            &&kInverse3x3,
-            &&kInverse4x4,
-            &&kLoad,
-            &&kLoadDirect,
-            &&kLoadParameter,
-            &&kLoadParameterDirect,
-            &&kLoadStack,
-            &&kLoadStackDirect,
-            &&kLoopBegin,
-            &&kLoopEnd,
-            &&kLoopMask,
-            &&kLoopNext,
-            &&kMaskNegate,
-            &&kMaskPop,
-            &&kMaskPush,
-            &&kMatrixMultiply,
-            &&kMatrixToMatrix,
-            &&kMultiplyF,
-            &&kMultiplyI,
-            &&kNegateF,
-            &&kNegateS,
-            &&kNot,
-            &&kOr,
-            &&kPrint,
-            &&kReadExternal,
-            &&kRemainderF,
-            &&kRemainderS,
-            &&kRemainderU,
-            &&kReturn,
-            &&kReturnValue,
-            &&kScalarToMatrix,
-            &&kSelect,
-            &&kShiftLeft,
-            &&kShiftRightS,
-            &&kShiftRightU,
-            &&kSignedToFloat,
-            &&kSin,
-            &&kSqrt,
-            &&kStore,
-            &&kStoreDirect,
-            &&kStoreParameter,
-            &&kStoreParameterDirect,
-            &&kStoreStack,
-            &&kStoreStackDirect,
-            &&kSubtractF,
-            &&kSubtractI,
-            &&kTan,
-            &&kUnsignedToFloat,
-            &&kWriteExternal,
-            &&kXor
-        };
-        CHECK_LABEL(kNop);
-        CHECK_LABEL(kAbort);
-        CHECK_LABEL(kAddF);
-        CHECK_LABEL(kAddI);
-        CHECK_LABEL(kAnd);
-        CHECK_LABEL(kBoundsCheck);
-        CHECK_LABEL(kBranch);
-        CHECK_LABEL(kBranchIfAllFalse);
-        CHECK_LABEL(kBreak);
-        CHECK_LABEL(kCall);
-        CHECK_LABEL(kCallExternal);
-        CHECK_LABEL(kCompareEQF);
-        CHECK_LABEL(kCompareEQI);
-        CHECK_LABEL(kCompareNEQF);
-        CHECK_LABEL(kCompareNEQI);
-        CHECK_LABEL(kCompareGTF);
-        CHECK_LABEL(kCompareGTS);
-        CHECK_LABEL(kCompareGTU);
-        CHECK_LABEL(kCompareGTEQF);
-        CHECK_LABEL(kCompareGTEQS);
-        CHECK_LABEL(kCompareGTEQU);
-        CHECK_LABEL(kCompareLTF);
-        CHECK_LABEL(kCompareLTS);
-        CHECK_LABEL(kCompareLTU);
-        CHECK_LABEL(kCompareLTEQF);
-        CHECK_LABEL(kCompareLTEQS);
-        CHECK_LABEL(kCompareLTEQU);
-        CHECK_LABEL(kContinue);
-        CHECK_LABEL(kCopy);
-        CHECK_LABEL(kCos);
-        CHECK_LABEL(kDivideF);
-        CHECK_LABEL(kDivideS);
-        CHECK_LABEL(kDivideU);
-        CHECK_LABEL(kFloatToSigned);
-        CHECK_LABEL(kFloatToUnsigned);
-        CHECK_LABEL(kImmediate);
-        CHECK_LABEL(kInverse2x2);
-        CHECK_LABEL(kInverse3x3);
-        CHECK_LABEL(kInverse4x4);
-        CHECK_LABEL(kLoad);
-        CHECK_LABEL(kLoadDirect);
-        CHECK_LABEL(kLoadParameter);
-        CHECK_LABEL(kLoadParameterDirect);
-        CHECK_LABEL(kLoadStack);
-        CHECK_LABEL(kLoadStackDirect);
-        CHECK_LABEL(kLoopBegin);
-        CHECK_LABEL(kLoopEnd);
-        CHECK_LABEL(kLoopMask);
-        CHECK_LABEL(kLoopNext);
-        CHECK_LABEL(kMaskNegate);
-        CHECK_LABEL(kMaskPop);
-        CHECK_LABEL(kMaskPush);
-        CHECK_LABEL(kMatrixMultiply);
-        CHECK_LABEL(kMatrixToMatrix);
-        CHECK_LABEL(kMultiplyF);
-        CHECK_LABEL(kMultiplyI);
-        CHECK_LABEL(kNegateF);
-        CHECK_LABEL(kNegateS);
-        CHECK_LABEL(kNot);
-        CHECK_LABEL(kOr);
-        CHECK_LABEL(kPrint);
-        CHECK_LABEL(kReadExternal);
-        CHECK_LABEL(kRemainderF);
-        CHECK_LABEL(kRemainderS);
-        CHECK_LABEL(kRemainderU);
-        CHECK_LABEL(kReturn);
-        CHECK_LABEL(kReturnValue);
-        CHECK_LABEL(kScalarToMatrix);
-        CHECK_LABEL(kSelect);
-        CHECK_LABEL(kShiftLeft);
-        CHECK_LABEL(kShiftRightS);
-        CHECK_LABEL(kShiftRightU);
-        CHECK_LABEL(kSignedToFloat);
-        CHECK_LABEL(kSin);
-        CHECK_LABEL(kSqrt);
-        CHECK_LABEL(kStore);
-        CHECK_LABEL(kStoreDirect);
-        CHECK_LABEL(kStoreParameter);
-        CHECK_LABEL(kStoreParameterDirect);
-        CHECK_LABEL(kStoreStack);
-        CHECK_LABEL(kStoreStackDirect);
-        CHECK_LABEL(kSubtractF);
-        CHECK_LABEL(kSubtractI);
-        CHECK_LABEL(kTan);
-        CHECK_LABEL(kUnsignedToFloat);
-        CHECK_LABEL(kWriteExternal);
-        CHECK_LABEL(kXor);
-#endif
-        auto mask = [&]() { return *context.fMaskStack & *context.fLoopStack; };
-        auto parameterBase = [&]() {
-            return context.fCallStack.empty() ? context.fStack
-                                              : context.fCallStack.top().fParameters;
-        };
-        const uint8_t* code = f->fCode.data();
-        const uint8_t* ip = code;
-#ifdef SKSL_THREADED_CODE
-        #ifdef TRACE
-            const uint8_t* trace_ip = ip;
-            printf("0: ");
-            disassemble(&trace_ip);
-        #endif
-        goto *labels[(int) read<ByteCode::Instruction>(&ip)];
-#else
-        for (;;) {
-            #ifdef TRACE
-                const uint8_t* trace_ip = ip;
-                disassemble(&trace_ip);
-            #endif
-            ByteCode::Instruction inst = read<ByteCode::Instruction>(&ip);
-            switch (inst) {
-#endif
-                BINARY_OP(kAddF, fFloat, fFloat, +)
-                BINARY_OP(kAddI, fInt, fInt, +)
-                BINARY_OP(kAnd, fInt, fInt, &)
-                BINARY_OP(kCompareEQF, fFloat, fInt, ==)
-                BINARY_OP(kCompareEQI, fInt, fInt, ==)
-                BINARY_OP(kCompareNEQF, fFloat, fInt, !=)
-                BINARY_OP(kCompareNEQI, fInt, fInt, !=)
-                BINARY_OP(kCompareGTF, fFloat, fInt, >)
-                BINARY_OP(kCompareGTS, fInt, fInt, >)
-                BINARY_OP(kCompareGTU, fUInt, fUInt, >)
-                BINARY_OP(kCompareGTEQF, fFloat, fInt, >=)
-                BINARY_OP(kCompareGTEQS, fInt, fInt, >=)
-                BINARY_OP(kCompareGTEQU, fUInt, fUInt, >=)
-                BINARY_OP(kCompareLTF, fFloat, fInt, <)
-                BINARY_OP(kCompareLTS, fInt, fInt, <)
-                BINARY_OP(kCompareLTU, fUInt, fUInt, <)
-                BINARY_OP(kCompareLTEQF, fFloat, fInt, <=)
-                BINARY_OP(kCompareLTEQS, fInt, fInt, <=)
-                BINARY_OP(kCompareLTEQU, fUInt, fUInt, <=)
-                BINARY_OP(kSubtractF, fFloat, fFloat, -)
-                BINARY_OP(kSubtractI, fInt, fInt, -)
-                BINARY_OP(kDivideF, fFloat, fFloat, /)
-                MASKED_BINARY_OP(kDivideS, fInt, fInt, /)
-                MASKED_BINARY_OP(kDivideU, fUInt, fUInt, /)
-                MASKED_BINARY_OP(kRemainderS, fInt, fInt, %)
-                MASKED_BINARY_OP(kRemainderU, fUInt, fUInt, %)
-                BINARY_OP(kMultiplyF, fFloat, fFloat, *)
-                BINARY_OP(kMultiplyI, fInt, fInt, *)
-                BINARY_OP(kOr, fInt, fInt, |)
-                BINARY_OP(kXor, fInt, fInt, ^)
-                LABEL(kAbort)
-                    SkASSERT(false);
-                    return false;
-                LABEL(kBoundsCheck) {
-                    ByteCode::Register r = read<ByteCode::Register>(&ip);
-                    int length = read<int>(&ip);
-                    if (skvx::any(mask() & ((fRegisters[r.fIndex].fInt < 0) |
-                                            (fRegisters[r.fIndex].fInt >= length)))) {
-                        return false;
-                    }
-                    NEXT();
-                }
-                LABEL(kBranch) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    ip = code + target.fAddress;
-                    NEXT();
-                }
-                LABEL(kBranchIfAllFalse) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    if (!skvx::any(mask())) {
-                        ip = code + target.fAddress;
-                    }
-                    NEXT();
-                }
-                LABEL(kBreak)
-                    *context.fLoopStack &= ~mask();
-                    NEXT();
-                LABEL(kCall) {
-                    ByteCode::Register returnValue = read<ByteCode::Register>(&ip);
-                    uint8_t idx = read<uint8_t>(&ip);
-                    ByteCode::Register args = read<ByteCode::Register>(&ip);
-                    const ByteCodeFunction* target = fCode->fFunctions[idx].get();
-                    int stackSlotCount = target->fStackSlotCount + target->fParameterSlotCount;
-                    context.fCallStack.push(StackFrame(f, ip, stackSlotCount,
-                                                       &fRegisters[args.fIndex],
-                                                       &fRegisters[returnValue.fIndex]));
-                    f = target;
-                    code = f->fCode.data();
-                    ip = code;
-                    context.fStack -= stackSlotCount;
-                    memcpy(context.fStack, &fRegisters[args.fIndex],
-                           f->fParameterSlotCount * sizeof(Vector));
-                    NEXT();
-                }
-                LABEL(kCallExternal) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    uint8_t index = read<uint8_t>(&ip);
-                    uint8_t targetSize = read<uint8_t>(&ip);
-                    ByteCode::Register arguments = read<ByteCode::Register>(&ip);
-                    uint8_t argumentSize = read<uint8_t>(&ip);
-                    ExternalValue* v = fCode->fExternalValues[index];
-                    float tmpReturn[64];
-                    SkASSERT(targetSize < 64);
-                    float tmpArgs[64];
-                    SkASSERT(argumentSize < 64);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < argumentSize; j++) {
-                                tmpArgs[j] = fRegisters[arguments.fIndex + j].fFloat[i];
-                            }
-                            v->call(baseIndex + i, tmpArgs, tmpReturn);
-                            for (int j = 0; j < targetSize; j++) {
-                                fRegisters[target.fIndex + j].fFloat[i] = tmpReturn[j];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kContinue) {
-                    VectorI m = mask();
-                    *context.fContinueStack |= m;
-                    *context.fLoopStack &= ~m;
-                    NEXT();
-                }
-                LABEL(kCopy) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt;
-                    NEXT();
-                }
-                VECTOR_UNARY_FN(kCos, cosf)
-                LABEL(kFloatToSigned) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = Vector(skvx::cast<int32_t>(
-                                                       fRegisters[src.fIndex].fFloat));
-                    NEXT();
-                }
-                LABEL(kFloatToUnsigned) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = Vector(skvx::cast<uint32_t>(
-                                                       fRegisters[src.fIndex].fFloat));
-                    NEXT();
-                }
-                LABEL(kImmediate) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Immediate src = read<ByteCode::Immediate>(&ip);
-                    fRegisters[target.fIndex].fInt = src.fInt;
-                    NEXT();
-                }
-                LABEL(kInverse2x2) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Inverse2x2(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
-                    NEXT();
-                }
-                LABEL(kInverse3x3) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Inverse3x3(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
-                    NEXT();
-                }
-                LABEL(kInverse4x4) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Inverse4x4(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
-                    NEXT();
-                }
-                LABEL(kLoad) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fRegisters[target.fIndex].fInt[i] =
-                                                    fMemory[fRegisters[src.fIndex].fInt[i]].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadDirect) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    fRegisters[target.fIndex].fInt = fMemory[src.fAddress].fInt;
-                    NEXT();
-                }
-                LABEL(kLoadParameter) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fRegisters[target.fIndex].fInt[i] =
-                                                       base[fRegisters[src.fIndex].fInt[i]].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadParameterDirect) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    Vector* base = parameterBase();
-                    fRegisters[target.fIndex].fInt = base[src.fAddress].fInt;
-                    NEXT();
-                }
-                LABEL(kLoadStack) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fRegisters[target.fIndex].fInt[i] =
-                                             context.fStack[fRegisters[src.fIndex].fInt[i]].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadStackDirect) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    CHECK_STACK_BOUNDS(src.fAddress);
-                    fRegisters[target.fIndex].fInt = context.fStack[src.fAddress].fInt;
-                    NEXT();
-                }
-                LABEL(kLoopBegin) {
-                    context.fLoopStack[1] = context.fLoopStack[0];
-                    ++context.fLoopStack;
-                    context.fContinueStack[1] = 0;
-                    ++context.fContinueStack;
-                    NEXT();
-                }
-                LABEL(kLoopEnd) {
-                    --context.fLoopStack;
-                    --context.fContinueStack;
-                    NEXT();
-                }
-                LABEL(kLoopMask) {
-                    ByteCode::Register value = read<ByteCode::Register>(&ip);
-                    *context.fLoopStack &= fRegisters[value.fIndex].fInt;
-                    NEXT();
-                }
-                LABEL(kLoopNext) {
-                    *context.fLoopStack |= *context.fContinueStack;
-                    *context.fContinueStack = 0;
-                    NEXT();
-                }
-                LABEL(kMaskNegate) {
-                    *context.fMaskStack = context.fMaskStack[-1] & ~context.fCondStack[0];
-                    NEXT();
-                }
-                LABEL(kMaskPop) {
-                    --context.fMaskStack;
-                    --context.fCondStack;
-                    NEXT();
-                }
-                LABEL(kMaskPush) {
-                    ByteCode::Register value = read<ByteCode::Register>(&ip);
-                    context.fCondStack[1] = fRegisters[value.fIndex].fInt;
-                    context.fMaskStack[1] = context.fMaskStack[0] & context.fCondStack[1];
-                    ++context.fCondStack;
-                    ++context.fMaskStack;
-                    NEXT();
-                }
-                LABEL(kMatrixMultiply) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register left = read<ByteCode::Register>(&ip);
-                    ByteCode::Register right = read<ByteCode::Register>(&ip);
-                    uint8_t lCols = read<uint8_t>(&ip);
-                    uint8_t lRows = read<uint8_t>(&ip);
-                    uint8_t rCols = read<uint8_t>(&ip);
-                    uint8_t rRows = lCols;
-                    memset(&fRegisters[target.fIndex], 0, sizeof(Vector) * rCols * lRows);
-                    for (int c = 0; c < rCols; ++c) {
-                        for (int r = 0; r < lRows; ++r) {
-                            for (int j = 0; j < lCols; ++j) {
-                                fRegisters[target.fIndex + c * lRows + r].fFloat +=
-                                        fRegisters[left.fIndex + j * lRows + r].fFloat *
-                                        fRegisters[right.fIndex + c * rRows + j].fFloat;
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kMatrixToMatrix) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    uint8_t srcColumns = read<uint8_t>(&ip);
-                    uint8_t srcRows = read<uint8_t>(&ip);
-                    uint8_t dstColumns = read<uint8_t>(&ip);
-                    uint8_t dstRows = read<uint8_t>(&ip);
-                    int offset = 0;
-                    for (int i = 0; i < dstColumns; ++i) {
-                        for (int j = 0; j < dstRows; ++j) {
-                            if (i < srcColumns && j < srcRows) {
-                                fRegisters[target.fIndex + offset] =
-                                                         fRegisters[src.fIndex + (srcRows * i) + j];
-                            } else {
-                                if (i == j) {
-                                    fRegisters[target.fIndex + offset].fFloat = 1;
-                                } else {
-                                    fRegisters[target.fIndex + offset].fFloat = 0;
-                                }
-                            }
-                            ++offset;
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kNegateF) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fFloat = -fRegisters[src.fIndex].fFloat;
-                    NEXT();
-                }
-                LABEL(kNegateS) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fInt = -fRegisters[src.fIndex].fInt;
-                    NEXT();
-                }
-                LABEL(kNop)
-                    NEXT();
-                LABEL(kNot) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fInt = ~fRegisters[src.fIndex].fInt;
-                    NEXT();
-                }
-                LABEL(kPrint) {
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    if (skvx::any(mask())) {
-                        printf("[");
-                        const char* separator = "";
-                        for (int i = 0; i < width; ++i) {
-                            if (mask()[i]) {
-                                printf("%s%f", separator, fRegisters[src.fIndex].fFloat[i]);
-                            }
-                            else {
-                                printf("%s-", separator);
-                            }
-                            separator = ", ";
-                        }
-                        printf("]\n");
-                    }
-                    NEXT();
-                }
-                LABEL(kReadExternal) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    uint8_t count = read<uint8_t>(&ip);
-                    uint8_t index = read<uint8_t>(&ip);
-                    SkASSERT(count <= 4);
-                    SkASSERT(fCode->fExternalValues.size() > index);
-                    float tmp[4];
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fCode->fExternalValues[index]->read(baseIndex + i, tmp);
-                            for (int j = 0; j < count; ++j) {
-                                fRegisters[target.fIndex + j].fFloat[i] = tmp[j];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kRemainderF) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src1 = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src2 = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = VecMod(fRegisters[src1.fIndex],
-                                                       fRegisters[src2.fIndex]);
-                    NEXT();
-                }
-                LABEL(kReturn) {
-                    if (context.fCallStack.empty()) {
-                        return true;
-                    }
-                    StackFrame frame = context.fCallStack.top();
-                    f = frame.fFunction;
-                    code = f->fCode.data();
-                    ip = frame.fIP;
-                    context.fStack += frame.fStackSlotCount;
-                    context.fCallStack.pop();
-                    NEXT();
-                }
-                LABEL(kReturnValue) {
-                    ByteCode::Register returnValue = read<ByteCode::Register>(&ip);
-                    if (context.fCallStack.empty()) {
-                        if (outResult) {
-                            *outResult = &fRegisters[returnValue.fIndex];
-                        }
-                        return true;
-                    }
-                    StackFrame frame = context.fCallStack.top();
-                    ip = frame.fIP;
-                    context.fStack += frame.fStackSlotCount;
-                    memcpy(frame.fReturnValue, &fRegisters[returnValue.fIndex],
-                           sizeof(Vector) * f->fReturnSlotCount);
-                    f = frame.fFunction;
-                    code = f->fCode.data();
-                    context.fCallStack.pop();
-                    NEXT();
-                }
-                LABEL(kScalarToMatrix) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    uint8_t columns = read<uint8_t>(&ip);
-                    uint8_t rows = read<uint8_t>(&ip);
-                    int offset = 0;
-                    for (int i = 0; i < columns; ++i) {
-                        for (int j = 0; j < rows; ++j) {
-                            if (i == j) {
-                                fRegisters[target.fIndex + offset] = fRegisters[src.fIndex];
-                            } else {
-                                fRegisters[target.fIndex + offset].fFloat = 0;
-                            }
-                            ++offset;
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kSelect) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register test = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src1 = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src2 = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = skvx::if_then_else(fRegisters[test.fIndex].fInt,
-                                                                   fRegisters[src1.fIndex].fFloat,
-                                                                   fRegisters[src2.fIndex].fFloat);
-                    NEXT();
-                }
-                LABEL(kShiftLeft) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    uint8_t count = read<uint8_t>(&ip);
-                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt << count;
-                    NEXT();
-                }
-                LABEL(kShiftRightS) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    int8_t count = read<int8_t>(&ip);
-                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt >> count;
-                    NEXT();
-                }
-                LABEL(kShiftRightU) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    uint8_t count = read<uint8_t>(&ip);
-                    fRegisters[target.fIndex].fUInt = fRegisters[src.fIndex].fUInt >> count;
-                    NEXT();
-                }
-                LABEL(kSignedToFloat) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = Vector(skvx::cast<float>(
-                                                                      fRegisters[src.fIndex].fInt));
-                    NEXT();
-                }
-                VECTOR_UNARY_FN(kSin, sinf)
-                LABEL(kSqrt) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fFloat = skvx::sqrt(fRegisters[src.fIndex].fFloat);
-                    NEXT();
-                }
-                LABEL(kStore) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fMemory[fRegisters[target.fIndex].fInt[i]].fInt[i] =
-                                                                     fRegisters[src.fIndex].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreDirect) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fMemory[target.fAddress] = skvx::if_then_else(mask(),
-                                                                  fRegisters[src.fIndex].fFloat,
-                                                                  fMemory[target.fAddress].fFloat);
-                    NEXT();
-                }
-                LABEL(kStoreParameter) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            base[fRegisters[target.fIndex].fInt[i]].fInt[i] =
-                                                                     fRegisters[src.fIndex].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreParameterDirect) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    base[target.fAddress] = skvx::if_then_else(mask(),
-                                                               fRegisters[src.fIndex].fFloat,
-                                                               base[target.fAddress].fFloat);
-                    NEXT();
-                }
-                LABEL(kStoreStack) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            context.fStack[fRegisters[target.fIndex].fInt[i]].fInt[i] =
-                                                                     fRegisters[src.fIndex].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreStackDirect) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    CHECK_STACK_BOUNDS(target.fAddress);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    context.fStack[target.fAddress] = skvx::if_then_else(
-                                                            mask(),
-                                                            fRegisters[src.fIndex].fFloat,
-                                                            context.fStack[target.fAddress].fFloat);
-                    NEXT();
-                }
-                VECTOR_UNARY_FN(kTan, tanf)
-                LABEL(kUnsignedToFloat) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = Vector(skvx::cast<float>(
-                                                                     fRegisters[src.fIndex].fUInt));
-                    NEXT();
-                }
-                LABEL(kWriteExternal) {
-                    uint8_t index = read<uint8_t>(&ip);
-                    uint8_t count = read<uint8_t>(&ip);
-                    SkASSERT(count <= 4);
-                    SkASSERT(fCode->fExternalValues.size() > index);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    float tmp[4];
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < count; ++j) {
-                                tmp[j] = fRegisters[src.fIndex + j].fFloat[i];
-                            }
-                            fCode->fExternalValues[index]->write(baseIndex + i, tmp);
-                        }
-                    }
-                    NEXT();
-                }
-#ifndef SKSL_THREADED_CODE
-            }
-        }
-#endif
-    }
-
-    const std::unique_ptr<ByteCode> fCode;
-
-    void* fBackingStore;
-
-    Vector* fRegisters;
-
-    Vector* fMemory;
-
-    friend class ByteCode;
-
-    friend class ByteCodeGenerator;
-};
-
-#undef BINARY_OP
-#undef CHECK_STACK_BOUNDS
-
-} // namespace
-
-#endif
diff --git a/src/sksl/SkSLUtil.cpp b/src/sksl/SkSLUtil.cpp
index 686ece3..b2c5162 100644
--- a/src/sksl/SkSLUtil.cpp
+++ b/src/sksl/SkSLUtil.cpp
@@ -9,10 +9,6 @@
 
 #include "src/sksl/SkSLStringStream.h"
 
-#if !defined(SKSL_STANDALONE) & SK_SUPPORT_GPU
-#include "include/gpu/GrContextOptions.h"
-#endif
-
 #ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
 #endif
@@ -77,177 +73,4 @@
     }
 }
 
-#if !defined(SKSL_STANDALONE) & SK_SUPPORT_GPU
-sk_sp<GrShaderCaps> ShaderCapsFactory::Default() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fShaderDerivativeSupport = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::Version450Core() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 450 core";
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::Version110() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 110";
-    result->fGLSLGeneration = GrGLSLGeneration::k110_GrGLSLGeneration;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::UsesPrecisionModifiers() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fUsesPrecisionModifiers = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::CannotUseMinAndAbsTogether() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fCanUseMinAndAbsTogether = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::CannotUseFractForNegativeValues() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fCanUseFractForNegativeValues = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::MustForceNegatedAtanParamToFloat() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fMustForceNegatedAtanParamToFloat = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::ShaderDerivativeExtensionString() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fShaderDerivativeSupport = true;
-    result->fShaderDerivativeExtensionString = "GL_OES_standard_derivatives";
-    result->fUsesPrecisionModifiers = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::FragCoordsOld() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 110";
-    result->fGLSLGeneration = GrGLSLGeneration::k110_GrGLSLGeneration;
-    result->fFragCoordConventionsExtensionString = "GL_ARB_fragment_coord_conventions";
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::FragCoordsNew() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fFragCoordConventionsExtensionString = "GL_ARB_fragment_coord_conventions";
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::GeometryShaderSupport() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fGeometryShaderSupport = true;
-    result->fGSInvocationsSupport = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::NoGSInvocationsSupport() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fGeometryShaderSupport = true;
-    result->fGSInvocationsSupport = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::GeometryShaderExtensionString() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 310es";
-    result->fGeometryShaderSupport = true;
-    result->fGeometryShaderExtensionString = "GL_EXT_geometry_shader";
-    result->fGSInvocationsSupport = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::GSInvocationsExtensionString() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fGeometryShaderSupport = true;
-    result->fGSInvocationsSupport = true;
-    result->fGSInvocationsExtensionString = "GL_ARB_gpu_shader5";
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::VariousCaps() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fExternalTextureSupport = true;
-    result->fFBFetchSupport = false;
-    result->fCanUseAnyFunctionInShader = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::CannotUseFragCoord() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fCanUseFragCoord = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::IncompleteShortIntPrecision() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 310es";
-    result->fUsesPrecisionModifiers = true;
-    result->fIncompleteShortIntPrecision = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::AddAndTrueToLoopCondition() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fAddAndTrueToLoopCondition = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::UnfoldShortCircuitAsTernary() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fUnfoldShortCircuitAsTernary = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::EmulateAbsIntFunction() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fEmulateAbsIntFunction = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::RewriteDoWhileLoops() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fRewriteDoWhileLoops = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::RemovePowWithConstantExponent() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fRemovePowWithConstantExponent = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::SampleMaskSupport() {
-    sk_sp<GrShaderCaps> result = Default();
-    result->fSampleMaskSupport = true;
-    return result;
-}
-#endif
-
 } // namespace
diff --git a/src/sksl/SkSLUtil.h b/src/sksl/SkSLUtil.h
index 3a2b42b..08f2842 100644
--- a/src/sksl/SkSLUtil.h
+++ b/src/sksl/SkSLUtil.h
@@ -18,11 +18,13 @@
 #ifndef SKSL_STANDALONE
 #include "include/core/SkTypes.h"
 #if SK_SUPPORT_GPU
-#include "include/core/SkRefCnt.h"
+#include "include/gpu/GrContextOptions.h"
 #include "src/gpu/GrShaderCaps.h"
 #endif // SK_SUPPORT_GPU
 #endif // SKSL_STANDALONE
 
+class GrShaderCaps;
+
 namespace SkSL {
 
 class OutputStream;
@@ -219,51 +221,176 @@
 // Various sets of caps for use in tests
 class ShaderCapsFactory {
 public:
-    static sk_sp<GrShaderCaps> Default();
+    static sk_sp<GrShaderCaps> Default() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fShaderDerivativeSupport = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> Version450Core();
+    static sk_sp<GrShaderCaps> Version450Core() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 450 core";
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> Version110();
+    static sk_sp<GrShaderCaps> Version110() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 110";
+        result->fGLSLGeneration = GrGLSLGeneration::k110_GrGLSLGeneration;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> UsesPrecisionModifiers();
+    static sk_sp<GrShaderCaps> UsesPrecisionModifiers() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fUsesPrecisionModifiers = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> CannotUseMinAndAbsTogether();
+    static sk_sp<GrShaderCaps> CannotUseMinAndAbsTogether() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fCanUseMinAndAbsTogether = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> CannotUseFractForNegativeValues();
+    static sk_sp<GrShaderCaps> CannotUseFractForNegativeValues() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fCanUseFractForNegativeValues = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> MustForceNegatedAtanParamToFloat();
+    static sk_sp<GrShaderCaps> MustForceNegatedAtanParamToFloat() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fMustForceNegatedAtanParamToFloat = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> ShaderDerivativeExtensionString();
+    static sk_sp<GrShaderCaps> ShaderDerivativeExtensionString() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fShaderDerivativeSupport = true;
+        result->fShaderDerivativeExtensionString = "GL_OES_standard_derivatives";
+        result->fUsesPrecisionModifiers = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> FragCoordsOld();
+    static sk_sp<GrShaderCaps> FragCoordsOld() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 110";
+        result->fGLSLGeneration = GrGLSLGeneration::k110_GrGLSLGeneration;
+        result->fFragCoordConventionsExtensionString = "GL_ARB_fragment_coord_conventions";
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> FragCoordsNew();
+    static sk_sp<GrShaderCaps> FragCoordsNew() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fFragCoordConventionsExtensionString = "GL_ARB_fragment_coord_conventions";
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> GeometryShaderSupport();
+    static sk_sp<GrShaderCaps> GeometryShaderSupport() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fGeometryShaderSupport = true;
+        result->fGSInvocationsSupport = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> NoGSInvocationsSupport();
+    static sk_sp<GrShaderCaps> NoGSInvocationsSupport() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fGeometryShaderSupport = true;
+        result->fGSInvocationsSupport = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> GeometryShaderExtensionString();
+    static sk_sp<GrShaderCaps> GeometryShaderExtensionString() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 310es";
+        result->fGeometryShaderSupport = true;
+        result->fGeometryShaderExtensionString = "GL_EXT_geometry_shader";
+        result->fGSInvocationsSupport = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> GSInvocationsExtensionString();
+    static sk_sp<GrShaderCaps> GSInvocationsExtensionString() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fGeometryShaderSupport = true;
+        result->fGSInvocationsSupport = true;
+        result->fGSInvocationsExtensionString = "GL_ARB_gpu_shader5";
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> VariousCaps();
+    static sk_sp<GrShaderCaps> VariousCaps() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fExternalTextureSupport = true;
+        result->fFBFetchSupport = false;
+        result->fCanUseAnyFunctionInShader = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> CannotUseFragCoord();
+    static sk_sp<GrShaderCaps> CannotUseFragCoord() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fCanUseFragCoord = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> IncompleteShortIntPrecision();
+    static sk_sp<GrShaderCaps> IncompleteShortIntPrecision() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 310es";
+        result->fUsesPrecisionModifiers = true;
+        result->fIncompleteShortIntPrecision = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> AddAndTrueToLoopCondition();
+    static sk_sp<GrShaderCaps> AddAndTrueToLoopCondition() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fAddAndTrueToLoopCondition = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> UnfoldShortCircuitAsTernary();
+    static sk_sp<GrShaderCaps> UnfoldShortCircuitAsTernary() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fUnfoldShortCircuitAsTernary = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> EmulateAbsIntFunction();
+    static sk_sp<GrShaderCaps> EmulateAbsIntFunction() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fEmulateAbsIntFunction = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> RewriteDoWhileLoops();
+    static sk_sp<GrShaderCaps> RewriteDoWhileLoops() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fRewriteDoWhileLoops = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> RemovePowWithConstantExponent();
+    static sk_sp<GrShaderCaps> RemovePowWithConstantExponent() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fRemovePowWithConstantExponent = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> SampleMaskSupport();
+    static sk_sp<GrShaderCaps> SampleMaskSupport() {
+        sk_sp<GrShaderCaps> result = Default();
+        result->fSampleMaskSupport = true;
+        return result;
+    }
 };
 #endif
 
diff --git a/src/sksl/ir/SkSLFunctionDeclaration.h b/src/sksl/ir/SkSLFunctionDeclaration.h
index f7ce904..11b04a5 100644
--- a/src/sksl/ir/SkSLFunctionDeclaration.h
+++ b/src/sksl/ir/SkSLFunctionDeclaration.h
@@ -36,7 +36,7 @@
         for (auto p : fParameters) {
             result += separator;
             separator = ", ";
-            result += p->fType.displayName();
+            result += p->fName;
         }
         result += ")";
         return result;
diff --git a/src/sksl/ir/SkSLSymbolTable.cpp b/src/sksl/ir/SkSLSymbolTable.cpp
index bbf001d..ed2cb4d 100644
--- a/src/sksl/ir/SkSLSymbolTable.cpp
+++ b/src/sksl/ir/SkSLSymbolTable.cpp
@@ -114,7 +114,9 @@
                 break;
             case Symbol::kUnresolvedFunction_Kind:
                 for (auto& f : ((UnresolvedFunction&) *pair.second).fFunctions) {
-                    ((FunctionDeclaration*)f)->fBuiltin = true;
+                    if (!((FunctionDeclaration*)f)->fDefined) {
+                        ((FunctionDeclaration*)f)->fBuiltin = true;
+                    }
                 }
                 break;
             default:
diff --git a/src/sksl/sksl_interp.inc b/src/sksl/sksl_interp.inc
index e576f9f..f43f05f 100644
--- a/src/sksl/sksl_interp.inc
+++ b/src/sksl/sksl_interp.inc
@@ -1,7 +1,5 @@
 STRINGIFY(
 
-sk_has_side_effects void print(float f);
-
 $genType cos($genType y);
 $genHType cos($genHType y);
 float dot($genType x, $genType y);
diff --git a/tests/SkSLInterpreterTest.cpp b/tests/SkSLInterpreterTest.cpp
index 6aac094..8251269 100644
--- a/tests/SkSLInterpreterTest.cpp
+++ b/tests/SkSLInterpreterTest.cpp
@@ -9,12 +9,20 @@
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLCompiler.h"
 #include "src/sksl/SkSLExternalValue.h"
-#include "src/sksl/SkSLInterpreter.h"
 #include "src/utils/SkJSON.h"
 
 #include "tests/Test.h"
 
-void test(skiatest::Reporter* r, const char* src, int count, float* in, float* expected,
+static bool nearly_equal(const float a[], const float b[], int count) {
+    for (int i = 0; i < count; ++i) {
+        if (!SkScalarNearlyEqual(a[i], b[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void test(skiatest::Reporter* r, const char* src, float* in, float* expected,
           bool exactCompare = true) {
     SkSL::Compiler compiler;
     SkSL::Program::Settings settings;
@@ -31,14 +39,30 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkSL::Interpreter<1> interpreter(std::move(byteCode));
-        SkSL::ByteCode::Vector<1>* result;
-        bool success = interpreter.run(main, (SkSL::ByteCode::Vector<1>*) in, &result);
-        REPORTER_ASSERT(r, success);
-        for (int i = 0; i < count; ++i) {
-            printf("%d: expected %f, received %f\n", i, expected[i], result->fFloat[i]);
-            REPORTER_ASSERT(r, result->fFloat[i] == expected[i]);
+        int returnCount = main->getReturnCount();
+        std::unique_ptr<float[]> out = std::unique_ptr<float[]>(new float[returnCount]);
+        SkAssertResult(byteCode->run(main, in, main->getParameterCount(), out.get(), returnCount,
+                                     nullptr, 0));
+        bool valid = exactCompare ? !memcmp(out.get(), expected, sizeof(float) * returnCount)
+                                  : nearly_equal(out.get(), expected, returnCount);
+        if (!valid) {
+            printf("for program: %s\n", src);
+            printf("    expected (");
+            const char* separator = "";
+            for (int i = 0; i < returnCount; ++i) {
+                printf("%s%f", separator, expected[i]);
+                separator = ", ";
+            }
+            printf("), but received (");
+            separator = "";
+            for (int i = 0; i < returnCount; ++i) {
+                printf("%s%f", separator, out.get()[i]);
+                separator = ", ";
+            }
+            printf(")\n");
+            main->disassemble();
         }
+        REPORTER_ASSERT(r, valid);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }
@@ -59,8 +83,7 @@
         return;
     }
 
-    const SkSL::ByteCodeFunction* main1 = byteCode->getFunction("main");
-    SkSL::Interpreter<1> interpreter1(std::move(byteCode));
+    const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
 
     // Test on four different vectors (with varying orderings to get divergent control flow)
     const float input[16] = { 1, 2, 3, 4,
@@ -74,16 +97,9 @@
 
     // First run in scalar mode to determine the expected output
     for (int i = 0; i < 4; ++i) {
-        SkAssertResult(interpreter1.run(main1, (SkSL::ByteCode::Vector<1>*) (out_s + i * 4),
-                       nullptr));
+        SkAssertResult(byteCode->run(main, out_s + i * 4, 4, nullptr, 0, nullptr, 0));
     }
 
-    byteCode = compiler.toByteCode(*program);
-    SkASSERT(compiler.errorCount() == 0);
-
-    const SkSL::ByteCodeFunction* main4 = byteCode->getFunction("main");
-    SkSL::Interpreter<4> interpreter4(std::move(byteCode));
-
     // Need to transpose input vectors for striped execution
     auto transpose = [](float* v) {
         for (int r = 0; r < 4; ++r)
@@ -96,7 +112,7 @@
     float* args[] = { out_v, out_v + 4, out_v + 8, out_v + 12 };
 
     // Now run in parallel and compare results
-    SkAssertResult(interpreter4.runStriped(main4, 4, (float**) args));
+    SkAssertResult(byteCode->runStriped(main, 4, args, 4, nullptr, 0, nullptr, 0));
 
     // Transpose striped outputs back
     transpose(out_v);
@@ -109,7 +125,7 @@
                     out_v[4*i + 0], out_v[4*i + 1], out_v[4*i + 2], out_v[4*i + 3],
                     out_s[4*i + 0], out_s[4*i + 1], out_s[4*i + 2], out_s[4*i + 3]);
         }
-        main4->disassemble();
+        main->disassemble();
         REPORT_FAILURE(r, "VecInterpreter mismatch", SkString());
     }
 }
@@ -131,26 +147,20 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkSL::ByteCode::Vector<1> inoutColor[4];
-        inoutColor[0].fFloat[0] = inR;
-        inoutColor[1].fFloat[0] = inG;
-        inoutColor[2].fFloat[0] = inB;
-        inoutColor[3].fFloat[0] = inA;
-        SkSL::Interpreter<1> interpreter(std::move(byteCode));
-        bool success = interpreter.run(main, inoutColor, nullptr);
-        REPORTER_ASSERT(r, success);
-        if (inoutColor[0].fFloat[0] != expectedR || inoutColor[1].fFloat[0] != expectedG ||
-            inoutColor[2].fFloat[0] != expectedB || inoutColor[3].fFloat[0] != expectedA) {
+        float inoutColor[4] = { inR, inG, inB, inA };
+        SkAssertResult(byteCode->run(main, inoutColor, 4, nullptr, 0, nullptr, 0));
+        if (inoutColor[0] != expectedR || inoutColor[1] != expectedG ||
+            inoutColor[2] != expectedB || inoutColor[3] != expectedA) {
             printf("for program: %s\n", src);
             printf("    expected (%f, %f, %f, %f), but received (%f, %f, %f, %f)\n", expectedR,
-                   expectedG, expectedB, expectedA, inoutColor[0].fFloat[0],
-                   inoutColor[1].fFloat[0], inoutColor[2].fFloat[0], inoutColor[3].fFloat[0]);
+                   expectedG, expectedB, expectedA, inoutColor[0], inoutColor[1], inoutColor[2],
+                   inoutColor[3]);
             main->disassemble();
         }
-        REPORTER_ASSERT(r, inoutColor[0].fFloat[0] == expectedR);
-        REPORTER_ASSERT(r, inoutColor[1].fFloat[0] == expectedG);
-        REPORTER_ASSERT(r, inoutColor[2].fFloat[0] == expectedB);
-        REPORTER_ASSERT(r, inoutColor[3].fFloat[0] == expectedA);
+        REPORTER_ASSERT(r, inoutColor[0] == expectedR);
+        REPORTER_ASSERT(r, inoutColor[1] == expectedG);
+        REPORTER_ASSERT(r, inoutColor[2] == expectedB);
+        REPORTER_ASSERT(r, inoutColor[3] == expectedA);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }
@@ -158,7 +168,7 @@
     // Do additional testing of 4x1 vs 1x4 to stress divergent control flow, etc.
     vec_test(r, src);
 }
-/*
+
 DEF_TEST(SkSLInterpreterAdd, r) {
     test(r, "void main(inout half4 color) { color.r = color.r + color.g; }", 0.25, 0.75, 0, 0, 1,
          0.75, 0, 0);
@@ -167,10 +177,6 @@
          0.5, 1, 1.5, 2);
     test(r, "void main(inout half4 color) { color.r = int(color.r) + int(color.g); }", 1, 3, 0, 0,
          4, 3, 0, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r + color.gb; }", 1, 2, 3, 4,
-         3, 4, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg + color.b; }", 1, 2, 3, 4,
-         4, 5, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterSubtract, r) {
@@ -183,10 +189,6 @@
     test(r, "void main(inout half4 color) { color = -color; }", 4, 3, 2, 1, -4, -3, -2, -1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) - int(color.g); }", 3, 1, 0, 0,
          2, 1, 0, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r - color.gb; }", 1, 2, 3, 4,
-         -1, -2, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg - color.b; }", 1, 2, 3, 4,
-         -2, -1, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterMultiply, r) {
@@ -198,10 +200,6 @@
          16, 9, 4, 1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) * int(color.g); }", 3, -2, 0, 0,
          -6, -2, 0, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r * color.gb; }", 5, 2, 3, 4,
-         10, 15, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg * color.b; }", 1, 2, 3, 4,
-         3, 6, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterDivide, r) {
@@ -213,10 +211,6 @@
          1, 1, 1, 1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) / int(color.g); }", 8, -2, 0, 0,
          -4, -2, 0, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r / color.gb; }", 12, 2, 3, 4,
-         6, 4, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg / color.b; }", 6, 3, 3, 4,
-         2, 1, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterRemainder, r) {
@@ -228,14 +222,6 @@
          2, 3, 0, 0);
     test(r, "void main(inout half4 color) { color.rg = half2(int2(int(color.r), int(color.g)) % "
                 "int(color.b)); }", 8, 10, 6, 0, 2, 4, 6, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r + color.gb; }", 1, 2, 3, 4,
-         3, 4, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg + color.b; }", 1, 2, 3, 4,
-         4, 5, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.r % color.gb; }", 10, 2, 3, 4,
-         0, 1, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg % color.b; }", 6, 3, 4, 4,
-         2, 3, 4, 4);
 }
 
 DEF_TEST(SkSLInterpreterAnd, r) {
@@ -292,7 +278,7 @@
     unsigned out;
 
     out = 0x00000088;
-    test(r, "int main(int x) { return x << 3; }", (float*)&in, (float*)&out);
+    test(r, "int  main(int  x) { return x << 3; }", (float*)&in, (float*)&out);
 
     out = 0xF0000002;
     test(r, "int main(int x) { return x >> 3; }", (float*)&in, (float*)&out);
@@ -488,15 +474,11 @@
     test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
          1, 2, 1, 2, 1, 2, 1, 1);
     test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
-         1, 2, 1, 3, 1, 2, 1, 3);
-    test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
          1, 2, 3, 2, 1, 2, 3, 2);
     test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
          1, 2, 1, 2, 1, 2, 1, 2);
     test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
          1, 2, 3, 2, 1, 2, 3, 1);
-    test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
-         1, 2, 1, 3, 1, 2, 1, 1);
 }
 
 DEF_TEST(SkSLInterpreterWhile, r) {
@@ -659,67 +641,51 @@
     SkIRect gRects[4] = { { 1,2,3,4 }, { 5,6,7,8 }, { 9,10,11,12 }, { 13,14,15,16 } };
     const float* fRects = (const float*)gRects;
 
-    SkSL::Interpreter<1> interpreter(std::move(byteCode));
-    auto geti = [](SkSL::Interpreter<1>::Vector* v) { return v->fInt[0]; };
-    auto getf = [](SkSL::Interpreter<1>::Vector* v) { return v->fFloat[0]; };
-
     {
         SkIRect in = SkIRect::MakeXYWH(10, 10, 20, 30);
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(rect_height, (SkSL::Interpreter<1>::Vector*) &in, &out);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, geti(out) == 30);
+        int out = 0;
+        SkAssertResult(byteCode->run(rect_height, (float*)&in, 4, (float*)&out, 1, fRects, 16));
+        REPORTER_ASSERT(r, out == 30);
     }
 
     {
         int in[2] = { 15, 25 };
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(make_blue_rect, (SkSL::Interpreter<1>::Vector*) in, &out);
-        REPORTER_ASSERT(r, success);
-        RectAndColor result{ { geti(out), geti(out + 1), geti(out + 2), geti(out + 3) },
-                             { getf(out + 4), getf(out + 5), getf(out + 6), getf(out + 7) } };
-        REPORTER_ASSERT(r, result.fRect.width() == 15);
-        REPORTER_ASSERT(r, result.fRect.height() == 25);
+        RectAndColor out;
+        SkAssertResult(byteCode->run(make_blue_rect, (float*)in, 2, (float*)&out, 8, fRects, 16));
+        REPORTER_ASSERT(r, out.fRect.width() == 15);
+        REPORTER_ASSERT(r, out.fRect.height() == 25);
         SkColor4f blue = { 0.0f, 1.0f, 0.0f, 1.0f };
-        REPORTER_ASSERT(r, result.fColor == blue);
+        REPORTER_ASSERT(r, out.fColor == blue);
     }
 
     {
         int in[15] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(median, (SkSL::Interpreter<1>::Vector*) in, &out);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, geti(out) == 8);
+        int out = 0;
+        SkAssertResult(byteCode->run(median, (float*)in, 15, (float*)&out, 1, fRects, 16));
+        REPORTER_ASSERT(r, out == 8);
     }
 
     {
         float in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(sums, (SkSL::Interpreter<1>::Vector*) in, &out);
-        REPORTER_ASSERT(r, success);
+        float out[8] = { 0 };
+        SkAssertResult(byteCode->run(sums, in, 8, out, 8, fRects, 16));
         for (int i = 0; i < 8; ++i) {
-            REPORTER_ASSERT(r, getf(out + i) == static_cast<float>((i + 1) * (i + 2) / 2));
+            REPORTER_ASSERT(r, out[i] == static_cast<float>((i + 1) * (i + 2) / 2));
         }
     }
 
     {
         int in = 2;
-        interpreter.setUniforms(fRects);
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(get_rect, (SkSL::Interpreter<1>::Vector*) &in, &out);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, geti(out) == gRects[2].fLeft);
-        REPORTER_ASSERT(r, geti(out + 1) == gRects[2].fTop);
-        REPORTER_ASSERT(r, geti(out + 2) == gRects[2].fRight);
-        REPORTER_ASSERT(r, geti(out + 3) == gRects[2].fBottom);
+        SkIRect out = SkIRect::MakeEmpty();
+        SkAssertResult(byteCode->run(get_rect, (float*)&in, 1, (float*)&out, 4, fRects, 16));
+        REPORTER_ASSERT(r, out == gRects[2]);
     }
 
     {
         ManyRects in;
         memset(&in, 0, sizeof(in));
         in.fNumRects = 2;
-        bool success = interpreter.run(fill_rects, (SkSL::Interpreter<1>::Vector*) &in, nullptr);
-        REPORTER_ASSERT(r, success);
+        SkAssertResult(byteCode->run(fill_rects, (float*)&in, 33, nullptr, 0, fRects, 16));
         ManyRects expected;
         memset(&expected, 0, sizeof(expected));
         expected.fNumRects = 2;
@@ -752,11 +718,9 @@
     auto byteCode = compiler.toByteCode(*program);
     REPORTER_ASSERT(r, byteCode);
 
-    auto main = byteCode->getFunction("main");
-    SkSL::Interpreter<1> interpreter(std::move(byteCode));
-    SkSL::ByteCode::Vector<1>* result;
-    bool success = interpreter.run(main, (SkSL::ByteCode::Vector<1>*) in, &result);
-    REPORTER_ASSERT(r, !success);
+    auto fun = byteCode->getFunction("main");
+    bool result = byteCode->run(fun, in, fun->getParameterCount(), nullptr, 0, nullptr, 0);
+    REPORTER_ASSERT(r, !result);
 }
 
 DEF_TEST(SkSLInterpreterRestrictFunctionCalls, r) {
@@ -822,21 +786,16 @@
     REPORTER_ASSERT(r, dot3);
     REPORTER_ASSERT(r, dot2);
 
-    SkSL::Interpreter<1> interpreter(std::move(byteCode));
+    float out = 0.0f;
     float in = 3.0f;
+    SkAssertResult(byteCode->run(main, &in, 1, &out, 1, nullptr, 0));
+    REPORTER_ASSERT(r, out == 6.0f);  // '==': check the result instead of overwriting it
 
-    SkSL::Interpreter<1>::Vector* out;
-    bool success = interpreter.run(main, (SkSL::Interpreter<1>::Vector*) &in, &out);
-    REPORTER_ASSERT(r, success);
-    REPORTER_ASSERT(r, out->fFloat[0] = 6.0f);
+    SkAssertResult(byteCode->run(dot3, &in, 1, &out, 1, nullptr, 0));
+    REPORTER_ASSERT(r, out == 9.0f);  // was an assignment, which is always truthy
 
-    success = interpreter.run(dot3, (SkSL::Interpreter<1>::Vector*) &in, &out);
-    REPORTER_ASSERT(r, success);
-    REPORTER_ASSERT(r, out->fFloat[0] = 9.0f);
-
-    success = interpreter.run(dot2, (SkSL::Interpreter<1>::Vector*) &in, &out);
-    REPORTER_ASSERT(r, success);
-    REPORTER_ASSERT(r, out->fFloat[0] = -1.0f);
+    SkAssertResult(byteCode->run(dot2, &in, 1, &out, 1, nullptr, 0));
+    REPORTER_ASSERT(r, out == -1.0f);  // was an assignment, which is always truthy
 }
 
 DEF_TEST(SkSLInterpreterOutParams, r) {
@@ -845,18 +804,15 @@
          "void main(inout half4 color) { oneAlpha(color); }",
          0, 0, 0, 0, 0, 0, 0, 1);
     test(r,
-         "half2 tricky(half x, half y, inout half2 color, half z, out half w) {"
+         "half2 tricky(half x, half y, inout half2 color, half z) {"
          "    color.xy = color.yx;"
-         "    w = 47;"
          "    return half2(x + y, z);"
          "}"
          "void main(inout half4 color) {"
-         "    half w;"
-         "    half2 t = tricky(1, 2, color.rb, 5, w);"
-         "    color.r += w;"
+         "    half2 t = tricky(1, 2, color.rb, 5);"
          "    color.ga = t;"
          "}",
-         1, 2, 3, 4, 50, 3, 1, 5);
+         1, 2, 3, 4, 3, 3, 1, 5);
 }
 
 DEF_TEST(SkSLInterpreterMathFunctions, r) {
@@ -910,34 +866,342 @@
     float expected[] = { cross.x, cross.y, cross.z };
     test(r, "float3 main(float3 x, float3 y) { return cross(x, y); }", args, expected);
 }
-*/
+
 DEF_TEST(SkSLInterpreterInverse, r) {
     {
-        printf("invert 2x2\n");
         SkMatrix m;
         m.setRotate(30).postScale(1, 2);
         float args[4] = { m[0], m[3], m[1], m[4] };
         SkAssertResult(m.invert(&m));
         float expt[4] = { m[0], m[3], m[1], m[4] };
-        test(r, "float2x2 main(float2x2 m) { return inverse(m); }", 4, args, expt, false);
+        test(r, "float2x2 main(float2x2 m) { return inverse(m); }", args, expt, false);
     }
     {
-        printf("invert 3x3\n");
         SkMatrix m;
         m.setRotate(30).postScale(1, 2).postTranslate(1, 2);
         float args[9] = { m[0], m[3], m[6], m[1], m[4], m[7], m[2], m[5], m[8] };
         SkAssertResult(m.invert(&m));
         float expt[9] = { m[0], m[3], m[6], m[1], m[4], m[7], m[2], m[5], m[8] };
-        test(r, "float3x3 main(float3x3 m) { return inverse(m); }", 4, args, expt, false);
+        test(r, "float3x3 main(float3x3 m) { return inverse(m); }", args, expt, false);
     }
     {
-        printf("invert 4x4\n");
         float args[16], expt[16];
         // just some crazy thing that is invertible
         SkM44 m = {1, 2, 3, 4, 1, 2, 0, 3, 1, 0, 1, 4, 1, 3, 2, 0};
         m.getColMajor(args);
         SkAssertResult(m.invert(&m));
-        m.asColMajorf(expt);
-        test(r, "float4x4 main(float4x4 m) { return inverse(m); }", 4, args, expt, false);
+        m.getColMajor(expt);
+        test(r, "float4x4 main(float4x4 m) { return inverse(m); }", args, expt, false);
+    }
+}
+
+DEF_TEST(SkSLInterpreterDot, r) {
+    // Reference result for an n-wide dot(): pairs inputs[i] with inputs[n + i] for i in [0, n).
+    float inputs[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f };
+    auto reference = [&inputs](int n) {
+        float sum = inputs[0] * inputs[n];
+        for (int i = 1; i < n; ++i) {
+            sum += inputs[i] * inputs[n + i];
+        }
+        return sum;
+    };
+    float want = reference(2);
+    test(r, "float main(float2 x, float2 y) { return dot(x, y); }", inputs, &want);
+    want = reference(3);
+    test(r, "float main(float3 x, float3 y) { return dot(x, y); }", inputs, &want);
+    want = reference(4);
+    test(r, "float main(float4 x, float4 y) { return dot(x, y); }", inputs, &want);
+}
+
+// Picks the SkSL type used to expose a JSON value: whole numbers map to int, other numbers
+// to float, booleans to bool, and every other kind of value to void.
+static const SkSL::Type& type_of(const skjson::Value* value, SkSL::Compiler* compiler) {
+    const auto& ctx = compiler->context();
+    const skjson::Value::Type kind = value->getType();
+    if (kind == skjson::Value::Type::kBool) {
+        return *ctx.fBool_Type;
+    }
+    if (kind != skjson::Value::Type::kNumber) {
+        return *ctx.fVoid_Type;
+    }
+    float number = *value->as<skjson::NumberValue>();
+    bool isWhole = number == (float) (int) number;
+    return isWhole ? *ctx.fInt_Type : *ctx.fFloat_Type;
+}
+
+// ExternalValue over one node of a parsed skjson document; type_of() chooses its SkSL type.
+class JSONExternalValue : public SkSL::ExternalValue {
+public:
+    JSONExternalValue(const char* name, const skjson::Value* value, SkSL::Compiler* compiler)
+        : INHERITED(name, type_of(value, compiler)), fValue(*value), fCompiler(*compiler) {}
+
+    bool canRead() const override {
+        return type() != *fCompiler.context().fVoid_Type;
+    }
+
+    void read(int /*unusedIndex*/, float* target) override {
+        const auto& ctx = fCompiler.context();
+        if (type() == *ctx.fBool_Type) {
+            // ByteCode "booleans" are actually bit-masks
+            *(int*) target = *fValue.as<skjson::BoolValue>() ? ~0 : 0;
+        } else if (type() == *ctx.fFloat_Type) {
+            *(float*) target = *fValue.as<skjson::NumberValue>();
+        } else if (type() == *ctx.fInt_Type) {
+            *(int*) target = *fValue.as<skjson::NumberValue>();
+        } else {
+            SkASSERT(false);
+        }
+    }
+
+    SkSL::ExternalValue* getChild(const char* name) const override {
+        if (fValue.getType() != skjson::Value::Type::kObject) {
+            return nullptr;
+        }
+        const skjson::Value& member = fValue.as<skjson::ObjectValue>()[name];
+        auto* child = new JSONExternalValue(name, &member, &fCompiler);
+        return (SkSL::ExternalValue*) fCompiler.takeOwnership(std::unique_ptr<Symbol>(child));
+    }
+
+private:
+    const skjson::Value& fValue;
+    SkSL::Compiler& fCompiler;
+
+    typedef SkSL::ExternalValue INHERITED;
+};
+
+// ExternalValue over caller-provided storage: read() copies fSize bytes out of fData and
+// write() copies fSize bytes into it. Nothing in this class allocates or frees that
+// storage.
+class PointerExternalValue : public SkSL::ExternalValue {
+public:
+    // `data` has to reference at least `size` bytes, since read()/write() copy that many.
+    PointerExternalValue(const char* name, const SkSL::Type& type, void* data, size_t size)
+        : INHERITED(name, type), fData(data), fSize(size) {}
+
+    // Reading and writing are both unconditionally allowed.
+    bool canRead() const override { return true; }
+    bool canWrite() const override { return true; }
+
+    // Copies the wrapped bytes out to target.
+    void read(int /*unusedIndex*/, float* target) override {
+        memcpy(target, fData, fSize);
+    }
+
+    // Overwrites the wrapped bytes with the contents of src.
+    void write(int /*unusedIndex*/, float* src) override {
+        memcpy(fData, src, fSize);
+    }
+
+private:
+    typedef SkSL::ExternalValue INHERITED;
+
+    // Wrapped storage and its size in bytes.
+    void* fData;
+    size_t fSize;
+};
+
+DEF_TEST(SkSLInterpreterExternalValues, r) {
+    const char* json = "{ \"value1\": 12, \"child\": { \"value2\": true, \"value3\": 5.5 } }";
+    skjson::DOM dom(json, strlen(json));
+    SkSL::Compiler compiler;
+    SkSL::Program::Settings settings;
+    const char* src = "float main() {"
+                      "    outValue = 152;"
+                      "    return root.child.value2 ? root.value1 * root.child.value3 : -1;"
+                      "}";
+    // "root" exposes the JSON document for reading; "outValue" receives writes into a host int.
+    auto* rootSymbol = compiler.takeOwnership(
+            std::unique_ptr<SkSL::Symbol>(new JSONExternalValue("root", &dom.root(), &compiler)));
+    compiler.registerExternalValue((SkSL::ExternalValue*) rootSymbol);
+    int32_t outValue = -1;
+    auto* outSymbol = compiler.takeOwnership(std::unique_ptr<SkSL::Symbol>(
+            new PointerExternalValue("outValue", *compiler.context().fInt_Type, &outValue,
+                                     sizeof(outValue))));
+    compiler.registerExternalValue((SkSL::ExternalValue*) outSymbol);
+    std::unique_ptr<SkSL::Program> program =
+            compiler.convertProgram(SkSL::Program::kGeneric_Kind, SkSL::String(src), settings);
+    REPORTER_ASSERT(r, program);
+    if (!program) {
+        printf("%s\n%s", src, compiler.errorText().c_str());
+        return;
+    }
+    std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
+    REPORTER_ASSERT(r, !compiler.errorCount());
+    if (compiler.errorCount() > 0) {
+        printf("%s\n%s", src, compiler.errorText().c_str());
+        return;
+    }
+    const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
+    float out;
+    SkAssertResult(byteCode->run(main, nullptr, 0, &out, 1, nullptr, 0));
+    REPORTER_ASSERT(r, out == 66.0);
+    REPORTER_ASSERT(r, outValue == 152);
+}
+
+DEF_TEST(SkSLInterpreterExternalValuesVector, r) {
+    SkSL::Compiler compiler;
+    SkSL::Program::Settings settings;
+    const char* src = "void main() {"
+                      "    value *= 2;"
+                      "}";
+    // "value" is a host-side int4 that the program doubles through the external value.
+    int32_t value[4] = { 1, 2, 3, 4 };
+    auto* valueSymbol = compiler.takeOwnership(std::unique_ptr<SkSL::Symbol>(
+            new PointerExternalValue("value", *compiler.context().fInt4_Type, value,
+                                     sizeof(value))));
+    compiler.registerExternalValue((SkSL::ExternalValue*) valueSymbol);
+
+    std::unique_ptr<SkSL::Program> program =
+            compiler.convertProgram(SkSL::Program::kGeneric_Kind, SkSL::String(src), settings);
+    REPORTER_ASSERT(r, program);
+    if (!program) {
+        printf("%s\n%s", src, compiler.errorText().c_str());
+        return;
+    }
+    std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
+    REPORTER_ASSERT(r, !compiler.errorCount());
+    if (compiler.errorCount() > 0) {
+        printf("%s\n%s", src, compiler.errorText().c_str());
+        return;
+    }
+    const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
+    SkAssertResult(byteCode->run(main, nullptr, 0, nullptr, 0, nullptr, 0));
+    REPORTER_ASSERT(r, value[0] == 2);
+    REPORTER_ASSERT(r, value[1] == 4);
+    REPORTER_ASSERT(r, value[2] == 6);
+    REPORTER_ASSERT(r, value[3] == 8);
+}
+
+// ExternalValue that SkSL can call as a float -> float function; the call is forwarded to a
+// host function pointer.
+class FunctionExternalValue : public SkSL::ExternalValue {
+public:
+    FunctionExternalValue(const char* name, float(*function)(float), SkSL::Compiler& compiler)
+        : INHERITED(name, *compiler.context().fFloat_Type)
+        , fCompiler(compiler)
+        , fFunction(function) {}
+
+    bool canCall() const override { return true; }
+
+    // Exactly one parameter, of SkSL type float.
+    int callParameterCount() const override { return 1; }
+
+    void getCallParameterTypes(const SkSL::Type** outTypes) const override {
+        *outTypes = fCompiler.context().fFloat_Type.get();
+    }
+
+    // Applies the wrapped function to arguments[0] and stores the result in outReturn[0].
+    void call(int /*unusedIndex*/, float* arguments, float* outReturn) override {
+        const float result = fFunction(*arguments);
+        *outReturn = result;
+    }
+
+private:
+    typedef SkSL::ExternalValue INHERITED;
+
+    SkSL::Compiler& fCompiler;
+    float (*fFunction)(float);
+};
+
+DEF_TEST(SkSLInterpreterExternalValuesCall, r) {
+    SkSL::Compiler compiler;
+    SkSL::Program::Settings settings;
+    const char* src = "float main() {"
+                      "    return external(25);"
+                      "}";
+
+    // "external" forwards to a host-side square root.
+    float (*hostSqrt)(float) = [] (float x) { return (float) sqrt(x); };
+    auto* external = compiler.takeOwnership(std::unique_ptr<SkSL::Symbol>(
+            new FunctionExternalValue("external", hostSqrt, compiler)));
+    compiler.registerExternalValue((SkSL::ExternalValue*) external);
+
+    std::unique_ptr<SkSL::Program> program =
+            compiler.convertProgram(SkSL::Program::kGeneric_Kind, SkSL::String(src), settings);
+    REPORTER_ASSERT(r, program);
+    if (!program) {
+        printf("%s\n%s", src, compiler.errorText().c_str());
+        return;
+    }
+    std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
+    REPORTER_ASSERT(r, !compiler.errorCount());
+    if (compiler.errorCount() > 0) {
+        printf("%s\n%s", src, compiler.errorText().c_str());
+        return;
+    }
+    const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
+    float out;
+    SkAssertResult(byteCode->run(main, nullptr, 0, &out, 1, nullptr, 0));
+    REPORTER_ASSERT(r, out == 5.0);
+}
+
+// ExternalValue that SkSL can call as a float4 -> float4 function; the call is forwarded to a
+// host function pointer.
+class VectorFunctionExternalValue : public SkSL::ExternalValue {
+public:
+    VectorFunctionExternalValue(const char* name, void(*function)(float[4], float[4]),
+                                SkSL::Compiler& compiler)
+        : INHERITED(name, *compiler.context().fFloat4_Type)
+        , fCompiler(compiler)
+        , fFunction(function) {}
+
+    bool canCall() const override { return true; }
+
+    // Exactly one parameter, of SkSL type float4.
+    int callParameterCount() const override { return 1; }
+
+    void getCallParameterTypes(const SkSL::Type** outTypes) const override {
+        *outTypes = fCompiler.context().fFloat4_Type.get();
+    }
+
+    // Hands the argument and return buffers directly to the wrapped function.
+    void call(int /*unusedIndex*/, float* arguments, float* outReturn) override {
+        (*fFunction)(arguments, outReturn);
+    }
+
+private:
+    typedef SkSL::ExternalValue INHERITED;
+
+    SkSL::Compiler& fCompiler;
+    // Host callback invoked by call() with (arguments, outReturn).
+    void (*fFunction)(float[4], float[4]);
+};
+
+
+DEF_TEST(SkSLInterpreterExternalValuesVectorCall, r) {
+    SkSL::Compiler compiler;
+    SkSL::Program::Settings settings;
+    const char* src = "float4 main() {"
+                      "    return external(float4(1, 4, 9, 16));"
+                      "}";
+    // "external" takes the square root of each of the four lanes on the host side.
+    void (*hostSqrt4)(float[4], float[4]) = [] (float in[4], float out[4]) {
+        for (int lane = 0; lane < 4; ++lane) {
+            out[lane] = sqrt(in[lane]);
+        }
+    };
+    auto* external = compiler.takeOwnership(std::unique_ptr<SkSL::Symbol>(
+            new VectorFunctionExternalValue("external", hostSqrt4, compiler)));
+    compiler.registerExternalValue((SkSL::ExternalValue*) external);
+
+    std::unique_ptr<SkSL::Program> program =
+            compiler.convertProgram(SkSL::Program::kGeneric_Kind, SkSL::String(src), settings);
+    REPORTER_ASSERT(r, program);
+    if (!program) {
+        printf("%s\n%s", src, compiler.errorText().c_str());
+    } else {
+        std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
+        REPORTER_ASSERT(r, !compiler.errorCount());
+        if (compiler.errorCount() > 0) {
+            printf("%s\n%s", src, compiler.errorText().c_str());
+            return;
+        }
+        const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
+        float out[4];
+        SkAssertResult(byteCode->run(main, nullptr, 0, out, 4, nullptr, 0));
+        REPORTER_ASSERT(r, out[0] == 1.0);
+        REPORTER_ASSERT(r, out[1] == 2.0);
+        REPORTER_ASSERT(r, out[2] == 3.0);
+        REPORTER_ASSERT(r, out[3] == 4.0);
     }
 }