SkSL ByteCode: Remove specialized instructions for N up to 4

Nearly all instructions have one form, with a count byte after the
instruction. Simplifies the SkVM conversion logic, reduces code size.

Change-Id: I5ff7bb2991a09198c5c8f5bcaf2c1017c06be5d4
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/299682
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
diff --git a/src/core/SkRuntimeEffect.cpp b/src/core/SkRuntimeEffect.cpp
index 87d1501..008ae7a 100644
--- a/src/core/SkRuntimeEffect.cpp
+++ b/src/core/SkRuntimeEffect.cpp
@@ -461,33 +461,36 @@
       //auto u16 = [&]{ auto x = sk_unaligned_load<uint16_t>(ip); ip += sizeof(x); return x; };
         auto u32 = [&]{ auto x = sk_unaligned_load<uint32_t>(ip); ip += sizeof(x); return x; };
 
-        auto unary = [&](Inst base, auto&& fn, bool allow_big = false) {
-            int N = (int)base - (int)inst + 1;
-            SkASSERT(0 < N && N <= (allow_big ? 5 : 4));
-            if (N == 5) { N = u8(); }
-            std::vector<skvm::F32> args(N);
-            for (int i = 0; i < N; ++i) {
-                args[i] = pop();
-            }
-            for (int i = N; i --> 0;) {
-                push(fn(args[i]));
+        auto unary = [&](auto&& fn) {
+            int N = u8();
+            std::vector<skvm::F32> a(N);
+            for (int i = N; i --> 0; ) { a[i] = pop(); }
+
+            for (int i = 0; i < N; i++) {
+                push(fn(a[i]));
             }
         };
 
-        auto binary = [&](Inst base, auto&& fn, bool allow_big = false) {
-            int N = (int)base - (int)inst + 1;
-            SkASSERT(0 < N && N <= (allow_big ? 5 : 4));
-            if (N == 5) { N = u8(); }
-            std::vector<skvm::F32> right(N);
-            for (int i = 0; i < N; ++i) {
-                right[i] = pop();
+        auto binary = [&](auto&& fn) {
+            int N = u8();
+            std::vector<skvm::F32> a(N), b(N);
+            for (int i = N; i --> 0; ) { b[i] = pop(); }
+            for (int i = N; i --> 0; ) { a[i] = pop(); }
+
+            for (int i = 0; i < N; i++) {
+                push(fn(a[i], b[i]));
             }
-            std::vector<skvm::F32> left(N);
-            for (int i = 0; i < N; ++i) {
-                left[i] = pop();
-            }
-            for (int i = N; i --> 0;) {
-                push(fn(left[i], right[i]));
+        };
+
+        auto ternary = [&](auto&& fn) {
+            int N = u8();
+            std::vector<skvm::F32> a(N), b(N), c(N);
+            for (int i = N; i --> 0; ) { c[i] = pop(); }
+            for (int i = N; i --> 0; ) { b[i] = pop(); }
+            for (int i = N; i --> 0; ) { a[i] = pop(); }
+
+            for (int i = 0; i < N; i++) {
+                push(fn(a[i], b[i], c[i]));
             }
         };
 
@@ -552,55 +555,19 @@
             } break;
 
             case Inst::kLoad: {
-                int ix = u8();
-                push(stack[ix + 0]);
-            } break;
-
-            case Inst::kLoad2: {
-                int ix = u8();
-                push(stack[ix + 0]);
-                push(stack[ix + 1]);
-            } break;
-
-            case Inst::kLoad3: {
-                int ix = u8();
-                push(stack[ix + 0]);
-                push(stack[ix + 1]);
-                push(stack[ix + 2]);
-            } break;
-
-            case Inst::kLoad4: {
-                int ix = u8();
-                push(stack[ix + 0]);
-                push(stack[ix + 1]);
-                push(stack[ix + 2]);
-                push(stack[ix + 3]);
+                int N  = u8(),
+                    ix = u8();
+                for (int i = 0; i < N; ++i) {
+                    push(stack[ix + i]);
+                }
             } break;
 
             case Inst::kLoadUniform: {
-                int ix = u8();
-                push(uniform[ix]);
-            } break;
-
-            case Inst::kLoadUniform2: {
-                int ix = u8();
-                push(uniform[ix + 0]);
-                push(uniform[ix + 1]);
-            } break;
-
-            case Inst::kLoadUniform3: {
-                int ix = u8();
-                push(uniform[ix + 0]);
-                push(uniform[ix + 1]);
-                push(uniform[ix + 2]);
-            } break;
-
-            case Inst::kLoadUniform4: {
-                int ix = u8();
-                push(uniform[ix + 0]);
-                push(uniform[ix + 1]);
-                push(uniform[ix + 2]);
-                push(uniform[ix + 3]);
+                int N  = u8(),
+                    ix = u8();
+                for (int i = 0; i < N; ++i) {
+                    push(uniform[ix + i]);
+                }
             } break;
 
             case Inst::kLoadFragCoord: {
@@ -612,56 +579,22 @@
             } break;
 
             case Inst::kStore: {
-                int ix = u8();
-                stack[ix + 0] = pop();
+                int N  = u8(),
+                    ix = u8();
+                for (int i = N; i --> 0; ) {
+                    stack[ix + i] = pop();
+                }
             } break;
 
-            case Inst::kStore2: {
-                int ix = u8();
-                stack[ix + 1] = pop();
-                stack[ix + 0] = pop();
-            } break;
-
-            case Inst::kStore3: {
-                int ix = u8();
-                stack[ix + 2] = pop();
-                stack[ix + 1] = pop();
-                stack[ix + 0] = pop();
-            } break;
-
-            case Inst::kStore4: {
-                int ix = u8();
-                stack[ix + 3] = pop();
-                stack[ix + 2] = pop();
-                stack[ix + 1] = pop();
-                stack[ix + 0] = pop();
-            } break;
-
-
             case Inst::kPushImmediate: {
                 push(bit_cast(p->splat(u32())));
             } break;
 
             case Inst::kDup: {
-                push(stack[stack.size() - 1]);
-            } break;
-
-            case Inst::kDup2: {
-                push(stack[stack.size() - 2]);
-                push(stack[stack.size() - 2]);
-            } break;
-
-            case Inst::kDup3: {
-                push(stack[stack.size() - 3]);
-                push(stack[stack.size() - 3]);
-                push(stack[stack.size() - 3]);
-            } break;
-
-            case Inst::kDup4: {
-                push(stack[stack.size() - 4]);
-                push(stack[stack.size() - 4]);
-                push(stack[stack.size() - 4]);
-                push(stack[stack.size() - 4]);
+                int N = u8();
+                for (int i = 0; i < N; ++i) {
+                    push(stack[stack.size() - N]);
+                }
             } break;
 
             case Inst::kSwizzle: {
@@ -674,104 +607,34 @@
                 }
             } break;
 
-            case Inst::kAddF:
-            case Inst::kAddF2:
-            case Inst::kAddF3:
-            case Inst::kAddF4:
-            case Inst::kAddFN: binary(Inst::kAddF, std::plus<>{}, true); break;
-
-            case Inst::kSubtractF:
-            case Inst::kSubtractF2:
-            case Inst::kSubtractF3:
-            case Inst::kSubtractF4:
-            case Inst::kSubtractFN: binary(Inst::kSubtractF, std::minus<>{}, true); break;
-
-            case Inst::kMultiplyF:
-            case Inst::kMultiplyF2:
-            case Inst::kMultiplyF3:
-            case Inst::kMultiplyF4:
-            case Inst::kMultiplyFN: binary(Inst::kMultiplyF, std::multiplies<>{}, true); break;
-
-            case Inst::kDivideF:
-            case Inst::kDivideF2:
-            case Inst::kDivideF3:
-            case Inst::kDivideF4:
-            case Inst::kDivideFN: binary(Inst::kDivideF, std::divides<>{}, true); break;
+            case Inst::kAddF:      binary(std::plus<>{});       break;
+            case Inst::kSubtractF: binary(std::minus<>{});      break;
+            case Inst::kMultiplyF: binary(std::multiplies<>{}); break;
+            case Inst::kDivideF:   binary(std::divides<>{});    break;
+            case Inst::kNegateF:    unary(std::negate<>{});     break;
 
             case Inst::kMinF:
-            case Inst::kMinF2:
-            case Inst::kMinF3:
-            case Inst::kMinF4:
-                binary(Inst::kMinF, [](skvm::F32 x, skvm::F32 y) { return skvm::min(x,y); });
+                binary([](skvm::F32 x, skvm::F32 y) { return skvm::min(x,y); });
                 break;
 
             case Inst::kMaxF:
-            case Inst::kMaxF2:
-            case Inst::kMaxF3:
-            case Inst::kMaxF4:
-                binary(Inst::kMaxF, [](skvm::F32 x, skvm::F32 y) { return skvm::max(x,y); });
+                binary([](skvm::F32 x, skvm::F32 y) { return skvm::max(x,y); });
                 break;
 
-            case Inst::kNegateF:
-            case Inst::kNegateF2:
-            case Inst::kNegateF3:
-            case Inst::kNegateF4:
-            case Inst::kNegateFN: unary(Inst::kNegateF, std::negate<>{}, true); break;
-
             case Inst::kPow:
-            case Inst::kPow2:
-            case Inst::kPow3:
-            case Inst::kPow4:
-                binary(Inst::kPow, [](skvm::F32 x, skvm::F32 y) { return skvm::approx_powf(x,y); });
+                binary([](skvm::F32 x, skvm::F32 y) { return skvm::approx_powf(x,y); });
                 break;
 
             case Inst::kLerp:
-            case Inst::kLerp2:
-            case Inst::kLerp3:
-            case Inst::kLerp4: {
-                int N = (int)Inst::kLerp - (int)inst + 1;
+                ternary([](skvm::F32 x, skvm::F32 y, skvm::F32 t) { return skvm::lerp(x, y, t); });
+                break;
 
-                skvm::F32 t[4],
-                          b[4],
-                          a[4];
-                for (int i = N; i --> 0; ) { t[i] = pop(); }
-                for (int i = N; i --> 0; ) { b[i] = pop(); }
-                for (int i = N; i --> 0; ) { a[i] = pop(); }
-
-                for (int i = 0; i < N; i++) {
-                    push(skvm::lerp(a[i], b[i], t[i]));
-                }
-            } break;
-
-            case Inst::kATan:
-            case Inst::kATan2:
-            case Inst::kATan3:
-            case Inst::kATan4: unary(Inst::kATan, skvm::approx_atan); break;
-
-            case Inst::kCeil:
-            case Inst::kCeil2:
-            case Inst::kCeil3:
-            case Inst::kCeil4: unary(Inst::kCeil, skvm::ceil); break;
-
-            case Inst::kFloor:
-            case Inst::kFloor2:
-            case Inst::kFloor3:
-            case Inst::kFloor4: unary(Inst::kFloor, skvm::floor); break;
-
-            case Inst::kFract:
-            case Inst::kFract2:
-            case Inst::kFract3:
-            case Inst::kFract4: unary(Inst::kFract, skvm::fract); break;
-
-            case Inst::kSqrt:
-            case Inst::kSqrt2:
-            case Inst::kSqrt3:
-            case Inst::kSqrt4: unary(Inst::kSqrt, skvm::sqrt); break;
-
-            case Inst::kSin:
-            case Inst::kSin2:
-            case Inst::kSin3:
-            case Inst::kSin4: unary(Inst::kSin, skvm::approx_sin); break;
+            case Inst::kATan:  unary(skvm::approx_atan); break;
+            case Inst::kCeil:  unary(skvm::ceil);        break;
+            case Inst::kFloor: unary(skvm::floor);       break;
+            case Inst::kFract: unary(skvm::fract);       break;
+            case Inst::kSqrt:  unary(skvm::sqrt);        break;
+            case Inst::kSin:   unary(skvm::approx_sin);  break;
 
             case Inst::kMatrixMultiply: {
                 // Computes M = A*B (all stored column major)
@@ -798,11 +661,9 @@
             case Inst::kMaskPush:   break;
             case Inst::kMaskNegate: break;
 
-            case Inst::kCompareFLT: {
-                skvm::F32 x = pop(),
-                          a = pop();
-                push(bit_cast(a<x));
-            } break;
+            case Inst::kCompareFLT:
+                binary([](skvm::F32 x, skvm::F32 y) { return bit_cast(x<y); });
+                break;
 
             case Inst::kMaskBlend: {
                 std::vector<skvm::F32> if_true,
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
index bbd3c99..ae573c6 100644
--- a/src/sksl/SkSLByteCode.cpp
+++ b/src/sksl/SkSLByteCode.cpp
@@ -14,6 +14,7 @@
 #include "src/sksl/SkSLByteCodeGenerator.h"
 #include "src/sksl/SkSLExternalValue.h"
 
+#include <functional>
 #include <vector>
 
 namespace SkSL {
@@ -34,24 +35,24 @@
 #define READ_INST() (ip += sizeof(ByteCodeInstruction), \
                      sk_unaligned_load<ByteCodeInstruction>(ip - sizeof(ByteCodeInstruction)))
 
-#define VECTOR_DISASSEMBLE(op, text)                          \
-    case ByteCodeInstruction::op: printf(text); break;        \
-    case ByteCodeInstruction::op##2: printf(text "2"); break; \
-    case ByteCodeInstruction::op##3: printf(text "3"); break; \
-    case ByteCodeInstruction::op##4: printf(text "4"); break;
+#define DISASSEMBLE_COUNT(op, text) \
+    case ByteCodeInstruction::op: printf(text " %d", READ8()); break;
 
-#define VECTOR_MATRIX_DISASSEMBLE(op, text) \
-    VECTOR_DISASSEMBLE(op, text)            \
-    case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
+#define DISASSEMBLE_COUNT_SLOT(op, text)  \
+    case ByteCodeInstruction::op: {       \
+        int N    = READ8(),               \
+            slot = READ8();               \
+        printf(text " %d [%d]", N, slot); \
+    } break;
 
 static const uint8_t* DisassembleInstruction(const uint8_t* ip) {
     auto inst = READ_INST();
     printf("%04x ", (int)inst);
     switch (inst) {
-        VECTOR_MATRIX_DISASSEMBLE(kAddF, "addf")
-        VECTOR_DISASSEMBLE(kAddI, "addi")
-        case ByteCodeInstruction::kAndB: printf("andb"); break;
-        VECTOR_DISASSEMBLE(kATan, "atan")
+        DISASSEMBLE_COUNT(kAddF, "addf")
+        DISASSEMBLE_COUNT(kAddI, "addi")
+        DISASSEMBLE_COUNT(kAndB, "andb")
+        DISASSEMBLE_COUNT(kATan, "atan")
         case ByteCodeInstruction::kBranch: printf("branch %d", READ16()); break;
         case ByteCodeInstruction::kCall: printf("call %d", READ8()); break;
         case ByteCodeInstruction::kCallExternal: {
@@ -61,55 +62,44 @@
             printf("callexternal %d, %d, %d", argumentCount, returnCount, externalValue);
             break;
         }
-        VECTOR_DISASSEMBLE(kCeil, "ceil")
+        DISASSEMBLE_COUNT(kCeil, "ceil")
         case ByteCodeInstruction::kClampIndex: printf("clampindex %d", READ8()); break;
-        VECTOR_DISASSEMBLE(kCompareIEQ, "compareieq")
-        VECTOR_DISASSEMBLE(kCompareINEQ, "compareineq")
-        VECTOR_MATRIX_DISASSEMBLE(kCompareFEQ, "comparefeq")
-        VECTOR_MATRIX_DISASSEMBLE(kCompareFNEQ, "comparefneq")
-        VECTOR_DISASSEMBLE(kCompareFGT, "comparefgt")
-        VECTOR_DISASSEMBLE(kCompareFGTEQ, "comparefgteq")
-        VECTOR_DISASSEMBLE(kCompareFLT, "compareflt")
-        VECTOR_DISASSEMBLE(kCompareFLTEQ, "compareflteq")
-        VECTOR_DISASSEMBLE(kCompareSGT, "comparesgt")
-        VECTOR_DISASSEMBLE(kCompareSGTEQ, "comparesgteq")
-        VECTOR_DISASSEMBLE(kCompareSLT, "compareslt")
-        VECTOR_DISASSEMBLE(kCompareSLTEQ, "compareslteq")
-        VECTOR_DISASSEMBLE(kCompareUGT, "compareugt")
-        VECTOR_DISASSEMBLE(kCompareUGTEQ, "compareugteq")
-        VECTOR_DISASSEMBLE(kCompareULT, "compareult")
-        VECTOR_DISASSEMBLE(kCompareULTEQ, "compareulteq")
-        VECTOR_DISASSEMBLE(kConvertFtoI, "convertftoi")
-        VECTOR_DISASSEMBLE(kConvertStoF, "convertstof")
-        VECTOR_DISASSEMBLE(kConvertUtoF, "convertutof")
-        VECTOR_DISASSEMBLE(kCos, "cos")
-        VECTOR_MATRIX_DISASSEMBLE(kDivideF, "dividef")
-        VECTOR_DISASSEMBLE(kDivideS, "divideS")
-        VECTOR_DISASSEMBLE(kDivideU, "divideu")
-        VECTOR_MATRIX_DISASSEMBLE(kDup, "dup")
-        VECTOR_DISASSEMBLE(kFloor, "floor")
-        VECTOR_DISASSEMBLE(kFract, "fract")
+        DISASSEMBLE_COUNT(kCompareIEQ, "compareieq")
+        DISASSEMBLE_COUNT(kCompareINEQ, "compareineq")
+        DISASSEMBLE_COUNT(kCompareFEQ, "comparefeq")
+        DISASSEMBLE_COUNT(kCompareFNEQ, "comparefneq")
+        DISASSEMBLE_COUNT(kCompareFGT, "comparefgt")
+        DISASSEMBLE_COUNT(kCompareFGTEQ, "comparefgteq")
+        DISASSEMBLE_COUNT(kCompareFLT, "compareflt")
+        DISASSEMBLE_COUNT(kCompareFLTEQ, "compareflteq")
+        DISASSEMBLE_COUNT(kCompareSGT, "comparesgt")
+        DISASSEMBLE_COUNT(kCompareSGTEQ, "comparesgteq")
+        DISASSEMBLE_COUNT(kCompareSLT, "compareslt")
+        DISASSEMBLE_COUNT(kCompareSLTEQ, "compareslteq")
+        DISASSEMBLE_COUNT(kCompareUGT, "compareugt")
+        DISASSEMBLE_COUNT(kCompareUGTEQ, "compareugteq")
+        DISASSEMBLE_COUNT(kCompareULT, "compareult")
+        DISASSEMBLE_COUNT(kCompareULTEQ, "compareulteq")
+        DISASSEMBLE_COUNT(kConvertFtoI, "convertftoi")
+        DISASSEMBLE_COUNT(kConvertStoF, "convertstof")
+        DISASSEMBLE_COUNT(kConvertUtoF, "convertutof")
+        DISASSEMBLE_COUNT(kCos, "cos")
+        DISASSEMBLE_COUNT(kDivideF, "dividef")
+        DISASSEMBLE_COUNT(kDivideS, "divideS")
+        DISASSEMBLE_COUNT(kDivideU, "divideu")
+        DISASSEMBLE_COUNT(kDup, "dup")
+        DISASSEMBLE_COUNT(kFloor, "floor")
+        DISASSEMBLE_COUNT(kFract, "fract")
         case ByteCodeInstruction::kInverse2x2: printf("inverse2x2"); break;
         case ByteCodeInstruction::kInverse3x3: printf("inverse3x3"); break;
         case ByteCodeInstruction::kInverse4x4: printf("inverse4x4"); break;
-        VECTOR_DISASSEMBLE(kLerp, "lerp")
-        case ByteCodeInstruction::kLoad: printf("load %d", READ8()); break;
-        case ByteCodeInstruction::kLoad2: printf("load2 %d", READ8()); break;
-        case ByteCodeInstruction::kLoad3: printf("load3 %d", READ8()); break;
-        case ByteCodeInstruction::kLoad4: printf("load4 %d", READ8()); break;
-        case ByteCodeInstruction::kLoadGlobal: printf("loadglobal %d", READ8()); break;
-        case ByteCodeInstruction::kLoadGlobal2: printf("loadglobal2 %d", READ8()); break;
-        case ByteCodeInstruction::kLoadGlobal3: printf("loadglobal3 %d", READ8()); break;
-        case ByteCodeInstruction::kLoadGlobal4: printf("loadglobal4 %d", READ8()); break;
-        case ByteCodeInstruction::kLoadUniform: printf("loaduniform %d", READ8()); break;
-        case ByteCodeInstruction::kLoadUniform2: printf("loaduniform2 %d", READ8()); break;
-        case ByteCodeInstruction::kLoadUniform3: printf("loaduniform3 %d", READ8()); break;
-        case ByteCodeInstruction::kLoadUniform4: printf("loaduniform4 %d", READ8()); break;
-        case ByteCodeInstruction::kLoadExtended: printf("loadextended %d", READ8()); break;
-        case ByteCodeInstruction::kLoadExtendedGlobal: printf("loadextendedglobal %d", READ8());
-            break;
-        case ByteCodeInstruction::kLoadExtendedUniform: printf("loadextendeduniform %d", READ8());
-            break;
+        DISASSEMBLE_COUNT(kLerp, "lerp")
+        DISASSEMBLE_COUNT_SLOT(kLoad, "load")
+        DISASSEMBLE_COUNT_SLOT(kLoadGlobal, "loadglobal")
+        DISASSEMBLE_COUNT_SLOT(kLoadUniform, "loaduniform")
+        DISASSEMBLE_COUNT(kLoadExtended, "loadextended")
+        DISASSEMBLE_COUNT(kLoadExtendedGlobal, "loadextendedglobal")
+        DISASSEMBLE_COUNT(kLoadExtendedUniform, "loadextendeduniform")
         case ByteCodeInstruction::kLoadFragCoord: printf("loadfragcoord"); break;
         case ByteCodeInstruction::kMatrixToMatrix: {
             int srcCols = READ8();
@@ -126,34 +116,31 @@
             printf("matrixmultiply %dx%d %dx%d", lCols, lRows, rCols, lCols);
             break;
         }
-        VECTOR_DISASSEMBLE(kMaxF, "maxf")
-        VECTOR_DISASSEMBLE(kMaxS, "maxs")
-        VECTOR_DISASSEMBLE(kMinF, "minf")
-        VECTOR_DISASSEMBLE(kMinS, "mins")
-        VECTOR_DISASSEMBLE(kMix, "mix")
-        VECTOR_MATRIX_DISASSEMBLE(kMultiplyF, "multiplyf")
-        VECTOR_DISASSEMBLE(kMultiplyI, "multiplyi")
-        VECTOR_MATRIX_DISASSEMBLE(kNegateF, "negatef")
-        VECTOR_DISASSEMBLE(kNegateI, "negatei")
-        VECTOR_DISASSEMBLE(kNotB, "notb")
-        case ByteCodeInstruction::kOrB: printf("orb"); break;
-        VECTOR_MATRIX_DISASSEMBLE(kPop, "pop")
-        VECTOR_DISASSEMBLE(kPow, "pow")
+        DISASSEMBLE_COUNT(kMaxF, "maxf")
+        DISASSEMBLE_COUNT(kMaxS, "maxs")
+        DISASSEMBLE_COUNT(kMinF, "minf")
+        DISASSEMBLE_COUNT(kMinS, "mins")
+        DISASSEMBLE_COUNT(kMix, "mix")
+        DISASSEMBLE_COUNT(kMultiplyF, "multiplyf")
+        DISASSEMBLE_COUNT(kMultiplyI, "multiplyi")
+        DISASSEMBLE_COUNT(kNegateF, "negatef")
+        DISASSEMBLE_COUNT(kNegateI, "negatei")
+        DISASSEMBLE_COUNT(kNotB, "notb")
+        DISASSEMBLE_COUNT(kOrB, "orb")
+        DISASSEMBLE_COUNT(kPop, "pop")
+        DISASSEMBLE_COUNT(kPow, "pow")
         case ByteCodeInstruction::kPushImmediate: {
             uint32_t v = READ32();
             union { uint32_t u; float f; } pun = { v };
             printf("pushimmediate %s", (to_string(v) + "(" + to_string(pun.f) + ")").c_str());
             break;
         }
-        case ByteCodeInstruction::kReadExternal: printf("readexternal %d", READ8()); break;
-        case ByteCodeInstruction::kReadExternal2: printf("readexternal2 %d", READ8()); break;
-        case ByteCodeInstruction::kReadExternal3: printf("readexternal3 %d", READ8()); break;
-        case ByteCodeInstruction::kReadExternal4: printf("readexternal4 %d", READ8()); break;
-        VECTOR_DISASSEMBLE(kRemainderF, "remainderf")
-        VECTOR_DISASSEMBLE(kRemainderS, "remainders")
-        VECTOR_DISASSEMBLE(kRemainderU, "remainderu")
-        case ByteCodeInstruction::kReserve: printf("reserve %d", READ8()); break;
-        case ByteCodeInstruction::kReturn: printf("return %d", READ8()); break;
+        DISASSEMBLE_COUNT_SLOT(kReadExternal, "readexternal")
+        DISASSEMBLE_COUNT(kRemainderF, "remainderf")
+        DISASSEMBLE_COUNT(kRemainderS, "remainders")
+        DISASSEMBLE_COUNT(kRemainderU, "remainderu")
+        DISASSEMBLE_COUNT(kReserve, "reserve")
+        DISASSEMBLE_COUNT(kReturn, "return")
         case ByteCodeInstruction::kSampleExplicit: printf("sample %d", READ8()); break;
         case ByteCodeInstruction::kSampleMatrix: printf("sampleMtx %d", READ8()); break;
         case ByteCodeInstruction::kScalarToMatrix: {
@@ -165,21 +152,14 @@
         case ByteCodeInstruction::kShiftLeft: printf("shl %d", READ8()); break;
         case ByteCodeInstruction::kShiftRightS: printf("shrs %d", READ8()); break;
         case ByteCodeInstruction::kShiftRightU: printf("shru %d", READ8()); break;
-        VECTOR_DISASSEMBLE(kSin, "sin")
-        VECTOR_DISASSEMBLE(kSqrt, "sqrt")
-        case ByteCodeInstruction::kStore: printf("store %d", READ8()); break;
-        case ByteCodeInstruction::kStore2: printf("store2 %d", READ8()); break;
-        case ByteCodeInstruction::kStore3: printf("store3 %d", READ8()); break;
-        case ByteCodeInstruction::kStore4: printf("store4 %d", READ8()); break;
-        case ByteCodeInstruction::kStoreGlobal: printf("storeglobal %d", READ8()); break;
-        case ByteCodeInstruction::kStoreGlobal2: printf("storeglobal2 %d", READ8()); break;
-        case ByteCodeInstruction::kStoreGlobal3: printf("storeglobal3 %d", READ8()); break;
-        case ByteCodeInstruction::kStoreGlobal4: printf("storeglobal4 %d", READ8()); break;
-        case ByteCodeInstruction::kStoreExtended: printf("storeextended %d", READ8()); break;
-        case ByteCodeInstruction::kStoreExtendedGlobal: printf("storeextendedglobal %d", READ8());
-            break;
-        VECTOR_MATRIX_DISASSEMBLE(kSubtractF, "subtractf")
-        VECTOR_DISASSEMBLE(kSubtractI, "subtracti")
+        DISASSEMBLE_COUNT(kSin, "sin")
+        DISASSEMBLE_COUNT(kSqrt, "sqrt")
+        DISASSEMBLE_COUNT_SLOT(kStore, "store")
+        DISASSEMBLE_COUNT_SLOT(kStoreGlobal, "storeglobal")
+        DISASSEMBLE_COUNT(kStoreExtended, "storeextended")
+        DISASSEMBLE_COUNT(kStoreExtendedGlobal, "storeextendedglobal")
+        DISASSEMBLE_COUNT(kSubtractF, "subtractf")
+        DISASSEMBLE_COUNT(kSubtractI, "subtracti")
         case ByteCodeInstruction::kSwizzle: {
             printf("swizzle %d, ", READ8());
             int count = READ8();
@@ -189,12 +169,9 @@
             }
             break;
         }
-        VECTOR_DISASSEMBLE(kTan, "tan")
-        case ByteCodeInstruction::kWriteExternal: printf("writeexternal %d", READ8()); break;
-        case ByteCodeInstruction::kWriteExternal2: printf("writeexternal2 %d", READ8()); break;
-        case ByteCodeInstruction::kWriteExternal3: printf("writeexternal3 %d", READ8()); break;
-        case ByteCodeInstruction::kWriteExternal4: printf("writeexternal4 %d", READ8()); break;
-        case ByteCodeInstruction::kXorB: printf("xorb"); break;
+        DISASSEMBLE_COUNT(kTan, "tan")
+        DISASSEMBLE_COUNT_SLOT(kWriteExternal, "writeexternal")
+        DISASSEMBLE_COUNT(kXorB, "xorb")
         case ByteCodeInstruction::kMaskPush: printf("maskpush"); break;
         case ByteCodeInstruction::kMaskPop: printf("maskpop"); break;
         case ByteCodeInstruction::kMaskNegate: printf("masknegate"); break;
@@ -216,117 +193,47 @@
     return ip;
 }
 
-#define VECTOR_BINARY_OP(base, field, op)                             \
-    case ByteCodeInstruction::base ## 4: {                            \
-        sp[-4] = sp[-4].field op sp[0].field;                         \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base ## 3: {                            \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        sp[count] = sp[count].field op sp[0].field;                   \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base ## 2: {                            \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        sp[count] = sp[count].field op sp[0].field;                   \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base: {                                 \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        sp[count] = sp[count].field op sp[0].field;                   \
-        POP();                                                        \
-        continue;                                                     \
-    }
-
 // A naive implementation of / or % using skvx operations will likely crash with a divide by zero
 // in inactive vector lanes, so we need to be sure to avoid masked-off lanes.
-#define VECTOR_BINARY_MASKED_OP(base, field, op)                      \
-    case ByteCodeInstruction::base ## 4: {                            \
-        for (int i = 0; i < VecWidth; ++i) {                          \
-            if (mask()[i]) {                                          \
-                sp[-4].field[i] op ## = sp[0].field[i];               \
-            }                                                         \
-        }                                                             \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base ## 3: {                            \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        for (int i = 0; i < VecWidth; ++i) {                          \
-            if (mask()[i]) {                                          \
-                sp[count].field[i] op ## = sp[0].field[i];            \
-            }                                                         \
-        }                                                             \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base ## 2: {                            \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        for (int i = 0; i < VecWidth; ++i) {                          \
-            if (mask()[i]) {                                          \
-                sp[count].field[i] op ## = sp[0].field[i];            \
-            }                                                         \
-        }                                                             \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base: {                                 \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        for (int i = 0; i < VecWidth; ++i) {                          \
-            if (mask()[i]) {                                          \
-                sp[count].field[i] op ## = sp[0].field[i];            \
-            }                                                         \
-        }                                                             \
-        POP();                                                        \
-        continue;                                                     \
-    }
+// TODO: Would it be better to do this with a select of (lane, 1) based on mask?
+#define VECTOR_BINARY_MASKED_OP(inst, field, op)                \
+    case ByteCodeInstruction::inst: {                           \
+        int count = READ8();                                    \
+        for (int i = count; i > 0; --i) {                       \
+            for (int j = 0; j < VecWidth; ++j) {                \
+                if (mask()[j]) {                                \
+                    sp[-count].field[j] op ## = sp[0].field[j]; \
+                }                                               \
+            }                                                   \
+            POP();                                              \
+        }                                                       \
+    } continue;
 
-
-#define VECTOR_MATRIX_BINARY_OP(base, field, op)          \
-    VECTOR_BINARY_OP(base, field, op)                     \
-    case ByteCodeInstruction::base ## N: {                \
+#define VECTOR_BINARY_OP(inst, field, op)                 \
+    case ByteCodeInstruction::inst: {                     \
         int count = READ8();                              \
         for (int i = count; i > 0; --i) {                 \
             sp[-count] = sp[-count].field op sp[0].field; \
             POP();                                        \
         }                                                 \
-        continue;                                         \
-    }
+    } continue;
 
-#define VECTOR_BINARY_FN(base, field, fn)                             \
-    case ByteCodeInstruction::base ## 4: {                            \
-        sp[-4] = fn(sp[-4].field, sp[0].field);                       \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base ## 3: {                            \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        sp[count] = fn(sp[count].field, sp[0].field);                 \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base ## 2: {                            \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        sp[count] = fn(sp[count].field, sp[0].field);                 \
-        POP();                                                        \
-        [[fallthrough]];                                              \
-    }                                                                 \
-    case ByteCodeInstruction::base: {                                 \
-        int count = (int)inst - (int)(ByteCodeInstruction::base) - 1; \
-        sp[count] = fn(sp[count].field, sp[0].field);                 \
-        POP();                                                        \
-        continue;                                                     \
-    }
+#define VECTOR_BINARY_FN(inst, field, fn)                   \
+    case ByteCodeInstruction::inst: {                       \
+        int count = READ8();                                \
+        for (int i = count; i > 0; --i) {                   \
+            sp[-count] = fn(sp[-count].field, sp[0].field); \
+            POP();                                          \
+        }                                                   \
+    } continue;
 
-#define VECTOR_UNARY_FN(base, fn, field)                                              \
-    case ByteCodeInstruction::base ## 4:  sp[-3] = fn(sp[-3].field); [[fallthrough]]; \
-    case ByteCodeInstruction::base ## 3:  sp[-2] = fn(sp[-2].field); [[fallthrough]]; \
-    case ByteCodeInstruction::base ## 2:  sp[-1] = fn(sp[-1].field); [[fallthrough]]; \
-    case ByteCodeInstruction::base:       sp[ 0] = fn(sp[ 0].field);                  \
-                      continue;
+#define VECTOR_UNARY_FN(inst, fn, field) \
+    case ByteCodeInstruction::inst: {    \
+        int count = READ8();             \
+        for (int i = count; i --> 0; ) { \
+            sp[-i] = fn(sp[-i].field);   \
+        }                                \
+    } continue;
 
 union VValue {
     VValue() {}
@@ -506,27 +413,14 @@
         ByteCodeInstruction inst = READ_INST();
         switch (inst) {
 
-            VECTOR_MATRIX_BINARY_OP(kAddF, fFloat, +)
+            VECTOR_BINARY_OP(kAddF, fFloat, +)
             VECTOR_BINARY_OP(kAddI, fSigned, +)
 
             // Booleans are integer masks: 0/~0 for false/true. So bitwise ops do what we want:
-            case ByteCodeInstruction::kAndB:
-                sp[-1] = sp[-1].fSigned & sp[0].fSigned;
-                POP();
-                continue;
-            case ByteCodeInstruction::kNotB4: sp[-3] = ~sp[-3].fSigned; [[fallthrough]];
-            case ByteCodeInstruction::kNotB3: sp[-2] = ~sp[-2].fSigned; [[fallthrough]];
-            case ByteCodeInstruction::kNotB2: sp[-1] = ~sp[-1].fSigned; [[fallthrough]];
-            case ByteCodeInstruction::kNotB:  sp[ 0] = ~sp[ 0].fSigned;
-                continue;
-            case ByteCodeInstruction::kOrB:
-                sp[-1] = sp[-1].fSigned | sp[0].fSigned;
-                POP();
-                continue;
-            case ByteCodeInstruction::kXorB:
-                sp[-1] = sp[-1].fSigned ^ sp[0].fSigned;
-                POP();
-                continue;
+            VECTOR_BINARY_OP(kAndB, fSigned, &)
+            VECTOR_BINARY_OP(kOrB,  fSigned, |)
+            VECTOR_BINARY_OP(kXorB, fSigned, ^)
+            VECTOR_UNARY_FN(kNotB, std::bit_not<>{}, fSigned)
 
             case ByteCodeInstruction::kBranch:
                 ip = code + READ16();
@@ -548,13 +442,11 @@
                         stack[i].fFloat = 0.0f;
                     }
                 }
-                continue;
-            }
+            } continue;
 
-            case ByteCodeInstruction::kCallExternal: {
+            case ByteCodeInstruction::kCallExternal:
                 CallExternal(byteCode, ip, sp, baseIndex, mask());
                 continue;
-            }
 
             VECTOR_UNARY_FN(kCeil, skvx::ceil, fFloat)
 
@@ -563,90 +455,40 @@
                 if (skvx::any(mask() & ((sp[0].fSigned < 0) | (sp[0].fSigned >= length)))) {
                     return false;
                 }
-                continue;
-            }
+            } continue;
 
-            VECTOR_BINARY_OP(kCompareIEQ, fSigned, ==)
-            VECTOR_MATRIX_BINARY_OP(kCompareFEQ, fFloat, ==)
-            VECTOR_BINARY_OP(kCompareINEQ, fSigned, !=)
-            VECTOR_MATRIX_BINARY_OP(kCompareFNEQ, fFloat, !=)
-            VECTOR_BINARY_OP(kCompareSGT, fSigned, >)
-            VECTOR_BINARY_OP(kCompareUGT, fUnsigned, >)
-            VECTOR_BINARY_OP(kCompareFGT, fFloat, >)
-            VECTOR_BINARY_OP(kCompareSGTEQ, fSigned, >=)
+            VECTOR_BINARY_OP(kCompareIEQ,   fSigned,   ==)
+            VECTOR_BINARY_OP(kCompareFEQ,   fFloat,    ==)
+            VECTOR_BINARY_OP(kCompareINEQ,  fSigned,   !=)
+            VECTOR_BINARY_OP(kCompareFNEQ,  fFloat,    !=)
+            VECTOR_BINARY_OP(kCompareSGT,   fSigned,   >)
+            VECTOR_BINARY_OP(kCompareUGT,   fUnsigned, >)
+            VECTOR_BINARY_OP(kCompareFGT,   fFloat,    >)
+            VECTOR_BINARY_OP(kCompareSGTEQ, fSigned,   >=)
             VECTOR_BINARY_OP(kCompareUGTEQ, fUnsigned, >=)
-            VECTOR_BINARY_OP(kCompareFGTEQ, fFloat, >=)
-            VECTOR_BINARY_OP(kCompareSLT, fSigned, <)
-            VECTOR_BINARY_OP(kCompareULT, fUnsigned, <)
-            VECTOR_BINARY_OP(kCompareFLT, fFloat, <)
-            VECTOR_BINARY_OP(kCompareSLTEQ, fSigned, <=)
+            VECTOR_BINARY_OP(kCompareFGTEQ, fFloat,    >=)
+            VECTOR_BINARY_OP(kCompareSLT,   fSigned,   <)
+            VECTOR_BINARY_OP(kCompareULT,   fUnsigned, <)
+            VECTOR_BINARY_OP(kCompareFLT,   fFloat,    <)
+            VECTOR_BINARY_OP(kCompareSLTEQ, fSigned,   <=)
             VECTOR_BINARY_OP(kCompareULTEQ, fUnsigned, <=)
-            VECTOR_BINARY_OP(kCompareFLTEQ, fFloat, <=)
+            VECTOR_BINARY_OP(kCompareFLTEQ, fFloat,    <=)
 
-            case ByteCodeInstruction::kConvertFtoI4:
-                sp[-3] = skvx::cast<int>(sp[-3].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertFtoI3:
-                sp[-2] = skvx::cast<int>(sp[-2].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertFtoI2:
-                sp[-1] = skvx::cast<int>(sp[-1].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertFtoI:
-                sp[ 0] = skvx::cast<int>(sp[ 0].fFloat);
-                continue;
-
-            case ByteCodeInstruction::kConvertStoF4:
-                sp[-3] = skvx::cast<float>(sp[-3].fSigned);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertStoF3:
-                sp[-2] = skvx::cast<float>(sp[-2].fSigned);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertStoF2:
-                sp[-1] = skvx::cast<float>(sp[-1].fSigned);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertStoF:
-                sp[ 0] = skvx::cast<float>(sp[ 0].fSigned);
-                continue;
-
-            case ByteCodeInstruction::kConvertUtoF4:
-                sp[-3] = skvx::cast<float>(sp[-3].fUnsigned);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertUtoF3:
-                sp[-2] = skvx::cast<float>(sp[-2].fUnsigned);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertUtoF2:
-                sp[-1] = skvx::cast<float>(sp[-1].fUnsigned);
-                [[fallthrough]];
-            case ByteCodeInstruction::kConvertUtoF:
-                sp[ 0] = skvx::cast<float>(sp[ 0].fUnsigned);
-                continue;
+            VECTOR_UNARY_FN(kConvertFtoI, skvx::cast<int>, fFloat)
+            VECTOR_UNARY_FN(kConvertStoF, skvx::cast<float>, fSigned)
+            VECTOR_UNARY_FN(kConvertUtoF, skvx::cast<float>, fUnsigned)
 
             VECTOR_UNARY_FN(kCos, skvx::cos, fFloat)
 
             VECTOR_BINARY_MASKED_OP(kDivideS, fSigned, /)
             VECTOR_BINARY_MASKED_OP(kDivideU, fUnsigned, /)
-            VECTOR_MATRIX_BINARY_OP(kDivideF, fFloat, /)
+            VECTOR_BINARY_OP(kDivideF, fFloat, /)
 
-            case ByteCodeInstruction::kDup4:
-                PUSH(sp[(int)inst - (int)ByteCodeInstruction::kDup]);
-                [[fallthrough]];
-            case ByteCodeInstruction::kDup3:
-                PUSH(sp[(int)inst - (int)ByteCodeInstruction::kDup]);
-                [[fallthrough]];
-            case ByteCodeInstruction::kDup2:
-                PUSH(sp[(int)inst - (int)ByteCodeInstruction::kDup]);
-                [[fallthrough]];
-            case ByteCodeInstruction::kDup :
-                PUSH(sp[(int)inst - (int)ByteCodeInstruction::kDup]);
-                continue;
-
-            case ByteCodeInstruction::kDupN: {
+            case ByteCodeInstruction::kDup: {
                 int count = READ8();
                 memcpy(sp + 1, sp - count + 1, count * sizeof(VValue));
                 sp += count;
-                continue;
-            }
+            } continue;
 
             VECTOR_UNARY_FN(kFloor, skvx::floor, fFloat)
             VECTOR_UNARY_FN(kFract, skvx::fract, fFloat)
@@ -661,11 +503,8 @@
                 Inverse4x4(sp);
                 continue;
 
-            case ByteCodeInstruction::kLerp4:
-            case ByteCodeInstruction::kLerp3:
-            case ByteCodeInstruction::kLerp2:
             case ByteCodeInstruction::kLerp: {
-                int count = (int)ByteCodeInstruction::kLerp - (int)inst + 1;
+                int count = READ8();
                 VValue* T = sp - count + 1,
                       * B = T - count,
                       * A = B - count;
@@ -673,39 +512,30 @@
                     A[i].fFloat += (B[i].fFloat - A[i].fFloat) * T[i].fFloat;
                 }
                 sp -= 2 * count;
-                continue;
-            }
+            } continue;
 
-            case ByteCodeInstruction::kLoad4: sp[4] = stack[*ip + 3]; [[fallthrough]];
-            case ByteCodeInstruction::kLoad3: sp[3] = stack[*ip + 2]; [[fallthrough]];
-            case ByteCodeInstruction::kLoad2: sp[2] = stack[*ip + 1]; [[fallthrough]];
-            case ByteCodeInstruction::kLoad:  sp[1] = stack[*ip + 0];
-                        ++ip;
-                        sp += (int)ByteCodeInstruction::kLoad - (int)inst + 1;
-                        continue;
+            case ByteCodeInstruction::kLoad: {  // operands: N, slot
+                int count = READ8(),  // N: number of consecutive slots to push
+                    slot  = READ8();  // index of the first local slot in `stack`
+                memcpy(sp + 1, stack + slot, count * sizeof(VValue));  // copy just above the top
+                sp += count;  // the copied values become the new top N
+            } continue;
 
-            case ByteCodeInstruction::kLoadGlobal4: sp[4] = globals[*ip + 3]; [[fallthrough]];
-            case ByteCodeInstruction::kLoadGlobal3: sp[3] = globals[*ip + 2]; [[fallthrough]];
-            case ByteCodeInstruction::kLoadGlobal2: sp[2] = globals[*ip + 1]; [[fallthrough]];
-            case ByteCodeInstruction::kLoadGlobal:  sp[1] = globals[*ip + 0];
-                                ++ip;
-                                sp += (int)ByteCodeInstruction::kLoadGlobal - (int)inst + 1;
-                                continue;
+            case ByteCodeInstruction::kLoadGlobal: {  // operands: N, slot
+                int count = READ8(),  // N: number of consecutive slots to push
+                    slot  = READ8();  // index of the first slot in `globals`
+                memcpy(sp + 1, globals + slot, count * sizeof(VValue));  // copy above the top
+                sp += count;  // the copied values become the new top N
+            } continue;
 
-            case ByteCodeInstruction::kLoadUniform4:
-                sp[4].fFloat = uniforms[*ip + 3];
-                [[fallthrough]];
-            case ByteCodeInstruction::kLoadUniform3:
-                sp[3].fFloat = uniforms[*ip + 2];
-                [[fallthrough]];
-            case ByteCodeInstruction::kLoadUniform2:
-                sp[2].fFloat = uniforms[*ip + 1];
-                [[fallthrough]];
-            case ByteCodeInstruction::kLoadUniform:
-                sp[1].fFloat = uniforms[*ip + 0];
-                ++ip;
-                sp += (int)ByteCodeInstruction::kLoadUniform - (int)inst + 1;
-                continue;
+            case ByteCodeInstruction::kLoadUniform: {  // operands: N, slot
+                int count = READ8(),  // N: number of consecutive uniforms to push
+                    slot  = READ8();  // index of the first entry in `uniforms`
+                for (int i = 0; i < count; ++i) {
+                    sp[i + 1].fFloat = uniforms[slot + i];  // written via fFloat, not memcpy'd
+                }
+                sp += count;  // the loaded values become the new top N
+            } continue;
 
             case ByteCodeInstruction::kLoadExtended: {
                 int count = READ8();
@@ -719,8 +549,7 @@
                     }
                 }
                 sp += count;
-                continue;
-            }
+            } continue;
 
             case ByteCodeInstruction::kLoadExtendedGlobal: {
                 int count = READ8();
@@ -734,8 +563,7 @@
                     }
                 }
                 sp += count;
-                continue;
-            }
+            } continue;
 
             case ByteCodeInstruction::kLoadExtendedUniform: {
                 int count = READ8();
@@ -749,8 +577,7 @@
                     }
                 }
                 sp += count;
-                continue;
-            }
+            } continue;
 
             case ByteCodeInstruction::kMatrixToMatrix: {
                 int srcCols = READ8();
@@ -774,8 +601,7 @@
                         PUSH(tmp[c*4 + r]);
                     }
                 }
-                continue;
-            }
+            } continue;
 
             case ByteCodeInstruction::kMatrixMultiply: {
                 int lCols = READ8();
@@ -795,19 +621,15 @@
                 sp -= (lCols * lRows) + (rCols * rRows);
                 memcpy(sp + 1, tmp, rCols * lRows * sizeof(VValue));
                 sp += (rCols * lRows);
-                continue;
-            }
+            } continue;
 
             VECTOR_BINARY_FN(kMaxF, fFloat, skvx::max)
             VECTOR_BINARY_FN(kMaxS, fSigned, skvx::max)
             VECTOR_BINARY_FN(kMinF, fFloat, skvx::min)
             VECTOR_BINARY_FN(kMinS, fSigned, skvx::min)
 
-            case ByteCodeInstruction::kMix4:
-            case ByteCodeInstruction::kMix3:
-            case ByteCodeInstruction::kMix2:
             case ByteCodeInstruction::kMix: {
-                int count = (int)ByteCodeInstruction::kMix - (int)inst + 1;
+                int count = READ8();
                 for (int i = count; i --> 0; ) {
                     // GLSL's arguments are mix(else, true, cond)
                     sp[-(2*count + i)] = skvx::if_then_else(sp[-(          i)].fSigned,
@@ -815,39 +637,15 @@
                                                             sp[-(2*count + i)].fFloat);
                 }
                 sp -= 2 * count;
-                continue;
-            }
+            } continue;
 
             VECTOR_BINARY_OP(kMultiplyI, fSigned, *)
-            VECTOR_MATRIX_BINARY_OP(kMultiplyF, fFloat, *)
+            VECTOR_BINARY_OP(kMultiplyF, fFloat, *)
 
-            case ByteCodeInstruction::kNegateF4: sp[-3] = -sp[-3].fFloat; [[fallthrough]];
-            case ByteCodeInstruction::kNegateF3: sp[-2] = -sp[-2].fFloat; [[fallthrough]];
-            case ByteCodeInstruction::kNegateF2: sp[-1] = -sp[-1].fFloat; [[fallthrough]];
-            case ByteCodeInstruction::kNegateF:  sp[ 0] = -sp[ 0].fFloat;
-                                                 continue;
+            VECTOR_UNARY_FN(kNegateF, std::negate<>{}, fFloat)
+            VECTOR_UNARY_FN(kNegateI, std::negate<>{}, fSigned)
 
-            case ByteCodeInstruction::kNegateFN: {
-                int count = READ8();
-                for (int i = count - 1; i >= 0; --i) {
-                    sp[-i] = -sp[-i].fFloat;
-                }
-                continue;
-            }
-
-            case ByteCodeInstruction::kNegateI4: sp[-3] = -sp[-3].fSigned; [[fallthrough]];
-            case ByteCodeInstruction::kNegateI3: sp[-2] = -sp[-2].fSigned; [[fallthrough]];
-            case ByteCodeInstruction::kNegateI2: sp[-1] = -sp[-1].fSigned; [[fallthrough]];
-            case ByteCodeInstruction::kNegateI:  sp[ 0] = -sp[ 0].fSigned;
-                                                 continue;
-
-            case ByteCodeInstruction::kPop4: POP(); [[fallthrough]];
-            case ByteCodeInstruction::kPop3: POP(); [[fallthrough]];
-            case ByteCodeInstruction::kPop2: POP(); [[fallthrough]];
-            case ByteCodeInstruction::kPop:  POP();
-                                             continue;
-
-            case ByteCodeInstruction::kPopN:
+            case ByteCodeInstruction::kPop:
                 sp -= READ8();
                 continue;
 
@@ -857,25 +655,22 @@
                 PUSH(U32(READ32()));
                 continue;
 
-            case ByteCodeInstruction::kReadExternal:
-            case ByteCodeInstruction::kReadExternal2:
-            case ByteCodeInstruction::kReadExternal3:
-            case ByteCodeInstruction::kReadExternal4: {
-                int count = (int)ByteCodeInstruction::kReadExternal - (int)inst + 1;
-                int src = READ8();
+            case ByteCodeInstruction::kReadExternal: {
+                int count = READ8(),
+                    slot  = READ8();
+                SkASSERT(count <= 4);
                 float tmp[4];
                 I32 m = mask();
                 for (int i = 0; i < VecWidth; ++i) {
                     if (m[i]) {
-                        byteCode->fExternalValues[src]->read(baseIndex + i, tmp);
+                        byteCode->fExternalValues[slot]->read(baseIndex + i, tmp);
                         for (int j = 0; j < count; ++j) {
                             sp[j + 1].fFloat[i] = tmp[j];
                         }
                     }
                 }
                 sp += count;
-                continue;
-            }
+            } continue;
 
             VECTOR_BINARY_FN(kRemainderF, fFloat, VecMod)
             VECTOR_BINARY_MASKED_OP(kRemainderS, fSigned, %)
@@ -923,9 +718,8 @@
                     code = frame.fCode;
                     ip = frame.fIP;
                     frames.pop_back();
-                    continue;
                 }
-            }
+            } continue;
 
             case ByteCodeInstruction::kScalarToMatrix: {
                 int cols = READ8();
@@ -936,8 +730,7 @@
                         PUSH(c == r ? v : F32(0.0f));
                     }
                 }
-                continue;
-            }
+            }  continue;
 
             case ByteCodeInstruction::kShiftLeft:
                 sp[0] = sp[0].fSigned << READ8();
@@ -952,33 +745,23 @@
             VECTOR_UNARY_FN(kSin, skvx::sin, fFloat)
             VECTOR_UNARY_FN(kSqrt, skvx::sqrt, fFloat)
 
-            case ByteCodeInstruction::kStore4:
-                stack[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+3].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kStore3:
-                stack[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+2].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kStore2:
-                stack[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+1].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kStore:
-                stack[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+0].fFloat);
-                ++ip;
-                continue;
+            case ByteCodeInstruction::kStore: {  // operands: N, slot
+                int count = READ8(),  // N: number of values to pop and store
+                    slot  = READ8();  // index of the first destination slot in `stack`
+                auto m = mask();  // read the mask once for all N stores
+                for (int i = count; i --> 0; ) {  // top of stack goes to slot+N-1, then downward
+                    stack[slot+i] = skvx::if_then_else(m, POP().fFloat, stack[slot+i].fFloat);
+                }  // where m is false the slot keeps its previous value
+            } continue;
 
-            case ByteCodeInstruction::kStoreGlobal4:
-                globals[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+3].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kStoreGlobal3:
-                globals[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+2].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kStoreGlobal2:
-                globals[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+1].fFloat);
-                [[fallthrough]];
-            case ByteCodeInstruction::kStoreGlobal:
-                globals[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+0].fFloat);
-                ++ip;
-                continue;
+            case ByteCodeInstruction::kStoreGlobal: {  // operands: N, slot
+                int count = READ8(),  // N: number of values to pop and store
+                    slot  = READ8();  // index of the first destination slot in `globals`
+                auto m = mask();  // read the mask once for all N stores
+                for (int i = count; i --> 0; ) {  // top of stack goes to slot+N-1, then downward
+                    globals[slot+i] = skvx::if_then_else(m, POP().fFloat, globals[slot+i].fFloat);
+                }  // where m is false the slot keeps its previous value
+            } continue;
 
             case ByteCodeInstruction::kStoreExtended: {
                 int count = READ8();
@@ -993,8 +776,8 @@
                     }
                 }
                 sp -= count;
-                continue;
-            }
+            } continue;
+
             case ByteCodeInstruction::kStoreExtendedGlobal: {
                 int count = READ8();
                 I32 target = POP().fSigned;
@@ -1008,11 +791,10 @@
                     }
                 }
                 sp -= count;
-                continue;
-            }
+            } continue;
 
             VECTOR_BINARY_OP(kSubtractI, fSigned, -)
-            VECTOR_MATRIX_BINARY_OP(kSubtractF, fFloat, -)
+            VECTOR_BINARY_OP(kSubtractF, fFloat, -)
 
             case ByteCodeInstruction::kSwizzle: {
                 VValue tmp[4];
@@ -1022,18 +804,15 @@
                 for (int i = READ8() - 1; i >= 0; --i) {
                     PUSH(tmp[READ8()]);
                 }
-                continue;
-            }
+            } continue;
 
             VECTOR_UNARY_FN(kATan, skvx::atan, fFloat)
             VECTOR_UNARY_FN(kTan, skvx::tan, fFloat)
 
-            case ByteCodeInstruction::kWriteExternal4:
-            case ByteCodeInstruction::kWriteExternal3:
-            case ByteCodeInstruction::kWriteExternal2:
             case ByteCodeInstruction::kWriteExternal: {
-                int count = (int)ByteCodeInstruction::kWriteExternal - (int)inst + 1;
-                int target = READ8();
+                int count = READ8(),
+                    slot  = READ8();
+                SkASSERT(count <= 4);
                 float tmp[4];
                 I32 m = mask();
                 sp -= count;
@@ -1042,11 +821,10 @@
                         for (int j = 0; j < count; ++j) {
                             tmp[j] = sp[j + 1].fFloat[i];
                         }
-                        byteCode->fExternalValues[target]->write(baseIndex + i, tmp);
+                        byteCode->fExternalValues[slot]->write(baseIndex + i, tmp);
                     }
                 }
-                continue;
-            }
+            } continue;
 
             case ByteCodeInstruction::kMaskPush:
                 condPtr[1] = POP().fSigned;
@@ -1067,15 +845,13 @@
                     sp[-count] = skvx::if_then_else(m, sp[-count].fFloat, sp[0].fFloat);
                     --sp;
                 }
-                continue;
-            }
+            } continue;
             case ByteCodeInstruction::kBranchIfAllFalse: {
                 int target = READ16();
                 if (!skvx::any(mask())) {
                     ip = code + target;
                 }
-                continue;
-            }
+            } continue;
 
             case ByteCodeInstruction::kLoopBegin:
                 contPtr[1] = 0;
@@ -1099,8 +875,7 @@
                 I32 m = mask();
                 *contPtr |=  m;
                 *loopPtr &= ~m;
-                continue;
-            }
+            } continue;
 
             case ByteCodeInstruction::kLoadFragCoord:
             case ByteCodeInstruction::kSampleExplicit:
diff --git a/src/sksl/SkSLByteCode.h b/src/sksl/SkSLByteCode.h
index 9a1b19a..3832b9b 100644
--- a/src/sksl/SkSLByteCode.h
+++ b/src/sksl/SkSLByteCode.h
@@ -19,65 +19,61 @@
 class  ExternalValue;
 struct FunctionDeclaration;
 
-#define VECTOR(name) name ## 4, name ## 3, name ## 2, name
-#define VECTOR_MATRIX(name) name ## N, name ## 4, name ## 3, name ## 2, name
-
 enum class ByteCodeInstruction : uint16_t {
     // B = bool, F = float, I = int, S = signed, U = unsigned
-    VECTOR_MATRIX(kAddF),
-    VECTOR(kAddI),
-    kAndB,
-    VECTOR(kATan),
+
+    kAddF,  // N
+    kAddI,  // N
+    kAndB,  // N
+    kATan,  // N
     kBranch,
     // Followed by a byte indicating the index of the function to call
     kCall,
     // Followed by three bytes indicating: the number of argument slots, the number of return slots,
     // and the index of the external value to call
     kCallExternal,
-    VECTOR(kCeil),
+    kCeil,  // N
     // For dynamic array access: Followed by byte indicating length of array
     kClampIndex,
-    VECTOR(kCompareIEQ),
-    VECTOR(kCompareINEQ),
-    VECTOR_MATRIX(kCompareFEQ),
-    VECTOR_MATRIX(kCompareFNEQ),
-    VECTOR(kCompareFGT),
-    VECTOR(kCompareFGTEQ),
-    VECTOR(kCompareFLT),
-    VECTOR(kCompareFLTEQ),
-    VECTOR(kCompareSGT),
-    VECTOR(kCompareSGTEQ),
-    VECTOR(kCompareSLT),
-    VECTOR(kCompareSLTEQ),
-    VECTOR(kCompareUGT),
-    VECTOR(kCompareUGTEQ),
-    VECTOR(kCompareULT),
-    VECTOR(kCompareULTEQ),
-    VECTOR(kConvertFtoI),
-    VECTOR(kConvertStoF),
-    VECTOR(kConvertUtoF),
-    VECTOR(kCos),
-    VECTOR_MATRIX(kDivideF),
-    VECTOR(kDivideS),
-    VECTOR(kDivideU),
-    // Duplicates the top stack value
-    VECTOR_MATRIX(kDup),
-    VECTOR(kFloor),
-    VECTOR(kFract),
+    kCompareIEQ,    // N
+    kCompareINEQ,   // N
+    kCompareFEQ,    // N
+    kCompareFNEQ,   // N
+    kCompareFGT,    // N
+    kCompareFGTEQ,  // N
+    kCompareFLT,    // N
+    kCompareFLTEQ,  // N
+    kCompareSGT,    // N
+    kCompareSGTEQ,  // N
+    kCompareSLT,    // N
+    kCompareSLTEQ,  // N
+    kCompareUGT,    // N
+    kCompareUGTEQ,  // N
+    kCompareULT,    // N
+    kCompareULTEQ,  // N
+    kConvertFtoI,   // N
+    kConvertStoF,   // N
+    kConvertUtoF,   // N
+    kCos,           // N
+    kDivideF,       // N
+    kDivideS,       // N
+    kDivideU,       // N
+    // Duplicates the top N stack values
+    kDup,    // N
+    kFloor,  // N
+    kFract,  // N
     kInverse2x2,
     kInverse3x3,
     kInverse4x4,
     // A1, A2, .., B1, B2, .., T1, T2, .. -> lerp(A1, B1, T1), lerp(A2, B2, T2), ..
-    VECTOR(kLerp),
-    // kLoad/kLoadGlobal are followed by a byte indicating the local/global slot to load
-    VECTOR(kLoad),
-    VECTOR(kLoadGlobal),
-    VECTOR(kLoadUniform),
-    // kLoadExtended* are fallback load ops when we lack a specialization. They are followed by a
-    // count byte, and get the slot to load from the top of the stack.
-    kLoadExtended,
-    kLoadExtendedGlobal,
-    kLoadExtendedUniform,
+    kLerp,  // N
+    kLoad,                 // N, slot
+    kLoadGlobal,           // N, slot
+    kLoadUniform,          // N, slot
+    // Indirect loads get the slot to load from the top of the stack
+    kLoadExtended,         // N
+    kLoadExtendedGlobal,   // N
+    kLoadExtendedUniform,  // N
     // Loads "sk_FragCoord" [X, Y, Z, 1/W]
     kLoadFragCoord,
     // Followed by four bytes: srcCols, srcRows, dstCols, dstRows. Consumes the src matrix from the
@@ -87,28 +83,27 @@
     kMatrixToMatrix,
     // Followed by three bytes: leftCols (== rightRows), leftRows, rightCols
     kMatrixMultiply,
-    VECTOR(kMaxF),
-    VECTOR(kMaxS),  // SkSL only declares signed versions of min/max
-    VECTOR(kMinF),
-    VECTOR(kMinS),
+    kMaxF,  // N
+    kMaxS,  // N  --  SkSL only declares signed versions of min/max
+    kMinF,  // N
+    kMinS,  // N
     // Masked selection: Stack is ... A1, A2, A3, B1, B2, B3, M1, M2, M3
     //                   Result:      M1 ? B1 : A1, M2 ? B2 : A2, M3 ? B3 : A3
-    VECTOR(kMix),
-    VECTOR_MATRIX(kNegateF),
-    VECTOR(kNegateI),
-    VECTOR_MATRIX(kMultiplyF),
-    VECTOR(kMultiplyI),
-    VECTOR(kNotB),
-    kOrB,
-    VECTOR_MATRIX(kPop),
-    VECTOR(kPow),
+    kMix,        // N
+    kNegateF,    // N
+    kNegateI,    // N
+    kMultiplyF,  // N
+    kMultiplyI,  // N
+    kNotB,       // N
+    kOrB,        // N
+    kPop,        // N
+    kPow,        // N
     // Followed by a 32 bit value containing the value to push
     kPushImmediate,
-    // Followed by a byte indicating external value to read
-    VECTOR(kReadExternal),
-    VECTOR(kRemainderF),
-    VECTOR(kRemainderS),
-    VECTOR(kRemainderU),
+    kReadExternal,  // N, slot
+    kRemainderF,    // N
+    kRemainderS,    // N
+    kRemainderU,    // N
     // Followed by a byte indicating the number of slots to reserve on the stack (for later return)
     kReserve,
     // Followed by a byte indicating the number of slots being returned
@@ -126,24 +121,22 @@
     kShiftLeft,
     kShiftRightS,
     kShiftRightU,
-    VECTOR(kSin),
-    VECTOR(kSqrt),
-    // kStore/kStoreGlobal are followed by a byte indicating the local/global slot to store
-    VECTOR(kStore),
-    VECTOR(kStoreGlobal),
-    // Fallback stores. Followed by count byte, and get the slot to store from the top of the stack
-    kStoreExtended,
-    kStoreExtendedGlobal,
+    kSin,   // N
+    kSqrt,  // N
+    kStore,                // N, slot
+    kStoreGlobal,          // N, slot
+    // Indirect stores get the slot to store from the top of the stack
+    kStoreExtended,        // N
+    kStoreExtendedGlobal,  // N
     // Followed by two count bytes (1-4), and then one byte per swizzle component (0-3). The first
     // count byte provides the current vector size (the vector is the top n stack elements), and the
     // second count byte provides the swizzle component count.
     kSwizzle,
-    VECTOR_MATRIX(kSubtractF),
-    VECTOR(kSubtractI),
-    VECTOR(kTan),
-    // Followed by a byte indicating external value to write
-    VECTOR(kWriteExternal),
-    kXorB,
+    kSubtractF,  // N
+    kSubtractI,  // N
+    kTan,        // N
+    kWriteExternal,  // N, slot
+    kXorB,       // N
 
     kMaskPush,
     kMaskPop,
diff --git a/src/sksl/SkSLByteCodeGenerator.cpp b/src/sksl/SkSLByteCodeGenerator.cpp
index c28a982..c484d7c 100644
--- a/src/sksl/SkSLByteCodeGenerator.cpp
+++ b/src/sksl/SkSLByteCodeGenerator.cpp
@@ -191,7 +191,6 @@
         SkASSERT(fStackCount == 0);
     }
     this->write(ByteCodeInstruction::kReturn, 0);
-    this->write8(0);
 
     result->fLocalCount     = fLocals.size();
     result->fConditionCount = fMaxConditionCount;
@@ -265,128 +264,90 @@
 
     switch (inst) {
         // Unary functions/operators that don't change stack depth at all:
-#define VECTOR_UNARY_OP(base)                \
-        case ByteCodeInstruction::base:      \
-        case ByteCodeInstruction::base ## 2: \
-        case ByteCodeInstruction::base ## 3: \
-        case ByteCodeInstruction::base ## 4: \
-            return 0;
 
-        VECTOR_UNARY_OP(kConvertFtoI)
-        VECTOR_UNARY_OP(kConvertStoF)
-        VECTOR_UNARY_OP(kConvertUtoF)
+#define VEC_UNARY(inst) case ByteCodeInstruction::inst: return count - count;
 
-        VECTOR_UNARY_OP(kATan)
-        VECTOR_UNARY_OP(kCeil)
-        VECTOR_UNARY_OP(kCos)
-        VECTOR_UNARY_OP(kFloor)
-        VECTOR_UNARY_OP(kFract)
-        VECTOR_UNARY_OP(kSin)
-        VECTOR_UNARY_OP(kSqrt)
-        VECTOR_UNARY_OP(kTan)
+        VEC_UNARY(kConvertFtoI)
+        VEC_UNARY(kConvertStoF)
+        VEC_UNARY(kConvertUtoF)
 
-        VECTOR_UNARY_OP(kNegateF)
-        VECTOR_UNARY_OP(kNegateI)
-        VECTOR_UNARY_OP(kNotB)
+        VEC_UNARY(kATan)
+        VEC_UNARY(kCeil)
+        VEC_UNARY(kCos)
+        VEC_UNARY(kFloor)
+        VEC_UNARY(kFract)
+        VEC_UNARY(kSin)
+        VEC_UNARY(kSqrt)
+        VEC_UNARY(kTan)
+
+        VEC_UNARY(kNegateF)
+        VEC_UNARY(kNegateI)
+        VEC_UNARY(kNotB)
+
+#undef VEC_UNARY
 
         case ByteCodeInstruction::kInverse2x2:
         case ByteCodeInstruction::kInverse3x3:
         case ByteCodeInstruction::kInverse4x4: return 0;
 
-        case ByteCodeInstruction::kClampIndex: return 0;
-        case ByteCodeInstruction::kNegateFN: return 0;
-        case ByteCodeInstruction::kShiftLeft: return 0;
+        case ByteCodeInstruction::kClampIndex:  return 0;
+        case ByteCodeInstruction::kShiftLeft:   return 0;
         case ByteCodeInstruction::kShiftRightS: return 0;
         case ByteCodeInstruction::kShiftRightU: return 0;
 
-#undef VECTOR_UNARY_OP
+        // Binary functions/operators that do a 2 -> 1 reduction, N times
+        case ByteCodeInstruction::kAndB: return -count;
+        case ByteCodeInstruction::kOrB:  return -count;
+        case ByteCodeInstruction::kXorB: return -count;
 
-        // Binary functions/operators that do a 2 -> 1 reduction (possibly N times)
-#define VECTOR_BINARY_OP(base)                          \
-        case ByteCodeInstruction::base:      return -1; \
-        case ByteCodeInstruction::base ## 2: return -2; \
-        case ByteCodeInstruction::base ## 3: return -3; \
-        case ByteCodeInstruction::base ## 4: return -4;
+        case ByteCodeInstruction::kAddI: return -count;
+        case ByteCodeInstruction::kAddF: return -count;
 
-#define VECTOR_MATRIX_BINARY_OP(base)                   \
-        VECTOR_BINARY_OP(base)                          \
-        case ByteCodeInstruction::base ## N: return -count;
+        case ByteCodeInstruction::kCompareIEQ:   return -count;
+        case ByteCodeInstruction::kCompareFEQ:   return -count;
+        case ByteCodeInstruction::kCompareINEQ:  return -count;
+        case ByteCodeInstruction::kCompareFNEQ:  return -count;
+        case ByteCodeInstruction::kCompareSGT:   return -count;
+        case ByteCodeInstruction::kCompareUGT:   return -count;
+        case ByteCodeInstruction::kCompareFGT:   return -count;
+        case ByteCodeInstruction::kCompareSGTEQ: return -count;
+        case ByteCodeInstruction::kCompareUGTEQ: return -count;
+        case ByteCodeInstruction::kCompareFGTEQ: return -count;
+        case ByteCodeInstruction::kCompareSLT:   return -count;
+        case ByteCodeInstruction::kCompareULT:   return -count;
+        case ByteCodeInstruction::kCompareFLT:   return -count;
+        case ByteCodeInstruction::kCompareSLTEQ: return -count;
+        case ByteCodeInstruction::kCompareULTEQ: return -count;
+        case ByteCodeInstruction::kCompareFLTEQ: return -count;
 
-        case ByteCodeInstruction::kAndB: return -1;
-        case ByteCodeInstruction::kOrB:  return -1;
-        case ByteCodeInstruction::kXorB: return -1;
-
-        VECTOR_BINARY_OP(kAddI)
-        VECTOR_MATRIX_BINARY_OP(kAddF)
-
-        VECTOR_BINARY_OP(kCompareIEQ)
-        VECTOR_MATRIX_BINARY_OP(kCompareFEQ)
-        VECTOR_BINARY_OP(kCompareINEQ)
-        VECTOR_MATRIX_BINARY_OP(kCompareFNEQ)
-        VECTOR_BINARY_OP(kCompareSGT)
-        VECTOR_BINARY_OP(kCompareUGT)
-        VECTOR_BINARY_OP(kCompareFGT)
-        VECTOR_BINARY_OP(kCompareSGTEQ)
-        VECTOR_BINARY_OP(kCompareUGTEQ)
-        VECTOR_BINARY_OP(kCompareFGTEQ)
-        VECTOR_BINARY_OP(kCompareSLT)
-        VECTOR_BINARY_OP(kCompareULT)
-        VECTOR_BINARY_OP(kCompareFLT)
-        VECTOR_BINARY_OP(kCompareSLTEQ)
-        VECTOR_BINARY_OP(kCompareULTEQ)
-        VECTOR_BINARY_OP(kCompareFLTEQ)
-
-        VECTOR_BINARY_OP(kDivideS)
-        VECTOR_BINARY_OP(kDivideU)
-        VECTOR_MATRIX_BINARY_OP(kDivideF)
-        VECTOR_BINARY_OP(kMaxF)
-        VECTOR_BINARY_OP(kMaxS)
-        VECTOR_BINARY_OP(kMinF)
-        VECTOR_BINARY_OP(kMinS)
-        VECTOR_BINARY_OP(kMultiplyI)
-        VECTOR_MATRIX_BINARY_OP(kMultiplyF)
-        VECTOR_BINARY_OP(kPow)
-        VECTOR_BINARY_OP(kRemainderF)
-        VECTOR_BINARY_OP(kRemainderS)
-        VECTOR_BINARY_OP(kRemainderU)
-        VECTOR_BINARY_OP(kSubtractI)
-        VECTOR_MATRIX_BINARY_OP(kSubtractF)
-
-#undef VECTOR_BINARY_OP
-#undef VECTOR_MATRIX_BINARY_OP
+        case ByteCodeInstruction::kDivideS:    return -count;
+        case ByteCodeInstruction::kDivideU:    return -count;
+        case ByteCodeInstruction::kDivideF:    return -count;
+        case ByteCodeInstruction::kMaxF:       return -count;
+        case ByteCodeInstruction::kMaxS:       return -count;
+        case ByteCodeInstruction::kMinF:       return -count;
+        case ByteCodeInstruction::kMinS:       return -count;
+        case ByteCodeInstruction::kMultiplyI:  return -count;
+        case ByteCodeInstruction::kMultiplyF:  return -count;
+        case ByteCodeInstruction::kPow:        return -count;
+        case ByteCodeInstruction::kRemainderF: return -count;
+        case ByteCodeInstruction::kRemainderS: return -count;
+        case ByteCodeInstruction::kRemainderU: return -count;
+        case ByteCodeInstruction::kSubtractI:  return -count;
+        case ByteCodeInstruction::kSubtractF:  return -count;
 
         // Ops that push or load data to grow the stack:
+        case ByteCodeInstruction::kPushImmediate:
+            return 1;
+        case ByteCodeInstruction::kLoadFragCoord:
+            return 4;
+
         case ByteCodeInstruction::kDup:
         case ByteCodeInstruction::kLoad:
         case ByteCodeInstruction::kLoadGlobal:
         case ByteCodeInstruction::kLoadUniform:
         case ByteCodeInstruction::kReadExternal:
-        case ByteCodeInstruction::kPushImmediate:
-            return 1;
-
-        case ByteCodeInstruction::kDup2:
-        case ByteCodeInstruction::kLoad2:
-        case ByteCodeInstruction::kLoadGlobal2:
-        case ByteCodeInstruction::kLoadUniform2:
-        case ByteCodeInstruction::kReadExternal2:
-            return 2;
-
-        case ByteCodeInstruction::kDup3:
-        case ByteCodeInstruction::kLoad3:
-        case ByteCodeInstruction::kLoadGlobal3:
-        case ByteCodeInstruction::kLoadUniform3:
-        case ByteCodeInstruction::kReadExternal3:
-            return 3;
-
-        case ByteCodeInstruction::kDup4:
-        case ByteCodeInstruction::kLoad4:
-        case ByteCodeInstruction::kLoadGlobal4:
-        case ByteCodeInstruction::kLoadUniform4:
-        case ByteCodeInstruction::kReadExternal4:
-        case ByteCodeInstruction::kLoadFragCoord:
-            return 4;
-
-        case ByteCodeInstruction::kDupN:
+        case ByteCodeInstruction::kReserve:
             return count;
 
         // Pushes 'count' values, minus one for the 'address' that's consumed first
@@ -397,30 +358,10 @@
 
         // Ops that pop or store data to shrink the stack:
         case ByteCodeInstruction::kPop:
+        case ByteCodeInstruction::kReturn:
         case ByteCodeInstruction::kStore:
         case ByteCodeInstruction::kStoreGlobal:
         case ByteCodeInstruction::kWriteExternal:
-            return -1;
-
-        case ByteCodeInstruction::kPop2:
-        case ByteCodeInstruction::kStore2:
-        case ByteCodeInstruction::kStoreGlobal2:
-        case ByteCodeInstruction::kWriteExternal2:
-            return -2;
-
-        case ByteCodeInstruction::kPop3:
-        case ByteCodeInstruction::kStore3:
-        case ByteCodeInstruction::kStoreGlobal3:
-        case ByteCodeInstruction::kWriteExternal3:
-            return -3;
-
-        case ByteCodeInstruction::kPop4:
-        case ByteCodeInstruction::kStore4:
-        case ByteCodeInstruction::kStoreGlobal4:
-        case ByteCodeInstruction::kWriteExternal4:
-            return -4;
-
-        case ByteCodeInstruction::kPopN:
             return -count;
 
         // Consumes 'count' values, plus one for the 'address'
@@ -432,8 +373,6 @@
         case ByteCodeInstruction::kCallExternal:
         case ByteCodeInstruction::kMatrixToMatrix:
         case ByteCodeInstruction::kMatrixMultiply:
-        case ByteCodeInstruction::kReserve:
-        case ByteCodeInstruction::kReturn:
         case ByteCodeInstruction::kScalarToMatrix:
         case ByteCodeInstruction::kSwizzle:
             return count;
@@ -446,16 +385,10 @@
         case ByteCodeInstruction::kSampleMatrix: return 4 - 9;
 
         // kMix does a 3 -> 1 reduction (A, B, M -> A -or- B) for each component
-        case ByteCodeInstruction::kMix:  return -2;
-        case ByteCodeInstruction::kMix2: return -4;
-        case ByteCodeInstruction::kMix3: return -6;
-        case ByteCodeInstruction::kMix4: return -8;
+        case ByteCodeInstruction::kMix:  return -(2 * count);
 
         // kLerp works the same way (producing lerp(A, B, T) for each component)
-        case ByteCodeInstruction::kLerp:  return -2;
-        case ByteCodeInstruction::kLerp2: return -4;
-        case ByteCodeInstruction::kLerp3: return -6;
-        case ByteCodeInstruction::kLerp4: return -8;
+        case ByteCodeInstruction::kLerp:  return -(2 * count);
 
         // kCall is net-zero. Max stack depth is adjusted in writeFunctionCall.
         case ByteCodeInstruction::kCall:             return 0;
@@ -583,7 +516,7 @@
                 if (offset != 0) {
                     this->write(ByteCodeInstruction::kPushImmediate);
                     this->write32(offset);
-                    this->write(ByteCodeInstruction::kAddI);
+                    this->write(ByteCodeInstruction::kAddI, 1);
                 }
                 return baseLoc;
             } else {
@@ -617,7 +550,7 @@
                 if (stride != 1) {
                     this->write(ByteCodeInstruction::kPushImmediate);
                     this->write32(stride);
-                    this->write(ByteCodeInstruction::kMultiplyI);
+                    this->write(ByteCodeInstruction::kMultiplyI, 1);
                 }
             }
             Location baseLoc = this->getLocation(*i.fBase);
@@ -643,7 +576,7 @@
                 this->write(ByteCodeInstruction::kPushImmediate);
                 this->write32(offset);
             }
-            this->write(ByteCodeInstruction::kAddI);
+            this->write(ByteCodeInstruction::kAddI, 1);
             return baseLoc.makeOnStack();
         }
         case Expression::kSwizzle_Kind: {
@@ -655,7 +588,7 @@
                 if (offset != 0) {
                     this->write(ByteCodeInstruction::kPushImmediate);
                     this->write32(offset);
-                    this->write(ByteCodeInstruction::kAddI);
+                    this->write(ByteCodeInstruction::kAddI, 1);
                 }
                 return baseLoc;
             } else {
@@ -701,33 +634,37 @@
     this->write16((uint16_t)i);
     fStackCount += StackUsage(i, count);
     fMaxStackCount = std::max(fMaxStackCount, fStackCount);
+
+    // Most ops have an explicit count byte after them (passed here as 'count')
+    // Ops that don't have a count byte pass the default (kUnusedStackCount)
+    // There are a handful of strange ops that pass in a computed stack delta as count, but where
+    // that value should *not* be written as a count byte (it may even be negative!)
+    if (count != kUnusedStackCount) {
+        switch (i) {
+            // Odd instructions that have a non-default count, but we shouldn't write it
+            case ByteCodeInstruction::kCallExternal:
+            case ByteCodeInstruction::kMatrixToMatrix:
+            case ByteCodeInstruction::kMatrixMultiply:
+            case ByteCodeInstruction::kScalarToMatrix:
+            case ByteCodeInstruction::kSwizzle:
+                break;
+            default:
+                this->write8(count);
+                break;
+        }
+    }
 }
 
-static ByteCodeInstruction vector_instruction(ByteCodeInstruction base, int count) {
-    SkASSERT(count >= 1 && count <= 4);
-    return ((ByteCodeInstruction) ((int) base + 1 - count));
-}
-
-void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCodeInstruction s,
-                                              ByteCodeInstruction u, ByteCodeInstruction f,
+void ByteCodeGenerator::writeTypedInstruction(const Type& type,
+                                              ByteCodeInstruction s,
+                                              ByteCodeInstruction u,
+                                              ByteCodeInstruction f,
                                               int count) {
     switch (type_category(type)) {
         case TypeCategory::kBool:
-        case TypeCategory::kSigned:
-            this->write(vector_instruction(s, count));
-            break;
-        case TypeCategory::kUnsigned:
-            this->write(vector_instruction(u, count));
-            break;
-        case TypeCategory::kFloat: {
-            if (count > 4) {
-                this->write((ByteCodeInstruction)((int)f + 1 - 5), count);
-                this->write8(count);
-            } else {
-                this->write(vector_instruction(f, count));
-            }
-            break;
-        }
+        case TypeCategory::kSigned:   this->write(s, count); break;
+        case TypeCategory::kUnsigned: this->write(u, count); break;
+        case TypeCategory::kFloat:    this->write(f, count); break;
         default:
             SkASSERT(false);
     }
@@ -756,7 +693,7 @@
         op = b.fOperator;
         if (!lVecOrMtx && rVecOrMtx) {
             for (int i = SlotCount(rType); i > 1; --i) {
-                this->write(ByteCodeInstruction::kDup);
+                this->write(ByteCodeInstruction::kDup, 1);
             }
         }
     }
@@ -765,25 +702,25 @@
     switch (op) {
         case Token::Kind::TK_LOGICALAND: {
             SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
-            this->write(ByteCodeInstruction::kDup);
+            this->write(ByteCodeInstruction::kDup, 1);
             this->write(ByteCodeInstruction::kMaskPush);
             this->write(ByteCodeInstruction::kBranchIfAllFalse);
             DeferredLocation falseLocation(this);
             this->writeExpression(*b.fRight);
-            this->write(ByteCodeInstruction::kAndB);
+            this->write(ByteCodeInstruction::kAndB, 1);
             falseLocation.set();
             this->write(ByteCodeInstruction::kMaskPop);
             return false;
         }
         case Token::Kind::TK_LOGICALOR: {
             SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
-            this->write(ByteCodeInstruction::kDup);
-            this->write(ByteCodeInstruction::kNotB);
+            this->write(ByteCodeInstruction::kDup, 1);
+            this->write(ByteCodeInstruction::kNotB, 1);
             this->write(ByteCodeInstruction::kMaskPush);
             this->write(ByteCodeInstruction::kBranchIfAllFalse);
             DeferredLocation falseLocation(this);
             this->writeExpression(*b.fRight);
-            this->write(ByteCodeInstruction::kOrB);
+            this->write(ByteCodeInstruction::kOrB, 1);
             falseLocation.set();
             this->write(ByteCodeInstruction::kMaskPop);
             return false;
@@ -819,7 +756,7 @@
     this->writeExpression(*b.fRight);
     if (lVecOrMtx && !rVecOrMtx) {
         for (int i = SlotCount(lType); i > 1; --i) {
-            this->write(ByteCodeInstruction::kDup);
+            this->write(ByteCodeInstruction::kDup, 1);
         }
     }
     // Special case for M*V, V*M, M*M (but not V*V!)
@@ -849,7 +786,7 @@
                                             count);
                 // Collapse to a single bool
                 for (int i = count; i > 1; --i) {
-                    this->write(ByteCodeInstruction::kAndB);
+                    this->write(ByteCodeInstruction::kAndB, 1);
                 }
                 break;
             case Token::Kind::TK_GT:
@@ -889,7 +826,7 @@
                                             count);
                 // Collapse to a single bool
                 for (int i = count; i > 1; --i) {
-                    this->write(ByteCodeInstruction::kOrB);
+                    this->write(ByteCodeInstruction::kOrB, 1);
                 }
                 break;
             case Token::Kind::TK_PERCENT:
@@ -918,24 +855,21 @@
                 break;
 
             case Token::Kind::TK_LOGICALXOR:
-                SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
-                this->write(ByteCodeInstruction::kXorB);
+                SkASSERT(tc == SkSL::TypeCategory::kBool);
+                this->write(ByteCodeInstruction::kXorB, count);
                 break;
 
             case Token::Kind::TK_BITWISEAND:
-                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
-                                        tc == SkSL::TypeCategory::kUnsigned));
-                this->write(ByteCodeInstruction::kAndB);
+                SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
+                this->write(ByteCodeInstruction::kAndB, count);
                 break;
             case Token::Kind::TK_BITWISEOR:
-                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
-                                        tc == SkSL::TypeCategory::kUnsigned));
-                this->write(ByteCodeInstruction::kOrB);
+                SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
+                this->write(ByteCodeInstruction::kOrB, count);
                 break;
             case Token::Kind::TK_BITWISEXOR:
-                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
-                                        tc == SkSL::TypeCategory::kUnsigned));
-                this->write(ByteCodeInstruction::kXorB);
+                SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
+                this->write(ByteCodeInstruction::kXorB, count);
                 break;
 
             default:
@@ -972,13 +906,13 @@
             if (inCategory == TypeCategory::kFloat) {
                 SkASSERT(outCategory == TypeCategory::kSigned ||
                          outCategory == TypeCategory::kUnsigned);
-                this->write(vector_instruction(ByteCodeInstruction::kConvertFtoI, outCount));
+                this->write(ByteCodeInstruction::kConvertFtoI, outCount);
             } else if (outCategory == TypeCategory::kFloat) {
                 if (inCategory == TypeCategory::kSigned) {
-                    this->write(vector_instruction(ByteCodeInstruction::kConvertStoF, outCount));
+                    this->write(ByteCodeInstruction::kConvertStoF, outCount);
                 } else {
                     SkASSERT(inCategory == TypeCategory::kUnsigned);
-                    this->write(vector_instruction(ByteCodeInstruction::kConvertUtoF, outCount));
+                    this->write(ByteCodeInstruction::kConvertUtoF, outCount);
                 }
             } else {
                 SkASSERT(false);
@@ -1000,7 +934,7 @@
             } else {
                 SkASSERT(outType.kind() == Type::kVector_Kind);
                 for (; inCount != outCount; ++inCount) {
-                    this->write(ByteCodeInstruction::kDup);
+                    this->write(ByteCodeInstruction::kDup, 1);
                 }
             }
         }
@@ -1025,7 +959,7 @@
 
 void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e) {
     int count = SlotCount(e.fValue->type());
-    this->write(vector_instruction(ByteCodeInstruction::kReadExternal, count));
+    this->write(ByteCodeInstruction::kReadExternal, count);
     int index = fOutput->fExternalValues.size();
     fOutput->fExternalValues.push_back(e.fValue);
     SkASSERT(index <= 255);
@@ -1056,18 +990,12 @@
                                         ByteCodeInstruction::kLoadExtendedGlobal,
                                         ByteCodeInstruction::kLoadExtendedUniform),
                     count);
-        this->write8(count);
     } else {
-        while (count) {
-            int loadCount = std::min(count, 4);
-            this->write(vector_instruction(location.selectLoad(ByteCodeInstruction::kLoad,
-                                                               ByteCodeInstruction::kLoadGlobal,
-                                                               ByteCodeInstruction::kLoadUniform),
-                                           loadCount));
-            this->write8(location.fSlot);
-            count -= loadCount;
-            location.fSlot += loadCount;
-        }
+        this->write(location.selectLoad(ByteCodeInstruction::kLoad,
+                                        ByteCodeInstruction::kLoadGlobal,
+                                        ByteCodeInstruction::kLoadUniform),
+                    count);
+        this->write8(location.fSlot);
     }
 }
 
@@ -1102,7 +1030,7 @@
     auto dupSmallerType = [count, this](int smallCount) {
         SkASSERT(smallCount == 1 || smallCount == count);
         for (int i = smallCount; i < count; ++i) {
-            this->write(ByteCodeInstruction::kDup);
+            this->write(ByteCodeInstruction::kDup, 1);
         }
     };
 
@@ -1179,33 +1107,33 @@
         switch (intrin.special) {
             case SpecialIntrinsic::kAll: {
                 for (int i = count-1; i --> 0;) {
-                    this->write(ByteCodeInstruction::kAndB);
+                    this->write(ByteCodeInstruction::kAndB, 1);
                 }
             } break;
 
             case SpecialIntrinsic::kAny: {
                 for (int i = count-1; i --> 0;) {
-                    this->write(ByteCodeInstruction::kOrB);
+                    this->write(ByteCodeInstruction::kOrB, 1);
                 }
             } break;
 
             case SpecialIntrinsic::kDot: {
                 SkASSERT(c.fArguments.size() == 2);
                 SkASSERT(count == SlotCount(c.fArguments[1]->fType));
-                this->write(vector_instruction(ByteCodeInstruction::kMultiplyF, count));
+                this->write(ByteCodeInstruction::kMultiplyF, count);
                 for (int i = count-1; i --> 0;) {
-                    this->write(ByteCodeInstruction::kAddF);
+                    this->write(ByteCodeInstruction::kAddF, 1);
                 }
             } break;
 
             case SpecialIntrinsic::kLength: {
                 SkASSERT(c.fArguments.size() == 1);
-                this->write(vector_instruction(ByteCodeInstruction::kDup      , count));
-                this->write(vector_instruction(ByteCodeInstruction::kMultiplyF, count));
+                this->write(ByteCodeInstruction::kDup, count);
+                this->write(ByteCodeInstruction::kMultiplyF, count);
                 for (int i = count-1; i --> 0;) {
-                    this->write(ByteCodeInstruction::kAddF);
+                    this->write(ByteCodeInstruction::kAddF, 1);
                 }
-                this->write(ByteCodeInstruction::kSqrt);
+                this->write(ByteCodeInstruction::kSqrt, 1);
             } break;
 
             case SpecialIntrinsic::kMax:
@@ -1237,25 +1165,25 @@
                 if (is_generic_type(&c.fArguments[2]->fType, fContext.fGenBType_Type.get())) {
                     // mix(genType, genType, genBoolType)
                     SkASSERT(selectorCount == count);
-                    this->write(vector_instruction(ByteCodeInstruction::kMix, count));
+                    this->write(ByteCodeInstruction::kMix, count);
                 } else {
                     // mix(genType, genType, genType) or mix(genType, genType, float)
                     dupSmallerType(selectorCount);
-                    this->write(vector_instruction(ByteCodeInstruction::kLerp, count));
+                    this->write(ByteCodeInstruction::kLerp, count);
                 }
             } break;
 
             case SpecialIntrinsic::kNormalize: {
                 SkASSERT(c.fArguments.size() == 1);
-                this->write(vector_instruction(ByteCodeInstruction::kDup      , count));
-                this->write(vector_instruction(ByteCodeInstruction::kDup      , count));
-                this->write(vector_instruction(ByteCodeInstruction::kMultiplyF, count));
+                this->write(ByteCodeInstruction::kDup, count);
+                this->write(ByteCodeInstruction::kDup, count);
+                this->write(ByteCodeInstruction::kMultiplyF, count);
                 for (int i = count-1; i --> 0;) {
-                    this->write(ByteCodeInstruction::kAddF);
+                    this->write(ByteCodeInstruction::kAddF, 1);
                 }
-                this->write(ByteCodeInstruction::kSqrt);
+                this->write(ByteCodeInstruction::kSqrt, 1);
                 dupSmallerType(1);
-                this->write(vector_instruction(ByteCodeInstruction::kDivideF, count));
+                this->write(ByteCodeInstruction::kDivideF, count);
             } break;
 
             default:
@@ -1277,8 +1205,11 @@
             }
 
             default:
-                this->writeTypedInstruction(c.fArguments[0]->fType, intrin.inst_s, intrin.inst_u,
-                                            intrin.inst_f, count);
+                this->writeTypedInstruction(c.fArguments[0]->fType,
+                                            intrin.inst_s,
+                                            intrin.inst_u,
+                                            intrin.inst_f,
+                                            count);
                 break;
         }
     }
@@ -1311,7 +1242,6 @@
     // We may need to deal with out parameters, so the sequence is tricky
     if (int returnCount = SlotCount(f.fType)) {
         this->write(ByteCodeInstruction::kReserve, returnCount);
-        this->write8(returnCount);
     }
 
     int argCount = f.fArguments.size();
@@ -1343,11 +1273,8 @@
     // counts for all parameters that aren't out-params, so we can pop them in one big chunk.
     int popCount = 0;
     auto pop = [&]() {
-        if (popCount > 4) {
-            this->write(ByteCodeInstruction::kPopN, popCount);
-            this->write8(popCount);
-        } else if (popCount > 0) {
-            this->write(vector_instruction(ByteCodeInstruction::kPop, popCount));
+        if (popCount > 0) {
+            this->write(ByteCodeInstruction::kPop, popCount);
         }
         popCount = 0;
     };
@@ -1419,7 +1346,7 @@
                      (p.fOperator == Token::Kind::TK_BITWISENOT && (tc == TypeCategory::kSigned ||
                                                                  tc == TypeCategory::kUnsigned)));
             this->writeExpression(*p.fOperand);
-            this->write(ByteCodeInstruction::kNotB);
+            this->write(ByteCodeInstruction::kNotB, 1);
             break;
         }
         default:
@@ -1437,7 +1364,7 @@
             lvalue->load();
             // If we're not supposed to discard the result, then make a copy *before* the +/-
             if (!discard) {
-                this->write(ByteCodeInstruction::kDup);
+                this->write(ByteCodeInstruction::kDup, 1);
             }
             this->write(ByteCodeInstruction::kPushImmediate);
             this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
@@ -1491,7 +1418,6 @@
     this->write(ByteCodeInstruction::kMaskNegate);
     this->writeExpression(*t.fIfFalse);
     this->write(ByteCodeInstruction::kMaskBlend, count);
-    this->write8(count);
 }
 
 void ByteCodeGenerator::writeExpression(const Expression& e, bool discard) {
@@ -1548,11 +1474,8 @@
     }
     if (discard) {
         int count = SlotCount(e.fType);
-        if (count > 4) {
-            this->write(ByteCodeInstruction::kPopN, count);
-            this->write8(count);
-        } else if (count != 0) {
-            this->write(vector_instruction(ByteCodeInstruction::kPop, count));
+        if (count > 0) {
+            this->write(ByteCodeInstruction::kPop, count);
         }
         discard = false;
     }
@@ -1566,15 +1489,15 @@
         , fIndex(index) {}
 
     void load() override {
-        fGenerator.write(vector_instruction(ByteCodeInstruction::kReadExternal, fCount));
+        fGenerator.write(ByteCodeInstruction::kReadExternal, fCount);
         fGenerator.write8(fIndex);
     }
 
     void store(bool discard) override {
         if (!discard) {
-            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, fCount));
+            fGenerator.write(ByteCodeInstruction::kDup, fCount);
         }
-        fGenerator.write(vector_instruction(ByteCodeInstruction::kWriteExternal, fCount));
+        fGenerator.write(ByteCodeInstruction::kWriteExternal, fCount);
         fGenerator.write8(fIndex);
     }
 
@@ -1582,7 +1505,6 @@
     typedef LValue INHERITED;
 
     int fCount;
-
     int fIndex;
 };
 
@@ -1599,7 +1521,7 @@
     void store(bool discard) override {
         int count = fSwizzle.fComponents.size();
         if (!discard) {
-            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
+            fGenerator.write(ByteCodeInstruction::kDup, count);
         }
         // We already have the correct number of values on the stack, thanks to type checking.
         // The algorithm: Walk down the values on the stack, doing 'count' single-element stores.
@@ -1615,16 +1537,16 @@
             ByteCodeGenerator::Location location = fGenerator.getLocation(*fSwizzle.fBase);
             if (!location.isOnStack()) {
                 fGenerator.write(location.selectStore(ByteCodeInstruction::kStore,
-                                                      ByteCodeInstruction::kStoreGlobal));
+                                                      ByteCodeInstruction::kStoreGlobal),
+                                 1);
                 fGenerator.write8(location.fSlot + fSwizzle.fComponents[i]);
             } else {
                 fGenerator.write(ByteCodeInstruction::kPushImmediate);
                 fGenerator.write32(fSwizzle.fComponents[i]);
-                fGenerator.write(ByteCodeInstruction::kAddI);
+                fGenerator.write(ByteCodeInstruction::kAddI, 1);
                 fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreExtended,
                                                       ByteCodeInstruction::kStoreExtendedGlobal),
                                  1);
-                fGenerator.write8(1);
             }
         }
     }
@@ -1648,28 +1570,17 @@
     void store(bool discard) override {
         int count = ByteCodeGenerator::SlotCount(fExpression.fType);
         if (!discard) {
-            if (count > 4) {
-                fGenerator.write(ByteCodeInstruction::kDupN, count);
-                fGenerator.write8(count);
-            } else {
-                fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
-            }
+            fGenerator.write(ByteCodeInstruction::kDup, count);
         }
         ByteCodeGenerator::Location location = fGenerator.getLocation(fExpression);
-        if (location.isOnStack() || count > 4) {
-            if (!location.isOnStack()) {
-                fGenerator.write(ByteCodeInstruction::kPushImmediate);
-                fGenerator.write32(location.fSlot);
-            }
+        if (location.isOnStack()) {
             fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreExtended,
                                                   ByteCodeInstruction::kStoreExtendedGlobal),
                              count);
-            fGenerator.write8(count);
         } else {
-            fGenerator.write(
-                    vector_instruction(location.selectStore(ByteCodeInstruction::kStore,
-                                                            ByteCodeInstruction::kStoreGlobal),
-                                       count));
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStore,
+                                                  ByteCodeInstruction::kStoreGlobal),
+                             count);
             fGenerator.write8(location.fSlot);
         }
     }
@@ -1811,8 +1722,7 @@
     // we account for those in writeFunction().
 
     // This is all fine because we don't allow conditional returns, so we only return once anyway.
-    this->write(ByteCodeInstruction::kReturn, -count);
-    this->write8(count);
+    this->write(ByteCodeInstruction::kReturn, count);
 }
 
 void ByteCodeGenerator::writeSwitchStatement(const SwitchStatement& r) {
@@ -1828,15 +1738,8 @@
         if (decl.fValue) {
             this->writeExpression(*decl.fValue);
             int count = SlotCount(decl.fValue->fType);
-            if (count > 4) {
-                this->write(ByteCodeInstruction::kPushImmediate);
-                this->write32(location.fSlot);
-                this->write(ByteCodeInstruction::kStoreExtended, count);
-                this->write8(count);
-            } else {
-                this->write(vector_instruction(ByteCodeInstruction::kStore, count));
-                this->write8(location.fSlot);
-            }
+            this->write(ByteCodeInstruction::kStore, count);
+            this->write8(location.fSlot);
         }
     }
 }