only emit _imm ops when JITing for x86

There are probably ways to make this more efficient by only optimizing
what's necessary (e.g. try the JIT first, then fall back to the
interpreter only if it fails), and there are other performance
improvements to make, but for now I want to focus mostly on keeping
things simple and correct.

The line between Builder::done() and Program::Program() is particularly
fuzzy and becoming fuzzier here, and I think that's something that will
change eventually.

This makes SkVMTest debug dumps more portable, though perhaps less
useful.  I might kill that feature soon, now that SkVM is tested more
thoroughly in unit tests, GMs, bots, and such.

Change-Id: Id9ce8daaf8570e5bea8b10f1a80b97f5b33d45dc
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/269941
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/resources/SkVMTest.expected b/resources/SkVMTest.expected
index 8d71444..ca199c9 100644
--- a/resources/SkVMTest.expected
+++ b/resources/SkVMTest.expected
@@ -1,658 +1,714 @@
 A8 over A8
-12 values (originally 16):
-  v0 = load8 arg(0)
-  v1 = to_f32 v0
-  v2 = mul_f32 v1 3B808081 (0.0039215689)
-  v3 = load8 arg(1)
-  v4 = to_f32 v3
-  v5 = mul_f32 v4 3B808081 (0.0039215689)
-↑ v6 = splat 3F800000 (1)
-  v7 = sub_f32 v6 v2
-  v8 = mad_f32 v5 v7 v2
-  v9 = mul_f32 v8 437F0000 (255)
-  v10 = round v9
-  store8 arg(1) v10
-
-4 registers, 12 instructions:
-0	r0 = splat 3F800000 (1)
-loop:
-1	    r1 = load8 arg(0)
-2	    r1 = to_f32 r1
-3	    r1 = mul_f32 r1 3B808081 (0.0039215689)
-4	    r2 = load8 arg(1)
-5	    r2 = to_f32 r2
-6	    r2 = mul_f32 r2 3B808081 (0.0039215689)
-7	    r3 = sub_f32 r0 r1
-8	    r1 = mad_f32 r2 r3 r1
-9	    r1 = mul_f32 r1 437F0000 (255)
-10	    r1 = round r1
-11	    store8 arg(1) r1
-
-A8 over G8
-17 values (originally 22):
-  v0 = load8 arg(1)
-  v1 = to_f32 v0
-  v2 = mul_f32 v1 3B808081 (0.0039215689)
-  v3 = load8 arg(0)
-  v4 = to_f32 v3
-  v5 = mul_f32 v4 3B808081 (0.0039215689)
-↑ v6 = splat 3F800000 (1)
-  v7 = sub_f32 v6 v5
-  v8 = mul_f32 v2 v7
-↑ v9 = splat 3E59B3D0 (0.21259999)
-↑ v10 = splat 3F371759 (0.71520001)
-  v11 = mul_f32 v8 3D93DD98 (0.0722)
-  v12 = mad_f32 v8 v10 v11
-  v13 = mad_f32 v8 v9 v12
-  v14 = mul_f32 v13 437F0000 (255)
-  v15 = round v14
-  store8 arg(1) v15
-
-5 registers, 17 instructions:
-0	r0 = splat 3F800000 (1)
-1	r1 = splat 3E59B3D0 (0.21259999)
-2	r2 = splat 3F371759 (0.71520001)
-loop:
-3	    r3 = load8 arg(1)
-4	    r3 = to_f32 r3
-5	    r3 = mul_f32 r3 3B808081 (0.0039215689)
-6	    r4 = load8 arg(0)
-7	    r4 = to_f32 r4
-8	    r4 = mul_f32 r4 3B808081 (0.0039215689)
-9	    r4 = sub_f32 r0 r4
-10	    r4 = mul_f32 r3 r4
-11	    r3 = mul_f32 r4 3D93DD98 (0.0722)
-12	    r3 = mad_f32 r4 r2 r3
-13	    r3 = mad_f32 r4 r1 r3
-14	    r3 = mul_f32 r3 437F0000 (255)
-15	    r3 = round r3
-16	    store8 arg(1) r3
-
-A8 over RGBA_8888
-36 values (originally 40):
-  v0 = load32 arg(1)
-  v1 = bit_and v0 FF
+14 values (originally 16):
+↑ v0 = splat 3B808081 (0.0039215689)
+  v1 = load8 arg(0)
   v2 = to_f32 v1
-  v3 = mul_f32 v2 3B808081 (0.0039215689)
-  v4 = load8 arg(0)
-  v5 = to_f32 v4
-  v6 = mul_f32 v5 3B808081 (0.0039215689)
-↑ v7 = splat 3F800000 (1)
-  v8 = sub_f32 v7 v6
-  v9 = mul_f32 v3 v8
-  v10 = mul_f32 v9 437F0000 (255)
-  v11 = round v10
-  v12 = shr_i32 v0 8
-  v13 = bit_and v12 FF
-  v14 = to_f32 v13
-  v15 = mul_f32 v14 3B808081 (0.0039215689)
-  v16 = mul_f32 v15 v8
-  v17 = mul_f32 v16 437F0000 (255)
-  v18 = round v17
-  v19 = pack v11 v18 8
-  v20 = shr_i32 v0 16
-  v21 = bit_and v20 FF
-  v22 = to_f32 v21
-  v23 = mul_f32 v22 3B808081 (0.0039215689)
-  v24 = mul_f32 v23 v8
-  v25 = mul_f32 v24 437F0000 (255)
-  v26 = round v25
-  v27 = shr_i32 v0 24
-  v28 = to_f32 v27
-  v29 = mul_f32 v28 3B808081 (0.0039215689)
-  v30 = mad_f32 v29 v8 v6
-  v31 = mul_f32 v30 437F0000 (255)
-  v32 = round v31
-  v33 = pack v26 v32 8
-  v34 = pack v19 v33 16
-  store32 arg(1) v34
-
-6 registers, 36 instructions:
-0	r0 = splat 3F800000 (1)
-loop:
-1	    r1 = load32 arg(1)
-2	    r2 = bit_and r1 FF
-3	    r2 = to_f32 r2
-4	    r2 = mul_f32 r2 3B808081 (0.0039215689)
-5	    r3 = load8 arg(0)
-6	    r3 = to_f32 r3
-7	    r3 = mul_f32 r3 3B808081 (0.0039215689)
-8	    r4 = sub_f32 r0 r3
-9	    r2 = mul_f32 r2 r4
-10	    r2 = mul_f32 r2 437F0000 (255)
-11	    r2 = round r2
-12	    r5 = shr_i32 r1 8
-13	    r5 = bit_and r5 FF
-14	    r5 = to_f32 r5
-15	    r5 = mul_f32 r5 3B808081 (0.0039215689)
-16	    r5 = mul_f32 r5 r4
-17	    r5 = mul_f32 r5 437F0000 (255)
-18	    r5 = round r5
-19	    r5 = pack r2 r5 8
-20	    r2 = shr_i32 r1 16
-21	    r2 = bit_and r2 FF
-22	    r2 = to_f32 r2
-23	    r2 = mul_f32 r2 3B808081 (0.0039215689)
-24	    r2 = mul_f32 r2 r4
-25	    r2 = mul_f32 r2 437F0000 (255)
-26	    r2 = round r2
-27	    r1 = shr_i32 r1 24
-28	    r1 = to_f32 r1
-29	    r1 = mul_f32 r1 3B808081 (0.0039215689)
-30	    r3 = mad_f32 r1 r4 r3
-31	    r3 = mul_f32 r3 437F0000 (255)
-32	    r3 = round r3
-33	    r3 = pack r2 r3 8
-34	    r3 = pack r5 r3 16
-35	    store32 arg(1) r3
-
-G8 over A8
-9 values (originally 15):
-↑ v0 = splat 3F800000 (1)
-↑ v1 = splat 0 (0)
-  v2 = load8 arg(1)
-  v3 = to_f32 v2
-  v4 = mul_f32 v3 3B808081 (0.0039215689)
-  v5 = mad_f32 v4 v1 v0
-  v6 = mul_f32 v5 437F0000 (255)
-  v7 = round v6
-  store8 arg(1) v7
-
-3 registers, 9 instructions:
-0	r0 = splat 3F800000 (1)
-1	r1 = splat 0 (0)
-loop:
-2	    r2 = load8 arg(1)
-3	    r2 = to_f32 r2
-4	    r2 = mul_f32 r2 3B808081 (0.0039215689)
-5	    r2 = mad_f32 r2 r1 r0
-6	    r2 = mul_f32 r2 437F0000 (255)
-7	    r2 = round r2
-8	    store8 arg(1) r2
-
-G8 over G8
-16 values (originally 20):
-  v0 = load8 arg(0)
-  v1 = to_f32 v0
-  v2 = mul_f32 v1 3B808081 (0.0039215689)
-  v3 = load8 arg(1)
-  v4 = to_f32 v3
-  v5 = mul_f32 v4 3B808081 (0.0039215689)
-↑ v6 = splat 0 (0)
-  v7 = mad_f32 v5 v6 v2
-↑ v8 = splat 3E59B3D0 (0.21259999)
-↑ v9 = splat 3F371759 (0.71520001)
-  v10 = mul_f32 v7 3D93DD98 (0.0722)
-  v11 = mad_f32 v7 v9 v10
-  v12 = mad_f32 v7 v8 v11
-  v13 = mul_f32 v12 437F0000 (255)
-  v14 = round v13
-  store8 arg(1) v14
-
-5 registers, 16 instructions:
-0	r0 = splat 0 (0)
-1	r1 = splat 3E59B3D0 (0.21259999)
-2	r2 = splat 3F371759 (0.71520001)
-loop:
-3	    r3 = load8 arg(0)
-4	    r3 = to_f32 r3
-5	    r3 = mul_f32 r3 3B808081 (0.0039215689)
-6	    r4 = load8 arg(1)
-7	    r4 = to_f32 r4
-8	    r4 = mul_f32 r4 3B808081 (0.0039215689)
-9	    r3 = mad_f32 r4 r0 r3
-10	    r4 = mul_f32 r3 3D93DD98 (0.0722)
-11	    r4 = mad_f32 r3 r2 r4
-12	    r4 = mad_f32 r3 r1 r4
-13	    r4 = mul_f32 r4 437F0000 (255)
-14	    r4 = round r4
-15	    store8 arg(1) r4
-
-G8 over RGBA_8888
-36 values (originally 39):
-  v0 = load8 arg(0)
-  v1 = to_f32 v0
-  v2 = mul_f32 v1 3B808081 (0.0039215689)
-  v3 = load32 arg(1)
-  v4 = bit_and v3 FF
-  v5 = to_f32 v4
-  v6 = mul_f32 v5 3B808081 (0.0039215689)
-↑ v7 = splat 0 (0)
-  v8 = mad_f32 v6 v7 v2
-  v9 = mul_f32 v8 437F0000 (255)
-  v10 = round v9
-  v11 = shr_i32 v3 8
-  v12 = bit_and v11 FF
-  v13 = to_f32 v12
-  v14 = mul_f32 v13 3B808081 (0.0039215689)
-  v15 = mad_f32 v14 v7 v2
-  v16 = mul_f32 v15 437F0000 (255)
-  v17 = round v16
-  v18 = pack v10 v17 8
-  v19 = shr_i32 v3 16
-  v20 = bit_and v19 FF
-  v21 = to_f32 v20
-  v22 = mul_f32 v21 3B808081 (0.0039215689)
-  v23 = mad_f32 v22 v7 v2
-  v24 = mul_f32 v23 437F0000 (255)
-  v25 = round v24
-↑ v26 = splat 3F800000 (1)
-  v27 = shr_i32 v3 24
-  v28 = to_f32 v27
-  v29 = mul_f32 v28 3B808081 (0.0039215689)
-  v30 = mad_f32 v29 v7 v26
-  v31 = mul_f32 v30 437F0000 (255)
-  v32 = round v31
-  v33 = pack v25 v32 8
-  v34 = pack v18 v33 16
-  store32 arg(1) v34
-
-6 registers, 36 instructions:
-0	r0 = splat 0 (0)
-1	r1 = splat 3F800000 (1)
-loop:
-2	    r2 = load8 arg(0)
-3	    r2 = to_f32 r2
-4	    r2 = mul_f32 r2 3B808081 (0.0039215689)
-5	    r3 = load32 arg(1)
-6	    r4 = bit_and r3 FF
-7	    r4 = to_f32 r4
-8	    r4 = mul_f32 r4 3B808081 (0.0039215689)
-9	    r4 = mad_f32 r4 r0 r2
-10	    r4 = mul_f32 r4 437F0000 (255)
-11	    r4 = round r4
-12	    r5 = shr_i32 r3 8
-13	    r5 = bit_and r5 FF
-14	    r5 = to_f32 r5
-15	    r5 = mul_f32 r5 3B808081 (0.0039215689)
-16	    r5 = mad_f32 r5 r0 r2
-17	    r5 = mul_f32 r5 437F0000 (255)
-18	    r5 = round r5
-19	    r5 = pack r4 r5 8
-20	    r4 = shr_i32 r3 16
-21	    r4 = bit_and r4 FF
-22	    r4 = to_f32 r4
-23	    r4 = mul_f32 r4 3B808081 (0.0039215689)
-24	    r2 = mad_f32 r4 r0 r2
-25	    r2 = mul_f32 r2 437F0000 (255)
-26	    r2 = round r2
-27	    r3 = shr_i32 r3 24
-28	    r3 = to_f32 r3
-29	    r3 = mul_f32 r3 3B808081 (0.0039215689)
-30	    r3 = mad_f32 r3 r0 r1
-31	    r3 = mul_f32 r3 437F0000 (255)
-32	    r3 = round r3
-33	    r3 = pack r2 r3 8
-34	    r3 = pack r5 r3 16
-35	    store32 arg(1) r3
-
-RGBA_8888 over A8
-13 values (originally 31):
-  v0 = load32 arg(0)
-  v1 = shr_i32 v0 24
-  v2 = to_f32 v1
-  v3 = mul_f32 v2 3B808081 (0.0039215689)
+  v3 = mul_f32 v0 v2
   v4 = load8 arg(1)
   v5 = to_f32 v4
-  v6 = mul_f32 v5 3B808081 (0.0039215689)
+  v6 = mul_f32 v0 v5
 ↑ v7 = splat 3F800000 (1)
   v8 = sub_f32 v7 v3
   v9 = mad_f32 v6 v8 v3
-  v10 = mul_f32 v9 437F0000 (255)
-  v11 = round v10
-  store8 arg(1) v11
+↑ v10 = splat 437F0000 (255)
+  v11 = mul_f32 v9 v10
+  v12 = round v11
+  store8 arg(1) v12
 
-4 registers, 13 instructions:
-0	r0 = splat 3F800000 (1)
+6 registers, 14 instructions:
+0	r0 = splat 3B808081 (0.0039215689)
+1	r1 = splat 3F800000 (1)
+2	r2 = splat 437F0000 (255)
 loop:
-1	    r1 = load32 arg(0)
-2	    r1 = shr_i32 r1 24
-3	    r1 = to_f32 r1
-4	    r1 = mul_f32 r1 3B808081 (0.0039215689)
-5	    r2 = load8 arg(1)
-6	    r2 = to_f32 r2
-7	    r2 = mul_f32 r2 3B808081 (0.0039215689)
-8	    r3 = sub_f32 r0 r1
-9	    r1 = mad_f32 r2 r3 r1
-10	    r1 = mul_f32 r1 437F0000 (255)
-11	    r1 = round r1
-12	    store8 arg(1) r1
+3	    r3 = load8 arg(0)
+4	    r3 = to_f32 r3
+5	    r3 = mul_f32 r0 r3
+6	    r4 = load8 arg(1)
+7	    r4 = to_f32 r4
+8	    r4 = mul_f32 r0 r4
+9	    r5 = sub_f32 r1 r3
+10	    r3 = mad_f32 r4 r5 r3
+11	    r3 = mul_f32 r3 r2
+12	    r3 = round r3
+13	    store8 arg(1) r3
 
-RGBA_8888 over G8
-31 values (originally 36):
-  v0 = load32 arg(0)
-  v1 = bit_and v0 FF
+A8 over G8
+20 values (originally 22):
+↑ v0 = splat 3B808081 (0.0039215689)
+  v1 = load8 arg(1)
   v2 = to_f32 v1
-  v3 = mul_f32 v2 3B808081 (0.0039215689)
+  v3 = mul_f32 v0 v2
+  v4 = load8 arg(0)
+  v5 = to_f32 v4
+  v6 = mul_f32 v0 v5
+↑ v7 = splat 3F800000 (1)
+  v8 = sub_f32 v7 v6
+  v9 = mul_f32 v3 v8
+↑ v10 = splat 3E59B3D0 (0.21259999)
+↑ v11 = splat 3F371759 (0.71520001)
+↑ v12 = splat 3D93DD98 (0.0722)
+  v13 = mul_f32 v9 v12
+  v14 = mad_f32 v9 v11 v13
+  v15 = mad_f32 v9 v10 v14
+↑ v16 = splat 437F0000 (255)
+  v17 = mul_f32 v15 v16
+  v18 = round v17
+  store8 arg(1) v18
+
+8 registers, 20 instructions:
+0	r0 = splat 3B808081 (0.0039215689)
+1	r1 = splat 3F800000 (1)
+2	r2 = splat 3E59B3D0 (0.21259999)
+3	r3 = splat 3F371759 (0.71520001)
+4	r4 = splat 3D93DD98 (0.0722)
+5	r5 = splat 437F0000 (255)
+loop:
+6	    r6 = load8 arg(1)
+7	    r6 = to_f32 r6
+8	    r6 = mul_f32 r0 r6
+9	    r7 = load8 arg(0)
+10	    r7 = to_f32 r7
+11	    r7 = mul_f32 r0 r7
+12	    r7 = sub_f32 r1 r7
+13	    r7 = mul_f32 r6 r7
+14	    r6 = mul_f32 r7 r4
+15	    r6 = mad_f32 r7 r3 r6
+16	    r6 = mad_f32 r7 r2 r6
+17	    r6 = mul_f32 r6 r5
+18	    r6 = round r6
+19	    store8 arg(1) r6
+
+A8 over RGBA_8888
+39 values (originally 40):
+↑ v0 = splat 3B808081 (0.0039215689)
+  v1 = load32 arg(1)
+↑ v2 = splat FF (3.5733111e-43)
+  v3 = bit_and v2 v1
+  v4 = to_f32 v3
+  v5 = mul_f32 v0 v4
+  v6 = load8 arg(0)
+  v7 = to_f32 v6
+  v8 = mul_f32 v0 v7
+↑ v9 = splat 3F800000 (1)
+  v10 = sub_f32 v9 v8
+  v11 = mul_f32 v5 v10
+↑ v12 = splat 437F0000 (255)
+  v13 = mul_f32 v11 v12
+  v14 = round v13
+  v15 = shr_i32 v1 8
+  v16 = bit_and v2 v15
+  v17 = to_f32 v16
+  v18 = mul_f32 v0 v17
+  v19 = mul_f32 v18 v10
+  v20 = mul_f32 v19 v12
+  v21 = round v20
+  v22 = pack v14 v21 8
+  v23 = shr_i32 v1 16
+  v24 = bit_and v2 v23
+  v25 = to_f32 v24
+  v26 = mul_f32 v0 v25
+  v27 = mul_f32 v26 v10
+  v28 = mul_f32 v27 v12
+  v29 = round v28
+  v30 = shr_i32 v1 24
+  v31 = to_f32 v30
+  v32 = mul_f32 v0 v31
+  v33 = mad_f32 v32 v10 v8
+  v34 = mul_f32 v33 v12
+  v35 = round v34
+  v36 = pack v29 v35 8
+  v37 = pack v22 v36 16
+  store32 arg(1) v37
+
+9 registers, 39 instructions:
+0	r0 = splat 3B808081 (0.0039215689)
+1	r1 = splat FF (3.5733111e-43)
+2	r2 = splat 3F800000 (1)
+3	r3 = splat 437F0000 (255)
+loop:
+4	    r4 = load32 arg(1)
+5	    r5 = bit_and r1 r4
+6	    r5 = to_f32 r5
+7	    r5 = mul_f32 r0 r5
+8	    r6 = load8 arg(0)
+9	    r6 = to_f32 r6
+10	    r6 = mul_f32 r0 r6
+11	    r7 = sub_f32 r2 r6
+12	    r5 = mul_f32 r5 r7
+13	    r5 = mul_f32 r5 r3
+14	    r5 = round r5
+15	    r8 = shr_i32 r4 8
+16	    r8 = bit_and r1 r8
+17	    r8 = to_f32 r8
+18	    r8 = mul_f32 r0 r8
+19	    r8 = mul_f32 r8 r7
+20	    r8 = mul_f32 r8 r3
+21	    r8 = round r8
+22	    r8 = pack r5 r8 8
+23	    r5 = shr_i32 r4 16
+24	    r5 = bit_and r1 r5
+25	    r5 = to_f32 r5
+26	    r5 = mul_f32 r0 r5
+27	    r5 = mul_f32 r5 r7
+28	    r5 = mul_f32 r5 r3
+29	    r5 = round r5
+30	    r4 = shr_i32 r4 24
+31	    r4 = to_f32 r4
+32	    r4 = mul_f32 r0 r4
+33	    r6 = mad_f32 r4 r7 r6
+34	    r6 = mul_f32 r6 r3
+35	    r6 = round r6
+36	    r6 = pack r5 r6 8
+37	    r6 = pack r8 r6 16
+38	    store32 arg(1) r6
+
+G8 over A8
+11 values (originally 15):
+↑ v0 = splat 3F800000 (1)
+↑ v1 = splat 0 (0)
+↑ v2 = splat 3B808081 (0.0039215689)
+  v3 = load8 arg(1)
+  v4 = to_f32 v3
+  v5 = mul_f32 v2 v4
+  v6 = mad_f32 v5 v1 v0
+↑ v7 = splat 437F0000 (255)
+  v8 = mul_f32 v6 v7
+  v9 = round v8
+  store8 arg(1) v9
+
+5 registers, 11 instructions:
+0	r0 = splat 3F800000 (1)
+1	r1 = splat 0 (0)
+2	r2 = splat 3B808081 (0.0039215689)
+3	r3 = splat 437F0000 (255)
+loop:
+4	    r4 = load8 arg(1)
+5	    r4 = to_f32 r4
+6	    r4 = mul_f32 r2 r4
+7	    r4 = mad_f32 r4 r1 r0
+8	    r4 = mul_f32 r4 r3
+9	    r4 = round r4
+10	    store8 arg(1) r4
+
+G8 over G8
+19 values (originally 20):
+↑ v0 = splat 3B808081 (0.0039215689)
+  v1 = load8 arg(0)
+  v2 = to_f32 v1
+  v3 = mul_f32 v0 v2
   v4 = load8 arg(1)
   v5 = to_f32 v4
-  v6 = mul_f32 v5 3B808081 (0.0039215689)
-  v7 = shr_i32 v0 24
-  v8 = to_f32 v7
-  v9 = mul_f32 v8 3B808081 (0.0039215689)
-↑ v10 = splat 3F800000 (1)
-  v11 = sub_f32 v10 v9
-  v12 = mad_f32 v6 v11 v3
-↑ v13 = splat 3E59B3D0 (0.21259999)
-  v14 = shr_i32 v0 8
-  v15 = bit_and v14 FF
-  v16 = to_f32 v15
-  v17 = mul_f32 v16 3B808081 (0.0039215689)
-  v18 = mad_f32 v6 v11 v17
-↑ v19 = splat 3F371759 (0.71520001)
-  v20 = shr_i32 v0 16
-  v21 = bit_and v20 FF
-  v22 = to_f32 v21
-  v23 = mul_f32 v22 3B808081 (0.0039215689)
-  v24 = mad_f32 v6 v11 v23
-  v25 = mul_f32 v24 3D93DD98 (0.0722)
-  v26 = mad_f32 v18 v19 v25
-  v27 = mad_f32 v12 v13 v26
-  v28 = mul_f32 v27 437F0000 (255)
-  v29 = round v28
-  store8 arg(1) v29
+  v6 = mul_f32 v0 v5
+↑ v7 = splat 0 (0)
+  v8 = mad_f32 v6 v7 v3
+↑ v9 = splat 3E59B3D0 (0.21259999)
+↑ v10 = splat 3F371759 (0.71520001)
+↑ v11 = splat 3D93DD98 (0.0722)
+  v12 = mul_f32 v8 v11
+  v13 = mad_f32 v8 v10 v12
+  v14 = mad_f32 v8 v9 v13
+↑ v15 = splat 437F0000 (255)
+  v16 = mul_f32 v14 v15
+  v17 = round v16
+  store8 arg(1) v17
 
-8 registers, 31 instructions:
-0	r0 = splat 3F800000 (1)
-1	r1 = splat 3E59B3D0 (0.21259999)
-2	r2 = splat 3F371759 (0.71520001)
+8 registers, 19 instructions:
+0	r0 = splat 3B808081 (0.0039215689)
+1	r1 = splat 0 (0)
+2	r2 = splat 3E59B3D0 (0.21259999)
+3	r3 = splat 3F371759 (0.71520001)
+4	r4 = splat 3D93DD98 (0.0722)
+5	r5 = splat 437F0000 (255)
+loop:
+6	    r6 = load8 arg(0)
+7	    r6 = to_f32 r6
+8	    r6 = mul_f32 r0 r6
+9	    r7 = load8 arg(1)
+10	    r7 = to_f32 r7
+11	    r7 = mul_f32 r0 r7
+12	    r6 = mad_f32 r7 r1 r6
+13	    r7 = mul_f32 r6 r4
+14	    r7 = mad_f32 r6 r3 r7
+15	    r7 = mad_f32 r6 r2 r7
+16	    r7 = mul_f32 r7 r5
+17	    r7 = round r7
+18	    store8 arg(1) r7
+
+G8 over RGBA_8888
+39 values (originally 39):
+↑ v0 = splat 3B808081 (0.0039215689)
+  v1 = load8 arg(0)
+  v2 = to_f32 v1
+  v3 = mul_f32 v0 v2
+  v4 = load32 arg(1)
+↑ v5 = splat FF (3.5733111e-43)
+  v6 = bit_and v5 v4
+  v7 = to_f32 v6
+  v8 = mul_f32 v0 v7
+↑ v9 = splat 0 (0)
+  v10 = mad_f32 v8 v9 v3
+↑ v11 = splat 437F0000 (255)
+  v12 = mul_f32 v10 v11
+  v13 = round v12
+  v14 = shr_i32 v4 8
+  v15 = bit_and v5 v14
+  v16 = to_f32 v15
+  v17 = mul_f32 v0 v16
+  v18 = mad_f32 v17 v9 v3
+  v19 = mul_f32 v18 v11
+  v20 = round v19
+  v21 = pack v13 v20 8
+  v22 = shr_i32 v4 16
+  v23 = bit_and v5 v22
+  v24 = to_f32 v23
+  v25 = mul_f32 v0 v24
+  v26 = mad_f32 v25 v9 v3
+  v27 = mul_f32 v26 v11
+  v28 = round v27
+↑ v29 = splat 3F800000 (1)
+  v30 = shr_i32 v4 24
+  v31 = to_f32 v30
+  v32 = mul_f32 v0 v31
+  v33 = mad_f32 v32 v9 v29
+  v34 = mul_f32 v33 v11
+  v35 = round v34
+  v36 = pack v28 v35 8
+  v37 = pack v21 v36 16
+  store32 arg(1) v37
+
+9 registers, 39 instructions:
+0	r0 = splat 3B808081 (0.0039215689)
+1	r1 = splat FF (3.5733111e-43)
+2	r2 = splat 0 (0)
+3	r3 = splat 437F0000 (255)
+4	r4 = splat 3F800000 (1)
+loop:
+5	    r5 = load8 arg(0)
+6	    r5 = to_f32 r5
+7	    r5 = mul_f32 r0 r5
+8	    r6 = load32 arg(1)
+9	    r7 = bit_and r1 r6
+10	    r7 = to_f32 r7
+11	    r7 = mul_f32 r0 r7
+12	    r7 = mad_f32 r7 r2 r5
+13	    r7 = mul_f32 r7 r3
+14	    r7 = round r7
+15	    r8 = shr_i32 r6 8
+16	    r8 = bit_and r1 r8
+17	    r8 = to_f32 r8
+18	    r8 = mul_f32 r0 r8
+19	    r8 = mad_f32 r8 r2 r5
+20	    r8 = mul_f32 r8 r3
+21	    r8 = round r8
+22	    r8 = pack r7 r8 8
+23	    r7 = shr_i32 r6 16
+24	    r7 = bit_and r1 r7
+25	    r7 = to_f32 r7
+26	    r7 = mul_f32 r0 r7
+27	    r5 = mad_f32 r7 r2 r5
+28	    r5 = mul_f32 r5 r3
+29	    r5 = round r5
+30	    r6 = shr_i32 r6 24
+31	    r6 = to_f32 r6
+32	    r6 = mul_f32 r0 r6
+33	    r6 = mad_f32 r6 r2 r4
+34	    r6 = mul_f32 r6 r3
+35	    r6 = round r6
+36	    r6 = pack r5 r6 8
+37	    r6 = pack r8 r6 16
+38	    store32 arg(1) r6
+
+RGBA_8888 over A8
+15 values (originally 31):
+↑ v0 = splat 3B808081 (0.0039215689)
+  v1 = load32 arg(0)
+  v2 = shr_i32 v1 24
+  v3 = to_f32 v2
+  v4 = mul_f32 v0 v3
+  v5 = load8 arg(1)
+  v6 = to_f32 v5
+  v7 = mul_f32 v0 v6
+↑ v8 = splat 3F800000 (1)
+  v9 = sub_f32 v8 v4
+  v10 = mad_f32 v7 v9 v4
+↑ v11 = splat 437F0000 (255)
+  v12 = mul_f32 v10 v11
+  v13 = round v12
+  store8 arg(1) v13
+
+6 registers, 15 instructions:
+0	r0 = splat 3B808081 (0.0039215689)
+1	r1 = splat 3F800000 (1)
+2	r2 = splat 437F0000 (255)
 loop:
 3	    r3 = load32 arg(0)
-4	    r4 = bit_and r3 FF
-5	    r4 = to_f32 r4
-6	    r4 = mul_f32 r4 3B808081 (0.0039215689)
-7	    r5 = load8 arg(1)
-8	    r5 = to_f32 r5
-9	    r5 = mul_f32 r5 3B808081 (0.0039215689)
-10	    r6 = shr_i32 r3 24
-11	    r6 = to_f32 r6
-12	    r6 = mul_f32 r6 3B808081 (0.0039215689)
-13	    r6 = sub_f32 r0 r6
-14	    r4 = mad_f32 r5 r6 r4
-15	    r7 = shr_i32 r3 8
-16	    r7 = bit_and r7 FF
-17	    r7 = to_f32 r7
-18	    r7 = mul_f32 r7 3B808081 (0.0039215689)
-19	    r7 = mad_f32 r5 r6 r7
-20	    r3 = shr_i32 r3 16
-21	    r3 = bit_and r3 FF
-22	    r3 = to_f32 r3
-23	    r3 = mul_f32 r3 3B808081 (0.0039215689)
-24	    r3 = mad_f32 r5 r6 r3
-25	    r3 = mul_f32 r3 3D93DD98 (0.0722)
-26	    r3 = mad_f32 r7 r2 r3
-27	    r3 = mad_f32 r4 r1 r3
-28	    r3 = mul_f32 r3 437F0000 (255)
-29	    r3 = round r3
-30	    store8 arg(1) r3
+4	    r3 = shr_i32 r3 24
+5	    r3 = to_f32 r3
+6	    r3 = mul_f32 r0 r3
+7	    r4 = load8 arg(1)
+8	    r4 = to_f32 r4
+9	    r4 = mul_f32 r0 r4
+10	    r5 = sub_f32 r1 r3
+11	    r3 = mad_f32 r4 r5 r3
+12	    r3 = mul_f32 r3 r2
+13	    r3 = round r3
+14	    store8 arg(1) r3
+
+RGBA_8888 over G8
+35 values (originally 36):
+↑ v0 = splat 3B808081 (0.0039215689)
+  v1 = load32 arg(0)
+↑ v2 = splat FF (3.5733111e-43)
+  v3 = bit_and v2 v1
+  v4 = to_f32 v3
+  v5 = mul_f32 v0 v4
+  v6 = load8 arg(1)
+  v7 = to_f32 v6
+  v8 = mul_f32 v0 v7
+  v9 = shr_i32 v1 24
+  v10 = to_f32 v9
+  v11 = mul_f32 v0 v10
+↑ v12 = splat 3F800000 (1)
+  v13 = sub_f32 v12 v11
+  v14 = mad_f32 v8 v13 v5
+↑ v15 = splat 3E59B3D0 (0.21259999)
+  v16 = shr_i32 v1 8
+  v17 = bit_and v2 v16
+  v18 = to_f32 v17
+  v19 = mul_f32 v0 v18
+  v20 = mad_f32 v8 v13 v19
+↑ v21 = splat 3F371759 (0.71520001)
+  v22 = shr_i32 v1 16
+  v23 = bit_and v2 v22
+  v24 = to_f32 v23
+  v25 = mul_f32 v0 v24
+  v26 = mad_f32 v8 v13 v25
+↑ v27 = splat 3D93DD98 (0.0722)
+  v28 = mul_f32 v26 v27
+  v29 = mad_f32 v20 v21 v28
+  v30 = mad_f32 v14 v15 v29
+↑ v31 = splat 437F0000 (255)
+  v32 = mul_f32 v30 v31
+  v33 = round v32
+  store8 arg(1) v33
+
+12 registers, 35 instructions:
+0	r0 = splat 3B808081 (0.0039215689)
+1	r1 = splat FF (3.5733111e-43)
+2	r2 = splat 3F800000 (1)
+3	r3 = splat 3E59B3D0 (0.21259999)
+4	r4 = splat 3F371759 (0.71520001)
+5	r5 = splat 3D93DD98 (0.0722)
+6	r6 = splat 437F0000 (255)
+loop:
+7	    r7 = load32 arg(0)
+8	    r8 = bit_and r1 r7
+9	    r8 = to_f32 r8
+10	    r8 = mul_f32 r0 r8
+11	    r9 = load8 arg(1)
+12	    r9 = to_f32 r9
+13	    r9 = mul_f32 r0 r9
+14	    r10 = shr_i32 r7 24
+15	    r10 = to_f32 r10
+16	    r10 = mul_f32 r0 r10
+17	    r10 = sub_f32 r2 r10
+18	    r8 = mad_f32 r9 r10 r8
+19	    r11 = shr_i32 r7 8
+20	    r11 = bit_and r1 r11
+21	    r11 = to_f32 r11
+22	    r11 = mul_f32 r0 r11
+23	    r11 = mad_f32 r9 r10 r11
+24	    r7 = shr_i32 r7 16
+25	    r7 = bit_and r1 r7
+26	    r7 = to_f32 r7
+27	    r7 = mul_f32 r0 r7
+28	    r7 = mad_f32 r9 r10 r7
+29	    r7 = mul_f32 r7 r5
+30	    r7 = mad_f32 r11 r4 r7
+31	    r7 = mad_f32 r8 r3 r7
+32	    r7 = mul_f32 r7 r6
+33	    r7 = round r7
+34	    store8 arg(1) r7
 
 RGBA_8888 over RGBA_8888
-48 values (originally 51):
-  v0 = load32 arg(0)
-  v1 = bit_and v0 FF
-  v2 = to_f32 v1
-  v3 = mul_f32 v2 3B808081 (0.0039215689)
-  v4 = load32 arg(1)
-  v5 = bit_and v4 FF
-  v6 = to_f32 v5
-  v7 = mul_f32 v6 3B808081 (0.0039215689)
-  v8 = shr_i32 v0 24
-  v9 = to_f32 v8
-  v10 = mul_f32 v9 3B808081 (0.0039215689)
-↑ v11 = splat 3F800000 (1)
-  v12 = sub_f32 v11 v10
-  v13 = mad_f32 v7 v12 v3
-  v14 = mul_f32 v13 437F0000 (255)
-  v15 = round v14
-  v16 = shr_i32 v0 8
-  v17 = bit_and v16 FF
-  v18 = to_f32 v17
-  v19 = mul_f32 v18 3B808081 (0.0039215689)
-  v20 = shr_i32 v4 8
-  v21 = bit_and v20 FF
-  v22 = to_f32 v21
-  v23 = mul_f32 v22 3B808081 (0.0039215689)
-  v24 = mad_f32 v23 v12 v19
-  v25 = mul_f32 v24 437F0000 (255)
-  v26 = round v25
-  v27 = pack v15 v26 8
-  v28 = shr_i32 v0 16
-  v29 = bit_and v28 FF
-  v30 = to_f32 v29
-  v31 = mul_f32 v30 3B808081 (0.0039215689)
-  v32 = shr_i32 v4 16
-  v33 = bit_and v32 FF
-  v34 = to_f32 v33
-  v35 = mul_f32 v34 3B808081 (0.0039215689)
-  v36 = mad_f32 v35 v12 v31
-  v37 = mul_f32 v36 437F0000 (255)
-  v38 = round v37
-  v39 = shr_i32 v4 24
-  v40 = to_f32 v39
-  v41 = mul_f32 v40 3B808081 (0.0039215689)
-  v42 = mad_f32 v41 v12 v10
-  v43 = mul_f32 v42 437F0000 (255)
-  v44 = round v43
-  v45 = pack v38 v44 8
-  v46 = pack v27 v45 16
-  store32 arg(1) v46
+51 values (originally 51):
+↑ v0 = splat 3B808081 (0.0039215689)
+  v1 = load32 arg(0)
+↑ v2 = splat FF (3.5733111e-43)
+  v3 = bit_and v2 v1
+  v4 = to_f32 v3
+  v5 = mul_f32 v0 v4
+  v6 = load32 arg(1)
+  v7 = bit_and v2 v6
+  v8 = to_f32 v7
+  v9 = mul_f32 v0 v8
+  v10 = shr_i32 v1 24
+  v11 = to_f32 v10
+  v12 = mul_f32 v0 v11
+↑ v13 = splat 3F800000 (1)
+  v14 = sub_f32 v13 v12
+  v15 = mad_f32 v9 v14 v5
+↑ v16 = splat 437F0000 (255)
+  v17 = mul_f32 v15 v16
+  v18 = round v17
+  v19 = shr_i32 v1 8
+  v20 = bit_and v2 v19
+  v21 = to_f32 v20
+  v22 = mul_f32 v0 v21
+  v23 = shr_i32 v6 8
+  v24 = bit_and v2 v23
+  v25 = to_f32 v24
+  v26 = mul_f32 v0 v25
+  v27 = mad_f32 v26 v14 v22
+  v28 = mul_f32 v27 v16
+  v29 = round v28
+  v30 = pack v18 v29 8
+  v31 = shr_i32 v1 16
+  v32 = bit_and v2 v31
+  v33 = to_f32 v32
+  v34 = mul_f32 v0 v33
+  v35 = shr_i32 v6 16
+  v36 = bit_and v2 v35
+  v37 = to_f32 v36
+  v38 = mul_f32 v0 v37
+  v39 = mad_f32 v38 v14 v34
+  v40 = mul_f32 v39 v16
+  v41 = round v40
+  v42 = shr_i32 v6 24
+  v43 = to_f32 v42
+  v44 = mul_f32 v0 v43
+  v45 = mad_f32 v44 v14 v12
+  v46 = mul_f32 v45 v16
+  v47 = round v46
+  v48 = pack v41 v47 8
+  v49 = pack v30 v48 16
+  store32 arg(1) v49
 
-8 registers, 48 instructions:
-0	r0 = splat 3F800000 (1)
+11 registers, 51 instructions:
+0	r0 = splat 3B808081 (0.0039215689)
+1	r1 = splat FF (3.5733111e-43)
+2	r2 = splat 3F800000 (1)
+3	r3 = splat 437F0000 (255)
 loop:
-1	    r1 = load32 arg(0)
-2	    r2 = bit_and r1 FF
-3	    r2 = to_f32 r2
-4	    r2 = mul_f32 r2 3B808081 (0.0039215689)
-5	    r3 = load32 arg(1)
-6	    r4 = bit_and r3 FF
-7	    r4 = to_f32 r4
-8	    r4 = mul_f32 r4 3B808081 (0.0039215689)
-9	    r5 = shr_i32 r1 24
-10	    r5 = to_f32 r5
-11	    r5 = mul_f32 r5 3B808081 (0.0039215689)
-12	    r6 = sub_f32 r0 r5
-13	    r2 = mad_f32 r4 r6 r2
-14	    r2 = mul_f32 r2 437F0000 (255)
-15	    r2 = round r2
-16	    r4 = shr_i32 r1 8
-17	    r4 = bit_and r4 FF
-18	    r4 = to_f32 r4
-19	    r4 = mul_f32 r4 3B808081 (0.0039215689)
-20	    r7 = shr_i32 r3 8
-21	    r7 = bit_and r7 FF
-22	    r7 = to_f32 r7
-23	    r7 = mul_f32 r7 3B808081 (0.0039215689)
-24	    r4 = mad_f32 r7 r6 r4
-25	    r4 = mul_f32 r4 437F0000 (255)
-26	    r4 = round r4
-27	    r4 = pack r2 r4 8
-28	    r1 = shr_i32 r1 16
-29	    r1 = bit_and r1 FF
-30	    r1 = to_f32 r1
-31	    r1 = mul_f32 r1 3B808081 (0.0039215689)
-32	    r2 = shr_i32 r3 16
-33	    r2 = bit_and r2 FF
-34	    r2 = to_f32 r2
-35	    r2 = mul_f32 r2 3B808081 (0.0039215689)
-36	    r1 = mad_f32 r2 r6 r1
-37	    r1 = mul_f32 r1 437F0000 (255)
-38	    r1 = round r1
-39	    r3 = shr_i32 r3 24
-40	    r3 = to_f32 r3
-41	    r3 = mul_f32 r3 3B808081 (0.0039215689)
-42	    r5 = mad_f32 r3 r6 r5
-43	    r5 = mul_f32 r5 437F0000 (255)
-44	    r5 = round r5
-45	    r5 = pack r1 r5 8
-46	    r5 = pack r4 r5 16
-47	    store32 arg(1) r5
+4	    r4 = load32 arg(0)
+5	    r5 = bit_and r1 r4
+6	    r5 = to_f32 r5
+7	    r5 = mul_f32 r0 r5
+8	    r6 = load32 arg(1)
+9	    r7 = bit_and r1 r6
+10	    r7 = to_f32 r7
+11	    r7 = mul_f32 r0 r7
+12	    r8 = shr_i32 r4 24
+13	    r8 = to_f32 r8
+14	    r8 = mul_f32 r0 r8
+15	    r9 = sub_f32 r2 r8
+16	    r5 = mad_f32 r7 r9 r5
+17	    r5 = mul_f32 r5 r3
+18	    r5 = round r5
+19	    r7 = shr_i32 r4 8
+20	    r7 = bit_and r1 r7
+21	    r7 = to_f32 r7
+22	    r7 = mul_f32 r0 r7
+23	    r10 = shr_i32 r6 8
+24	    r10 = bit_and r1 r10
+25	    r10 = to_f32 r10
+26	    r10 = mul_f32 r0 r10
+27	    r7 = mad_f32 r10 r9 r7
+28	    r7 = mul_f32 r7 r3
+29	    r7 = round r7
+30	    r7 = pack r5 r7 8
+31	    r4 = shr_i32 r4 16
+32	    r4 = bit_and r1 r4
+33	    r4 = to_f32 r4
+34	    r4 = mul_f32 r0 r4
+35	    r5 = shr_i32 r6 16
+36	    r5 = bit_and r1 r5
+37	    r5 = to_f32 r5
+38	    r5 = mul_f32 r0 r5
+39	    r4 = mad_f32 r5 r9 r4
+40	    r4 = mul_f32 r4 r3
+41	    r4 = round r4
+42	    r6 = shr_i32 r6 24
+43	    r6 = to_f32 r6
+44	    r6 = mul_f32 r0 r6
+45	    r8 = mad_f32 r6 r9 r8
+46	    r8 = mul_f32 r8 r3
+47	    r8 = round r8
+48	    r8 = pack r4 r8 8
+49	    r8 = pack r7 r8 16
+50	    store32 arg(1) r8
 
 I32 (Naive) 8888 over 8888
-32 values (originally 33):
+33 values (originally 33):
   v0 = load32 arg(0)
-  v1 = bit_and v0 FF
-  v2 = load32 arg(1)
-  v3 = bit_and v2 FF
-  v4 = shr_i32 v0 24
-↑ v5 = splat 100 (3.5873241e-43)
-  v6 = sub_i32 v5 v4
-  v7 = mul_i32 v3 v6
-  v8 = shr_i32 v7 8
-  v9 = add_i32 v1 v8
-  v10 = shr_i32 v0 8
-  v11 = bit_and v10 FF
-  v12 = shr_i32 v2 8
-  v13 = bit_and v12 FF
-  v14 = mul_i32 v13 v6
-  v15 = shr_i32 v14 8
-  v16 = add_i32 v11 v15
-  v17 = pack v9 v16 8
-  v18 = shr_i32 v0 16
-  v19 = bit_and v18 FF
-  v20 = shr_i32 v2 16
-  v21 = bit_and v20 FF
-  v22 = mul_i32 v21 v6
-  v23 = shr_i32 v22 8
-  v24 = add_i32 v19 v23
-  v25 = shr_i32 v2 24
-  v26 = mul_i32 v25 v6
-  v27 = shr_i32 v26 8
-  v28 = add_i32 v4 v27
-  v29 = pack v24 v28 8
-  v30 = pack v17 v29 16
-  store32 arg(1) v30
+↑ v1 = splat FF (3.5733111e-43)
+  v2 = bit_and v1 v0
+  v3 = load32 arg(1)
+  v4 = bit_and v1 v3
+  v5 = shr_i32 v0 24
+↑ v6 = splat 100 (3.5873241e-43)
+  v7 = sub_i32 v6 v5
+  v8 = mul_i32 v4 v7
+  v9 = shr_i32 v8 8
+  v10 = add_i32 v2 v9
+  v11 = shr_i32 v0 8
+  v12 = bit_and v1 v11
+  v13 = shr_i32 v3 8
+  v14 = bit_and v1 v13
+  v15 = mul_i32 v14 v7
+  v16 = shr_i32 v15 8
+  v17 = add_i32 v12 v16
+  v18 = pack v10 v17 8
+  v19 = shr_i32 v0 16
+  v20 = bit_and v1 v19
+  v21 = shr_i32 v3 16
+  v22 = bit_and v1 v21
+  v23 = mul_i32 v22 v7
+  v24 = shr_i32 v23 8
+  v25 = add_i32 v20 v24
+  v26 = shr_i32 v3 24
+  v27 = mul_i32 v26 v7
+  v28 = shr_i32 v27 8
+  v29 = add_i32 v5 v28
+  v30 = pack v25 v29 8
+  v31 = pack v18 v30 16
+  store32 arg(1) v31
 
-8 registers, 32 instructions:
-0	r0 = splat 100 (3.5873241e-43)
+9 registers, 33 instructions:
+0	r0 = splat FF (3.5733111e-43)
+1	r1 = splat 100 (3.5873241e-43)
 loop:
-1	    r1 = load32 arg(0)
-2	    r2 = bit_and r1 FF
-3	    r3 = load32 arg(1)
-4	    r4 = bit_and r3 FF
-5	    r5 = shr_i32 r1 24
-6	    r6 = sub_i32 r0 r5
-7	    r4 = mul_i32 r4 r6
-8	    r4 = shr_i32 r4 8
-9	    r4 = add_i32 r2 r4
-10	    r2 = shr_i32 r1 8
-11	    r2 = bit_and r2 FF
-12	    r7 = shr_i32 r3 8
-13	    r7 = bit_and r7 FF
-14	    r7 = mul_i32 r7 r6
-15	    r7 = shr_i32 r7 8
-16	    r7 = add_i32 r2 r7
-17	    r7 = pack r4 r7 8
-18	    r1 = shr_i32 r1 16
-19	    r1 = bit_and r1 FF
-20	    r4 = shr_i32 r3 16
-21	    r4 = bit_and r4 FF
-22	    r4 = mul_i32 r4 r6
-23	    r4 = shr_i32 r4 8
-24	    r4 = add_i32 r1 r4
-25	    r3 = shr_i32 r3 24
-26	    r6 = mul_i32 r3 r6
-27	    r6 = shr_i32 r6 8
-28	    r6 = add_i32 r5 r6
-29	    r6 = pack r4 r6 8
-30	    r6 = pack r7 r6 16
-31	    store32 arg(1) r6
+2	    r2 = load32 arg(0)
+3	    r3 = bit_and r0 r2
+4	    r4 = load32 arg(1)
+5	    r5 = bit_and r0 r4
+6	    r6 = shr_i32 r2 24
+7	    r7 = sub_i32 r1 r6
+8	    r5 = mul_i32 r5 r7
+9	    r5 = shr_i32 r5 8
+10	    r5 = add_i32 r3 r5
+11	    r3 = shr_i32 r2 8
+12	    r3 = bit_and r0 r3
+13	    r8 = shr_i32 r4 8
+14	    r8 = bit_and r0 r8
+15	    r8 = mul_i32 r8 r7
+16	    r8 = shr_i32 r8 8
+17	    r8 = add_i32 r3 r8
+18	    r8 = pack r5 r8 8
+19	    r2 = shr_i32 r2 16
+20	    r2 = bit_and r0 r2
+21	    r5 = shr_i32 r4 16
+22	    r5 = bit_and r0 r5
+23	    r5 = mul_i32 r5 r7
+24	    r5 = shr_i32 r5 8
+25	    r5 = add_i32 r2 r5
+26	    r4 = shr_i32 r4 24
+27	    r7 = mul_i32 r4 r7
+28	    r7 = shr_i32 r7 8
+29	    r7 = add_i32 r6 r7
+30	    r7 = pack r5 r7 8
+31	    r7 = pack r8 r7 16
+32	    store32 arg(1) r7
 
 I32 8888 over 8888
-28 values (originally 29):
+29 values (originally 29):
   v0 = load32 arg(0)
-  v1 = bit_and v0 FF
-  v2 = load32 arg(1)
-  v3 = bit_and v2 FF
-  v4 = shr_i32 v0 24
-↑ v5 = splat 100 (3.5873241e-43)
-  v6 = sub_i32 v5 v4
-  v7 = mul_i16x2 v3 v6
-  v8 = shr_i32 v7 8
-  v9 = add_i32 v1 v8
-  v10 = bytes v0 2
-  v11 = bytes v2 2
-  v12 = mul_i16x2 v11 v6
-  v13 = shr_i32 v12 8
-  v14 = add_i32 v10 v13
-  v15 = pack v9 v14 8
-  v16 = bytes v0 3
-  v17 = bytes v2 3
-  v18 = mul_i16x2 v17 v6
-  v19 = shr_i32 v18 8
-  v20 = add_i32 v16 v19
-  v21 = shr_i32 v2 24
-  v22 = mul_i16x2 v21 v6
-  v23 = shr_i32 v22 8
-  v24 = add_i32 v4 v23
-  v25 = pack v20 v24 8
-  v26 = pack v15 v25 16
-  store32 arg(1) v26
+↑ v1 = splat FF (3.5733111e-43)
+  v2 = bit_and v0 v1
+  v3 = load32 arg(1)
+  v4 = bit_and v3 v1
+  v5 = shr_i32 v0 24
+↑ v6 = splat 100 (3.5873241e-43)
+  v7 = sub_i32 v6 v5
+  v8 = mul_i16x2 v4 v7
+  v9 = shr_i32 v8 8
+  v10 = add_i32 v2 v9
+  v11 = bytes v0 2
+  v12 = bytes v3 2
+  v13 = mul_i16x2 v12 v7
+  v14 = shr_i32 v13 8
+  v15 = add_i32 v11 v14
+  v16 = pack v10 v15 8
+  v17 = bytes v0 3
+  v18 = bytes v3 3
+  v19 = mul_i16x2 v18 v7
+  v20 = shr_i32 v19 8
+  v21 = add_i32 v17 v20
+  v22 = shr_i32 v3 24
+  v23 = mul_i16x2 v22 v7
+  v24 = shr_i32 v23 8
+  v25 = add_i32 v5 v24
+  v26 = pack v21 v25 8
+  v27 = pack v16 v26 16
+  store32 arg(1) v27
 
-8 registers, 28 instructions:
-0	r0 = splat 100 (3.5873241e-43)
+9 registers, 29 instructions:
+0	r0 = splat FF (3.5733111e-43)
+1	r1 = splat 100 (3.5873241e-43)
 loop:
-1	    r1 = load32 arg(0)
-2	    r2 = bit_and r1 FF
-3	    r3 = load32 arg(1)
-4	    r4 = bit_and r3 FF
-5	    r5 = shr_i32 r1 24
-6	    r6 = sub_i32 r0 r5
-7	    r4 = mul_i16x2 r4 r6
-8	    r4 = shr_i32 r4 8
-9	    r4 = add_i32 r2 r4
-10	    r2 = bytes r1 2
-11	    r7 = bytes r3 2
-12	    r7 = mul_i16x2 r7 r6
-13	    r7 = shr_i32 r7 8
-14	    r7 = add_i32 r2 r7
-15	    r7 = pack r4 r7 8
-16	    r1 = bytes r1 3
-17	    r4 = bytes r3 3
-18	    r4 = mul_i16x2 r4 r6
-19	    r4 = shr_i32 r4 8
-20	    r4 = add_i32 r1 r4
-21	    r3 = shr_i32 r3 24
-22	    r6 = mul_i16x2 r3 r6
-23	    r6 = shr_i32 r6 8
-24	    r6 = add_i32 r5 r6
-25	    r6 = pack r4 r6 8
-26	    r6 = pack r7 r6 16
-27	    store32 arg(1) r6
+2	    r2 = load32 arg(0)
+3	    r3 = bit_and r2 r0
+4	    r4 = load32 arg(1)
+5	    r5 = bit_and r4 r0
+6	    r6 = shr_i32 r2 24
+7	    r7 = sub_i32 r1 r6
+8	    r5 = mul_i16x2 r5 r7
+9	    r5 = shr_i32 r5 8
+10	    r5 = add_i32 r3 r5
+11	    r3 = bytes r2 2
+12	    r8 = bytes r4 2
+13	    r8 = mul_i16x2 r8 r7
+14	    r8 = shr_i32 r8 8
+15	    r8 = add_i32 r3 r8
+16	    r8 = pack r5 r8 8
+17	    r2 = bytes r2 3
+18	    r5 = bytes r4 3
+19	    r5 = mul_i16x2 r5 r7
+20	    r5 = shr_i32 r5 8
+21	    r5 = add_i32 r2 r5
+22	    r4 = shr_i32 r4 24
+23	    r7 = mul_i16x2 r4 r7
+24	    r7 = shr_i32 r7 8
+25	    r7 = add_i32 r6 r7
+26	    r7 = pack r5 r7 8
+27	    r7 = pack r8 r7 16
+28	    store32 arg(1) r7
 
 I32 (SWAR) 8888 over 8888
-14 values (originally 15):
+15 values (originally 15):
   v0 = load32 arg(0)
   v1 = bytes v0 404
 ↑ v2 = splat 1000100 (2.3510604e-38)
   v3 = sub_i16x2 v2 v1
   v4 = load32 arg(1)
-  v5 = bit_and v4 FF00FF
-  v6 = mul_i16x2 v5 v3
-  v7 = shr_i16x2 v6 8
-  v8 = shr_i16x2 v4 8
-  v9 = mul_i16x2 v8 v3
-  v10 = bit_and v9 FF00FF00
-  v11 = bit_or v7 v10
-  v12 = add_i32 v0 v11
-  store32 arg(1) v12
+↑ v5 = splat FF00FF (2.3418409e-38)
+  v6 = bit_and v4 v5
+  v7 = mul_i16x2 v6 v3
+  v8 = shr_i16x2 v7 8
+  v9 = shr_i16x2 v4 8
+  v10 = mul_i16x2 v9 v3
+  v11 = bit_clear v10 v5
+  v12 = bit_or v8 v11
+  v13 = add_i32 v0 v12
+  store32 arg(1) v13
 
-5 registers, 14 instructions:
+6 registers, 15 instructions:
 0	r0 = splat 1000100 (2.3510604e-38)
+1	r1 = splat FF00FF (2.3418409e-38)
 loop:
-1	    r1 = load32 arg(0)
-2	    r2 = bytes r1 404
-3	    r2 = sub_i16x2 r0 r2
-4	    r3 = load32 arg(1)
-5	    r4 = bit_and r3 FF00FF
-6	    r4 = mul_i16x2 r4 r2
-7	    r4 = shr_i16x2 r4 8
-8	    r3 = shr_i16x2 r3 8
-9	    r2 = mul_i16x2 r3 r2
-10	    r2 = bit_and r2 FF00FF00
-11	    r2 = bit_or r4 r2
-12	    r2 = add_i32 r1 r2
-13	    store32 arg(1) r2
+2	    r2 = load32 arg(0)
+3	    r3 = bytes r2 404
+4	    r3 = sub_i16x2 r0 r3
+5	    r4 = load32 arg(1)
+6	    r5 = bit_and r4 r1
+7	    r5 = mul_i16x2 r5 r3
+8	    r5 = shr_i16x2 r5 8
+9	    r4 = shr_i16x2 r4 8
+10	    r3 = mul_i16x2 r4 r3
+11	    r3 = bit_clear r3 r1
+12	    r3 = bit_or r5 r3
+13	    r3 = add_i32 r2 r3
+14	    store32 arg(1) r3
 
 6 values (originally 6):
 ↟ v0 = splat 1 (1.4012985e-45)
@@ -671,52 +727,54 @@
 4	    r0 = mul_i32 r0 r1
 5	    store32 arg(0) r0
 
-22 values (originally 23):
-  v0 = load32 arg(0)
-  v1 = bit_and v0 FF
-  v2 = load32 arg(1)
-  v3 = bit_and v2 FF
-  v4 = add_i32 v1 v3
-  v5 = shr_i32 v0 8
-  v6 = bit_and v5 FF
-  v7 = shr_i32 v2 8
-  v8 = bit_and v7 FF
-  v9 = add_i32 v6 v8
-  v10 = pack v4 v9 8
-  v11 = shr_i32 v0 16
-  v12 = bit_and v11 FF
-  v13 = shr_i32 v2 16
-  v14 = bit_and v13 FF
-  v15 = add_i32 v12 v14
-  v16 = shr_i32 v0 24
-  v17 = shr_i32 v2 24
-  v18 = add_i32 v16 v17
-  v19 = pack v15 v18 8
-  v20 = pack v10 v19 16
-  store32 arg(1) v20
+23 values (originally 23):
+↑ v0 = splat FF (3.5733111e-43)
+  v1 = load32 arg(0)
+  v2 = bit_and v0 v1
+  v3 = load32 arg(1)
+  v4 = bit_and v0 v3
+  v5 = add_i32 v2 v4
+  v6 = shr_i32 v1 8
+  v7 = bit_and v0 v6
+  v8 = shr_i32 v3 8
+  v9 = bit_and v0 v8
+  v10 = add_i32 v7 v9
+  v11 = pack v5 v10 8
+  v12 = shr_i32 v1 16
+  v13 = bit_and v0 v12
+  v14 = shr_i32 v3 16
+  v15 = bit_and v0 v14
+  v16 = add_i32 v13 v15
+  v17 = shr_i32 v1 24
+  v18 = shr_i32 v3 24
+  v19 = add_i32 v17 v18
+  v20 = pack v16 v19 8
+  v21 = pack v11 v20 16
+  store32 arg(1) v21
 
-5 registers, 22 instructions:
+6 registers, 23 instructions:
+0	r0 = splat FF (3.5733111e-43)
 loop:
-0	    r0 = load32 arg(0)
-1	    r1 = bit_and r0 FF
-2	    r2 = load32 arg(1)
-3	    r3 = bit_and r2 FF
-4	    r3 = add_i32 r1 r3
-5	    r1 = shr_i32 r0 8
-6	    r1 = bit_and r1 FF
-7	    r4 = shr_i32 r2 8
-8	    r4 = bit_and r4 FF
-9	    r4 = add_i32 r1 r4
-10	    r4 = pack r3 r4 8
-11	    r3 = shr_i32 r0 16
-12	    r3 = bit_and r3 FF
-13	    r1 = shr_i32 r2 16
-14	    r1 = bit_and r1 FF
-15	    r1 = add_i32 r3 r1
-16	    r0 = shr_i32 r0 24
-17	    r2 = shr_i32 r2 24
-18	    r2 = add_i32 r0 r2
-19	    r2 = pack r1 r2 8
-20	    r2 = pack r4 r2 16
-21	    store32 arg(1) r2
+1	    r1 = load32 arg(0)
+2	    r2 = bit_and r0 r1
+3	    r3 = load32 arg(1)
+4	    r4 = bit_and r0 r3
+5	    r4 = add_i32 r2 r4
+6	    r2 = shr_i32 r1 8
+7	    r2 = bit_and r0 r2
+8	    r5 = shr_i32 r3 8
+9	    r5 = bit_and r0 r5
+10	    r5 = add_i32 r2 r5
+11	    r5 = pack r4 r5 8
+12	    r4 = shr_i32 r1 16
+13	    r4 = bit_and r0 r4
+14	    r2 = shr_i32 r3 16
+15	    r2 = bit_and r0 r2
+16	    r2 = add_i32 r4 r2
+17	    r1 = shr_i32 r1 24
+18	    r3 = shr_i32 r3 24
+19	    r3 = add_i32 r1 r3
+20	    r3 = pack r2 r3 8
+21	    r3 = pack r5 r3 16
+22	    store32 arg(1) r3
 
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index c758a7e..9e4b6bf 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -350,58 +350,59 @@
         }
     }
 
-    std::vector<OptimizedInstruction> Builder::optimize(/*TODO bool jit*/) const {
-        // First specialize for our target backend.
-        Builder specialized;
-        for (int i = 0; i < (int)fProgram.size(); i++) {
-            Builder::Instruction inst = fProgram[i];
+    std::vector<OptimizedInstruction> Builder::optimize(bool for_jit) const {
+        // If requested, first specialize for our JIT backend.
+        auto specialize_for_jit = [&]() -> std::vector<Instruction> {
+            Builder specialized;
+            for (int i = 0; i < (int)fProgram.size(); i++) {
+                Builder::Instruction inst = fProgram[i];
 
-            #if defined(SK_CPU_X86)
-            Op imm_op;
-            switch (inst.op) {
-                default: break;
+                #if defined(SK_CPU_X86)
+                switch (Op imm_op; inst.op) {
+                    default: break;
 
-                case Op::add_f32: imm_op = Op::add_f32_imm; goto try_imm_x_and_y;
-                case Op::sub_f32: imm_op = Op::sub_f32_imm; goto try_imm_y;
-                case Op::mul_f32: imm_op = Op::mul_f32_imm; goto try_imm_x_and_y;
-                case Op::min_f32: imm_op = Op::min_f32_imm; goto try_imm_x_and_y;
-                case Op::max_f32: imm_op = Op::max_f32_imm; goto try_imm_x_and_y;
-                case Op::bit_and: imm_op = Op::bit_and_imm; goto try_imm_x_and_y;
-                case Op::bit_or:  imm_op = Op::bit_or_imm ; goto try_imm_x_and_y;
-                case Op::bit_xor: imm_op = Op::bit_xor_imm; goto try_imm_x_and_y;
+                    case Op::add_f32: imm_op = Op::add_f32_imm; goto try_imm_x_and_y;
+                    case Op::sub_f32: imm_op = Op::sub_f32_imm; goto try_imm_y;
+                    case Op::mul_f32: imm_op = Op::mul_f32_imm; goto try_imm_x_and_y;
+                    case Op::min_f32: imm_op = Op::min_f32_imm; goto try_imm_x_and_y;
+                    case Op::max_f32: imm_op = Op::max_f32_imm; goto try_imm_x_and_y;
+                    case Op::bit_and: imm_op = Op::bit_and_imm; goto try_imm_x_and_y;
+                    case Op::bit_or:  imm_op = Op::bit_or_imm ; goto try_imm_x_and_y;
+                    case Op::bit_xor: imm_op = Op::bit_xor_imm; goto try_imm_x_and_y;
 
-                try_imm_x_and_y:
-                    if (int bits; /*TODO jit &&*/this->allImm(inst.x, &bits)) {
-                        inst.op   = imm_op;
-                        inst.x    = inst.y;
-                        inst.y    = NA;
-                        inst.immy = bits;
-                    } else
-                try_imm_y:
-                    if (int bits; /*TODO jit &&*/this->allImm(inst.y, &bits)) {
-                        inst.op   = imm_op;
-                        inst.y    = NA;
-                        inst.immy = bits;
-                    } break;
+                    try_imm_x_and_y:
+                        if (int bits; this->allImm(inst.x, &bits)) {
+                            inst.op   = imm_op;
+                            inst.x    = inst.y;
+                            inst.y    = NA;
+                            inst.immy = bits;
+                        } else
+                    try_imm_y:
+                        if (int bits; this->allImm(inst.y, &bits)) {
+                            inst.op   = imm_op;
+                            inst.y    = NA;
+                            inst.immy = bits;
+                        } break;
 
-                case Op::bit_clear:
-                    if (int bits; /*TODO jit &&*/this->allImm(inst.y, &bits)) {
-                        inst.op   = Op::bit_and_imm;
-                        inst.y    = NA;
-                        inst.immy = ~bits;
-                    } break;
+                    case Op::bit_clear:
+                        if (int bits; this->allImm(inst.y, &bits)) {
+                            inst.op   = Op::bit_and_imm;
+                            inst.y    = NA;
+                            inst.immy = ~bits;
+                        } break;
+                }
+                #endif
+                SkDEBUGCODE(Val id =) specialized.push(inst.op,
+                                                       inst.x,inst.y,inst.z,
+                                                       inst.immy,inst.immz);
+                // If we replace single instructions with multiple, this will start breaking,
+                // and we'll need a table to remap them like we have in optimize().
+                SkASSERT(id == i);
             }
-            #endif
-            SkDEBUGCODE(Val id =) specialized.push(inst.op,
-                                                   inst.x,inst.y,inst.z,
-                                                   inst.immy,inst.immz);
-            // If we replace single instructions with multiple, this will start breaking,
-            // and we'll need a table to remap them like we have in optimize().
-            SkASSERT(id == i);
-        }
-
-        // N.B. specialized.fStrides is not set, but our original fStrides is still fine.
-        const std::vector<Builder::Instruction>& program = specialized.fProgram;
+            return specialized.fProgram;
+        };
+        const std::vector<Builder::Instruction>& program = for_jit ? specialize_for_jit()
+                                                                   : fProgram;
 
         // Next rewrite the program order by issuing instructions as late as possible:
         //    - any side-effect-only (i.e. store) instruction in order as we see them;
@@ -504,7 +505,12 @@
             *SkStrAppendU32(buf+9, this->hash()) = '\0';
             debug_name = buf;
         }
-        return {this->optimize(), fStrides, debug_name};
+
+    #if defined(SKVM_JIT)
+        return {this->optimize(false), this->optimize(true), fStrides, debug_name};
+    #else
+        return {this->optimize(false), fStrides};
+    #endif
     }
 
     uint64_t Builder::hash() const {
@@ -1788,31 +1794,15 @@
                     CASE(Op::min_f32): r(d).f32 = min(r(x).f32, r(y).f32); break;
                     CASE(Op::max_f32): r(d).f32 = max(r(x).f32, r(y).f32); break;
 
-                    CASE(Op::add_f32_imm): {
-                        Slot tmp;
-                        tmp.i32 = immy;
-                        r(d).f32 = r(x).f32 + tmp.f32;
-                    } break;
-                    CASE(Op::sub_f32_imm): {
-                        Slot tmp;
-                        tmp.i32 = immy;
-                        r(d).f32 = r(x).f32 - tmp.f32;
-                    } break;
-                    CASE(Op::mul_f32_imm): {
-                        Slot tmp;
-                        tmp.i32 = immy;
-                        r(d).f32 = r(x).f32 * tmp.f32;
-                    } break;
-                    CASE(Op::min_f32_imm): {
-                        Slot tmp;
-                        tmp.i32 = immy;
-                        r(d).f32 = min(r(x).f32, tmp.f32);
-                    } break;
-                    CASE(Op::max_f32_imm): {
-                        Slot tmp;
-                        tmp.i32 = immy;
-                        r(d).f32 = max(r(x).f32, tmp.f32);
-                    } break;
+                    // These _imm instructions are all x86/JIT only.
+                    CASE(Op::add_f32_imm):
+                    CASE(Op::sub_f32_imm):
+                    CASE(Op::mul_f32_imm):
+                    CASE(Op::min_f32_imm):
+                    CASE(Op::max_f32_imm):
+                    CASE(Op::bit_and_imm):
+                    CASE(Op::bit_or_imm ):
+                    CASE(Op::bit_xor_imm): SkUNREACHABLE; break;
 
                     CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
 
@@ -1854,10 +1844,6 @@
                     CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
                     CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
 
-                    CASE(Op::bit_and_imm): r(d).i32 = r(x).i32 & immy; break;
-                    CASE(Op::bit_or_imm ): r(d).i32 = r(x).i32 | immy; break;
-                    CASE(Op::bit_xor_imm): r(d).i32 = r(x).i32 ^ immy; break;
-
                     CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
                                       break;
 
@@ -1934,14 +1920,17 @@
 
     Program::Program() {}
 
-    Program::Program(const std::vector<OptimizedInstruction>& instructions,
+    Program::Program(const std::vector<OptimizedInstruction>& interpreter,
+                     const std::vector<int>& strides) : fStrides(strides) {
+        this->setupInterpreter(interpreter);
+    }
+
+    Program::Program(const std::vector<OptimizedInstruction>& interpreter,
+                     const std::vector<OptimizedInstruction>& jit,
                      const std::vector<int>& strides,
-                     const char* debug_name)
-        : fStrides(strides)
-    {
-        this->setupInterpreter(instructions);
+                     const char* debug_name) : Program(interpreter, strides) {
     #if 1 && defined(SKVM_JIT)
-        this->setupJIT(instructions, debug_name);
+        this->setupJIT(jit, debug_name);
     #endif
     }
 
@@ -2492,8 +2481,15 @@
                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
                                                             break;
 
-                // We should not see _imm ops on ARM.
-                case Op::mul_f32_imm: SkUNREACHABLE; break;
+                // These _imm instructions are all x86/JIT only.
+                case  Op::add_f32_imm :
+                case  Op::sub_f32_imm :
+                case  Op::mul_f32_imm :
+                case  Op::min_f32_imm :
+                case  Op::max_f32_imm :
+                case  Op::bit_and_imm :
+                case  Op::bit_or_imm  :
+                case  Op::bit_xor_imm : SkUNREACHABLE; break;
 
                 case Op:: gt_f32: a->fcmgt4s (dst(), r[x], r[y]); break;
                 case Op::gte_f32: a->fcmge4s (dst(), r[x], r[y]); break;
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index fc6cf52..5d870f3 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -361,7 +361,7 @@
 
         // Mostly for debugging, tests, etc.
         std::vector<Instruction> program() const { return fProgram; }
-        std::vector<OptimizedInstruction> optimize() const;
+        std::vector<OptimizedInstruction> optimize(bool for_jit=false) const;
 
         // Declare an argument with given stride (use stride=0 for uniforms).
         // TODO: different types for varying and uniforms?
@@ -617,8 +617,12 @@
             union { Reg z; int immz; };
         };
 
-        Program(const std::vector<OptimizedInstruction>& instructions,
-                const std::vector<int>                 & strides,
+        Program(const std::vector<OptimizedInstruction>& interpreter,
+                const std::vector<int>& strides);
+
+        Program(const std::vector<OptimizedInstruction>& interpreter,
+                const std::vector<OptimizedInstruction>& jit,
+                const std::vector<int>& strides,
                 const char* debug_name);
 
         Program();
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 4bfb0f7..b0c9699 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -161,7 +161,6 @@
         dump(b, &buf);
     }
 
-#if defined(SK_CPU_X86)
     sk_sp<SkData> blob = buf.detachAsData();
     {
 
@@ -182,7 +181,6 @@
             }
         }
     }
-#endif
 
     auto test_8888 = [&](skvm::Program&& program) {
         uint32_t src[9];