streamline srcover math in I32_SWAR

This is the final bunny I've got in my hat, I think...

Remembering that none of the per-channel s += d*invA adds can
overflow its byte, we can perform them all at once with a single
32-bit add.
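
No channel ever carries into its neighbor: each channel computes
s + ((d * (256-a)) >> 8) with premultiplied s <= a, and that sum
never exceeds 255.  Here's a brute-force check, just an illustration
and not part of this CL:

    #include <cassert>

    int main() {
        for (int a = 0; a < 256; a++)
        for (int s = 0; s <= a; s++)       // premultiplied: channel <= alpha
        for (int d = 0; d < 256; d++) {
            // d*invA approximated as (d * (256-a)) >> 8, as in the kernel.
            assert(s + ((d * (256 - a)) >> 8) <= 255);
        }
        return 0;
    }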

This means we don't have to unpack the src pixel into rb/ga
halves.  We need only extract the alpha for invA.
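
Here's roughly the same math in plain scalar C++, a sketch for
illustration only, with mul_16x2/shr8_16x2 standing in for the
i16x2 SkVM instructions:

    #include <cstdint>

    // Two independent 16-bit multiplies packed into one 32-bit word.
    static uint32_t mul_16x2(uint32_t x, uint32_t y) {
        uint32_t lo = ((x & 0xffff) * (y & 0xffff)) & 0xffff,
                 hi = ((x >> 16)    * (y >> 16))    & 0xffff;
        return hi << 16 | lo;
    }

    // Shift each 16-bit lane right by 8.
    static uint32_t shr8_16x2(uint32_t x) {
        return (x >> 8) & 0x00ff00ff;
    }

    // One pixel of premultiplied 8888-over-8888 srcover.
    static uint32_t srcover(uint32_t s, uint32_t d) {
        uint32_t a      = s >> 24,
                 invAx2 = 0x01000100 - (a << 16 | a);  // 256-a in both lanes

        uint32_t rb = d        & 0x00ff00ff,   // d's r,b in each lane's low byte
                 ga = (d >> 8) & 0x00ff00ff;   // d's g,a likewise

        rb = shr8_16x2(mul_16x2(rb, invAx2));      // products' high bytes moved back down low,
        ga = mul_16x2(ga, invAx2) & 0xff00ff00;    // or kept up high where g and a live.

        return s + (rb | ga);                  // the single 32-bit add
    }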

This brings I32_SWAR even with the Opts code!

curr/maxrss	loops	min	median	mean	max	stddev	samples   	config	bench
  36/36  MB	133	0.206ns	0.211ns	0.208ns	0.211ns	1%	▁▇▁█▁▇▁▇▁▇	nonrendering	SkVM_4096_I32_SWAR
  37/37  MB	152	0.432ns	0.432ns	0.434ns	0.444ns	1%	▃▁▁▁▁▃▁▁█▁	nonrendering	SkVM_4096_I32
  37/37  MB	50	0.781ns	0.794ns	0.815ns	0.895ns	5%	▆▂█▃▅▂▂▁▂▁	nonrendering	SkVM_4096_F32
  37/37  MB	76	0.773ns	0.78ns	0.804ns	0.907ns	6%	▄█▅▁▁▁▁▂▁▁	nonrendering	SkVM_4096_RP
  37/37  MB	268	0.201ns	0.203ns	0.203ns	0.204ns	0%	█▇▆▆▆▆▁▆▆▆	nonrendering	SkVM_4096_Opts

Change-Id: Ibf0a9c5d90b35f1e9cf7265868bd18b7e0a76c43
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220805
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/resources/SkVMTest.expected b/resources/SkVMTest.expected
index 880155f..396899c 100644
--- a/resources/SkVMTest.expected
+++ b/resources/SkVMTest.expected
@@ -599,27 +599,23 @@
 store32 arg(1) r10
 
 I32 (SWAR) 8888 over 8888
-8 registers, 21 instructions:
-r0 = splat FF00FF (2.3418409e-38)
-r1 = splat 1000100 (2.3510604e-38)
+7 registers, 17 instructions:
+r0 = splat 1000100 (2.3510604e-38)
+r1 = splat FF00FF (2.3418409e-38)
 r2 = splat FF00FF00 (-1.7146522e+38)
 loop:
 r3 = load32 arg(0)
-r4 = extract r3 0 r0
-r5 = extract r3 8 r0
-r3 = shr r3 24
-r3 = pack r3 r3 16
-r3 = sub_i16x2 r1 r3
-r6 = load32 arg(1)
-r7 = extract r6 0 r0
-r6 = extract r6 8 r0
-r7 = mul_i16x2 r7 r3
-r7 = shr_i16x2 r7 8
-r6 = mul_i16x2 r6 r3
-r6 = bit_and r6 r2
-r7 = add_i32 r4 r7
-r5 = shl r5 8
-r5 = add_i32 r5 r6
-r5 = bit_or r7 r5
+r4 = shr r3 24
+r4 = pack r4 r4 16
+r4 = sub_i16x2 r0 r4
+r5 = load32 arg(1)
+r6 = extract r5 0 r1
+r5 = extract r5 8 r1
+r6 = mul_i16x2 r6 r4
+r6 = shr_i16x2 r6 8
+r5 = mul_i16x2 r5 r4
+r5 = bit_and r5 r2
+r5 = bit_or r6 r5
+r5 = add_i32 r3 r5
 store32 arg(1) r5
 
diff --git a/tools/SkVMBuilders.cpp b/tools/SkVMBuilders.cpp
index ce8ef31..b958462 100644
--- a/tools/SkVMBuilders.cpp
+++ b/tools/SkVMBuilders.cpp
@@ -138,33 +138,23 @@
     skvm::Arg src = arg(0),
               dst = arg(1);
 
-    auto load = [&](skvm::Arg ptr,
-                    skvm::I32* rb, skvm::I32* ga, skvm::I32* a) {
-        skvm::I32 rgba = load32(ptr);
-        *rb = extract(rgba, 0, splat(0x00ff00ff));
-        *ga = extract(rgba, 8, splat(0x00ff00ff));
-        * a = shr    (rgba, 24);
-    };
+    // The s += d*invA adds won't overflow,
+    // so we don't have to unpack s beyond grabbing the alpha channel.
+    skvm::I32 s = load32(src),
+              a = shr(s, 24);
 
-    skvm::I32 rb, ga, a;
-    load(src, &rb, &ga, &a);
+    // We'll use the same approximation math as above, this time making sure to
+    // use both i16 multiplies to our benefit, one for r/b, the other for g/a.
+    skvm::I32 ax2    = pack(a,a,16),
+              invAx2 = sub_16x2(splat(0x01000100), ax2);
 
-    skvm::I32 ax2    = pack(a,a, 16),
-              invAx2 = sub_16x2(splat(0x01000100/*256 x2*/), ax2);
+    skvm::I32 d   = load32(dst),
+              rb = extract(d, 0, splat(0x00ff00ff)),
+              ga = extract(d, 8, splat(0x00ff00ff));
 
-    skvm::I32 drb, dga, da;
-    load(dst, &drb, &dga, &da);
+    rb = shr_16x2(mul_16x2(rb, invAx2), 8);  // Put the high 8 bits back in the low byte of each lane.
+    ga =          mul_16x2(ga, invAx2);      // Keep the high 8 bits up high...
+    ga = bit_and(ga, splat(0xff00ff00));     // ...and mask off the low bits.
 
-    // Same approximation as above,
-    // but this time we make sure to use both i16 multiplies to our benefit,
-    // one for r/g, the other for b/a simultaneously.
-
-    skvm::I32 RB = shr_16x2(mul_16x2(drb, invAx2), 8),  // 8 high bits of results shifted back down.
-              GA =          mul_16x2(dga, invAx2)    ;  // Keep high bits of results in high lanes,
-    GA = bit_and(GA, splat(0xff00ff00));                // and mask off any low bits remaining.
-
-    rb = add(    rb    , RB);   // src += dst*invA
-    ga = add(shl(ga, 8), GA);
-
-    store32(dst, bit_or(rb,ga));
+    store32(dst, add(s, bit_or(rb, ga)));
 }