streamline srcover math in I32_SWAR

This is the final bunny I've got in my hat, I think...

Remembering that none of the per-channel s += d*invA adds can
overflow its byte, we can perform them all at once with a single
32-bit add.
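
No channel ever carries into its neighbor: each channel computes
s + ((d * (256-a)) >> 8) with premultiplied s <= a, and that sum
never exceeds 255.  Here's a brute-force check, just an illustration
and not part of this CL:

    #include <cassert>

    int main() {
        for (int a = 0; a < 256; a++)
        for (int s = 0; s <= a; s++)       // premultiplied: channel <= alpha
        for (int d = 0; d < 256; d++) {
            // d*invA approximated as (d * (256-a)) >> 8, as in the kernel.
            assert(s + ((d * (256 - a)) >> 8) <= 255);
        }
        return 0;
    }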

This means we don't have to unpack the src pixel into rb/ga
halves.  We need only extract the alpha for invA.
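
Here's roughly the same math in plain scalar C++, a sketch for
illustration only, with mul_16x2/shr8_16x2 standing in for the
i16x2 SkVM instructions:

    #include <cstdint>

    // Two independent 16-bit multiplies packed into one 32-bit word.
    static uint32_t mul_16x2(uint32_t x, uint32_t y) {
        uint32_t lo = ((x & 0xffff) * (y & 0xffff)) & 0xffff,
                 hi = ((x >> 16)    * (y >> 16))    & 0xffff;
        return hi << 16 | lo;
    }

    // Shift each 16-bit lane right by 8.
    static uint32_t shr8_16x2(uint32_t x) {
        return (x >> 8) & 0x00ff00ff;
    }

    // One pixel of premultiplied 8888-over-8888 srcover.
    static uint32_t srcover(uint32_t s, uint32_t d) {
        uint32_t a      = s >> 24,
                 invAx2 = 0x01000100 - (a << 16 | a);  // 256-a in both lanes

        uint32_t rb = d        & 0x00ff00ff,   // d's r,b in each lane's low byte
                 ga = (d >> 8) & 0x00ff00ff;   // d's g,a likewise

        rb = shr8_16x2(mul_16x2(rb, invAx2));      // products' high bytes moved back down low,
        ga = mul_16x2(ga, invAx2) & 0xff00ff00;    // or kept up high where g and a live.

        return s + (rb | ga);                  // the single 32-bit add
    }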

This brings I32_SWAR even with the Opts code!

curr/maxrss	loops	min	median	mean	max	stddev	samples   	config	bench
  36/36  MB	133	0.206ns	0.211ns	0.208ns	0.211ns	1%	▁▇▁█▁▇▁▇▁▇	nonrendering	SkVM_4096_I32_SWAR
  37/37  MB	152	0.432ns	0.432ns	0.434ns	0.444ns	1%	▃▁▁▁▁▃▁▁█▁	nonrendering	SkVM_4096_I32
  37/37  MB	50	0.781ns	0.794ns	0.815ns	0.895ns	5%	▆▂█▃▅▂▂▁▂▁	nonrendering	SkVM_4096_F32
  37/37  MB	76	0.773ns	0.78ns	0.804ns	0.907ns	6%	▄█▅▁▁▁▁▂▁▁	nonrendering	SkVM_4096_RP
  37/37  MB	268	0.201ns	0.203ns	0.203ns	0.204ns	0%	█▇▆▆▆▆▁▆▆▆	nonrendering	SkVM_4096_Opts

Change-Id: Ibf0a9c5d90b35f1e9cf7265868bd18b7e0a76c43
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220805
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/resources/SkVMTest.expected b/resources/SkVMTest.expected
index 880155f..396899c 100644
--- a/resources/SkVMTest.expected
+++ b/resources/SkVMTest.expected
@@ -599,27 +599,23 @@
 store32 arg(1) r10
 
 I32 (SWAR) 8888 over 8888
-8 registers, 21 instructions:
-r0 = splat FF00FF (2.3418409e-38)
-r1 = splat 1000100 (2.3510604e-38)
+7 registers, 17 instructions:
+r0 = splat 1000100 (2.3510604e-38)
+r1 = splat FF00FF (2.3418409e-38)
 r2 = splat FF00FF00 (-1.7146522e+38)
 loop:
 r3 = load32 arg(0)
-r4 = extract r3 0 r0
-r5 = extract r3 8 r0
-r3 = shr r3 24
-r3 = pack r3 r3 16
-r3 = sub_i16x2 r1 r3
-r6 = load32 arg(1)
-r7 = extract r6 0 r0
-r6 = extract r6 8 r0
-r7 = mul_i16x2 r7 r3
-r7 = shr_i16x2 r7 8
-r6 = mul_i16x2 r6 r3
-r6 = bit_and r6 r2
-r7 = add_i32 r4 r7
-r5 = shl r5 8
-r5 = add_i32 r5 r6
-r5 = bit_or r7 r5
+r4 = shr r3 24
+r4 = pack r4 r4 16
+r4 = sub_i16x2 r0 r4
+r5 = load32 arg(1)
+r6 = extract r5 0 r1
+r5 = extract r5 8 r1
+r6 = mul_i16x2 r6 r4
+r6 = shr_i16x2 r6 8
+r5 = mul_i16x2 r5 r4
+r5 = bit_and r5 r2
+r5 = bit_or r6 r5
+r5 = add_i32 r3 r5
 store32 arg(1) r5
 
diff --git a/tools/SkVMBuilders.cpp b/tools/SkVMBuilders.cpp
index ce8ef31..b958462 100644
--- a/tools/SkVMBuilders.cpp
+++ b/tools/SkVMBuilders.cpp
@@ -138,33 +138,23 @@
     skvm::Arg src = arg(0),
               dst = arg(1);
 
-    auto load = [&](skvm::Arg ptr,
-                    skvm::I32* rb, skvm::I32* ga, skvm::I32* a) {
-        skvm::I32 rgba = load32(ptr);
-        *rb = extract(rgba, 0, splat(0x00ff00ff));
-        *ga = extract(rgba, 8, splat(0x00ff00ff));
-        * a = shr    (rgba, 24);
-    };
+    // The s += d*invA adds won't overflow,
+    // so we don't have to unpack s beyond grabbing the alpha channel.
+    skvm::I32 s = load32(src),
+              a = shr(s, 24);
 
-    skvm::I32 rb, ga, a;
-    load(src, &rb, &ga, &a);
+    // We'll use the same approximation math as above, this time making sure to
+    // use both i16 multiplies to our benefit, one for r/b, the other for g/a.
+    skvm::I32 ax2    = pack(a,a,16),
+              invAx2 = sub_16x2(splat(0x01000100), ax2);
 
-    skvm::I32 ax2    = pack(a,a, 16),
-              invAx2 = sub_16x2(splat(0x01000100/*256 x2*/), ax2);
+    skvm::I32 d   = load32(dst),
+              rb = extract(d, 0, splat(0x00ff00ff)),
+              ga = extract(d, 8, splat(0x00ff00ff));
 
-    skvm::I32 drb, dga, da;
-    load(dst, &drb, &dga, &da);
+    rb = shr_16x2(mul_16x2(rb, invAx2), 8);  // Put the high 8 bits back in the low byte of each lane.
+    ga =          mul_16x2(ga, invAx2);      // Keep the high 8 bits up high...
+    ga = bit_and(ga, splat(0xff00ff00));     // ...and mask off the low bits.
 
-    // Same approximation as above,
-    // but this time we make sure to use both i16 multiplies to our benefit,
-    // one for r/g, the other for b/a simultaneously.
-
-    skvm::I32 RB = shr_16x2(mul_16x2(drb, invAx2), 8),  // 8 high bits of results shifted back down.
-              GA =          mul_16x2(dga, invAx2)    ;  // Keep high bits of results in high lanes,
-    GA = bit_and(GA, splat(0xff00ff00));                // and mask off any low bits remaining.
-
-    rb = add(    rb    , RB);   // src += dst*invA
-    ga = add(shl(ga, 8), GA);
-
-    store32(dst, bit_or(rb,ga));
+    store32(dst, add(s, bit_or(rb, ga)));
 }