streamline srcover math in I32_SWAR
This is the final bunny I've got in my hat, I think...
Remembering that none of the s += d*invA adds can overflow,
we can combine them all at once with a single 32-bit add.
This means we don't have to unpack the src pixel into rb/ga
halves. We need only extract the alpha for invA.
This brings I32_SWAR even with the Opts code!
curr/maxrss loops min median mean max stddev samples config bench
36/36 MB 133 0.206ns 0.211ns 0.208ns 0.211ns 1% ▁▇▁█▁▇▁▇▁▇ nonrendering SkVM_4096_I32_SWAR
37/37 MB 152 0.432ns 0.432ns 0.434ns 0.444ns 1% ▃▁▁▁▁▃▁▁█▁ nonrendering SkVM_4096_I32
37/37 MB 50 0.781ns 0.794ns 0.815ns 0.895ns 5% ▆▂█▃▅▂▂▁▂▁ nonrendering SkVM_4096_F32
37/37 MB 76 0.773ns 0.78ns 0.804ns 0.907ns 6% ▄█▅▁▁▁▁▂▁▁ nonrendering SkVM_4096_RP
37/37 MB 268 0.201ns 0.203ns 0.203ns 0.204ns 0% █▇▆▆▆▆▁▆▆▆ nonrendering SkVM_4096_Opts
Change-Id: Ibf0a9c5d90b35f1e9cf7265868bd18b7e0a76c43
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220805
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/resources/SkVMTest.expected b/resources/SkVMTest.expected
index 880155f..396899c 100644
--- a/resources/SkVMTest.expected
+++ b/resources/SkVMTest.expected
@@ -599,27 +599,23 @@
store32 arg(1) r10
I32 (SWAR) 8888 over 8888
-8 registers, 21 instructions:
-r0 = splat FF00FF (2.3418409e-38)
-r1 = splat 1000100 (2.3510604e-38)
+7 registers, 17 instructions:
+r0 = splat 1000100 (2.3510604e-38)
+r1 = splat FF00FF (2.3418409e-38)
r2 = splat FF00FF00 (-1.7146522e+38)
loop:
r3 = load32 arg(0)
-r4 = extract r3 0 r0
-r5 = extract r3 8 r0
-r3 = shr r3 24
-r3 = pack r3 r3 16
-r3 = sub_i16x2 r1 r3
-r6 = load32 arg(1)
-r7 = extract r6 0 r0
-r6 = extract r6 8 r0
-r7 = mul_i16x2 r7 r3
-r7 = shr_i16x2 r7 8
-r6 = mul_i16x2 r6 r3
-r6 = bit_and r6 r2
-r7 = add_i32 r4 r7
-r5 = shl r5 8
-r5 = add_i32 r5 r6
-r5 = bit_or r7 r5
+r4 = shr r3 24
+r4 = pack r4 r4 16
+r4 = sub_i16x2 r0 r4
+r5 = load32 arg(1)
+r6 = extract r5 0 r1
+r5 = extract r5 8 r1
+r6 = mul_i16x2 r6 r4
+r6 = shr_i16x2 r6 8
+r5 = mul_i16x2 r5 r4
+r5 = bit_and r5 r2
+r5 = bit_or r6 r5
+r5 = add_i32 r3 r5
store32 arg(1) r5
diff --git a/tools/SkVMBuilders.cpp b/tools/SkVMBuilders.cpp
index ce8ef31..b958462 100644
--- a/tools/SkVMBuilders.cpp
+++ b/tools/SkVMBuilders.cpp
@@ -138,33 +138,23 @@
skvm::Arg src = arg(0),
dst = arg(1);
- auto load = [&](skvm::Arg ptr,
- skvm::I32* rb, skvm::I32* ga, skvm::I32* a) {
- skvm::I32 rgba = load32(ptr);
- *rb = extract(rgba, 0, splat(0x00ff00ff));
- *ga = extract(rgba, 8, splat(0x00ff00ff));
- * a = shr (rgba, 24);
- };
+ // The s += d*invA adds won't overflow,
+ // so we don't have to unpack s beyond grabbing the alpha channel.
+ skvm::I32 s = load32(src),
+ a = shr(s, 24);
- skvm::I32 rb, ga, a;
- load(src, &rb, &ga, &a);
+ // We'll use the same approximation math as above, this time making sure to
+ // use both i16 multiplies to our benefit, one for r/g, the other for b/a.
+ skvm::I32 ax2 = pack(a,a,16),
+ invAx2 = sub_16x2(splat(0x01000100), ax2);
- skvm::I32 ax2 = pack(a,a, 16),
- invAx2 = sub_16x2(splat(0x01000100/*256 x2*/), ax2);
+ skvm::I32 d = load32(dst),
+ rb = extract(d, 0, splat(0x00ff00ff)),
+ ga = extract(d, 8, splat(0x00ff00ff));
- skvm::I32 drb, dga, da;
- load(dst, &drb, &dga, &da);
+ rb = shr_16x2(mul_16x2(rb, invAx2), 8); // Put the high 8 bits back in the low lane.
+ ga = mul_16x2(ga, invAx2); // Keep the high 8 bits up high...
+ ga = bit_and(ga, splat(0xff00ff00)); // ...and mask off the low bits.
- // Same approximation as above,
- // but this time we make sure to use both i16 multiplies to our benefit,
- // one for r/g, the other for b/a simultaneously.
-
- skvm::I32 RB = shr_16x2(mul_16x2(drb, invAx2), 8), // 8 high bits of results shifted back down.
- GA = mul_16x2(dga, invAx2) ; // Keep high bits of results in high lanes,
- GA = bit_and(GA, splat(0xff00ff00)); // and mask off any low bits remaining.
-
- rb = add( rb , RB); // src += dst*invA
- ga = add(shl(ga, 8), GA);
-
- store32(dst, bit_or(rb,ga));
+ store32(dst, add(s, bit_or(rb, ga)));
}