arithmetic mode with Sk4f

[Cherry-pick of d58f840650a6768b50d024247e2817ccbacd8a0d to chrome/m50 branch.]

After reading the SSE version, I figured I'd show off the new hotness (Sk4f) a little.  This'll get us SSE, NEON and portable implementations all in one easy-to-read package.

Since we've been talking about it, it's worth noting the several ways this implementation is still not constant time:
  - short circuits on 0x00 and 0xff coverage;
  - floating-point multiplication with untrusted k1-k4; if someone figures out a clever way to sometimes create denormal floats and sometimes not, there's a gigantic performance difference.

I would hazard that the pin is constant time now, though.

I've also fixed the antialiasing-coverage lerp to interpolate between dst and r instead of between src and r.  The old behavior (blending partially covered pixels toward src) can't have been right.

curr/maxrss	loops	min	median	mean	max	stddev	samples   	config	bench
   9/9   MB	1	25.5ms	25.5ms	25.5ms	25.5ms	0%	▃▁▁▃▂▇▅▆▇█	8888	Xfermode_arithmetic_enforce_pm_aa
   9/9   MB	1	24.1ms	24.2ms	24.2ms	24.3ms	0%	▄▃▁▄█▆▆█▃█	8888	Xfermode_arithmetic_aa
   9/9   MB	1	102ms	102ms	102ms	103ms	0%	▁▅▂▆▂█▂█▁▂	8888	Xfermode_arithmetic_enforce_pm
   9/9   MB	1	94.8ms	95.4ms	95.2ms	95.8ms	0%	▅▅▁▁▁▁▄▇█▇	8888	Xfermode_arithmetic

~~~~>

curr/maxrss	loops	min	median	mean	max	stddev	samples   	config	bench
   9/9   MB	1	9.71ms	9.74ms	9.73ms	9.78ms	0%	█▅▄▄▁▂▂▂▄▄	8888	Xfermode_arithmetic_enforce_pm_aa
   9/9   MB	1	9.5ms	9.57ms	9.58ms	9.7ms	1%	▂▁█▅▂▂▆▃▄▄	8888	Xfermode_arithmetic_aa
   9/9   MB	1	21.8ms	21.8ms	21.8ms	21.9ms	0%	█▂▂▂▂▂▂▁▄▂	8888	Xfermode_arithmetic_enforce_pm
   9/9   MB	1	16.5ms	16.6ms	16.6ms	16.6ms	0%	▃█▁▁▄▄▁▁▆▅	8888	Xfermode_arithmetic

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1873963003
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1873963003
NOTREECHECKS=true
NOTRY=true
NOPRESUBMIT=true

TBR=mtklein@google.com

Review URL: https://codereview.chromium.org/1902903002 .
diff --git a/src/effects/SkArithmeticMode.cpp b/src/effects/SkArithmeticMode.cpp
index c8e0756..954f8eb 100644
--- a/src/effects/SkArithmeticMode.cpp
+++ b/src/effects/SkArithmeticMode.cpp
@@ -7,10 +7,11 @@
 
 #include "SkArithmeticMode.h"
 #include "SkColorPriv.h"
+#include "SkNx.h"
 #include "SkReadBuffer.h"
-#include "SkWriteBuffer.h"
 #include "SkString.h"
 #include "SkUnPreMultiply.h"
+#include "SkWriteBuffer.h"
 #if SK_SUPPORT_GPU
 #include "SkArithmeticMode_gpu.h"
 #endif
@@ -63,64 +64,37 @@
     return SkArithmeticMode::Create(k1, k2, k3, k4, enforcePMColor);
 }
 
-static int pinToByte(int value) {
-    if (value < 0) {
-        value = 0;
-    } else if (value > 255) {
-        value = 255;
-    }
-    return value;
-}
-
-static int arith(SkScalar k1, SkScalar k2, SkScalar k3, SkScalar k4,
-                 int src, int dst) {
-    SkScalar result = SkScalarMul(k1, src * dst) +
-                      SkScalarMul(k2, src) +
-                      SkScalarMul(k3, dst) +
-                      k4;
-    int res = SkScalarRoundToInt(result);
-    return pinToByte(res);
-}
-
-static int blend(int src, int dst, int scale) {
-    return dst + ((src - dst) * scale >> 8);
-}
-
 void SkArithmeticMode_scalar::xfer32(SkPMColor dst[], const SkPMColor src[],
                                  int count, const SkAlpha aaCoverage[]) const {
-    SkScalar k1 = fK[0] / 255;
-    SkScalar k2 = fK[1];
-    SkScalar k3 = fK[2];
-    SkScalar k4 = fK[3] * 255;
+    const Sk4f k1 = fK[0] * (1/255.0f),
+               k2 = fK[1],
+               k3 = fK[2],
+               k4 = fK[3] * 255.0f + 0.5f;
 
-    for (int i = 0; i < count; ++i) {
-        if ((nullptr == aaCoverage) || aaCoverage[i]) {
-            SkPMColor sc = src[i];
-            SkPMColor dc = dst[i];
+    auto pin = [](float min, const Sk4f& val, float max) {
+        return Sk4f::Max(min, Sk4f::Min(val, max));
+    };
 
-            int a, r, g, b;
-
-            a = arith(k1, k2, k3, k4, SkGetPackedA32(sc), SkGetPackedA32(dc));
-            r = arith(k1, k2, k3, k4, SkGetPackedR32(sc), SkGetPackedR32(dc));
-            g = arith(k1, k2, k3, k4, SkGetPackedG32(sc), SkGetPackedG32(dc));
-            b = arith(k1, k2, k3, k4, SkGetPackedB32(sc), SkGetPackedB32(dc));
-            if (fEnforcePMColor) {
-                r = SkMin32(r, a);
-                g = SkMin32(g, a);
-                b = SkMin32(b, a);
-            }
-
-            // apply antialias coverage if necessary
-            if (aaCoverage && 0xFF != aaCoverage[i]) {
-                int scale = aaCoverage[i] + (aaCoverage[i] >> 7);
-                a = blend(a, SkGetPackedA32(sc), scale);
-                r = blend(r, SkGetPackedR32(sc), scale);
-                g = blend(g, SkGetPackedG32(sc), scale);
-                b = blend(b, SkGetPackedB32(sc), scale);
-            }
-
-            dst[i] = fEnforcePMColor ? SkPackARGB32(a, r, g, b) : SkPackARGB32NoCheck(a, r, g, b);
+    for (int i = 0; i < count; i++) {
+        if (aaCoverage && aaCoverage[i] == 0) {
+            continue;
         }
+
+        Sk4f s = SkNx_cast<float>(Sk4b::Load(src+i)),
+             d = SkNx_cast<float>(Sk4b::Load(dst+i)),
+             r = pin(0, k1*s*d + k2*s + k3*d + k4, 255);
+
+        if (fEnforcePMColor) {
+            Sk4f a = SkNx_shuffle<3,3,3,3>(r);
+            r = Sk4f::Min(a, r);
+        }
+
+        if (aaCoverage && aaCoverage[i] != 255) {
+            Sk4f c = aaCoverage[i] * (1/255.0f);
+            r = d + (r-d)*c;
+        }
+
+        SkNx_cast<uint8_t>(r).store(dst+i);
     }
 }