replace Sk4px with SkVx in SkBlitter_ARGB32

This is a 1:1 transliteration.  The skvx code is more explicit about
what's going on; the old Sk4px code was more abstracted and specifically
designed to work with unorm8 types.  It's not as terse, but I think I've
got it equally or more clear?

I haven't changed anything here, but I think it should be pretty clear
how to generalize this up to, say, 8 pixels at a time?

Writing this CL has got me pretty strongly considering adding explicit
bit-pun constructors to the skvx::Vec types, given how often I'm finding
them to be useful, particularly between uint32 and 4x uint8.

Change-Id: Ie62340e571258fc82176ae3d6605b1f96695f90e
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/207722
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
Auto-Submit: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkBlitter_ARGB32.cpp b/src/core/SkBlitter_ARGB32.cpp
index abd63b7..c41579f 100644
--- a/src/core/SkBlitter_ARGB32.cpp
+++ b/src/core/SkBlitter_ARGB32.cpp
@@ -5,12 +5,12 @@
  * found in the LICENSE file.
  */
 
-#include "Sk4px.h"
 #include "SkColorData.h"
 #include "SkCoreBlitters.h"
 #include "SkShader.h"
 #include "SkUtils.h"
 #include "SkXfermodePriv.h"
+#include "SkVx.h"
 
 static inline int upscale_31_to_32(int value) {
     SkASSERT((unsigned)value <= 31);
@@ -1152,36 +1152,68 @@
     }
 }
 
-static void blend_row_A8(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
-    auto mask = (const uint8_t*)vmask;
+#ifndef SK_SUPPORT_LEGACY_A8_MASKBLITTER
+using U32  = skvx::Vec< 4, uint32_t>;
+using U8x4 = skvx::Vec<16, uint8_t>;
+using U8   = skvx::Vec< 4, uint8_t>;
+
+static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n,
+                  U8x4 (*kernel)(U8x4,U8x4,U8x4)) {
+
+    auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 {
+        U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
+        return skvx::bit_pun<U32>(kernel(skvx::bit_pun<U8x4>(dst),
+                                         skvx::bit_pun<U8x4>(src),
+                                         cov_splat));
+    };
+    while (n >= 4) {
+        apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst);
+        dst += 4;
+        src += 4;
+        cov += 4;
+        n   -= 4;
+    }
+    while (n --> 0) {
+        *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0];
+        dst++;
+        src++;
+        cov++;
+    }
+}
+#endif
+
+static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
+    auto cov = (const uint8_t*)mask;
 
 #ifdef SK_SUPPORT_LEGACY_A8_MASKBLITTER
     for (int i = 0; i < n; ++i) {
-        if (mask[i]) {
-            dst[i] = SkBlendARGB32(src[i], dst[i], mask[i]);
+        if (cov[i]) {
+            dst[i] = SkBlendARGB32(src[i], dst[i], cov[i]);
         }
     }
 #else
-    Sk4px::MapDstSrcAlpha(n, dst, src, mask, [](const Sk4px& d, const Sk4px& s, const Sk4px& aa) {
-        const auto s_aa = s.approxMulDiv255(aa);
-        return s_aa + d.approxMulDiv255(s_aa.alphas().inv());
+    drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
+        U8x4 s_aa  = skvx::approx_scale(s, c),
+             alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);
+        return s_aa + skvx::approx_scale(d, 255 - alpha);
     });
 #endif
 }
 
-static void blend_row_A8_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
-    auto mask = (const uint8_t*)vmask;
+static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
+    auto cov = (const uint8_t*)mask;
 
 #ifdef SK_SUPPORT_LEGACY_A8_MASKBLITTER
     for (int i = 0; i < n; ++i) {
-        if (int m = mask[i]) {
-            m += (m >> 7);
-            dst[i] = SkAlphaMulQ(src[i], m) + SkAlphaMulQ(dst[i], 256 - m);
+        if (int c = cov[i]) {
+            c += (c >> 7);
+            dst[i] = SkAlphaMulQ(src[i], c) + SkAlphaMulQ(dst[i], 256 - c);
         }
     }
 #else
-    Sk4px::MapDstSrcAlpha(n, dst, src, mask, [](const Sk4px& d, const Sk4px& s, const Sk4px& aa) {
-        return (s * aa + d * aa.inv()).div255();
+    drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
+        return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>(  c  )
+                           + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
     });
 #endif
 }