Add AVX2 implementation for blit_row_s32a_opaque

The function is added in SkOpts_hsw but doesn't have an AVX2 implementation.
The implementation boosts the Vellamo Pixelblender test case's performance by 20%.

Change-Id: I3bf77eb7629213df1f1bdfa1087ebaf40894d7c4
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/215400
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h
index 759f464..2514e6c 100644
--- a/src/opts/SkBlitRow_opts.h
+++ b/src/opts/SkBlitRow_opts.h
@@ -11,8 +11,33 @@
 #include "include/private/SkColorData.h"
 #include "include/private/SkVx.h"
 #include "src/core/SkMSAN.h"
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+    #include <immintrin.h>
 
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {  // SrcOver on 8 premul pixels: src + dst*(256 - srcA)
+        auto SkAlphaMulQ_AVX2 = [](const __m256i& c, const __m256i& scale) {  // scale is 0..256 per 32-bit lane
+            const __m256i mask = _mm256_set1_epi32(0xFF00FF);  // selects the red and blue bytes of each pixel
+            __m256i s = _mm256_or_si256(_mm256_slli_epi32(scale, 16), scale);  // duplicate scale into both 16-bit halves of each lane
+
+            // uint32_t rb = ((c & mask) * scale) >> 8   -- red/blue in the low byte of each 16-bit lane
+            __m256i rb = _mm256_and_si256(mask, c);
+            rb = _mm256_mullo_epi16(rb, s);
+            rb = _mm256_srli_epi16(rb, 8);
+
+            // uint32_t ag = ((c >> 8) & mask) * scale   -- alpha/green; the >>8 of the product is folded into the andnot below
+            __m256i ag = _mm256_srli_epi16(c, 8);
+            ag = _mm256_mullo_epi16(ag, s);
+
+            // recombine the channels: (rb & mask) | (ag & ~mask)
+            ag = _mm256_andnot_si256(mask, ag);
+            return _mm256_or_si256(rb, ag);
+        };
+        return _mm256_add_epi32(src,  // NOTE: (256 - srcA) inverse-alpha approximation, matching the SSE2 path (see the TODO in the caller)
+                             SkAlphaMulQ_AVX2(dst, _mm256_sub_epi32(_mm256_set1_epi32(256),
+                                                                 _mm256_srli_epi32(src, 24))));
+    }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
     #include <immintrin.h>
 
     static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
@@ -116,8 +141,56 @@
 void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
     SkASSERT(alpha == 0xFF);
     sk_msan_assert_initialized(src, src+len);
+// Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+    while (len >= 32) {
+        // Load 32 source pixels.
+        auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0),
+             s1 = _mm256_loadu_si256((const __m256i*)(src) + 1),
+             s2 = _mm256_loadu_si256((const __m256i*)(src) + 2),
+             s3 = _mm256_loadu_si256((const __m256i*)(src) + 3);
 
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+        const auto alphaMask = _mm256_set1_epi32(0xFF000000);
+
+        auto ORed = _mm256_or_si256(s3, _mm256_or_si256(s2, _mm256_or_si256(s1, s0)));
+        if (_mm256_testz_si256(ORed, alphaMask)) {
+            // All 32 source pixels are transparent.  Nothing to do.
+            src += 32;
+            dst += 32;
+            len -= 32;
+            continue;
+        }
+
+        auto d0 = (__m256i*)(dst) + 0,
+             d1 = (__m256i*)(dst) + 1,
+             d2 = (__m256i*)(dst) + 2,
+             d3 = (__m256i*)(dst) + 3;
+
+        auto ANDed = _mm256_and_si256(s3, _mm256_and_si256(s2, _mm256_and_si256(s1, s0)));
+        if (_mm256_testc_si256(ANDed, alphaMask)) {
+            // All 32 source pixels are opaque.  SrcOver becomes Src.
+            _mm256_storeu_si256(d0, s0);
+            _mm256_storeu_si256(d1, s1);
+            _mm256_storeu_si256(d2, s2);
+            _mm256_storeu_si256(d3, s3);
+            src += 32;
+            dst += 32;
+            len -= 32;
+            continue;
+        }
+
+        // TODO: This math is wrong.
+        // Do SrcOver.
+        _mm256_storeu_si256(d0, SkPMSrcOver_AVX2(s0, _mm256_loadu_si256(d0)));
+        _mm256_storeu_si256(d1, SkPMSrcOver_AVX2(s1, _mm256_loadu_si256(d1)));
+        _mm256_storeu_si256(d2, SkPMSrcOver_AVX2(s2, _mm256_loadu_si256(d2)));
+        _mm256_storeu_si256(d3, SkPMSrcOver_AVX2(s3, _mm256_loadu_si256(d3)));
+        src += 32;
+        dst += 32;
+        len -= 32;
+    }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
     while (len >= 16) {
         // Load 16 source pixels.
         auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),