Add AVX2 implementation for blit_row_s32a_opaque
The function is added in SkOpts_hsw but doesn't have an AVX2 implementation.
The implementation improves performance of the Vellamo Pixelblender test case by 20%.
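
For reference, the per-pixel math the AVX2 path vectorizes is roughly the
scalar sketch below (illustrative only; the function name is hypothetical and
this is not the exact Skia helper):

    #include <cstdint>

    // dst' = src + ((dst * (256 - srcAlpha)) >> 8) per channel, premultiplied,
    // with alpha stored in the top byte of each 32-bit pixel.
    static inline uint32_t srcover_pixel(uint32_t src, uint32_t dst) {
        uint32_t scale = 256 - (src >> 24);
        uint32_t rb = ((dst & 0x00FF00FF) * scale) >> 8;
        uint32_t ag = ((dst >> 8) & 0x00FF00FF) * scale;
        return src + ((rb & 0x00FF00FF) | (ag & 0xFF00FF00));
    }
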
Change-Id: I3bf77eb7629213df1f1bdfa1087ebaf40894d7c4
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/215400
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h
index 759f464..2514e6c 100644
--- a/src/opts/SkBlitRow_opts.h
+++ b/src/opts/SkBlitRow_opts.h
@@ -11,8 +11,33 @@
#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"
#include "src/core/SkMSAN.h"
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+    #include <immintrin.h>
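+
+    // Per 32-bit pixel (alpha in the top byte) this computes, for each channel,
+    //     dst' = src + ((dst * (256 - srcAlpha)) >> 8)
+    // i.e. premultiplied SrcOver, on 8 pixels per __m256i.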
+ static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
+ auto SkAlphaMulQ_AVX2 = [](const __m256i& c, const __m256i& scale) {
+ const __m256i mask = _mm256_set1_epi32(0xFF00FF);
+ __m256i s = _mm256_or_si256(_mm256_slli_epi32(scale, 16), scale);
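+            // scale holds (256 - srcAlpha) in each 32-bit lane; replicate it into
+            // both 16-bit halves so one 16-bit multiply scales the r/b and a/g pairs.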
+
+ // uint32_t rb = ((c & mask) * scale) >> 8
+ __m256i rb = _mm256_and_si256(mask, c);
+ rb = _mm256_mullo_epi16(rb, s);
+ rb = _mm256_srli_epi16(rb, 8);
+
+ // uint32_t ag = ((c >> 8) & mask) * scale
+ __m256i ag = _mm256_srli_epi16(c, 8);
+ ag = _mm256_mullo_epi16(ag, s);
+
+ // (rb & mask) | (ag & ~mask)
+ ag = _mm256_andnot_si256(mask, ag);
+ return _mm256_or_si256(rb, ag);
+ };
+ return _mm256_add_epi32(src,
+ SkAlphaMulQ_AVX2(dst, _mm256_sub_epi32(_mm256_set1_epi32(256),
+ _mm256_srli_epi32(src, 24))));
+ }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include <immintrin.h>
static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
@@ -116,8 +141,56 @@
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
SkASSERT(alpha == 0xFF);
sk_msan_assert_initialized(src, src+len);
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+// Require AVX2 because of the AVX2 integer intrinsics used in SkPMSrcOver_AVX2.
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
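+    // Process 32 pixels per iteration: four 8-pixel __m256i loads/stores.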
+ while (len >= 32) {
+ // Load 32 source pixels.
+ auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0),
+ s1 = _mm256_loadu_si256((const __m256i*)(src) + 1),
+ s2 = _mm256_loadu_si256((const __m256i*)(src) + 2),
+ s3 = _mm256_loadu_si256((const __m256i*)(src) + 3);
+ const auto alphaMask = _mm256_set1_epi32(0xFF000000);
+
+ auto ORed = _mm256_or_si256(s3, _mm256_or_si256(s2, _mm256_or_si256(s1, s0)));
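+        // _mm256_testz_si256 returns 1 iff (ORed & alphaMask) == 0,
+        // i.e. the alpha byte of every source pixel is zero.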
+ if (_mm256_testz_si256(ORed, alphaMask)) {
+ // All 32 source pixels are transparent. Nothing to do.
+ src += 32;
+ dst += 32;
+ len -= 32;
+ continue;
+ }
+
+ auto d0 = (__m256i*)(dst) + 0,
+ d1 = (__m256i*)(dst) + 1,
+ d2 = (__m256i*)(dst) + 2,
+ d3 = (__m256i*)(dst) + 3;
+
+ auto ANDed = _mm256_and_si256(s3, _mm256_and_si256(s2, _mm256_and_si256(s1, s0)));
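+        // _mm256_testc_si256 returns 1 iff every bit set in alphaMask is also
+        // set in ANDed, i.e. the alpha byte of every source pixel is 0xFF.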
+ if (_mm256_testc_si256(ANDed, alphaMask)) {
+ // All 32 source pixels are opaque. SrcOver becomes Src.
+ _mm256_storeu_si256(d0, s0);
+ _mm256_storeu_si256(d1, s1);
+ _mm256_storeu_si256(d2, s2);
+ _mm256_storeu_si256(d3, s3);
+ src += 32;
+ dst += 32;
+ len -= 32;
+ continue;
+ }
+
+ // TODO: This math is wrong.
+ // Do SrcOver.
+ _mm256_storeu_si256(d0, SkPMSrcOver_AVX2(s0, _mm256_loadu_si256(d0)));
+ _mm256_storeu_si256(d1, SkPMSrcOver_AVX2(s1, _mm256_loadu_si256(d1)));
+ _mm256_storeu_si256(d2, SkPMSrcOver_AVX2(s2, _mm256_loadu_si256(d2)));
+ _mm256_storeu_si256(d3, SkPMSrcOver_AVX2(s3, _mm256_loadu_si256(d3)));
+ src += 32;
+ dst += 32;
+ len -= 32;
+ }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
while (len >= 16) {
// Load 16 source pixels.
auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),