Add AVX2 implementation for blit_row_s32a_opaque The function is added in SkOpts_hsw but doesn't have a AVX2 implementation. The implementation boosts Vellamo Pixelblender test case for 20% performance Change-Id: I3bf77eb7629213df1f1bdfa1087ebaf40894d7c4 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/215400 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h index 759f464..2514e6c 100644 --- a/src/opts/SkBlitRow_opts.h +++ b/src/opts/SkBlitRow_opts.h
@@ -11,8 +11,33 @@ #include "include/private/SkColorData.h" #include "include/private/SkVx.h" #include "src/core/SkMSAN.h" +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 + #include <immintrin.h> -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) { + auto SkAlphaMulQ_AVX2 = [](const __m256i& c, const __m256i& scale) { + const __m256i mask = _mm256_set1_epi32(0xFF00FF); + __m256i s = _mm256_or_si256(_mm256_slli_epi32(scale, 16), scale); + + // uint32_t rb = ((c & mask) * scale) >> 8 + __m256i rb = _mm256_and_si256(mask, c); + rb = _mm256_mullo_epi16(rb, s); + rb = _mm256_srli_epi16(rb, 8); + + // uint32_t ag = ((c >> 8) & mask) * scale + __m256i ag = _mm256_srli_epi16(c, 8); + ag = _mm256_mullo_epi16(ag, s); + + // (rb & mask) | (ag & ~mask) + ag = _mm256_andnot_si256(mask, ag); + return _mm256_or_si256(rb, ag); + }; + return _mm256_add_epi32(src, + SkAlphaMulQ_AVX2(dst, _mm256_sub_epi32(_mm256_set1_epi32(256), + _mm256_srli_epi32(src, 24)))); + } + +#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 #include <immintrin.h> static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) { @@ -116,8 +141,56 @@ void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) { SkASSERT(alpha == 0xFF); sk_msan_assert_initialized(src, src+len); +// Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 + while (len >= 32) { + // Load 32 source pixels. + auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0), + s1 = _mm256_loadu_si256((const __m256i*)(src) + 1), + s2 = _mm256_loadu_si256((const __m256i*)(src) + 2), + s3 = _mm256_loadu_si256((const __m256i*)(src) + 3); -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 + const auto alphaMask = _mm256_set1_epi32(0xFF000000); + + auto ORed = _mm256_or_si256(s3, _mm256_or_si256(s2, _mm256_or_si256(s1, s0))); + if (_mm256_testz_si256(ORed, alphaMask)) { + // All 32 source pixels are transparent. Nothing to do. + src += 32; + dst += 32; + len -= 32; + continue; + } + + auto d0 = (__m256i*)(dst) + 0, + d1 = (__m256i*)(dst) + 1, + d2 = (__m256i*)(dst) + 2, + d3 = (__m256i*)(dst) + 3; + + auto ANDed = _mm256_and_si256(s3, _mm256_and_si256(s2, _mm256_and_si256(s1, s0))); + if (_mm256_testc_si256(ANDed, alphaMask)) { + // All 32 source pixels are opaque. SrcOver becomes Src. + _mm256_storeu_si256(d0, s0); + _mm256_storeu_si256(d1, s1); + _mm256_storeu_si256(d2, s2); + _mm256_storeu_si256(d3, s3); + src += 32; + dst += 32; + len -= 32; + continue; + } + + // TODO: This math is wrong. + // Do SrcOver. + _mm256_storeu_si256(d0, SkPMSrcOver_AVX2(s0, _mm256_loadu_si256(d0))); + _mm256_storeu_si256(d1, SkPMSrcOver_AVX2(s1, _mm256_loadu_si256(d1))); + _mm256_storeu_si256(d2, SkPMSrcOver_AVX2(s2, _mm256_loadu_si256(d2))); + _mm256_storeu_si256(d3, SkPMSrcOver_AVX2(s3, _mm256_loadu_si256(d3))); + src += 32; + dst += 32; + len -= 32; + } + +#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 while (len >= 16) { // Load 16 source pixels. auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),