// src/opts/SkBitmapProcState_opts.h - skia - Git at Google

 /*
  * Copyright 2018 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #ifndef SkBitmapProcState_opts_DEFINED
 #define SkBitmapProcState_opts_DEFINED

 #include "src/base/SkMSAN.h"
 #include "src/base/SkVx.h"
 #include "src/core/SkBitmapProcState.h"

 // SkBitmapProcState optimized Shader, Sample, or Matrix procs.
 //
 // Only S32_alpha_D32_filter_DX exploits instructions beyond
 // our common baseline SSE2/NEON instruction sets, so that's
 // all that lives here.
 //
 // The rest are scattershot at the moment but I want to get them
 // all migrated to be normal code inside SkBitmapProcState.cpp.

 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
     #include <immintrin.h>
 #elif defined(SK_ARM_HAS_NEON)
     #include <arm_neon.h>
 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
     #include <lasxintrin.h>
 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
     #include <lsxintrin.h>
 #endif

 namespace SK_OPTS_NS {

 // Every packed entry handled in this file shares one bit layout:
 //   bits 18 and up -> first integer coordinate  (x0 or y0)
 //   bits 14..17    -> 4-bit lerp weight applied to the second coordinate
 //   bits  0..13    -> second integer coordinate (x1 or y1)
 // The first coordinate's weight is implied: 16 minus the decoded weight.
 template <typename U32, typename Out>
 static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
     constexpr int kCoordBits  = 14;
     constexpr int kWeightBits = 4;
     constexpr int kCoordMask  = (1 << kCoordBits)  - 1;   // 0x3fff
     constexpr int kWeightMask = (1 << kWeightBits) - 1;   // 0xf

     // Pull all three fields out of the by-value input before writing any output.
     const auto first  = packed >> (kCoordBits + kWeightBits);
     const auto second = packed & kCoordMask;
     const auto weight = (packed >> kCoordBits) & kWeightMask;

     *v0 = first;
     *v1 = second;
     *w  = weight;
 }

 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

     // SSSE3 bilinear filter for a run of `count` 32-bit pixels that share one pair of rows.
     //
     //   s      - supplies the source pixmap (s.fPixmap) and the alpha scale (s.fAlphaScale,
     //            asserted to be <= 256; values below 256 scale every output channel).
     //   xy     - xy[0] packs (y0, y1, wy) for the whole run; each of the next `count`
     //            entries packs (x0, x1, wx) for one output pixel.
     //   count  - number of output pixels, asserted > 0.
     //   colors - receives `count` filtered pixels.
     /*not static*/ inline
     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                  const uint32_t* xy, int count, uint32_t* colors) {
         SkASSERT(count > 0 && colors != nullptr);
         SkASSERT(s.fBilerp);
         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
         SkASSERT(s.fAlphaScale <= 256);

         // interpolate_in_x() is the crux of the SSSE3 implementation,
         // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
         auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
                                    uint32_t B0, uint32_t B1,
                                    __m128i interlaced_x_weights) {
             // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
             //
             // It takes two arguments interlaced byte-wise:
             //    - first  arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
             //    - second arg: [ w,W, ... 7 more pairs of   signed 8-bit values ...]
             // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
             //
             // That's why we go to all this trouble to make interlaced_x_weights,
             // and here we're about to interlace A0 with A1 and B0 with B1 to match.
             //
             // Our interlaced_x_weights are all in [0,16], and so we need not worry about
             // the signedness of that input nor about the signedness of the output.

             __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                     interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

             // Pixel A's interlaced bytes land in the low 64 bits, pixel B's in the high 64 bits.
             return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                      interlaced_x_weights);
         };

         // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
         // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
         // The returned lanes have already been divided by 256 and scaled by s.fAlphaScale.
         auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
                                           uint32_t A2, uint32_t A3,
                                           uint32_t B0, uint32_t B1,
                                           uint32_t B2, uint32_t B3,
                                           __m128i interlaced_x_weights,
                                           int wy) {
             // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
             __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                     bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

             // Interpolate in Y.  As in the SSE2 code, we calculate top*(16-wy) + bot*wy
             // as 16*top + (bot-top)*wy to save a multiply.
             __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
                                        _mm_mullo_epi16(_mm_sub_epi16(bot, top),
                                                        _mm_set1_epi16(wy)));

             // Scale down by total max weight 16x16 = 256.
             px = _mm_srli_epi16(px, 8);

             // Scale by alpha if needed.
             if (s.fAlphaScale < 256) {
                 px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
             }
             return px;
         };

         // We're in _DX mode here, so we're only varying in X.
         // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
         // All the other entries in xy will be pairs of X coordinates and the X weight.
         int y0, y1, wy;
         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

         // Byte-offset into the pixmap by whole rows, then view each row as 32-bit pixels.
         auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
              row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

         while (count >= 4) {
             // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
             int x0[4],
                 x1[4];
             __m128i wx;

             // decode_packed_coordinates_and_weight(), 4x.
             __m128i packed = _mm_loadu_si128((const __m128i*)xy);
             _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
             _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
             wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));  // [0,15]

             // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
             // and sixteen minus that as wl for pixels on the left at x0.
             __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                     wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

             // We need to interlace wl and wr for _mm_maddubs_epi16().
             __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
                     interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

             // Names for the four output pixels of this iteration, used as indices into x0/x1.
             enum { A,B,C,D };

             // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
             // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
             __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
                                                 row1[x0[A]], row1[x1[A]],
                                                 row0[x0[B]], row0[x1[B]],
                                                 row1[x0[B]], row1[x1[B]],
                                                 interlaced_x_weights_AB, wy);

             // Once more with the other half of the x-weights for two more pixels C,D.
             __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
                                                 row1[x0[C]], row1[x1[C]],
                                                 row0[x0[D]], row0[x1[D]],
                                                 row1[x0[D]], row1[x1[D]],
                                                 interlaced_x_weights_CD, wy);

             // Alpha scaling already happened inside interpolate_in_x_and_y(), so all that's
             // left is to pack back together to 8-bit lanes and write out four pixels!
             _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
             xy     += 4;
             colors += 4;
             count  -= 4;
         }

         // Handle the remaining 0-3 pixels one at a time.
         while (count --> 0) {
             // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
             int x0, x1, wx;
             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

             // As above, splat out wx four times as wr, and sixteen minus that as wl.
             __m128i wr = _mm_set1_epi8(wx),     // This splats it out 16 times, but that's fine.
                     wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

             __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);

             // The B half of the helper is fed zeros; only the A half of the result is kept below.
             __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                                row1[x0], row1[x1],
                                                       0,        0,
                                                       0,        0,
                                                interlaced_x_weights, wy);

             *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
         }
     }


 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

     // SSE2 bilinear filter for a run of `count` 32-bit pixels that share one pair of rows.
     //
     //   s      - supplies the source pixmap (s.fPixmap) and the alpha scale (s.fAlphaScale,
     //            asserted to be <= 256; values below 256 scale every output channel).
     //   xy     - xy[0] packs (y0, y1, wy) for the whole run; each of the next `count`
     //            entries packs (x0, x1, wx) for one output pixel.
     //   count  - number of output pixels, asserted > 0.
     //   colors - receives `count` filtered pixels, written one at a time.
     /*not static*/ inline
     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                  const uint32_t* xy, int count, uint32_t* colors) {
         SkASSERT(count > 0 && colors != nullptr);
         SkASSERT(s.fBilerp);
         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
         SkASSERT(s.fAlphaScale <= 256);

         // The first packed entry is the Y pair and Y weight shared by every output pixel.
         int y0, y1, wy;
         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

         // Byte-offset into the pixmap by whole rows, then view each row as 32-bit pixels.
         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

         // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
         // and another in the upper 4 16-bit lanes to line up with 16 - wy.
         const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
                                                 _mm_set1_epi16(16-wy));  // Top pixel goes here.

         while (count --> 0) {
             int x0, x1, wx;
             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

             // Load the 4 pixels we're interpolating, in this grid:
             //    | tl  tr |
             //    | bl  br |
             const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
                           bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);

             // We want to calculate a sum of 4 pixels weighted in two directions:
             //
             //  sum = tl * (16-wy) * (16-wx)
             //      + bl * (   wy) * (16-wx)
             //      + tr * (16-wy) * (   wx)
             //      + br * (   wy) * (   wx)
             //
             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
             //
             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
             // to apply those y-direction weights.  So we'll start on the x-direction
             // first, grouping into left and right halves, lined up with allY:
             //
             //     L = [bl, tl]
             //     R = [br, tr]
             //
             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
             //
             // Rewriting that one more step, we can replace a multiply with a shift:
             //
             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
             //
             // That's how we'll actually do this math.

             // Widen each 8-bit channel to 16 bits by interleaving with zero bytes.
             __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
                     R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());

             __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
                                           _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));

             __m128i sum_in_x = _mm_mullo_epi16(inner, allY);

             // sum = horizontalSum( ... )
             // (adds the upper 64 bits, the top pixel, onto the lower 64 bits, the bottom pixel)
             __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));

             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
             sum = _mm_srli_epi16(sum, 8);

             if (s.fAlphaScale < 256) {
                 // Scale by alpha, which is in [0,256].
                 sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
                 sum = _mm_srli_epi16(sum, 8);
             }

             // Pack back into 8-bit values and store.
             *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
         }
     }

 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
     // LASX bilinear filter for a run of `count` 32-bit pixels that share one pair of rows.
     // The arithmetic follows the same formula as the SSE2 version of this function.
     //
     //   s      - supplies the source pixmap (s.fPixmap) and the alpha scale (s.fAlphaScale,
     //            asserted to be <= 256; values below 256 scale every output channel).
     //   xy     - xy[0] packs (y0, y1, wy) for the whole run; each of the next `count`
     //            entries packs (x0, x1, wx) for one output pixel.
     //   count  - number of output pixels, asserted > 0.
     //   colors - receives `count` filtered pixels, written one at a time.
     /*not static*/ inline
     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                  const uint32_t* xy, int count, uint32_t* colors) {
         SkASSERT(count > 0 && colors != nullptr);
         SkASSERT(s.fBilerp);
         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
         SkASSERT(s.fAlphaScale <= 256);

         // The first packed entry is the Y pair and Y weight shared by every output pixel.
         int y0, y1, wy;
         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

         // Byte-offset into the pixmap by whole rows, then view each row as 32-bit pixels.
         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

         // allY holds the Y weights: wy lined up with the bottom pixel's 16-bit lanes and
         // 16 - wy lined up with the top pixel's 16-bit lanes.
         // NOTE(review): this comment used to say "low 16" / "upper 16" 16-bit lanes, which a
         // 16-lane __m256i cannot hold; presumably four lanes per pixel are meant, as in the
         // SSE2 version -- confirm against the __lasx_xvilvl_d lane semantics.
         __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));

         while (count --> 0) {
             int x0, x1, wx;
             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

             // Load the 4 pixels we're interpolating, in this grid:
             //    | tl  tr |
             //    | bl  br |

             // Each pixel is inserted into 32-bit element 0 of an otherwise zero vector.
             const __m256i zeros = __lasx_xvldi(0);
             const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
                           tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
                           bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
                           br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);

             // We want to calculate a sum of 4 pixels weighted in two directions:
             //
             //  sum = tl * (16-wy) * (16-wx)
             //      + bl * (   wy) * (16-wx)
             //      + tr * (16-wy) * (   wx)
             //      + br * (   wy) * (   wx)
             //
             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
             //
             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
             // to apply those y-direction weights.  So we'll start on the x-direction
             // first, grouping into left and right halves, lined up with allY:
             //
             //     L = [bl, tl]
             //     R = [br, tr]
             //
             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
             //
             // Rewriting that one more step, we can replace a multiply with a shift:
             //
             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
             //
             // That's how we'll actually do this math.

             // Interleaving with zero bytes widens each 8-bit channel to a 16-bit lane.
             __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
                     R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));

             __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
                                            __lasx_xvmul_h(__lasx_xvsub_h(R,L),
                                                           __lasx_xvreplgr2vr_h(wx)));

             __m256i sum_in_x = __lasx_xvmul_h(inner, allY);

             // sum = horizontalSum( ... )
             __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));

             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
             sum = __lasx_xvsrli_h(sum, 8);

             if (s.fAlphaScale < 256) {
                 // Scale by alpha, which is in [0,256].
                 sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
                 sum = __lasx_xvsrli_h(sum, 8);
             }

             // Pack back into 8-bit values and store.
             *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
                                                                __lasx_xvsat_hu(sum, 8)), 0);
         }
     }

 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

     // LSX bilinear filter for a run of `count` 32-bit pixels that share one pair of rows.
     // The arithmetic follows the same formula as the SSE2 version of this function.
     //
     //   s      - supplies the source pixmap (s.fPixmap) and the alpha scale (s.fAlphaScale,
     //            asserted to be <= 256; values below 256 scale every output channel).
     //   xy     - xy[0] packs (y0, y1, wy) for the whole run; each of the next `count`
     //            entries packs (x0, x1, wx) for one output pixel.
     //   count  - number of output pixels, asserted > 0.
     //   colors - receives `count` filtered pixels, written one at a time.
     /*not static*/ inline
     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                  const uint32_t* xy, int count, uint32_t* colors) {
         SkASSERT(count > 0 && colors != nullptr);
         SkASSERT(s.fBilerp);
         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
         SkASSERT(s.fAlphaScale <= 256);

         // The first packed entry is the Y pair and Y weight shared by every output pixel.
         int y0, y1, wy;
         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

         // Byte-offset into the pixmap by whole rows, then view each row as 32-bit pixels.
         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

         // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
         // and another in the upper 4 16-bit lanes to line up with 16 - wy.
         // NOTE(review): this comment used to say "low 8" / "upper 8" lanes, but a __m128i has
         // only eight 16-bit lanes in total; the 4/4 split matches the SSE2 version -- confirm
         // that the __lsx_vilvl_d operand order really places wy in the low half.
         __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));

         while (count --> 0) {
             int x0, x1, wx;
             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

             // Load the 4 pixels we're interpolating, in this grid:
             //    | tl  tr |
             //    | bl  br |
             // Each pixel is inserted into 32-bit element 0 of an otherwise zero vector.
             const __m128i zeros = __lsx_vldi(0);
             const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
                           tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
                           bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
                           br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);

             // We want to calculate a sum of 4 pixels weighted in two directions:
             //
             //  sum = tl * (16-wy) * (16-wx)
             //      + bl * (   wy) * (16-wx)
             //      + tr * (16-wy) * (   wx)
             //      + br * (   wy) * (   wx)
             //
             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
             //
             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
             // to apply those y-direction weights.  So we'll start on the x-direction
             // first, grouping into left and right halves, lined up with allY:
             //
             //     L = [bl, tl]
             //     R = [br, tr]
             //
             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
             //
             // Rewriting that one more step, we can replace a multiply with a shift:
             //
             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
             //
             // That's how we'll actually do this math.


             // Interleaving with zero bytes widens each 8-bit channel to a 16-bit lane.
             __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
                     R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));

             __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
                                          __lsx_vmul_h(__lsx_vsub_h(R,L),
                                                       __lsx_vreplgr2vr_h(wx)));

             __m128i sum_in_x = __lsx_vmul_h(inner, allY);

             // sum = horizontalSum( ... )
             __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));

             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
             sum = __lsx_vsrli_h(sum, 8);

             if (s.fAlphaScale < 256) {
                 // Scale by alpha, which is in [0,256].
                 sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
                 sum = __lsx_vsrli_h(sum, 8);
             }

             // Pack back into 8-bit values and store.
             *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
                                                            __lsx_vsat_hu(sum, 8)), 0);
         }
     }

 #else

     // The NEON code only actually differs from the portable code in the
     // filtering step after we've loaded all four pixels we want to bilerp.

     #if defined(SK_ARM_HAS_NEON)
         // NEON bilerp of one output pixel from a 2x2 neighborhood, optionally scaled by alpha.
         //
         //   x, y     - lerp weights; the code weights a01/a11 by x and a00/a10 by 16-x,
         //              and weights a10/a11 by y and a00/a01 by 16-y.
         //   a00..a11 - the four source pixels.
         //   dst      - receives the single filtered pixel.
         //   scale    - alpha scale; only applied when it is below 256.
         static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                               SkPMColor a00, SkPMColor a01,
                                               SkPMColor a10, SkPMColor a11,
                                               SkPMColor *dst,
                                               uint16_t scale) {
             uint8x8_t vy, vconst16_8, v16_y, vres;
             uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
             uint32x2_t va0, va1;
             uint16x8_t tmp1, tmp2;

             vy = vdup_n_u8(y);                // duplicate y into vy
             vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
             v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

             va0 = vdup_n_u32(a00);            // duplicate a00
             va1 = vdup_n_u32(a10);            // duplicate a10
             va0 = vset_lane_u32(a01, va0, 1); // set top to a01
             va1 = vset_lane_u32(a11, va1, 1); // set top to a11

             // Widening 8-bit multiplies: every channel becomes a 16-bit lane weighted in y.
             tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
             tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

             vx = vdup_n_u16(x);                // duplicate x into vx
             vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
             v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

             // Accumulate the four y-weighted pixels, each further weighted in x.
             tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
             tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
             tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
             tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

             // With alpha scaling there are two shifts by 8: this one undoes the bilerp
             // weights, and the narrowing shift below then undoes the multiply by scale.
             if (scale < 256) {
                 vscale = vdup_n_u16(scale);        // duplicate scale
                 tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
                 tmp = vmul_u16(tmp, vscale);       // multiply result by scale
             }

             // Narrow back to 8 bits per channel; only lane 0 (one 32-bit pixel) is stored.
             vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
             vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
         }
     #else
         // Portable bilerp of one output pixel from a 2x2 neighborhood, optionally scaled
         // by alpha.  Two channels are processed per 32-bit multiply by splitting each pixel
         // into its even bytes (0 and 2) and its odd bytes (1 and 3).
         //
         //   x, y       - lerp weights, each asserted to be at most 0xF.
         //   a00..a11   - the four source pixels; a00 gets (16-x)*(16-y), a01 gets x*(16-y),
         //                a10 gets (16-x)*y and a11 gets x*y.
         //   dstColor   - receives the single filtered pixel.
         //   alphaScale - asserted <= 256; only applied when it is below 256.
         static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                               SkPMColor a00, SkPMColor a01,
                                               SkPMColor a10, SkPMColor a11,
                                               SkPMColor* dstColor,
                                               unsigned alphaScale) {
             SkASSERT((unsigned)x <= 0xF);
             SkASSERT((unsigned)y <= 0xF);
             SkASSERT(alphaScale <= 256);

             const uint32_t kEvenBytes = 0xFF00FF;

             // The four corner weights always add up to 256.
             const int w11 = x * y;
             const int w00 = 256 - 16*y - 16*x + w11;
             const int w01 = 16*x - w11;
             const int w10 = 16*y - w11;

             // Weighted sums for bytes 0 and 2 ...
             uint32_t even = (a00 & kEvenBytes) * w00
                           + (a01 & kEvenBytes) * w01
                           + (a10 & kEvenBytes) * w10
                           + (a11 & kEvenBytes) * w11;

             // ... and for bytes 1 and 3, shifted down into the even positions first.
             uint32_t odd = ((a00 >> 8) & kEvenBytes) * w00
                          + ((a01 >> 8) & kEvenBytes) * w01
                          + ((a10 >> 8) & kEvenBytes) * w10
                          + ((a11 >> 8) & kEvenBytes) * w11;

             if (alphaScale < 256) {
                 // Drop the bilerp scale (>> 8) before applying the alpha scale.
                 even = ((even >> 8) & kEvenBytes) * alphaScale;
                 odd  = ((odd  >> 8) & kEvenBytes) * alphaScale;
             }

             // The even sums shift back down into bytes 0 and 2; the odd sums already
             // have their results sitting in bytes 1 and 3.
             const uint32_t evenOut = (even >> 8) & kEvenBytes;
             const uint32_t oddOut  = odd & ~kEvenBytes;
             *dstColor = evenOut | oddOut;
         }
     #endif


     /*not static*/ inline
     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                  const uint32_t* xy, int count, SkPMColor* colors) {
         SkASSERT(count > 0 && colors != nullptr);
         SkASSERT(s.fBilerp);
         SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
         SkASSERT(s.fAlphaScale <= 256);

         int y0, y1, wy;
         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

         while (count --> 0) {
             int x0, x1, wx;
             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

             filter_and_scale_by_alpha(wx, wy,
                                       row0[x0], row0[x1],
                                       row1[x0], row1[x1],
                                       colors++,
                                       s.fAlphaScale);
         }
     }

 #endif

 #if defined(SK_ARM_HAS_NEON)
     /*not static*/ inline
     void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
                                    const uint32_t* xy, int count, SkPMColor* colors) {
         SkASSERT(count > 0 && colors != nullptr);
         SkASSERT(s.fBilerp);
         SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
         SkASSERT(s.fAlphaScale <= 256);

         auto src = (const char*)s.fPixmap.addr();
         size_t rb = s.fPixmap.rowBytes();

         while (count --> 0) {
             int y0, y1, wy,
                 x0, x1, wx;
             decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

             auto row0 = (const uint32_t*)(src + y0*rb),
                  row1 = (const uint32_t*)(src + y1*rb);

             filter_and_scale_by_alpha(wx, wy,
                                       row0[x0], row0[x1],
                                       row1[x0], row1[x1],
                                       colors++,
                                       s.fAlphaScale);
         }
     }
 #else
     // It's not yet clear whether it's worthwhile specializing for other architectures.
     //
     // Without NEON there is no specialized DXDY proc, so the name is defined as a null
     // function pointer with the same signature as the NEON function above.
     // NOTE(review): presumably users of this symbol test it for null and fall back to
     // other code -- confirm at the use sites, which are not in this file.
     constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
                                                        const uint32_t*, int, SkPMColor*) = nullptr;
 #endif

 }  // namespace SK_OPTS_NS

 // Test-facing access to the decoder that otherwise lives inside SK_OPTS_NS.
 namespace sktests {

     // Forwards straight to SK_OPTS_NS::decode_packed_coordinates_and_weight with the
     // same template arguments; v0, v1 and w receive the decoded fields.
     template <typename U32, typename Out>
     void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
         SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(
                 packed,
                 v0,
                 v1,
                 w);
     }

 }  // namespace sktests

 #endif
	/*
	* Copyright 2018 Google Inc.
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/

	#ifndef SkBitmapProcState_opts_DEFINED
	#define SkBitmapProcState_opts_DEFINED

	#include "src/base/SkMSAN.h"
	#include "src/base/SkVx.h"
	#include "src/core/SkBitmapProcState.h"

	// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
	//
	// Only S32_alpha_D32_filter_DX exploits instructions beyond
	// our common baseline SSE2/NEON instruction sets, so that's
	// all that lives here.
	//
	// The rest are scattershot at the moment but I want to get them
	// all migrated to be normal code inside SkBitmapProcState.cpp.

	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
	#include <immintrin.h>
	#elif defined(SK_ARM_HAS_NEON)
	#include <arm_neon.h>
	#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
	#include <lasxintrin.h>
	#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
	#include <lsxintrin.h>
	#endif

	namespace SK_OPTS_NS {

	// Every packed entry handled in this file shares one bit layout:
	//   bits 18 and up -> first integer coordinate  (x0 or y0)
	//   bits 14..17    -> 4-bit lerp weight applied to the second coordinate
	//   bits  0..13    -> second integer coordinate (x1 or y1)
	// The first coordinate's weight is implied: 16 minus the decoded weight.
	template <typename U32, typename Out>
	static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
	    constexpr int kCoordBits  = 14;
	    constexpr int kWeightBits = 4;
	    constexpr int kCoordMask  = (1 << kCoordBits)  - 1;   // 0x3fff
	    constexpr int kWeightMask = (1 << kWeightBits) - 1;   // 0xf

	    // Pull all three fields out of the by-value input before writing any output.
	    const auto first  = packed >> (kCoordBits + kWeightBits);
	    const auto second = packed & kCoordMask;
	    const auto weight = (packed >> kCoordBits) & kWeightMask;

	    *v0 = first;
	    *v1 = second;
	    *w  = weight;
	}

	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	/not static/ inline
	void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
	const uint32_t* xy, int count, uint32_t* colors) {
	SkASSERT(count > 0 && colors != nullptr);
	SkASSERT(s.fBilerp);
	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
	SkASSERT(s.fAlphaScale <= 256);

	// interpolate_in_x() is the crux of the SSSE3 implementation,
	// interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
	auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
	uint32_t B0, uint32_t B1,
	__m128i interlaced_x_weights) {
	// _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
	//
	// It takes two arguments interlaced byte-wise:
	// - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
	// - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...]
	// and returns 8 signed 16-bit values: [ lw + rW, ... 7 more ... ].
	//
	// That's why we go to all this trouble to make interlaced_x_weights,
	// and here we're about to interlace A0 with A1 and B0 with B1 to match.
	//
	// Our interlaced_x_weights are all in [0,16], and so we need not worry about
	// the signedness of that input nor about the signedness of the output.

	__m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
	interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

	return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
	interlaced_x_weights);
	};

	// Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
	// Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
	auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
	uint32_t A2, uint32_t A3,
	uint32_t B0, uint32_t B1,
	uint32_t B2, uint32_t B3,
	__m128i interlaced_x_weights,
	int wy) {
	// Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
	__m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
	bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

	// Interpolate in Y. As in the SSE2 code, we calculate top(16-wy) + botwy
	// as 16top + (bot-top)wy to save a multiply.
	__m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
	_mm_mullo_epi16(_mm_sub_epi16(bot, top),
	_mm_set1_epi16(wy)));

	// Scale down by total max weight 16x16 = 256.
	px = _mm_srli_epi16(px, 8);

	// Scale by alpha if needed.
	if (s.fAlphaScale < 256) {
	px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
	}
	return px;
	};

	// We're in _DX mode here, so we're only varying in X.
	// That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
	// All the other entries in xy will be pairs of X coordinates and the X weight.
	int y0, y1, wy;
	decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

	auto row0 = (const uint32_t)((const uint8_t)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
	row1 = (const uint32_t)((const uint8_t)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

	while (count >= 4) {
	// We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
	int x0[4],
	x1[4];
	__m128i wx;

	// decode_packed_coordinates_and_weight(), 4x.
	__m128i packed = _mm_loadu_si128((const __m128i*)xy);
	_mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
	_mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
	wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf)); // [0,15]

	// Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
	// and sixteen minus that as wl for pixels on the left at x0.
	__m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
	wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

	// We need to interlace wl and wr for _mm_maddubs_epi16().
	__m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
	interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

	enum { A,B,C,D };

	// interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
	// from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
	__m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
	row1[x0[A]], row1[x1[A]],
	row0[x0[B]], row0[x1[B]],
	row1[x0[B]], row1[x1[B]],
	interlaced_x_weights_AB, wy);

	// Once more with the other half of the x-weights for two more pixels C,D.
	__m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
	row1[x0[C]], row1[x1[C]],
	row0[x0[D]], row0[x1[D]],
	row1[x0[D]], row1[x1[D]],
	interlaced_x_weights_CD, wy);

	// Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
	_mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
	xy += 4;
	colors += 4;
	count -= 4;
	}

	while (count --> 0) {
	// This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
	int x0, x1, wx;
	decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

	// As above, splat out wx four times as wr, and sixteen minus that as wl.
	__m128i wr = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine.
	wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

	__m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);

	__m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
	row1[x0], row1[x1],
	0, 0,
	0, 0,
	interlaced_x_weights, wy);

	*colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
	}
	}


	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

	/not static/ inline
	void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
	const uint32_t* xy, int count, uint32_t* colors) {
	SkASSERT(count > 0 && colors != nullptr);
	SkASSERT(s.fBilerp);
	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
	SkASSERT(s.fAlphaScale <= 256);

	int y0, y1, wy;
	decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

	auto row0 = (const uint32_t)( (const char)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
	row1 = (const uint32_t)( (const char)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

	// We'll put one pixel in the low 4 16-bit lanes to line up with wy,
	// and another in the upper 4 16-bit lanes to line up with 16 - wy.
	const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here.
	_mm_set1_epi16(16-wy)); // Top pixel goes here.

	while (count --> 0) {
	int x0, x1, wx;
	decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

	// Load the 4 pixels we're interpolating, in this grid:
	// \| tl tr \|
	// \| bl br \|
	const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
	bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);

	// We want to calculate a sum of 4 pixels weighted in two directions:
	//
	// sum = tl * (16-wy) * (16-wx)
	// + bl * ( wy) * (16-wx)
	// + tr * (16-wy) * ( wx)
	// + br * ( wy) * ( wx)
	//
	// (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
	//
	// We've already prepared allY as a vector containing [wy, 16-wy] as a way
	// to apply those y-direction weights. So we'll start on the x-direction
	// first, grouping into left and right halves, lined up with allY:
	//
	// L = [bl, tl]
	// R = [br, tr]
	//
	// sum = horizontalSum( allY * (L(16-wx) + Rwx) )
	//
	// Rewriting that one more step, we can replace a multiply with a shift:
	//
	// sum = horizontalSum( allY * (16L + (R-L)wx) )
	//
	// That's how we'll actually do this math.

	__m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
	R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());

	__m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
	_mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));

	__m128i sum_in_x = _mm_mullo_epi16(inner, allY);

	// sum = horizontalSum( ... )
	__m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));

	// Get back to [0,255] by dividing by maximum weight 16x16 = 256.
	sum = _mm_srli_epi16(sum, 8);

	if (s.fAlphaScale < 256) {
	// Scale by alpha, which is in [0,256].
	sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
	sum = _mm_srli_epi16(sum, 8);
	}

	// Pack back into 8-bit values and store.
	*colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
	}
	}

	#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
	/not static/ inline
	void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
	const uint32_t* xy, int count, uint32_t* colors) {
	SkASSERT(count > 0 && colors != nullptr);
	SkASSERT(s.fBilerp);
	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
	SkASSERT(s.fAlphaScale <= 256);

	int y0, y1, wy;
	decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

	auto row0 = (const uint32_t)( (const char)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
	row1 = (const uint32_t)( (const char)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

	// We'll put one pixel in the low 16 16-bit lanes to line up with wy,
	// and another in the upper 16 16-bit lanes to line up with 16 - wy.
	__m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));

	while (count --> 0) {
	int x0, x1, wx;
	decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

	// Load the 4 pixels we're interpolating, in this grid:
	// \| tl tr \|
	// \| bl br \|

	const __m256i zeros = __lasx_xvldi(0);
	const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
	tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
	bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
	br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);

	// We want to calculate a sum of 8 pixels weighted in two directions:
	//
	// sum = tl * (16-wy) * (16-wx)
	// + bl * ( wy) * (16-wx)
	// + tr * (16-wy) * ( wx)
	// + br * ( wy) * ( wx)
	//
	// (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
	//
	// We've already prepared allY as a vector containing [wy, 16-wy] as a way
	// to apply those y-direction weights. So we'll start on the x-direction
	// first, grouping into left and right halves, lined up with allY:
	//
	// L = [bl, tl]
	// R = [br, tr]
	//
	// sum = horizontalSum( allY * (L(16-wx) + Rwx) )
	//
	// Rewriting that one more step, we can replace a multiply with a shift:
	//
	// sum = horizontalSum( allY * (16L + (R-L)wx) )
	//
	// That's how we'll actually do this math.

	__m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
	R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));

	__m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
	__lasx_xvmul_h(__lasx_xvsub_h(R,L),
	__lasx_xvreplgr2vr_h(wx)));

	__m256i sum_in_x = __lasx_xvmul_h(inner, allY);

	// sum = horizontalSum( ... )
	__m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));

	// Get back to [0,255] by dividing by maximum weight 16x16 = 256.
	sum = __lasx_xvsrli_h(sum, 8);

	if (s.fAlphaScale < 256) {
	// Scale by alpha, which is in [0,256].
	sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
	sum = __lasx_xvsrli_h(sum, 8);
	}

	// Pack back into 8-bit values and store.
	*colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
	__lasx_xvsat_hu(sum, 8)), 0);
	}
	}

	#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

	/not static/ inline
	void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
	const uint32_t* xy, int count, uint32_t* colors) {
	SkASSERT(count > 0 && colors != nullptr);
	SkASSERT(s.fBilerp);
	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
	SkASSERT(s.fAlphaScale <= 256);

	int y0, y1, wy;
	decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

	auto row0 = (const uint32_t)( (const char)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
	row1 = (const uint32_t)( (const char)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

	// We'll put one pixel in the low 8 16-bit lanes to line up with wy,
	// and another in the upper 8 16-bit lanes to line up with 16 - wy.
	__m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));

	while (count --> 0) {
	int x0, x1, wx;
	decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

	// Load the 4 pixels we're interpolating, in this grid:
	// \| tl tr \|
	// \| bl br \|
	const __m128i zeros = __lsx_vldi(0);
	const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
	tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
	bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
	br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);

	// We want to calculate a sum of 8 pixels weighted in two directions:
	//
	// sum = tl * (16-wy) * (16-wx)
	// + bl * ( wy) * (16-wx)
	// + tr * (16-wy) * ( wx)
	// + br * ( wy) * ( wx)
	//
	// (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
	//
	// We've already prepared allY as a vector containing [wy, 16-wy] as a way
	// to apply those y-direction weights. So we'll start on the x-direction
	// first, grouping into left and right halves, lined up with allY:
	//
	// L = [bl, tl]
	// R = [br, tr]
	//
	// sum = horizontalSum( allY * (L(16-wx) + Rwx) )
	//
	// Rewriting that one more step, we can replace a multiply with a shift:
	//
	// sum = horizontalSum( allY * (16L + (R-L)wx) )
	//
	// That's how we'll actually do this math.


	__m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
	R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));

	__m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
	__lsx_vmul_h(__lsx_vsub_h(R,L),
	__lsx_vreplgr2vr_h(wx)));

	__m128i sum_in_x = __lsx_vmul_h(inner, allY);

	// sum = horizontalSum( ... )
	__m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));

	// Get back to [0,255] by dividing by maximum weight 16x16 = 256.
	sum = __lsx_vsrli_h(sum, 8);

	if (s.fAlphaScale < 256) {
	// Scale by alpha, which is in [0,256].
	sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
	sum = __lsx_vsrli_h(sum, 8);
	}

	// Pack back into 8-bit values and store.
	*colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
	__lsx_vsat_hu(sum, 8)), 0);
	}
	}

	#else

	// The NEON code only actually differs from the portable code in the
	// filtering step after we've loaded all four pixels we want to bilerp.

	#if defined(SK_ARM_HAS_NEON)
	static void filter_and_scale_by_alpha(unsigned x, unsigned y,
	SkPMColor a00, SkPMColor a01,
	SkPMColor a10, SkPMColor a11,
	SkPMColor *dst,
	uint16_t scale) {
	uint8x8_t vy, vconst16_8, v16_y, vres;
	uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
	uint32x2_t va0, va1;
	uint16x8_t tmp1, tmp2;

	vy = vdup_n_u8(y); // duplicate y into vy
	vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8
	v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y

	va0 = vdup_n_u32(a00); // duplicate a00
	va1 = vdup_n_u32(a10); // duplicate a10
	va0 = vset_lane_u32(a01, va0, 1); // set top to a01
	va1 = vset_lane_u32(a11, va1, 1); // set top to a11

	tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01\|a00] * (16-y)
	tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11\|a10] * y

	vx = vdup_n_u16(x); // duplicate x into vx
	vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16
	v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

	tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x
	tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x
	tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
	tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

	if (scale < 256) {
	vscale = vdup_n_u16(scale); // duplicate scale
	tmp = vshr_n_u16(tmp, 8); // shift down result by 8
	tmp = vmul_u16(tmp, vscale); // multiply result by scale
	}

	vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
	vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result
	}
	#else
	// Portable bilerp of four 32-bit pixels with 4-bit weights x and y (each in
	// [0,15]), followed by an optional scale by alphaScale (in [0,256], applied
	// only when below 256).  The weights applied are
	//     a00:(16-x)(16-y)   a01:x(16-y)   a10:(16-x)y   a11:xy
	// which always total 256.  The result is written to *dstColor.
	//
	// Two channels are processed at a time: `lo` carries the channels in bits
	// 0-7 and 16-23, `hi` the channels in bits 8-15 and 24-31 (shifted down by 8),
	// each with 8 bits of headroom above it for the weighted sum.
	static void filter_and_scale_by_alpha(unsigned x, unsigned y,
	                                      SkPMColor a00, SkPMColor a01,
	                                      SkPMColor a10, SkPMColor a11,
	                                      SkPMColor* dstColor,
	                                      unsigned alphaScale) {
	    SkASSERT((unsigned)x <= 0xF);
	    SkASSERT((unsigned)y <= 0xF);
	    SkASSERT(alphaScale <= 256);

	    int xy = x * y;
	    const uint32_t mask = 0xFF00FF;

	    // (16-x)*(16-y), expanded.
	    int scale = 256 - 16*y - 16*x + xy;
	    uint32_t lo = (a00 & mask) * scale;
	    uint32_t hi = ((a00 >> 8) & mask) * scale;

	    // x*(16-y)
	    scale = 16*x - xy;
	    lo += (a01 & mask) * scale;
	    hi += ((a01 >> 8) & mask) * scale;

	    // (16-x)*y
	    scale = 16*y - xy;
	    lo += (a10 & mask) * scale;
	    hi += ((a10 >> 8) & mask) * scale;

	    // x*y
	    lo += (a11 & mask) * xy;
	    hi += ((a11 >> 8) & mask) * xy;

	    if (alphaScale < 256) {
	        // Drop the 8 fractional bits from the bilerp, then scale by alpha.
	        lo = ((lo >> 8) & mask) * alphaScale;
	        hi = ((hi >> 8) & mask) * alphaScale;
	    }

	    // Divide by 256 and interleave the two channel pairs back into one pixel.
	    *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
	}
	#endif


	/not static/ inline
	void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
	const uint32_t* xy, int count, SkPMColor* colors) {
	SkASSERT(count > 0 && colors != nullptr);
	SkASSERT(s.fBilerp);
	SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
	SkASSERT(s.fAlphaScale <= 256);

	int y0, y1, wy;
	decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

	auto row0 = (const uint32_t)( (const char)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
	row1 = (const uint32_t)( (const char)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

	while (count --> 0) {
	int x0, x1, wx;
	decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

	filter_and_scale_by_alpha(wx, wy,
	row0[x0], row0[x1],
	row1[x0], row1[x1],
	colors++,
	s.fAlphaScale);
	}
	}

	#endif

	#if defined(SK_ARM_HAS_NEON)
	/not static/ inline
	void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
	const uint32_t* xy, int count, SkPMColor* colors) {
	SkASSERT(count > 0 && colors != nullptr);
	SkASSERT(s.fBilerp);
	SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
	SkASSERT(s.fAlphaScale <= 256);

	auto src = (const char*)s.fPixmap.addr();
	size_t rb = s.fPixmap.rowBytes();

	while (count --> 0) {
	int y0, y1, wy,
	x0, x1, wx;
	decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
	decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

	auto row0 = (const uint32_t)(src + y0rb),
	row1 = (const uint32_t)(src + y1rb);

	filter_and_scale_by_alpha(wx, wy,
	row0[x0], row0[x1],
	row1[x0], row1[x1],
	colors++,
	s.fAlphaScale);
	}
	}
	#else
	// It's not yet clear whether it's worthwhile specializing for other architectures.
	// Without NEON there is no specialized _DXDY proc, so this is a null function
	// pointer with the same parameter list as the NEON implementation.
	constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
	                                                   const uint32_t*, int, SkPMColor*) = nullptr;
	#endif

	} // namespace SK_OPTS_NS

	namespace sktests {

	    // Forwards to SK_OPTS_NS::decode_packed_coordinates_and_weight(), which is
	    // declared static inside SK_OPTS_NS; presumably this exists so tests can
	    // call the decoder by a stable name.
	    template <typename U32, typename Out>
	    void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
	        SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w);
	    }

	}  // namespace sktests

	#endif