port S32_alpha_D32_filter_DX to SkOpts

I'll follow up by moving all the other things that live in src/opts today into SkBitmapProcState.cpp... they only use SSE2 or NEON, and don't need runtime detection.

There's lots of refactoring to do here still, and I've mostly resisted the urge until this code is all in one place.

Cq-Include-Trybots: master.tryserver.blink:linux_trusty_blink_rel
Change-Id: Idea34a03c46d79b0fd6fbef1a49aaf27961c8260
Reviewed-on: https://skia-review.googlesource.com/c/171582
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>

commit: a2187bf7629af9eff9b37d99f3899e7f69d61a0d [log] [tgz]
author: Mike Klein <mtklein@google.com> Fri Nov 16 12:22:05 2018 -0500
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> Fri Nov 16 20:33:03 2018 +0000
tree: 236588d3ad2dd284f4c14f33899f1f576fcb4d93
parent: fd32e724d6d8a9569b3980ff4f72175df2e455c9 [diff]
diff --git a/gn/core.gni b/gn/core.gni
index 2fedb8a..b8d1207 100644
--- a/gn/core.gni
+++ b/gn/core.gni

@@ -32,7 +32,6 @@
   "$_src/core/SkBitmapDevice.h",
   "$_src/core/SkBitmapProcState.cpp",
   "$_src/core/SkBitmapProcState.h",
-  "$_src/core/SkBitmapProcState_filter.h",
   "$_src/core/SkBitmapProcState_matrix.h",
   "$_src/core/SkBitmapProcState_matrix_template.h",
   "$_src/core/SkBitmapProcState_matrixProcs.cpp",

diff --git a/gn/opts.gni b/gn/opts.gni
index 3d3e040..0fee868 100644
--- a/gn/opts.gni
+++ b/gn/opts.gni

@@ -17,8 +17,6 @@
 
 neon = [
   "$_src/opts/Sk4px_NEON.h",
-  "$_src/opts/SkBitmapProcState_arm_neon.cpp",
-  "$_src/opts/SkBitmapProcState_filter_neon.h",
   "$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp",
   "$_src/opts/SkBitmapProcState_matrix_neon.h",
   "$_src/opts/SkColor_opts_neon.h",
@@ -26,8 +24,6 @@
 
 arm64 = [
   "$_src/opts/Sk4px_NEON.h",
-  "$_src/opts/SkBitmapProcState_arm_neon.cpp",
-  "$_src/opts/SkBitmapProcState_filter_neon.h",
   "$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp",
   "$_src/opts/SkBitmapProcState_matrix_neon.h",
   "$_src/opts/SkBitmapProcState_opts_none.cpp",
@@ -44,13 +40,8 @@
   "$_src/opts/opts_check_x86.cpp",
 ]
 
-ssse3 = [
-  "$_src/opts/SkBitmapProcState_opts_SSSE3.h",
-  "$_src/opts/SkBitmapProcState_opts_SSSE3.cpp",
-  "$_src/opts/SkOpts_ssse3.cpp",
-]
-
+ssse3 = [ "$_src/opts/SkOpts_ssse3.cpp", ]
 sse41 = [ "$_src/opts/SkOpts_sse41.cpp" ]
 sse42 = [ "$_src/opts/SkOpts_sse42.cpp" ]
-avx = [ "$_src/opts/SkOpts_avx.cpp" ]
-hsw = [ "$_src/opts/SkOpts_hsw.cpp" ]
+avx   = [ "$_src/opts/SkOpts_avx.cpp" ]
+hsw   = [ "$_src/opts/SkOpts_hsw.cpp" ]

diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 8d4005e..fb05ddd 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp

@@ -10,6 +10,7 @@
 #include "SkBitmapProcState.h"
 #include "SkColorData.h"
 #include "SkMacros.h"
+#include "SkOpts.h"
 #include "SkPaint.h"
 #include "SkShader.h"   // for tilemodes
 #include "SkUtilsArm.h"
@@ -80,7 +81,6 @@
 }
 
 #define   NAME_WRAP(x)  x
-#include "SkBitmapProcState_filter.h"
 #include "SkBitmapProcState_procs.h"
 
 SkBitmapProcInfo::SkBitmapProcInfo(const SkBitmapProvider& provider,
@@ -245,25 +245,20 @@
     // the shader procs above and can skip all this.
 
     if (fFilterQuality < kHigh_SkFilterQuality) {
-        int index = fFilterQuality > kNone_SkFilterQuality ? 1 : 0;
 
-#if !defined(SK_ARM_HAS_NEON)
-        static const SampleProc32 gSkBitmapProcStateSample32[] = {
-            S32_alpha_D32_nofilter_DX,
-            S32_alpha_D32_filter_DX,
-        };
-#endif
-
-        fSampleProc32 = SK_ARM_NEON_WRAP(gSkBitmapProcStateSample32)[index];
-
-        // our special-case shaderprocs
-        if (fAlphaScale == 256
-                && fSampleProc32 == S32_alpha_D32_nofilter_DX
-                && clampClamp) {
-            fShaderProc32 = Clamp_S32_opaque_D32_nofilter_DX_shaderproc;
+        if (fFilterQuality > kNone_SkFilterQuality) {
+            fSampleProc32 = SkOpts::S32_alpha_D32_filter_DX;
+        } else {
+            fSampleProc32 = S32_alpha_D32_nofilter_DX;
         }
 
-        if (nullptr == fShaderProc32) {
+        // our special-case shaderprocs
+        // TODO: move this one into chooseShaderProc32() or pull all that in here.
+        if (fAlphaScale == 256
+                && fFilterQuality == kNone_SkFilterQuality
+                && clampClamp) {
+            fShaderProc32 = Clamp_S32_opaque_D32_nofilter_DX_shaderproc;
+        } else {
             fShaderProc32 = this->chooseShaderProc32();
         }
     }
@@ -366,6 +361,31 @@
     }
 }
 
+static inline void filter_32_alpha(unsigned t,
+                                   SkPMColor color0,
+                                   SkPMColor color1,
+                                   SkPMColor* dstColor,
+                                   unsigned alphaScale) {
+    SkASSERT((unsigned)t <= 0xF);
+    SkASSERT(alphaScale <= 256);
+
+    const uint32_t mask = 0xFF00FF;
+
+    int scale = 256 - 16*t;
+    uint32_t lo = (color0 & mask) * scale;
+    uint32_t hi = ((color0 >> 8) & mask) * scale;
+
+    scale = 16*t;
+    lo += (color1 & mask) * scale;
+    hi += ((color1 >> 8) & mask) * scale;
+
+    // TODO: if (alphaScale < 256) ...
+    lo = ((lo >> 8) & mask) * alphaScale;
+    hi = ((hi >> 8) & mask) * alphaScale;
+
+    *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
+}
+
 static void S32_D32_constX_shaderproc(const void* sIn,
                                       int x, int y,
                                       SkPMColor* SK_RESTRICT colors,
@@ -459,7 +479,7 @@
 
     if (kNone_SkFilterQuality != s.fFilterQuality) {
         const SkPMColor* row1 = s.fPixmap.addr32(0, iY1);
-        Filter_32_alpha(iSubY, *row0, *row1, &color, s.fAlphaScale);
+        filter_32_alpha(iSubY, *row0, *row1, &color, s.fAlphaScale);
     } else {
         if (s.fAlphaScale < 256) {
             color = SkAlphaMulQ(*row0, s.fAlphaScale);

diff --git a/src/core/SkBitmapProcState_filter.h b/src/core/SkBitmapProcState_filter.h
deleted file mode 100644
index a7b9a84..0000000
--- a/src/core/SkBitmapProcState_filter.h
+++ /dev/null

@@ -1,69 +0,0 @@
-/*
- * Copyright 2009 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-
-#include "SkColorData.h"
-
-static inline void Filter_32_alpha(unsigned x, unsigned y,
-                                   SkPMColor a00, SkPMColor a01,
-                                   SkPMColor a10, SkPMColor a11,
-                                   SkPMColor* dstColor,
-                                   unsigned alphaScale) {
-    SkASSERT((unsigned)x <= 0xF);
-    SkASSERT((unsigned)y <= 0xF);
-    SkASSERT(alphaScale <= 256);
-
-    int xy = x * y;
-    const uint32_t mask = 0xFF00FF;
-
-    int scale = 256 - 16*y - 16*x + xy;
-    uint32_t lo = (a00 & mask) * scale;
-    uint32_t hi = ((a00 >> 8) & mask) * scale;
-
-    scale = 16*x - xy;
-    lo += (a01 & mask) * scale;
-    hi += ((a01 >> 8) & mask) * scale;
-
-    scale = 16*y - xy;
-    lo += (a10 & mask) * scale;
-    hi += ((a10 >> 8) & mask) * scale;
-
-    lo += (a11 & mask) * xy;
-    hi += ((a11 >> 8) & mask) * xy;
-
-    // TODO: if (alphaScale < 256) ...
-    lo = ((lo >> 8) & mask) * alphaScale;
-    hi = ((hi >> 8) & mask) * alphaScale;
-
-    *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
-}
-
-// Two color version, where we filter only along 1 axis
-static inline void Filter_32_alpha(unsigned t,
-                                   SkPMColor color0,
-                                   SkPMColor color1,
-                                   SkPMColor* dstColor,
-                                   unsigned alphaScale) {
-    SkASSERT((unsigned)t <= 0xF);
-    SkASSERT(alphaScale <= 256);
-
-    const uint32_t mask = 0xFF00FF;
-
-    int scale = 256 - 16*t;
-    uint32_t lo = (color0 & mask) * scale;
-    uint32_t hi = ((color0 >> 8) & mask) * scale;
-
-    scale = 16*t;
-    lo += (color1 & mask) * scale;
-    hi += ((color1 >> 8) & mask) * scale;
-
-    // TODO: if (alphaScale < 256) ...
-    lo = ((lo >> 8) & mask) * alphaScale;
-    hi = ((hi >> 8) & mask) * alphaScale;
-
-    *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
-}

diff --git a/src/core/SkBitmapProcState_procs.h b/src/core/SkBitmapProcState_procs.h
index ebdf155..79e28b6 100644
--- a/src/core/SkBitmapProcState_procs.h
+++ b/src/core/SkBitmapProcState_procs.h

@@ -12,9 +12,6 @@
     #error "Please define NAME_WRAP() before including this file"
 #endif
 
-#define FILTER_PROC(x, y, a, b, c, d, dst) \
-    NAME_WRAP(Filter_32_alpha)(x, y, a, b, c, d, dst, alphaScale)
-
 #define MAKENAME(suffix)        NAME_WRAP(S32_alpha_D32 ## suffix)
 #define SRCTYPE                 SkPMColor
 #define CHECKSTATE(state)       SkASSERT(4 == state.fPixmap.info().bytesPerPixel()); \

diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h
index 70bae6a..aaf7d10 100644
--- a/src/core/SkBitmapProcState_sample.h
+++ b/src/core/SkBitmapProcState_sample.h

@@ -11,9 +11,6 @@
 void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
                             const uint32_t* SK_RESTRICT xy,
                             int count, SkPMColor* SK_RESTRICT colors);
-void MAKENAME(_filter_DX)(const SkBitmapProcState& s,
-                          const uint32_t* SK_RESTRICT xy,
-                           int count, SkPMColor* SK_RESTRICT colors);
 
 void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
                             const uint32_t* SK_RESTRICT xy,
@@ -66,49 +63,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-void MAKENAME(_filter_DX)(const SkBitmapProcState& s,
-                          const uint32_t* SK_RESTRICT xy,
-                           int count, SkPMColor* SK_RESTRICT colors) {
-    SkASSERT(count > 0 && colors != nullptr);
-    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
-    SkDEBUGCODE(CHECKSTATE(s);)
-
-#ifdef PREAMBLE
-    PREAMBLE(s);
-#endif
-    const char* SK_RESTRICT srcAddr = (const char*)s.fPixmap.addr();
-    size_t rb = s.fPixmap.rowBytes();
-    unsigned subY;
-    const SRCTYPE* SK_RESTRICT row0;
-    const SRCTYPE* SK_RESTRICT row1;
-
-    // setup row ptrs and update proc_table
-    {
-        uint32_t XY = *xy++;
-        unsigned y0 = XY >> 14;
-        row0 = (const SRCTYPE*)(srcAddr + (y0 >> 4) * rb);
-        row1 = (const SRCTYPE*)(srcAddr + (XY & 0x3FFF) * rb);
-        subY = y0 & 0xF;
-    }
-
-    do {
-        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
-        unsigned x0 = XX >> 14;
-        unsigned x1 = XX & 0x3FFF;
-        unsigned subX = x0 & 0xF;
-        x0 >>= 4;
-
-        FILTER_PROC(subX, subY,
-                    SRC_TO_FILTER(row0[x0]),
-                    SRC_TO_FILTER(row0[x1]),
-                    SRC_TO_FILTER(row1[x0]),
-                    SRC_TO_FILTER(row1[x1]),
-                    colors);
-        colors += 1;
-
-    } while (--count != 0);
-}
-
 #undef MAKENAME
 #undef SRCTYPE
 #undef CHECKSTATE

diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 68dd89b..25486f9 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp

@@ -36,6 +36,7 @@
     #define SK_OPTS_NS portable
 #endif
 
+#include "SkBitmapProcState_opts.h"
 #include "SkBlitMask_opts.h"
 #include "SkBlitRow_opts.h"
 #include "SkChecksum_opts.h"
@@ -73,6 +74,8 @@
 
     DEFINE_DEFAULT(hash_fn);
 
+    DEFINE_DEFAULT(S32_alpha_D32_filter_DX);
+
 #undef DEFINE_DEFAULT
 
 #define M(st) (StageFn)SK_OPTS_NS::st,

diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index decbdd4..b2a1a0d 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h

@@ -12,7 +12,7 @@
 #include "SkTypes.h"
 #include "SkXfermodePriv.h"
 
-struct ProcCoeff;
+struct SkBitmapProcState;
 
 namespace SkOpts {
     // Call to replace pointers to portable functions with pointers to CPU-specific functions.
@@ -53,6 +53,11 @@
         return hash_fn(data, bytes, seed);
     }
 
+    // SkBitmapProcState optimized Shader, Sample, or Matrix procs.
+    // This is the only one that can use anything past SSE2/NEON.
+    extern void (*S32_alpha_D32_filter_DX)(const SkBitmapProcState&,
+                                           const uint32_t* xy, int count, SkPMColor*);
+
 #define M(st) +1
     // We can't necessarily express the type of SkJumper stage functions here,
     // so we just use this void(*)(void) as a stand-in.

diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp
deleted file mode 100644
index 6341aee..0000000
--- a/src/opts/SkBitmapProcState_arm_neon.cpp
+++ /dev/null

@@ -1,25 +0,0 @@
-/*
- * Copyright 2012 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBitmapProcState.h"
-#include "SkBitmapProcState_filter.h"
-#include "SkColorData.h"
-#include "SkPaint.h"
-#include "SkShader.h"   // for tilemodes
-#include "SkUtilsArm.h"
-
-// Required to ensure the table is part of the final binary.
-extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];
-
-#define   NAME_WRAP(x)  x ## _neon
-#include "SkBitmapProcState_filter_neon.h"
-#include "SkBitmapProcState_procs.h"
-
-const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
-    S32_alpha_D32_nofilter_DX_neon,
-    S32_alpha_D32_filter_DX_neon,
-};

diff --git a/src/opts/SkBitmapProcState_filter_neon.h b/src/opts/SkBitmapProcState_filter_neon.h
deleted file mode 100644
index ab3cec8..0000000
--- a/src/opts/SkBitmapProcState_filter_neon.h
+++ /dev/null

@@ -1,48 +0,0 @@
-/*
- * Copyright 2012 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkColorData.h"
-#include <arm_neon.h>
-
-static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
-                                                  SkPMColor a00, SkPMColor a01,
-                                                  SkPMColor a10, SkPMColor a11,
-                                                  SkPMColor *dst,
-                                                  uint16_t scale) {
-    uint8x8_t vy, vconst16_8, v16_y, vres;
-    uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
-    uint32x2_t va0, va1;
-    uint16x8_t tmp1, tmp2;
-
-    vy = vdup_n_u8(y);                // duplicate y into vy
-    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
-    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
-
-    va0 = vdup_n_u32(a00);            // duplicate a00
-    va1 = vdup_n_u32(a10);            // duplicate a10
-    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
-    va1 = vset_lane_u32(a11, va1, 1); // set top to a11
-
-    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
-    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
-
-    vx = vdup_n_u16(x);                // duplicate x into vx
-    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
-    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
-
-    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
-    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
-    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
-    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
-
-    vscale = vdup_n_u16(scale);        // duplicate scale
-    tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
-    tmp = vmul_u16(tmp, vscale);       // multiply result by scale
-
-    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
-    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
-}

diff --git a/src/opts/SkBitmapProcState_opts.h b/src/opts/SkBitmapProcState_opts.h
new file mode 100644
index 0000000..adc03c5
--- /dev/null
+++ b/src/opts/SkBitmapProcState_opts.h

@@ -0,0 +1,432 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkBitmapProcState_opts_DEFINED
+#define SkBitmapProcState_opts_DEFINED
+
+#include "SkBitmapProcState.h"
+
+// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
+//
+// Only S32_alpha_D32_filter_DX exploits instructions beyond
+// our common baseline SSE2/NEON instruction sets, so that's
+// all that lives here.
+//
+// The rest are scattershot at the moment but I want to get them
+// all migrated to be normal code inside SkBitmapProcState.cpp.
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    #include <immintrin.h>
+#elif defined(SK_ARM_HAS_NEON)
+    #include <arm_neon.h>
+#endif
+
+namespace SK_OPTS_NS {
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+    // This same basic packing scheme is used throughout the file.
+    static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
+        // The top 14 bits are the integer coordinate x0 or y0.
+        *v0 = packed >> 18;
+
+        // The bottom 14 bits are the integer coordinate x1 or y1.
+        *v1 = packed & 0x3fff;
+
+        // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
+        *w = (packed >> 14) & 0xf;
+    }
+
+    // As above, 4x.
+    static void decode_packed_coordinates_and_weight(__m128i packed,
+                                                     int v0[4], int v1[4], __m128i* w) {
+        _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));
+        _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
+        *w = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));
+    }
+
+    // This is the crux of the SSSE3 implementation,
+    // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
+    static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
+                                           uint32_t B0, uint32_t B1,
+                                           const __m128i& interlaced_x_weights) {
+        // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
+        //
+        // It takes two arguments interlaced byte-wise:
+        //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
+        //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
+        // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
+        //
+        // That's why we go to all this trouble to make interlaced_x_weights,
+        // and here we're interlacing A0 with A1, B0 with B1 to match.
+
+        __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
+                interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
+
+        return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
+                                 interlaced_x_weights);
+    }
+
+    // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
+    // Returns two pixels, with each channel in a 16-bit lane of the __m128i.
+    static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
+                                                 uint32_t A2, uint32_t A3,
+                                                 uint32_t B0, uint32_t B1,
+                                                 uint32_t B2, uint32_t B3,
+                                                 const __m128i& interlaced_x_weights,
+                                                 int wy) {
+        // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
+        const __m128i wy1 = _mm_set1_epi16(wy),
+                      wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);
+
+        // First interpolate in X,
+        // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
+        __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
+                row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
+
+        // Interpolate in Y across the two rows,
+        // then scale everything down by the maximum total weight 16x16 = 256.
+        return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
+                                            _mm_mullo_epi16(row1, wy1)), 8);
+    }
+
+    /*not static*/ inline
+    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
+                                 const uint32_t* xy, int count, uint32_t* colors) {
+        SkASSERT(count > 0 && colors != nullptr);
+        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
+        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
+
+        int alpha = s.fAlphaScale;
+
+        // Return (px * s.fAlphaScale) / 256.   (s.fAlphaScale is in [0,256].)
+        auto scale_by_alpha = [alpha](const __m128i& px) {
+            return alpha == 256 ? px
+                                : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
+        };
+
+        // We're in _DX_ mode here, so we're only varying in X.
+        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
+        // All the other entries in xy will be pairs of X coordinates and the X weight.
+        int y0, y1, wy;
+        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
+
+        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
+             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
+
+        while (count >= 4) {
+            // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
+            const __m128i xx = _mm_loadu_si128((const __m128i*)xy);
+
+            int x0[4],
+                x1[4];
+            __m128i wx;
+            decode_packed_coordinates_and_weight(xx, x0, x1, &wx);
+
+            // Splat out each x weight wx four times (one for each pixel channel) as wx1,
+            // and sixteen minus that as the weight for x0, wx0.
+            __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
+                    wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);
+
+            // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
+            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
+                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);
+
+            // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
+            // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
+            __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
+                                                row1[x0[0]], row1[x1[0]],
+                                                row0[x0[1]], row0[x1[1]],
+                                                row1[x0[1]], row1[x1[1]],
+                                                interlaced_x_weights_AB, wy);
+
+            // Once more with the other half of the x-weights for two more pixels C,D.
+            __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
+                                                row1[x0[2]], row1[x1[2]],
+                                                row0[x0[3]], row0[x1[3]],
+                                                row1[x0[3]], row1[x1[3]],
+                                                interlaced_x_weights_CD, wy);
+
+            // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
+            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
+                                                                scale_by_alpha(CD)));
+            xy     += 4;
+            colors += 4;
+            count  -= 4;
+        }
+
+        while (count --> 0) {
+            // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
+            int x0, x1, wx;
+            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
+
+            // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
+            __m128i wx1 = _mm_set1_epi8(wx),     // This splats it out 16 times, but that's fine.
+                    wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);
+
+            __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);
+
+            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
+                                               row1[x0], row1[x1],
+                                                      0,        0,
+                                                      0,        0,
+                                               interlaced_x_weights_A, wy);
+
+            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
+        }
+    }
+
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+
+    // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
+
+    /*not static*/ inline
+    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
+                                 const uint32_t* xy, int count, uint32_t* colors) {
+        SkASSERT(count > 0 && colors != nullptr);
+        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
+        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
+        SkASSERT(s.fAlphaScale <= 256);
+
+        const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
+        size_t rb = s.fPixmap.rowBytes();
+        uint32_t XY = *xy++;
+        unsigned y0 = XY >> 14;
+        const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
+        const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
+        unsigned subY = y0 & 0xF;
+
+        // ( 0,  0,  0,  0,  0,  0,  0, 16)
+        __m128i sixteen = _mm_cvtsi32_si128(16);
+
+        // ( 0,  0,  0,  0, 16, 16, 16, 16)
+        sixteen = _mm_shufflelo_epi16(sixteen, 0);
+
+        // ( 0,  0,  0,  0,  0,  0,  0,  y)
+        __m128i allY = _mm_cvtsi32_si128(subY);
+
+        // ( 0,  0,  0,  0,  y,  y,  y,  y)
+        allY = _mm_shufflelo_epi16(allY, 0);
+
+        // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
+        __m128i negY = _mm_sub_epi16(sixteen, allY);
+
+        // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
+        allY = _mm_unpacklo_epi64(allY, negY);
+
+        // (16, 16, 16, 16, 16, 16, 16, 16 )
+        sixteen = _mm_shuffle_epi32(sixteen, 0);
+
+        // ( 0,  0,  0,  0,  0,  0,  0,  0)
+        __m128i zero = _mm_setzero_si128();
+
+        // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
+        __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
+
+        do {
+            uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
+            unsigned x0 = XX >> 18;
+            unsigned x1 = XX & 0x3FFF;
+
+            // (0, 0, 0, 0, 0, 0, 0, x)
+            __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
+
+            // (0, 0, 0, 0, x, x, x, x)
+            allX = _mm_shufflelo_epi16(allX, 0);
+
+            // (x, x, x, x, x, x, x, x)
+            allX = _mm_shuffle_epi32(allX, 0);
+
+            // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
+            __m128i negX = _mm_sub_epi16(sixteen, allX);
+
+            // Load 4 samples (pixels).
+            __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
+            __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
+            __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
+            __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
+
+            // (0, 0, a00, a10)
+            __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
+
+            // Expand to 16 bits per component.
+            a00a10 = _mm_unpacklo_epi8(a00a10, zero);
+
+            // ((a00 * (16-y)), (a10 * y)).
+            a00a10 = _mm_mullo_epi16(a00a10, allY);
+
+            // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
+            a00a10 = _mm_mullo_epi16(a00a10, negX);
+
+            // (0, 0, a01, a11)
+            __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
+
+            // Expand to 16 bits per component.
+            a01a11 = _mm_unpacklo_epi8(a01a11, zero);
+
+            // (a01 * (16-y)), (a11 * y)
+            a01a11 = _mm_mullo_epi16(a01a11, allY);
+
+            // (a01 * (16-y) * x), (a11 * y * x)
+            a01a11 = _mm_mullo_epi16(a01a11, allX);
+
+            // (a00*w00 + a01*w01, a10*w10 + a11*w11)
+            __m128i sum = _mm_add_epi16(a00a10, a01a11);
+
+            // (DC, a00*w00 + a01*w01)
+            __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
+
+            // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
+            sum = _mm_add_epi16(sum, shifted);
+
+            // Divide each 16 bit component by 256.
+            sum = _mm_srli_epi16(sum, 8);
+
+            // Multiply by alpha.
+            sum = _mm_mullo_epi16(sum, alpha);
+
+            // Divide each 16 bit component by 256.
+            sum = _mm_srli_epi16(sum, 8);
+
+            // Pack lower 4 16 bit values of sum into lower 4 bytes.
+            sum = _mm_packus_epi16(sum, zero);
+
+            // Extract low int and store.
+            *colors++ = _mm_cvtsi128_si32(sum);
+        } while (--count > 0);
+    }
+
+#else
+
+    // The NEON code only actually differs from the portable code in the
+    // filtering step after we've loaded all four pixels we want to bilerp.
+
+    #if defined(SK_ARM_HAS_NEON)
+        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
+                                              SkPMColor a00, SkPMColor a01,
+                                              SkPMColor a10, SkPMColor a11,
+                                              SkPMColor *dst,
+                                              uint16_t scale) {
+            uint8x8_t vy, vconst16_8, v16_y, vres;
+            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
+            uint32x2_t va0, va1;
+            uint16x8_t tmp1, tmp2;
+
+            vy = vdup_n_u8(y);                // duplicate y into vy
+            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
+            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
+
+            va0 = vdup_n_u32(a00);            // duplicate a00
+            va1 = vdup_n_u32(a10);            // duplicate a10
+            va0 = vset_lane_u32(a01, va0, 1); // set top to a01
+            va1 = vset_lane_u32(a11, va1, 1); // set top to a11
+
+            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
+            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
+
+            vx = vdup_n_u16(x);                // duplicate x into vx
+            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
+            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
+
+            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * (16-y) * x
+            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * y * x
+            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-y) * (16-x)
+            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * y * (16-x)
+
+            vscale = vdup_n_u16(scale);        // duplicate scale
+            tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
+            tmp = vmul_u16(tmp, vscale);       // multiply result by scale
+
+            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down by 8, narrow to 8-bit lanes
+            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
+        }
+    #else
+        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
+                                              SkPMColor a00, SkPMColor a01,
+                                              SkPMColor a10, SkPMColor a11,
+                                              SkPMColor* dstColor,
+                                              unsigned alphaScale) {
+            SkASSERT((unsigned)x <= 0xF);
+            SkASSERT((unsigned)y <= 0xF);
+            SkASSERT(alphaScale <= 256);
+
+            int xy = x * y;
+            const uint32_t mask = 0xFF00FF;
+
+            int scale = 256 - 16*y - 16*x + xy;
+            uint32_t lo = (a00 & mask) * scale;
+            uint32_t hi = ((a00 >> 8) & mask) * scale;
+
+            scale = 16*x - xy;
+            lo += (a01 & mask) * scale;
+            hi += ((a01 >> 8) & mask) * scale;
+
+            scale = 16*y - xy;
+            lo += (a10 & mask) * scale;
+            hi += ((a10 >> 8) & mask) * scale;
+
+            lo += (a11 & mask) * xy;
+            hi += ((a11 >> 8) & mask) * xy;
+
+            // TODO: if (alphaScale < 256) ...
+            lo = ((lo >> 8) & mask) * alphaScale;
+            hi = ((hi >> 8) & mask) * alphaScale;
+
+            *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
+        }
+    #endif
+
+
+    // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
+
+    /*not static*/ inline
+    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
+                                 const uint32_t* xy, int count, SkPMColor* colors) {
+        SkASSERT(count > 0 && colors != nullptr);
+        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
+        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
+        SkASSERT(s.fAlphaScale <= 256);
+
+        unsigned alphaScale = s.fAlphaScale;
+
+        const char* srcAddr = (const char*)s.fPixmap.addr();
+        size_t rb = s.fPixmap.rowBytes();
+        unsigned subY;
+        const SkPMColor* row0;
+        const SkPMColor* row1;
+
+        // Decode the packed Y entry: set up both row pointers and the 4-bit Y weight.
+        {
+            uint32_t XY = *xy++;
+            unsigned y0 = XY >> 14;
+            row0 = (const SkPMColor*)(srcAddr + (y0 >> 4) * rb);
+            row1 = (const SkPMColor*)(srcAddr + (XY & 0x3FFF) * rb);
+            subY = y0 & 0xF;
+        }
+
+        do {
+            uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
+            unsigned x0 = XX >> 14;
+            unsigned x1 = XX & 0x3FFF;
+            unsigned subX = x0 & 0xF;
+            x0 >>= 4;
+
+            filter_and_scale_by_alpha(subX, subY,
+                                      row0[x0], row0[x1],
+                                      row1[x0], row1[x1],
+                                      colors,
+                                      alphaScale);
+            colors += 1;
+
+        } while (--count != 0);
+    }
+
+#endif
+
+}  // namespace SK_OPTS_NS
+
+#endif

diff --git a/src/opts/SkBitmapProcState_opts_SSE2.cpp b/src/opts/SkBitmapProcState_opts_SSE2.cpp
index 391d421..71f6f51 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/src/opts/SkBitmapProcState_opts_SSE2.cpp

@@ -12,122 +12,6 @@
 
 #include <emmintrin.h>
 
-void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
-                                  const uint32_t* xy,
-                                  int count, uint32_t* colors) {
-    SkASSERT(count > 0 && colors != nullptr);
-    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
-    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
-    SkASSERT(s.fAlphaScale <= 256);
-
-    const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
-    size_t rb = s.fPixmap.rowBytes();
-    uint32_t XY = *xy++;
-    unsigned y0 = XY >> 14;
-    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
-    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
-    unsigned subY = y0 & 0xF;
-
-    // ( 0,  0,  0,  0,  0,  0,  0, 16)
-    __m128i sixteen = _mm_cvtsi32_si128(16);
-
-    // ( 0,  0,  0,  0, 16, 16, 16, 16)
-    sixteen = _mm_shufflelo_epi16(sixteen, 0);
-
-    // ( 0,  0,  0,  0,  0,  0,  0,  y)
-    __m128i allY = _mm_cvtsi32_si128(subY);
-
-    // ( 0,  0,  0,  0,  y,  y,  y,  y)
-    allY = _mm_shufflelo_epi16(allY, 0);
-
-    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
-    __m128i negY = _mm_sub_epi16(sixteen, allY);
-
-    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
-    allY = _mm_unpacklo_epi64(allY, negY);
-
-    // (16, 16, 16, 16, 16, 16, 16, 16 )
-    sixteen = _mm_shuffle_epi32(sixteen, 0);
-
-    // ( 0,  0,  0,  0,  0,  0,  0,  0)
-    __m128i zero = _mm_setzero_si128();
-
-    // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
-    __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
-
-    do {
-        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
-        unsigned x0 = XX >> 18;
-        unsigned x1 = XX & 0x3FFF;
-
-        // (0, 0, 0, 0, 0, 0, 0, x)
-        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
-
-        // (0, 0, 0, 0, x, x, x, x)
-        allX = _mm_shufflelo_epi16(allX, 0);
-
-        // (x, x, x, x, x, x, x, x)
-        allX = _mm_shuffle_epi32(allX, 0);
-
-        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
-        __m128i negX = _mm_sub_epi16(sixteen, allX);
-
-        // Load 4 samples (pixels).
-        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
-        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
-        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
-        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
-
-        // (0, 0, a00, a10)
-        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
-
-        // Expand to 16 bits per component.
-        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
-
-        // ((a00 * (16-y)), (a10 * y)).
-        a00a10 = _mm_mullo_epi16(a00a10, allY);
-
-        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
-        a00a10 = _mm_mullo_epi16(a00a10, negX);
-
-        // (0, 0, a01, a10)
-        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
-
-        // Expand to 16 bits per component.
-        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
-
-        // (a01 * (16-y)), (a11 * y)
-        a01a11 = _mm_mullo_epi16(a01a11, allY);
-
-        // (a01 * (16-y) * x), (a11 * y * x)
-        a01a11 = _mm_mullo_epi16(a01a11, allX);
-
-        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
-        __m128i sum = _mm_add_epi16(a00a10, a01a11);
-
-        // (DC, a00*w00 + a01*w01)
-        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
-
-        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
-        sum = _mm_add_epi16(sum, shifted);
-
-        // Divide each 16 bit component by 256.
-        sum = _mm_srli_epi16(sum, 8);
-
-        // Multiply by alpha.
-        sum = _mm_mullo_epi16(sum, alpha);
-
-        // Divide each 16 bit component by 256.
-        sum = _mm_srli_epi16(sum, 8);
-
-        // Pack lower 4 16 bit values of sum into lower 4 bytes.
-        sum = _mm_packus_epi16(sum, zero);
-
-        // Extract low int and store.
-        *colors++ = _mm_cvtsi128_si32(sum);
-    } while (--count > 0);
-}
-
 // Temporarily go into 64bit so we don't overflow during the add. Since we shift down by 16
 // in the end, the result should always fit back in 32bits.
 static inline int32_t safe_fixed_add_shift(SkFixed a, SkFixed b) {

diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 7faeab4..d55c74c 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h

@@ -10,9 +10,6 @@
 
 #include "SkBitmapProcState.h"
 
-void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
-                                  const uint32_t* xy, int count, uint32_t* colors);
-
 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
                                      int count, int x, int y);
 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,

diff --git a/src/opts/SkBitmapProcState_opts_SSSE3.cpp b/src/opts/SkBitmapProcState_opts_SSSE3.cpp
deleted file mode 100644
index 428c756..0000000
--- a/src/opts/SkBitmapProcState_opts_SSSE3.cpp
+++ /dev/null

@@ -1,160 +0,0 @@
-/*
- * Copyright 2012 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBitmapProcState_opts_SSSE3.h"
-#include <tmmintrin.h>
-
-// This same basic packing scheme is used throughout the file.
-static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
-    // The top 14 bits are the integer coordinate x0 or y0.
-    *v0 = packed >> 18;
-
-    // The bottom 14 bits are the integer coordinate x1 or y1.
-    *v1 = packed & 0x3fff;
-
-    // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
-    *w = (packed >> 14) & 0xf;
-}
-
-// As above, 4x.
-static void decode_packed_coordinates_and_weight(__m128i packed, int v0[4], int v1[4], __m128i* w) {
-    _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));
-    _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
-    *w = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));
-}
-
-
-// This is the crux of the whole file, interpolating in X for up to two output pixels (A and B).
-static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
-                                       uint32_t B0, uint32_t B1,
-                                       const __m128i& interlaced_x_weights) {
-    // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
-    //
-    // It takes two arguments interlaced byte-wise:
-    //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
-    //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
-    // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
-    //
-    // That's why we go to all this trouble to make interlaced_x_weights,
-    // and here we're interlacing A0 with A1, B0 with B1 to match.
-
-    __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
-            interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
-
-    return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
-                             interlaced_x_weights);
-}
-
-// Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
-// Returns two pixels, with each channel in a 16-bit lane of the __m128i.
-static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
-                                             uint32_t A2, uint32_t A3,
-                                             uint32_t B0, uint32_t B1,
-                                             uint32_t B2, uint32_t B3,
-                                             const __m128i& interlaced_x_weights,
-                                             int wy) {
-    // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
-    const __m128i wy1 = _mm_set1_epi16(wy),
-                  wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);
-
-    // First interpolate in X,
-    // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
-    __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
-            row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
-
-    // Interpolate in Y across the two rows,
-    // then scale everything down by the maximum total weight 16x16 = 256.
-    return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
-                                        _mm_mullo_epi16(row1, wy1)), 8);
-}
-
-
-void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
-                                   const uint32_t* xy,
-                                   int count, uint32_t* colors) {
-    SkASSERT(count > 0 && colors != nullptr);
-    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
-    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
-
-    int alpha = s.fAlphaScale;
-
-    // Return (px * s.fAlphaScale) / 256.   (s.fAlphaScale is in [0,256].)
-    auto scale_by_alpha = [alpha](const __m128i& px) {
-        return alpha == 256 ? px
-                            : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
-    };
-
-    // We're in _DX_ mode here, so we're only varying in X.
-    // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
-    // All the other entries in xy will be pairs of X coordinates and the X weight.
-    int y0, y1, wy;
-    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
-
-    auto row0 = (const uint32_t*)( (const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
-         row1 = (const uint32_t*)( (const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
-
-    while (count >= 4) {
-        // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
-        const __m128i xx = _mm_loadu_si128((const __m128i*)xy);
-
-        int x0[4],
-            x1[4];
-        __m128i wx;
-        decode_packed_coordinates_and_weight(xx, x0, x1, &wx);
-
-        // Splat out each x weight wx four times (one for each pixel channel) as wx1,
-        // and sixteen minus that as the weight for x0, wx0.
-        __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12)),
-                wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);
-
-        // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
-        __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
-                interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);
-
-        // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
-        // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
-        __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
-                                            row1[x0[0]], row1[x1[0]],
-                                            row0[x0[1]], row0[x1[1]],
-                                            row1[x0[1]], row1[x1[1]],
-                                            interlaced_x_weights_AB, wy);
-
-        // Once more with the other half of the x-weights for two more pixels C,D.
-        __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
-                                            row1[x0[2]], row1[x1[2]],
-                                            row0[x0[3]], row0[x1[3]],
-                                            row1[x0[3]], row1[x1[3]],
-                                            interlaced_x_weights_CD, wy);
-
-        // Scale them all by alpha, pack back together to 8-bit lanes, and write out four pixels!
-        _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
-                                                            scale_by_alpha(CD)));
-        xy     += 4;
-        colors += 4;
-        count  -= 4;
-    }
-
-    while (count --> 0) {
-        // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
-        int x0, x1, wx;
-        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
-
-        // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
-        __m128i wx1 = _mm_set1_epi8(wx),     // This splats it out 16 times, but that's fine.
-                wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);
-
-        __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);
-
-        __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
-                                           row1[x0], row1[x1],
-                                                  0,        0,
-                                                  0,        0,
-                                           interlaced_x_weights_A, wy);
-
-        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
-    }
-}

diff --git a/src/opts/SkBitmapProcState_opts_SSSE3.h b/src/opts/SkBitmapProcState_opts_SSSE3.h
deleted file mode 100644
index c122ef0..0000000
--- a/src/opts/SkBitmapProcState_opts_SSSE3.h
+++ /dev/null

@@ -1,16 +0,0 @@
-/*
- * Copyright 2012 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkBitmapProcState_opts_SSSE3_DEFINED
-#define SkBitmapProcState_opts_SSSE3_DEFINED
-
-#include "SkBitmapProcState.h"
-
-void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
-                                   const uint32_t* xy,
-                                   int count, uint32_t* colors);
-#endif

diff --git a/src/opts/SkOpts_ssse3.cpp b/src/opts/SkOpts_ssse3.cpp
index bc3bf76..df2fdb1 100644
--- a/src/opts/SkOpts_ssse3.cpp
+++ b/src/opts/SkOpts_ssse3.cpp

@@ -7,6 +7,7 @@
 
 #include "SkOpts.h"
 #define SK_OPTS_NS ssse3
+#include "SkBitmapProcState_opts.h"
 #include "SkBlitMask_opts.h"
 #include "SkSwizzler_opts.h"
 #include "SkXfermode_opts.h"
@@ -27,9 +28,6 @@
         inverted_CMYK_to_RGB1 = ssse3::inverted_CMYK_to_RGB1;
         inverted_CMYK_to_BGR1 = ssse3::inverted_CMYK_to_BGR1;
 
-        /*
-        S32_opaque_D32_filter_DX = ssse3::S32_opaque_D32_filter_DX;
         S32_alpha_D32_filter_DX  = ssse3::S32_alpha_D32_filter_DX;
-        */
     }
 }

diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 9b63e4a..6772504 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp

@@ -6,7 +6,6 @@
  */
 
 #include "SkBitmapProcState_opts_SSE2.h"
-#include "SkBitmapProcState_opts_SSSE3.h"
 #include "SkCpu.h"
 
 /*
@@ -25,11 +24,6 @@
         return;
     }
 
-    if (fSampleProc32 == S32_alpha_D32_filter_DX) {
-        fSampleProc32 = SkCpu::Supports(SkCpu::SSSE3) ? S32_alpha_D32_filter_DX_SSSE3
-                                                      : S32_alpha_D32_filter_DX_SSE2;
-    }
-
     if (fMatrixProc == ClampX_ClampY_filter_scale) {
         fMatrixProc =  ClampX_ClampY_filter_scale_SSE2;
     }
commit	a2187bf7629af9eff9b37d99f3899e7f69d61a0d	[log] [tgz]
author	Mike Klein <mtklein@google.com>	Fri Nov 16 12:22:05 2018 -0500
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	Fri Nov 16 20:33:03 2018 +0000
tree	236588d3ad2dd284f4c14f33899f1f576fcb4d93
parent	fd32e724d6d8a9569b3980ff4f72175df2e455c9 [diff]