refactor SkBitmapProcState_opts.h a bit

Shouldn't be any major change in here, and the only codegen change
should be the easy-to-predict branch on alpha < 256.  This is mostly
about making sure I understand and can read the code.
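
The pattern in each backend is the same: skip the alpha scaling
entirely when fAlphaScale is a full 256.  A hypothetical scalar
equivalent, just to show the shape of the branch (the real code below
uses SSE2/NEON intrinsics or the packed-channel mask trick):

    // Hypothetical scalar sketch of the new alpha handling;
    // alphaScale is in [0,256], so 256 means "leave the color alone".
    static uint32_t scale_by_alpha(uint32_t color, int alphaScale) {
        if (alphaScale < 256) {   // easy to predict: constant per call
            uint32_t rb = (((color     ) & 0x00ff00ff) * alphaScale) >> 8,
                     ag = (((color >> 8) & 0x00ff00ff) * alphaScale);
            color = (rb & 0x00ff00ff) | (ag & 0xff00ff00);
        }
        return color;
    }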

Cq-Include-Trybots: master.tryserver.blink:linux_trusty_blink_rel
Change-Id: I3e6260be76595275ba177551cbb8f4a84e4970ec
Reviewed-on: https://skia-review.googlesource.com/c/171585
Auto-Submit: Mike Klein <mtklein@google.com>
Commit-Queue: Herb Derby <herb@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/opts/SkBitmapProcState_opts.h b/src/opts/SkBitmapProcState_opts.h
index adc03c5..4d6e573 100644
--- a/src/opts/SkBitmapProcState_opts.h
+++ b/src/opts/SkBitmapProcState_opts.h
@@ -27,18 +27,19 @@
 
 namespace SK_OPTS_NS {
 
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
-    // This same basic packing scheme is used throughout the file.
-    static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
-        // The top 14 bits are the integer coordinate x0 or y0.
-        *v0 = packed >> 18;
+// This same basic packing scheme is used throughout the file.
+static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
+    // The top 14 bits are the integer coordinate x0 or y0.
+    *v0 = packed >> 18;
 
-        // The bottom 14 bits are the integer coordinate x1 or y1.
-        *v1 = packed & 0x3fff;
+    // The bottom 14 bits are the integer coordinate x1 or y1.
+    *v1 = packed & 0x3fff;
 
-        // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
-        *w = (packed >> 14) & 0xf;
-    }
+    // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
+    *w = (packed >> 14) & 0xf;
+}
+
+#if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
 
     // As above, 4x.
     static void decode_packed_coordinates_and_weight(__m128i packed,
@@ -180,7 +181,7 @@
     }
 
 
-#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
 
     // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
 
@@ -192,112 +193,63 @@
         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
         SkASSERT(s.fAlphaScale <= 256);
 
-        const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
-        size_t rb = s.fPixmap.rowBytes();
-        uint32_t XY = *xy++;
-        unsigned y0 = XY >> 14;
-        const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
-        const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
-        unsigned subY = y0 & 0xF;
+        int y0, y1, wy;
+        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
 
-        // ( 0,  0,  0,  0,  0,  0,  0, 16)
-        __m128i sixteen = _mm_cvtsi32_si128(16);
+        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
+             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
 
-        // ( 0,  0,  0,  0, 16, 16, 16, 16)
-        sixteen = _mm_shufflelo_epi16(sixteen, 0);
+        // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
+        // and another in the upper 4 16-bit lanes to line up with 16 - wy.
+        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),
+                                                _mm_set1_epi16(16-wy));
 
-        // ( 0,  0,  0,  0,  0,  0,  0,  y)
-        __m128i allY = _mm_cvtsi32_si128(subY);
+        while (count --> 0) {
+            int x0, x1, wx;
+            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
 
-        // ( 0,  0,  0,  0,  y,  y,  y,  y)
-        allY = _mm_shufflelo_epi16(allY, 0);
+            // Load the 4 pixels we're interpolating.
+            const __m128i a00 = _mm_cvtsi32_si128(row0[x0]),
+                          a01 = _mm_cvtsi32_si128(row0[x1]),
+                          a10 = _mm_cvtsi32_si128(row1[x0]),
+                          a11 = _mm_cvtsi32_si128(row1[x1]);
 
-        // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
-        __m128i negY = _mm_sub_epi16(sixteen, allY);
+            // Line up low-x pixels a00 and a10 with allY.
+            __m128i a00a10 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a10, a00),
+                                               _mm_setzero_si128());
 
-        // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
-        allY = _mm_unpacklo_epi64(allY, negY);
-
-        // (16, 16, 16, 16, 16, 16, 16, 16 )
-        sixteen = _mm_shuffle_epi32(sixteen, 0);
-
-        // ( 0,  0,  0,  0,  0,  0,  0,  0)
-        __m128i zero = _mm_setzero_si128();
-
-        // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
-        __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
-
-        do {
-            uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
-            unsigned x0 = XX >> 18;
-            unsigned x1 = XX & 0x3FFF;
-
-            // (0, 0, 0, 0, 0, 0, 0, x)
-            __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
-
-            // (0, 0, 0, 0, x, x, x, x)
-            allX = _mm_shufflelo_epi16(allX, 0);
-
-            // (x, x, x, x, x, x, x, x)
-            allX = _mm_shuffle_epi32(allX, 0);
-
-            // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
-            __m128i negX = _mm_sub_epi16(sixteen, allX);
-
-            // Load 4 samples (pixels).
-            __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
-            __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
-            __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
-            __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
-
-            // (0, 0, a00, a10)
-            __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
-
-            // Expand to 16 bits per component.
-            a00a10 = _mm_unpacklo_epi8(a00a10, zero);
-
-            // ((a00 * (16-y)), (a10 * y)).
+            // Scale by allY and 16-wx.
             a00a10 = _mm_mullo_epi16(a00a10, allY);
+            a00a10 = _mm_mullo_epi16(a00a10, _mm_set1_epi16(16-wx));
 
-            // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
-            a00a10 = _mm_mullo_epi16(a00a10, negX);
 
-            // (0, 0, a01, a10)
-            __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
+            // Line up high-x pixels a01 and a11 with allY.
+            __m128i a01a11 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a11, a01),
+                                               _mm_setzero_si128());
 
-            // Expand to 16 bits per component.
-            a01a11 = _mm_unpacklo_epi8(a01a11, zero);
-
-            // (a01 * (16-y)), (a11 * y)
+            // Scale by allY and wx.
             a01a11 = _mm_mullo_epi16(a01a11, allY);
+            a01a11 = _mm_mullo_epi16(a01a11, _mm_set1_epi16(wx));
 
-            // (a01 * (16-y) * x), (a11 * y * x)
-            a01a11 = _mm_mullo_epi16(a01a11, allX);
 
-            // (a00*w00 + a01*w01, a10*w10 + a11*w11)
-            __m128i sum = _mm_add_epi16(a00a10, a01a11);
+            // Add the two intermediates, summing across in one direction.
+            __m128i halves = _mm_add_epi16(a00a10, a01a11);
 
-            // (DC, a00*w00 + a01*w01)
-            __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
+            // Add the two halves to each other to sum in the other direction.
+            __m128i sum = _mm_add_epi16(halves, _mm_srli_si128(halves, 8));
 
-            // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
-            sum = _mm_add_epi16(sum, shifted);
-
-            // Divide each 16 bit component by 256.
+            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
             sum = _mm_srli_epi16(sum, 8);
 
-            // Multiply by alpha.
-            sum = _mm_mullo_epi16(sum, alpha);
+            if (s.fAlphaScale < 256) {
+                // Scale by alpha, which is in [0,256].
+                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
+                sum = _mm_srli_epi16(sum, 8);
+            }
 
-            // Divide each 16 bit component by 256.
-            sum = _mm_srli_epi16(sum, 8);
-
-            // Pack lower 4 16 bit values of sum into lower 4 bytes.
-            sum = _mm_packus_epi16(sum, zero);
-
-            // Extract low int and store.
-            *colors++ = _mm_cvtsi128_si32(sum);
-        } while (--count > 0);
+            // Pack back into 8-bit values and store.
+            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
+        }
     }
 
 #else
@@ -337,9 +289,11 @@
             tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
             tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
 
-            vscale = vdup_n_u16(scale);        // duplicate scale
-            tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
-            tmp = vmul_u16(tmp, vscale);       // multiply result by scale
+            if (scale < 256) {
+                vscale = vdup_n_u16(scale);        // duplicate scale
+                tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
+                tmp = vmul_u16(tmp, vscale);       // multiply result by scale
+            }
 
             vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
             vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
@@ -372,17 +326,16 @@
             lo += (a11 & mask) * xy;
             hi += ((a11 >> 8) & mask) * xy;
 
-            // TODO: if (alphaScale < 256) ...
-            lo = ((lo >> 8) & mask) * alphaScale;
-            hi = ((hi >> 8) & mask) * alphaScale;
+            if (alphaScale < 256) {
+                lo = ((lo >> 8) & mask) * alphaScale;
+                hi = ((hi >> 8) & mask) * alphaScale;
+            }
 
             *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
         }
     #endif
 
 
-    // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
-
     /*not static*/ inline
     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                  const uint32_t* xy, int count, SkPMColor* colors) {
@@ -391,38 +344,22 @@
         SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
         SkASSERT(s.fAlphaScale <= 256);
 
-        unsigned alphaScale = s.fAlphaScale;
+        int y0, y1, wy;
+        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
 
-        const char* srcAddr = (const char*)s.fPixmap.addr();
-        size_t rb = s.fPixmap.rowBytes();
-        unsigned subY;
-        const SkPMColor* row0;
-        const SkPMColor* row1;
+        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
+             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
 
-        // setup row ptrs and update proc_table
-        {
-            uint32_t XY = *xy++;
-            unsigned y0 = XY >> 14;
-            row0 = (const SkPMColor*)(srcAddr + (y0 >> 4) * rb);
-            row1 = (const SkPMColor*)(srcAddr + (XY & 0x3FFF) * rb);
-            subY = y0 & 0xF;
-        }
+        while (count --> 0) {
+            int x0, x1, wx;
+            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
 
-        do {
-            uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
-            unsigned x0 = XX >> 14;
-            unsigned x1 = XX & 0x3FFF;
-            unsigned subX = x0 & 0xF;
-            x0 >>= 4;
-
-            filter_and_scale_by_alpha(subX, subY,
+            filter_and_scale_by_alpha(wx, wy,
                                       row0[x0], row0[x1],
                                       row1[x0], row1[x1],
-                                      colors,
-                                      alphaScale);
-            colors += 1;
-
-        } while (--count != 0);
+                                      colors++,
+                                      s.fAlphaScale);
+        }
     }
 
 #endif
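
For reference, the coordinate packing that
decode_packed_coordinates_and_weight() undoes is x0:14 | w:4 | x1:14 in
a single uint32_t, with w the weight for the second coordinate.  A
hypothetical round-trip, just to make the layout concrete:

    // Pack x0 = 5, w = 9, x1 = 6 the way the sampler expects.
    uint32_t packed = (5 << 18) | (9 << 14) | 6;

    int x0, x1, w;
    decode_packed_coordinates_and_weight(packed, &x0, &x1, &w);
    // Now x0 == 5, x1 == 6, w == 9, and the filtered sample in that
    // dimension is (16-w)/16 of pixel x0 plus w/16 of pixel x1.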