Mike's radial gradient CL with better float -> int.

patch from issue 1072303005 at patchset 40001 (http://crrev.com/1072303005#ps40001)

This looks quite launchable.  radial_gradient3, min of 100 samples:
  N5:  985µs -> 946µs
  MBP: 395µs -> 279µs

On my MBP, most of the meat looks like it's now in reading the cache and writing to dst one color at a time.  Is that something we could do in float math rather than with a lookup table?
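
For a two-stop gradient, a rough sketch of that float-math idea might look
something like this (hypothetical helper, not part of this CL; it ignores
the dithering the cache bakes in):

    // Blend the two stop colors directly in float instead of truncating the
    // clamped gradient position t in [0,1] into a cache index.
    static SkPMColor color_at(float t, SkColor c0, SkColor c1) {
        // Taking the channels as float avoids unsigned wrap-around in hi - lo.
        auto lerp = [t](float lo, float hi) { return lo + t * (hi - lo); };
        U8CPU a = (U8CPU)(lerp(SkColorGetA(c0), SkColorGetA(c1)) + 0.5f);
        U8CPU r = (U8CPU)(lerp(SkColorGetR(c0), SkColorGetR(c1)) + 0.5f);
        U8CPU g = (U8CPU)(lerp(SkColorGetG(c0), SkColorGetG(c1)) + 0.5f);
        U8CPU b = (U8CPU)(lerp(SkColorGetB(c0), SkColorGetB(c1)) + 0.5f);
        return SkPreMultiplyARGB(a, r, g, b);
    }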

BUG=skia:

CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Mac10.8-Clang-Arm7-Debug-Android-Trybot,Build-Ubuntu-GCC-Arm7-Release-Android_NoNeon-Trybot

Committed: https://skia.googlesource.com/skia/+/abf6c5cf95e921fae59efb487480e5b5081cf0ec

Review URL: https://codereview.chromium.org/1109643002
diff --git a/gm/gradients.cpp b/gm/gradients.cpp
index cedaff3..7ff9d38 100644
--- a/gm/gradients.cpp
+++ b/gm/gradients.cpp
@@ -448,7 +448,7 @@
         const SkScalar kRadius = 3000;
         const SkColor gColors[] = { 0xFFFFFFFF, 0xFF000000 };
         fShader.reset(SkGradientShader::CreateRadial(center, kRadius, gColors, NULL, 2,
-                                                    SkShader::kClamp_TileMode));
+                                                     SkShader::kClamp_TileMode));
     }
 
     void onDraw(SkCanvas* canvas) override {
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index 8244e90..65b5b97 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -39,6 +39,7 @@
 class SkNi {
 public:
     SkNi() {}
+    SkNi(const SkNi<N/2, T>& lo, const SkNi<N/2, T>& hi) : fLo(lo), fHi(hi) {}
     explicit SkNi(T val) : fLo(val), fHi(val) {}
     static SkNi Load(const T vals[N]) {
         return SkNi(SkNi<N/2,T>::Load(vals), SkNi<N/2,T>::Load(vals+N/2));
@@ -69,7 +70,6 @@
 
 private:
     REQUIRE(0 == (N & (N-1)));
-    SkNi(const SkNi<N/2, T>& lo, const SkNi<N/2, T>& hi) : fLo(lo), fHi(hi) {}
 
     SkNi<N/2, T> fLo, fHi;
 };
@@ -77,6 +77,10 @@
 template <int N, typename T>
 class SkNf {
     typedef SkNb<N, sizeof(T)> Nb;
+
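+    // Overload trick: decltype(MyNi(T())) resolves to int32_t when T is float
+    // and to int64_t when T is double, i.e. the integer type as wide as T.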
+    static int32_t MyNi(float);
+    static int64_t MyNi(double);
+    typedef SkNi<N, decltype(MyNi(T()))> Ni;
 public:
     SkNf() {}
     explicit SkNf(T val) : fLo(val),  fHi(val) {}
@@ -93,6 +97,8 @@
         fHi.store(vals+N/2);
     }
 
+    // Truncating lane-wise float -> int cast, like (int)f on each lane.
+    Ni castTrunc() const { return Ni(fLo.castTrunc(), fHi.castTrunc()); }
+
     SkNf operator + (const SkNf& o) const { return SkNf(fLo + o.fLo, fHi + o.fHi); }
     SkNf operator - (const SkNf& o) const { return SkNf(fLo - o.fLo, fHi - o.fHi); }
     SkNf operator * (const SkNf& o) const { return SkNf(fLo * o.fLo, fHi * o.fHi); }
@@ -172,6 +178,10 @@
 template <typename T>
 class SkNf<1,T> {
     typedef SkNb<1, sizeof(T)> Nb;
+
+    static int32_t MyNi(float);
+    static int64_t MyNi(double);
+    typedef SkNi<1, decltype(MyNi(T()))> Ni;
 public:
     SkNf() {}
     explicit SkNf(T val) : fVal(val) {}
@@ -179,6 +189,8 @@
 
     void store(T vals[1]) const { vals[0] = fVal; }
 
+    Ni castTrunc() const { return Ni(fVal); }
+
     SkNf operator + (const SkNf& o) const { return SkNf(fVal + o.fVal); }
     SkNf operator - (const SkNf& o) const { return SkNf(fVal - o.fVal); }
     SkNf operator * (const SkNf& o) const { return SkNf(fVal * o.fVal); }
@@ -248,4 +260,6 @@
 typedef SkNi<4, uint16_t> Sk4h;
 typedef SkNi<8, uint16_t> Sk8h;
 
+typedef SkNi<4, int> Sk4i;
+
 #endif//SkNx_DEFINED
diff --git a/src/effects/gradients/SkRadialGradient.cpp b/src/effects/gradients/SkRadialGradient.cpp
index b25a875..bf3c821 100644
--- a/src/effects/gradients/SkRadialGradient.cpp
+++ b/src/effects/gradients/SkRadialGradient.cpp
@@ -8,6 +8,7 @@
 
 #include "SkRadialGradient.h"
 #include "SkRadialGradient_Table.h"
+#include "SkNx.h"
 
 #define kSQRT_TABLE_BITS    11
 #define kSQRT_TABLE_SIZE    (1 << kSQRT_TABLE_BITS)
@@ -270,13 +271,16 @@
 namespace {
 
 inline bool radial_completely_pinned(int fx, int dx, int fy, int dy) {
-    // fast, overly-conservative test: checks unit square instead
-    // of unit circle
-    bool xClamped = (fx >= SK_FixedHalf && dx >= 0) ||
-                    (fx <= -SK_FixedHalf && dx <= 0);
-    bool yClamped = (fy >= SK_FixedHalf && dy >= 0) ||
-                    (fy <= -SK_FixedHalf && dy <= 0);
+    // fast, overly-conservative test: checks unit square instead of unit circle
+    bool xClamped = (fx >= SK_FixedHalf && dx >= 0) || (fx <= -SK_FixedHalf && dx <= 0);
+    bool yClamped = (fy >= SK_FixedHalf && dy >= 0) || (fy <= -SK_FixedHalf && dy <= 0);
+    return xClamped || yClamped;
+}
 
+inline bool radial_completely_pinned(SkScalar fx, SkScalar dx, SkScalar fy, SkScalar dy) {
+    // fast, overly-conservative test: checks unit square instead of unit circle
+    bool xClamped = (fx >= 1 && dx >= 0) || (fx <= -1 && dx <= 0);
+    bool yClamped = (fy >= 1 && dy >= 0) || (fy <= -1 && dy <= 0);
     return xClamped || yClamped;
 }
 
@@ -373,6 +377,70 @@
     }
 }
 
+// TODO: can we get away with the 0th approximation of inverse-sqrt (i.e. faster than rsqrt)?
+//       seems like ~10 bits is more than enough for our use, since we want a byte-index
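+// R * rsqrt(R) = R / sqrt(R) = sqrt(R), so this builds sqrt out of the much
+// cheaper reciprocal-sqrt estimate.  (One caution: rsqrt(0) is +inf, and
+// 0 * +inf is NaN, so R == 0 needs to stay out of here or get special-cased.)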
+static inline Sk4f fast_sqrt(const Sk4f& R) {
+    return R * R.rsqrt();
+}
+
+static inline Sk4f sum_squares(const Sk4f& a, const Sk4f& b) {
+    return a * a + b * b;
+}
+
+void shadeSpan_radial_clamp2(SkScalar sfx, SkScalar sdx, SkScalar sfy, SkScalar sdy,
+                             SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
+                             int count, int toggle) {
+    if (radial_completely_pinned(sfx, sdx, sfy, sdy)) {
+        unsigned fi = SkGradientShaderBase::kCache32Count - 1;
+        sk_memset32_dither(dstC,
+                           cache[toggle + fi],
+                           cache[next_dither_toggle(toggle) + fi],
+                           count);
+    } else {
+        const Sk4f max(255);
+        const float scale = 255;
+        sfx *= scale;
+        sfy *= scale;
+        sdx *= scale;
+        sdy *= scale;
+        const Sk4f fx4(sfx, sfx + sdx, sfx + 2*sdx, sfx + 3*sdx);
+        const Sk4f fy4(sfy, sfy + sdy, sfy + 2*sdy, sfy + 3*sdy);
+        const Sk4f dx4(sdx * 4);
+        const Sk4f dy4(sdy * 4);
+
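+        // R = fx^2 + fy^2 is quadratic in the pixel index, so we can march it
+        // four pixels at a time with second-order forward differences:
+        // each step, R += dR and dR += ddR.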
+        Sk4f tmpxy = fx4 * dx4 + fy4 * dy4;
+        Sk4f tmpdxdy = sum_squares(dx4, dy4);
+        Sk4f R = sum_squares(fx4, fy4);
+        Sk4f dR = tmpxy + tmpxy + tmpdxdy;
+        const Sk4f ddR = tmpdxdy + tmpdxdy;
+
+        for (int i = 0; i < (count >> 2); ++i) {
+            Sk4f dist = Sk4f::Min(fast_sqrt(R), max);
+            R += dR;
+            dR += ddR;
+
+            int fi[4];
+            dist.castTrunc().store(fi);
+
+            for (int j = 0; j < 4; j++) {
+                *dstC++ = cache[toggle + fi[j]];
+                toggle = next_dither_toggle(toggle);
+            }
+        }
+        count &= 3;
+        if (count) {
+            Sk4f dist = Sk4f::Min(fast_sqrt(R), max);
+
+            int fi[4];
+            dist.castTrunc().store(fi);
+            for (int i = 0; i < count; i++) {
+                *dstC++ = cache[toggle + fi[i]];
+                toggle = next_dither_toggle(toggle);
+            }
+        }
+    }
+}
+
 // Unrolling this loop doesn't seem to help (when float); we're stalling to
 // get the results of the sqrt (?), and don't have enough extra registers to
 // have many in flight.
@@ -407,6 +475,11 @@
 
 void SkRadialGradient::RadialGradientContext::shadeSpan(int x, int y,
                                                         SkPMColor* SK_RESTRICT dstC, int count) {
+#ifdef SK_SUPPORT_LEGACY_RADIAL_GRADIENT_SQRT
+    const bool use_new_proc = false;
+#else
+    const bool use_new_proc = true;
+#endif
     SkASSERT(count > 0);
 
     const SkRadialGradient& radialGradient = static_cast<const SkRadialGradient&>(fShader);
@@ -435,7 +508,7 @@
 
         RadialShadeProc shadeProc = shadeSpan_radial_repeat;
         if (SkShader::kClamp_TileMode == radialGradient.fTileMode) {
-            shadeProc = shadeSpan_radial_clamp;
+            shadeProc = use_new_proc ? shadeSpan_radial_clamp2 : shadeSpan_radial_clamp;
         } else if (SkShader::kMirror_TileMode == radialGradient.fTileMode) {
             shadeProc = shadeSpan_radial_mirror;
         } else {
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 04db878..6b21682 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -181,6 +181,48 @@
 #endif//defined(SK_CPU_ARM64)
 
 template <>
+class SkNi<4, int> {
+public:
+    SkNi(const int32x4_t& vec) : fVec(vec) {}
+
+    SkNi() {}
+    explicit SkNi(int val) : fVec(vdupq_n_s32(val)) {}
+    static SkNi Load(const int vals[4]) { return vld1q_s32(vals); }
+    SkNi(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; }
+
+    void store(int vals[4]) const { vst1q_s32(vals, fVec); }
+
+    SkNi operator + (const SkNi& o) const { return vaddq_s32(fVec, o.fVec); }
+    SkNi operator - (const SkNi& o) const { return vsubq_s32(fVec, o.fVec); }
+    SkNi operator * (const SkNi& o) const { return vmulq_s32(fVec, o.fVec); }
+
+    // Well, this is absurd.  The shifts require compile-time constant arguments.
+#define SHIFT(op, v, bits) switch(bits) { \
+    case  1: return op(v,  1);  case  2: return op(v,  2);  case  3: return op(v,  3); \
+    case  4: return op(v,  4);  case  5: return op(v,  5);  case  6: return op(v,  6); \
+    case  7: return op(v,  7);  case  8: return op(v,  8);  case  9: return op(v,  9); \
+    case 10: return op(v, 10);  case 11: return op(v, 11);  case 12: return op(v, 12); \
+    case 13: return op(v, 13);  case 14: return op(v, 14);  case 15: return op(v, 15); \
+    case 16: return op(v, 16);  case 17: return op(v, 17);  case 18: return op(v, 18); \
+    case 19: return op(v, 19);  case 20: return op(v, 20);  case 21: return op(v, 21); \
+    case 22: return op(v, 22);  case 23: return op(v, 23);  case 24: return op(v, 24); \
+    case 25: return op(v, 25);  case 26: return op(v, 26);  case 27: return op(v, 27); \
+    case 28: return op(v, 28);  case 29: return op(v, 29);  case 30: return op(v, 30); \
+    case 31: return op(v, 31); } return fVec
+
+    SkNi operator << (int bits) const { SHIFT(vshlq_n_s32, fVec, bits); }
+    SkNi operator >> (int bits) const { SHIFT(vshrq_n_s32, fVec, bits); }
+#undef SHIFT
+
+    template <int k> int kth() const {
+        SkASSERT(0 <= k && k < 4);
+        return vgetq_lane_s32(fVec, k&3);
+    }
+protected:
+    int32x4_t fVec;
+};
+
+template <>
 class SkNf<4, float> {
     typedef SkNb<4, 4> Nb;
 public:
@@ -193,6 +235,8 @@
 
     void store(float vals[4]) const { vst1q_f32(vals, fVec); }
 
+    SkNi<4, int> castTrunc() const { return vcvtq_s32_f32(fVec); }
+
     SkNf approxInvert() const {
         float32x4_t est0 = vrecpeq_f32(fVec),
                     est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 46ddcb2..2608525 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -142,6 +142,44 @@
 };
 
 template <>
+class SkNi<4, int> {
+public:
+    SkNi(const __m128i& vec) : fVec(vec) {}
+
+    SkNi() {}
+    explicit SkNi(int val) : fVec(_mm_set1_epi32(val)) {}
+    static SkNi Load(const int vals[4]) { return _mm_loadu_si128((const __m128i*)vals); }
+    SkNi(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
+
+    void store(int vals[4]) const { _mm_storeu_si128((__m128i*)vals, fVec); }
+
+    SkNi operator + (const SkNi& o) const { return _mm_add_epi32(fVec, o.fVec); }
+    SkNi operator - (const SkNi& o) const { return _mm_sub_epi32(fVec, o.fVec); }
+    SkNi operator * (const SkNi& o) const {
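+        // SSE2 has no 32-bit low multiply (_mm_mullo_epi32 is SSE4.1), so we
+        // do 32x32 -> 64 multiplies on the even and odd lanes separately and
+        // interleave the low 32 bits of each product back together.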
+        __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
+                mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
+        return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
+                                  _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
+    }
+
+    SkNi operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
+    SkNi operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }
+
+    template <int k> int kth() const {
+        SkASSERT(0 <= k && k < 4);
+        switch (k) {
+            case 0: return _mm_cvtsi128_si32(fVec);
+            case 1: return _mm_cvtsi128_si32(_mm_srli_si128(fVec,  4));
+            case 2: return _mm_cvtsi128_si32(_mm_srli_si128(fVec,  8));
+            case 3: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 12));
+            default: SkASSERT(false); return 0;
+        }
+    }
+protected:
+    __m128i fVec;
+};
+
+template <>
 class SkNf<4, float> {
     typedef SkNb<4, 4> Nb;
 public:
@@ -154,6 +192,8 @@
 
     void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); }
 
+    SkNi<4, int> castTrunc() const { return _mm_cvttps_epi32(fVec); }
+
     SkNf operator + (const SkNf& o) const { return _mm_add_ps(fVec, o.fVec); }
     SkNf operator - (const SkNf& o) const { return _mm_sub_ps(fVec, o.fVec); }
     SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); }
diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp
index af6918e..25ece38 100644
--- a/tests/SkNxTest.cpp
+++ b/tests/SkNxTest.cpp
@@ -89,6 +89,12 @@
           case 4: REPORTER_ASSERT(r, vals[2] == c && vals[3] == d);
           case 2: REPORTER_ASSERT(r, vals[0] == a && vals[1] == b);
         }
+        switch (N) {
+          case 8: REPORTER_ASSERT(r, v.template kth<4>() == e && v.template kth<5>() == f &&
+                                     v.template kth<6>() == g && v.template kth<7>() == h);
+          case 4: REPORTER_ASSERT(r, v.template kth<2>() == c && v.template kth<3>() == d);
+          case 2: REPORTER_ASSERT(r, v.template kth<0>() == a && v.template kth<1>() == b);
+        }
     };
 
     T vals[] = { 1,2,3,4,5,6,7,8 };
@@ -117,4 +123,8 @@
     test_Ni<2, uint16_t>(r);
     test_Ni<4, uint16_t>(r);
     test_Ni<8, uint16_t>(r);
+
+    test_Ni<2, int>(r);
+    test_Ni<4, int>(r);
+    test_Ni<8, int>(r);
 }