SkPMFloat::trunc()

Add and test trunc(), which behaves the way get() did before get() started rounding.
Using trunc() is a ~40% speedup on our linear gradient bench.

#neon #floats
BUG=skia:3592
#n5
#n9
CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus5-Adreno330-Arm7-Debug-Trybot;client.skia.android:Test-Android-Nexus9-TegraK1-Arm64-Release-Trybot

Review URL: https://codereview.chromium.org/1032243002
diff --git a/HASHTAGS b/HASHTAGS
index 8c65ce4..ec76d96 100644
--- a/HASHTAGS
+++ b/HASHTAGS
@@ -14,6 +14,8 @@
 nocommit,COMMIT=false
 
 floats,BUG=skia:3592
+neon,#n5,#n9
+n5,CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus5-Adreno330-Arm7-Debug-Trybot
 n7,CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus7-Tegra3-Arm7-Debug-Trybot
 n9,CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus9-TegraK1-Arm64-Release-Trybot
 
diff --git a/bench/PMFloatBench.cpp b/bench/PMFloatBench.cpp
index d748144..09819e9 100644
--- a/bench/PMFloatBench.cpp
+++ b/bench/PMFloatBench.cpp
@@ -112,15 +112,15 @@
              dcdx4(dcdx+dcdx+dcdx+dcdx);
 
         for (int n = 0; n < loops; n++) {
-            Sk4f a = c0 + dc*fx,  // TODO: add 0.5f, here call trunc() instead of get().
+            Sk4f a = c0 + dc*fx + Sk4f(0.5f),  // The +0.5f lets us call trunc() instead of get().
                  b = a + dcdx,
                  c = b + dcdx,
                  d = c + dcdx;
             for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
-                fDevice[i+0] = SkPMFloat(a).get();
-                fDevice[i+1] = SkPMFloat(b).get();
-                fDevice[i+2] = SkPMFloat(c).get();
-                fDevice[i+3] = SkPMFloat(d).get();
+                fDevice[i+0] = SkPMFloat(a).trunc();
+                fDevice[i+1] = SkPMFloat(b).trunc();
+                fDevice[i+2] = SkPMFloat(c).trunc();
+                fDevice[i+3] = SkPMFloat(d).trunc();
                 a += dcdx4;
                 b += dcdx4;
                 c += dcdx4;
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index 1d034f0..66262a8 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -57,6 +57,10 @@
     SkPMColor     get() const;  // May SkASSERT(this->isValid()).  Some implementations may clamp.
     SkPMColor clamped() const;  // Will clamp all values to [0, 255].  Then may assert isValid().
 
+    // Like get(), but truncates instead of rounding.
+    // The domain of this function is (-1.0f, 256.0f).  Values in (-1.0f, 0.0f] truncate to zero.
+    SkPMColor trunc() const;
+
     // 4-at-a-time versions of get() and clamped().  Like From4PMColors(), no alignment assumed.
     static void To4PMColors(
             const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h
index 156c0c9..fa920d7 100644
--- a/src/opts/SkPMFloat_SSE2.h
+++ b/src/opts/SkPMFloat_SSE2.h
@@ -41,6 +41,16 @@
     return c;
 }
 
+inline SkPMColor SkPMFloat::trunc() const {
+    // Same as clamped(), but truncates (cvttps) instead of rounding; packus saturates as it narrows.
+    __m128i fix8_32 = _mm_cvttps_epi32(fColors),
+            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
+            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
+    SkPMColor c = _mm_cvtsi128_si32(fix8);
+    SkPMColorAssert(c);
+    return c;
+}
+
 inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
                                      SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
     // Haven't beaten this yet.
diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h
index fca4197..6ff6929 100644
--- a/src/opts/SkPMFloat_SSSE3.h
+++ b/src/opts/SkPMFloat_SSSE3.h
@@ -27,17 +27,20 @@
     SkASSERT(this->isValid());
 }
 
-inline SkPMColor SkPMFloat::get() const {
-    SkASSERT(this->isValid());
+inline SkPMColor SkPMFloat::trunc() const {
     const int _ = 255;  // _ means to zero that byte.
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)),
+    __m128i fix8_32 = _mm_cvttps_epi32(fColors),  // cvttps truncates toward zero.
             fix8    = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
     SkPMColor c = _mm_cvtsi128_si32(fix8);
     SkPMColorAssert(c);
     return c;
 }
 
+inline SkPMColor SkPMFloat::get() const {
+    SkASSERT(this->isValid());
+    return SkPMFloat(Sk4f(0.5f) + *this).trunc();  // Round by adding 0.5f, then truncating.
+}
+
 inline SkPMColor SkPMFloat::clamped() const {
     // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
     __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)),
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index 780981b..e5b16f5 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -26,10 +26,8 @@
     SkASSERT(this->isValid());
 }
 
-inline SkPMColor SkPMFloat::get() const {
-    SkASSERT(this->isValid());
-    float32x4_t add_half = vaddq_f32(fColors, vdupq_n_f32(0.5f));
-    uint32x4_t  fix8_32  = vcvtq_u32_f32(add_half);  // vcvtq_u32_f32 truncates, so round manually
+inline SkPMColor SkPMFloat::trunc() const {
+    uint32x4_t  fix8_32  = vcvtq_u32_f32(fColors);  // vcvtq_u32_f32 truncates
     uint16x4_t  fix8_16  = vmovn_u32(fix8_32);
     uint8x8_t   fix8     = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
     SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
@@ -37,6 +35,11 @@
     return c;
 }
 
+inline SkPMColor SkPMFloat::get() const {
+    SkASSERT(this->isValid());
+    return SkPMFloat(Sk4f(0.5f) + *this).trunc();  // Round by adding 0.5f, then truncating.
+}
+
 inline SkPMColor SkPMFloat::clamped() const {
     float32x4_t add_half = vaddq_f32(fColors, vdupq_n_f32(0.5f));
     uint32x4_t  fix8_32  = vcvtq_u32_f32(add_half);  // vcvtq_u32_f32 truncates, so round manually
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index 00705aa..86516b1 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -18,6 +18,10 @@
     SkASSERT(this->isValid());
 }
 
+inline SkPMColor SkPMFloat::trunc() const {
+    return SkPackARGB32(this->a(), this->r(), this->g(), this->b());  // No +0.5f, unlike get().
+}
+
 inline SkPMColor SkPMFloat::get() const {
     SkASSERT(this->isValid());
     return SkPackARGB32(this->a()+0.5f, this->r()+0.5f, this->g()+0.5f, this->b()+0.5f);
diff --git a/tests/PMFloatTest.cpp b/tests/PMFloatTest.cpp
index 0f0d853..309cd60 100644
--- a/tests/PMFloatTest.cpp
+++ b/tests/PMFloatTest.cpp
@@ -22,6 +22,9 @@
     pmf = SkPMFloat(254.5f, 203.5f, 153.1f, 50.8f);
     REPORTER_ASSERT(r, c == pmf.get());
 
+    pmf = SkPMFloat(255.9f, 204.01f, 153.0f, -0.9f);
+    REPORTER_ASSERT(r, SkPreMultiplyColor(0xFFCC9900) == pmf.trunc());
+
     // Test clamping.
     SkPMFloat clamped(SkPMFloat(510.0f, 153.0f, 1.0f, -0.2f).clamped());
     REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, clamped.a()));