Update 4-at-a-time APIs.

There is no reason to require the 4 SkPMFloats (registers) to be adjacent in memory.
The only potential win in loads and stores comes from the 4 SkPMColors being adjacent.

This makes no difference to the existing benchmark.

BUG=skia:

Review URL: https://codereview.chromium.org/1035583002
diff --git a/bench/PMFloatBench.cpp b/bench/PMFloatBench.cpp
index 1da667f..f3caea5 100644
--- a/bench/PMFloatBench.cpp
+++ b/bench/PMFloatBench.cpp
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 #include "Benchmark.h"
 #include "SkPMFloat.h"
 
@@ -49,21 +56,32 @@
             colors[3] = seed + 3;
         #endif
 
-            SkPMFloat floats[4];
+            SkPMFloat fa,fb,fc,fd;
             if (kWide) {
-                SkPMFloat::From4PMColors(floats, colors);
+                SkPMFloat::From4PMColors(colors, &fa, &fb, &fc, &fd);
             } else {
-                for (int i = 0; i < 4; i++) {
-                    floats[i] = SkPMFloat::FromPMColor(colors[i]);
-                }
+                fa = SkPMFloat::FromPMColor(colors[0]);
+                fb = SkPMFloat::FromPMColor(colors[1]);
+                fc = SkPMFloat::FromPMColor(colors[2]);
+                fd = SkPMFloat::FromPMColor(colors[3]);
             }
 
             SkPMColor back[4];
             switch (kClamp << 1 | kWide) {
-                case 0: for (int i = 0; i < 4; i++) { back[i] = floats[i].get(); }     break;
-                case 1: SkPMFloat::To4PMColors(back, floats);                          break;
-                case 2: for (int i = 0; i < 4; i++) { back[i] = floats[i].clamped(); } break;
-                case 3: SkPMFloat::ClampTo4PMColors(back, floats);                     break;
+                case 0: {
+                    back[0] = fa.get();
+                    back[1] = fb.get();
+                    back[2] = fc.get();
+                    back[3] = fd.get();
+                } break;
+                case 1: SkPMFloat::To4PMColors(fa, fb, fc, fd, back); break;
+                case 2: {
+                    back[0] = fa.clamped();
+                    back[1] = fb.clamped();
+                    back[2] = fc.clamped();
+                    back[3] = fd.clamped();
+                } break;
+                case 3: SkPMFloat::ClampTo4PMColors(fa, fb, fc, fd, back); break;
             }
             for (int i = 0; i < 4; i++) {
                 junk ^= back[i];
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index 010974d..1d034f0 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 #ifndef SkPM_DEFINED
 #define SkPM_DEFINED
 
@@ -20,7 +27,7 @@
     static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
 
     // May be more efficient than one at a time.  No special alignment assumed for SkPMColors.
-    static void From4PMColors(SkPMFloat[4], const SkPMColor[4]);
+    static void From4PMColors(const SkPMColor[4], SkPMFloat*, SkPMFloat*, SkPMFloat*, SkPMFloat*);
 
     explicit SkPMFloat(SkPMColor);
     SkPMFloat(float a, float r, float g, float b) {
@@ -51,8 +58,10 @@
     SkPMColor clamped() const;  // Will clamp all values to [0, 255].  Then may assert isValid().
 
     // 4-at-a-time versions of get() and clamped().  Like From4PMColors(), no alignment assumed.
-    static void To4PMColors(SkPMColor[4], const SkPMFloat[4]);
-    static void ClampTo4PMColors(SkPMColor[4], const SkPMFloat[4]);
+    static void To4PMColors(
+            const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
+    static void ClampTo4PMColors(
+            const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
 
     bool isValid() const {
         return this->a() >= 0 && this->a() <= 255
diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h
index 2a85b1a..156c0c9 100644
--- a/src/opts/SkPMFloat_SSE2.h
+++ b/src/opts/SkPMFloat_SSE2.h
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
     fColors = that.fColors;
     return *this;
@@ -34,25 +41,31 @@
     return c;
 }
 
-inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) {
+inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
+                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
     // Haven't beaten this yet.
-    for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); }
+    *a = FromPMColor(colors[0]);
+    *b = FromPMColor(colors[1]);
+    *c = FromPMColor(colors[2]);
+    *d = FromPMColor(colors[3]);
 }
 
-inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
-    SkASSERT(floats[0].isValid() && floats[1].isValid()
-          && floats[2].isValid() && floats[3].isValid());
+inline void SkPMFloat::To4PMColors(
+        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
+        SkPMColor colors[4]) {
     // Haven't beaten this yet.
-    ClampTo4PMColors(colors, floats);
+    ClampTo4PMColors(a,b,c,d, colors);
 }
 
-inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
+inline void SkPMFloat::ClampTo4PMColors(
+        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
+        SkPMColor colors[4]) {
     // Same as _SSSE3.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
     // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColors)),
-            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColors)),
-            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColors)),
-            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColors));
+    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fColors)),
+            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fColors)),
+            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fColors)),
+            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fColors));
     __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
                                      _mm_packus_epi16(c2, c3));
     _mm_storeu_si128((__m128i*)colors, c3210);
diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h
index ab54caf..fca4197 100644
--- a/src/opts/SkPMFloat_SSSE3.h
+++ b/src/opts/SkPMFloat_SSSE3.h
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
     fColors = that.fColors;
     return *this;
@@ -41,23 +48,34 @@
     return c;
 }
 
-inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) {
+inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
+                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
     // Haven't beaten this yet.
-    for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); }
+    *a = FromPMColor(colors[0]);
+    *b = FromPMColor(colors[1]);
+    *c = FromPMColor(colors[2]);
+    *d = FromPMColor(colors[3]);
 }
 
-inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
-    // Haven't beaten this yet.  Still faster than ClampTo4PMColors too.
-    for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); }
+inline void SkPMFloat::To4PMColors(
+        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
+        SkPMColor colors[4]) {
+    // Haven't beaten this yet.  Still faster than ClampTo4PMColors?
+    colors[0] = a.get();
+    colors[1] = b.get();
+    colors[2] = c.get();
+    colors[3] = d.get();
 }
 
-inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
+inline void SkPMFloat::ClampTo4PMColors(
+        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
+        SkPMColor colors[4]) {
     // Same as _SSE2.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
     // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColors)),
-            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColors)),
-            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColors)),
-            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColors));
+    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fColors)),
+            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fColors)),
+            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fColors)),
+            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fColors));
     __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
                                      _mm_packus_epi16(c2, c3));
     _mm_storeu_si128((__m128i*)colors, c3210);
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index 6c9df37..780981b 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
     fColors = that.fColors;
     return *this;
@@ -41,14 +48,28 @@
 }
 
 // TODO: we should be able to beat these loops on all three methods.
-inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) {
-    for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); }
+inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
+                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
+    *a = FromPMColor(colors[0]);
+    *b = FromPMColor(colors[1]);
+    *c = FromPMColor(colors[2]);
+    *d = FromPMColor(colors[3]);
 }
 
-inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
-    for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); }
+inline void SkPMFloat::To4PMColors(
+        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
+        SkPMColor colors[4]) {
+    colors[0] = a.get();
+    colors[1] = b.get();
+    colors[2] = c.get();
+    colors[3] = d.get();
 }
 
-inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
-    for (int i = 0; i < 4; i++) { colors[i] = floats[i].clamped(); }
+inline void SkPMFloat::ClampTo4PMColors(
+        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
+        SkPMColor colors[4]) {
+    colors[0] = a.clamped();
+    colors[1] = b.clamped();
+    colors[2] = c.clamped();
+    colors[3] = d.clamped();
 }
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index c47f8a3..00705aa 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
     for (int i = 0; i < 4; i++) { fColor[i] = that.fColor[i]; }
     return *this;
@@ -28,14 +35,28 @@
     return SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f);
 }
 
-inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) {
-    for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); }
+inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
+                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
+    *a = FromPMColor(colors[0]);
+    *b = FromPMColor(colors[1]);
+    *c = FromPMColor(colors[2]);
+    *d = FromPMColor(colors[3]);
 }
 
-inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
-    for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); }
+inline void SkPMFloat::To4PMColors(
+        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
+        SkPMColor colors[4]) {
+    colors[0] = a.get();
+    colors[1] = b.get();
+    colors[2] = c.get();
+    colors[3] = d.get();
 }
 
-inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
-    for (int i = 0; i < 4; i++) { colors[i] = floats[i].clamped(); }
+inline void SkPMFloat::ClampTo4PMColors(
+        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
+        SkPMColor colors[4]) {
+    colors[0] = a.clamped();
+    colors[1] = b.clamped();
+    colors[2] = c.clamped();
+    colors[3] = d.clamped();
 }
diff --git a/tests/PMFloatTest.cpp b/tests/PMFloatTest.cpp
index b1a4d04..0f0d853 100644
--- a/tests/PMFloatTest.cpp
+++ b/tests/PMFloatTest.cpp
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 #include "SkPMFloat.h"
 #include "Test.h"
 
@@ -33,15 +40,15 @@
     // Test 4-at-a-time conversions.
     SkPMColor colors[4] = { 0xFF000000, 0xFFFF0000, 0xFF00FF00, 0xFF0000FF };
     SkPMFloat floats[4];
-    SkPMFloat::From4PMColors(floats, colors);
+    SkPMFloat::From4PMColors(colors, floats+0, floats+1, floats+2, floats+3);
 
     SkPMColor back[4];
-    SkPMFloat::To4PMColors(back, floats);
+    SkPMFloat::To4PMColors(floats[0], floats[1], floats[2], floats[3], back);
     for (int i = 0; i < 4; i++) {
         REPORTER_ASSERT(r, back[i] == colors[i]);
     }
 
-    SkPMFloat::ClampTo4PMColors(back, floats);
+    SkPMFloat::ClampTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
     for (int i = 0; i < 4; i++) {
         REPORTER_ASSERT(r, back[i] == colors[i]);
     }