remove meaningless matrix benches, add mapPts() and add new benches

mapPts() is definitely faster than mapPoints() (identity and perspective run at the same speed), by up to 3x for large values of N.

cloned from https://codereview.chromium.org/1031443002/

BUG=skia:

Review URL: https://codereview.chromium.org/1030653002
diff --git a/bench/MatrixBench.cpp b/bench/MatrixBench.cpp
index 1a35c2e..1d8e811 100644
--- a/bench/MatrixBench.cpp
+++ b/bench/MatrixBench.cpp
@@ -94,125 +94,6 @@
     }
 }
 
-// Test the performance of setConcat() non-perspective case:
-// using floating point precision only.
-class FloatConcatMatrixBench : public MatrixBench {
-public:
-    FloatConcatMatrixBench() : INHERITED("concat_floatfloat") {
-        init9(mya);
-        init9(myb);
-        init9(myr);
-    }
-protected:
-    virtual int mulLoopCount() const { return 4; }
-
-    static inline void muladdmul(float a, float b, float c, float d,
-                                   float* result) {
-      *result = a * b + c * d;
-    }
-    virtual void performTest() {
-        const float* a = mya;
-        const float* b = myb;
-        float* r = myr;
-        muladdmul(a[0], b[0], a[1], b[3], &r[0]);
-        muladdmul(a[0], b[1], a[1], b[4], &r[1]);
-        muladdmul(a[0], b[2], a[1], b[5], &r[2]);
-        r[2] += a[2];
-        muladdmul(a[3], b[0], a[4], b[3], &r[3]);
-        muladdmul(a[3], b[1], a[4], b[4], &r[4]);
-        muladdmul(a[3], b[2], a[4], b[5], &r[5]);
-        r[5] += a[5];
-        r[6] = r[7] = 0.0f;
-        r[8] = 1.0f;
-    }
-private:
-    float mya [9];
-    float myb [9];
-    float myr [9];
-    typedef MatrixBench INHERITED;
-};
-
-static inline float SkDoubleToFloat(double x) {
-    return static_cast<float>(x);
-}
-
-// Test the performance of setConcat() non-perspective case:
-// using floating point precision but casting up to float for
-// intermediate results during computations.
-class FloatDoubleConcatMatrixBench : public MatrixBench {
-public:
-    FloatDoubleConcatMatrixBench() : INHERITED("concat_floatdouble") {
-        init9(mya);
-        init9(myb);
-        init9(myr);
-    }
-protected:
-    virtual int mulLoopCount() const { return 4; }
-
-    static inline void muladdmul(float a, float b, float c, float d,
-                                   float* result) {
-      *result = SkDoubleToFloat((double)a * b + (double)c * d);
-    }
-    virtual void performTest() {
-        const float* a = mya;
-        const float* b = myb;
-        float* r = myr;
-        muladdmul(a[0], b[0], a[1], b[3], &r[0]);
-        muladdmul(a[0], b[1], a[1], b[4], &r[1]);
-        muladdmul(a[0], b[2], a[1], b[5], &r[2]);
-        r[2] += a[2];
-        muladdmul(a[3], b[0], a[4], b[3], &r[3]);
-        muladdmul(a[3], b[1], a[4], b[4], &r[4]);
-        muladdmul(a[3], b[2], a[4], b[5], &r[5]);
-        r[5] += a[5];
-        r[6] = r[7] = 0.0f;
-        r[8] = 1.0f;
-    }
-private:
-    float mya [9];
-    float myb [9];
-    float myr [9];
-    typedef MatrixBench INHERITED;
-};
-
-// Test the performance of setConcat() non-perspective case:
-// using double precision only.
-class DoubleConcatMatrixBench : public MatrixBench {
-public:
-    DoubleConcatMatrixBench() : INHERITED("concat_double") {
-        init9(mya);
-        init9(myb);
-        init9(myr);
-    }
-protected:
-    virtual int mulLoopCount() const { return 4; }
-
-    static inline void muladdmul(double a, double b, double c, double d,
-                                   double* result) {
-      *result = a * b + c * d;
-    }
-    virtual void performTest() {
-        const double* a = mya;
-        const double* b = myb;
-        double* r = myr;
-        muladdmul(a[0], b[0], a[1], b[3], &r[0]);
-        muladdmul(a[0], b[1], a[1], b[4], &r[1]);
-        muladdmul(a[0], b[2], a[1], b[5], &r[2]);
-        r[2] += a[2];
-        muladdmul(a[3], b[0], a[4], b[3], &r[3]);
-        muladdmul(a[3], b[1], a[4], b[4], &r[4]);
-        muladdmul(a[3], b[2], a[4], b[5], &r[5]);
-        r[5] += a[5];
-        r[6] = r[7] = 0.0;
-        r[8] = 1.0;
-    }
-private:
-    double mya [9];
-    double myb [9];
-    double myr [9];
-    typedef MatrixBench INHERITED;
-};
-
 class GetTypeMatrixBench : public MatrixBench {
 public:
     GetTypeMatrixBench()
@@ -260,87 +141,6 @@
     typedef MatrixBench INHERITED;
 };
 
-class ScaleTransMixedMatrixBench : public MatrixBench {
- public:
-    ScaleTransMixedMatrixBench() : INHERITED("scaletrans_mixed") {
-        fMatrix.setAll(fRandom.nextSScalar1(), fRandom.nextSScalar1(), fRandom.nextSScalar1(),
-                       fRandom.nextSScalar1(), fRandom.nextSScalar1(), fRandom.nextSScalar1(),
-                       fRandom.nextSScalar1(), fRandom.nextSScalar1(), fRandom.nextSScalar1());
-        int i;
-        for (i = 0; i < kCount; i++) {
-            fSrc[i].fX = fRandom.nextSScalar1();
-            fSrc[i].fY = fRandom.nextSScalar1();
-            fDst[i].fX = fRandom.nextSScalar1();
-            fDst[i].fY = fRandom.nextSScalar1();
-        }
-    }
- protected:
-    virtual void performTest() {
-        SkPoint* dst = fDst;
-        const SkPoint* src = fSrc;
-        int count = kCount;
-        float mx = fMatrix[SkMatrix::kMScaleX];
-        float my = fMatrix[SkMatrix::kMScaleY];
-        float tx = fMatrix[SkMatrix::kMTransX];
-        float ty = fMatrix[SkMatrix::kMTransY];
-        do {
-            dst->fY = SkScalarMulAdd(src->fY, my, ty);
-            dst->fX = SkScalarMulAdd(src->fX, mx, tx);
-            src += 1;
-            dst += 1;
-        } while (--count);
-    }
- private:
-    enum {
-        kCount = 16
-    };
-    SkMatrix fMatrix;
-    SkPoint fSrc [kCount];
-    SkPoint fDst [kCount];
-    SkRandom fRandom;
-    typedef MatrixBench INHERITED;
-};
-
-class ScaleTransDoubleMatrixBench : public MatrixBench {
- public:
-    ScaleTransDoubleMatrixBench() : INHERITED("scaletrans_double") {
-        init9(fMatrix);
-        int i;
-        for (i = 0; i < kCount; i++) {
-            fSrc[i].fX = fRandom.nextSScalar1();
-            fSrc[i].fY = fRandom.nextSScalar1();
-            fDst[i].fX = fRandom.nextSScalar1();
-            fDst[i].fY = fRandom.nextSScalar1();
-        }
-    }
- protected:
-    virtual void performTest() {
-        SkPoint* dst = fDst;
-        const SkPoint* src = fSrc;
-        int count = kCount;
-        // As doubles, on Z600 Linux systems this is 2.5x as expensive as mixed mode
-        float mx = (float) fMatrix[SkMatrix::kMScaleX];
-        float my = (float) fMatrix[SkMatrix::kMScaleY];
-        float tx = (float) fMatrix[SkMatrix::kMTransX];
-        float ty = (float) fMatrix[SkMatrix::kMTransY];
-        do {
-            dst->fY = src->fY * my + ty;
-            dst->fX = src->fX * mx + tx;
-            src += 1;
-            dst += 1;
-        } while (--count);
-    }
- private:
-    enum {
-        kCount = 16
-    };
-    double fMatrix [9];
-    SkPoint fSrc [kCount];
-    SkPoint fDst [kCount];
-    SkRandom fRandom;
-    typedef MatrixBench INHERITED;
-};
-
 class DecomposeMatrixBench : public MatrixBench {
 public:
     DecomposeMatrixBench() : INHERITED("decompose") {}
@@ -428,9 +228,6 @@
 
 DEF_BENCH( return new EqualsMatrixBench(); )
 DEF_BENCH( return new ScaleMatrixBench(); )
-DEF_BENCH( return new FloatConcatMatrixBench(); )
-DEF_BENCH( return new FloatDoubleConcatMatrixBench(); )
-DEF_BENCH( return new DoubleConcatMatrixBench(); )
 DEF_BENCH( return new GetTypeMatrixBench(); )
 DEF_BENCH( return new DecomposeMatrixBench(); )
 
@@ -468,5 +265,49 @@
                            InvertMapRectMatrixBench::kRotate_Flag |
                            InvertMapRectMatrixBench::kTranslate_Flag); )
 
-DEF_BENCH( return new ScaleTransMixedMatrixBench(); )
-DEF_BENCH( return new ScaleTransDoubleMatrixBench(); )
+///////////////////////////////////////////////////////////////////////////////
+// Bench fixtures: identity, translate, translate+scale and translate+rotate matrices.
+static SkMatrix make_ident() { SkMatrix mat; mat.reset(); return mat; }
+static SkMatrix make_trans() { SkMatrix mat; mat.setTranslate(2, 3); return mat; }
+static SkMatrix make_scale() { SkMatrix mat = make_trans(); mat.postScale(1.5f, 0.5f); return mat; }
+static SkMatrix make_afine() { SkMatrix mat = make_trans(); mat.postRotate(15); return mat; }
+
+// Times N-point mapping through mapPts() (newWay true) or mapPoints() (newWay false).
+class MapPointsMatrixBench : public MatrixBench {
+protected:
+    enum {
+        N = 32,              // points mapped per call
+        kRepeats = 1000000   // map calls issued per performTest()
+    };
+    SkMatrix   fM;
+    SkPoint    fSrc[N], fDst[N];
+    const bool fNewWay;
+public:
+    MapPointsMatrixBench(const char name[], const SkMatrix& m, bool newWay)
+        : MatrixBench(name), fM(m), fNewWay(newWay) {
+        SkRandom rand;
+        for (int i = 0; i < N; ++i) {
+            // Draw x then y in separate statements: argument evaluation order is unspecified.
+            fSrc[i].fX = rand.nextSScalar1();
+            fSrc[i].fY = rand.nextSScalar1();
+        }
+    }
+
+    void performTest() SK_OVERRIDE {
+        // Branch once, outside the timed loops, so neither path pays for the test per call.
+        if (fNewWay) {
+            for (int i = 0; i < kRepeats; ++i) { fM.mapPts(fDst, fSrc, N); }
+        } else {
+            for (int i = 0; i < kRepeats; ++i) { fM.mapPoints(fDst, fSrc, N); }
+        }
+    }
+};
+DEF_BENCH( return new MapPointsMatrixBench("mappts_ident0", make_ident(), false); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_ident1", make_ident(), true); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_trans0", make_trans(), false); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_trans1", make_trans(), true); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_scale0", make_scale(), false); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_scale1", make_scale(), true); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_afine0", make_afine(), false); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_afine1", make_afine(), true); )
+// Naming of the benches above: suffix 0 runs mapPoints() (newWay false), suffix 1 runs mapPts().
diff --git a/include/core/SkMatrix.h b/include/core/SkMatrix.h
index b2ee3c6..d00beda 100644
--- a/include/core/SkMatrix.h
+++ b/include/core/SkMatrix.h
@@ -414,6 +414,8 @@
         this->mapPoints(pts, pts, count);
     }
 
+    void mapPts(SkPoint dst[], const SkPoint src[], int count) const;  // mapPoints() alternative
+
     /** Like mapPoints but with custom byte stride between the points. Stride
      *  should be a multiple of sizeof(SkScalar).
      */
diff --git a/src/core/SkMatrix.cpp b/src/core/SkMatrix.cpp
index 753c4dc..81b8960 100644
--- a/src/core/SkMatrix.cpp
+++ b/src/core/SkMatrix.cpp
@@ -1020,6 +1020,90 @@
     this->getMapPtsProc()(*this, dst, src, count);
 }
 
+#include "Sk4x.h"
+
+// Maps count points from src into dst, choosing a path from the matrix type: identity copies,
+// translate / scale / affine run two points per Sk4f, and perspective defers to mapPoints().
+void SkMatrix::mapPts(SkPoint dst[], const SkPoint src[], int count) const {
+    if (count <= 0) {
+        return;
+    }
+
+    const unsigned typeBits = this->getType() & 0xF;
+    if (kIdentity_Mask == typeBits) {
+        if (dst != src) {   // an in-place identity map has nothing to copy
+            memcpy(dst, src, count * sizeof(SkPoint));
+        }
+        return;
+    }
+
+    // The vector loops consume points in pairs; an odd count is evened out by mapping the
+    // first point with scalar math before the loop starts.
+    const bool hasOddPoint = (count & 1) != 0;
+    const int  pairCount   = count >> 1;
+    if (kTranslate_Mask == typeBits) {
+        const SkScalar tx = this->getTranslateX();
+        const SkScalar ty = this->getTranslateY();
+        if (hasOddPoint) {
+            dst->fX = src->fX + tx;
+            dst->fY = src->fY + ty;
+            ++src;
+            ++dst;
+        }
+        Sk4f trans4(tx, ty, tx, ty);
+        for (int pair = 0; pair < pairCount; ++pair, src += 2, dst += 2) {
+            (Sk4f::Load(&src->fX) + trans4).store(&dst->fX);
+        }
+        return;
+    }
+
+    if (typeBits <= kScale_Mask + kTranslate_Mask) {
+        const SkScalar tx = this->getTranslateX();
+        const SkScalar ty = this->getTranslateY();
+        const SkScalar sx = this->getScaleX();
+        const SkScalar sy = this->getScaleY();
+        if (hasOddPoint) {
+            dst->fX = src->fX * sx + tx;
+            dst->fY = src->fY * sy + ty;
+            ++src;
+            ++dst;
+        }
+        Sk4f trans4(tx, ty, tx, ty);
+        Sk4f scale4(sx, sy, sx, sy);
+        for (int pair = 0; pair < pairCount; ++pair, src += 2, dst += 2) {
+            (Sk4f::Load(&src->fX) * scale4 + trans4).store(&dst->fX);
+        }
+        return;
+    }
+
+    if (typeBits < kPerspective_Mask) {   // affine
+        const SkScalar tx = this->getTranslateX();
+        const SkScalar ty = this->getTranslateY();
+        const SkScalar sx = this->getScaleX();
+        const SkScalar sy = this->getScaleY();
+        const SkScalar kx = this->getSkewX();
+        const SkScalar ky = this->getSkewY();
+        if (hasOddPoint) {
+            dst->set(src->fX * sx + src->fY * kx + tx,
+                     src->fX * ky + src->fY * sy + ty);
+            ++src;
+            ++dst;
+        }
+        Sk4f trans4(tx, ty, tx, ty);
+        Sk4f scale4(sx, sy, sx, sy);
+        Sk4f  skew4(kx, ky, kx, ky);    // multiplies the x/y-swapped copy of each source pair
+        for (int pair = 0; pair < pairCount; ++pair, src += 2, dst += 2) {
+            Sk4f src4 = Sk4f::Load(&src->fX);
+            Sk4f swz4(src[0].fY, src[0].fX, src[1].fY, src[1].fX);  // (x0,y0,x1,y1) -> (y0,x0,y1,x1)
+            (src4 * scale4 + swz4 * skew4 + trans4).store(&dst->fX);
+        }
+        return;
+    }
+
+    // Perspective: defer to the general routine.
+    this->mapPoints(dst, src, count);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 void SkMatrix::mapHomogeneousPoints(SkScalar dst[], const SkScalar src[], int count) const {
diff --git a/tests/MatrixTest.cpp b/tests/MatrixTest.cpp
index 95d33ac..35306b3 100644
--- a/tests/MatrixTest.cpp
+++ b/tests/MatrixTest.cpp
@@ -799,6 +799,48 @@
     REPORTER_ASSERT(reporter, !check_decompScale(m));
 }
 
+// Asserts that mapPts() agrees (within SkScalarNearlyEqual) with mapPoints() on n random points.
+static void test_mappts(skiatest::Reporter* reporter, const SkMatrix& m, const char type[], int n) {
+    const int MAX = 100;
+    SkASSERT(n <= MAX);
+    SkPoint src[MAX], dst0[MAX], dst1[MAX];
+
+    SkRandom rand;
+    for (int i = 0; i < n; ++i) {
+        src[i].fX = rand.nextSScalar1() * 100;
+        src[i].fY = rand.nextSScalar1() * 100;
+    }
+
+    m.mapPoints(dst0, src, n);   // reference result
+    m.mapPts(   dst1, src, n);   // result under test
+    for (int i = 0; i < n; ++i) {
+        const SkPoint& ref = dst0[i];
+        const SkPoint& got = dst1[i];
+        const bool eq = SkScalarNearlyEqual(ref.fX, got.fX) && SkScalarNearlyEqual(ref.fY, got.fY);
+        if (eq) { continue; }
+        SkDebugf("%s [%d] points (%g %g) pts (%g %g)\n", type, i, ref.fX, ref.fY, got.fX, got.fY);
+        REPORTER_ASSERT(reporter, eq);   // the line logged above names the matrix type and index
+    }
+}
+
+// Compares mapPts() with mapPoints() across matrix types, for empty, odd and even point counts.
+static void test_mappts(skiatest::Reporter* reporter) {
+    const int counts[] = { 0, 1, 2, 3, 4, 100 };
+    for (const int* n = counts; n != counts + SK_ARRAY_COUNT(counts); ++n) {
+        SkMatrix m;   // mutated cumulatively: translate, then post-scale, post-rotate, perspective
+        m.reset();
+        test_mappts(reporter, m, "ident", *n);
+        m.setTranslate(2, 3);
+        test_mappts(reporter, m, "trans", *n);
+        m.postScale(2, 0.5f);
+        test_mappts(reporter, m, "scale", *n);
+        m.postRotate(35);
+        test_mappts(reporter, m, "affine", *n);
+        m.setPerspX(0.1f);
+        test_mappts(reporter, m, "persp", *n);
+    }
+}
+
 DEF_TEST(Matrix, reporter) {
     SkMatrix    mat, inverse, iden1, iden2;
 
@@ -919,6 +961,8 @@
     test_set9(reporter);
 
     test_decompScale(reporter);
+
+    test_mappts(reporter);
 }
 
 DEF_TEST(Matrix_Concat, r) {