diff --git a/bench/MatrixBench.cpp b/bench/MatrixBench.cpp
index 1a35c2e..1d8e811 100644
--- a/bench/MatrixBench.cpp
+++ b/bench/MatrixBench.cpp
@@ -94,125 +94,6 @@
     }
 }
 
-// Test the performance of setConcat() non-perspective case:
-// using floating point precision only.
-class FloatConcatMatrixBench : public MatrixBench {
-public:
-    FloatConcatMatrixBench() : INHERITED("concat_floatfloat") {
-        init9(mya);
-        init9(myb);
-        init9(myr);
-    }
-protected:
-    virtual int mulLoopCount() const { return 4; }
-
-    static inline void muladdmul(float a, float b, float c, float d,
-                                   float* result) {
-      *result = a * b + c * d;
-    }
-    virtual void performTest() {
-        const float* a = mya;
-        const float* b = myb;
-        float* r = myr;
-        muladdmul(a[0], b[0], a[1], b[3], &r[0]);
-        muladdmul(a[0], b[1], a[1], b[4], &r[1]);
-        muladdmul(a[0], b[2], a[1], b[5], &r[2]);
-        r[2] += a[2];
-        muladdmul(a[3], b[0], a[4], b[3], &r[3]);
-        muladdmul(a[3], b[1], a[4], b[4], &r[4]);
-        muladdmul(a[3], b[2], a[4], b[5], &r[5]);
-        r[5] += a[5];
-        r[6] = r[7] = 0.0f;
-        r[8] = 1.0f;
-    }
-private:
-    float mya [9];
-    float myb [9];
-    float myr [9];
-    typedef MatrixBench INHERITED;
-};
-
-static inline float SkDoubleToFloat(double x) {
-    return static_cast<float>(x);
-}
-
-// Test the performance of setConcat() non-perspective case:
-// using floating point precision but casting up to float for
-// intermediate results during computations.
-class FloatDoubleConcatMatrixBench : public MatrixBench {
-public:
-    FloatDoubleConcatMatrixBench() : INHERITED("concat_floatdouble") {
-        init9(mya);
-        init9(myb);
-        init9(myr);
-    }
-protected:
-    virtual int mulLoopCount() const { return 4; }
-
-    static inline void muladdmul(float a, float b, float c, float d,
-                                   float* result) {
-      *result = SkDoubleToFloat((double)a * b + (double)c * d);
-    }
-    virtual void performTest() {
-        const float* a = mya;
-        const float* b = myb;
-        float* r = myr;
-        muladdmul(a[0], b[0], a[1], b[3], &r[0]);
-        muladdmul(a[0], b[1], a[1], b[4], &r[1]);
-        muladdmul(a[0], b[2], a[1], b[5], &r[2]);
-        r[2] += a[2];
-        muladdmul(a[3], b[0], a[4], b[3], &r[3]);
-        muladdmul(a[3], b[1], a[4], b[4], &r[4]);
-        muladdmul(a[3], b[2], a[4], b[5], &r[5]);
-        r[5] += a[5];
-        r[6] = r[7] = 0.0f;
-        r[8] = 1.0f;
-    }
-private:
-    float mya [9];
-    float myb [9];
-    float myr [9];
-    typedef MatrixBench INHERITED;
-};
-
-// Test the performance of setConcat() non-perspective case:
-// using double precision only.
-class DoubleConcatMatrixBench : public MatrixBench {
-public:
-    DoubleConcatMatrixBench() : INHERITED("concat_double") {
-        init9(mya);
-        init9(myb);
-        init9(myr);
-    }
-protected:
-    virtual int mulLoopCount() const { return 4; }
-
-    static inline void muladdmul(double a, double b, double c, double d,
-                                   double* result) {
-      *result = a * b + c * d;
-    }
-    virtual void performTest() {
-        const double* a = mya;
-        const double* b = myb;
-        double* r = myr;
-        muladdmul(a[0], b[0], a[1], b[3], &r[0]);
-        muladdmul(a[0], b[1], a[1], b[4], &r[1]);
-        muladdmul(a[0], b[2], a[1], b[5], &r[2]);
-        r[2] += a[2];
-        muladdmul(a[3], b[0], a[4], b[3], &r[3]);
-        muladdmul(a[3], b[1], a[4], b[4], &r[4]);
-        muladdmul(a[3], b[2], a[4], b[5], &r[5]);
-        r[5] += a[5];
-        r[6] = r[7] = 0.0;
-        r[8] = 1.0;
-    }
-private:
-    double mya [9];
-    double myb [9];
-    double myr [9];
-    typedef MatrixBench INHERITED;
-};
-
 class GetTypeMatrixBench : public MatrixBench {
 public:
     GetTypeMatrixBench()
@@ -260,87 +141,6 @@
     typedef MatrixBench INHERITED;
 };
 
-class ScaleTransMixedMatrixBench : public MatrixBench {
- public:
-    ScaleTransMixedMatrixBench() : INHERITED("scaletrans_mixed") {
-        fMatrix.setAll(fRandom.nextSScalar1(), fRandom.nextSScalar1(), fRandom.nextSScalar1(),
-                       fRandom.nextSScalar1(), fRandom.nextSScalar1(), fRandom.nextSScalar1(),
-                       fRandom.nextSScalar1(), fRandom.nextSScalar1(), fRandom.nextSScalar1());
-        int i;
-        for (i = 0; i < kCount; i++) {
-            fSrc[i].fX = fRandom.nextSScalar1();
-            fSrc[i].fY = fRandom.nextSScalar1();
-            fDst[i].fX = fRandom.nextSScalar1();
-            fDst[i].fY = fRandom.nextSScalar1();
-        }
-    }
- protected:
-    virtual void performTest() {
-        SkPoint* dst = fDst;
-        const SkPoint* src = fSrc;
-        int count = kCount;
-        float mx = fMatrix[SkMatrix::kMScaleX];
-        float my = fMatrix[SkMatrix::kMScaleY];
-        float tx = fMatrix[SkMatrix::kMTransX];
-        float ty = fMatrix[SkMatrix::kMTransY];
-        do {
-            dst->fY = SkScalarMulAdd(src->fY, my, ty);
-            dst->fX = SkScalarMulAdd(src->fX, mx, tx);
-            src += 1;
-            dst += 1;
-        } while (--count);
-    }
- private:
-    enum {
-        kCount = 16
-    };
-    SkMatrix fMatrix;
-    SkPoint fSrc [kCount];
-    SkPoint fDst [kCount];
-    SkRandom fRandom;
-    typedef MatrixBench INHERITED;
-};
-
-class ScaleTransDoubleMatrixBench : public MatrixBench {
- public:
-    ScaleTransDoubleMatrixBench() : INHERITED("scaletrans_double") {
-        init9(fMatrix);
-        int i;
-        for (i = 0; i < kCount; i++) {
-            fSrc[i].fX = fRandom.nextSScalar1();
-            fSrc[i].fY = fRandom.nextSScalar1();
-            fDst[i].fX = fRandom.nextSScalar1();
-            fDst[i].fY = fRandom.nextSScalar1();
-        }
-    }
- protected:
-    virtual void performTest() {
-        SkPoint* dst = fDst;
-        const SkPoint* src = fSrc;
-        int count = kCount;
-        // As doubles, on Z600 Linux systems this is 2.5x as expensive as mixed mode
-        float mx = (float) fMatrix[SkMatrix::kMScaleX];
-        float my = (float) fMatrix[SkMatrix::kMScaleY];
-        float tx = (float) fMatrix[SkMatrix::kMTransX];
-        float ty = (float) fMatrix[SkMatrix::kMTransY];
-        do {
-            dst->fY = src->fY * my + ty;
-            dst->fX = src->fX * mx + tx;
-            src += 1;
-            dst += 1;
-        } while (--count);
-    }
- private:
-    enum {
-        kCount = 16
-    };
-    double fMatrix [9];
-    SkPoint fSrc [kCount];
-    SkPoint fDst [kCount];
-    SkRandom fRandom;
-    typedef MatrixBench INHERITED;
-};
-
 class DecomposeMatrixBench : public MatrixBench {
 public:
     DecomposeMatrixBench() : INHERITED("decompose") {}
@@ -428,9 +228,6 @@
 
 DEF_BENCH( return new EqualsMatrixBench(); )
 DEF_BENCH( return new ScaleMatrixBench(); )
-DEF_BENCH( return new FloatConcatMatrixBench(); )
-DEF_BENCH( return new FloatDoubleConcatMatrixBench(); )
-DEF_BENCH( return new DoubleConcatMatrixBench(); )
 DEF_BENCH( return new GetTypeMatrixBench(); )
 DEF_BENCH( return new DecomposeMatrixBench(); )
 
@@ -468,5 +265,49 @@
                            InvertMapRectMatrixBench::kRotate_Flag |
                            InvertMapRectMatrixBench::kTranslate_Flag); )
 
-DEF_BENCH( return new ScaleTransMixedMatrixBench(); )
-DEF_BENCH( return new ScaleTransDoubleMatrixBench(); )
+///////////////////////////////////////////////////////////////////////////////
+// Matrices fed to the mappts_* benches; scale and afine start from the translate matrix.
+static SkMatrix make_ident() { SkMatrix id; id.reset(); return id; }
+static SkMatrix make_trans() { SkMatrix t; t.setTranslate(2, 3); return t; }
+static SkMatrix make_scale() { SkMatrix s(make_trans()); s.postScale(1.5f, 0.5f); return s; }
+static SkMatrix make_afine() { SkMatrix r(make_trans()); r.postRotate(15); return r; }
+
+// Times SkMatrix::mapPts() (newWay == true) against SkMatrix::mapPoints() (newWay == false)
+// over N random source points; each performTest() call maps them one million times.
+class MapPointsMatrixBench : public MatrixBench {
+protected:
+    enum { N = 32 };
+    SkMatrix fM;
+    SkPoint fSrc[N], fDst[N];
+    const bool fNewWay;
+public:
+    MapPointsMatrixBench(const char name[], const SkMatrix& m, bool newWay)
+        : MatrixBench(name), fM(m), fNewWay(newWay) {
+        SkRandom rand;
+        for (int i = 0; i < N; ++i) {
+            fSrc[i].set(rand.nextSScalar1(), rand.nextSScalar1());
+        }
+    }
+
+    void performTest() SK_OVERRIDE {
+        if (fNewWay) {
+            for (int rep = 1000000; rep > 0; --rep) {
+                fM.mapPts(fDst, fSrc, N);
+            }
+        } else {
+            for (int rep = 1000000; rep > 0; --rep) {
+                fM.mapPoints(fDst, fSrc, N);
+            }
+        }
+    }
+};
+// Name suffix 0 selects mapPoints(), suffix 1 selects mapPts().
+DEF_BENCH( return new MapPointsMatrixBench("mappts_ident0", make_ident(), false); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_ident1", make_ident(), true); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_trans0", make_trans(), false); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_trans1", make_trans(), true); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_scale0", make_scale(), false); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_scale1", make_scale(), true); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_afine0", make_afine(), false); )
+DEF_BENCH( return new MapPointsMatrixBench("mappts_afine1", make_afine(), true); )
+
diff --git a/include/core/SkMatrix.h b/include/core/SkMatrix.h
index b2ee3c6..d00beda 100644
--- a/include/core/SkMatrix.h
+++ b/include/core/SkMatrix.h
@@ -414,6 +414,8 @@
         this->mapPoints(pts, pts, count);
     }
 
+    void mapPts(SkPoint dst[], const SkPoint src[], int count) const;  // mapPoints() using Sk4f
+
     /** Like mapPoints but with custom byte stride between the points. Stride
      *  should be a multiple of sizeof(SkScalar).
      */
diff --git a/src/core/SkMatrix.cpp b/src/core/SkMatrix.cpp
index 753c4dc..81b8960 100644
--- a/src/core/SkMatrix.cpp
+++ b/src/core/SkMatrix.cpp
@@ -1020,6 +1020,90 @@
     this->getMapPtsProc()(*this, dst, src, count);
 }
 
+#include "Sk4x.h"
+
+// Maps count points from src[] to dst[] through this matrix. Identity copies the points,
+// translate/scale/affine use Sk4f on two points per step (after peeling off an odd leading
+// point with scalar math), and perspective is forwarded to mapPoints().
+void SkMatrix::mapPts(SkPoint dst[], const SkPoint src[], int count) const {
+    if (count <= 0) {
+        return;
+    }
+
+    const unsigned mask = this->getType() & 0xF;
+    const bool hasOddPoint = (count & 1) != 0;  // leftover point is mapped with scalars
+    const int pairCount = count >> 1;           // each Sk4f step covers two SkPoints
+
+    // Identity: nothing to compute, just copy unless src and dst are the same array.
+    if (SkMatrix::kIdentity_Mask == mask) {
+        if (src != dst) {
+            memcpy(dst, src, count * sizeof(SkPoint));
+        }
+        return;
+    }
+    // Translate only: p' = p + t.
+    if (SkMatrix::kTranslate_Mask == mask) {
+        const SkScalar tx = this->getTranslateX();
+        const SkScalar ty = this->getTranslateY();
+        if (hasOddPoint) {
+            dst->fX = src->fX + tx;
+            dst->fY = src->fY + ty;
+            ++src;
+            ++dst;
+        }
+        Sk4f trans4(tx, ty, tx, ty);
+        for (int pair = 0; pair < pairCount; ++pair, src += 2, dst += 2) {
+            (Sk4f::Load(&src->fX) + trans4).store(&dst->fX);
+        }
+        return;
+    }
+    // Scale (translate terms are applied as well): p' = p * s + t.
+    if (mask <= SkMatrix::kScale_Mask + SkMatrix::kTranslate_Mask) {
+        const SkScalar tx = this->getTranslateX();
+        const SkScalar ty = this->getTranslateY();
+        const SkScalar sx = this->getScaleX();
+        const SkScalar sy = this->getScaleY();
+        if (hasOddPoint) {
+            dst->fX = src->fX * sx + tx;
+            dst->fY = src->fY * sy + ty;
+            ++src;
+            ++dst;
+        }
+        Sk4f trans4(tx, ty, tx, ty);
+        Sk4f scale4(sx, sy, sx, sy);
+        for (int pair = 0; pair < pairCount; ++pair, src += 2, dst += 2) {
+            (Sk4f::Load(&src->fX) * scale4 + trans4).store(&dst->fX);
+        }
+        return;
+    }
+    // Affine (below perspective): x' = x*sx + y*kx + tx, y' = x*ky + y*sy + ty.
+    if (mask < SkMatrix::kPerspective_Mask) {
+        const SkScalar tx = this->getTranslateX();
+        const SkScalar ty = this->getTranslateY();
+        const SkScalar sx = this->getScaleX();
+        const SkScalar sy = this->getScaleY();
+        const SkScalar kx = this->getSkewX();
+        const SkScalar ky = this->getSkewY();
+        if (hasOddPoint) {
+            dst->set(src->fX * sx + src->fY * kx + tx,
+                     src->fX * ky + src->fY * sy + ty);
+            ++src;
+            ++dst;
+        }
+        Sk4f trans4(tx, ty, tx, ty);
+        Sk4f scale4(sx, sy, sx, sy);
+        Sk4f skew4(kx, ky, kx, ky);     // multiplied with the x/y-swapped copy of src4
+        for (int pair = 0; pair < pairCount; ++pair, src += 2, dst += 2) {
+            Sk4f src4 = Sk4f::Load(&src->fX);
+            Sk4f swz4(src[0].fY, src[0].fX, src[1].fY, src[1].fX);  // ABCD -> BADC
+            (src4 * scale4 + swz4 * skew4 + trans4).store(&dst->fX);
+        }
+        return;
+    }
+    // Perspective is not specialized here; defer to mapPoints() with the full count.
+    this->mapPoints(dst, src, count);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 void SkMatrix::mapHomogeneousPoints(SkScalar dst[], const SkScalar src[], int count) const {
diff --git a/tests/MatrixTest.cpp b/tests/MatrixTest.cpp
index 95d33ac..35306b3 100644
--- a/tests/MatrixTest.cpp
+++ b/tests/MatrixTest.cpp
@@ -799,6 +799,48 @@
     REPORTER_ASSERT(reporter, !check_decompScale(m));
 }
 
+// Maps n random points (nextSScalar1() * 100) with both mapPoints() and mapPts() and reports
+// every point whose results are not SkScalarNearlyEqual; type labels the debug output.
+static void test_mappts(skiatest::Reporter* reporter, const SkMatrix& m, const char type[], int n) {
+    const int MAX = 100;
+    SkASSERT(n <= MAX);
+    SkPoint src[MAX], expected[MAX], actual[MAX];
+    SkRandom rand;
+    for (int i = 0; i < n; ++i) {
+        src[i].fX = rand.nextSScalar1() * 100;
+        src[i].fY = rand.nextSScalar1() * 100;
+    }
+    m.mapPoints(expected, src, n);
+    m.mapPts(actual, src, n);
+    for (int i = 0; i < n; ++i) {
+        const SkPoint& want = expected[i];
+        const SkPoint& got = actual[i];
+        bool eq = SkScalarNearlyEqual(want.fX, got.fX) && SkScalarNearlyEqual(want.fY, got.fY);
+        if (!eq) {
+            SkDebugf("%s [%d] points (%g %g) pts (%g %g)\n", type, i, want.fX, want.fY, got.fX, got.fY);
+            REPORTER_ASSERT(reporter, eq);
+        }
+    }
+}
+
+// Runs the mapPts/mapPoints comparison for several point counts, growing one matrix in
+// steps: reset, setTranslate, postScale, postRotate and finally setPerspX.
+static void test_mappts(skiatest::Reporter* reporter) {
+    const int counts[] = { 0, 1, 2, 3, 4, 100 };
+    for (size_t c = 0; c < SK_ARRAY_COUNT(counts); ++c) {
+        SkMatrix mat;
+        mat.reset();
+        test_mappts(reporter, mat, "ident", counts[c]);
+        mat.setTranslate(2, 3);
+        test_mappts(reporter, mat, "trans", counts[c]);
+        mat.postScale(2, 0.5f);
+        test_mappts(reporter, mat, "scale", counts[c]);
+        mat.postRotate(35);
+        test_mappts(reporter, mat, "affine", counts[c]);
+        mat.setPerspX(0.1f);
+        test_mappts(reporter, mat, "persp", counts[c]);
+    }
+}
+
 DEF_TEST(Matrix, reporter) {
     SkMatrix    mat, inverse, iden1, iden2;
 
@@ -919,6 +961,8 @@
     test_set9(reporter);
 
     test_decompScale(reporter);
+
+    test_mappts(reporter);
 }
 
 DEF_TEST(Matrix_Concat, r) {
