Simplified linear pipeline.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1704583003

Review URL: https://codereview.chromium.org/1704583003
diff --git a/bench/SkLinearBitmapPipelineBench.cpp b/bench/SkLinearBitmapPipelineBench.cpp
new file mode 100644
index 0000000..9d2e48c
--- /dev/null
+++ b/bench/SkLinearBitmapPipelineBench.cpp
@@ -0,0 +1,288 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <memory>
+#include "SkColor.h"
+#include "SkLinearBitmapPipeline.h"
+#include "Benchmark.h"
+#include "SkShader.h"
+#include "SkImage.h"
+
+// Shared fixture for the bitmap pipeline benches. It stores the configuration passed to the
+// constructor, builds the bench name, and prepares the source pixels, the inverse matrix and
+// the image info in onPreDraw().
+struct CommonBitmapFPBenchmark : public Benchmark {
+    CommonBitmapFPBenchmark(
+        SkISize srcSize,
+        SkColorProfileType colorProfile,
+        SkMatrix m,
+        bool useBilerp,
+        SkShader::TileMode xTile,
+        SkShader::TileMode yTile)
+        : fSrcSize(srcSize)
+        , fColorProfile(colorProfile)
+        , fM{m}
+        , fUseBilerp{useBilerp}
+        , fXTile{xTile}
+        , fYTile{yTile} { }
+
+    // Returns pre followed by the tile mode's name: Clamp, Repeat, Mirror, or Unknown.
+    static SkString tileName(const char* pre, SkShader::TileMode mode) {
+        SkString name{pre};
+        switch (mode) {
+            case SkShader::kClamp_TileMode:  name.append("Clamp");   break;
+            case SkShader::kRepeat_TileMode: name.append("Repeat");  break;
+            case SkShader::kMirror_TileMode: name.append("Mirror");  break;
+            default:                         name.append("Unknown"); break;
+        }
+        return name;
+    }
+
+    // Builds the bench name: SkBitmapFP + matrix kind + X/Y tile modes + filter + BaseName().
+    // The text is kept in fName because the returned pointer must stay valid after this
+    // function returns; c_str() of a local SkString would dangle once the local is destroyed.
+    const char* onGetName() override {
+        fName.set("SkBitmapFP");
+        if (fM.getType() & SkMatrix::kPerspective_Mask) {
+            fName.append("Perspective");
+        } else if (fM.getType() & SkMatrix::kAffine_Mask) {
+            fName.append("Affine");
+        } else if (fM.getType() & SkMatrix::kScale_Mask) {
+            fName.append("Scale");
+        } else if (fM.getType() & SkMatrix::kTranslate_Mask) {
+            fName.append("Translate");
+        } else {
+            fName.append("Identity");
+        }
+
+        fName.append(tileName("X", fXTile));
+        fName.append(tileName("Y", fYTile));
+        fName.append(fUseBilerp ? "Filter" : "Nearest");
+        fName.append(BaseName());
+
+        return fName.c_str();
+    }
+
+    // Fills fBitmap so that pixel (x, y) holds (y << 8) + x with the top byte set to 128, then
+    // caches the inverse of fM and the image info.
+    void onPreDraw(SkCanvas*) override {
+        int width = fSrcSize.fWidth;
+        int height = fSrcSize.fHeight;
+        fBitmap.reset(new uint32_t[width * height]);
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                // 128u keeps the shift unsigned; 128 << 24 does not fit in a signed 32-bit int.
+                fBitmap[y * width + x] = (y << 8) + x + (128u << 24);
+            }
+        }
+
+        // Whether the inversion succeeded is deliberately ignored.
+        bool trash = fM.invert(&fInvert);
+        sk_ignore_unused_variable(trash);
+
+        fInfo = SkImageInfo::MakeN32Premul(width, height, fColorProfile);
+    }
+
+    // Accepts only the non-rendering backend.
+    bool isSuitableFor(Backend backend) override {
+        return backend == kNonRendering_Backend;
+    }
+
+    // Name suffix supplied by each concrete bench; onGetName() appends it last.
+    virtual SkString BaseName() = 0;
+
+    SkISize fSrcSize;
+    SkColorProfileType fColorProfile;
+    SkMatrix fM;
+    SkMatrix fInvert;                     // Inverse of fM; set in onPreDraw().
+    bool fUseBilerp;
+    SkShader::TileMode fXTile;
+    SkShader::TileMode fYTile;
+    SkImageInfo fInfo;                    // Set in onPreDraw().
+    SkString fName;                       // Backing storage for onGetName().
+    std::unique_ptr<uint32_t[]> fBitmap;  // width * height pixels; set in onPreDraw().
+};
+
+// Bench for SkLinearBitmapPipeline::shadeSpan4f, driven by the fixture's source pixels and
+// inverse matrix.
+struct SkBitmapFPGeneral final : public CommonBitmapFPBenchmark {
+    SkBitmapFPGeneral(
+        SkISize srcSize,
+        SkColorProfileType colorProfile,
+        SkMatrix m,
+        bool useBilerp,
+        SkShader::TileMode xTile,
+        SkShader::TileMode yTile)
+            : CommonBitmapFPBenchmark(srcSize, colorProfile, m, useBilerp, xTile, yTile) { }
+
+    // Name suffix: "sRGB" when fInfo reports an sRGB profile, "Linr" otherwise.
+    // NOTE(review): fInfo is only assigned in onPreDraw(); confirm the name is not requested
+    // before that, otherwise this presumably always yields "Linr".
+    SkString BaseName() override {
+        return SkString{fInfo.isSRGB() ? "sRGB" : "Linr"};
+    }
+
+    // Shades the same 100-pixel span at (3, 6) 1000 * loops times into a scratch buffer.
+    void onDraw(int loops, SkCanvas*) override {
+        // Scratch destination with one SkPM4f per source pixel; freed when it leaves scope.
+        const int pixelCount = fSrcSize.fWidth * fSrcSize.fHeight;
+        std::unique_ptr<SkPM4f[]> dst{new SkPM4f[pixelCount]};
+
+        // Pipeline under test, built from the inverse matrix, tile modes and source pixels.
+        SkLinearBitmapPipeline pipeline{fInvert, fXTile, fYTile, fInfo, fBitmap.get()};
+
+        const int spanLength = 100;
+        const int iterations = 1000 * loops;
+
+        // Every iteration shades the same span.
+        for (int i = 0; i < iterations; i++) {
+            pipeline.shadeSpan4f(3, 6, dst.get(), spanLength);
+        }
+    }
+};
+
+// Bench that times SkShader::Context::shadeSpan on an image shader built from the fixture.
+struct SkBitmapFPOrigShader : public CommonBitmapFPBenchmark {
+    SkBitmapFPOrigShader(
+        SkISize srcSize,
+        SkColorProfileType colorProfile,
+        SkMatrix m,
+        bool useBilerp,
+        SkShader::TileMode xTile,
+        SkShader::TileMode yTile)
+            : CommonBitmapFPBenchmark(srcSize, colorProfile, m, useBilerp, xTile, yTile) { }
+
+    SkString BaseName() override { return SkString{"Orig"}; }
+
+    // Runs the shared setup, then copies the pixels into an SkImage and installs its shader
+    // on fPaint with the filter quality selected by fUseBilerp.
+    void onPreDraw(SkCanvas* c) override {
+        CommonBitmapFPBenchmark::onPreDraw(c);
+
+        SkImage* image = SkImage::NewRasterCopy(
+            fInfo, fBitmap.get(), sizeof(SkPMColor) * fSrcSize.fWidth);
+        fImage.reset(image);
+        SkShader* shader = fImage->newShader(fXTile, fYTile);
+        if (fUseBilerp) {
+            fPaint.setFilterQuality(SkFilterQuality::kLow_SkFilterQuality);
+        } else {
+            fPaint.setFilterQuality(SkFilterQuality::kNone_SkFilterQuality);
+        }
+        fPaint.setShader(shader)->unref();
+    }
+
+    void onPostDraw(SkCanvas*) override { }
+
+    // Shades the same 100-pixel span at (3, 6) 1000 * loops times through a shader context.
+    void onDraw(int loops, SkCanvas*) override {
+        int width = fSrcSize.fWidth;
+        int height = fSrcSize.fHeight;
+
+        // Allocated with new[], so it has to be released with delete[] rather than scalar
+        // delete; the array form of unique_ptr guarantees that.
+        std::unique_ptr<SkPMColor[]> buffer4b{new SkPMColor[width * height]};
+
+        // createContext() is handed storage to build the context in, which is why the
+        // context is ended with an explicit destructor call below instead of delete.
+        uint32_t storage[200];
+        SkASSERT(fPaint.getShader()->contextSize() <= sizeof(storage));
+        SkShader::Context* ctx = fPaint.getShader()->createContext(
+            {fPaint, fM, nullptr},
+            storage);
+
+        int count = 100;
+        for (int n = 0; n < 1000*loops; n++) {
+            ctx->shadeSpan(3, 6, buffer4b.get(), count);
+        }
+        ctx->~Context();
+    }
+    SkPaint fPaint;
+    SkAutoTUnref<SkImage> fImage;
+};
+
+static SkISize srcSize = SkISize::Make(120, 100);  // Source size shared by the benches below.
+static SkMatrix mI = SkMatrix::I();  // Identity-matrix cases, first with useBilerp = false.
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mI, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mI, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mI, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+// Same three benches with useBilerp = true.
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mI, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mI, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mI, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+static SkMatrix mS = SkMatrix::MakeScale(2.7f, 2.7f);  // Scale cases, useBilerp = false first.
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mS, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mS, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mS, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+// Same three benches with useBilerp = true.
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mS, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mS, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mS, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+static SkMatrix rotate(SkScalar r) {  // Returns a matrix that rotates by r.
+    SkMatrix m;
+    m.setRotate(r);  // Use the argument; the angle was previously hard-coded to 30.
+    return m;
+}
+
+static SkMatrix mR = rotate(30);  // Rotation-matrix cases, first with useBilerp = false.
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mR, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mR, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mR, false,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+// Same three benches with useBilerp = true.
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mR, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mR, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mR, true,
+    SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
+
diff --git a/gm/SkLinearBitmapPipelineGM.cpp b/gm/SkLinearBitmapPipelineGM.cpp
new file mode 100644
index 0000000..fd6a423
--- /dev/null
+++ b/gm/SkLinearBitmapPipelineGM.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "gm.h"
+#include "SkCanvas.h"
+#include "SkColor.h"
+#include "SkImage.h"
+#include "SkImageInfo.h"
+#include "SkLinearBitmapPipeline.h"
+#include "SkXfermode.h"
+#include "SkPM4fPriv.h"
+#include "SkShader.h"
+
+// Allocates bm at ir's size and fills it with a checkerboard of 16-pixel squares that
+// alternates opaque black with c. c is premultiplied when premul is true; otherwise its
+// components are packed unchanged.
+static void fill_in_bits(SkBitmap& bm, SkIRect ir, SkColor c, bool premul) {
+    const int width = ir.width();
+    const int height = ir.height();
+    bm.allocN32Pixels(width, height);
+    SkPixmap pixels;
+    bm.peekPixels(&pixels);
+
+    // The two colors of the checkerboard.
+    const SkPMColor black = SkColorSetARGBMacro(255, 0, 0, 0);
+    const SkPMColor color = premul
+        ? SkPreMultiplyColor(c)
+        : SkPackARGB32NoCheck(SkColorGetA(c), SkColorGetR(c), SkColorGetG(c), SkColorGetB(c));
+
+    for (int row = 0; row < height; row++) {
+        for (int col = 0; col < width; col++) {
+            // Bit 4 of (col ^ row) flips every 16 pixels along either axis.
+            *pixels.writable_addr32(col, row) = ((col ^ row) & 16) ? black : color;
+        }
+    }
+}
+
+// Renders the checkerboard for c through the image shader's Context::shadeSpan, one row at a
+// time, into a white bitmap of r's rounded size and draws that bitmap at r's top-left corner.
+static void draw_rect_orig(SkCanvas* canvas, const SkRect& r, SkColor c, const SkMatrix* mat, bool useBilerp) {
+    const SkIRect bounds = r.round();
+    const int width = bounds.width();
+    const int height = bounds.height();
+
+    SkBitmap srcBitmap;
+    fill_in_bits(srcBitmap, bounds, c, true);
+    SkPixmap srcPixels;
+    srcBitmap.peekPixels(&srcPixels);
+
+    SkBitmap dstBitmap;
+    dstBitmap.allocN32Pixels(width, height);
+    dstBitmap.eraseColor(0xFFFFFFFF);
+    SkPixmap dstPixels;
+    dstBitmap.peekPixels(&dstPixels);
+
+    const SkImageInfo info = SkImageInfo::MakeN32Premul(width, height, kLinear_SkColorProfileType);
+    SkAutoTUnref<SkImage> image{SkImage::NewRasterCopy(
+        info, srcPixels.addr32(), srcPixels.rowBytes())};
+
+    SkPaint paint;
+    paint.setFilterQuality(useBilerp ? SkFilterQuality::kLow_SkFilterQuality
+                                     : SkFilterQuality::kNone_SkFilterQuality);
+    SkShader* shader = image->newShader(SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);
+    paint.setShader(shader)->unref();
+
+    // createContext() is handed storage to build the context in, so the context is ended
+    // with an explicit destructor call instead of delete.
+    int32_t storage[200];
+    SkASSERT(paint.getShader()->contextSize() <= sizeof(storage));
+    SkShader::Context* ctx = paint.getShader()->createContext(
+        {paint, *mat, nullptr},
+        storage);
+
+    for (int y = 0; y < height; y++) {
+        ctx->shadeSpan(0, y, dstPixels.writable_addr32(0, y), width);
+    }
+
+    canvas->drawBitmap(dstBitmap, r.left(), r.top(), nullptr);
+    ctx->~Context();
+}
+
+// Renders the checkerboard for c through SkLinearBitmapPipeline: each row is shaded to
+// SkPM4f, blended onto a white bitmap with the src-over PM4f proc, and the bitmap is drawn at
+// r's top-left corner. useBilerp is accepted but not read anywhere in this function.
+static void draw_rect_fp(SkCanvas* canvas, const SkRect& r, SkColor c, const SkMatrix* mat, bool useBilerp) {
+    const SkIRect bounds = r.round();
+    const int width = bounds.width();
+    const int height = bounds.height();
+
+    SkBitmap srcBitmap;
+    fill_in_bits(srcBitmap, bounds, c, true);
+    SkPixmap srcPixels;
+    srcBitmap.peekPixels(&srcPixels);
+
+    SkBitmap dstBitmap;
+    dstBitmap.allocN32Pixels(width, height);
+    dstBitmap.eraseColor(0xFFFFFFFF);
+    SkPixmap dstPixels;
+    dstBitmap.peekPixels(&dstPixels);
+
+    // The pipeline takes the inverse matrix; whether the inversion succeeded is ignored.
+    SkMatrix inverse;
+    bool invertible = mat->invert(&inverse);
+    sk_ignore_unused_variable(invertible);
+
+    // flags stays 0: SkXfermode::kDstIsSRGB_PM4fFlag is never set here.
+    uint32_t flags = 0;
+    const SkXfermode::PM4fState state { nullptr, flags };
+    auto blendRow = SkXfermode::GetPM4fProcN(SkXfermode::kSrcOver_Mode, flags);
+
+    SkImageInfo info = SkImageInfo::MakeN32(width, height, kPremul_SkAlphaType);
+    SkLinearBitmapPipeline pipeline{
+        inverse, SkShader::kClamp_TileMode, SkShader::kClamp_TileMode, info, srcPixels.addr32()};
+
+    SkPM4f* rowBits = new SkPM4f[width];
+    for (int y = 0; y < height; y++) {
+        pipeline.shadeSpan4f(0, y, rowBits, width);
+        blendRow(state, dstPixels.writable_addr32(0, y), rowBits, width, nullptr);
+    }
+    delete [] rowBits;
+
+    canvas->drawBitmap(dstBitmap, r.left(), r.top(), nullptr);
+}
+
+// Draws the premultiplied checkerboard for c at r's top-left corner, with no shading step
+// in between.
+static void draw_rect_none(SkCanvas* canvas, const SkRect& r, SkColor c) {
+    SkBitmap checker;
+    fill_in_bits(checker, r.round(), c, true);
+
+    canvas->drawBitmap(checker, r.left(), r.top(), nullptr);
+}
+
+// Draws every color under each matrix, placing the shader-context rendering (draw_rect_orig)
+// next to the SkLinearBitmapPipeline rendering (draw_rect_fp). The nullptr row draws the
+// unshaded checkerboard twice. The whole grid is drawn once per useBilerp value.
+DEF_SIMPLE_GM(linear_pipeline, canvas, 580, 1400) {
+    const int IW = 50;
+    const SkScalar W = IW;
+    const SkScalar H = 100;
+
+    const SkColor colors[] = {
+        0x880000FF, 0x8800FF00, 0x88FF0000, 0x88000000,
+        SK_ColorBLUE, SK_ColorGREEN, SK_ColorRED, SK_ColorBLACK,
+    };
+
+    canvas->translate(20, 20);
+
+    SkMatrix mi = SkMatrix::I();
+    SkMatrix mt;
+    mt.setTranslate(8, 8);
+    SkMatrix ms;
+    ms.setScale(2.7f, 2.7f);
+    SkMatrix mr;
+    mr.setRotate(10);
+
+    // nullptr selects the reference row that skips shading altogether.
+    const SkMatrix* mats[] = {nullptr, &mi, &mt, &ms, &mr};
+
+    const SkRect r = SkRect::MakeWH(W, H);
+    // First pass uses useBilerp = false, second pass useBilerp = true.
+    bool useBilerp = false;
+    while (true) {
+        canvas->save();
+        for (auto mat : mats) {
+            canvas->save();
+            for (SkColor c : colors) {
+                if (mat == nullptr) {
+                    draw_rect_none(canvas, r, c);
+                    canvas->translate(W + 20, 0);
+                    draw_rect_none(canvas, r, c);
+                } else {
+                    draw_rect_orig(canvas, r, c, mat, useBilerp);
+                    canvas->translate(W + 20, 0);
+                    draw_rect_fp(canvas, r, c, mat, useBilerp);
+                }
+                canvas->translate(W + 20, 0);
+            }
+            canvas->restore();
+            canvas->translate(0, H + 20);
+        }
+        canvas->restore();
+        // Skip past the rows this pass just drew.
+        canvas->translate(0, (H + 20) * SK_ARRAY_COUNT(mats));
+        if (useBilerp) break;
+        useBilerp = true;
+    }
+}
diff --git a/gyp/core.gypi b/gyp/core.gypi
index 1853985..3bafe7b 100644
--- a/gyp/core.gypi
+++ b/gyp/core.gypi
@@ -149,6 +149,8 @@
         '<(skia_src_path)/core/SkLight.h',
         '<(skia_src_path)/core/SkLightingShader.h',
         '<(skia_src_path)/core/SkLightingShader.cpp',
+        '<(skia_src_path)/core/SkLinearBitmapPipeline.cpp',
+        '<(skia_src_path)/core/SkLinearBitmapPipeline.h',
         '<(skia_src_path)/core/SkLineClipper.cpp',
         '<(skia_src_path)/core/SkLocalMatrixImageFilter.cpp',
         '<(skia_src_path)/core/SkLocalMatrixImageFilter.h',
diff --git a/src/core/SkLinearBitmapPipeline.cpp b/src/core/SkLinearBitmapPipeline.cpp
new file mode 100644
index 0000000..d100028
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline.cpp
@@ -0,0 +1,449 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkLinearBitmapPipeline.h"
+
+struct X {  // Tags a float as an x value: a scalar, a point's fX, or a size's fWidth.
+    explicit X(SkScalar val) : fVal(val) { }
+    explicit X(SkPoint pt)   : fVal(pt.fX) { }
+    explicit X(SkSize s)     : fVal(s.fWidth) { }
+    explicit X(SkISize s)    : fVal(s.fWidth) { }  // The integer width converts to float here.
+    operator float() const { return fVal; }
+private:
+    float fVal;
+};
+
+// Tags a float as a y value: a scalar, a point's fY, or a size's fHeight.
+struct Y {
+    explicit Y(SkScalar val) : fVal(val) { }
+    explicit Y(SkPoint pt)   : fVal(pt.fY) { }
+    explicit Y(SkSize s)     : fVal(s.fHeight) { }
+    explicit Y(SkISize s)    : fVal(s.fHeight) { }  // The integer height converts to float here.
+    operator float() const { return fVal; }
+private:
+    float fVal;
+};
+
+// Pipeline stage that copies the incoming points, lets Strategy::processPoints() rewrite the
+// copies in place, and forwards them to the next stage.
+template<typename Strategy, typename Next>
+class PointProcessor : public PointProcessorInterface {
+public:
+    template <typename... Args>
+    PointProcessor(Next* next, Args&&... args)
+        : fNext{next}
+        , fStrategy{std::forward<Args>(args)...}{ }
+
+    void pointListFew(int n, Sk4fArg xs, Sk4fArg ys) override {
+        Sk4f mappedXs = xs, mappedYs = ys;
+        fStrategy.processPoints(&mappedXs, &mappedYs);
+        fNext->pointListFew(n, mappedXs, mappedYs);
+    }
+
+    void pointList4(Sk4fArg xs, Sk4fArg ys) override {
+        Sk4f mappedXs = xs, mappedYs = ys;
+        fStrategy.processPoints(&mappedXs, &mappedYs);
+        fNext->pointList4(mappedXs, mappedYs);
+    }
+
+private:
+    Next* const fNext;
+    Strategy fStrategy;
+};
+
+// Fills a stage slot that the pipeline never routes points through (for example, choose_matrix
+// initializes one and then returns the next stage directly when no transform is needed).
+// Both entry points fail if they are ever reached.
+class SkippedStage final : public PointProcessorInterface {
+    void pointListFew(int, Sk4fArg, Sk4fArg) override { SkFAIL("Abort tiler."); }
+
+    void pointList4(Sk4fArg, Sk4fArg) override { SkFAIL("Abort point processor."); }
+};
+
+// Adds a fixed offset to every point.
+class TranslateMatrixStrategy {
+public:
+    TranslateMatrixStrategy(SkVector offset) : fXOffset{X(offset)}, fYOffset{Y(offset)} { }
+    void processPoints(Sk4f* xs, Sk4f* ys) {
+        *xs = *xs + fXOffset;
+        *ys = *ys + fYOffset;
+    }
+
+private:
+    const Sk4f fXOffset;
+    const Sk4f fYOffset;
+};
+template <typename Next = PointProcessorInterface>
+using TranslateMatrix = PointProcessor<TranslateMatrixStrategy, Next>;
+
+// Scales every point and then adds an offset: p * scale + offset, per axis.
+class ScaleMatrixStrategy {
+public:
+    ScaleMatrixStrategy(SkVector offset, SkVector scale)
+        : fXOffset{X(offset)}, fYOffset{Y(offset)}
+        , fXScale{X(scale)}, fYScale{Y(scale)} { }
+    void processPoints(Sk4f* xs, Sk4f* ys) {
+        *xs = *xs * fXScale + fXOffset;
+        *ys = *ys * fYScale + fYOffset;
+    }
+private:
+    const Sk4f fXOffset, fYOffset;
+    const Sk4f fXScale, fYScale;
+};
+template <typename Next = PointProcessorInterface>
+using ScaleMatrix = PointProcessor<ScaleMatrixStrategy, Next>;
+
+// Applies scale, skew and offset to every point (the non-perspective part of a matrix).
+class AffineMatrixStrategy {
+public:
+    AffineMatrixStrategy(SkVector offset, SkVector scale, SkVector skew)
+        : fXOffset{X(offset)}, fYOffset{Y(offset)}
+        , fXScale{X(scale)},   fYScale{Y(scale)}
+        , fXSkew{X(skew)},     fYSkew{Y(skew)} { }
+    void processPoints(Sk4f* xs, Sk4f* ys) {
+        // Both results are computed from the original inputs before either is overwritten.
+        const Sk4f mappedXs = fXScale * *xs +  fXSkew * *ys + fXOffset;
+        const Sk4f mappedYs =  fYSkew * *xs + fYScale * *ys + fYOffset;
+        *xs = mappedXs;
+        *ys = mappedYs;
+    }
+private:
+    const Sk4f fXOffset, fYOffset;
+    const Sk4f fXScale,  fYScale;
+    const Sk4f fXSkew,   fYSkew;
+};
+template <typename Next = PointProcessorInterface>
+using AffineMatrix = PointProcessor<AffineMatrixStrategy, Next>;
+
+// Picks the simplest matrix stage able to apply inverse and initializes it in matrixProc.
+// Returns the stage that should receive points first: matrixProc's stage, or next itself
+// when inverse needs no work (matrixProc then holds a SkippedStage).
+static PointProcessorInterface* choose_matrix(
+    PointProcessorInterface* next,
+    const SkMatrix& inverse,
+    SkLinearBitmapPipeline::MatrixStage* matrixProc) {
+    // The matrix entries, grouped the way the strategies take them.
+    const SkVector translate{inverse.getTranslateX(), inverse.getTranslateY()};
+    const SkVector scale{inverse.getScaleX(), inverse.getScaleY()};
+    const SkVector skew{inverse.getSkewX(), inverse.getSkewY()};
+
+    if (inverse.hasPerspective()) {
+        SkFAIL("Not implemented.");
+    } else if (skew.fX != 0.0f || skew.fY != 0.0f) {
+        matrixProc->Initialize<AffineMatrix<>>(next, translate, scale, skew);
+    } else if (scale.fX != 1.0f || scale.fY != 1.0f) {
+        matrixProc->Initialize<ScaleMatrix<>>(next, translate, scale);
+    } else if (translate.fX != 0.0f || translate.fY != 0.0f) {
+        matrixProc->Initialize<TranslateMatrix<>>(next, translate);
+    } else {
+        // Nothing to apply: the slot gets a placeholder and points go straight to next.
+        matrixProc->Initialize<SkippedStage>();
+        return next;
+    }
+    return matrixProc->get();
+}
+
+// Clamps points into [0, max - 1]. An axis that is not given a maximum keeps its infinite
+// default bounds and therefore passes through unchanged.
+class ClampStrategy {
+public:
+    ClampStrategy(X max) : fXMin{0.0f}, fXMax{max - 1.0f} { }
+    ClampStrategy(Y max) : fYMin{0.0f}, fYMax{max - 1.0f} { }
+    ClampStrategy(SkSize max)
+        : fXMin{0.0f}
+        , fYMin{0.0f}
+        , fXMax{X(max) - 1.0f}
+        , fYMax{Y(max) - 1.0f} { }
+
+    void processPoints(Sk4f* xs, Sk4f* ys) {
+        const Sk4f raisedXs = Sk4f::Max(*xs, fXMin);
+        const Sk4f raisedYs = Sk4f::Max(*ys, fYMin);
+        *xs = Sk4f::Min(raisedXs, fXMax);
+        *ys = Sk4f::Min(raisedYs, fYMax);
+    }
+
+private:
+    const Sk4f fXMin{SK_FloatNegativeInfinity};
+    const Sk4f fYMin{SK_FloatNegativeInfinity};
+    const Sk4f fXMax{SK_FloatInfinity};
+    const Sk4f fYMax{SK_FloatInfinity};
+};
+template <typename Next = PointProcessorInterface>
+using Clamp = PointProcessor<ClampStrategy, Next>;
+
+// Wraps points back toward the first tile by subtracting floor(p / max) * max. An axis that
+// is not given a maximum keeps 0 for both factors, so nothing is subtracted from it.
+class RepeatStrategy {
+public:
+    RepeatStrategy(X max) : fXMax{max}, fXInvMax{1.0f/max} { }
+    RepeatStrategy(Y max) : fYMax{max}, fYInvMax{1.0f/max} { }
+    RepeatStrategy(SkSize max)
+        : fXMax{X(max)}
+        , fXInvMax{1.0f / X(max)}
+        , fYMax{Y(max)}
+        , fYInvMax{1.0f / Y(max)} { }
+
+    void processPoints(Sk4f* xs, Sk4f* ys) {
+        const Sk4f tilesX = (*xs * fXInvMax).floor();
+        const Sk4f tilesY = (*ys * fYInvMax).floor();
+        *xs = *xs - (tilesX * fXMax);
+        *ys = *ys - (tilesY * fYMax);
+    }
+
+private:
+    const Sk4f fXMax{0.0f};
+    const Sk4f fXInvMax{0.0f};
+    const Sk4f fYMax{0.0f};
+    const Sk4f fYInvMax{0.0f};
+};
+
+template <typename Next = PointProcessorInterface>
+using Repeat = PointProcessor<RepeatStrategy, Next>;
+
+// Initializes the tiling stage(s) in front of next and returns the stage points enter first.
+// With equal modes one stage handles both axes and tileProcY only holds a SkippedStage;
+// otherwise the X stage feeds the Y stage, which feeds next. Mirror is not implemented.
+static PointProcessorInterface* choose_tiler(
+    PointProcessorInterface* next,
+    SkSize dimensions,
+    SkShader::TileMode xMode,
+    SkShader::TileMode yMode,
+    SkLinearBitmapPipeline::TileStage* tileProcXOrBoth,
+    SkLinearBitmapPipeline::TileStage* tileProcY) {
+    if (xMode != yMode) {
+        // The Y stage is built first because the X stage needs it as its successor.
+        switch (yMode) {
+            case SkShader::kClamp_TileMode:
+                tileProcY->Initialize<Clamp<>>(next, Y(dimensions));
+                break;
+            case SkShader::kRepeat_TileMode:
+                tileProcY->Initialize<Repeat<>>(next, Y(dimensions));
+                break;
+            case SkShader::kMirror_TileMode: SkFAIL("Not implemented."); break;
+        }
+        switch (xMode) {
+            case SkShader::kClamp_TileMode:
+                tileProcXOrBoth->Initialize<Clamp<>>(tileProcY->get(), X(dimensions));
+                break;
+            case SkShader::kRepeat_TileMode:
+                tileProcXOrBoth->Initialize<Repeat<>>(tileProcY->get(), X(dimensions));
+                break;
+            case SkShader::kMirror_TileMode: SkFAIL("Not implemented."); break;
+        }
+    } else {
+        // One stage tiles both axes; the Y slot only gets a placeholder.
+        switch (xMode) {
+            case SkShader::kClamp_TileMode:
+                tileProcXOrBoth->Initialize<Clamp<>>(next, dimensions);
+                break;
+            case SkShader::kRepeat_TileMode:
+                tileProcXOrBoth->Initialize<Repeat<>>(next, dimensions);
+                break;
+            case SkShader::kMirror_TileMode: SkFAIL("Not implemented."); break;
+        }
+        tileProcY->Initialize<SkippedStage>();
+    }
+
+    return tileProcXOrBoth->get();
+}
+
+class sRGBFast {  // Fast sRGB decode: squares lanes 0 to 2 and keeps lane 3 as it is.
+public:
+    static Sk4f sRGBToLinear(Sk4fArg pixel) {
+        const Sk4f squared = pixel * pixel;
+        return Sk4f{squared[0], squared[1], squared[2], pixel[3]};
+    }
+};
+
+// Reads 8888 pixels from src at index y * width + x (coordinates converted to int with
+// SkNx_cast) and returns them as floats scaled by 1/255; sRGB profiles are also linearized.
+template <SkColorProfileType colorProfile>
+class Passthrough8888 {
+public:
+    Passthrough8888(int width, const uint32_t* src)
+        : fSrc{src}, fWidth{width}{ }
+
+    // Loads pixels for the first n points (n is 1 to 3); the other outputs are not written.
+    void getFewPixels(int n, Sk4fArg xs, Sk4fArg ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) {
+        const Sk4i offsets = this->offsetsFor(xs, ys);
+        switch (n) {  // Each case deliberately falls through to the ones below it.
+            case 3: *px2 = getPixel(fSrc, offsets[2]);
+            case 2: *px1 = getPixel(fSrc, offsets[1]);
+            case 1: *px0 = getPixel(fSrc, offsets[0]);
+            default: break;
+        }
+    }
+
+    void get4Pixels(Sk4fArg xs, Sk4fArg ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
+        const Sk4i offsets = this->offsetsFor(xs, ys);
+        *px0 = getPixel(fSrc, offsets[0]);
+        *px1 = getPixel(fSrc, offsets[1]);
+        *px2 = getPixel(fSrc, offsets[2]);
+        *px3 = getPixel(fSrc, offsets[3]);
+    }
+
+    const uint32_t* row(int y) { return fSrc + y * fWidth[0]; }
+
+private:
+    Sk4i offsetsFor(Sk4fArg xs, Sk4fArg ys) const {
+        const Sk4i columns = SkNx_cast<int, float>(xs);
+        const Sk4i rows = SkNx_cast<int, float>(ys);
+        return rows * fWidth + columns;
+    }
+    Sk4f getPixel(const uint32_t* src, int index) {
+        Sk4b bytePixel = Sk4b::Load((uint8_t *)(&src[index]));
+        Sk4f pixel = SkNx_cast<float, uint8_t>(bytePixel);
+        pixel = pixel * Sk4f{1.0f/255.0f};
+        if (colorProfile == kSRGB_SkColorProfileType) {
+            pixel = sRGBFast::sRGBToLinear(pixel);
+        }
+        return pixel;
+    }
+    const uint32_t* const fSrc;
+    const Sk4i fWidth;
+};
+
+// Stage that turns points into pixels: SourceStrategy fetches them, the placer receives them.
+template <typename SourceStrategy>
+class Sampler final : public PointProcessorInterface {
+public:
+    template <typename... Args>
+    Sampler(PixelPlacerInterface* next, Args&&... args)
+        : fNext{next}, fStrategy{std::forward<Args>(args)...} { }
+
+    void pointListFew(int n, Sk4fArg xs, Sk4fArg ys) override {
+        SkASSERT(0 < n && n < 4);
+        Sk4f px[3];
+        fStrategy.getFewPixels(n, xs, ys, &px[0], &px[1], &px[2]);
+        for (int i = 0; i < n && i < 3; i++) {
+            fNext->placePixel(px[i]);
+        }
+    }
+
+    void pointList4(Sk4fArg xs, Sk4fArg ys) override {
+        Sk4f px[4];
+        fStrategy.get4Pixels(xs, ys, &px[0], &px[1], &px[2], &px[3]);
+        fNext->place4Pixels(px[0], px[1], px[2], px[3]);
+    }
+
+private:
+    PixelPlacerInterface* const fNext;
+    SourceStrategy fStrategy;
+};
+
+// Initializes sampleStage with the Passthrough8888 sampler matching the image's profile type
+// and returns that stage. Only kN32_SkColorType sources are supported; other inputs fail.
+static PointProcessorInterface* choose_pixel_sampler(
+    PixelPlacerInterface* next,
+    const SkImageInfo& imageInfo,
+    const void* imageData,
+    SkLinearBitmapPipeline::SampleStage* sampleStage) {
+    const SkColorType colorType = imageInfo.colorType();
+    const bool is8888 = kRGBA_8888_SkColorType == colorType
+                     || kBGRA_8888_SkColorType == colorType;
+
+    if (!is8888) {
+        SkFAIL("Not implemented. Unsupported src");
+    } else if (kN32_SkColorType != colorType) {
+        // An 8888 layout other than kN32_SkColorType would need a swizzle step.
+        SkFAIL("Not implemented. No 8888 Swizzle");
+    } else if (imageInfo.profileType() == kSRGB_SkColorProfileType) {
+        sampleStage->Initialize<Sampler<Passthrough8888<kSRGB_SkColorProfileType>>>(
+            next, imageInfo.width(),
+            (uint32_t*)imageData);
+    } else {
+        sampleStage->Initialize<Sampler<Passthrough8888<kLinear_SkColorProfileType>>>(
+            next, imageInfo.width(),
+            (uint32_t*)imageData);
+    }
+
+    return sampleStage->get();
+}
+
+// Final stage: writes float pixels to consecutive SkPM4f slots starting at the destination
+// given to setDestination(). With kUnpremul_SkAlphaType the first three lanes are multiplied
+// by lane 3 before the store.
+template <SkAlphaType alphaType>
+class PlaceFPPixel final : public PixelPlacerInterface {
+public:
+    void placePixel(Sk4fArg pixel) override {
+        PlacePixel(fDst, pixel, 0);
+        fDst += 1;
+    }
+
+    void place4Pixels(Sk4fArg p0, Sk4fArg p1, Sk4fArg p2, Sk4fArg p3) override {
+        PlacePixel(fDst, p0, 0);
+        PlacePixel(fDst, p1, 1);
+        PlacePixel(fDst, p2, 2);
+        PlacePixel(fDst, p3, 3);
+        fDst += 4;
+    }
+
+    void setDestination(SkPM4f* dst) override { fDst = dst; }
+
+private:
+    // Stores pixel at dst[index], premultiplying first when the source is unpremultiplied.
+    static void PlacePixel(SkPM4f* dst, Sk4fArg pixel, int index) {
+        Sk4f out = (alphaType == kUnpremul_SkAlphaType) ? Premultiply(pixel) : pixel;
+        out.store(dst + index);
+    }
+    static Sk4f Premultiply(Sk4fArg pixel) {
+        float alpha = pixel[3];
+        return pixel * Sk4f{alpha, alpha, alpha, 1.0f};
+    }
+
+    // Null until setDestination() is called, so a missed call fails at a predictable address
+    // instead of writing through an indeterminate pointer.
+    SkPM4f* fDst = nullptr;
+};
+
+static PixelPlacerInterface* choose_pixel_placer(  // Picks the placer stage for alphaType.
+    SkAlphaType alphaType,
+    SkLinearBitmapPipeline::PixelStage* placerStage) {
+    if (alphaType != kUnpremul_SkAlphaType) {
+        // kOpaque_SkAlphaType is treated the same as kPremul_SkAlphaType
+        placerStage->Initialize<PlaceFPPixel<kPremul_SkAlphaType>>();
+    } else {
+        placerStage->Initialize<PlaceFPPixel<kUnpremul_SkAlphaType>>();
+    }
+    return placerStage->get();
+}
+
+SkLinearBitmapPipeline::SkLinearBitmapPipeline(
+    const SkMatrix& inverse,
+    SkShader::TileMode xTile, SkShader::TileMode yTile,
+    const SkImageInfo& srcImageInfo,
+    const void* srcImageData) {
+    // Stages are created back to front because each one is handed its successor. A chooser
+    // may return that successor unchanged (identity matrix, for instance), in which case
+    // the stage after it becomes the entry point.
+    PixelPlacerInterface* placer = choose_pixel_placer(srcImageInfo.alphaType(), &fPixelStage);
+    PointProcessorInterface* sampler =
+        choose_pixel_sampler(placer, srcImageInfo, srcImageData, &fSampleStage);
+    SkSize dimensions;
+    dimensions = srcImageInfo.dimensions();
+    PointProcessorInterface* tiler =
+        choose_tiler(sampler, dimensions, xTile, yTile, &fTileXOrBothStage, &fTileYStage);
+    fFirstStage = choose_matrix(tiler, inverse, &fMatrixStage);
+}
+
+// Shades count pixels of row y, starting at column x, into dst, four points at a time.
+void SkLinearBitmapPipeline::shadeSpan4f(int x, int y, SkPM4f* dst, int count) {
+    fPixelStage->setDestination(dst);
+    // Sample at pixel centers on both axes; Y used to be passed without the 0.5 offset.
+    Sk4f Xs = Sk4f(x) + Sk4f{0.5f, 1.5f, 2.5f, 3.5f};
+    Sk4f Ys(y + 0.5f);
+    Sk4f fours{4.0f};
+    while (count >= 4) {
+        fFirstStage->pointList4(Xs, Ys);
+        Xs = Xs + fours;
+        count -= 4;
+    }
+    if (count > 0) {
+        fFirstStage->pointListFew(count, Xs, Ys);
+    }
+}
diff --git a/src/core/SkLinearBitmapPipeline.h b/src/core/SkLinearBitmapPipeline.h
new file mode 100644
index 0000000..f6875c6
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkLinearBitmapPipeline_DEFINED
+#define SkLinearBitmapPipeline_DEFINED
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <cstdio>
+#include "SkColor.h"
+#include "SkImageInfo.h"
+#include "SkMatrix.h"
+#include "SkShader.h"
+#include "SkSize.h"
+#include "SkNx.h"
+
+using Sk4fArg = const Sk4f&;  // Parameter type the stage interfaces below use for Sk4f values.
+
+class PointProcessorInterface {  // A pipeline stage that is fed x/y coordinates as Sk4f.
+public:
+    virtual ~PointProcessorInterface() = default;
+    virtual void pointList4(Sk4fArg xs, Sk4fArg ys) = 0;  // Four points per call.
+    virtual void pointListFew(int n, Sk4fArg xs, Sk4fArg ys) = 0;  // n points; fewer than 4.
+};
+
+class PixelPlacerInterface {  // Receives pixels; presumably stores them at the destination.
+public:
+    virtual ~PixelPlacerInterface() = default;
+    virtual void setDestination(SkPM4f* dst) = 0;  // Output buffer for the pixels that follow.
+    virtual void place4Pixels(Sk4fArg p0, Sk4fArg p1, Sk4fArg p2, Sk4fArg p3) = 0;
+    virtual void placePixel(Sk4fArg pixel0) = 0;
+};
+
+class SkLinearBitmapPipeline {
+public:
+    // Sets up the stage chain from the inverse matrix, the tile modes and the source image.
+    SkLinearBitmapPipeline(
+        const SkMatrix& inverse,
+        SkShader::TileMode xTile, SkShader::TileMode yTile,
+        const SkImageInfo& srcImageInfo,
+        const void* srcImageData);
+    // Shades count pixels starting at (x, y) and writes them to dst.
+    void shadeSpan4f(int x, int y, SkPM4f* dst, int count);
+    // Inline, aligned storage of kSize bytes holding at most one object derived from Base.
+    // A stage that was never Initialize()d (i.e. a skipped stage) is not destroyed either.
+    template<typename Base, size_t kSize>
+    class PolymorphicUnion {
+    public:
+        PolymorphicUnion() : fIsInitialized(false) {}
+        ~PolymorphicUnion() { if (fIsInitialized) { get()->~Base(); } }
+        // Constructs a Variant in the storage; a Variant that cannot fit fails to compile.
+        template<typename Variant, typename... Args>
+        void Initialize(Args&&... args) {
+            static_assert(sizeof(Variant) <= sizeof(Space), "Variant does not fit in Space.");
+            new(&fSpace) Variant(std::forward<Args>(args)...);
+            fIsInitialized = true;
+        }
+        Base* get() const { return reinterpret_cast<Base*>(&fSpace); }
+        Base* operator->() const { return get(); }
+        Base& operator*() const { return *get(); }
+
+    private:
+        struct SK_STRUCT_ALIGN(16) Space {
+            char space[kSize];
+        };
+        mutable Space fSpace;
+        bool          fIsInitialized;  // True once Initialize() has constructed an object.
+    };
+    // The number is kSize: the bytes of storage reserved for that kind of stage.
+    using MatrixStage = PolymorphicUnion<PointProcessorInterface, 112>;
+    using TileStage   = PolymorphicUnion<PointProcessorInterface,  96>;
+    using SampleStage = PolymorphicUnion<PointProcessorInterface,  80>;
+    using PixelStage  = PolymorphicUnion<PixelPlacerInterface,     80>;
+
+private:
+    PointProcessorInterface* fFirstStage;  // Where shadeSpan4f sends its points.
+    MatrixStage fMatrixStage;
+    TileStage   fTileXOrBothStage;
+    TileStage   fTileYStage;
+    SampleStage fSampleStage;
+    PixelStage  fPixelStage;
+};
+
+#endif  // SkLinearBitmapPipeline_DEFINED
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index 8722bf6..166557d 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -198,6 +198,7 @@
 typedef SkNx<16,  uint8_t> Sk16b;
 typedef SkNx<4,  uint16_t> Sk4h;
 typedef SkNx<16, uint16_t> Sk16h;
+typedef SkNx<4,       int> Sk4i;
 
 typedef SkNx<4, int> Sk4i;
 
diff --git a/tests/SkLinearBitmapPipelineTest.cpp b/tests/SkLinearBitmapPipelineTest.cpp
new file mode 100644
index 0000000..dbe0dee
--- /dev/null
+++ b/tests/SkLinearBitmapPipelineTest.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#include "SkLinearBitmapPipeline.h"
+
+#include "SkColor.h"
+
+#include "Test.h"
+
+struct SinkBilerpProcessor final : public PointProcessorInterface {
+    Sk4f fXs;  // x coordinates from the most recent call.
+    Sk4f fYs;  // y coordinates from the most recent call.
+
+    void pointList4(Sk4fArg Xs, Sk4fArg Ys) override { fYs = Ys; fXs = Xs; }
+    void pointListFew(int /*n*/, Sk4fArg xs, Sk4fArg ys) override { fYs = ys; fXs = xs; }
+};
+
+using Pixel = float[4];  // The test below reads each SkPM4f through this as four floats.
+// Smoke test: shades one 10-pixel span of a 10x10 source and prints it; nothing is asserted.
+DEF_TEST(SkBitmapFP, reporter) {
+    const int width = 10;
+    const int height = 10;
+    // Texel (x, y) holds x in bits 0-7, y in bits 8-15 and 128 in bits 24-31. The 128 is
+    // shifted as unsigned because 128 << 24 does not fit in a signed int.
+    uint32_t* bitmap = new uint32_t[width * height];
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            bitmap[y * width + x] = (128u << 24) + (y << 8) + x;
+        }
+    }
+    SkPM4f* FPbuffer = new SkPM4f[width * height];
+
+    // Identity transform, still passed through invert() like any other matrix would be.
+    SkMatrix m = SkMatrix::I();
+    //m.setRotate(30.0f, 1.0f, 1.0f);
+    SkMatrix invert;
+    bool trash = m.invert(&invert);
+    sk_ignore_unused_variable(trash);
+
+    const SkImageInfo info =
+        SkImageInfo::MakeN32Premul(width, height, kLinear_SkColorProfileType);
+    SkLinearBitmapPipeline pipeline{invert, SkShader::kClamp_TileMode,
+                                    SkShader::kClamp_TileMode, info, bitmap};
+
+    // Starts at x = 3 on row 6, so the span runs past the right edge of the 10-wide source.
+    const int count = 10;
+    pipeline.shadeSpan4f(3, 6, FPbuffer, count);
+
+    Pixel* pixelBuffer = (Pixel*)FPbuffer;
+    for (int i = 0; i < count; i++) {
+        printf("i: %d - (%g, %g, %g, %g)\n", i,
+               pixelBuffer[i][0] * 255.0f,
+               pixelBuffer[i][1] * 255.0f,
+               pixelBuffer[i][2] * 255.0f,
+               pixelBuffer[i][3] * 255.0f);
+    }
+
+    delete [] bitmap;
+    delete [] FPbuffer;
+}
+