build custom blitter for drawAtlas

- add uniform_color_dst stage
- add stageupdate option for shaders

More cases we could try to handle in the future:
- perspective (not hard)
- mipmaps (plumbing is there, need to re-call bitmapstate helper)

Before

  10025.60  	drawAtlas_3	8888
   6636.06  	drawAtlas_2	8888

After

   3566.18  	drawAtlas_3	8888
   2585.83 ?	drawAtlas_2	8888

Change-Id: I656231324c0390029f6d08941c4f9d11ccdb8e87
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/233061
Commit-Queue: Mike Reed <reed@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkDraw_atlas.cpp b/src/core/SkDraw_atlas.cpp
index 05681b2..fdaa9a4 100644
--- a/src/core/SkDraw_atlas.cpp
+++ b/src/core/SkDraw_atlas.cpp
@@ -7,35 +7,110 @@
 
 #include "include/core/SkColorFilter.h"
 #include "include/core/SkRSXform.h"
+#include "src/core/SkBlendModePriv.h"
+#include "src/core/SkColorSpacePriv.h"
+#include "src/core/SkColorSpaceXformSteps.h"
+#include "src/core/SkCoreBlitters.h"
 #include "src/core/SkDraw.h"
+#include "src/core/SkRasterPipeline.h"
 #include "src/core/SkScan.h"
 #include "src/shaders/SkShaderBase.h"
 
+#include "include/core/SkMatrix.h"
+#include "src/core/SkScan.h"
+
+static void fill_rect(const SkMatrix& ctm, const SkRasterClip& rc,
+                      const SkRect& r, SkBlitter* blitter) {
+    if (ctm.rectStaysRect()) {
+        SkRect dr;
+        ctm.mapRect(&dr, r);
+        SkScan::FillRect(dr, rc, blitter);
+    } else {
+        SkPath path;
+        path.addRect(r);
+        path.transform(ctm);
+        SkScan::FillPath(path, rc, blitter);
+    }
+}
+
+static void load_color(SkRasterPipeline_UniformColorCtx* ctx, const float rgba[]) {
+    // only need one of these. can I query the pipeline to know if it's lowp or highp?
+    ctx->rgba[0] = SkScalarRoundToInt(rgba[0]*255); ctx->r = rgba[0];
+    ctx->rgba[1] = SkScalarRoundToInt(rgba[1]*255); ctx->g = rgba[1];
+    ctx->rgba[2] = SkScalarRoundToInt(rgba[2]*255); ctx->b = rgba[2];
+    ctx->rgba[3] = SkScalarRoundToInt(rgba[3]*255); ctx->a = rgba[3];
+}
+
 void SkDraw::drawAtlas(const SkImage* atlas, const SkRSXform xform[], const SkRect textures[],
                        const SkColor colors[], int count, SkBlendMode bmode, const SkPaint& paint) {
-    SkDraw draw(*this);
-    SkPaint p(paint);
+    sk_sp<SkShader> atlasShader = atlas->makeShader();
+    if (!atlasShader) {
+        return;
+    }
 
+    SkPaint p(paint);
     p.setAntiAlias(false);  // we never respect this for drawAtlas(or drawVertices)
     p.setStyle(SkPaint::kFill_Style);
     p.setShader(nullptr);
     p.setMaskFilter(nullptr);
 
-    sk_sp<SkShader> atlasShader = atlas->makeShader();
-    if (!atlasShader) {
+    SkSTArenaAlloc<256> alloc;
+    SkRasterPipeline pipeline(&alloc);
+    SkStageRec rec = {
+        &pipeline, &alloc, fDst.colorType(), fDst.colorSpace(), p, nullptr, *fMatrix
+    };
+
+    SkStageUpdater* updator = as_SB(atlasShader.get())->appendUpdatableStages(rec);
+    if (!updator) {
+        SkDraw draw(*this);
+
+        p.setShader(atlasShader);
+        for (int i = 0; i < count; ++i) {
+            if (colors) {
+                p.setShader(SkShaders::Blend(bmode, SkShaders::Color(colors[i]), atlasShader));
+            }
+            SkMatrix mx;
+            mx.setRSXform(xform[i]);
+            mx.preTranslate(-textures[i].fLeft, -textures[i].fTop);
+            mx.postConcat(*fMatrix);
+            draw.fMatrix = &mx;
+            draw.drawRect(textures[i], p);
+        }
         return;
     }
-    p.setShader(atlasShader);
 
-    SkMatrix xf;
+    SkRasterPipeline_UniformColorCtx* uniformCtx = nullptr;
+    SkColorSpaceXformSteps steps(sk_srgb_singleton(), kUnpremul_SkAlphaType,
+                                 rec.fDstCS,          kUnpremul_SkAlphaType);
+
+    if (colors) {
+        // we will late-bind the values in ctx, once for each color in the loop
+        uniformCtx = alloc.make<SkRasterPipeline_UniformColorCtx>();
+        rec.fPipeline->append(SkRasterPipeline::uniform_color_dst, uniformCtx);
+        SkBlendMode_AppendStages(bmode, rec.fPipeline);
+    }
+
+    bool isOpaque = !colors && atlasShader->isOpaque();
+    if (p.getAlphaf() != 1) {
+        rec.fPipeline->append(SkRasterPipeline::scale_1_float, alloc.make<float>(p.getAlphaf()));
+        isOpaque = false;
+    }
+
+    auto blitter = SkCreateRasterPipelineBlitter(fDst, p, pipeline, isOpaque, &alloc);
+
     for (int i = 0; i < count; ++i) {
         if (colors) {
-            p.setShader(SkShaders::Blend(bmode, SkShaders::Color(colors[i]), atlasShader));
+            SkColor4f c4 = SkColor4f::FromColor(colors[i]);
+            steps.apply(c4.vec());
+            load_color(uniformCtx, c4.premul().vec());
         }
-        xf.setRSXform(xform[i]);
-        xf.preTranslate(-textures[i].fLeft, -textures[i].fTop);
-        xf.postConcat(*fMatrix);
-        draw.fMatrix = &xf;
-        draw.drawRect(textures[i], p);
+
+        SkMatrix mx;
+        mx.setRSXform(xform[i]);
+        mx.preTranslate(-textures[i].fLeft, -textures[i].fTop);
+        mx.postConcat(*fMatrix);
+
+        updator->update(mx, nullptr);
+        fill_rect(mx, *fRC, textures[i], blitter);
     }
 }
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 8552c02..8a3e0cc 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -41,7 +41,8 @@
     M(force_opaque) M(force_opaque_dst)                            \
     M(set_rgb) M(unbounded_set_rgb) M(swap_rb) M(swap_rb_dst)      \
     M(from_srgb) M(to_srgb)                                        \
-    M(black_color) M(white_color) M(uniform_color) M(unbounded_uniform_color) \
+    M(black_color) M(white_color)                                  \
+    M(uniform_color) M(unbounded_uniform_color) M(uniform_color_dst) \
     M(seed_shader) M(dither)                                       \
     M(load_a8)     M(load_a8_dst)   M(store_a8)    M(gather_a8)    \
     M(load_565)    M(load_565_dst)  M(store_565)   M(gather_565)   \
@@ -265,7 +266,6 @@
 
     bool empty() const { return fStages == nullptr; }
 
-
 private:
     struct StageList {
         StageList* prev;
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 702a41a..5dc6da9 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -1317,6 +1317,13 @@
     b = c->b;
     a = c->a;
 }
+// load 4 floats from memory, and splat them into dr,dg,db,da
+STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
+    dr = c->r;
+    dg = c->g;
+    db = c->b;
+    da = c->a;
+}
 
 // splats opaque-black into r,g,b,a
 STAGE(black_color, Ctx::None) {
@@ -3145,6 +3152,12 @@
     b = c->rgba[2];
     a = c->rgba[3];
 }
+STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
+    dr = c->rgba[0];
+    dg = c->rgba[1];
+    db = c->rgba[2];
+    da = c->rgba[3];
+}
 STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
 STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }
 
diff --git a/src/shaders/SkImageShader.cpp b/src/shaders/SkImageShader.cpp
index 28ff579..57523c7 100644
--- a/src/shaders/SkImageShader.cpp
+++ b/src/shaders/SkImageShader.cpp
@@ -291,15 +291,45 @@
 
 void SkShaderBase::RegisterFlattenables() { SK_REGISTER_FLATTENABLE(SkImageShader); }
 
-bool SkImageShader::onAppendStages(const SkStageRec& rec) const {
+class SkImageStageUpdater : public SkStageUpdater {
+public:
+    const SkImageShader* fShader;
+
+    float fMatrixStorage[6];
+
+#if 0   // TODO: when we support mipmaps
+    SkRasterPipeline_GatherCtx* fGather;
+    SkRasterPipeline_TileCtx* fLimitX;
+    SkRasterPipeline_TileCtx* fLimitY;
+    SkRasterPipeline_DecalTileCtx* fDecal;
+#endif
+
+    bool update(const SkMatrix& ctm, const SkMatrix* localM) override {
+        SkMatrix matrix;
+        return fShader->computeTotalInverse(ctm, localM, &matrix) &&
+               matrix.asAffine(fMatrixStorage);
+    }
+};
+
+bool SkImageShader::doStages(const SkStageRec& rec, SkImageStageUpdater* updater) const {
+    if (updater &&
+        (rec.fPaint.getFilterQuality() == kMedium_SkFilterQuality ||
+         rec.fCTM.hasPerspective()))
+    {
+        // TODO: handle these cases
+        // medium: re-call RequestBitmap and update width/height accordingly
+        // perspective: store 9 floats and use persp stage
+        return false;
+    }
+
     SkRasterPipeline* p = rec.fPipeline;
     SkArenaAlloc* alloc = rec.fAlloc;
+    auto quality = rec.fPaint.getFilterQuality();
 
     SkMatrix matrix;
     if (!this->computeTotalInverse(rec.fCTM, rec.fLocalM, &matrix)) {
         return false;
     }
-    auto quality = rec.fPaint.getFilterQuality();
 
     const auto* state = SkBitmapController::RequestBitmap(as_IB(fImage.get()),
                                                           matrix, quality, alloc);
@@ -312,28 +342,32 @@
     quality = state->quality();
     auto info = pm.info();
 
-    // When the matrix is just an integer translate, bilerp == nearest neighbor.
-    if (quality == kLow_SkFilterQuality &&
-        matrix.getType() <= SkMatrix::kTranslate_Mask &&
-        matrix.getTranslateX() == (int)matrix.getTranslateX() &&
-        matrix.getTranslateY() == (int)matrix.getTranslateY()) {
-        quality = kNone_SkFilterQuality;
-    }
-
-    // See skia:4649 and the GM image_scale_aligned.
-    if (quality == kNone_SkFilterQuality) {
-        if (matrix.getScaleX() >= 0) {
-            matrix.setTranslateX(nextafterf(matrix.getTranslateX(),
-                                            floorf(matrix.getTranslateX())));
-        }
-        if (matrix.getScaleY() >= 0) {
-            matrix.setTranslateY(nextafterf(matrix.getTranslateY(),
-                                            floorf(matrix.getTranslateY())));
-        }
-    }
-
     p->append(SkRasterPipeline::seed_shader);
-    p->append_matrix(alloc, matrix);
+
+    if (updater) {
+        p->append(SkRasterPipeline::matrix_2x3, updater->fMatrixStorage);
+    } else {
+        // When the matrix is just an integer translate, bilerp == nearest neighbor.
+        if (quality == kLow_SkFilterQuality &&
+            matrix.getType() <= SkMatrix::kTranslate_Mask &&
+            matrix.getTranslateX() == (int)matrix.getTranslateX() &&
+            matrix.getTranslateY() == (int)matrix.getTranslateY()) {
+            quality = kNone_SkFilterQuality;
+        }
+
+        // See skia:4649 and the GM image_scale_aligned.
+        if (quality == kNone_SkFilterQuality) {
+            if (matrix.getScaleX() >= 0) {
+                matrix.setTranslateX(nextafterf(matrix.getTranslateX(),
+                                                floorf(matrix.getTranslateX())));
+            }
+            if (matrix.getScaleY() >= 0) {
+                matrix.setTranslateY(nextafterf(matrix.getTranslateY(),
+                                                floorf(matrix.getTranslateY())));
+            }
+        }
+        p->append_matrix(alloc, matrix);
+    }
 
     auto gather = alloc->make<SkRasterPipeline_GatherCtx>();
     gather->pixels = pm.addr();
@@ -356,6 +390,16 @@
         decal_ctx->limit_y = limit_y->scale;
     }
 
+#if 0   // TODO: when we support kMedium
+    if (updater && (quality == kMedium_SkFilterQuality)) {
+        // if we change levels in mipmap, we need to update the scales (and invScales)
+        updater->fGather = gather;
+        updater->fLimitX = limit_x;
+        updater->fLimitY = limit_y;
+        updater->fDecal = decal_ctx;
+    }
+#endif
+
     auto append_tiling_and_gather = [&] {
         if (decal_x_and_y) {
             p->append(SkRasterPipeline::decal_x_and_y,  decal_ctx);
@@ -439,7 +483,7 @@
         return true;
     };
 
-    // We've got a fast path for 8888 bilinear clamp/clamp sampling.
+    // Check for fast-path stages.
     auto ct = info.colorType();
     if (true
         && (ct == kRGBA_8888_SkColorType || ct == kBGRA_8888_SkColorType)
@@ -511,7 +555,6 @@
 
     if (quality == kNone_SkFilterQuality) {
         append_tiling_and_gather();
-
     } else if (quality == kLow_SkFilterQuality) {
         p->append(SkRasterPipeline::save_xy, sampler);
 
@@ -550,3 +593,14 @@
 
     return append_misc();
 }
+
+bool SkImageShader::onAppendStages(const SkStageRec& rec) const {
+    return this->doStages(rec, nullptr);
+}
+
+SkStageUpdater* SkImageShader::onAppendUpdatableStages(const SkStageRec& rec) const {
+    auto updater = rec.fAlloc->make<SkImageStageUpdater>();
+    updater->fShader = this;
+    return this->doStages(rec, updater) ? updater : nullptr;
+}
+
diff --git a/src/shaders/SkImageShader.h b/src/shaders/SkImageShader.h
index 6990a4a..5a01f0a 100644
--- a/src/shaders/SkImageShader.h
+++ b/src/shaders/SkImageShader.h
@@ -12,6 +12,9 @@
 #include "src/shaders/SkBitmapProcShader.h"
 #include "src/shaders/SkShaderBase.h"
 
+// private subclass of SkStageUpdater
+class SkImageStageUpdater;
+
 class SkImageShader : public SkShaderBase {
 public:
     static sk_sp<SkShader> Make(sk_sp<SkImage>,
@@ -42,6 +45,9 @@
     SkImage* onIsAImage(SkMatrix*, SkTileMode*) const override;
 
     bool onAppendStages(const SkStageRec&) const override;
+    SkStageUpdater* onAppendUpdatableStages(const SkStageRec&) const override;
+
+    bool doStages(const SkStageRec&, SkImageStageUpdater* = nullptr) const;
 
     sk_sp<SkImage>   fImage;
     const SkTileMode fTileModeX;
diff --git a/src/shaders/SkShaderBase.h b/src/shaders/SkShaderBase.h
index 58634f3..e071e22 100644
--- a/src/shaders/SkShaderBase.h
+++ b/src/shaders/SkShaderBase.h
@@ -29,6 +29,23 @@
 class SkPaint;
 class SkRasterPipeline;
 
+/**
+ *  Shaders can optionally return a subclass of this when appending their stages.
+ *  Doing so tells the caller that the stages can be reused with different CTMs (but nothing
+ *  else can change), by calling the updater's update() method before each use.
+ *
+ *  This can be a perf-win for bulk draws like drawAtlas and drawVertices, where most of the setup
+ *  (i.e. uniforms) is constant, and only something small is changing (i.e. matrices). This
+ *  reuse skips the cost of computing the stages (and/or avoids having to allocate a separate
+ *  shader for each small draw).
+ */
+class SkStageUpdater {
+public:
+    virtual ~SkStageUpdater() {}
+
+    virtual bool update(const SkMatrix& ctm, const SkMatrix* localM) = 0;
+};
+
 class SkShaderBase : public SkShader {
 public:
     ~SkShaderBase() override;
@@ -185,6 +202,10 @@
      */
     virtual sk_sp<SkShader> makeAsALocalMatrixShader(SkMatrix* localMatrix) const;
 
+    SkStageUpdater* appendUpdatableStages(const SkStageRec& rec) const {
+        return this->onAppendUpdatableStages(rec);
+    }
+
 protected:
     SkShaderBase(const SkMatrix* localMatrix = nullptr);
 
@@ -207,6 +228,8 @@
     // Default impl creates shadercontext and calls that (not very efficient)
     virtual bool onAppendStages(const SkStageRec&) const;
 
+    virtual SkStageUpdater* onAppendUpdatableStages(const SkStageRec&) const { return nullptr; }
+
 private:
     // This is essentially const, but not officially so it can be modified in constructors.
     SkMatrix fLocalMatrix;