Adapt GrRectBlur for cases with rect W/H less than six sigma.

Add GM that generates reference blur rect images and compares against
actual.

Change-Id: If0ce291e211fefe96af8afdf0a60636b5f40ef47
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/239757
Commit-Queue: Brian Salomon <bsalomon@google.com>
Reviewed-by: Robert Phillips <robertphillips@google.com>
diff --git a/gm/blurrect.cpp b/gm/blurrect.cpp
index b88132c..caca08c 100644
--- a/gm/blurrect.cpp
+++ b/gm/blurrect.cpp
@@ -5,11 +5,13 @@
 * found in the LICENSE file.
 */
 
+#include <cmath>
 #include "gm/gm.h"
 #include "include/core/SkBitmap.h"
 #include "include/core/SkBlurTypes.h"
 #include "include/core/SkCanvas.h"
 #include "include/core/SkColor.h"
+#include "include/core/SkImage.h"
 #include "include/core/SkMaskFilter.h"
 #include "include/core/SkMatrix.h"
 #include "include/core/SkPaint.h"
@@ -21,12 +23,16 @@
 #include "include/core/SkShader.h"
 #include "include/core/SkSize.h"
 #include "include/core/SkString.h"
+#include "include/core/SkSurface.h"
 #include "include/core/SkTileMode.h"
 #include "include/core/SkTypes.h"
 #include "include/effects/SkGradientShader.h"
+#include "include/gpu/GrContext.h"
 #include "include/private/SkTo.h"
 #include "src/core/SkBlurMask.h"
 #include "src/core/SkMask.h"
+#include "src/gpu/GrContextPriv.h"
+#include "tools/timer/TimeUtils.h"
 
 #define STROKE_WIDTH    SkIntToScalar(10)
 
@@ -234,6 +240,255 @@
         }
 }
 
+namespace skiagm {
+
+// Compares actual blur rects with reference masks created by the GM. Animates sigma in viewer.
+class BlurRectCompareGM : public GM {
+protected:
+    SkString onShortName() override { return SkString("blurrect_compare"); }
+
+    SkISize onISize() override { return {900, 1220}; }
+
+    void onOnceBeforeDraw() override { this->prepareReferenceMasks(); }
+
+    void onDraw(SkCanvas* canvas) override {
+        int32_t ctxID = canvas->getGrContext() ? canvas->getGrContext()->priv().contextID() : 0;
+        if (fRecalcMasksForAnimation || !fActualMasks[0][0][0] || ctxID != fLastContextUniqueID) {
+            if (fRecalcMasksForAnimation) {
+                // Sigma is changing so references must also be recalculated.
+                this->prepareReferenceMasks();
+            }
+            this->prepareActualMasks(canvas);
+            this->prepareMaskDifferences(canvas);
+            fLastContextUniqueID = ctxID;
+            fRecalcMasksForAnimation = false;
+        }
+        canvas->clear(SK_ColorBLACK);
+        static constexpr float kMargin = 30;
+        float totalW = 0;
+        for (auto w : kSizes) {
+            totalW += w + kMargin;
+        }
+        canvas->translate(kMargin, kMargin);
+        for (int mode = 0; mode < 3; ++mode) {
+            canvas->save();
+            for (size_t sigmaIdx = 0; sigmaIdx < kNumSigmas; ++sigmaIdx) {
+                auto sigma = kSigmas[sigmaIdx] + fSigmaAnimationBoost;
+                for (size_t heightIdx = 0; heightIdx < kNumSizes; ++heightIdx) {
+                    auto h = kSizes[heightIdx];
+                    canvas->save();
+                    for (size_t widthIdx = 0; widthIdx < kNumSizes; ++widthIdx) {
+                        auto w = kSizes[widthIdx];
+                        SkPaint paint;
+                        paint.setColor(SK_ColorWHITE);
+                        SkImage* img;
+                        switch (mode) {
+                            case 0:
+                                img = fReferenceMasks[sigmaIdx][heightIdx][widthIdx].get();
+                                break;
+                            case 1:
+                                img = fActualMasks[sigmaIdx][heightIdx][widthIdx].get();
+                                break;
+                            case 2:
+                                img = fMaskDifferences[sigmaIdx][heightIdx][widthIdx].get();
+                                // The error images are opaque, use kPlus so they are additive if
+                                // they overlap between test cases.
+                                paint.setBlendMode(SkBlendMode::kPlus);
+                                break;
+                        }
+                        auto pad = PadForSigma(sigma);
+                        canvas->drawImage(img, -pad, -pad, &paint);
+#if 0  // Uncomment to hairline stroke around blurred rect in red on top of the blur result.
+       // The rect is defined at integer coords. We inset by 1/2 pixel so our stroke lies on top
+       // of the edge pixels.
+                        SkPaint stroke;
+                        stroke.setColor(SK_ColorRED);
+                        stroke.setStrokeWidth(0.f);
+                        stroke.setStyle(SkPaint::kStroke_Style);
+                        canvas->drawRect(SkRect::MakeWH(w, h).makeInset(0.5, 0.5), stroke);
+#endif
+                        canvas->translate(w + kMargin, 0.f);
+                    }
+                    canvas->restore();
+                    canvas->translate(0, h + kMargin);
+                }
+            }
+            canvas->restore();
+            canvas->translate(totalW + 2 * kMargin, 0);
+        }
+    }
+    bool onAnimate(double nanos) override {
+        fSigmaAnimationBoost = TimeUtils::SineWave(nanos, 5, 2.5f, 0.f, 2.f);
+        fRecalcMasksForAnimation = true;
+        return true;
+    }
+
+private:
+    void prepareReferenceMasks() {
+        auto create_reference_mask = [](int w, int h, float sigma, int numSubpixels) {
+            int pad = PadForSigma(sigma);
+            int maskW = w + 2 * pad;
+            int maskH = h + 2 * pad;
+            // We'll do all our calculations at subpixel resolution, so adjust params
+            w *= numSubpixels;
+            h *= numSubpixels;
+            sigma *= numSubpixels;
+            auto scale = SK_ScalarRoot2Over2 / sigma;
+            auto def_integral_approx = [scale](float a, float b) {
+                return 0.5f * (std::erf(b * scale) - std::erf(a * scale));
+            };
+            // Do the x-pass. Above/below rect are rows of zero. All rows that intersect the rect
+            // are the same. The row is calculated and stored at subpixel resolution.
+            SkASSERT(!(numSubpixels & 0b1));
+            std::unique_ptr<float[]> row(new float[maskW * numSubpixels]);
+            for (int col = 0; col < maskW * numSubpixels; ++col) {
+                // Compute distance to rect left in subpixel units
+                float ldiff = numSubpixels * pad - (col + 0.5f);
+                float rdiff = ldiff + w;
+                row[col] = def_integral_approx(ldiff, rdiff);
+            }
+            // y-pass
+            SkBitmap bmp;
+            bmp.allocPixels(SkImageInfo::MakeA8(maskW, maskH));
+            std::unique_ptr<float[]> accums(new float[maskW]);
+            const float accumScale = 1.f / (numSubpixels * numSubpixels);
+            for (int y = 0; y < maskH; ++y) {
+                // Initialize subpixel accumulation buffer for this row.
+                std::fill_n(accums.get(), maskW, 0);
+                for (int ys = 0; ys < numSubpixels; ++ys) {
+                    // At each subpixel we want to integrate over the kernel centered at the
+                    // subpixel multiplied by the x-pass. The x-pass is zero above and below the
+                    // rect and constant valued from rect top to rect bottom. So we can get the
+                    // integral of just the kernel from rect top to rect bottom and multiply by
+                    // the single x-pass value from our precomputed row.
+                    float tdiff = numSubpixels * pad - (y * numSubpixels + ys + 0.5f);
+                    float bdiff = tdiff + h;
+                    auto w = def_integral_approx(tdiff, bdiff);
+                    for (int x = 0; x < maskW; ++x) {
+                        for (int xs = 0; xs < numSubpixels; ++xs) {
+                            int rowIdx = x * numSubpixels + xs;
+                            accums[x] += w * row[rowIdx];
+                        }
+                    }
+                }
+                for (int x = 0; x < maskW; ++x) {
+                    auto result = accums[x] * accumScale;
+                    *bmp.getAddr8(x, y) = SkToU8(sk_float_round2int(255.f * result));
+                }
+            }
+            return SkImage::MakeFromBitmap(bmp);
+        };
+
+        // Number of times to subsample (in both X and Y). If fRecalcMasksForAnimation is true
+        // then we're animating, don't subsample as much to keep fps higher.
+        const int numSubpixels = fRecalcMasksForAnimation ? 2 : 8;
+
+        for (size_t sigmaIdx = 0; sigmaIdx < kNumSigmas; ++sigmaIdx) {
+            auto sigma = kSigmas[sigmaIdx] + fSigmaAnimationBoost;
+            for (size_t heightIdx = 0; heightIdx < kNumSizes; ++heightIdx) {
+                auto h = kSizes[heightIdx];
+                for (size_t widthIdx = 0; widthIdx < kNumSizes; ++widthIdx) {
+                    auto w = kSizes[widthIdx];
+                    fReferenceMasks[sigmaIdx][heightIdx][widthIdx] =
+                            create_reference_mask(w, h, sigma, numSubpixels);
+                }
+            }
+        }
+    }
+
+    void prepareActualMasks(SkCanvas* canvas) {
+        for (size_t sigmaIdx = 0; sigmaIdx < kNumSigmas; ++sigmaIdx) {
+            auto sigma = kSigmas[sigmaIdx] + fSigmaAnimationBoost;
+            for (size_t heightIdx = 0; heightIdx < kNumSizes; ++heightIdx) {
+                auto h = kSizes[heightIdx];
+                for (size_t widthIdx = 0; widthIdx < kNumSizes; ++widthIdx) {
+                    auto w = kSizes[widthIdx];
+                    auto pad = PadForSigma(sigma);
+                    auto ii = SkImageInfo::MakeA8(w + 2 * pad, h + 2 * pad);
+                    auto surf = canvas->makeSurface(ii);
+                    if (!surf) {
+                        return;
+                    }
+                    auto rect = SkRect::MakeXYWH(pad, pad, w, h);
+                    SkPaint paint;
+                    paint.setMaskFilter(SkMaskFilter::MakeBlur(kNormal_SkBlurStyle, sigma));
+                    surf->getCanvas()->drawRect(rect, paint);
+                    fActualMasks[sigmaIdx][heightIdx][widthIdx] = surf->makeImageSnapshot();
+                }
+            }
+        }
+    }
+
+    void prepareMaskDifferences(SkCanvas* canvas) {
+        for (size_t sigmaIdx = 0; sigmaIdx < kNumSigmas; ++sigmaIdx) {
+            for (size_t heightIdx = 0; heightIdx < kNumSizes; ++heightIdx) {
+                for (size_t widthIdx = 0; widthIdx < kNumSizes; ++widthIdx) {
+                    const auto& r =  fReferenceMasks[sigmaIdx][heightIdx][widthIdx];
+                    const auto& a =     fActualMasks[sigmaIdx][heightIdx][widthIdx];
+                          auto& d = fMaskDifferences[sigmaIdx][heightIdx][widthIdx];
+                    // The actual image might not be present if we're on an abandoned GrContext.
+                    if (!a) {
+                        d.reset();
+                        continue;
+                    }
+                    SkASSERT(r->width()  == a->width());
+                    SkASSERT(r->height() == a->height());
+                    auto ii = SkImageInfo::Make(r->width(), r->height(),
+                                                kRGBA_8888_SkColorType, kPremul_SkAlphaType);
+                    auto surf = canvas->makeSurface(ii);
+                    if (!surf) {
+                        return;
+                    }
+                    // We visualize the difference by turning both the alpha masks into opaque green
+                    // images (where alpha becomes the green channel) and then perform a
+                    // SkBlendMode::kDifference between them.
+                    SkPaint filterPaint;
+                    filterPaint.setColor(SK_ColorWHITE);
+                    // Actually 8 * alpha becomes green to really highlight differences.
+                    static constexpr float kGreenifyM[] = {0, 0, 0, 0, 0,
+                                                           0, 0, 0, 8, 0,
+                                                           0, 0, 0, 0, 0,
+                                                           0, 0, 0, 0, 1};
+                    auto greenifyCF = SkColorFilters::Matrix(kGreenifyM);
+                    SkPaint paint;
+                    paint.setBlendMode(SkBlendMode::kSrc);
+                    paint.setColorFilter(std::move(greenifyCF));
+                    surf->getCanvas()->drawImage(a, 0, 0, &paint);
+                    paint.setBlendMode(SkBlendMode::kDifference);
+                    surf->getCanvas()->drawImage(r, 0, 0, &paint);
+                    d = surf->makeImageSnapshot();
+                }
+            }
+        }
+    }
+
+    // Per side padding around mask images for a sigma. Make this overly generous to ensure bugs
+    // related to big blurs are fully visible.
+    static int PadForSigma(float sigma) { return sk_float_ceil2int(4 * sigma); }
+
+    static constexpr int kSizes[] = {1, 2, 4, 8, 16, 32};
+    static constexpr float kSigmas[] = {0.5f, 1.2f, 2.3f, 3.9f, 7.4f};
+    static constexpr size_t kNumSizes = SK_ARRAY_COUNT(kSizes);
+    static constexpr size_t kNumSigmas = SK_ARRAY_COUNT(kSigmas);
+
+    sk_sp<SkImage> fReferenceMasks[kNumSigmas][kNumSizes][kNumSizes];
+    sk_sp<SkImage> fActualMasks[kNumSigmas][kNumSizes][kNumSizes];
+    sk_sp<SkImage> fMaskDifferences[kNumSigmas][kNumSizes][kNumSizes];
+    int32_t fLastContextUniqueID;
+    // These are used only when animating.
+    float fSigmaAnimationBoost = 0;
+    bool fRecalcMasksForAnimation = false;
+};
+
+// Delete these when C++17.
+constexpr int BlurRectCompareGM::kSizes[];
+constexpr float BlurRectCompareGM::kSigmas[];
+constexpr size_t BlurRectCompareGM::kNumSizes;
+constexpr size_t BlurRectCompareGM::kNumSigmas;
+
+}  // namespace skiagm
+
 //////////////////////////////////////////////////////////////////////////////
 
 DEF_GM(return new BlurRectGM("blurrects", 0xFF);)
+DEF_GM(return new skiagm::BlurRectCompareGM();)
diff --git a/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-ASAN.json b/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-ASAN.json
index 2e253a1..a09cb22 100644
--- a/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-ASAN.json
+++ b/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-ASAN.json
@@ -449,6 +449,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "complexclip4_aa",
       "serialize-8888",
       "gm",
@@ -457,6 +465,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "p3",
       "serialize-8888",
       "gm",
@@ -465,6 +481,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_up_large",
       "serialize-8888",
       "gm",
@@ -473,6 +497,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_up",
       "serialize-8888",
       "gm",
@@ -481,6 +513,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_down",
       "serialize-8888",
       "gm",
@@ -489,6 +529,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_dog_up",
       "serialize-8888",
       "gm",
@@ -497,6 +545,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_dog_down",
       "serialize-8888",
       "gm",
@@ -505,6 +561,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_rose",
       "serialize-8888",
       "gm",
@@ -513,11 +577,27 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_no_bleed",
       "serialize-8888",
       "gm",
       "_",
       "async_rescale_and_read_no_bleed",
+      "pic-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
       "tiles_rt-8888",
       "gm",
       "_",
diff --git a/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-BonusConfigs.json b/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-BonusConfigs.json
index fd43325..baf2285 100644
--- a/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-BonusConfigs.json
+++ b/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-BonusConfigs.json
@@ -525,6 +525,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "complexclip4_aa",
       "serialize-8888",
       "gm",
@@ -533,6 +541,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "p3",
       "serialize-8888",
       "gm",
@@ -541,6 +557,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_up_large",
       "serialize-8888",
       "gm",
@@ -549,6 +573,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_up",
       "serialize-8888",
       "gm",
@@ -557,6 +589,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_down",
       "serialize-8888",
       "gm",
@@ -565,6 +605,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_dog_up",
       "serialize-8888",
       "gm",
@@ -573,6 +621,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_dog_down",
       "serialize-8888",
       "gm",
@@ -581,6 +637,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_rose",
       "serialize-8888",
       "gm",
@@ -589,11 +653,27 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_no_bleed",
       "serialize-8888",
       "gm",
       "_",
       "async_rescale_and_read_no_bleed",
+      "pic-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
       "tiles_rt-8888",
       "gm",
       "_",
diff --git a/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-MSAN.json b/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-MSAN.json
index fd751f5..c0614ce 100644
--- a/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-MSAN.json
+++ b/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Debug-All-MSAN.json
@@ -443,6 +443,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "complexclip4_aa",
       "serialize-8888",
       "gm",
@@ -451,6 +459,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "p3",
       "serialize-8888",
       "gm",
@@ -459,6 +475,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_up_large",
       "serialize-8888",
       "gm",
@@ -467,6 +491,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_up",
       "serialize-8888",
       "gm",
@@ -475,6 +507,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_down",
       "serialize-8888",
       "gm",
@@ -483,6 +523,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_dog_up",
       "serialize-8888",
       "gm",
@@ -491,6 +539,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_dog_down",
       "serialize-8888",
       "gm",
@@ -499,6 +555,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_rose",
       "serialize-8888",
       "gm",
@@ -507,11 +571,27 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_no_bleed",
       "serialize-8888",
       "gm",
       "_",
       "async_rescale_and_read_no_bleed",
+      "pic-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
       "tiles_rt-8888",
       "gm",
       "_",
diff --git a/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-TSAN.json b/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-TSAN.json
index 6647f2f..713cebb 100644
--- a/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-TSAN.json
+++ b/infra/bots/recipes/test.expected/Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-TSAN.json
@@ -444,6 +444,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "complexclip4_aa",
       "serialize-8888",
       "gm",
@@ -452,6 +460,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "p3",
       "serialize-8888",
       "gm",
@@ -460,6 +476,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_up_large",
       "serialize-8888",
       "gm",
@@ -468,6 +492,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_up",
       "serialize-8888",
       "gm",
@@ -476,6 +508,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_text_down",
       "serialize-8888",
       "gm",
@@ -484,6 +524,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_dog_up",
       "serialize-8888",
       "gm",
@@ -492,6 +540,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_dog_down",
       "serialize-8888",
       "gm",
@@ -500,6 +556,14 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_rose",
       "serialize-8888",
       "gm",
@@ -508,11 +572,27 @@
       "pic-8888",
       "gm",
       "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "pic-8888",
+      "gm",
+      "_",
       "async_rescale_and_read_no_bleed",
       "serialize-8888",
       "gm",
       "_",
       "async_rescale_and_read_no_bleed",
+      "pic-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
+      "serialize-8888",
+      "gm",
+      "_",
+      "blurrect_compare",
       "tiles_rt-8888",
       "gm",
       "_",
diff --git a/infra/bots/recipes/test.py b/infra/bots/recipes/test.py
index 71daa52..e1d3785 100644
--- a/infra/bots/recipes/test.py
+++ b/infra/bots/recipes/test.py
@@ -587,6 +587,10 @@
     blacklist([      'pic-8888', 'gm', '_', test])
     blacklist(['serialize-8888', 'gm', '_', test])
 
+  # GM requires canvas->makeSurface() to return a valid surface.
+    blacklist([      'pic-8888', 'gm', '_', "blurrect_compare"])
+    blacklist(['serialize-8888', 'gm', '_', "blurrect_compare"])
+
   # GM that not support tiles_rt
   for test in ['complexclip4_bw', 'complexclip4_aa']:
     blacklist([ 'tiles_rt-8888', 'gm', '_', test])
diff --git a/src/gpu/effects/GrRectBlurEffect.fp b/src/gpu/effects/GrRectBlurEffect.fp
index f357066..175fa65 100644
--- a/src/gpu/effects/GrRectBlurEffect.fp
+++ b/src/gpu/effects/GrRectBlurEffect.fp
@@ -6,6 +6,8 @@
  */
 
 @header {
+#include <cmath>
+#include "include/core/SkRect.h"
 #include "include/core/SkScalar.h"
 #include "src/core/SkBlurMask.h"
 #include "src/core/SkMathPriv.h"
@@ -21,53 +23,66 @@
 layout(when= highp) uniform float4 rectF;
 layout(when=!highp) uniform half4  rectH;
 
-in uniform sampler2D blurProfile;
-in uniform half invProfileWidth;
+// Texture that is a LUT for integral of normal distribution. The value at x (where x is a texture
+// coord between 0 and 1) is the integral from -inf to (3 * sigma * (-2 * x + 1)). I.e. x in [0, 1]
+// maps onto 3 * sigma down to -3 * sigma. The flip saves a reversal in the shader.
+in uniform sampler2D integral;
+// Used to produce normalized texture coords for lookups in 'integral'
+in uniform half invSixSigma;
+
+// There is a fast variant of the effect that does 2 texture lookups and a more general one for
+// wider blurs relative to rect sizes that does 4.
+layout(key) in bool isFast;
 
 @constructorParams {
     GrSamplerState samplerParams
 }
 
-@samplerParams(blurProfile) {
+@samplerParams(integral) {
     samplerParams
 }
 @class {
-static sk_sp<GrTextureProxy> CreateBlurProfileTexture(GrProxyProvider* proxyProvider,
-                                                      float sixSigma) {
-    // The "profile" we are calculating is the integral of a Gaussian with 'sigma' and a half
-    // plane. All such profiles are just scales of each other. So all we really care about is
-    // having enough resolution so that the linear interpolation done in texture lookup doesn't
-    // introduce noticeable artifacts. SkBlurMask::ComputeBlurProfile() produces profiles with
-    // ceil(6 * sigma) entries. We conservatively choose to have 2 texels for each dst pixel.
-    int minProfileWidth = 2 * sk_float_ceil2int(sixSigma);
-    // Bin by powers of 2 with a minimum so we get good profile reuse (remember we can just scale
-    // the texture coords to span the larger profile over a 6 sigma distance).
-    int profileWidth = SkTMax(SkNextPow2(minProfileWidth), 32);
+static sk_sp<GrTextureProxy> CreateIntegralTexture(GrProxyProvider* proxyProvider,
+                                                   float sixSigma) {
+    // The texture we're producing represents the integral of a normal distribution over a six-sigma
+    // range centered at zero. We want enough resolution so that the linear interpolation done in
+    // texture lookup doesn't introduce noticeable artifacts. We conservatively choose to have 2
+    // texels for each dst pixel.
+    int minWidth = 2 * sk_float_ceil2int(sixSigma);
+    // Bin by powers of 2 with a minimum so we get good profile reuse.
+    int width = SkTMax(SkNextPow2(minWidth), 32);
 
     static const GrUniqueKey::Domain kDomain = GrUniqueKey::GenerateDomain();
     GrUniqueKey key;
     GrUniqueKey::Builder builder(&key, kDomain, 1, "Rect Blur Mask");
-    builder[0] = profileWidth;
+    builder[0] = width;
     builder.finish();
 
-    sk_sp<GrTextureProxy> blurProfile(proxyProvider->findOrCreateProxyByUniqueKey(
+    sk_sp<GrTextureProxy> proxy(proxyProvider->findOrCreateProxyByUniqueKey(
             key, GrColorType::kAlpha_8, kTopLeft_GrSurfaceOrigin));
-    if (!blurProfile) {
+    if (!proxy) {
         SkBitmap bitmap;
-        if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(profileWidth, 1))) {
+        if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(width, 1))) {
             return nullptr;
         }
-        SkBlurMask::ComputeBlurProfile(bitmap.getAddr8(0, 0), profileWidth, profileWidth / 6.f);
+        *bitmap.getAddr8(0, 0) = 255;
+        const float invWidth = 1.f / width;
+        for (int i = 1; i < width - 1; ++i) {
+            float x = (i + 0.5f) * invWidth;
+            x = (-6 * x + 3) * SK_ScalarRoot2Over2;
+            float integral = 0.5f * (std::erf(x) + 1.f);
+            *bitmap.getAddr8(i, 0) = SkToU8(sk_float_round2int(255.f * integral));
+        }
+        *bitmap.getAddr8(width - 1, 0) = 0;
         bitmap.setImmutable();
-        blurProfile = proxyProvider->createProxyFromBitmap(bitmap, GrMipMapped::kNo);
-        if (!blurProfile) {
+        proxy = proxyProvider->createProxyFromBitmap(bitmap, GrMipMapped::kNo);
+        if (!proxy) {
             return nullptr;
         }
-        SkASSERT(blurProfile->origin() == kTopLeft_GrSurfaceOrigin);
-        proxyProvider->assignUniqueKeyToProxy(key, blurProfile.get());
+        SkASSERT(proxy->origin() == kTopLeft_GrSurfaceOrigin);
+        proxyProvider->assignUniqueKeyToProxy(key, proxy.get());
     }
-
-    return blurProfile;
+    return proxy;
 }
 }
 
@@ -75,6 +90,7 @@
      static std::unique_ptr<GrFragmentProcessor> Make(GrProxyProvider* proxyProvider,
                                                       const GrShaderCaps& caps,
                                                       const SkRect& rect, float sigma) {
+         SkASSERT(rect.isSorted());
          if (!caps.floatIs32Bits()) {
              // We promote the math that gets us into the Gaussian space to full float when the rect
              // coords are large. If we don't have full float then fail. We could probably clip the
@@ -85,55 +101,91 @@
              }
          }
 
-         // The profilee straddles the rect edges (half inside, half outside). Thus if the profile
-         // size is greater than the rect width/height then the area at the center of the rect is
-         // influenced by both edges. This is not handled by this effect.
-         float profileSize = 6 * sigma;
-         if (profileSize >= (float) rect.width() || profileSize >= (float) rect.height()) {
-             // if the blur sigma is too large so the gaussian overlaps the whole
-             // rect in either direction, fall back to CPU path for now.
+         const float sixSigma = 6 * sigma;
+         auto integral = CreateIntegralTexture(proxyProvider, sixSigma);
+         if (!integral) {
              return nullptr;
          }
 
-         auto profile = CreateBlurProfileTexture(proxyProvider, profileSize);
-         if (!profile) {
-             return nullptr;
-         }
-         // The profile is calculated such that the midpoint is at the rect's edge. To simplify
-         // calculating texture coords in the shader, we inset the rect such that the profile
-         // can be used with one end point aligned to the edges of the rect uniform. The texture
-         // coords should be scaled such that the profile is sampled over a 6 sigma range so inset
-         // by 3 sigma.
-         float halfWidth = profileSize / 2;
-         auto insetR = rect.makeInset(halfWidth, halfWidth);
-         // inverse of the width over which the profile texture should be interpolated outward from
-         // the inset rect.
-         float invWidth = 1.f / profileSize;
-         return std::unique_ptr<GrFragmentProcessor>(new GrRectBlurEffect(
-                 insetR, std::move(profile), invWidth, GrSamplerState::ClampBilerp()));
+         // In the fast variant we think of the midpoint of the integral texture as aligning
+         // with the closest rect edge both in x and y. To simplify texture coord calculation we
+         // inset the rect so that the edge of the inset rect corresponds to t = 0 in the texture.
+         // It actually simplifies things a bit in the !isFast case, too.
+         float threeSigma = sixSigma / 2;
+         SkRect insetRect = {rect.fLeft   + threeSigma,
+                             rect.fTop    + threeSigma,
+                             rect.fRight  - threeSigma,
+                             rect.fBottom - threeSigma};
+
+         // In our fast variant we find the nearest horizontal and vertical edges, do a lookup in
+         // the integral texture for each, and multiply the results. When the rect is less than
+         // 6 sigma wide then things aren't so simple and we have to consider both the left and
+         // right edge of the rectangle (and similarly top and bottom in y).
+         bool isFast = insetRect.isSorted();
+         // The integral texture covers a domain that is 6 * sigma wide. We pass the inverse of
+         // that width so the shader can turn frag coord distances into normalized texture coords.
+         float invSixSigma = 1.f / sixSigma;  // NOTE(review): assumes sigma > 0; confirm
+         return std::unique_ptr<GrFragmentProcessor>(new GrRectBlurEffect(insetRect,
+                 std::move(integral), invSixSigma, isFast, GrSamplerState::ClampBilerp()));
      }
 }
 
 void main() {
-        // Get the smaller of the signed distance from the frag coord to the left and right edges
-        // and similar for y.
-        // The blur profile computed by SkMaskFilter::ComputeBlurProfile is actually 1 - integral.
-        // The integral is an S-looking shape that is symmetric about 0, so we just  compute x and
-        // "backwards" such that texture coord is 1 at the edge and goes to 0 as we move outward.
-        half x;
-        @if (highp) {
-            x = max(half(rectF.x - sk_FragCoord.x), half(sk_FragCoord.x - rectF.z));
+        half xCoverage, yCoverage;
+        @if (isFast) {
+            // Take the signed distance from the frag coord to the nearer of the inset rect's left
+            // and right edges (positive outside the inset rect), and similarly for y.
+            // The integral texture goes "backwards" (from 3*sigma to -3*sigma), so the
+            // computations below align the left edge of the integral texture with the inset
+            // rect's edge, extending outward 6 * sigma from the inset rect.
+            half x, y;
+            @if (highp) {
+                x = max(half(rectF.x - sk_FragCoord.x), half(sk_FragCoord.x - rectF.z));
+                y = max(half(rectF.y - sk_FragCoord.y), half(sk_FragCoord.y - rectF.w));
+           } else {
+                x = max(half(rectH.x - sk_FragCoord.x), half(sk_FragCoord.x - rectH.z));
+                y = max(half(rectH.y - sk_FragCoord.y), half(sk_FragCoord.y - rectH.w));
+            }
+            xCoverage = sample(integral, half2(x * invSixSigma, 0.5)).a;
+            yCoverage = sample(integral, half2(y * invSixSigma, 0.5)).a;
+            sk_OutColor = sk_InColor * xCoverage * yCoverage;  // NOTE(review): repeated below
         } else {
-            x = max(half(rectH.x - sk_FragCoord.x), half(sk_FragCoord.x - rectH.z));
+            // We consider just the x direction here. In practice we compute x and y separately
+            // and multiply them together.
+            // We define our coord system so that the point at which we're evaluating a kernel
+            // defined by the normal distribution (K) is at 0. In this coord system let L be the
+            // left edge and R be the right edge of the rectangle.
+            // We can calculate the coverage C by integrating K over the half infinite ranges
+            // outside the L to R range and subtracting from 1:
+            //   C = 1 - <integral of K from -inf to L> - <integral of K from R to inf>
+            // K is symmetric about x=0 so:
+            //   C = 1 - <integral of K from -inf to L> - <integral of K from -inf to -R>
+
+            // The integral texture goes "backwards" (from 3*sigma to -3*sigma) which is factored
+            // in to the below calculations.
+            // Also, our rect uniform was pre-inset by 3 sigma from the actual rect being blurred,
+            // also factored in.
+            half l, r, t, b;
+            @if (highp) {
+                l = half(sk_FragCoord.x - rectF.x);
+                r = half(rectF.z - sk_FragCoord.x);
+                t = half(sk_FragCoord.y - rectF.y);
+                b = half(rectF.w - sk_FragCoord.y);
+            } else {
+                l = half(sk_FragCoord.x - rectH.x);
+                r = half(rectH.z - sk_FragCoord.x);
+                t = half(sk_FragCoord.y - rectH.y);
+                b = half(rectH.w - sk_FragCoord.y);
+            }
+            half il = 1 + l * invSixSigma;
+            half ir = 1 + r * invSixSigma;
+            half it = 1 + t * invSixSigma;
+            half ib = 1 + b * invSixSigma;
+            xCoverage = 1 - sample(integral, half2(il, 0.5)).a
+                          - sample(integral, half2(ir, 0.5)).a;
+            yCoverage = 1 - sample(integral, half2(it, 0.5)).a
+                          - sample(integral, half2(ib, 0.5)).a;
         }
-        half y;
-        @if (highp) {
-            y = max(half(rectF.y - sk_FragCoord.y), half(sk_FragCoord.y - rectF.w));
-        } else {
-            y = max(half(rectH.y - sk_FragCoord.y), half(sk_FragCoord.y - rectH.w));
-        }
-        half xCoverage = sample(blurProfile, half2(x * invProfileWidth, 0.5)).a;
-        half yCoverage = sample(blurProfile, half2(y * invProfileWidth, 0.5)).a;
         sk_OutColor = sk_InColor * xCoverage * yCoverage;
 }
 
diff --git a/src/gpu/effects/generated/GrRectBlurEffect.cpp b/src/gpu/effects/generated/GrRectBlurEffect.cpp
index a717040..186dcfe 100644
--- a/src/gpu/effects/generated/GrRectBlurEffect.cpp
+++ b/src/gpu/effects/generated/GrRectBlurEffect.cpp
@@ -25,8 +25,10 @@
         (void)_outer;
         auto rect = _outer.rect;
         (void)rect;
-        auto invProfileWidth = _outer.invProfileWidth;
-        (void)invProfileWidth;
+        auto invSixSigma = _outer.invSixSigma;
+        (void)invSixSigma;
+        auto isFast = _outer.isFast;
+        (void)isFast;
         highp = ((abs(rect.left()) > 16000.0 || abs(rect.top()) > 16000.0) ||
                  abs(rect.right()) > 16000.0) ||
                 abs(rect.bottom()) > 16000.0;
@@ -38,33 +40,65 @@
             rectHVar = args.fUniformHandler->addUniform(kFragment_GrShaderFlag, kHalf4_GrSLType,
                                                         "rectH");
         }
-        invProfileWidthVar = args.fUniformHandler->addUniform(kFragment_GrShaderFlag,
-                                                              kHalf_GrSLType, "invProfileWidth");
+        invSixSigmaVar = args.fUniformHandler->addUniform(kFragment_GrShaderFlag, kHalf_GrSLType,
+                                                          "invSixSigma");
         fragBuilder->codeAppendf(
-                "/* key */ bool highp = %s;\nhalf x;\n@if (highp) {\n    x = max(half(%s.x - "
-                "sk_FragCoord.x), half(sk_FragCoord.x - %s.z));\n} else {\n    x = "
-                "max(half(float(%s.x) - sk_FragCoord.x), half(sk_FragCoord.x - "
-                "float(%s.z)));\n}\nhalf y;\n@if (highp) {\n    y = max(half(%s.y - "
-                "sk_FragCoord.y), half(sk_FragCoord.y - %s.w));\n} else {\n    y = "
-                "max(half(float(%s.y) - sk_FragCoord.y), half(sk_FragCoord.y - "
-                "float(%s.w)));\n}\nhalf xCoverage = sample(%s, float2(half2(x * %s, "
-                "0.5))).%s.w;\nhalf yCoverage = sample(%s, flo",
-                (highp ? "true" : "false"),
+                "/* key */ bool highp = %s;\nhalf xCoverage, yCoverage;\n@if (%s) {\n    half x, "
+                "y;\n    @if (highp) {\n        x = max(half(%s.x - sk_FragCoord.x), "
+                "half(sk_FragCoord.x - %s.z));\n        y = max(half(%s.y - sk_FragCoord.y), "
+                "half(sk_FragCoord.y - %s.w));\n    } else {\n        x = max(half(float(%s.x) - "
+                "sk_FragCoord.x), half(sk_FragCoord.x - float(%s.z)));\n        y = "
+                "max(half(float(%s.y) - sk_FragCoord.y), half(sk_FragCoord.y - float(%s.w)));\n    "
+                "}\n    xCoverage = sample(%s, float2(half2(x * %s, 0.5))).",
+                (highp ? "true" : "false"), (_outer.isFast ? "true" : "false"),
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
                 rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
                 rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
                 rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
                 rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
-                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
-                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
                 rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
                 rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
                 fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
-                args.fUniformHandler->getUniformCStr(invProfileWidthVar),
-                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
-                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]));
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar));
         fragBuilder->codeAppendf(
-                "at2(half2(y * %s, 0.5))).%s.w;\n%s = (%s * xCoverage) * yCoverage;\n",
-                args.fUniformHandler->getUniformCStr(invProfileWidthVar),
+                "%s.w;\n    yCoverage = sample(%s, float2(half2(y * %s, 0.5))).%s.w;\n    %s = (%s "
+                "* xCoverage) * yCoverage;\n} else {\n    half l, r, t, b;\n    @if (highp) {\n    "
+                "    l = half(sk_FragCoord.x - %s.x);\n        r = half(%s.z - sk_FragCoord.x);\n  "
+                "      t = half(sk_FragCoord.y - %s.y);\n        b = half(%s.w - "
+                "sk_FragCoord.y);\n    } else {\n        l = half(sk_FragCoord.x - float(%s.x));\n "
+                "       r = half(float(%s.z) - sk_FragCoord.x);\n        t = half(sk_FragCoord.y - "
+                "float(%s.y));\n        b = half(float(",
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                args.fOutputColor, args.fInputColor,
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
+                rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
+                rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)");
+        fragBuilder->codeAppendf(
+                "%s.w) - sk_FragCoord.y);\n    }\n    half il = 1.0 + l * %s;\n    half ir = 1.0 + "
+                "r * %s;\n    half it = 1.0 + t * %s;\n    half ib = 1.0 + b * %s;\n    xCoverage "
+                "= (1.0 - sample(%s, float2(half2(il, 0.5))).%s.w) - sample(%s, float2(half2(ir, "
+                "0.5))).%s.w;\n    yCoverage = (1.0 - sample(%s, float2(half2(it, 0.5))).%s.w) - "
+                "sample(%s, float2(half2(ib, 0.5))).%s.w;\n}\n%s = (%s * xCoverage) * yCoverage;\n",
+                rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
                 fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
                 args.fOutputColor, args.fInputColor);
     }
@@ -73,18 +107,20 @@
     void onSetData(const GrGLSLProgramDataManager& pdman,
                    const GrFragmentProcessor& _proc) override {
         const GrRectBlurEffect& _outer = _proc.cast<GrRectBlurEffect>();
-        { pdman.set1f(invProfileWidthVar, (_outer.invProfileWidth)); }
+        { pdman.set1f(invSixSigmaVar, (_outer.invSixSigma)); }
         auto rect = _outer.rect;
         (void)rect;
         UniformHandle& rectF = rectFVar;
         (void)rectF;
         UniformHandle& rectH = rectHVar;
         (void)rectH;
-        GrSurfaceProxy& blurProfileProxy = *_outer.textureSampler(0).proxy();
-        GrTexture& blurProfile = *blurProfileProxy.peekTexture();
-        (void)blurProfile;
-        UniformHandle& invProfileWidth = invProfileWidthVar;
-        (void)invProfileWidth;
+        GrSurfaceProxy& integralProxy = *_outer.textureSampler(0).proxy();
+        GrTexture& integral = *integralProxy.peekTexture();
+        (void)integral;
+        UniformHandle& invSixSigma = invSixSigmaVar;
+        (void)invSixSigma;
+        auto isFast = _outer.isFast;
+        (void)isFast;
 
         float r[]{rect.fLeft, rect.fTop, rect.fRight, rect.fBottom};
         pdman.set4fv(highp ? rectF : rectH, 1, r);
@@ -92,7 +128,7 @@
     bool highp = false;
     UniformHandle rectFVar;
     UniformHandle rectHVar;
-    UniformHandle invProfileWidthVar;
+    UniformHandle invSixSigmaVar;
 };
 GrGLSLFragmentProcessor* GrRectBlurEffect::onCreateGLSLInstance() const {
     return new GrGLSLRectBlurEffect();
@@ -103,27 +139,30 @@
                   abs(rect.right()) > 16000.0) ||
                  abs(rect.bottom()) > 16000.0;
     b->add32((int32_t)highp);
+    b->add32((int32_t)isFast);
 }
 bool GrRectBlurEffect::onIsEqual(const GrFragmentProcessor& other) const {
     const GrRectBlurEffect& that = other.cast<GrRectBlurEffect>();
     (void)that;
     if (rect != that.rect) return false;
-    if (blurProfile != that.blurProfile) return false;
-    if (invProfileWidth != that.invProfileWidth) return false;
+    if (integral != that.integral) return false;
+    if (invSixSigma != that.invSixSigma) return false;
+    if (isFast != that.isFast) return false;
     return true;
 }
 GrRectBlurEffect::GrRectBlurEffect(const GrRectBlurEffect& src)
         : INHERITED(kGrRectBlurEffect_ClassID, src.optimizationFlags())
         , rect(src.rect)
-        , blurProfile(src.blurProfile)
-        , invProfileWidth(src.invProfileWidth) {
+        , integral(src.integral)
+        , invSixSigma(src.invSixSigma)
+        , isFast(src.isFast) {
     this->setTextureSamplerCnt(1);
 }
 std::unique_ptr<GrFragmentProcessor> GrRectBlurEffect::clone() const {
     return std::unique_ptr<GrFragmentProcessor>(new GrRectBlurEffect(*this));
 }
 const GrFragmentProcessor::TextureSampler& GrRectBlurEffect::onTextureSampler(int index) const {
-    return IthTextureSampler(index, blurProfile);
+    return IthTextureSampler(index, integral);
 }
 GR_DEFINE_FRAGMENT_PROCESSOR_TEST(GrRectBlurEffect);
 #if GR_TEST_UTILS
diff --git a/src/gpu/effects/generated/GrRectBlurEffect.h b/src/gpu/effects/generated/GrRectBlurEffect.h
index 5202d1a..b0c86bd 100644
--- a/src/gpu/effects/generated/GrRectBlurEffect.h
+++ b/src/gpu/effects/generated/GrRectBlurEffect.h
@@ -12,6 +12,8 @@
 #define GrRectBlurEffect_DEFINED
 #include "include/core/SkTypes.h"
 
+#include <cmath>
+#include "include/core/SkRect.h"
 #include "include/core/SkScalar.h"
 #include "src/core/SkBlurMask.h"
 #include "src/core/SkMathPriv.h"
@@ -22,47 +24,53 @@
 #include "src/gpu/GrFragmentProcessor.h"
 class GrRectBlurEffect : public GrFragmentProcessor {
 public:
-    static sk_sp<GrTextureProxy> CreateBlurProfileTexture(GrProxyProvider* proxyProvider,
-                                                          float sixSigma) {
-        // The "profile" we are calculating is the integral of a Gaussian with 'sigma' and a half
-        // plane. All such profiles are just scales of each other. So all we really care about is
-        // having enough resolution so that the linear interpolation done in texture lookup doesn't
-        // introduce noticeable artifacts. SkBlurMask::ComputeBlurProfile() produces profiles with
-        // ceil(6 * sigma) entries. We conservatively choose to have 2 texels for each dst pixel.
-        int minProfileWidth = 2 * sk_float_ceil2int(sixSigma);
-        // Bin by powers of 2 with a minimum so we get good profile reuse (remember we can just
-        // scale the texture coords to span the larger profile over a 6 sigma distance).
-        int profileWidth = SkTMax(SkNextPow2(minProfileWidth), 32);
+    static sk_sp<GrTextureProxy> CreateIntegralTexture(GrProxyProvider* proxyProvider,
+                                                       float sixSigma) {
+        // The texture we're producing represents the integral of a normal distribution over a
+        // six-sigma range centered at zero. We want enough resolution so that the linear
+        // interpolation done in texture lookup doesn't introduce noticeable artifacts. We
+        // conservatively choose to have 2 texels for each dst pixel.
+        int minWidth = 2 * sk_float_ceil2int(sixSigma);
+        // Bin by powers of 2 with a minimum so textures are reused (the width is the unique key).
+        int width = SkTMax(SkNextPow2(minWidth), 32);
 
         static const GrUniqueKey::Domain kDomain = GrUniqueKey::GenerateDomain();
         GrUniqueKey key;
         GrUniqueKey::Builder builder(&key, kDomain, 1, "Rect Blur Mask");
-        builder[0] = profileWidth;
+        builder[0] = width;
         builder.finish();
 
-        sk_sp<GrTextureProxy> blurProfile(proxyProvider->findOrCreateProxyByUniqueKey(
+        sk_sp<GrTextureProxy> proxy(proxyProvider->findOrCreateProxyByUniqueKey(
                 key, GrColorType::kAlpha_8, kTopLeft_GrSurfaceOrigin));
-        if (!blurProfile) {
+        if (!proxy) {
             SkBitmap bitmap;
-            if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(profileWidth, 1))) {
+            if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(width, 1))) {
                 return nullptr;
             }
-            SkBlurMask::ComputeBlurProfile(bitmap.getAddr8(0, 0), profileWidth, profileWidth / 6.f);
+            *bitmap.getAddr8(0, 0) = 255;  // Pinned: clamped lookups at t <= 0 read exactly 1.
+            const float invWidth = 1.f / width;
+            for (int i = 1; i < width - 1; ++i) {
+                float x = (i + 0.5f) * invWidth;  // texel center in (0, 1)
+                x = (-6 * x + 3) * SK_ScalarRoot2Over2;  // [0,1] -> [3,-3], scaled for erf
+                float integral = 0.5f * (std::erf(x) + 1.f);  // standard normal CDF
+                *bitmap.getAddr8(i, 0) = SkToU8(sk_float_round2int(255.f * integral));
+            }
+            *bitmap.getAddr8(width - 1, 0) = 0;  // Pinned: clamped lookups at t >= 1 read exactly 0.
             bitmap.setImmutable();
-            blurProfile = proxyProvider->createProxyFromBitmap(bitmap, GrMipMapped::kNo);
-            if (!blurProfile) {
+            proxy = proxyProvider->createProxyFromBitmap(bitmap, GrMipMapped::kNo);
+            if (!proxy) {
                 return nullptr;
             }
-            SkASSERT(blurProfile->origin() == kTopLeft_GrSurfaceOrigin);
-            proxyProvider->assignUniqueKeyToProxy(key, blurProfile.get());
+            SkASSERT(proxy->origin() == kTopLeft_GrSurfaceOrigin);
+            proxyProvider->assignUniqueKeyToProxy(key, proxy.get());
         }
-
-        return blurProfile;
+        return proxy;
     }
 
     static std::unique_ptr<GrFragmentProcessor> Make(GrProxyProvider* proxyProvider,
                                                      const GrShaderCaps& caps, const SkRect& rect,
                                                      float sigma) {
+        SkASSERT(rect.isSorted());
         if (!caps.floatIs32Bits()) {
             // We promote the math that gets us into the Gaussian space to full float when the rect
             // coords are large. If we don't have full float then fail. We could probably clip the
@@ -73,48 +81,49 @@
             }
         }
 
-        // The profilee straddles the rect edges (half inside, half outside). Thus if the profile
-        // size is greater than the rect width/height then the area at the center of the rect is
-        // influenced by both edges. This is not handled by this effect.
-        float profileSize = 6 * sigma;
-        if (profileSize >= (float)rect.width() || profileSize >= (float)rect.height()) {
-            // if the blur sigma is too large so the gaussian overlaps the whole
-            // rect in either direction, fall back to CPU path for now.
+        const float sixSigma = 6 * sigma;
+        auto integral = CreateIntegralTexture(proxyProvider, sixSigma);
+        if (!integral) {
             return nullptr;
         }
 
-        auto profile = CreateBlurProfileTexture(proxyProvider, profileSize);
-        if (!profile) {
-            return nullptr;
-        }
-        // The profile is calculated such that the midpoint is at the rect's edge. To simplify
-        // calculating texture coords in the shader, we inset the rect such that the profile
-        // can be used with one end point aligned to the edges of the rect uniform. The texture
-        // coords should be scaled such that the profile is sampled over a 6 sigma range so inset
-        // by 3 sigma.
-        float halfWidth = profileSize / 2;
-        auto insetR = rect.makeInset(halfWidth, halfWidth);
-        // inverse of the width over which the profile texture should be interpolated outward from
-        // the inset rect.
-        float invWidth = 1.f / profileSize;
-        return std::unique_ptr<GrFragmentProcessor>(new GrRectBlurEffect(
-                insetR, std::move(profile), invWidth, GrSamplerState::ClampBilerp()));
+        // In the fast variant we think of the midpoint of the integral texture as aligning
+        // with the closest rect edge both in x and y. To simplify texture coord calculation we
+        // inset the rect so that the edge of the inset rect corresponds to t = 0 in the texture.
+        // It actually simplifies things a bit in the !isFast case, too.
+        float threeSigma = sixSigma / 2;
+        SkRect insetRect = {rect.fLeft + threeSigma, rect.fTop + threeSigma,
+                            rect.fRight - threeSigma, rect.fBottom - threeSigma};
+
+        // In our fast variant we find the nearest horizontal and vertical edges, do a lookup in
+        // the integral texture for each, and multiply the results. When the rect is less than
+        // 6 sigma wide then things aren't so simple and we have to consider both the left and
+        // right edge of the rectangle (and similarly top and bottom in y).
+        bool isFast = insetRect.isSorted();
+        // The integral texture covers a domain that is 6 * sigma wide. We pass the inverse of
+        // that width so the shader can turn frag coord distances into normalized texture coords.
+        float invSixSigma = 1.f / sixSigma;  // NOTE(review): assumes sigma > 0; confirm
+        return std::unique_ptr<GrFragmentProcessor>(
+                new GrRectBlurEffect(insetRect, std::move(integral), invSixSigma, isFast,
+                                     GrSamplerState::ClampBilerp()));
     }
     GrRectBlurEffect(const GrRectBlurEffect& src);
     std::unique_ptr<GrFragmentProcessor> clone() const override;
     const char* name() const override { return "RectBlurEffect"; }
     SkRect rect;
-    TextureSampler blurProfile;
-    float invProfileWidth;
+    TextureSampler integral;
+    float invSixSigma;
+    bool isFast;
 
 private:
-    GrRectBlurEffect(SkRect rect, sk_sp<GrTextureProxy> blurProfile, float invProfileWidth,
+    GrRectBlurEffect(SkRect rect, sk_sp<GrTextureProxy> integral, float invSixSigma, bool isFast,
                      GrSamplerState samplerParams)
             : INHERITED(kGrRectBlurEffect_ClassID,
                         (OptimizationFlags)kCompatibleWithCoverageAsAlpha_OptimizationFlag)
             , rect(rect)
-            , blurProfile(std::move(blurProfile), samplerParams)
-            , invProfileWidth(invProfileWidth) {
+            , integral(std::move(integral), samplerParams)
+            , invSixSigma(invSixSigma)
+            , isFast(isFast) {
         this->setTextureSamplerCnt(1);
     }
     GrGLSLFragmentProcessor* onCreateGLSLInstance() const override;
diff --git a/tools/timer/TimeUtils.h b/tools/timer/TimeUtils.h
index 244dfc2..5638f83 100644
--- a/tools/timer/TimeUtils.h
+++ b/tools/timer/TimeUtils.h
@@ -6,6 +6,7 @@
 #define TimeUtils_DEFINED
 
 #include "include/core/SkTypes.h"
+#include "include/private/SkFloatingPoint.h"
 
 #include <cmath>
 
@@ -43,5 +44,19 @@
         double diff  = ::fabs(value - half);
         return (float)(ends + (1.0 - diff / half) * (mid - ends));
     }
+
+    static inline float SineWave(double time,         // nanoseconds (fed to NanosToSeconds)
+                                 float periodInSecs,  // length of one cycle; <= 0 yields midpoint
+                                 float phaseInSecs,   // offset added to the time, in seconds
+                                 float min,           // low end of the output range
+                                 float max) {         // high end of the output range
+        if (periodInSecs <= 0.f) {  // zero would divide by zero below and yield NaN
+            return (min + max) / 2.f;
+        }
+        double t = NanosToSeconds(time) + phaseInSecs;
+        t *= 2 * SK_FloatPI / periodInSecs;  // seconds -> radians
+        float halfAmplitude = (max - min) / 2.f;
+        return halfAmplitude * std::sin(t) + halfAmplitude + min;  // maps [-1, 1] onto [min, max]
+    }
 }  // namespace TimeUtils
 #endif