Revert "Simplify quickReject implementation in SkCanvas"

This reverts commit 0a0f4f5c35520cb7ca5b065d3813e30dae124a77.

Reason for revert: possible cause of flutter roll failure

Original change's description:
> Simplify quickReject implementation in SkCanvas
>
>  - SkCanvas no longer keeps fIsScaleTranslate bool that has to stay in
>    sync with the type of the matrix.
>  - No more fast or slow path for quickReject, the Sk4f code has been
>    completely removed.
>  - Uses SkM44::mapRect instead of SkMatrix::mapRect. This is slightly
>    slower for S+T, but much faster for other transforms. I'm hopeful we
>    won't notice the regression in the grand scheme for S+T, since the
>    code is a lot simpler now.
>  - The final isFinite() and intersects() check for quickReject uses
>    SkRect's functions instead of hand-written SSE/NEON. If we think this
>    is optimization is necessary, I'm hoping we can rewrite it in terms
>    of skvx instead of specific instructions.
>  - Consolidated how the quick-reject bounds outsetting into
>    computeDeviceClipBounds, and added an option to skip outsetting for
>    the one call site that doesn't want it.
>
> Bug: skia:10987
> Change-Id: I3cf2a73636cdeed06d12cab4548cfb94d1eb074a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/405198
> Commit-Queue: Mike Reed <reed@google.com>
> Auto-Submit: Michael Ludwig <michaelludwig@google.com>
> Reviewed-by: Mike Reed <reed@google.com>

TBR=mtklein@google.com,reed@google.com,michaelludwig@google.com

Change-Id: I1a373740ee167827b9a6a2eee9afb7f814641fb0
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: skia:10987
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/405616
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Greg Daniel <egdaniel@google.com>
diff --git a/include/core/SkCanvas.h b/include/core/SkCanvas.h
index cb6ca55..2bfc2d0 100644
--- a/include/core/SkCanvas.h
+++ b/include/core/SkCanvas.h
@@ -2423,13 +2423,16 @@
 
     virtual SkPaintFilterCanvas* internal_private_asPaintFilterCanvas() const { return nullptr; }
 
-    // Keep track of the device clip bounds in the canvas' global space to reject draws before
-    // invoking the top-level device.
+    /**
+     *  Keep track of the device clip bounds and if the matrix is scale-translate.  This allows
+     *  us to do a fast quick reject in the common case.
+     */
+    bool   fIsScaleTranslate;
     SkRect fQuickRejectBounds;
 
     // Compute the clip's bounds based on all clipped SkDevice's reported device bounds transformed
     // into the canvas' global space.
-    SkRect computeDeviceClipBounds(bool outsetForAA=true) const;
+    SkRect computeDeviceClipBounds() const;
 
     class AutoUpdateQRBounds;
     void validateClip() const;
diff --git a/src/core/SkCanvas.cpp b/src/core/SkCanvas.cpp
index cfbcf14..aec582a 100644
--- a/src/core/SkCanvas.cpp
+++ b/src/core/SkCanvas.cpp
@@ -18,6 +18,7 @@
 #include "include/core/SkTextBlob.h"
 #include "include/core/SkVertices.h"
 #include "include/effects/SkRuntimeEffect.h"
+#include "include/private/SkNx.h"
 #include "include/private/SkTo.h"
 #include "include/utils/SkNoDrawCanvas.h"
 #include "src/core/SkArenaAlloc.h"
@@ -270,6 +271,18 @@
     }
 };
 
+static inline SkRect qr_clip_bounds(const SkRect& bounds) {
+    if (bounds.isEmpty()) {
+        return SkRect::MakeEmpty();
+    }
+
+    // Expand bounds out by 1 in case we are anti-aliasing.  We store the
+    // bounds as floats to enable a faster quick reject implementation.
+    SkRect dst;
+    (Sk4f::Load(&bounds.fLeft) + Sk4f(-1.f, -1.f, 1.f, 1.f)).store(&dst.fLeft);
+    return dst;
+}
+
 class SkCanvas::AutoUpdateQRBounds {
 public:
     explicit AutoUpdateQRBounds(SkCanvas* canvas) : fCanvas(canvas) {
@@ -278,7 +291,7 @@
         fCanvas->validateClip();
     }
     ~AutoUpdateQRBounds() {
-        fCanvas->fQuickRejectBounds = fCanvas->computeDeviceClipBounds();
+        fCanvas->fQuickRejectBounds = qr_clip_bounds(fCanvas->computeDeviceClipBounds());
         // post-condition, we should remain valid after re-computing the bounds
         fCanvas->validateClip();
     }
@@ -409,7 +422,8 @@
     SkASSERT(fBaseDevice->isNoPixelsDevice());
     static_cast<SkNoPixelsDevice*>(fBaseDevice.get())->resetForNextPicture(bounds);
     fMCRec->reset(fBaseDevice.get());
-    fQuickRejectBounds = this->computeDeviceClipBounds();
+    fQuickRejectBounds = qr_clip_bounds(this->computeDeviceClipBounds());
+    fIsScaleTranslate = true;
 }
 
 void SkCanvas::init(sk_sp<SkBaseDevice> device) {
@@ -435,9 +449,10 @@
     device->setMarkerStack(fMarkerStack.get());
 
     fSurfaceBase = nullptr;
+    fIsScaleTranslate = true;
     fBaseDevice = std::move(device);
     fScratchGlyphRunBuilder = std::make_unique<SkGlyphRunBuilder>();
-    fQuickRejectBounds = this->computeDeviceClipBounds();
+    fQuickRejectBounds = qr_clip_bounds(this->computeDeviceClipBounds());
 }
 
 SkCanvas::SkCanvas()
@@ -1057,7 +1072,7 @@
 
     fMCRec->newLayer(std::move(newDevice), paint, stashedMatrix);
 
-    fQuickRejectBounds = this->computeDeviceClipBounds();
+    fQuickRejectBounds = qr_clip_bounds(this->computeDeviceClipBounds());
 }
 
 int SkCanvas::saveLayerAlpha(const SkRect* bounds, U8CPU alpha) {
@@ -1153,9 +1168,10 @@
         this->internalSetMatrix(SkM44(layer->fStashedMatrix));
     }
 
+    fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix);
     // Update the quick-reject bounds in case the restore changed the top device or the
     // removed save record had included modifications to the clip stack.
-    fQuickRejectBounds = this->computeDeviceClipBounds();
+    fQuickRejectBounds = qr_clip_bounds(this->computeDeviceClipBounds());
     this->validateClip();
 }
 
@@ -1291,6 +1307,11 @@
         this->checkForDeferredSave();
         fMCRec->fMatrix.preTranslate(dx, dy);
 
+        // Translate shouldn't affect the is-scale-translateness of the matrix.
+        // However, if either is non-finite, we might still complicate the matrix type,
+        // so we still have to compute this.
+        fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix);
+
         this->topDevice()->setGlobalCTM(fMCRec->fMatrix);
 
         this->didTranslate(dx,dy);
@@ -1302,6 +1323,10 @@
         this->checkForDeferredSave();
         fMCRec->fMatrix.preScale(sx, sy);
 
+        // shouldn't need to do this (theoretically), as the state shouldn't have changed,
+        // but pre-scaling by a non-finite does change it, so we have to recompute.
+        fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix);
+
         this->topDevice()->setGlobalCTM(fMCRec->fMatrix);
 
         this->didScale(sx, sy);
@@ -1338,6 +1363,8 @@
 
     fMCRec->fMatrix.preConcat(m);
 
+    fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix);
+
     this->topDevice()->setGlobalCTM(fMCRec->fMatrix);
 }
 
@@ -1349,6 +1376,7 @@
 
 void SkCanvas::internalSetMatrix(const SkM44& m) {
     fMCRec->fMatrix = m;
+    fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(m);
 
     this->topDevice()->setGlobalCTM(fMCRec->fMatrix);
 }
@@ -1501,7 +1529,7 @@
 
 void SkCanvas::validateClip() const {
 #ifdef SK_DEBUG
-    SkRect tmp = this->computeDeviceClipBounds();
+    SkRect tmp = qr_clip_bounds(this->computeDeviceClipBounds());
     if (this->isClipEmpty()) {
         SkASSERT(fQuickRejectBounds.isEmpty());
     } else {
@@ -1536,14 +1564,68 @@
     return this->topDevice()->onGetClipType() == SkBaseDevice::ClipType::kRect;
 }
 
+static inline bool is_nan_or_clipped(const Sk4f& devRect, const Sk4f& devClip) {
+#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    __m128 lLtT = _mm_unpacklo_ps(devRect.fVec, devClip.fVec);
+    __m128 RrBb = _mm_unpackhi_ps(devClip.fVec, devRect.fVec);
+    __m128 mask = _mm_cmplt_ps(lLtT, RrBb);
+    return 0xF != _mm_movemask_ps(mask);
+#elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON)
+    float32x4_t lLtT = vzipq_f32(devRect.fVec, devClip.fVec).val[0];
+    float32x4_t RrBb = vzipq_f32(devClip.fVec, devRect.fVec).val[1];
+    uint32x4_t mask = vcltq_f32(lLtT, RrBb);
+    return 0xFFFFFFFFFFFFFFFF != (uint64_t) vmovn_u32(mask);
+#else
+    SkRect devRectAsRect;
+    SkRect devClipAsRect;
+    devRect.store(&devRectAsRect.fLeft);
+    devClip.store(&devClipAsRect.fLeft);
+    return !devRectAsRect.isFinite() || !devRectAsRect.intersect(devClipAsRect);
+#endif
+}
+
+// It's important for this function to not be inlined.  Otherwise the compiler will share code
+// between the fast path and the slow path, resulting in two slow paths.
+static SK_NEVER_INLINE bool quick_reject_slow_path(const SkRect& src, const SkRect& deviceClip,
+                                                   const SkMatrix& matrix) {
+    SkRect deviceRect;
+    matrix.mapRect(&deviceRect, src);
+    return !deviceRect.isFinite() || !deviceRect.intersect(deviceClip);
+}
+
 bool SkCanvas::quickReject(const SkRect& src) const {
 #ifdef SK_DEBUG
     // Verify that fQuickRejectBounds are set properly.
     this->validateClip();
+    // Verify that fIsScaleTranslate is set properly.
+    SkASSERT(fIsScaleTranslate == SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix));
 #endif
 
-    SkRect devRect = SkMatrixPriv::MapRect(fMCRec->fMatrix, src);
-    return !devRect.isFinite() || !devRect.intersects(fQuickRejectBounds);
+    if (!fIsScaleTranslate) {
+        return quick_reject_slow_path(src, fQuickRejectBounds, fMCRec->fMatrix.asM33());
+    }
+
+    // We inline the implementation of mapScaleTranslate() for the fast path.
+    float sx = fMCRec->fMatrix.rc(0, 0);
+    float sy = fMCRec->fMatrix.rc(1, 1);
+    float tx = fMCRec->fMatrix.rc(0, 3);
+    float ty = fMCRec->fMatrix.rc(1, 3);
+    Sk4f scale(sx, sy, sx, sy);
+    Sk4f trans(tx, ty, tx, ty);
+
+    // Apply matrix.
+    Sk4f ltrb = Sk4f::Load(&src.fLeft) * scale + trans;
+
+    // Make sure left < right, top < bottom.
+    Sk4f rblt(ltrb[2], ltrb[3], ltrb[0], ltrb[1]);
+    Sk4f min = Sk4f::Min(ltrb, rblt);
+    Sk4f max = Sk4f::Max(ltrb, rblt);
+    // We can extract either pair [0,1] or [2,3] from min and max and be correct, but on
+    // ARM this sequence generates the fastest (a single instruction).
+    Sk4f devRect = Sk4f(min[2], min[3], max[0], max[1]);
+
+    // Check if the device rect is NaN or outside the clip.
+    return is_nan_or_clipped(devRect, Sk4f::Load(&fQuickRejectBounds.fLeft));
 }
 
 bool SkCanvas::quickReject(const SkPath& path) const {
@@ -1587,22 +1669,16 @@
 }
 
 SkIRect SkCanvas::getDeviceClipBounds() const {
-    return this->computeDeviceClipBounds(/*outsetForAA=*/false).roundOut();
+    return this->computeDeviceClipBounds().roundOut();
 }
 
-SkRect SkCanvas::computeDeviceClipBounds(bool outsetForAA) const {
+SkRect SkCanvas::computeDeviceClipBounds() const {
     const SkBaseDevice* dev = this->topDevice();
     if (dev->onGetClipType() == SkBaseDevice::ClipType::kEmpty) {
         return SkRect::MakeEmpty();
     } else {
-        SkRect devClipBounds = SkRect::Make(dev->devClipBounds());
-        dev->deviceToGlobal().mapRect(&devClipBounds);
-        if (outsetForAA) {
-            // Expand bounds out by 1 in case we are anti-aliasing.  We store the
-            // bounds as floats to enable a faster quick reject implementation.
-            devClipBounds.outset(1.f, 1.f);
-        }
-        return devClipBounds;
+        SkIRect devClipBounds = dev->devClipBounds();
+        return dev->deviceToGlobal().mapRect(SkRect::Make(devClipBounds));
     }
 }