Revert "Simplify quickReject implementation in SkCanvas"
This reverts commit 0a0f4f5c35520cb7ca5b065d3813e30dae124a77.
Reason for revert: possible cause of flutter roll failure
Original change's description:
> Simplify quickReject implementation in SkCanvas
>
> - SkCanvas no longer keeps fIsScaleTranslate bool that has to stay in
> sync with the type of the matrix.
> - No more fast or slow path for quickReject, the Sk4f code has been
> completely removed.
> - Uses SkM44::mapRect instead of SkMatrix::mapRect. This is slightly
> slower for S+T, but much faster for other transforms. I'm hopeful we
> won't notice the regression in the grand scheme for S+T, since the
> code is a lot simpler now.
> - The final isFinite() and intersects() check for quickReject uses
> SkRect's functions instead of hand-written SSE/NEON. If we think this
> is optimization is necessary, I'm hoping we can rewrite it in terms
> of skvx instead of specific instructions.
> - Consolidated how the quick-reject bounds outsetting into
> computeDeviceClipBounds, and added an option to skip outsetting for
> the one call site that doesn't want it.
>
> Bug: skia:10987
> Change-Id: I3cf2a73636cdeed06d12cab4548cfb94d1eb074a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/405198
> Commit-Queue: Mike Reed <reed@google.com>
> Auto-Submit: Michael Ludwig <michaelludwig@google.com>
> Reviewed-by: Mike Reed <reed@google.com>
TBR=mtklein@google.com,reed@google.com,michaelludwig@google.com
Change-Id: I1a373740ee167827b9a6a2eee9afb7f814641fb0
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: skia:10987
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/405616
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Greg Daniel <egdaniel@google.com>
diff --git a/include/core/SkCanvas.h b/include/core/SkCanvas.h
index cb6ca55..2bfc2d0 100644
--- a/include/core/SkCanvas.h
+++ b/include/core/SkCanvas.h
@@ -2423,13 +2423,16 @@
virtual SkPaintFilterCanvas* internal_private_asPaintFilterCanvas() const { return nullptr; }
- // Keep track of the device clip bounds in the canvas' global space to reject draws before
- // invoking the top-level device.
+ /**
+ * Keep track of the device clip bounds and if the matrix is scale-translate. This allows
+ * us to do a fast quick reject in the common case.
+ */
+ bool fIsScaleTranslate;
SkRect fQuickRejectBounds;
// Compute the clip's bounds based on all clipped SkDevice's reported device bounds transformed
// into the canvas' global space.
- SkRect computeDeviceClipBounds(bool outsetForAA=true) const;
+ SkRect computeDeviceClipBounds() const;
class AutoUpdateQRBounds;
void validateClip() const;
diff --git a/src/core/SkCanvas.cpp b/src/core/SkCanvas.cpp
index cfbcf14..aec582a 100644
--- a/src/core/SkCanvas.cpp
+++ b/src/core/SkCanvas.cpp
@@ -18,6 +18,7 @@
#include "include/core/SkTextBlob.h"
#include "include/core/SkVertices.h"
#include "include/effects/SkRuntimeEffect.h"
+#include "include/private/SkNx.h"
#include "include/private/SkTo.h"
#include "include/utils/SkNoDrawCanvas.h"
#include "src/core/SkArenaAlloc.h"
@@ -270,6 +271,18 @@
}
};
+static inline SkRect qr_clip_bounds(const SkRect& bounds) {
+ if (bounds.isEmpty()) {
+ return SkRect::MakeEmpty();
+ }
+
+ // Expand bounds out by 1 in case we are anti-aliasing. We store the
+ // bounds as floats to enable a faster quick reject implementation.
+ SkRect dst;
+ (Sk4f::Load(&bounds.fLeft) + Sk4f(-1.f, -1.f, 1.f, 1.f)).store(&dst.fLeft);
+ return dst;
+}
+
class SkCanvas::AutoUpdateQRBounds {
public:
explicit AutoUpdateQRBounds(SkCanvas* canvas) : fCanvas(canvas) {
@@ -278,7 +291,7 @@
fCanvas->validateClip();
}
~AutoUpdateQRBounds() {
- fCanvas->fQuickRejectBounds = fCanvas->computeDeviceClipBounds();
+ fCanvas->fQuickRejectBounds = qr_clip_bounds(fCanvas->computeDeviceClipBounds());
// post-condition, we should remain valid after re-computing the bounds
fCanvas->validateClip();
}
@@ -409,7 +422,8 @@
SkASSERT(fBaseDevice->isNoPixelsDevice());
static_cast<SkNoPixelsDevice*>(fBaseDevice.get())->resetForNextPicture(bounds);
fMCRec->reset(fBaseDevice.get());
- fQuickRejectBounds = this->computeDeviceClipBounds();
+ fQuickRejectBounds = qr_clip_bounds(this->computeDeviceClipBounds());
+ fIsScaleTranslate = true;
}
void SkCanvas::init(sk_sp<SkBaseDevice> device) {
@@ -435,9 +449,10 @@
device->setMarkerStack(fMarkerStack.get());
fSurfaceBase = nullptr;
+ fIsScaleTranslate = true;
fBaseDevice = std::move(device);
fScratchGlyphRunBuilder = std::make_unique<SkGlyphRunBuilder>();
- fQuickRejectBounds = this->computeDeviceClipBounds();
+ fQuickRejectBounds = qr_clip_bounds(this->computeDeviceClipBounds());
}
SkCanvas::SkCanvas()
@@ -1057,7 +1072,7 @@
fMCRec->newLayer(std::move(newDevice), paint, stashedMatrix);
- fQuickRejectBounds = this->computeDeviceClipBounds();
+ fQuickRejectBounds = qr_clip_bounds(this->computeDeviceClipBounds());
}
int SkCanvas::saveLayerAlpha(const SkRect* bounds, U8CPU alpha) {
@@ -1153,9 +1168,10 @@
this->internalSetMatrix(SkM44(layer->fStashedMatrix));
}
+ fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix);
// Update the quick-reject bounds in case the restore changed the top device or the
// removed save record had included modifications to the clip stack.
- fQuickRejectBounds = this->computeDeviceClipBounds();
+ fQuickRejectBounds = qr_clip_bounds(this->computeDeviceClipBounds());
this->validateClip();
}
@@ -1291,6 +1307,11 @@
this->checkForDeferredSave();
fMCRec->fMatrix.preTranslate(dx, dy);
+ // Translate shouldn't affect the is-scale-translateness of the matrix.
+ // However, if either is non-finite, we might still complicate the matrix type,
+ // so we still have to compute this.
+ fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix);
+
this->topDevice()->setGlobalCTM(fMCRec->fMatrix);
this->didTranslate(dx,dy);
@@ -1302,6 +1323,10 @@
this->checkForDeferredSave();
fMCRec->fMatrix.preScale(sx, sy);
+ // shouldn't need to do this (theoretically), as the state shouldn't have changed,
+ // but pre-scaling by a non-finite does change it, so we have to recompute.
+ fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix);
+
this->topDevice()->setGlobalCTM(fMCRec->fMatrix);
this->didScale(sx, sy);
@@ -1338,6 +1363,8 @@
fMCRec->fMatrix.preConcat(m);
+ fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix);
+
this->topDevice()->setGlobalCTM(fMCRec->fMatrix);
}
@@ -1349,6 +1376,7 @@
void SkCanvas::internalSetMatrix(const SkM44& m) {
fMCRec->fMatrix = m;
+ fIsScaleTranslate = SkMatrixPriv::IsScaleTranslateAsM33(m);
this->topDevice()->setGlobalCTM(fMCRec->fMatrix);
}
@@ -1501,7 +1529,7 @@
void SkCanvas::validateClip() const {
#ifdef SK_DEBUG
- SkRect tmp = this->computeDeviceClipBounds();
+ SkRect tmp = qr_clip_bounds(this->computeDeviceClipBounds());
if (this->isClipEmpty()) {
SkASSERT(fQuickRejectBounds.isEmpty());
} else {
@@ -1536,14 +1564,68 @@
return this->topDevice()->onGetClipType() == SkBaseDevice::ClipType::kRect;
}
+static inline bool is_nan_or_clipped(const Sk4f& devRect, const Sk4f& devClip) {
+#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+ __m128 lLtT = _mm_unpacklo_ps(devRect.fVec, devClip.fVec);
+ __m128 RrBb = _mm_unpackhi_ps(devClip.fVec, devRect.fVec);
+ __m128 mask = _mm_cmplt_ps(lLtT, RrBb);
+ return 0xF != _mm_movemask_ps(mask);
+#elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON)
+ float32x4_t lLtT = vzipq_f32(devRect.fVec, devClip.fVec).val[0];
+ float32x4_t RrBb = vzipq_f32(devClip.fVec, devRect.fVec).val[1];
+ uint32x4_t mask = vcltq_f32(lLtT, RrBb);
+ return 0xFFFFFFFFFFFFFFFF != (uint64_t) vmovn_u32(mask);
+#else
+ SkRect devRectAsRect;
+ SkRect devClipAsRect;
+ devRect.store(&devRectAsRect.fLeft);
+ devClip.store(&devClipAsRect.fLeft);
+ return !devRectAsRect.isFinite() || !devRectAsRect.intersect(devClipAsRect);
+#endif
+}
+
+// It's important for this function to not be inlined. Otherwise the compiler will share code
+// between the fast path and the slow path, resulting in two slow paths.
+static SK_NEVER_INLINE bool quick_reject_slow_path(const SkRect& src, const SkRect& deviceClip,
+ const SkMatrix& matrix) {
+ SkRect deviceRect;
+ matrix.mapRect(&deviceRect, src);
+ return !deviceRect.isFinite() || !deviceRect.intersect(deviceClip);
+}
+
bool SkCanvas::quickReject(const SkRect& src) const {
#ifdef SK_DEBUG
// Verify that fQuickRejectBounds are set properly.
this->validateClip();
+ // Verify that fIsScaleTranslate is set properly.
+ SkASSERT(fIsScaleTranslate == SkMatrixPriv::IsScaleTranslateAsM33(fMCRec->fMatrix));
#endif
- SkRect devRect = SkMatrixPriv::MapRect(fMCRec->fMatrix, src);
- return !devRect.isFinite() || !devRect.intersects(fQuickRejectBounds);
+ if (!fIsScaleTranslate) {
+ return quick_reject_slow_path(src, fQuickRejectBounds, fMCRec->fMatrix.asM33());
+ }
+
+ // We inline the implementation of mapScaleTranslate() for the fast path.
+ float sx = fMCRec->fMatrix.rc(0, 0);
+ float sy = fMCRec->fMatrix.rc(1, 1);
+ float tx = fMCRec->fMatrix.rc(0, 3);
+ float ty = fMCRec->fMatrix.rc(1, 3);
+ Sk4f scale(sx, sy, sx, sy);
+ Sk4f trans(tx, ty, tx, ty);
+
+ // Apply matrix.
+ Sk4f ltrb = Sk4f::Load(&src.fLeft) * scale + trans;
+
+ // Make sure left < right, top < bottom.
+ Sk4f rblt(ltrb[2], ltrb[3], ltrb[0], ltrb[1]);
+ Sk4f min = Sk4f::Min(ltrb, rblt);
+ Sk4f max = Sk4f::Max(ltrb, rblt);
+ // We can extract either pair [0,1] or [2,3] from min and max and be correct, but on
+ // ARM this sequence generates the fastest (a single instruction).
+ Sk4f devRect = Sk4f(min[2], min[3], max[0], max[1]);
+
+ // Check if the device rect is NaN or outside the clip.
+ return is_nan_or_clipped(devRect, Sk4f::Load(&fQuickRejectBounds.fLeft));
}
bool SkCanvas::quickReject(const SkPath& path) const {
@@ -1587,22 +1669,16 @@
}
SkIRect SkCanvas::getDeviceClipBounds() const {
- return this->computeDeviceClipBounds(/*outsetForAA=*/false).roundOut();
+ return this->computeDeviceClipBounds().roundOut();
}
-SkRect SkCanvas::computeDeviceClipBounds(bool outsetForAA) const {
+SkRect SkCanvas::computeDeviceClipBounds() const {
const SkBaseDevice* dev = this->topDevice();
if (dev->onGetClipType() == SkBaseDevice::ClipType::kEmpty) {
return SkRect::MakeEmpty();
} else {
- SkRect devClipBounds = SkRect::Make(dev->devClipBounds());
- dev->deviceToGlobal().mapRect(&devClipBounds);
- if (outsetForAA) {
- // Expand bounds out by 1 in case we are anti-aliasing. We store the
- // bounds as floats to enable a faster quick reject implementation.
- devClipBounds.outset(1.f, 1.f);
- }
- return devClipBounds;
+ SkIRect devClipBounds = dev->devClipBounds();
+ return dev->deviceToGlobal().mapRect(SkRect::Make(devClipBounds));
}
}