Add GrRenderTask and GrOp prePrepare framework

We want to be able to pull more work forward to DDL creation time. The prePrepare entry points give GrRenderTask and GrOp a hook for this CPU-side preprocessing.
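
In outline, the new plumbing looks like this (a condensed sketch
assembled from the hunks below, not a verbatim excerpt):

    // When a DDL is created, GrDrawingManager walks its tasks:
    for (auto renderTask : ddl->fRenderTasks) {
        renderTask->prePrepare();       // -> GrRenderTask::onPrePrepare
    }

    // GrOpsTask::onPrePrepare forwards to each live op chain:
    //     chain.head()->prePrepare();  // -> GrOp::onPrePrepare
    //
    // GrMeshDrawOp routes onPrePrepare to onPrePrepareDraws, which
    // GrTextureOp overrides (for now it only sets fPrePrepared).

Since prePrepare is optional, every override must also work when it
has not been called (see the note in GrOp.h).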

Change-Id: I2c0c7978dbf7d7c335425ea81ab2323cb9dcdbc3
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/247048
Commit-Queue: Robert Phillips <robertphillips@google.com>
Reviewed-by: Greg Daniel <egdaniel@google.com>
diff --git a/src/gpu/GrCopyRenderTask.h b/src/gpu/GrCopyRenderTask.h
index 8093e69..640bb11 100644
--- a/src/gpu/GrCopyRenderTask.h
+++ b/src/gpu/GrCopyRenderTask.h
@@ -24,7 +24,6 @@
                      sk_sp<GrSurfaceProxy> dstProxy,
                      const SkIPoint& dstPoint);
 
-    void onPrepare(GrOpFlushState*) override {}
     bool onIsUsed(GrSurfaceProxy* proxy) const override {
         SkASSERT(proxy != fTarget.get());  // This case should be handled by GrRenderTask.
         return proxy == fSrcProxy.get();
diff --git a/src/gpu/GrDrawingManager.cpp b/src/gpu/GrDrawingManager.cpp
index 65d8059..fe44231 100644
--- a/src/gpu/GrDrawingManager.cpp
+++ b/src/gpu/GrDrawingManager.cpp
@@ -577,6 +577,10 @@
 
     fDAG.swap(&ddl->fRenderTasks);
 
+    for (auto renderTask : ddl->fRenderTasks) {
+        renderTask->prePrepare();
+    }
+
     if (fPathRendererChain) {
         if (auto ccpr = fPathRendererChain->getCoverageCountingPathRenderer()) {
             ddl->fPendingPaths = ccpr->detachPendingPaths();
diff --git a/src/gpu/GrOpsTask.cpp b/src/gpu/GrOpsTask.cpp
index 35bfcb6..229dd9c 100644
--- a/src/gpu/GrOpsTask.cpp
+++ b/src/gpu/GrOpsTask.cpp
@@ -391,6 +391,26 @@
     fAuditTrail = nullptr;
 }
 
+void GrOpsTask::onPrePrepare() {
+    SkASSERT(this->isClosed());
+#ifdef SK_BUILD_FOR_ANDROID_FRAMEWORK
+    TRACE_EVENT0("skia.gpu", TRACE_FUNC);
+#endif
+    // TODO: remove the check for discard here once reduced op splitting is turned on. Currently we
+    // can end up with GrOpsTasks that have only a discard load op and no ops. For Vulkan validation
+    // we need to keep that discard and not drop it. Once reduced op splitting is enabled we
+    // shouldn't end up with GrOpsTasks that contain only a discard.
+    if (this->isNoOp() || (fClippedContentBounds.isEmpty() && fColorLoadOp != GrLoadOp::kDiscard)) {
+        return;
+    }
+
+    for (const auto& chain : fOpChains) {
+        if (chain.shouldExecute()) {
+            chain.head()->prePrepare();
+        }
+    }
+}
+
 void GrOpsTask::onPrepare(GrOpFlushState* flushState) {
     SkASSERT(fTarget->peekRenderTarget());
     SkASSERT(this->isClosed());
@@ -418,6 +438,7 @@
                                           chain.dstProxy());
 
             flushState->setOpArgs(&opArgs);
+            // GrOp::prePrepare may or may not have been called at this point
             chain.head()->prepare(flushState);
             flushState->setOpArgs(nullptr);
         }
diff --git a/src/gpu/GrOpsTask.h b/src/gpu/GrOpsTask.h
index 80e4ffd..9531ab9 100644
--- a/src/gpu/GrOpsTask.h
+++ b/src/gpu/GrOpsTask.h
@@ -50,6 +50,7 @@
      */
     void endFlush() override;
 
+    void onPrePrepare() override;
     /**
      * Together these two functions flush all queued up draws to GrCommandBuffer. The return value
      * of executeOps() indicates whether any commands were actually issued to the GPU.
diff --git a/src/gpu/GrRenderTask.h b/src/gpu/GrRenderTask.h
index 801713c..22bd25e 100644
--- a/src/gpu/GrRenderTask.h
+++ b/src/gpu/GrRenderTask.h
@@ -29,6 +29,8 @@
 
     void makeClosed(const GrCaps&);
 
+    void prePrepare() { this->onPrePrepare(); }
+
     // These two methods are only invoked at flush time
     void prepare(GrOpFlushState* flushState);
     bool execute(GrOpFlushState* flushState) { return this->onExecute(flushState); }
@@ -183,7 +185,8 @@
         }
     };
 
-    virtual void onPrepare(GrOpFlushState* flushState) = 0;
+    virtual void onPrePrepare() {} // Only GrOpsTask currently overrides this virtual
+    virtual void onPrepare(GrOpFlushState*) {} // Only GrOpsTask currently overrides this virtual
     virtual bool onExecute(GrOpFlushState* flushState) = 0;
 
     const uint32_t         fUniqueID;
diff --git a/src/gpu/GrTextureResolveRenderTask.h b/src/gpu/GrTextureResolveRenderTask.h
index 913f250..48fea39 100644
--- a/src/gpu/GrTextureResolveRenderTask.h
+++ b/src/gpu/GrTextureResolveRenderTask.h
@@ -18,7 +18,6 @@
     void addProxy(sk_sp<GrSurfaceProxy>, GrSurfaceProxy::ResolveFlags, const GrCaps&);
 
 private:
-    void onPrepare(GrOpFlushState*) override {}
     bool onIsUsed(GrSurfaceProxy* proxy) const override {
         SkASSERT(proxy != fTarget.get());  // This case should be handled by GrRenderTask.
         return false;
diff --git a/src/gpu/GrTransferFromRenderTask.h b/src/gpu/GrTransferFromRenderTask.h
index 08437c7..40e89da 100644
--- a/src/gpu/GrTransferFromRenderTask.h
+++ b/src/gpu/GrTransferFromRenderTask.h
@@ -27,7 +27,6 @@
             , fDstOffset(dstOffset) {}
 
 private:
-    void onPrepare(GrOpFlushState*) override {}
     bool onIsUsed(GrSurfaceProxy* proxy) const override {
         SkASSERT(!fTarget);
         return proxy == fSrcProxy.get();
diff --git a/src/gpu/GrWaitRenderTask.h b/src/gpu/GrWaitRenderTask.h
index 46c2edf..fc736e1 100644
--- a/src/gpu/GrWaitRenderTask.h
+++ b/src/gpu/GrWaitRenderTask.h
@@ -20,7 +20,6 @@
             , fNumSemaphores(numSemaphores){}
 
 private:
-    void onPrepare(GrOpFlushState*) override {}
     bool onIsUsed(GrSurfaceProxy* proxy) const override {
         SkASSERT(proxy != fTarget.get());  // This case should be handled by GrRenderTask.
         return false;
diff --git a/src/gpu/ops/GrMeshDrawOp.h b/src/gpu/ops/GrMeshDrawOp.h
index 5baf985..b956a4b 100644
--- a/src/gpu/ops/GrMeshDrawOp.h
+++ b/src/gpu/ops/GrMeshDrawOp.h
@@ -72,7 +72,10 @@
     };
 
 private:
+    void onPrePrepare() final { this->onPrePrepareDraws(); }
     void onPrepare(GrOpFlushState* state) final;
+
+    virtual void onPrePrepareDraws() {}  // Only GrTextureOp currently overrides this virtual
     virtual void onPrepareDraws(Target*) = 0;
     typedef GrDrawOp INHERITED;
 };
diff --git a/src/gpu/ops/GrOp.h b/src/gpu/ops/GrOp.h
index e63df01..f42b5b5 100644
--- a/src/gpu/ops/GrOp.h
+++ b/src/gpu/ops/GrOp.h
@@ -154,6 +154,13 @@
     }
 
     /**
+     * This can optionally be called before 'prepare' (but after sorting). Each op that overrides
+     * onPrePrepare must be able to handle both cases: when onPrePrepare has been called ahead of
+     * time and when it has not been called at all.
+     */
+    void prePrepare() { this->onPrePrepare(); }
+
+    /**
      * Called prior to executing. The op should perform any resource creation or data transfers
      * necessary before execute() is called.
      */
@@ -282,6 +289,7 @@
         return CombineResult::kCannotCombine;
     }
 
+    virtual void onPrePrepare() {}  // Only GrMeshDrawOp currently overrides this virtual
     virtual void onPrepare(GrOpFlushState*) = 0;
     // If this op is chained then chainBounds is the union of the bounds of all ops in the chain.
     // Otherwise, this op's bounds.
diff --git a/src/gpu/ops/GrTextureOp.cpp b/src/gpu/ops/GrTextureOp.cpp
index b3ef414..669c935 100644
--- a/src/gpu/ops/GrTextureOp.cpp
+++ b/src/gpu/ops/GrTextureOp.cpp
@@ -282,7 +282,8 @@
             , fQuads(1, true /* includes locals */)
             , fTextureColorSpaceXform(std::move(textureColorSpaceXform))
             , fSaturate(static_cast<unsigned>(saturate))
-            , fFilter(static_cast<unsigned>(filter)) {
+            , fFilter(static_cast<unsigned>(filter))
+            , fPrePrepared(false) {
         // Clean up disparities between the overall aa type and edge configuration and apply
         // optimizations based on the rect and matrix when appropriate
         GrQuadUtils::ResolveAAType(aaType, aaFlags, dstQuad, &aaType, &aaFlags);
@@ -319,7 +320,8 @@
             , fQuads(cnt, true /* includes locals */)
             , fTextureColorSpaceXform(std::move(textureColorSpaceXform))
             , fSaturate(static_cast<unsigned>(saturate))
-            , fFilter(static_cast<unsigned>(filter)) {
+            , fFilter(static_cast<unsigned>(filter))
+            , fPrePrepared(false) {
         fProxyCnt = SkToUInt(cnt);
         SkRect bounds = SkRectPriv::MakeLargestInverted();
         GrAAType overallAAType = GrAAType::kNone; // aa type maximally compatible with all dst rects
@@ -425,6 +427,13 @@
         }
     }
 
+    void onPrePrepareDraws() override {
+        SkASSERT(!fPrePrepared);
+        // TODO: pull the tessellation of the quads forward to here
+        fPrePrepared = true;
+    }
+
+    // onPrePrepareDraws may or may not have been called at this point
     void onPrepareDraws(Target* target) override {
         TRACE_EVENT0("skia.gpu", TRACE_FUNC);
         GrQuad::Type quadType = GrQuad::Type::kAxisAligned;
@@ -555,6 +564,13 @@
     CombineResult onCombineIfPossible(GrOp* t, const GrCaps& caps) override {
         TRACE_EVENT0("skia.gpu", TRACE_FUNC);
         const auto* that = t->cast<TextureOp>();
+
+        if (fPrePrepared || that->fPrePrepared) {
+            // This should never happen (since only DDL-recorded ops should be prePrepared) but,
+            // in any case, we should never combine ops that have been prePrepared.
+            return CombineResult::kCannotCombine;
+        }
+
         if (fDomain != that->fDomain) {
             // It is technically possible to combine operations across domain modes, but performance
             // testing suggests it's better to make more draw calls where some take advantage of
@@ -615,7 +631,8 @@
     unsigned fDomain : 1;
     unsigned fColorType : 2;
     GR_STATIC_ASSERT(GrQuadPerEdgeAA::kColorTypeCount <= 4);
-    unsigned fProxyCnt : 32 - 8;
+    unsigned fPrePrepared : 1;
+    unsigned fProxyCnt : 32 - 9;
     Proxy fProxies[1];
 
     static_assert(GrQuad::kTypeCount <= 4, "GrQuad::Type does not fit in 2 bits");