Reland "Delete the index buffer from middle-out tessellation"

This is a reland of 0d0b1b3b56da6d13506d34756c79a79c4929950f

Original change's description:
> Delete the index buffer from middle-out tessellation
>
> This gives us more flexibility for customizing triangulations in
> future modes. It is also hopefully cheaper than the extra memory
> indirection from indexed draws.
>
> Bug: skia:10419
> Bug: chromium:1202607
> Change-Id: Iba41a35a634edf8f962c3d604c7e035e7a85801d
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/407296
> Commit-Queue: Chris Dalton <csmartdalton@google.com>
> Reviewed-by: Greg Daniel <egdaniel@google.com>

Bug: skia:10419
Bug: chromium:1202607
Change-Id: I2f5022d2122dee1ca197780b534663b37cd2504f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/408236
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Chris Dalton <csmartdalton@google.com>
diff --git a/src/gpu/GrDrawIndirectCommand.h b/src/gpu/GrDrawIndirectCommand.h
index 17cd709..d645223 100644
--- a/src/gpu/GrDrawIndirectCommand.h
+++ b/src/gpu/GrDrawIndirectCommand.h
@@ -52,7 +52,7 @@
 
     bool operator==(const GrDrawIndirectWriter& that) { return fData == that.fData; }
 
-    bool isValid() const { return fData != nullptr; }
+    explicit operator bool() const { return fData != nullptr; }  // No implicit bool conversions.
 
     GrDrawIndirectWriter makeOffset(int drawCount) const { return {fData + drawCount}; }
 
diff --git a/src/gpu/GrShaderCaps.cpp b/src/gpu/GrShaderCaps.cpp
index 0beba60..22ec760 100644
--- a/src/gpu/GrShaderCaps.cpp
+++ b/src/gpu/GrShaderCaps.cpp
@@ -51,7 +51,7 @@
     fSampleMaskSupport = false;
     fExternalTextureSupport = false;
     fVertexIDSupport = false;
-    fFPManipulationSupport = false;
+    fBitManipulationSupport = false;
     fFloatIs32Bits = true;
     fHalfIs32Bits = false;
     fHasLowFragmentPrecision = false;
@@ -135,7 +135,7 @@
     writer->appendBool("Sample mask support", fSampleMaskSupport);
     writer->appendBool("External texture support", fExternalTextureSupport);
     writer->appendBool("sk_VertexID support", fVertexIDSupport);
-    writer->appendBool("Floating point manipulation support", fFPManipulationSupport);
+    writer->appendBool("Bit manipulation support", fBitManipulationSupport);
     writer->appendBool("float == fp32", fFloatIs32Bits);
     writer->appendBool("half == fp32", fHalfIs32Bits);
     writer->appendBool("Has poor fragment precision", fHasLowFragmentPrecision);
diff --git a/src/gpu/GrShaderCaps.h b/src/gpu/GrShaderCaps.h
index bf47593..446dc74 100644
--- a/src/gpu/GrShaderCaps.h
+++ b/src/gpu/GrShaderCaps.h
@@ -76,8 +76,8 @@
 
     bool vertexIDSupport() const { return fVertexIDSupport; }
 
-    // frexp, ldexp, etc.
-    bool fpManipulationSupport() const { return fFPManipulationSupport; }
+    // frexp, ldexp, findMSB, findLSB.
+    bool bitManipulationSupport() const { return fBitManipulationSupport; }
 
     bool floatIs32Bits() const { return fFloatIs32Bits; }
 
@@ -289,7 +289,7 @@
     bool fSampleMaskSupport                 : 1;
     bool fExternalTextureSupport            : 1;
     bool fVertexIDSupport                   : 1;
-    bool fFPManipulationSupport             : 1;
+    bool fBitManipulationSupport            : 1;
     bool fFloatIs32Bits                     : 1;
     bool fHalfIs32Bits                      : 1;
     bool fHasLowFragmentPrecision           : 1;
diff --git a/src/gpu/d3d/GrD3DCaps.cpp b/src/gpu/d3d/GrD3DCaps.cpp
index 9bd72bf..940c7aa 100644
--- a/src/gpu/d3d/GrD3DCaps.cpp
+++ b/src/gpu/d3d/GrD3DCaps.cpp
@@ -241,7 +241,7 @@
 
     shaderCaps->fIntegerSupport = true;
     shaderCaps->fVertexIDSupport = true;
-    shaderCaps->fFPManipulationSupport = true;
+    shaderCaps->fBitManipulationSupport = true;
 
     shaderCaps->fFloatIs32Bits = true;
     shaderCaps->fHalfIs32Bits =
diff --git a/src/gpu/gl/GrGLCaps.cpp b/src/gpu/gl/GrGLCaps.cpp
index e2f43a4..5d4a686 100644
--- a/src/gpu/gl/GrGLCaps.cpp
+++ b/src/gpu/gl/GrGLCaps.cpp
@@ -955,9 +955,9 @@
     }
 
     if (GR_IS_GR_GL(standard)) {
-        shaderCaps->fFPManipulationSupport = ctxInfo.glslGeneration() >= k400_GrGLSLGeneration;
+        shaderCaps->fBitManipulationSupport = ctxInfo.glslGeneration() >= k400_GrGLSLGeneration;
     } else if (GR_IS_GR_GL_ES(standard) || GR_IS_GR_WEBGL(standard)) {
-        shaderCaps->fFPManipulationSupport = ctxInfo.glslGeneration() >= k310es_GrGLSLGeneration;
+        shaderCaps->fBitManipulationSupport = ctxInfo.glslGeneration() >= k310es_GrGLSLGeneration;
     }
 
     shaderCaps->fFloatIs32Bits = is_float_fp32(ctxInfo, gli, GR_GL_HIGH_FLOAT);
diff --git a/src/gpu/tessellate/GrPathTessellator.cpp b/src/gpu/tessellate/GrPathTessellator.cpp
index 7fe80ae..ce31813 100644
--- a/src/gpu/tessellate/GrPathTessellator.cpp
+++ b/src/gpu/tessellate/GrPathTessellator.cpp
@@ -101,19 +101,13 @@
         SkASSERT(count == breadcrumbTriangleList->count());
     }
 
-    fIndirectIndexBuffer = GrMiddleOutCubicShader::FindOrMakeMiddleOutIndexBuffer(
-            target->resourceProvider());
-    if (!fIndirectIndexBuffer) {
-        vertexAlloc.unlock(0);
-        return;
-    }
-
-    // Allocate space for the GrDrawIndexedIndirectCommand structs. Allocate enough for each
+    // Allocate space for the GrDrawIndirectCommand structs. Allocate enough for each
     // possible resolve level (kMaxResolveLevel; resolveLevel=0 never has any instances), plus one
     // more for the optional inner fan triangles.
     int indirectLockCnt = kMaxResolveLevel + 1;
-    GrDrawIndexedIndirectWriter indirectWriter = target->makeDrawIndexedIndirectSpace(
-            indirectLockCnt, &fIndirectDrawBuffer, &fIndirectDrawOffset);
+    GrDrawIndirectWriter indirectWriter = target->makeDrawIndirectSpace(indirectLockCnt,
+                                                                        &fIndirectDrawBuffer,
+                                                                        &fIndirectDrawOffset);
     if (!indirectWriter) {
         SkASSERT(!fIndirectDrawBuffer);
         vertexAlloc.unlock(0);
@@ -220,9 +214,9 @@
 
 void GrPathIndirectTessellator::draw(GrOpFlushState* flushState) const {
     if (fIndirectDrawCount) {
-        flushState->bindBuffers(fIndirectIndexBuffer, fInstanceBuffer, nullptr);
-        flushState->drawIndexedIndirect(fIndirectDrawBuffer.get(), fIndirectDrawOffset,
-                                        fIndirectDrawCount);
+        flushState->bindBuffers(nullptr, fInstanceBuffer, nullptr);
+        flushState->drawIndirect(fIndirectDrawBuffer.get(), fIndirectDrawOffset,
+                                 fIndirectDrawCount);
     }
 }
 
diff --git a/src/gpu/tessellate/GrPathTessellator.h b/src/gpu/tessellate/GrPathTessellator.h
index 393ac26..96494c4 100644
--- a/src/gpu/tessellate/GrPathTessellator.h
+++ b/src/gpu/tessellate/GrPathTessellator.h
@@ -71,7 +71,6 @@
     sk_sp<const GrBuffer> fIndirectDrawBuffer;
     size_t fIndirectDrawOffset = 0;
     int fIndirectDrawCount = 0;
-    sk_sp<const GrBuffer> fIndirectIndexBuffer;
 };
 
 // Base class for GrPathTessellators that draw actual hardware tessellation patches.
diff --git a/src/gpu/tessellate/GrStencilPathShader.cpp b/src/gpu/tessellate/GrStencilPathShader.cpp
index fb4aee8..b827b6c 100644
--- a/src/gpu/tessellate/GrStencilPathShader.cpp
+++ b/src/gpu/tessellate/GrStencilPathShader.cpp
@@ -302,100 +302,56 @@
     return new WedgeImpl;
 }
 
-constexpr static int kMaxResolveLevel = GrTessellationPathRenderer::kMaxResolveLevel;
-
-GR_DECLARE_STATIC_UNIQUE_KEY(gMiddleOutIndexBufferKey);
-
-sk_sp<const GrGpuBuffer> GrMiddleOutCubicShader::FindOrMakeMiddleOutIndexBuffer(
-        GrResourceProvider* resourceProvider) {
-    GR_DEFINE_STATIC_UNIQUE_KEY(gMiddleOutIndexBufferKey);
-    if (auto buffer = resourceProvider->findByUniqueKey<GrGpuBuffer>(gMiddleOutIndexBufferKey)) {
-        return std::move(buffer);
-    }
-
-    // One explicit triangle at index 0, and one middle-out cubic with kMaxResolveLevel line
-    // segments beginning at index 3.
-    constexpr static int kIndexCount = 3 + NumVerticesAtResolveLevel(kMaxResolveLevel);
-    auto buffer = resourceProvider->createBuffer(
-            kIndexCount * sizeof(uint16_t), GrGpuBufferType::kIndex, kStatic_GrAccessPattern);
-    if (!buffer) {
-        return nullptr;
-    }
-
-    // We shouldn't bin and/or cache static buffers.
-    SkASSERT(buffer->size() == kIndexCount * sizeof(uint16_t));
-    SkASSERT(!buffer->resourcePriv().getScratchKey().isValid());
-    auto indexData = static_cast<uint16_t*>(buffer->map());
-    SkAutoTMalloc<uint16_t> stagingBuffer;
-    if (!indexData) {
-        SkASSERT(!buffer->isMapped());
-        indexData = stagingBuffer.reset(kIndexCount);
-    }
-
-    // Indices 0,1,2 contain special values that emit points P0, P1, and P2 respectively. (When the
-    // vertex shader is fed an index value larger than (1 << kMaxResolveLevel), it emits
-    // P[index % 4].)
-    int i = 0;
-    indexData[i++] = (1 << kMaxResolveLevel) + 4;  // % 4 == 0
-    indexData[i++] = (1 << kMaxResolveLevel) + 5;  // % 4 == 1
-    indexData[i++] = (1 << kMaxResolveLevel) + 6;  // % 4 == 2
-
-    // Starting at index 3, we triangulate a cubic with 2^kMaxResolveLevel line segments. Each
-    // index value corresponds to parametric value T=(index / 2^kMaxResolveLevel). Since the
-    // triangles are arranged in "middle-out" order, we will be able to conveniently control the
-    // resolveLevel by changing only the indexCount.
-    for (uint16_t advance = 1 << (kMaxResolveLevel - 1); advance; advance >>= 1) {
-        uint16_t T = 0;
-        do {
-            indexData[i++] = T;
-            indexData[i++] = (T += advance);
-            indexData[i++] = (T += advance);
-        } while (T != (1 << kMaxResolveLevel));
-    }
-    SkASSERT(i == kIndexCount);
-
-    if (buffer->isMapped()) {
-        buffer->unmap();
-    } else {
-        buffer->updateData(stagingBuffer, kIndexCount * sizeof(uint16_t));
-    }
-    buffer->resourcePriv().setUniqueKey(gMiddleOutIndexBufferKey);
-    return std::move(buffer);
-}
-
 class GrMiddleOutCubicShader::Impl : public GrStencilPathShader::Impl {
     void onEmitCode(EmitArgs& args, GrGPArgs* gpArgs) override {
         const auto& shader = args.fGeomProc.cast<GrMiddleOutCubicShader>();
         args.fVaryingHandler->emitAttributes(shader);
-        args.fVertBuilder->defineConstantf("int", "kMaxVertexID", "%i", 1 << kMaxResolveLevel);
-        args.fVertBuilder->defineConstantf("float", "kInverseMaxVertexID",
-                                           "(1.0 / float(kMaxVertexID))");
         args.fVertBuilder->insertFunction(kUnpackRationalCubicFn);
         args.fVertBuilder->insertFunction(kEvalRationalCubicFn);
+        if (args.fShaderCaps->bitManipulationSupport()) {
+            // Determines the T value at which to place the given vertex in a "middle-out" topology.
+            args.fVertBuilder->insertFunction(R"(
+            float find_middle_out_T() {
+                int totalTriangleIdx = sk_VertexID/3 + 1;
+                int depth = findMSB(totalTriangleIdx);
+                int firstTriangleAtDepth = (1 << depth);
+                int triangleIdxWithinDepth = totalTriangleIdx - firstTriangleAtDepth;
+                int vertexIdxWithinDepth = triangleIdxWithinDepth * 2 + sk_VertexID % 3;
+                return ldexp(float(vertexIdxWithinDepth), -1 - depth);
+            })");
+        } else {
+            // Float-math fallback (no findMSB/ldexp) for finding a vertex's "middle-out" T value.
+            args.fVertBuilder->insertFunction(R"(
+            float find_middle_out_T() {
+                float totalTriangleIdx = float(sk_VertexID/3) + 1;
+                float depth = floor(log2(totalTriangleIdx));
+                float firstTriangleAtDepth = exp2(depth);
+                float triangleIdxWithinDepth = totalTriangleIdx - firstTriangleAtDepth;
+                float vertexIdxWithinDepth = triangleIdxWithinDepth * 2 + float(sk_VertexID % 3);
+                return vertexIdxWithinDepth * exp2(-1 - depth);
+            })");
+        }
         args.fVertBuilder->codeAppend(R"(
         float2 pos;
         if (isinf(inputPoints_2_3.z)) {
             // A conic with w=Inf is an exact triangle.
-            pos = (sk_VertexID == 0) ? inputPoints_0_1.xy :
-                  (sk_VertexID != kMaxVertexID) ? inputPoints_0_1.zw : inputPoints_2_3.xy;
+            pos = (sk_VertexID < 1)  ? inputPoints_0_1.xy
+                : (sk_VertexID == 1) ? inputPoints_0_1.zw
+                                     : inputPoints_2_3.xy;
         } else {
-            // Evaluate the cubic at T = (sk_VertexID / 2^kMaxResolveLevel).
-            float T = float(sk_VertexID) * kInverseMaxVertexID;
             float4x3 P = unpack_rational_cubic(inputPoints_0_1.xy, inputPoints_0_1.zw,
                                                inputPoints_2_3.xy, inputPoints_2_3.zw);
+            float T = find_middle_out_T();
             pos = eval_rational_cubic(P, T);
         })");
-
-        GrShaderVar vertexPos("pos", kFloat2_GrSLType);
         if (!shader.viewMatrix().isIdentity()) {
             const char* viewMatrix;
             fViewMatrixUniform = args.fUniformHandler->addUniform(
                     nullptr, kVertex_GrShaderFlag, kFloat3x3_GrSLType, "view_matrix", &viewMatrix);
             args.fVertBuilder->codeAppendf(R"(
-            float2 transformedPoint = (%s * float3(pos, 1)).xy;)", viewMatrix);
-            vertexPos.set(kFloat2_GrSLType, "transformedPoint");
+            pos = (%s * float3(pos, 1)).xy;)", viewMatrix);
         }
-        gpArgs->fPositionVar = vertexPos;
+        gpArgs->fPositionVar.set(kFloat2_GrSLType, "pos");
         // No fragment shader.
     }
 };
diff --git a/src/gpu/tessellate/GrStencilPathShader.h b/src/gpu/tessellate/GrStencilPathShader.h
index 203a184..ea12276 100644
--- a/src/gpu/tessellate/GrStencilPathShader.h
+++ b/src/gpu/tessellate/GrStencilPathShader.h
@@ -133,10 +133,17 @@
     GrGLSLGeometryProcessor* createGLSLInstance(const GrShaderCaps&) const override;
 };
 
-// Uses indirect (instanced) draws to triangulate standalone closed cubics with a "middle-out"
-// topology. The caller must compute each cubic's resolveLevel on the CPU (i.e., the log2 number of
-// line segments it will be divided into; see GrWangsFormula::cubic_log2/quadratic_log2), and then
-// sort the instance buffer by resolveLevel for efficient batching of indirect draws.
+// Uses instanced draws to triangulate standalone closed curves with a "middle-out" topology.
+// Middle-out draws a triangle with vertices at T=[0, 1/2, 1] and then recurses breadth first:
+//
+//   depth=0: T=[0, 1/2, 1]
+//   depth=1: T=[0, 1/4, 2/4], T=[2/4, 3/4, 1]
+//   depth=2: T=[0, 1/8, 2/8], T=[2/8, 3/8, 4/8], T=[4/8, 5/8, 6/8], T=[6/8, 7/8, 1]
+//   ...
+//
+// The caller may compute each cubic's resolveLevel on the CPU (i.e., the log2 number of line
+// segments it will be divided into; see GrWangsFormula::cubic_log2/quadratic_log2/conic_log2), and
+// then sort the instance buffer by resolveLevel for efficient batching of indirect draws.
 class GrMiddleOutCubicShader : public GrStencilPathShader {
 public:
     // How many vertices do we need to draw in order to triangulate a cubic with 2^resolveLevel
@@ -152,21 +159,16 @@
 
     // Configures an indirect draw to render cubic instances with 2^resolveLevel evenly-spaced (in
     // the parametric sense) line segments.
-    static void WriteDrawIndirectCmd(GrDrawIndexedIndirectWriter* indirectWriter, int resolveLevel,
+    static void WriteDrawIndirectCmd(GrDrawIndirectWriter* indirectWriter, int resolveLevel,
                                      uint32_t instanceCount, uint32_t baseInstance) {
         SkASSERT(resolveLevel > 0 && resolveLevel <= GrTessellationPathRenderer::kMaxResolveLevel);
-        // Starting at baseIndex=3, the index buffer triangulates a cubic with 2^kMaxResolveLevel
-        // line segments. Each index value corresponds to a parametric T value on the curve. Since
-        // the triangles are arranged in "middle-out" order, we can conveniently control the
-        // resolveLevel by changing only the indexCount.
-        uint32_t indexCount = NumVerticesAtResolveLevel(resolveLevel);
-        indirectWriter->writeIndexed(indexCount, 3, instanceCount, baseInstance, 0);
+        // The vertex shader determines the T value at which to draw each vertex. Since the
+        // triangles are arranged in "middle-out" order, we can conveniently control the
+        // resolveLevel by changing only the vertexCount.
+        uint32_t vertexCount = NumVerticesAtResolveLevel(resolveLevel);
+        indirectWriter->write(instanceCount, baseInstance, vertexCount, 0);
     }
 
-    // Returns the index buffer that should be bound when drawing with this shader.
-    // (Our vertex shader uses raw index values directly, so there is no vertex buffer.)
-    static sk_sp<const GrGpuBuffer> FindOrMakeMiddleOutIndexBuffer(GrResourceProvider*);
-
     GrMiddleOutCubicShader(const SkMatrix& viewMatrix)
             : GrStencilPathShader(kTessellate_GrMiddleOutCubicShader_ClassID, viewMatrix,
                                   GrPrimitiveType::kTriangles) {
diff --git a/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp b/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
index 593b38f..5fdf450 100644
--- a/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
+++ b/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
@@ -758,7 +758,7 @@
     GrDrawIndirectWriter indirectWriter = target->makeDrawIndirectSpace(fChainedDrawIndirectCount,
                                                                         &fDrawIndirectBuffer,
                                                                         &fDrawIndirectOffset);
-    if (!indirectWriter.isValid()) {
+    if (!indirectWriter) {
         SkASSERT(!fDrawIndirectBuffer);
         return;
     }
diff --git a/src/gpu/vk/GrVkCaps.cpp b/src/gpu/vk/GrVkCaps.cpp
index 223b361..563b3df 100644
--- a/src/gpu/vk/GrVkCaps.cpp
+++ b/src/gpu/vk/GrVkCaps.cpp
@@ -716,7 +716,7 @@
     shaderCaps->fIntegerSupport = true;
     shaderCaps->fNonsquareMatrixSupport = true;
     shaderCaps->fVertexIDSupport = true;
-    shaderCaps->fFPManipulationSupport = true;
+    shaderCaps->fBitManipulationSupport = true;
 
     // Assume the minimum precisions mandated by the SPIR-V spec.
     shaderCaps->fFloatIs32Bits = true;