Chain indirect stroke ops that have mismatched colors

The indirect tessellator can't combine overlapping, mismatched colors
because the log2 binning draws things out of order. But we can still
chain them together and generate a single long list of indirect draws.

Bug: chromium:1172543
Bug: skia:10419
Change-Id: Id7fc7488411a2a189e24cd7dd692e5c78497f498
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/370197
Reviewed-by: Brian Salomon <bsalomon@google.com>
diff --git a/bench/TessellateBench.cpp b/bench/TessellateBench.cpp
index 6d1e6fa..b9d9a9d 100644
--- a/bench/TessellateBench.cpp
+++ b/bench/TessellateBench.cpp
@@ -229,10 +229,10 @@
         }
         for (int i = 0; i < loops; ++i) {
             SkMatrix matrix = SkMatrix::Scale(fMatrixScale, fMatrixScale);
-            GrStrokeHardwareTessellator tessellator(ShaderFlags::kNone,
+            GrStrokeHardwareTessellator tessellator(ShaderFlags::kNone, {fPath, fStrokeRec,
+                                                    SK_PMColor4fWHITE}, fPath.countVerbs(),
                                                     *fTarget->caps().shaderCaps());
-            tessellator.prepare(fTarget.get(), matrix, {fPath, fStrokeRec, SK_PMColor4fWHITE},
-                                fPath.countVerbs());
+            tessellator.prepare(fTarget.get(), matrix);
         }
     }
 
@@ -273,8 +273,7 @@
                 GrStrokeIndirectTessellator tessellator(ShaderFlags::kNone, SkMatrix::I(),
                                                         {path, fStrokeRec, SK_PMColor4fWHITE},
                                                         path.countVerbs(), fTarget->allocator());
-                tessellator.prepare(fTarget.get(), SkMatrix::I(),
-                                    {path, fStrokeRec, SK_PMColor4fWHITE}, path.countVerbs());
+                tessellator.prepare(fTarget.get(), SkMatrix::I());
             }
             fTarget->resetAllocator();
         }
diff --git a/src/gpu/tessellate/GrStrokeHardwareTessellator.cpp b/src/gpu/tessellate/GrStrokeHardwareTessellator.cpp
index 31dc3b0..20d06d4 100644
--- a/src/gpu/tessellate/GrStrokeHardwareTessellator.cpp
+++ b/src/gpu/tessellate/GrStrokeHardwareTessellator.cpp
@@ -94,9 +94,8 @@
     return a.cross(b) == 0 && a.dot(b) < 0;
 }
 
-void GrStrokeHardwareTessellator::prepare(GrMeshDrawOp::Target* target, const SkMatrix& viewMatrix,
-                                          const GrSTArenaList<PathStroke>& pathStrokeList,
-                                          int totalCombinedVerbCnt) {
+void GrStrokeHardwareTessellator::prepare(GrMeshDrawOp::Target* target,
+                                          const SkMatrix& viewMatrix) {
     SkASSERT(!fTarget);
     SkASSERT(!fViewMatrix);
     SkASSERT(!fStroke);
@@ -110,11 +109,11 @@
     }
 
     // Pre-allocate at least enough vertex space for 1 in 4 strokes to chop, and for 8 caps.
-    int strokePreallocCount = totalCombinedVerbCnt * 5/4;
+    int strokePreallocCount = fTotalCombinedVerbCnt * 5/4;
     int capPreallocCount = 8;
     this->allocPatchChunkAtLeast(strokePreallocCount + capPreallocCount);
 
-    for (const auto& pathStroke : pathStrokeList) {
+    for (const auto& pathStroke : fPathStrokeList) {
         const SkStrokeRec& stroke = pathStroke.fStroke;
         if (!fStroke || fStroke->getWidth() != stroke.getWidth() ||
             fStroke->getJoin() != stroke.getJoin()) {
diff --git a/src/gpu/tessellate/GrStrokeHardwareTessellator.h b/src/gpu/tessellate/GrStrokeHardwareTessellator.h
index 20766a6..4caff1b 100644
--- a/src/gpu/tessellate/GrStrokeHardwareTessellator.h
+++ b/src/gpu/tessellate/GrStrokeHardwareTessellator.h
@@ -18,16 +18,18 @@
 // MSAA if antialiasing is desired.
 class GrStrokeHardwareTessellator : public GrStrokeTessellator {
 public:
-    GrStrokeHardwareTessellator(ShaderFlags shaderFlags, const GrShaderCaps& shaderCaps)
-            : GrStrokeTessellator(shaderFlags)
+    GrStrokeHardwareTessellator(ShaderFlags shaderFlags,
+                                GrSTArenaList<PathStroke>&& pathStrokeList,
+                                int totalCombinedVerbCnt, const GrShaderCaps& shaderCaps)
+            : GrStrokeTessellator(shaderFlags, std::move(pathStrokeList))
+            , fTotalCombinedVerbCnt(totalCombinedVerbCnt)
             , fPatchStride(GrStrokeTessellateShader::PatchStride(fShaderFlags))
             // Subtract 2 because the tessellation shader chops every cubic at two locations, and
             // each chop has the potential to introduce an extra segment.
             , fMaxTessellationSegments(shaderCaps.maxTessellationSegments() - 2) {
     }
 
-    void prepare(GrMeshDrawOp::Target*, const SkMatrix&, const GrSTArenaList<PathStroke>&,
-                 int totalCombinedVerbCnt) override;
+    void prepare(GrMeshDrawOp::Target*, const SkMatrix&) override;
 
     void draw(GrOpFlushState*) const override;
 
@@ -76,6 +78,9 @@
     bool reservePatch();
     void allocPatchChunkAtLeast(int minPatchAllocCount);
 
+    // The combined number of path verbs from all paths in fPathStrokeList.
+    const int fTotalCombinedVerbCnt;
+
     // Size in bytes of a tessellation patch with our shader flags.
     const size_t fPatchStride;
 
diff --git a/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp b/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
index 15ed2a9..b3ae838 100644
--- a/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
+++ b/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
@@ -439,18 +439,12 @@
 
 }  // namespace
 
-GrStrokeIndirectTessellator::GrStrokeIndirectTessellator(
-        ShaderFlags shaderFlags, const SkMatrix& viewMatrix,
-        const GrSTArenaList<PathStroke>& pathStrokeList, int totalCombinedVerbCnt,
-        SkArenaAlloc* alloc)
-        : GrStrokeTessellator(shaderFlags) {
-    // We can't combine colors because our log2 binning draws things out of order.
-    SkASSERT(!(fShaderFlags & ShaderFlags::kDynamicColor));
-    SkASSERT(!fTotalInstanceCount);
-    SkASSERT(!fResolveLevels);
-    SkASSERT(!fResolveLevelArrayCount);
-    SkASSERT(!fChopTsArrayCount);
-
+GrStrokeIndirectTessellator::GrStrokeIndirectTessellator(ShaderFlags shaderFlags,
+                                                         const SkMatrix& viewMatrix,
+                                                         GrSTArenaList<PathStroke>&& pathStrokeList,
+                                                         int totalCombinedVerbCnt,
+                                                         SkArenaAlloc* alloc)
+        : GrStrokeTessellator(shaderFlags, std::move(pathStrokeList)) {
     // The maximum potential number of values we will need in fResolveLevels is:
     //
     //   * 3 segments per verb (from two chops)
@@ -470,7 +464,7 @@
 
     float lastStrokeWidth = -1;
     SkPoint lastControlPoint = {0,0};
-    for (const auto& pathStroke : pathStrokeList) {
+    for (const auto& pathStroke : fPathStrokeList) {
         const SkStrokeRec& stroke = pathStroke.fStroke;
         SkASSERT(stroke.getWidth() >= 0);  // Otherwise we can't initialize lastStrokeWidth=-1.
         if (stroke.getWidth() != lastStrokeWidth ||
@@ -528,7 +522,6 @@
                     if (counter.countLine(pts, lastControlPoint, nextResolveLevel)) {
                         ++nextResolveLevel;
                     }
-                    ++fTotalInstanceCount;
                     break;
                 case Verb::kConic:
                     // We use the same quadratic formula for conics, ignoring w. This is pretty
@@ -549,10 +542,8 @@
                             ++nextResolveLevel;
                         }
                         ++fResolveLevelCounts[0];  // Second line instance.
-                        fTotalInstanceCount += 3;
                     } else {
                         counter.countQuad(pts, lastControlPoint, nextResolveLevel++);
-                        ++fTotalInstanceCount;
                     }
                     break;
                 }
@@ -562,7 +553,6 @@
                     int numChops = GrPathUtils::findCubicConvex180Chops(pts, nextChopTs, &areCusps);
                     if (areCusps && numChops > 0) {
                         cuspResolveLevel = counter.countCircles(numChops);
-                        fTotalInstanceCount += numChops;
                     }
                     if (numChops == 0) {
                         counter.countCubic(pts, lastControlPoint, nextResolveLevel);
@@ -589,13 +579,11 @@
                     }
                     nextResolveLevel += numChops + 1;
                     nextChopTs += numChops;
-                    fTotalInstanceCount += numChops + 1;
                     break;
                 }
                 case Verb::kCircle:
                     // The iterator implements round caps as circles.
                     *nextResolveLevel++ = counter.countCircles(1);
-                    ++fTotalInstanceCount;
                     break;
                 case Verb::kMoveWithinContour:
                 case Verb::kContourFinished:
@@ -607,6 +595,14 @@
     }
     counter.flush();
 
+    for (int resolveLevelInstanceCount : fResolveLevelCounts) {
+        fTotalInstanceCount += resolveLevelInstanceCount;
+        if (resolveLevelInstanceCount) {
+            ++fChainedDrawIndirectCount;
+        }
+    }
+    fChainedInstanceCount = fTotalInstanceCount;
+
 #ifdef SK_DEBUG
     SkASSERT(nextResolveLevel <= fResolveLevels + resolveLevelAllocCount);
     fResolveLevelArrayCount = nextResolveLevel - fResolveLevels;
@@ -616,6 +612,24 @@
 #endif
 }
 
+void GrStrokeIndirectTessellator::addToChain(GrStrokeIndirectTessellator* tessellator) {
+    SkASSERT(tessellator->fShaderFlags == fShaderFlags);
+
+    fChainedInstanceCount += tessellator->fChainedInstanceCount;
+    tessellator->fChainedInstanceCount = 0;
+
+    fChainedDrawIndirectCount += tessellator->fChainedDrawIndirectCount;
+    tessellator->fChainedDrawIndirectCount = 0;
+
+    fMaxNumExtraEdgesInJoin = std::max(tessellator->fMaxNumExtraEdgesInJoin,
+                                       fMaxNumExtraEdgesInJoin);
+    tessellator->fMaxNumExtraEdgesInJoin = 0;
+
+    *fChainTail = tessellator;
+    fChainTail = tessellator->fChainTail;
+    tessellator->fChainTail = nullptr;
+}
+
 namespace {
 
 constexpr static int num_edges_in_resolve_level(int resolveLevel) {
@@ -639,6 +653,7 @@
                           ShaderFlags shaderFlags, size_t instanceStride, int baseInstance,
                           int numExtraEdgesInJoin, const int resolveLevelCounts[kNumBins])
             : fShaderFlags(shaderFlags) {
+        SkASSERT(numExtraEdgesInJoin == 3 || numExtraEdgesInJoin == 4);
         // Partition the instance buffer into bins and write out indirect draw commands per bin.
         int runningInstanceCount = 0;
         for (int i = 0; i < kNumBins; ++i) {
@@ -670,6 +685,13 @@
         fDynamicStroke.set(stroke);
     }
 
+    void updateDynamicColor(const SkPMColor4f& color) {
+        SkASSERT(fShaderFlags & ShaderFlags::kDynamicColor);
+        bool wideColor = fShaderFlags & ShaderFlags::kWideColor;
+        SkASSERT(wideColor || color.fitsInBytes());
+        fDynamicColor.set(color, wideColor);
+    }
+
     void writeStroke(int8_t resolveLevel, const SkPoint pts[4], SkPoint prevControlPoint,
                      bool isInternalChop = false) {
         SkASSERT(0 <= resolveLevel && resolveLevel < kNumBins);
@@ -710,6 +732,9 @@
         if (fShaderFlags & ShaderFlags::kDynamicStroke) {
             fInstanceWriters[resolveLevel].write(fDynamicStroke);
         }
+        if (fShaderFlags & ShaderFlags::kDynamicColor) {
+            fInstanceWriters[resolveLevel].write(fDynamicColor);
+        }
     }
 
     const ShaderFlags fShaderFlags;
@@ -717,45 +742,39 @@
     float fNumEdgesPerResolveLevel[kNumBins];
     SkDEBUGCODE(GrVertexWriter fEndWriters[kNumBins];)
 
-    // Stateful value for the dynamic stroke (if any) that will get written out with each instance.
+    // Stateful values for the dynamic state (if any) that will get written out with each instance.
     DynamicStroke fDynamicStroke;
+    GrVertexColor fDynamicColor;
 };
 
 }  // namespace
 
-void GrStrokeIndirectTessellator::prepare(GrMeshDrawOp::Target* target, const SkMatrix& viewMatrix,
-                                          const GrSTArenaList<PathStroke>& pathStrokeList,
-                                          int totalCombinedVerbCnt) {
+void GrStrokeIndirectTessellator::prepare(GrMeshDrawOp::Target* target,
+                                          const SkMatrix& viewMatrix) {
     SkASSERT(fResolveLevels);
     SkASSERT(!fDrawIndirectBuffer);
     SkASSERT(!fInstanceBuffer);
-    SkASSERT(!fDrawIndirectCount);
 
-    if (!fTotalInstanceCount) {
+    if (!fChainedDrawIndirectCount) {
         return;
     }
-
-    for (int resolveLevelCount : fResolveLevelCounts) {
-        if (resolveLevelCount) {
-            ++fDrawIndirectCount;
-        }
-    }
-    SkASSERT(fDrawIndirectCount);
+    SkASSERT(fChainedDrawIndirectCount > 0);
+    SkASSERT(fChainedInstanceCount > 0);
 
     // Allocate indirect draw commands.
-    GrDrawIndirectWriter indirectWriter = target->makeDrawIndirectSpace(fDrawIndirectCount,
+    GrDrawIndirectWriter indirectWriter = target->makeDrawIndirectSpace(fChainedDrawIndirectCount,
                                                                         &fDrawIndirectBuffer,
                                                                         &fDrawIndirectOffset);
     if (!indirectWriter.isValid()) {
         SkASSERT(!fDrawIndirectBuffer);
         return;
     }
-    SkDEBUGCODE(auto endIndirectWriter = indirectWriter.makeOffset(fDrawIndirectCount));
+    SkDEBUGCODE(auto endIndirectWriter = indirectWriter.makeOffset(fChainedDrawIndirectCount));
 
     // We already know the instance count. Allocate an instance for each.
     int baseInstance;
     size_t instanceStride = GrStrokeTessellateShader::IndirectInstanceStride(fShaderFlags);
-    GrVertexWriter instanceWriter = {target->makeVertexSpace(instanceStride, fTotalInstanceCount,
+    GrVertexWriter instanceWriter = {target->makeVertexSpace(instanceStride, fChainedInstanceCount,
                                                              &fInstanceBuffer, &baseInstance)};
     if (!instanceWriter.isValid()) {
         SkASSERT(!fInstanceBuffer);
@@ -763,10 +782,26 @@
         return;
     }
     SkDEBUGCODE(auto endInstanceWriter = instanceWriter.makeOffset(instanceStride *
-                                                                   fTotalInstanceCount);)
+                                                                   fChainedInstanceCount);)
 
-    BinningInstanceWriter binningWriter(&indirectWriter, &instanceWriter, fShaderFlags,
-                                        instanceStride, baseInstance, fMaxNumExtraEdgesInJoin,
+    // Fill in the indirect-draw and instance buffers.
+    for (auto* tess = this; tess; tess = tess->fNextInChain) {
+        tess->writeBuffers(&indirectWriter, &instanceWriter, viewMatrix, instanceStride,
+                           baseInstance, fMaxNumExtraEdgesInJoin);
+        baseInstance += tess->fTotalInstanceCount;
+    }
+
+    SkASSERT(indirectWriter == endIndirectWriter);
+    SkASSERT(instanceWriter == endInstanceWriter);
+}
+
+void GrStrokeIndirectTessellator::writeBuffers(GrDrawIndirectWriter* indirectWriter,
+                                               GrVertexWriter* instanceWriter,
+                                               const SkMatrix& viewMatrix,
+                                               size_t instanceStride, int baseInstance,
+                                               int numExtraEdgesInJoin) {
+    BinningInstanceWriter binningWriter(indirectWriter, instanceWriter, fShaderFlags,
+                                        instanceStride, baseInstance, numExtraEdgesInJoin,
                                         fResolveLevelCounts);
 
     SkPoint scratchBuffer[4 + 10];
@@ -781,12 +816,16 @@
     int8_t resolveLevel;
 
     // Now write out each instance to its resolveLevel's designated location in the instance buffer.
-    for (const auto& pathStroke : pathStrokeList) {
+    for (const auto& pathStroke : fPathStrokeList) {
         const SkStrokeRec& stroke = pathStroke.fStroke;
+        SkASSERT(stroke.getJoin() != SkPaint::kMiter_Join || numExtraEdgesInJoin == 4);
         bool isRoundJoin = (stroke.getJoin() == SkPaint::kRound_Join);
         if (fShaderFlags & ShaderFlags::kDynamicStroke) {
             binningWriter.updateDynamicStroke(stroke);
         }
+        if (fShaderFlags & ShaderFlags::kDynamicColor) {
+            binningWriter.updateDynamicColor(pathStroke.fColor);
+        }
         GrStrokeIterator iter(pathStroke.fPath, &stroke, &viewMatrix);
         bool hasLastControlPoint = false;
         while (iter.next()) {
@@ -911,20 +950,19 @@
         }
     }
 
-    SkASSERT(indirectWriter == endIndirectWriter);
-    SkASSERT(instanceWriter == endInstanceWriter);
     SkASSERT(nextResolveLevel == fResolveLevels + fResolveLevelArrayCount);
     SkASSERT(nextChopTs == fChopTs + fChopTsArrayCount);
 }
 
 void GrStrokeIndirectTessellator::draw(GrOpFlushState* flushState) const {
-    if (!fInstanceBuffer) {
+    if (!fDrawIndirectBuffer) {
         return;
     }
 
-    SkASSERT(fDrawIndirectCount);
-    SkASSERT(fTotalInstanceCount > 0);
+    SkASSERT(fChainedDrawIndirectCount > 0);
+    SkASSERT(fChainedInstanceCount > 0);
 
     flushState->bindBuffers(nullptr, fInstanceBuffer, nullptr);
-    flushState->drawIndirect(fDrawIndirectBuffer.get(), fDrawIndirectOffset, fDrawIndirectCount);
+    flushState->drawIndirect(fDrawIndirectBuffer.get(), fDrawIndirectOffset,
+                             fChainedDrawIndirectCount);
 }
diff --git a/src/gpu/tessellate/GrStrokeIndirectTessellator.h b/src/gpu/tessellate/GrStrokeIndirectTessellator.h
index e3f3012..9552ff3 100644
--- a/src/gpu/tessellate/GrStrokeIndirectTessellator.h
+++ b/src/gpu/tessellate/GrStrokeIndirectTessellator.h
@@ -22,17 +22,29 @@
     // become an issue if we try to draw a stroke with an astronomically wide width.
     constexpr static int8_t kMaxResolveLevel = 15;
 
-    GrStrokeIndirectTessellator(ShaderFlags, const SkMatrix&, const GrSTArenaList<PathStroke>&,
+    GrStrokeIndirectTessellator(ShaderFlags, const SkMatrix&, GrSTArenaList<PathStroke>&&,
                                 int totalCombinedVerbCnt, SkArenaAlloc*);
 
-    void prepare(GrMeshDrawOp::Target*, const SkMatrix&, const GrSTArenaList<PathStroke>&,
-                 int totalCombinedVerbCnt) override;
+    // Appends the given tessellator, plus any tessellators chained to it, onto our chain. Chained
+    // tessellators all append to the head's shared indirect draw list during prepare().
+    void addToChain(GrStrokeIndirectTessellator*);
+
+    void prepare(GrMeshDrawOp::Target*, const SkMatrix&) override;
 
     void draw(GrOpFlushState*) const override;
 
 private:
+    // Called during prepare(). Appends our indirect-draw commands and instance data onto the
+    // provided writers.
+    void writeBuffers(GrDrawIndirectWriter*, GrVertexWriter*, const SkMatrix&,
+                      size_t instanceStride, int baseInstance, int numExtraEdgesInJoin);
+
     int fResolveLevelCounts[kMaxResolveLevel + 1] = {0};  // # of instances at each resolve level.
     int fTotalInstanceCount = 0;  // Total number of stroke instances we will draw.
+    // Total number of indirect draw commands in the chain, or zero if we are not the chain head.
+    int fChainedDrawIndirectCount = 0;
+    // Total number of stroke instances in the entire chain, or zero if we are not the chain head.
+    int fChainedInstanceCount = 0;
 
     // This array holds a resolveLevel for each stroke in the path, stored in the iteration order of
     // GrStrokeIterator. If a stroke needs to be chopped, the array will contain a negative number
@@ -53,11 +65,14 @@
     // of additional edges to every instance.
     int fMaxNumExtraEdgesInJoin = 0;
 
+    // Chained tessellators. These all append to our shared indirect draw list during prepare().
+    GrStrokeIndirectTessellator* fNextInChain = nullptr;
+    GrStrokeIndirectTessellator** fChainTail = &fNextInChain;  // Null if we are not the chain head.
+
     // GPU buffers for drawing.
     sk_sp<const GrBuffer> fDrawIndirectBuffer;
     sk_sp<const GrBuffer> fInstanceBuffer;
     size_t fDrawIndirectOffset;
-    int fDrawIndirectCount = 0;
 
     friend class GrOp;  // For ctor.
 
diff --git a/src/gpu/tessellate/GrStrokeTessellateOp.cpp b/src/gpu/tessellate/GrStrokeTessellateOp.cpp
index 5885e10..f8fbfea 100644
--- a/src/gpu/tessellate/GrStrokeTessellateOp.cpp
+++ b/src/gpu/tessellate/GrStrokeTessellateOp.cpp
@@ -14,6 +14,8 @@
 #include "src/gpu/tessellate/GrStrokeHardwareTessellator.h"
 #include "src/gpu/tessellate/GrStrokeIndirectTessellator.h"
 
+using DynamicStroke = GrStrokeTessellateShader::DynamicStroke;
+
 GrStrokeTessellateOp::GrStrokeTessellateOp(GrAAType aaType, const SkMatrix& viewMatrix,
                                            const SkPath& path, const SkStrokeRec& stroke,
                                            GrPaint&& paint)
@@ -73,7 +75,6 @@
 
 GrOp::CombineResult GrStrokeTessellateOp::onCombineIfPossible(GrOp* grOp, SkArenaAlloc* alloc,
                                                               const GrCaps& caps) {
-    using DynamicStroke = GrStrokeTessellateShader::DynamicStroke;
     SkASSERT(grOp->classID() == this->classID());
     auto* op = static_cast<GrStrokeTessellateOp*>(grOp);
 
@@ -110,10 +111,13 @@
         }
     }
 
-    // The indirect tessellator can't combine colors because its log2 binning draws things out of
-    // order. Only enable dynamic color if we have hardware tessellation.
-    if ((combinedFlags & ShaderFlags::kDynamicColor) && !this->canUseHardwareTessellation(caps)) {
-        return CombineResult::kCannotCombine;
+    // The indirect tessellator can't combine overlapping, mismatched colors because the log2
+    // binning draws things out of order. But we can still chain them together and generate a single
+    // long list of indirect draws.
+    if ((combinedFlags & ShaderFlags::kDynamicColor) &&
+        !this->canUseHardwareTessellation(caps) &&
+        this->bounds().intersects(op->bounds())) {
+        return CombineResult::kMayChain;
     }
 
     fPathStrokeList.concat(std::move(op->fPathStrokeList), alloc);
@@ -158,12 +162,40 @@
     GrStrokeTessellateShader::Mode shaderMode;
     if (this->canUseHardwareTessellation(caps) &&
         ((fShaderFlags & ShaderFlags::kDynamicColor) || fTotalCombinedVerbCnt > 50)) {
-        fTessellator = arena->make<GrStrokeHardwareTessellator>(fShaderFlags, *caps.shaderCaps());
+        SkASSERT(!this->nextInChain());  // We never chain when hw tessellation is an option.
+        fTessellator = arena->make<GrStrokeHardwareTessellator>(fShaderFlags,
+                                                                std::move(fPathStrokeList),
+                                                                fTotalCombinedVerbCnt,
+                                                                *caps.shaderCaps());
         shaderMode = GrStrokeTessellateShader::Mode::kTessellation;
     } else {
-        fTessellator = arena->make<GrStrokeIndirectTessellator>(fShaderFlags, fViewMatrix,
-                                                                fPathStrokeList,
-                                                                fTotalCombinedVerbCnt, arena);
+        if (this->nextInChain()) {
+            // We are a chained list of indirect stroke ops. Ops only chain when their colors are
+            // mismatched and overlapping, so the chain always needs dynamic color.
+            fShaderFlags |= ShaderFlags::kDynamicColor;
+            // Collect any other shader flags in the chain.
+            const SkStrokeRec& headStroke = this->headStroke();
+            for (GrStrokeTessellateOp* op = this->nextInChain(); op; op = op->nextInChain()) {
+                fShaderFlags |= op->fShaderFlags;
+                if (!(fShaderFlags & ShaderFlags::kDynamicStroke) &&
+                    !DynamicStroke::StrokesHaveEqualDynamicState(headStroke, op->headStroke())) {
+                    fShaderFlags |= ShaderFlags::kDynamicStroke;
+                }
+            }
+        }
+        auto* headTessellator = arena->make<GrStrokeIndirectTessellator>(
+                fShaderFlags, fViewMatrix, std::move(fPathStrokeList), fTotalCombinedVerbCnt,
+                arena);
+        // Make a tessellator for every chained op after us. These will all append to the head
+        // tessellator's shared indirect-draw list during prepare().
+        for (GrStrokeTessellateOp* op = this->nextInChain(); op; op = op->nextInChain()) {
+            SkASSERT(fViewMatrix == op->fViewMatrix);
+            auto* chainedTessellator = arena->make<GrStrokeIndirectTessellator>(
+                    fShaderFlags, fViewMatrix, std::move(op->fPathStrokeList),
+                    op->fTotalCombinedVerbCnt, arena);
+            headTessellator->addToChain(chainedTessellator);
+        }
+        fTessellator = headTessellator;
         shaderMode = GrStrokeTessellateShader::Mode::kIndirect;
     }
 
@@ -219,18 +251,17 @@
                                     flushState->detachAppliedClip());
     }
     SkASSERT(fTessellator);
-    fTessellator->prepare(flushState, fViewMatrix, fPathStrokeList, fTotalCombinedVerbCnt);
+    fTessellator->prepare(flushState, fViewMatrix);
 }
 
 void GrStrokeTessellateOp::onExecute(GrOpFlushState* flushState, const SkRect& chainBounds) {
-    SkASSERT(chainBounds == this->bounds());
     if (fStencilProgram) {
-        flushState->bindPipelineAndScissorClip(*fStencilProgram, this->bounds());
+        flushState->bindPipelineAndScissorClip(*fStencilProgram, chainBounds);
         flushState->bindTextures(fStencilProgram->primProc(), nullptr, fStencilProgram->pipeline());
         fTessellator->draw(flushState);
     }
     if (fFillProgram) {
-        flushState->bindPipelineAndScissorClip(*fFillProgram, this->bounds());
+        flushState->bindPipelineAndScissorClip(*fFillProgram, chainBounds);
         flushState->bindTextures(fFillProgram->primProc(), nullptr, fFillProgram->pipeline());
         fTessellator->draw(flushState);
     }
diff --git a/src/gpu/tessellate/GrStrokeTessellateOp.h b/src/gpu/tessellate/GrStrokeTessellateOp.h
index 42b8d45..819d69e 100644
--- a/src/gpu/tessellate/GrStrokeTessellateOp.h
+++ b/src/gpu/tessellate/GrStrokeTessellateOp.h
@@ -21,8 +21,6 @@
 public:
     using ShaderFlags = GrStrokeTessellateShader::ShaderFlags;
 
-    GrStrokeTessellator(ShaderFlags shaderFlags) : fShaderFlags(shaderFlags) {}
-
     struct PathStroke {
         PathStroke(const SkPath& path, const SkStrokeRec& stroke, const SkPMColor4f& color)
                 : fPath(path), fStroke(stroke), fColor(color) {}
@@ -31,9 +29,11 @@
         SkPMColor4f fColor;
     };
 
+    GrStrokeTessellator(ShaderFlags shaderFlags, GrSTArenaList<PathStroke>&& pathStrokeList)
+            : fShaderFlags(shaderFlags), fPathStrokeList(std::move(pathStrokeList)) {}
+
     // Called before draw(). Prepares GPU buffers containing the geometry to tessellate.
-    virtual void prepare(GrMeshDrawOp::Target*, const SkMatrix&, const GrSTArenaList<PathStroke>&,
-                         int totalCombinedVerbCnt) = 0;
+    virtual void prepare(GrMeshDrawOp::Target*, const SkMatrix&) = 0;
 
     // Issues draw calls for the tessellated stroke. The caller is responsible for binding its
     // desired pipeline ahead of time.
@@ -43,6 +43,7 @@
 
 protected:
     const ShaderFlags fShaderFlags;
+    const GrSTArenaList<PathStroke> fPathStrokeList;
 };
 
 // Renders strokes by linearizing them into sorted "parametric" and "radial" edges. See
@@ -58,6 +59,9 @@
 
     SkStrokeRec& headStroke() { return fPathStrokeList.head().fStroke; }
     SkPMColor4f& headColor() { return fPathStrokeList.head().fColor; }
+    GrStrokeTessellateOp* nextInChain() const {
+        return static_cast<GrStrokeTessellateOp*>(this->GrDrawOp::nextInChain());
+    }
 
     // Returns whether it is a good tradeoff to use the dynamic states flagged in the given
     // bitfield. Dynamic states improve batching, but if they aren't already enabled, they come at
diff --git a/src/gpu/tessellate/GrStrokeTessellateShader.cpp b/src/gpu/tessellate/GrStrokeTessellateShader.cpp
index 73d81d4..81ab565 100644
--- a/src/gpu/tessellate/GrStrokeTessellateShader.cpp
+++ b/src/gpu/tessellate/GrStrokeTessellateShader.cpp
@@ -1230,11 +1230,18 @@
             gpArgs->fLocalCoordVar.set(kFloat2_GrSLType, "localCoord");
         }
 
-        // The fragment shader just outputs a uniform color.
-        const char* colorUniformName;
-        fColorUniform = args.fUniformHandler->addUniform(
-                nullptr, kFragment_GrShaderFlag, kHalf4_GrSLType, "color", &colorUniformName);
-        args.fFragBuilder->codeAppendf("%s = %s;", args.fOutputColor, colorUniformName);
+        if (!shader.hasDynamicColor()) {
+            // The fragment shader just outputs a uniform color.
+            const char* colorUniformName;
+            fColorUniform = args.fUniformHandler->addUniform(
+                    nullptr, kFragment_GrShaderFlag, kHalf4_GrSLType, "color", &colorUniformName);
+            args.fFragBuilder->codeAppendf("%s = %s;", args.fOutputColor, colorUniformName);
+        } else {
+            // Color gets passed in through an instance attrib.
+            args.fVaryingHandler->addPassThroughAttribute(
+                    shader.fAttribs.back(), args.fOutputColor,
+                    GrGLSLVaryingHandler::Interpolation::kCanBeFlat);
+        }
         args.fFragBuilder->codeAppendf("%s = half4(1);", args.fOutputCoverage);
     }
 
@@ -1273,7 +1280,9 @@
                         m.getScaleY());
         }
 
-        pdman.set4fv(fColorUniform, 1, shader.fColor.vec());
+        if (!shader.hasDynamicColor()) {
+            pdman.set4fv(fColorUniform, 1, shader.fColor.vec());
+        }
     }
 
     GrGLSLUniformHandler::UniformHandle fTessControlArgsUniform;
diff --git a/tests/StrokeIndirectTest.cpp b/tests/StrokeIndirectTest.cpp
index 39135a0..b920b77 100644
--- a/tests/StrokeIndirectTest.cpp
+++ b/tests/StrokeIndirectTest.cpp
@@ -45,8 +45,7 @@
                                                     matrix, {path, stroke, SK_PMColor4fWHITE},
                                                     path.countVerbs(), target->allocator());
             tessellator.verifyResolveLevels(r, target, matrix, path, stroke);
-            tessellator.prepare(target, matrix, {path, stroke, SK_PMColor4fWHITE},
-                                path.countVerbs());
+            tessellator.prepare(target, matrix);
             tessellator.verifyBuffers(r, target, matrix, stroke);
         }
     }
@@ -441,7 +440,7 @@
     auto* indirect = static_cast<const GrDrawIndirectCommand*>(target->peekStaticIndirectData());
     GrStrokeTessellateShader::Tolerances tolerances(viewMatrix.getMaxScale(), stroke.getWidth());
     float tolerance = test_tolerance(stroke.getJoin());
-    for (int i = 0; i < fDrawIndirectCount; ++i) {
+    for (int i = 0; i < fChainedDrawIndirectCount; ++i) {
         int numExtraEdgesInJoin = (stroke.getJoin() == SkPaint::kMiter_Join) ? 4 : 3;
         int numStrokeEdges = indirect->fVertexCount/2 - numExtraEdgesInJoin;
         int numSegments = numStrokeEdges - 1;