Use SIMD to update hw tessellation tolerances

Bug: chromium:1172543
Change-Id: I223566197d1f2fd5fea07302f48ab89f50a36187
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/374840
Reviewed-by: John Stiles <johnstiles@google.com>
diff --git a/bench/TessellateBench.cpp b/bench/TessellateBench.cpp
index 0737aa7..c0cb7dd 100644
--- a/bench/TessellateBench.cpp
+++ b/bench/TessellateBench.cpp
@@ -271,9 +271,10 @@
 
 class GrStrokeHardwareTessellator::TestingOnly_Benchmark : public Benchmark {
 public:
-    TestingOnly_Benchmark(MakePathStrokesFn MakePathStrokesFn, float matrixScale,
-                          const char* suffix)
+    TestingOnly_Benchmark(MakePathStrokesFn MakePathStrokesFn, ShaderFlags shaderFlags,
+                          float matrixScale, const char* suffix)
             : fMakePathStrokesFn(MakePathStrokesFn)
+            , fShaderFlags(shaderFlags)
             , fMatrixScale(matrixScale) {
         fName.printf("tessellate_GrStrokeHardwareTessellator_prepare%s", suffix);
     }
@@ -301,7 +302,7 @@
     void onDraw(int loops, SkCanvas*) final {
         SkMatrix matrix = SkMatrix::Scale(fMatrixScale, fMatrixScale);
         for (int i = 0; i < loops; ++i) {
-            GrStrokeHardwareTessellator tessellator(ShaderFlags::kNone, fPathStrokes.data(),
+            GrStrokeHardwareTessellator tessellator(fShaderFlags, fPathStrokes.data(),
                                                     fTotalVerbCount, *fTarget->caps().shaderCaps());
             tessellator.prepare(fTarget.get(), matrix);
             fTarget->resetAllocator();
@@ -310,6 +311,7 @@
 
     SkString fName;
     MakePathStrokesFn fMakePathStrokesFn;
+    const ShaderFlags fShaderFlags;
     float fMatrixScale;
     std::unique_ptr<GrMockOpTarget> fTarget;
     std::vector<PathStrokeList> fPathStrokes;
@@ -317,12 +319,22 @@
     int fTotalVerbCount = 0;
 };
 
-DEF_BENCH( return new GrStrokeHardwareTessellator::TestingOnly_Benchmark(make_simple_cubic_path, 1,
-                                                                         ""); )
-DEF_BENCH( return new GrStrokeHardwareTessellator::TestingOnly_Benchmark(make_simple_cubic_path, 5,
-                                                                         "_one_chop"); )
-DEF_BENCH( return new GrStrokeHardwareTessellator::TestingOnly_Benchmark(make_motionmark_paths, 1,
-                                                                         "_motionmark"); )
+DEF_BENCH(
+    return new GrStrokeHardwareTessellator::TestingOnly_Benchmark(make_simple_cubic_path,
+                                                                  ShaderFlags::kNone, 1, "");
+)
+
+DEF_BENCH(
+    return new GrStrokeHardwareTessellator::TestingOnly_Benchmark(make_simple_cubic_path,
+                                                                  ShaderFlags::kNone, 5,
+                                                                  "_one_chop");
+)
+
+DEF_BENCH(
+    return new GrStrokeHardwareTessellator::TestingOnly_Benchmark(make_motionmark_paths,
+                                                                  ShaderFlags::kDynamicStroke, 1,
+                                                                  "_motionmark");
+)
 
 class GrStrokeIndirectTessellator::Benchmark : public ::Benchmark {
 protected:
diff --git a/src/gpu/GrVx.h b/src/gpu/GrVx.h
index 560f15a..7323607 100644
--- a/src/gpu/GrVx.h
+++ b/src/gpu/GrVx.h
@@ -66,7 +66,7 @@
 //
 // NOTE: This function deviates immediately from pi and 0 outside -1 and 1. (The derivatives are
 // infinite at -1 and 1). So the input must still be clamped between -1 and 1.
-#define GRVX_FAST_ACOS_MAX_ERROR SkDegreesToRadians(.96f)
+#define GRVX_APPROX_ACOS_MAX_ERROR SkDegreesToRadians(.96f)
 template<int N> SK_ALWAYS_INLINE vec<N> approx_acos(vec<N> x) {
     constexpr static float a = -0.939115566365855f;
     constexpr static float b =  0.9217841528914573f;
diff --git a/src/gpu/tessellate/GrStrokeHardwareTessellator.cpp b/src/gpu/tessellate/GrStrokeHardwareTessellator.cpp
index 79d4592..83dddfa 100644
--- a/src/gpu/tessellate/GrStrokeHardwareTessellator.cpp
+++ b/src/gpu/tessellate/GrStrokeHardwareTessellator.cpp
@@ -9,6 +9,7 @@
 
 #include "src/core/SkPathPriv.h"
 #include "src/gpu/GrRecordingContextPriv.h"
+#include "src/gpu/GrVx.h"
 #include "src/gpu/geometry/GrPathUtils.h"
 #include "src/gpu/tessellate/GrWangsFormula.h"
 
@@ -35,14 +36,8 @@
     return numParametricSegments + numRadialSegments - 1;
 }
 
-static float num_parametric_segments(float numCombinedSegments, float numRadialSegments) {
-    // numCombinedSegments = numParametricSegments + numRadialSegments - 1.
-    // (See num_combined_segments()).
-    return std::max(numCombinedSegments + 1 - numRadialSegments, 0.f);
-}
-
-static float pow4(float x) {
-    float xx = x*x;
+static grvx::float2 pow4(grvx::float2 x) {
+    auto xx = x*x;
     return xx*xx;
 }
 
@@ -58,7 +53,7 @@
         kBowtie = SkPaint::kLast_Join + 1  // Double sided round join.
     };
 
-    PatchWriter(ShaderFlags shaderFlags, GrMeshDrawOp::Target* target,
+    PatchWriter(ShaderFlags shaderFlags, GrMeshDrawOp::Target* target, float matrixMaxScale,
                 SkTArray<PatchChunk>* patchChunks, int totalCombinedVerbCnt)
             : fShaderFlags(shaderFlags)
             , fTarget(target)
@@ -66,7 +61,8 @@
             , fPatchStride(GrStrokeTessellateShader::PatchStride(fShaderFlags))
             // Subtract 2 because the tessellation shader chops every cubic at two locations, and
             // each chop has the potential to introduce an extra segment.
-            , fMaxTessellationSegments(target->caps().shaderCaps()->maxTessellationSegments() - 2) {
+            , fMaxTessellationSegments(target->caps().shaderCaps()->maxTessellationSegments() - 2)
+            , fParametricIntolerance(Tolerances::CalcParametricIntolerance(matrixMaxScale)) {
         // Pre-allocate at least enough vertex space for 1 in 4 strokes to chop, and for 8 caps.
         int strokePreallocCount = totalCombinedVerbCnt * 5/4;
         int capPreallocCount = 8;
@@ -83,7 +79,7 @@
     // This is the intolerance value, adjusted for the view matrix, to use with Wang's formulas when
     // determining how many parametric segments a curve will require.
     float parametricIntolerance() const {
-        return fTolerances.fParametricIntolerance;
+        return fParametricIntolerance;
     }
     // Will a line and worst-case previous join both fit in a single patch together?
     bool lineFitsInPatch_withJoin() {
@@ -92,65 +88,61 @@
     // Will a stroke with the given number of parametric segments and a worst-case rotation of 180
     // degrees fit in a single patch?
     bool stroke180FitsInPatch(float numParametricSegments_pow4) {
-        return numParametricSegments_pow4 <= fMaxParametricSegments180_pow4;
+        return numParametricSegments_pow4 <= fMaxParametricSegments_pow4[0];
     }
     // Will a worst-case 180-degree stroke with the given number of parametric segments, and a
     // worst-case join fit in a single patch together?
     bool stroke180FitsInPatch_withJoin(float numParametricSegments_pow4) {
-        return numParametricSegments_pow4 <= fMaxParametricSegments180_pow4_withJoin;
+        return numParametricSegments_pow4 <= fMaxParametricSegments_pow4_withJoin[0];
     }
     // Will a stroke with the given number of parametric segments and a worst-case rotation of 360
     // degrees fit in a single patch?
     bool stroke360FitsInPatch(float numParametricSegments_pow4) {
-        return numParametricSegments_pow4 <= fMaxParametricSegments360_pow4;
+        return numParametricSegments_pow4 <= fMaxParametricSegments_pow4[1];
     }
     // Will a worst-case 360-degree stroke with the given number of parametric segments, and a
     // worst-case join fit in a single patch together?
     bool stroke360FitsInPatch_withJoin(float numParametricSegments_pow4) {
-        return numParametricSegments_pow4 <= fMaxParametricSegments360_pow4_withJoin;
+        return numParametricSegments_pow4 <= fMaxParametricSegments_pow4_withJoin[1];
     }
 
-    void updateTolerances(Tolerances tolerances, SkPaint::Join joinType) {
+    void updateTolerances(float numRadialSegmentsPerRadian, SkPaint::Join joinType) {
+        using grvx::float2;
+
+        fNumRadialSegmentsPerRadian = numRadialSegmentsPerRadian;
+
         // Calculate the worst-case numbers of parametric segments our hardware can support for the
         // current stroke radius, in the event that there are also enough radial segments to rotate
         // 180 and 360 degrees respectively. These are used for "quick accepts" that allow us to
         // send almost all curves directly to the hardware without having to chop.
-        float numRadialSegments180 = std::max(std::ceil(
-                SK_ScalarPI * tolerances.fNumRadialSegmentsPerRadian), 1.f);
-        float maxParametricSegments180 = num_parametric_segments(fMaxTessellationSegments,
-                                                                 numRadialSegments180);
-        fMaxParametricSegments180_pow4 = pow4(maxParametricSegments180);
+        float2 numRadialSegments_180_360 = skvx::max(skvx::ceil(
+                float2{SK_ScalarPI, 2*SK_ScalarPI} * fNumRadialSegmentsPerRadian), 1);
+        // numEdges = numSegments + 1. See num_combined_segments().
+        float maxTotalEdges = fMaxTessellationSegments + 1;
+        // numParametricSegments = numTotalEdges - numRadialSegments. See num_combined_segments().
+        float2 maxParametricSegments = skvx::max(maxTotalEdges - numRadialSegments_180_360, 0);
+        float2 maxParametricSegments_pow4 = pow4(maxParametricSegments);
+        maxParametricSegments_pow4.store(fMaxParametricSegments_pow4);
 
-        float numRadialSegments360 = std::max(std::ceil(
-                2*SK_ScalarPI * tolerances.fNumRadialSegmentsPerRadian), 1.f);
-        float maxParametricSegments360 = num_parametric_segments(fMaxTessellationSegments,
-                                                                 numRadialSegments360);
-        fMaxParametricSegments360_pow4 = pow4(maxParametricSegments360);
-
-        // Now calculate the worst-case numbers of parametric segments if we are to integrate a join
-        // into the same patch as the curve.
-        float maxNumSegmentsInJoin;
+        // Find the worst-case number of segments a join can add if we are to integrate one into the
+        // same patch as the curve.
+        float numRadialSegments180 = numRadialSegments_180_360[0];
+        float worstCaseNumSegmentsInJoin;
         switch (joinType) {
-            case SkPaint::kBevel_Join:
-                maxNumSegmentsInJoin = 1;
-                break;
-            case SkPaint::kMiter_Join:
-                maxNumSegmentsInJoin = 2;
-                break;
-            case SkPaint::kRound_Join:
-                // 180-degree round join.
-                maxNumSegmentsInJoin = numRadialSegments180;
-                break;
+            case SkPaint::kBevel_Join: worstCaseNumSegmentsInJoin = 1; break;
+            case SkPaint::kMiter_Join: worstCaseNumSegmentsInJoin = 2; break;
+            case SkPaint::kRound_Join: worstCaseNumSegmentsInJoin = numRadialSegments180; break;
         }
-        // Subtract an extra 1 off the end because when we integrate a join, the tessellator has to
-        // add a redundant edge between the join and curve.
-        fMaxParametricSegments180_pow4_withJoin = pow4(std::max(
-                maxParametricSegments180 - maxNumSegmentsInJoin - 1, 0.f));
-        fMaxParametricSegments360_pow4_withJoin = pow4(std::max(
-                maxParametricSegments360 - maxNumSegmentsInJoin - 1, 0.f));
-        fMaxCombinedSegments_withJoin = fMaxTessellationSegments - maxNumSegmentsInJoin - 1;
+
+        // Now calculate the worst-case numbers of parametric segments if we also want to combine a
+        // join with the patch. Subtract an extra 1 off the end because when we integrate a join,
+        // the tessellator has to add a redundant edge between the join and curve.
+        float2 maxParametricSegments_pow4_withJoin = pow4(skvx::max(
+                maxParametricSegments - worstCaseNumSegmentsInJoin - 1, 0));
+        maxParametricSegments_pow4_withJoin.store(fMaxParametricSegments_pow4_withJoin);
+
+        fMaxCombinedSegments_withJoin = fMaxTessellationSegments - worstCaseNumSegmentsInJoin - 1;
         fSoloRoundJoinAlwaysFitsInPatch = (numRadialSegments180 <= fMaxTessellationSegments);
-        fTolerances = tolerances;
         fStrokeJoinType = JoinType(joinType);
     }
 
@@ -389,7 +381,7 @@
         }
 
         float numParametricSegments_pow4 =
-                GrWangsFormula::quadratic_pow4(fTolerances.fParametricIntolerance, p);
+                GrWangsFormula::quadratic_pow4(fParametricIntolerance, p);
         if (this->stroke180FitsInPatch(numParametricSegments_pow4) || maxDepth == 0) {
             this->internalPatchTo(prevJoinType,
                                   this->stroke180FitsInPatch_withJoin(numParametricSegments_pow4),
@@ -399,8 +391,7 @@
 
         // We still might have enough tessellation segments to render the curve. Check again with
         // the actual rotation.
-        float numRadialSegments =
-                SkMeasureQuadRotation(p) * fTolerances.fNumRadialSegmentsPerRadian;
+        float numRadialSegments = SkMeasureQuadRotation(p) * fNumRadialSegmentsPerRadian;
         numRadialSegments = std::max(std::ceil(numRadialSegments), 1.f);
         float numParametricSegments = GrWangsFormula::root4(numParametricSegments_pow4);
         numParametricSegments = std::max(std::ceil(numParametricSegments), 1.f);
@@ -454,8 +445,7 @@
             return;
         }
 
-        float numParametricSegments_pow4 =
-                GrWangsFormula::cubic_pow4(fTolerances.fParametricIntolerance, p);
+        float numParametricSegments_pow4 = GrWangsFormula::cubic_pow4(fParametricIntolerance, p);
         if (this->stroke180FitsInPatch(numParametricSegments_pow4) || maxDepth == 0) {
             this->internalPatchTo(prevJoinType,
                                   this->stroke180FitsInPatch_withJoin(numParametricSegments_pow4),
@@ -465,8 +455,7 @@
 
         // We still might have enough tessellation segments to render the curve. Check again with
         // its actual rotation.
-        float numRadialSegments =
-                SkMeasureNonInflectCubicRotation(p) * fTolerances.fNumRadialSegmentsPerRadian;
+        float numRadialSegments = SkMeasureNonInflectCubicRotation(p) * fNumRadialSegmentsPerRadian;
         numRadialSegments = std::max(std::ceil(numRadialSegments), 1.f);
         float numParametricSegments = GrWangsFormula::root4(numParametricSegments_pow4);
         numParametricSegments = std::max(std::ceil(numParametricSegments), 1.f);
@@ -553,7 +542,7 @@
             SkVector tan0 = junctionPoint - fLastControlPoint;
             SkVector tan1 = nextControlPoint - junctionPoint;
             float rotation = SkMeasureAngleBetweenVectors(tan0, tan1);
-            float numRadialSegments = rotation * fTolerances.fNumRadialSegmentsPerRadian;
+            float numRadialSegments = rotation * fNumRadialSegmentsPerRadian;
             if (numRadialSegments > fMaxTessellationSegments) {
                 // This is a round join that requires more segments than the tessellator supports.
                 // Split it and recurse.
@@ -659,19 +648,29 @@
     // The maximum number of tessellation segments the hardware can emit for a single patch.
     const int fMaxTessellationSegments;
 
-    // These values contain worst-case numbers of parametric segments, raised to the 4th power, that
+    // This is the intolerance value, adjusted for the view matrix, to use with Wang's formulas when
+    // determining how many parametric segments a curve will require.
+    const float fParametricIntolerance;
+
+    // Number of radial segments required for each radian of rotation in order to look smooth with
+    // the current stroke radius.
+    float fNumRadialSegmentsPerRadian;
+
+    // These arrays contain worst-case numbers of parametric segments, raised to the 4th power, that
     // our hardware can support for the current stroke radius. They assume curve rotations of 180
     // and 360 degrees respectively. These are used for "quick accepts" that allow us to send almost
     // all curves directly to the hardware without having to chop. We raise to the 4th power because
     // the "pow4" variants of Wang's formula are the quickest to evaluate.
-    GrStrokeTessellateShader::Tolerances fTolerances;
-    JoinType fStrokeJoinType;
-    float fMaxParametricSegments180_pow4;
-    float fMaxParametricSegments360_pow4;
-    float fMaxParametricSegments180_pow4_withJoin;
-    float fMaxParametricSegments360_pow4_withJoin;
+    float fMaxParametricSegments_pow4[2];  // Values for strokes that rotate 180 and 360 degrees.
+    float fMaxParametricSegments_pow4_withJoin[2];  // For strokes that rotate 180 and 360 degrees.
+
+    // Maximum number of segments we can allocate for a stroke if we are stuffing it in a patch
+    // together with a worst-case join.
     float fMaxCombinedSegments_withJoin;
+
+    // Additional info on the current stroke radius/join type.
     bool fSoloRoundJoinAlwaysFitsInPatch;
+    JoinType fStrokeJoinType;
 
     // Variables related to the patch chunk that we are currently writing out during prepareBuffers.
     int fCurrChunkPatchCount = 0;
@@ -691,7 +690,42 @@
     GrVertexColor fDynamicColor;
 };
 
-}  // namespace
+// Calculates and buffers up future values for "numRadialSegmentsPerRadian" using SIMD.
+class alignas(sizeof(grvx::float4)) RadialSegmentsPerRadianBuffer {
+public:
+    using PathStrokeList = GrStrokeTessellator::PathStrokeList;
+
+    RadialSegmentsPerRadianBuffer(float parametricIntolerance)
+            : fParametricIntolerance(parametricIntolerance) {
+    }
+
+    float fetchNext(PathStrokeList* head) {
+        // GrStrokeTessellateOp::onCombineIfPossible does not allow hairlines to become dynamic. If
+        // this changes, we will need to call Tolerances::GetLocalStrokeWidth() for each stroke.
+        SkASSERT(!head->fStroke.isHairlineStyle());
+        if (fBufferIdx == 4) {
+            // We ran out of values. Peek ahead and buffer up 4 more.
+            PathStrokeList* peekAhead = head;
+            int i = 0;
+            do {
+                fStrokeWidths[i++] = peekAhead->fStroke.getWidth();
+            } while ((peekAhead = peekAhead->fNext) && i < 4);
+            Tolerances::ApproxNumRadialSegmentsPerRadian(fParametricIntolerance,
+                                                         fStrokeWidths).store(
+                    fNumRadialSegmentsPerRadian);
+            fBufferIdx = 0;
+        }
+        SkASSERT(0 <= fBufferIdx && fBufferIdx < 4);
+        SkASSERT(fStrokeWidths[fBufferIdx] == head->fStroke.getWidth());
+        return fNumRadialSegmentsPerRadian[fBufferIdx++];
+    }
+
+private:
+    grvx::float4 fStrokeWidths{};  // Must be first for alignment purposes.
+    float fNumRadialSegmentsPerRadian[4];
+    const float fParametricIntolerance;
+    int fBufferIdx = 4;  // Initialize the buffer as "empty".
+};
 
 SK_ALWAYS_INLINE static bool conic_has_cusp(const SkPoint p[3]) {
     SkVector a = p[1] - p[0];
@@ -736,27 +770,40 @@
            (!(skvx::all(p0 == p1) || skvx::all(p2 == p3)) || (a == 0 && b == 0 && c == 0));
 }
 
+}  // namespace
+
 void GrStrokeHardwareTessellator::prepare(GrMeshDrawOp::Target* target,
                                           const SkMatrix& viewMatrix) {
     using JoinType = PatchWriter::JoinType;
 
-    std::array<float, 2> matrixScales;
-    if (!viewMatrix.getMinMaxScales(matrixScales.data())) {
-        matrixScales.fill(1);
+    std::array<float, 2> matrixMinMaxScales;
+    if (!viewMatrix.getMinMaxScales(matrixMinMaxScales.data())) {
+        matrixMinMaxScales.fill(1);
     }
 
-    PatchWriter patchWriter(fShaderFlags, target, &fPatchChunks, fTotalCombinedVerbCnt);
-    const SkStrokeRec* strokeForTolerances = nullptr;
+    PatchWriter patchWriter(fShaderFlags, target, matrixMinMaxScales[1], &fPatchChunks,
+                            fTotalCombinedVerbCnt);
+    if (!(fShaderFlags & ShaderFlags::kDynamicStroke)) {
+        // Strokes are static. Calculate tolerances once.
+        const SkStrokeRec& stroke = fPathStrokeList->fStroke;
+        float localStrokeWidth = Tolerances::GetLocalStrokeWidth(matrixMinMaxScales.data(),
+                                                                 stroke.getWidth());
+        float numRadialSegmentsPerRadian = Tolerances::CalcNumRadialSegmentsPerRadian(
+                patchWriter.parametricIntolerance(), localStrokeWidth);
+        patchWriter.updateTolerances(numRadialSegmentsPerRadian, stroke.getJoin());
+    }
+
+    // Fast SIMD queue that buffers up values for "numRadialSegmentsPerRadian". Only used when we
+    // have dynamic strokes.
+    RadialSegmentsPerRadianBuffer radialSegmentsPerRadianBuffer(
+            patchWriter.parametricIntolerance());
 
     for (PathStrokeList* pathStroke = fPathStrokeList; pathStroke; pathStroke = pathStroke->fNext) {
         const SkStrokeRec& stroke = pathStroke->fStroke;
-        if (!strokeForTolerances || strokeForTolerances->getWidth() != stroke.getWidth() ||
-            strokeForTolerances->getCap() != stroke.getCap()) {
-            auto tolerances = Tolerances::MakePreTransform(matrixScales.data(), stroke.getWidth());
-            patchWriter.updateTolerances(tolerances, stroke.getJoin());
-            strokeForTolerances = &stroke;
-        }
         if (fShaderFlags & ShaderFlags::kDynamicStroke) {
+            // Strokes are dynamic. Update tolerances with every new stroke.
+            patchWriter.updateTolerances(radialSegmentsPerRadianBuffer.fetchNext(pathStroke),
+                                         stroke.getJoin());
             patchWriter.updateDynamicStroke(stroke);
         }
         if (fShaderFlags & ShaderFlags::kDynamicColor) {
diff --git a/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp b/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
index 3e44604..4dd5c2f 100644
--- a/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
+++ b/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
@@ -75,8 +75,8 @@
 
     void updateTolerances(float strokeWidth, bool isRoundJoin) {
         this->flush();
-        fTolerances = GrStrokeTessellateShader::Tolerances::MakePreTransform(
-                fMatrixMinMaxScales.data(), strokeWidth);
+        fTolerances = GrStrokeTessellateShader::Tolerances::Make(fMatrixMinMaxScales.data(),
+                                                                 strokeWidth);
         fResolveLevelForCircles = SkTPin<float>(
                 sk_float_nextlog2(fTolerances.fNumRadialSegmentsPerRadian * SK_ScalarPI),
                 1, kMaxResolveLevel);
diff --git a/src/gpu/tessellate/GrStrokeTessellateOp.cpp b/src/gpu/tessellate/GrStrokeTessellateOp.cpp
index a278b0b..16db49b 100644
--- a/src/gpu/tessellate/GrStrokeTessellateOp.cpp
+++ b/src/gpu/tessellate/GrStrokeTessellateOp.cpp
@@ -92,6 +92,9 @@
         !DynamicStroke::StrokesHaveEqualDynamicState(this->headStroke(), op->headStroke())) {
         // The paths have different stroke properties. We will need to enable dynamic stroke if we
         // still decide to combine them.
+        if (this->headStroke().isHairlineStyle()) {
+            return CombineResult::kCannotCombine;  // Dynamic hairlines aren't supported.
+        }
         combinedFlags |= ShaderFlags::kDynamicStroke;
     }
     if (!(combinedFlags & ShaderFlags::kDynamicColor) && this->headColor() != op->headColor()) {
diff --git a/src/gpu/tessellate/GrStrokeTessellateShader.cpp b/src/gpu/tessellate/GrStrokeTessellateShader.cpp
index e147714..866313b 100644
--- a/src/gpu/tessellate/GrStrokeTessellateShader.cpp
+++ b/src/gpu/tessellate/GrStrokeTessellateShader.cpp
@@ -421,11 +421,12 @@
         if (!shader.hasDynamicStroke()) {
             Tolerances tolerances;
             if (!stroke.isHairlineStyle()) {
-                tolerances.set(shader.viewMatrix().getMaxScale(), stroke.getWidth());
+                tolerances = Tolerances::MakeNonHairline(shader.viewMatrix().getMaxScale(),
+                                                         stroke.getWidth());
             } else {
                 // In the hairline case we transform prior to tessellation. Set up tolerances for an
                 // identity viewMatrix and a strokeWidth of 1.
-                tolerances.set(1, 1);
+                tolerances = Tolerances::MakeNonHairline(1, 1);
             }
             float strokeRadius = (stroke.isHairlineStyle()) ? .5f : stroke.getWidth() * .5;
             pdman.set4f(fTessArgsUniform,
@@ -1262,11 +1263,12 @@
             // Set up the tessellation control uniforms.
             Tolerances tolerances;
             if (!stroke.isHairlineStyle()) {
-                tolerances.set(shader.viewMatrix().getMaxScale(), stroke.getWidth());
+                tolerances = Tolerances::MakeNonHairline(shader.viewMatrix().getMaxScale(),
+                                                         stroke.getWidth());
             } else {
                 // In the hairline case we transform prior to tessellation. Set up tolerances for an
                 // identity viewMatrix and a strokeWidth of 1.
-                tolerances.set(1, 1);
+                tolerances = Tolerances::MakeNonHairline(1, 1);
             }
             float strokeRadius = (stroke.isHairlineStyle()) ? .5f : stroke.getWidth() * .5;
             pdman.set4f(fTessControlArgsUniform,
diff --git a/src/gpu/tessellate/GrStrokeTessellateShader.h b/src/gpu/tessellate/GrStrokeTessellateShader.h
index d491d1d..da125bf 100644
--- a/src/gpu/tessellate/GrStrokeTessellateShader.h
+++ b/src/gpu/tessellate/GrStrokeTessellateShader.h
@@ -11,6 +11,7 @@
 #include "src/gpu/tessellate/GrPathShader.h"
 
 #include "include/core/SkStrokeRec.h"
+#include "src/gpu/GrVx.h"
 #include "src/gpu/tessellate/GrTessellationPathRenderer.h"
 #include <array>
 
@@ -69,17 +70,35 @@
     // These tolerances decide the number of parametric and radial segments the tessellator will
     // linearize curves into. These decisions are made in (pre-viewMatrix) local path space.
     struct Tolerances {
-        // See fParametricIntolerance.
+        // Decides the number of parametric segments the tessellator adds for each curve. (Uniform
+        // steps in parametric space.) The tessellator will add enough parametric segments so that,
+        // once transformed into device space, they never deviate by more than
+        // 1/GrTessellationPathRenderer::kLinearizationIntolerance pixels from the true curve.
         constexpr static float CalcParametricIntolerance(float matrixMaxScale) {
             return matrixMaxScale * GrTessellationPathRenderer::kLinearizationIntolerance;
         }
-        // Returns the equivalent tolerances in (pre-viewMatrix) local path space that the
-        // tessellator will use when rendering this stroke.
-        static Tolerances MakePreTransform(const float matrixMinMaxScales[2], float strokeWidth) {
-            float matrixMaxScale = matrixMinMaxScales[1];
+        // Decides the number of radial segments the tessellator adds for each curve. (Uniform steps
+        // in tangent angle.) The tessellator will add this number of radial segments for each
+        // radian of rotation in local path space.
+        static float CalcNumRadialSegmentsPerRadian(float parametricIntolerance,
+                                                    float strokeWidth) {
+            return .5f / acosf(std::max(1 - 2 / (parametricIntolerance * strokeWidth), -1.f));
+        }
+        template<int N> static grvx::vec<N> ApproxNumRadialSegmentsPerRadian(
+                float parametricIntolerance, grvx::vec<N> strokeWidths) {
+            grvx::vec<N> cosTheta = skvx::max(1 - 2 / (parametricIntolerance * strokeWidths), -1);
+            // Subtract GRVX_APPROX_ACOS_MAX_ERROR so we never account for too few segments.
+            return .5f / (grvx::approx_acos(cosTheta) - GRVX_APPROX_ACOS_MAX_ERROR);
+        }
+        // Returns the equivalent stroke width in (pre-viewMatrix) local path space that the
+        // tessellator will use when rendering this stroke. This only differs from the actual stroke
+        // width for hairlines.
+        static float GetLocalStrokeWidth(const float matrixMinMaxScales[2], float strokeWidth) {
+            SkASSERT(strokeWidth >= 0);
             float localStrokeWidth = strokeWidth;
-            if (localStrokeWidth == 0) {
+            if (localStrokeWidth == 0) {  // Is the stroke a hairline?
                 float matrixMinScale = matrixMinMaxScales[0];
+                float matrixMaxScale = matrixMinMaxScales[1];
                 // If the stroke is hairline then the tessellator will operate in post-transform
                 // space instead. But for the sake of CPU methods that need to conservatively
                 // approximate the number of segments to emit, we use
@@ -90,26 +109,25 @@
                 // of segments to emit.)
                 approxScale = std::max(matrixMinScale, matrixMaxScale * .25f);
                 localStrokeWidth = 1/approxScale;
+                if (localStrokeWidth == 0) {
+                    // This method must never return zero, because zero means "hairline". Use a
+                    // tiny nonzero width instead. Otherwise return whatever we calculated above.
+                    localStrokeWidth = SK_ScalarNearlyZero;
+                }
             }
-            return GrStrokeTessellateShader::Tolerances(matrixMaxScale, localStrokeWidth);
+            return localStrokeWidth;
         }
-        Tolerances() = default;
-        Tolerances(float matrixMaxScale, float strokeWidth) {
-            this->set(matrixMaxScale, strokeWidth);
+        static Tolerances Make(const float matrixMinMaxScales[2], float strokeWidth) {
+            return MakeNonHairline(matrixMinMaxScales[1],
+                                   GetLocalStrokeWidth(matrixMinMaxScales, strokeWidth));
         }
-        void set(float matrixMaxScale, float strokeWidth) {
-            fParametricIntolerance = CalcParametricIntolerance(matrixMaxScale);
-            fNumRadialSegmentsPerRadian =
-                    .5f / acosf(std::max(1 - 2/(fParametricIntolerance * strokeWidth), -1.f));
+        static Tolerances MakeNonHairline(float matrixMaxScale, float strokeWidth) {
+            SkASSERT(strokeWidth > 0);
+            float parametricIntolerance = CalcParametricIntolerance(matrixMaxScale);
+            return {parametricIntolerance,
+                    CalcNumRadialSegmentsPerRadian(parametricIntolerance, strokeWidth)};
         }
-        // Decides the number of parametric segments the tessellator adds for each curve. (Uniform
-        // steps in parametric space.) The tessellator will add enough parametric segments so that,
-        // once transformed into device space, they never deviate by more than
-        // 1/GrTessellationPathRenderer::kLinearizationIntolerance pixels from the true curve.
         float fParametricIntolerance;
-        // Decides the number of radial segments the tessellator adds for each curve. (Uniform steps
-        // in tangent angle.) The tessellator will add this number of radial segments for each
-        // radian of rotation in local path space.
         float fNumRadialSegmentsPerRadian;
     };
 
diff --git a/tests/GrVxTest.cpp b/tests/GrVxTest.cpp
index f7e05a9..829fcef 100644
--- a/tests/GrVxTest.cpp
+++ b/tests/GrVxTest.cpp
@@ -43,7 +43,7 @@
 static bool check_approx_acos(skiatest::Reporter* r, float x, float approx_acos_x) {
     float acosf_x = acosf(x);
     float error = acosf_x - approx_acos_x;
-    if (!(fabsf(error) <= GRVX_FAST_ACOS_MAX_ERROR)) {
+    if (!(fabsf(error) <= GRVX_APPROX_ACOS_MAX_ERROR)) {
         ERRORF(r, "Larger-than-expected error from grvx::approx_acos\n"
                   "  x=              %f\n"
                   "  approx_acos_x=  %f  (%f degrees\n"
@@ -52,7 +52,7 @@
                   "  tolerance=      %f  (%f degrees)\n\n",
                   x, approx_acos_x, SkRadiansToDegrees(approx_acos_x), acosf_x,
                   SkRadiansToDegrees(acosf_x), error, SkRadiansToDegrees(error),
-                  GRVX_FAST_ACOS_MAX_ERROR, SkRadiansToDegrees(GRVX_FAST_ACOS_MAX_ERROR));
+                  GRVX_APPROX_ACOS_MAX_ERROR, SkRadiansToDegrees(GRVX_APPROX_ACOS_MAX_ERROR));
         return false;
     }
     return true;
@@ -146,7 +146,7 @@
                                                float approxTheta) {
     float expectedTheta = precise_angle_between_vectors(a, b);
     float error = expectedTheta - approxTheta;
-    if (!(fabsf(error) <= GRVX_FAST_ACOS_MAX_ERROR + SK_ScalarNearlyZero)) {
+    if (!(fabsf(error) <= GRVX_APPROX_ACOS_MAX_ERROR + SK_ScalarNearlyZero)) {
         int expAx = SkFloat2Bits(a.fX) >> 23 & 0xff;
         int expAy = SkFloat2Bits(a.fY) >> 23 & 0xff;
         int expBx = SkFloat2Bits(b.fX) >> 23 & 0xff;
@@ -162,8 +162,8 @@
                   "  tolerance=         %f  (%f degrees)\n\n",
                   a.fX, a.fY, b.fX, b.fY, expAx, expAy, expBx, expBy, approxTheta,
                   SkRadiansToDegrees(approxTheta), expectedTheta, SkRadiansToDegrees(expectedTheta),
-                  error, SkRadiansToDegrees(error), GRVX_FAST_ACOS_MAX_ERROR,
-                  SkRadiansToDegrees(GRVX_FAST_ACOS_MAX_ERROR));
+                  error, SkRadiansToDegrees(error), GRVX_APPROX_ACOS_MAX_ERROR,
+                  SkRadiansToDegrees(GRVX_APPROX_ACOS_MAX_ERROR));
         return false;
     }
     return true;
diff --git a/tests/StrokeIndirectTest.cpp b/tests/StrokeIndirectTest.cpp
index 9b67b66..e2fa51b 100644
--- a/tests/StrokeIndirectTest.cpp
+++ b/tests/StrokeIndirectTest.cpp
@@ -16,6 +16,8 @@
 #include "src/gpu/tessellate/GrTessellationPathRenderer.h"
 #include "src/gpu/tessellate/GrWangsFormula.h"
 
+using Tolerances = GrStrokeTessellateShader::Tolerances;
+
 static sk_sp<GrDirectContext> make_mock_context() {
     GrMockOptions mockOptions;
     mockOptions.fDrawInstancedSupport = true;
@@ -270,7 +272,7 @@
                                                       const SkMatrix& viewMatrix,
                                                       const SkPath& path,
                                                       const SkStrokeRec& stroke) {
-    GrStrokeTessellateShader::Tolerances tolerances(viewMatrix.getMaxScale(), stroke.getWidth());
+    auto tolerances = Tolerances::MakeNonHairline(viewMatrix.getMaxScale(), stroke.getWidth());
     int8_t resolveLevelForCircles = SkTPin<float>(
             sk_float_nextlog2(tolerances.fNumRadialSegmentsPerRadian * SK_ScalarPI),
             1, kMaxResolveLevel);
@@ -439,7 +441,7 @@
     };
     auto instance = static_cast<const IndirectInstance*>(target->peekStaticVertexData());
     auto* indirect = static_cast<const GrDrawIndirectCommand*>(target->peekStaticIndirectData());
-    GrStrokeTessellateShader::Tolerances tolerances(viewMatrix.getMaxScale(), stroke.getWidth());
+    auto tolerances = Tolerances::MakeNonHairline(viewMatrix.getMaxScale(), stroke.getWidth());
     float tolerance = test_tolerance(stroke.getJoin());
     for (int i = 0; i < fChainedDrawIndirectCount; ++i) {
         int numExtraEdgesInJoin = (stroke.getJoin() == SkPaint::kMiter_Join) ? 4 : 3;