Use back-face culling to give PLS triangles negative coverage

Previously, we would check at every pixel whether its triangle was clockwise, and use this information to decide if coverage should be positive or negative.

This PR updates PLS to enable back-face culling and emit every triangle twice: once clockwise and once counterclockwise; once with positive coverage and once with negative. It emits this geometry in such a way that back-face culling naturally selects the triangle with the appropriately signed coverage (discarding the other).

For strokes, we take care to ensure the triangles are always clockwise and only emit the triangles once. This has a nice side effect of discarding some backwards triangles that we didn't need anyway when the stroke folded over onto itself.

This change gets some nice speedups by eliminating another branch in the fragment shader, dropping some unnecessary stroke triangles, and by no longer requiring the GPU to surface winding information to the shader.

It also paves the way for some really neat fill rule optimizations by always drawing negative coverage FIRST, and then drawing the positive coverage after.

Perf improvements:

187 martys on M1: 114 - 122 fps
27 martys on Intel: 27.4 - 31.7 fps
paper on Intel: 60.6 - 86.8 fps
3 tigers on Intel: 54.9 - 69.5 fps

Diffs=
f6d19cc24 Use back-face culling to give PLS triangles negative coverage (#5637)

Co-authored-by: Chris Dalton <99840794+csmartdalton@users.noreply.github.com>
diff --git a/.rive_head b/.rive_head
index ee132d1..344c9b1 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-420d27a5bc052e39f9592f1561070ad509589ed1
+f6d19cc2449cb487ff0bb007817d5e77fe117380
diff --git a/include/rive/pls/pls.hpp b/include/rive/pls/pls.hpp
index b659666..30c6ee1 100644
--- a/include/rive/pls/pls.hpp
+++ b/include/rive/pls/pls.hpp
@@ -301,13 +301,46 @@
 // Each curve gets tessellated into vertices. This is performed by rendering a horizontal span of
 // positions and normals into the tessellation data texture, GP-GPU style. TessVertexSpan defines
 // one instance of a horizontal tessellation span for rendering.
+//
+// Each span has an optional reflection, rendered right to left, with the same vertices in reverse
+// order. These are used to draw mirrored patches with negative coverage when we have back-face
+// culling enabled. This emits every triangle twice, once clockwise and once counterclockwise, and
+// back-face culling naturally selects the triangle with the appropriately signed coverage
+// (discarding the other).
 struct TessVertexSpan
 {
     RIVE_ALWAYS_INLINE void set(const Vec2D pts_[4],
                                 Vec2D joinTangent_,
+                                float y_,
                                 int32_t x0,
                                 int32_t x1,
-                                uint32_t y_,
+                                uint32_t parametricSegmentCount,
+                                uint32_t polarSegmentCount,
+                                uint32_t joinSegmentCount,
+                                uint32_t contourIDWithFlags_)
+    {
+        set(pts_,
+            joinTangent_,
+            y_,
+            x0,
+            x1,
+            std::numeric_limits<float>::quiet_NaN(), // Discard the reflection.
+            -1,
+            -1,
+            parametricSegmentCount,
+            polarSegmentCount,
+            joinSegmentCount,
+            contourIDWithFlags_);
+    }
+
+    RIVE_ALWAYS_INLINE void set(const Vec2D pts_[4],
+                                Vec2D joinTangent_,
+                                float y_,
+                                int32_t x0,
+                                int32_t x1,
+                                float reflectionY_,
+                                int32_t reflectionX0,
+                                int32_t reflectionX1,
                                 uint32_t parametricSegmentCount,
                                 uint32_t polarSegmentCount,
                                 uint32_t joinSegmentCount,
@@ -315,24 +348,30 @@
     {
         RIVE_INLINE_MEMCPY(pts, pts_, sizeof(pts));
         joinTangent = joinTangent_;
-        x0x1 = (x1 << 16) | (x0 & 0xffff);
         y = y_;
+        reflectionY = reflectionY_;
+        x0x1 = (x1 << 16) | (x0 & 0xffff);
+        reflectionX0X1 = (reflectionX1 << 16) | (reflectionX0 & 0xffff);
         segmentCounts =
             (joinSegmentCount << 20) | (polarSegmentCount << 10) | parametricSegmentCount;
         contourIDWithFlags = contourIDWithFlags_;
 
         // Ensure we didn't lose any data from packing.
-        assert(x0 == (x0x1 << 16) >> 16);
+        assert(x0 == x0x1 << 16 >> 16);
         assert(x1 == x0x1 >> 16);
+        assert(reflectionX0 == reflectionX0X1 << 16 >> 16);
+        assert(reflectionX1 == reflectionX0X1 >> 16);
         assert((segmentCounts & 0x3ff) == parametricSegmentCount);
         assert(((segmentCounts >> 10) & 0x3ff) == polarSegmentCount);
         assert(segmentCounts >> 20 == joinSegmentCount);
     }
+
     Vec2D pts[4];      // Cubic bezier curve.
     Vec2D joinTangent; // Ending tangent of the join that follows the cubic.
-    Vec2D pad;         // Align our attributes on 128-bit boundaries.
+    float y;
+    float reflectionY;
     int32_t x0x1;
-    uint32_t y;
+    int32_t reflectionX0X1;
     uint32_t segmentCounts;      // [joinSegmentCount, polarSegmentCount, parametricSegmentCount]
     uint32_t contourIDWithFlags; // flags | contourID
 };
@@ -346,7 +385,7 @@
         point(point_), weight_pathID((static_cast<int32_t>(weight) << 16) | pathID)
     {}
     Vec2D point;
-    int32_t weight_pathID; // [(weight << 16]
+    int32_t weight_pathID; // [(weight << 16 | pathID]
 };
 static_assert(sizeof(TriangleVertex) == sizeof(float) * 3);
 
@@ -379,6 +418,25 @@
 
 struct PatchVertex
 {
+    void set(float localVertexID_, float outset_, float fillCoverage_, float params_)
+    {
+        localVertexID = localVertexID_;
+        outset = outset_;
+        fillCoverage = fillCoverage_;
+        params = params_;
+        setMirroredPosition(localVertexID_, outset_, fillCoverage_);
+    }
+
+    // Patch vertices can have an optional, alternate position when mirrored. This is so we can
+    // ensure the diagonals inside the stroke line up on both versions of the patch (mirrored and
+    // not).
+    void setMirroredPosition(float localVertexID_, float outset_, float fillCoverage_)
+    {
+        mirroredVertexID = localVertexID_;
+        mirroredOutset = outset_;
+        mirroredFillCoverage = fillCoverage_;
+    }
+
     float localVertexID; // 0 or 1 -- which tessellated vertex of the two that we are connecting?
     float outset;        // Outset from the tessellated position, in the direction of the normal.
     float fillCoverage;  // 0..1 for the stroke. 1 all around for the triangles.
@@ -386,7 +444,12 @@
     int32_t params;      // "(patchSize << 2) | [flags::kStrokeVertex,
                          //                      flags::kFanVertex,
                          //                      flags::kFanMidpointVertex]"
+    float mirroredVertexID;
+    float mirroredOutset;
+    float mirroredFillCoverage;
+    int32_t pad = 0;
 };
+static_assert(sizeof(PatchVertex) == sizeof(float) * 8);
 
 // # of tessellation segments spanned by the midpoint fan patch.
 constexpr static uint32_t kMidpointFanPatchSegmentSpan = 8;
@@ -397,15 +460,15 @@
 
 // Define vertex and index buffers that contain all the triangles in every PatchType.
 constexpr static uint32_t kMidpointFanPatchVertexCount =
-    (kMidpointFanPatchSegmentSpan + 1) * 2 /*AA outer ramp*/ +
+    kMidpointFanPatchSegmentSpan * 4 /*Stroke and/or AA outer ramp*/ +
     (kMidpointFanPatchSegmentSpan + 1) /*Curve fan*/ + 1 /*Triangle from path midpoint*/;
 constexpr static uint32_t kMidpointFanPatchIndexCount =
-    kMidpointFanPatchSegmentSpan * 6 /*AA outer ramp*/ +
+    kMidpointFanPatchSegmentSpan * 6 /*Stroke and/or AA outer ramp*/ +
     (kMidpointFanPatchSegmentSpan - 1) * 3 /*Curve fan*/ + 3 /*Triangle from path midpoint*/;
 constexpr static uint32_t kMidpointFanPatchBaseIndex = 0;
 static_assert((kMidpointFanPatchBaseIndex * sizeof(uint16_t)) % 4 == 0);
 constexpr static uint32_t kOuterCurvePatchVertexCount =
-    (kOuterCurvePatchSegmentSpan + 1) * 3 /*AA center ramp with bowtie*/ +
+    kOuterCurvePatchSegmentSpan * 8 /*AA center ramp with bowtie*/ +
     kOuterCurvePatchSegmentSpan /*Curve fan*/;
 constexpr static uint32_t kOuterCurvePatchIndexCount =
     kOuterCurvePatchSegmentSpan * 12 /*AA center ramp with bowtie*/ +
diff --git a/include/rive/pls/pls_render_context.hpp b/include/rive/pls/pls_render_context.hpp
index 183d47e..f13c98a 100644
--- a/include/rive/pls/pls_render_context.hpp
+++ b/include/rive/pls/pls_render_context.hpp
@@ -152,19 +152,27 @@
         }
 
         // Returns the required number of padding vertices to insert before the path.
-        template <size_t PatchSize> [[nodiscard]] size_t countPath(size_t pathTessVertexCount)
+        template <size_t PatchSize>
+        [[nodiscard]] size_t countPath(size_t pathTessVertexCount, bool isStroked)
         {
-            size_t padding = PaddingToAlignUp<PatchSize>(m_runningTessVertexCount);
-            m_runningTessVertexCount += padding + pathTessVertexCount;
+            // Ensure there is always at least one padding vertex at the beginning of the
+            // tessellation texture.
+            size_t padding = m_runningTessVertexCount != 0
+                                 ? PaddingToAlignUp<PatchSize>(m_runningTessVertexCount)
+                                 : PatchSize;
+            m_runningTessVertexCount += padding;
+            m_runningTessVertexCount += isStroked ? pathTessVertexCount : pathTessVertexCount * 2;
             return padding;
         }
 
-        size_t totalVertexCount() const
+    private:
+        friend class PLSRenderContext;
+
+        size_t totalVertexCountIncludingReflectionsAndPadding() const
         {
             return m_runningTessVertexCount - m_initialTessVertexCount;
         }
 
-    private:
         size_t m_initialTessVertexCount;
         size_t m_runningTessVertexCount;
     };
@@ -177,7 +185,7 @@
     [[nodiscard]] bool reservePathData(size_t pathCount,
                                        size_t contourCount,
                                        size_t curveCount,
-                                       uint32_t tessVertexCount);
+                                       const TessVertexCounter&);
 
     // Adds the given paint to the GPU data library and fills out 'PaintData' with the information
     // required by the GPU to access it.
@@ -517,6 +525,17 @@
                                                   uint32_t joinSegmentCount,
                                                   uint32_t contourIDWithFlags);
 
+    // Same as pushTessellationSpans(), but also pushes a reflection of the span, rendered right to
+    // left, that emits a mirrored version of the patch with negative coverage. (See
+    // TessVertexSpan.)
+    RIVE_ALWAYS_INLINE void pushMirroredTessellationSpans(const Vec2D pts[4],
+                                                          Vec2D joinTangent,
+                                                          uint32_t totalVertexCount,
+                                                          uint32_t parametricSegmentCount,
+                                                          uint32_t polarSegmentCount,
+                                                          uint32_t joinSegmentCount,
+                                                          uint32_t contourIDWithFlags);
+
     // Capacities of all our GPU resource allocations.
     struct GPUResourceLimits
     {
@@ -635,12 +654,15 @@
 
     // Most recent path and contour state.
     bool m_currentPathIsStroked = false;
+    bool m_currentPathNeedsMirroredContours = false;
     uint32_t m_currentPathID = 0;
     uint32_t m_currentContourID = 0;
     uint32_t m_currentContourPaddingVertexCount = 0; // Padding vertices to add to the first curve.
     uint32_t m_tessVertexCount = 0;
+    uint32_t m_mirroredTessLocation = 0; // Used for back-face culling and mirrored patches.
     RIVE_DEBUG_CODE(uint32_t m_expectedTessVertexCountAtNextReserve = 0;)
     RIVE_DEBUG_CODE(uint32_t m_expectedTessVertexCountAtEndOfPath = 0;)
+    RIVE_DEBUG_CODE(uint32_t m_expectedMirroredTessLocationAtEndOfPath = 0;)
 
     // Simple gradients have one stop at t=0 and one stop at t=1. They're implemented with 2 texels.
     std::unordered_map<uint64_t, uint32_t> m_simpleGradients; // [color0, color1] -> rampTexelsIdx
diff --git a/renderer/d3d/pls_render_context_d3d.cpp b/renderer/d3d/pls_render_context_d3d.cpp
index 2d5a770..1c9f526 100644
--- a/renderer/d3d/pls_render_context_d3d.cpp
+++ b/renderer/d3d/pls_render_context_d3d.cpp
@@ -161,7 +161,7 @@
 {
     D3D11_RASTERIZER_DESC rasterDesc;
     rasterDesc.FillMode = D3D11_FILL_SOLID;
-    rasterDesc.CullMode = D3D11_CULL_NONE;
+    rasterDesc.CullMode = D3D11_CULL_BACK;
     rasterDesc.FrontCounterClockwise = FALSE; // FrontCounterClockwise must be FALSE in order to
                                               // match the winding sense of interior triangulations.
     rasterDesc.DepthBias = 0;
@@ -227,34 +227,36 @@
                                                             s.str().c_str(),
                                                             GLSL_tessellateFragmentMain,
                                                             "ps_5_0");
+        // Draw two instances per TessVertexSpan: one normal and one optional reflection.
+        constexpr static UINT kTessAttribsStepRate = 2;
         D3D11_INPUT_ELEMENT_DESC attribsDesc[] = {{GLSL_a_p0p1_,
                                                    0,
                                                    DXGI_FORMAT_R32G32B32A32_FLOAT,
                                                    0,
                                                    D3D11_APPEND_ALIGNED_ELEMENT,
                                                    D3D11_INPUT_PER_INSTANCE_DATA,
-                                                   1},
+                                                   kTessAttribsStepRate},
                                                   {GLSL_a_p2p3_,
                                                    0,
                                                    DXGI_FORMAT_R32G32B32A32_FLOAT,
                                                    0,
                                                    D3D11_APPEND_ALIGNED_ELEMENT,
                                                    D3D11_INPUT_PER_INSTANCE_DATA,
-                                                   1},
-                                                  {GLSL_a_joinTangent,
+                                                   kTessAttribsStepRate},
+                                                  {GLSL_a_joinTan_and_ys,
                                                    0,
                                                    DXGI_FORMAT_R32G32B32A32_FLOAT,
                                                    0,
                                                    D3D11_APPEND_ALIGNED_ELEMENT,
                                                    D3D11_INPUT_PER_INSTANCE_DATA,
-                                                   1},
+                                                   kTessAttribsStepRate},
                                                   {GLSL_a_args,
                                                    0,
                                                    DXGI_FORMAT_R32G32B32A32_UINT,
                                                    0,
                                                    D3D11_APPEND_ALIGNED_ELEMENT,
                                                    D3D11_INPUT_PER_INSTANCE_DATA,
-                                                   1}};
+                                                   kTessAttribsStepRate}};
         VERIFY_OK(m_gpu->CreateInputLayout(attribsDesc,
                                            std::size(attribsDesc),
                                            vertexBlob->GetBufferPointer(),
@@ -588,31 +590,41 @@
                                                            shader.c_str(),
                                                            GLSL_drawVertexMain,
                                                            "vs_5_0");
-            D3D11_INPUT_ELEMENT_DESC layoutDesc;
+            D3D11_INPUT_ELEMENT_DESC layoutDesc[2];
+            size_t vertexAttribCount;
             switch (drawType)
             {
                 case DrawType::midpointFanPatches:
                 case DrawType::outerCurvePatches:
-                    layoutDesc = {GLSL_a_patchVertexData,
-                                  0,
-                                  DXGI_FORMAT_R32G32B32A32_FLOAT,
-                                  kPatchVertexDataSlot,
-                                  0,
-                                  D3D11_INPUT_PER_VERTEX_DATA,
-                                  0};
+                    layoutDesc[0] = {GLSL_a_patchVertexData,
+                                     0,
+                                     DXGI_FORMAT_R32G32B32A32_FLOAT,
+                                     kPatchVertexDataSlot,
+                                     D3D11_APPEND_ALIGNED_ELEMENT,
+                                     D3D11_INPUT_PER_VERTEX_DATA,
+                                     0};
+                    layoutDesc[1] = {GLSL_a_mirroredVertexData,
+                                     0,
+                                     DXGI_FORMAT_R32G32B32A32_FLOAT,
+                                     kPatchVertexDataSlot,
+                                     D3D11_APPEND_ALIGNED_ELEMENT,
+                                     D3D11_INPUT_PER_VERTEX_DATA,
+                                     0};
+                    vertexAttribCount = 2;
                     break;
                 case DrawType::interiorTriangulation:
-                    layoutDesc = {GLSL_a_triangleVertex,
-                                  0,
-                                  DXGI_FORMAT_R32G32B32_FLOAT,
-                                  kTriangleVertexDataSlot,
-                                  0,
-                                  D3D11_INPUT_PER_VERTEX_DATA,
-                                  0};
+                    layoutDesc[0] = {GLSL_a_triangleVertex,
+                                     0,
+                                     DXGI_FORMAT_R32G32B32_FLOAT,
+                                     kTriangleVertexDataSlot,
+                                     0,
+                                     D3D11_INPUT_PER_VERTEX_DATA,
+                                     0};
+                    vertexAttribCount = 1;
                     break;
             }
-            VERIFY_OK(m_gpu->CreateInputLayout(&layoutDesc,
-                                               1,
+            VERIFY_OK(m_gpu->CreateInputLayout(layoutDesc,
+                                               vertexAttribCount,
                                                blob->GetBufferPointer(),
                                                blob->GetBufferSize(),
                                                &drawVertexShader.layout));
@@ -743,7 +755,8 @@
 
         m_gpuContext->OMSetRenderTargets(1, m_tessTextureRTV.GetAddressOf(), NULL);
 
-        m_gpuContext->DrawInstanced(4, tessVertexSpanCount, 0, 0);
+        // Draw two instances per TessVertexSpan: one normal and one optional reflection.
+        m_gpuContext->DrawInstanced(4, tessVertexSpanCount * 2, 0, 0);
 
         if (m_isIntel)
         {
diff --git a/renderer/gl/pls_render_context_gl.cpp b/renderer/gl/pls_render_context_gl.cpp
index b69ded7..4619613 100644
--- a/renderer/gl/pls_render_context_gl.cpp
+++ b/renderer/gl/pls_render_context_gl.cpp
@@ -118,7 +118,8 @@
     for (int i = 0; i < 4; ++i)
     {
         glEnableVertexAttribArray(i);
-        glVertexAttribDivisor(i, 1);
+        // Draw two instances per TessVertexSpan: one normal and one optional reflection.
+        glVertexAttribDivisor(i, 2);
     }
 
     glGenFramebuffers(1, &m_tessellateFBO);
@@ -139,13 +140,23 @@
     glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(patchIndices), patchIndices, GL_STATIC_DRAW);
 
     glEnableVertexAttribArray(0);
-    glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, nullptr);
+    glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, sizeof(PatchVertex), nullptr);
+
+    glEnableVertexAttribArray(1);
+    glVertexAttribPointer(1,
+                          4,
+                          GL_FLOAT,
+                          GL_FALSE,
+                          sizeof(PatchVertex),
+                          reinterpret_cast<void*>(sizeof(float) * 4));
 
     glGenVertexArrays(1, &m_interiorTrianglesVAO);
     bindVAO(m_interiorTrianglesVAO);
     glEnableVertexAttribArray(0);
 
     glFrontFace(GL_CW);
+    glCullFace(GL_BACK);
+    glEnable(GL_CULL_FACE);
 
     // ANGLE_shader_pixel_local_storage doesn't allow dither.
     glDisable(GL_DITHER);
@@ -402,7 +413,8 @@
         glViewport(0, 0, kTessTextureWidth, tessDataHeight);
         glBindFramebuffer(GL_FRAMEBUFFER, m_tessellateFBO);
         bindProgram(m_tessellateProgram);
-        glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, tessVertexSpanCount);
+        // Draw two instances per TessVertexSpan: one normal and one optional reflection.
+        glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, tessVertexSpanCount * 2);
     }
 
     // Compile the draw programs before activating pixel local storage.
diff --git a/renderer/gr_inner_fan_triangulator.hpp b/renderer/gr_inner_fan_triangulator.hpp
index 4f22605..32b90fb 100644
--- a/renderer/gr_inner_fan_triangulator.hpp
+++ b/renderer/gr_inner_fan_triangulator.hpp
@@ -23,10 +23,12 @@
     using GrTriangulator::GroutTriangleList;
 
     GrInnerFanTriangulator(const RawPath& path,
+                           const Mat2D& viewMatrix,
                            const AABB& pathBounds,
                            FillRule fillRule,
                            TrivialBlockAllocator* alloc) :
-        GrTriangulator(pathBounds, fillRule, alloc)
+        GrTriangulator(pathBounds, fillRule, alloc),
+        m_shouldReverseTriangles(viewMatrix[0] * viewMatrix[3] - viewMatrix[2] * viewMatrix[1] < 0)
     {
         fPreserveCollinearVertices = true;
         fCollectGroutTriangles = true;
@@ -53,12 +55,19 @@
         {
             return 0;
         }
-        return GrTriangulator::polysToTriangles(m_polys, m_maxVertexCount, m_pathID, bufferRing);
+        return GrTriangulator::polysToTriangles(m_polys,
+                                                m_maxVertexCount,
+                                                m_pathID,
+                                                m_shouldReverseTriangles,
+                                                bufferRing);
     }
 
     const GroutTriangleList& groutList() const { return fGroutList; }
 
 private:
+    // We reverse triangles when using a left-handed view matrix, in order to ensure we always emit
+    // clockwise triangles.
+    bool m_shouldReverseTriangles;
     uint16_t m_pathID = 0;
     Poly* m_polys = nullptr;
     uint64_t m_maxVertexCount = 0;
diff --git a/renderer/gr_triangulator.cpp b/renderer/gr_triangulator.cpp
index 576a38b..e1ef9e9 100644
--- a/renderer/gr_triangulator.cpp
+++ b/renderer/gr_triangulator.cpp
@@ -409,6 +409,7 @@
 
 void GrTriangulator::emitMonotonePoly(const MonotonePoly* monotonePoly,
                                       uint16_t pathID,
+                                      bool reverseTriangles,
                                       pls::BufferRing<pls::TriangleVertex>* bufferRing) const
 {
     assert(monotonePoly->fWinding != 0);
@@ -440,7 +441,13 @@
         Vertex* next = v->fNext;
         if (count == 3)
         {
-            return emitTriangle(prev, curr, next, monotonePoly->fWinding, pathID, bufferRing);
+            return emitTriangle(prev,
+                                curr,
+                                next,
+                                monotonePoly->fWinding,
+                                pathID,
+                                reverseTriangles,
+                                bufferRing);
         }
         double ax = static_cast<double>(curr->fPoint.x) - prev->fPoint.x;
         double ay = static_cast<double>(curr->fPoint.y) - prev->fPoint.y;
@@ -448,7 +455,13 @@
         double by = static_cast<double>(next->fPoint.y) - curr->fPoint.y;
         if (ax * by - ay * bx >= 0.0)
         {
-            emitTriangle(prev, curr, next, monotonePoly->fWinding, pathID, bufferRing);
+            emitTriangle(prev,
+                         curr,
+                         next,
+                         monotonePoly->fWinding,
+                         pathID,
+                         reverseTriangles,
+                         bufferRing);
             v->fPrev->fNext = v->fNext;
             v->fNext->fPrev = v->fPrev;
             count--;
@@ -473,12 +486,11 @@
                                   Vertex* next,
                                   int winding,
                                   uint16_t pathID,
+                                  bool reverseTriangles,
                                   pls::BufferRing<pls::TriangleVertex>* bufferRing) const
 {
-    if (winding > 0)
+    if (reverseTriangles)
     {
-        // Ensure our triangles always wind in the same direction as if the path had been
-        // triangulated as a simple fan (a la red book).
         std::swap(prev, next);
     }
     return emit_triangle(prev, curr, next, winding, pathID, bufferRing);
@@ -564,6 +576,7 @@
 
 void GrTriangulator::emitPoly(const Poly* poly,
                               uint16_t pathID,
+                              bool reverseTriangles,
                               pls::BufferRing<pls::TriangleVertex>* bufferRing) const
 {
     if (poly->fCount < 3)
@@ -573,7 +586,7 @@
     TESS_LOG("emit() %d, size %d\n", poly->fID, poly->fCount);
     for (MonotonePoly* m = poly->fHead; m != nullptr; m = m->fNext)
     {
-        emitMonotonePoly(m, pathID, bufferRing);
+        emitMonotonePoly(m, pathID, reverseTriangles, bufferRing);
     }
 }
 
@@ -2074,13 +2087,14 @@
 void GrTriangulator::polysToTriangles(const Poly* polys,
                                       FillRule overrideFillType,
                                       uint16_t pathID,
+                                      bool reverseTriangles,
                                       pls::BufferRing<pls::TriangleVertex>* bufferRing) const
 {
     for (const Poly* poly = polys; poly; poly = poly->fNext)
     {
         if (apply_fill_type(overrideFillType, poly))
         {
-            emitPoly(poly, pathID, bufferRing);
+            emitPoly(poly, pathID, reverseTriangles, bufferRing);
         }
     }
 }
@@ -2167,6 +2181,7 @@
 size_t GrTriangulator::polysToTriangles(const Poly* polys,
                                         uint64_t maxVertexCount,
                                         uint16_t pathID,
+                                        bool reverseTriangles,
                                         pls::BufferRing<pls::TriangleVertex>* bufferRing) const
 {
     if (0 == maxVertexCount || maxVertexCount > std::numeric_limits<int32_t>::max())
@@ -2185,7 +2200,7 @@
 #endif
 
     size_t start = bufferRing->bytesWritten();
-    polysToTriangles(polys, fFillRule, pathID, bufferRing);
+    polysToTriangles(polys, fFillRule, pathID, reverseTriangles, bufferRing);
     size_t actualCount = (bufferRing->bytesWritten() - start) / vertexStride;
     assert(actualCount <= maxVertexCount * vertexStride);
     return actualCount;
diff --git a/renderer/gr_triangulator.hpp b/renderer/gr_triangulator.hpp
index b38e7b7..45a4ea3 100644
--- a/renderer/gr_triangulator.hpp
+++ b/renderer/gr_triangulator.hpp
@@ -98,6 +98,7 @@
     void polysToTriangles(const Poly* polys,
                           FillRule overrideFillRule,
                           uint16_t pathID,
+                          bool reverseTriangles,
                           pls::BufferRing<pls::TriangleVertex>*) const;
 
     // The vertex sorting in step (3) is a merge sort, since it plays well with the linked list
@@ -146,14 +147,19 @@
     // Additional helpers and driver functions.
     void emitMonotonePoly(const MonotonePoly*,
                           uint16_t pathID,
+                          bool reverseTriangles,
                           pls::BufferRing<pls::TriangleVertex>*) const;
     void emitTriangle(Vertex* prev,
                       Vertex* curr,
                       Vertex* next,
                       int winding,
                       uint16_t pathID,
+                      bool reverseTriangles,
                       pls::BufferRing<pls::TriangleVertex>*) const;
-    void emitPoly(const Poly*, uint16_t pathID, pls::BufferRing<pls::TriangleVertex>*) const;
+    void emitPoly(const Poly*,
+                  uint16_t pathID,
+                  bool reverseTriangles,
+                  pls::BufferRing<pls::TriangleVertex>*) const;
 
     Poly* makePoly(Poly** head, Vertex* v, int winding) const;
     void appendPointToContour(const Vec2D& p, VertexList* contour) const;
@@ -240,6 +246,7 @@
     size_t polysToTriangles(const Poly*,
                             uint64_t maxVertexCount,
                             uint16_t pathID,
+                            bool reverseTriangles,
                             pls::BufferRing<pls::TriangleVertex>*) const;
 
     AABB fPathBounds;
diff --git a/renderer/metal/pls_render_context_metal.mm b/renderer/metal/pls_render_context_metal.mm
index aee8c2a..97a407e 100644
--- a/renderer/metal/pls_render_context_metal.mm
+++ b/renderer/metal/pls_render_context_metal.mm
@@ -336,6 +336,7 @@
         [gradEncoder setRenderPipelineState:m_colorRampPipeline->pipelineState()];
         [gradEncoder setVertexBuffer:mtl_buffer(uniformBufferRing()) offset:0 atIndex:0];
         [gradEncoder setVertexBuffer:mtl_buffer(gradSpanBufferRing()) offset:0 atIndex:1];
+        [gradEncoder setCullMode:MTLCullModeBack];
         [gradEncoder drawPrimitives:MTLPrimitiveTypeTriangleStrip
                         vertexStart:0
                         vertexCount:4
@@ -366,10 +367,12 @@
         [tessEncoder setVertexBuffer:mtl_buffer(tessSpanBufferRing()) offset:0 atIndex:1];
         [tessEncoder setVertexTexture:mtl_texture(pathBufferRing()) atIndex:kPathTextureIdx];
         [tessEncoder setVertexTexture:mtl_texture(contourBufferRing()) atIndex:kContourTextureIdx];
+        [tessEncoder setCullMode:MTLCullModeBack];
+        // Draw two instances per TessVertexSpan: one normal and one optional reflection.
         [tessEncoder drawPrimitives:MTLPrimitiveTypeTriangleStrip
                         vertexStart:0
                         vertexCount:4
-                      instanceCount:tessVertexSpanCount];
+                      instanceCount:tessVertexSpanCount * 2];
         [tessEncoder endEncoding];
     }
 
@@ -415,6 +418,7 @@
     [encoder setVertexTexture:mtl_texture(pathBufferRing()) atIndex:kPathTextureIdx];
     [encoder setVertexTexture:mtl_texture(contourBufferRing()) atIndex:kContourTextureIdx];
     [encoder setFragmentTexture:mtl_texture(gradTexelBufferRing()) atIndex:kGradTextureIdx];
+    [encoder setCullMode:MTLCullModeBack];
     if (frameDescriptor().wireframe)
     {
         [encoder setTriangleFillMode:MTLTriangleFillModeLines];
diff --git a/renderer/pls.cpp b/renderer/pls.cpp
index 27cd3de..9cf365d 100644
--- a/renderer/pls.cpp
+++ b/renderer/pls.cpp
@@ -6,9 +6,9 @@
 
 namespace rive::pls
 {
-constexpr static int32_t pack_params(int32_t patchSegmentSpan, int32_t vertexType)
+constexpr static float pack_params(int32_t patchSegmentSpan, int32_t vertexType)
 {
-    return (patchSegmentSpan << 2) | vertexType;
+    return static_cast<float>((patchSegmentSpan << 2) | vertexType);
 }
 
 static void generate_buffer_data_for_patch_type(PatchType patchType,
@@ -17,87 +17,138 @@
                                                 uint16_t baseVertex)
 {
     // AA border vertices. "Inner tessellation curves" have one more segment without a fan triangle
-    // whose purpose is to fill the join.
+    // whose purpose is to be a bowtie join.
+    size_t vertexCount = 0;
     size_t patchSegmentSpan = patchType == PatchType::midpointFan ? kMidpointFanPatchSegmentSpan
                                                                   : kOuterCurvePatchSegmentSpan;
-    size_t vertexCount = 0;
-    for (int i = 0; i <= patchSegmentSpan; ++i)
+    for (int i = 0; i < patchSegmentSpan; ++i)
     {
+        float params = pack_params(patchSegmentSpan, flags::kStrokeVertex);
+        float l = static_cast<float>(i);
+        float r = l + 1;
         if (patchType == PatchType::outerCurves)
         {
-            vertices[vertexCount++] = {static_cast<float>(i),
-                                       1,
-                                       0,
-                                       pack_params(patchSegmentSpan, flags::kStrokeVertex)};
-            vertices[vertexCount++] = {static_cast<float>(i),
-                                       0,
-                                       .5f,
-                                       pack_params(patchSegmentSpan, flags::kStrokeVertex)};
-            vertices[vertexCount++] = {static_cast<float>(i),
-                                       -1,
-                                       0,
-                                       pack_params(patchSegmentSpan, flags::kStrokeVertex)};
+            vertices[vertexCount + 0].set(l, 0.f, .5f, params);
+            vertices[vertexCount + 1].set(l, 1.f, .0f, params);
+            vertices[vertexCount + 2].set(r, 0.f, .5f, params);
+            vertices[vertexCount + 3].set(r, 1.f, .0f, params);
+
+            // Give the vertex an alternate position when mirrored so the border has the same
+            // diagonals whether mirrored or not.
+            vertices[vertexCount + 0].setMirroredPosition(r, 0.f, .5f);
+            vertices[vertexCount + 1].setMirroredPosition(l, 0.f, .5f);
+            vertices[vertexCount + 2].setMirroredPosition(r, 1.f, .0f);
+            vertices[vertexCount + 3].setMirroredPosition(l, 1.f, .0f);
         }
         else
         {
             assert(patchType == PatchType::midpointFan);
-            vertices[vertexCount++] = {static_cast<float>(i),
-                                       -1,
-                                       1,
-                                       pack_params(patchSegmentSpan, flags::kStrokeVertex)};
-            vertices[vertexCount++] = {static_cast<float>(i),
-                                       1,
-                                       0,
-                                       pack_params(patchSegmentSpan, flags::kStrokeVertex)};
+            vertices[vertexCount + 0].set(l, -1.f, 1.f, params);
+            vertices[vertexCount + 1].set(l, +1.f, 0.f, params);
+            vertices[vertexCount + 2].set(r, -1.f, 1.f, params);
+            vertices[vertexCount + 3].set(r, +1.f, 0.f, params);
+
+            // Give the vertex an alternate position when mirrored so the border has the same
+            // diagonals whether mirrored or not.
+            vertices[vertexCount + 0].setMirroredPosition(r - 1.f, -1.f, 1.f);
+            vertices[vertexCount + 1].setMirroredPosition(l - 1.f, -1.f, 1.f);
+            vertices[vertexCount + 2].setMirroredPosition(r - 1.f, +1.f, 0.f);
+            vertices[vertexCount + 3].setMirroredPosition(l - 1.f, +1.f, 0.f);
+        }
+        vertexCount += 4;
+    }
+
+    // Bottom (negative coverage) side of the AA border.
+    if (patchType == PatchType::outerCurves)
+    {
+        float params = pack_params(patchSegmentSpan, flags::kStrokeVertex);
+        for (int i = 0; i < patchSegmentSpan; ++i)
+        {
+            float l = static_cast<float>(i);
+            float r = l + 1;
+
+            vertices[vertexCount + 0].set(l, -.0f, .5f, params);
+            vertices[vertexCount + 1].set(r, -.0f, .5f, params);
+            vertices[vertexCount + 2].set(l, -1.f, .0f, params);
+            vertices[vertexCount + 3].set(r, -1.f, .0f, params);
+
+            // Give the vertex an alternate position when mirrored so the border has the same
+            // diagonals whether mirrored or not.
+            vertices[vertexCount + 0].setMirroredPosition(r, -0.f, .5f);
+            vertices[vertexCount + 1].setMirroredPosition(r, -1.f, .0f);
+            vertices[vertexCount + 2].setMirroredPosition(l, -0.f, .5f);
+            vertices[vertexCount + 3].setMirroredPosition(l, -1.f, .0f);
+
+            vertexCount += 4;
         }
     }
 
     // Triangle fan vertices. (These only touch the first "fanSegmentSpan" segments on inner
     // tessellation curves.
+    size_t fanVerticesIdx = vertexCount;
     size_t fanSegmentSpan =
         patchType == PatchType::midpointFan ? patchSegmentSpan : patchSegmentSpan - 1;
     assert((fanSegmentSpan & (fanSegmentSpan - 1)) == 0); // The fan must be a power of two.
-    size_t fanVerticesIdx = vertexCount;
     for (int i = 0; i <= fanSegmentSpan; ++i)
     {
-        vertices[vertexCount++] = {static_cast<float>(i),
-                                   patchType == PatchType::outerCurves ? 0.f : -1.f,
-                                   1,
-                                   pack_params(patchSegmentSpan, flags::kFanVertex)};
+        float params = pack_params(patchSegmentSpan, flags::kFanVertex);
+        if (patchType == PatchType::outerCurves)
+        {
+            vertices[vertexCount].set(static_cast<float>(i), 0.f, 1, params);
+        }
+        else
+        {
+            vertices[vertexCount].set(static_cast<float>(i), -1.f, 1, params);
+            vertices[vertexCount].setMirroredPosition(static_cast<float>(i) - 1, -1.f, 1);
+        }
+        ++vertexCount;
     }
 
     // The midpoint vertex is only included on midpoint fan patches.
     size_t midpointIdx = vertexCount;
     if (patchType == PatchType::midpointFan)
     {
-        vertices[vertexCount++] = {0,
-                                   0,
-                                   1,
-                                   pack_params(patchSegmentSpan, flags::kFanMidpointVertex)};
+        vertices[vertexCount++].set(0,
+                                    0,
+                                    1,
+                                    pack_params(patchSegmentSpan, flags::kFanMidpointVertex));
     }
     assert(vertexCount == (patchType == PatchType::outerCurves ? kOuterCurvePatchVertexCount
                                                                : kMidpointFanPatchVertexCount));
 
     // AA border indices.
-    constexpr static size_t kCenterBorderPatternSize = 12;
-    constexpr static uint16_t kCenterBorderPattern[kCenterBorderPatternSize] =
-        {3, 4, 0, 0, 4, 1, 5, 4, 2, 2, 4, 1};
+    constexpr static size_t kBorderPatternVertexCount = 4;
+    constexpr static size_t kBorderPatternIndexCount = 6;
+    constexpr static uint16_t kBorderPattern[kBorderPatternIndexCount] = {0, 1, 2, 2, 1, 3};
+    constexpr static uint16_t kNegativeBorderPattern[kBorderPatternIndexCount] = {0, 2, 1, 1, 2, 3};
 
-    constexpr static size_t kOuterBorderPatternSize = 6;
-    constexpr static uint16_t kOuterBorderPattern[kOuterBorderPatternSize] = {0, 1, 2, 2, 1, 3};
-
-    size_t borderPatternSize =
-        patchType == PatchType::outerCurves ? kCenterBorderPatternSize : kOuterBorderPatternSize;
-    const uint16_t* borderPattern =
-        patchType == PatchType::outerCurves ? kCenterBorderPattern : kOuterBorderPattern;
-    size_t verticesPerNormal = patchType == PatchType::outerCurves ? 3 : 2;
     size_t indexCount = 0;
-    for (int i = 0; i < borderPatternSize * patchSegmentSpan; ++i)
+    size_t borderEdgeVerticesIdx = 0;
+    for (size_t borderSegmentIdx = 0; borderSegmentIdx < patchSegmentSpan; ++borderSegmentIdx)
     {
-        indices[indexCount++] = borderPattern[i % borderPatternSize] +
-                                i / borderPatternSize * verticesPerNormal + baseVertex;
+        for (size_t i = 0; i < kBorderPatternIndexCount; ++i)
+        {
+            indices[indexCount++] = baseVertex + borderEdgeVerticesIdx + kBorderPattern[i];
+        }
+        borderEdgeVerticesIdx += kBorderPatternVertexCount;
     }
 
+    // Bottom (negative coverage) side of the AA border.
+    if (patchType == PatchType::outerCurves)
+    {
+        for (size_t borderSegmentIdx = 0; borderSegmentIdx < patchSegmentSpan; ++borderSegmentIdx)
+        {
+            for (size_t i = 0; i < kBorderPatternIndexCount; ++i)
+            {
+                indices[indexCount++] =
+                    baseVertex + borderEdgeVerticesIdx + kNegativeBorderPattern[i];
+            }
+            borderEdgeVerticesIdx += kBorderPatternVertexCount;
+        }
+    }
+
+    assert(borderEdgeVerticesIdx == fanVerticesIdx);
+
     // Triangle fan indices, in a middle-out topology.
     // Don't include the final bowtie join if this is an "outerStroke" patch. (i.e., use
     // fanSegmentSpan and not "patchSegmentSpan".)
diff --git a/renderer/pls_render_context.cpp b/renderer/pls_render_context.cpp
index 1b0afe0..566c600 100644
--- a/renderer/pls_render_context.cpp
+++ b/renderer/pls_render_context.cpp
@@ -414,18 +414,18 @@
 bool PLSRenderContext::reservePathData(size_t pathCount,
                                        size_t contourCount,
                                        size_t curveCount,
-                                       uint32_t tessVertexCount)
+                                       const TessVertexCounter& tessVertexCounter)
 {
     assert(m_didBeginFrame);
     assert(m_tessVertexCount == m_expectedTessVertexCountAtNextReserve);
 
-    // +1 for the padding vertex at the end.
-    size_t maxTessVertexCountWithInternalPadding = tessVertexCount + 1;
+    // +1 for the padding vertex at the end of the tessellation data.
+    size_t maxTessVertexCountWithInternalPadding =
+        tessVertexCounter.totalVertexCountIncludingReflectionsAndPadding() + 1;
     // Line breaks potentially introduce a new span. Count the maximum number of line breaks we
-    // might encounter.
-    size_t y0 = m_tessVertexCount / kTessTextureWidth;
-    size_t y1 = (m_tessVertexCount + maxTessVertexCountWithInternalPadding - 1) / kTessTextureWidth;
-    size_t maxSpanBreakCount = y1 - y0;
+    // might encounter. Since line breaks may also occur from the reflection, just find a simple
+    // upper bound.
+    size_t maxSpanBreakCount = (1 + maxTessVertexCountWithInternalPadding / kTessTextureWidth) * 2;
     // +pathCount for a span of padding vertices at the beginning of each path.
     // +1 for the padding vertex at the end of the entire tessellation texture (in case this happens
     // to be the final batch of paths in the flush).
@@ -484,7 +484,10 @@
         assert(m_pathBuffer.hasRoomFor(pathCount));
         assert(m_contourBuffer.hasRoomFor(contourCount));
         RIVE_DEBUG_CODE(m_expectedTessVertexCountAtNextReserve =
-                            m_tessVertexCount + tessVertexCount);
+                            m_tessVertexCount +
+                            tessVertexCounter.totalVertexCountIncludingReflectionsAndPadding());
+        assert(m_expectedTessVertexCountAtNextReserve <=
+               m_currentResourceLimits.maxTessellationVertices);
         return true;
     }
 
@@ -636,8 +639,10 @@
 {
     assert(m_didBeginFrame);
     assert(m_tessVertexCount == m_expectedTessVertexCountAtEndOfPath);
+    assert(m_mirroredTessLocation == m_expectedMirroredTessLocationAtEndOfPath);
 
     m_currentPathIsStroked = strokeRadius != 0;
+    m_currentPathNeedsMirroredContours = !m_currentPathIsStroked;
     m_pathBuffer.set_back(matrix, strokeRadius, fillRule, paintType, clipID, blendMode, paintData);
 
     ++m_currentPathID;
@@ -656,13 +661,15 @@
     assert(m_drawList.tail().baseVertexOrInstance + m_drawList.tail().vertexOrInstanceCount ==
            baseInstance);
     uint32_t vertexCountToDraw = tessVertexCount - paddingVertexCount;
+    if (m_currentPathNeedsMirroredContours)
+    {
+        vertexCountToDraw *= 2;
+    }
     uint32_t instanceCount = vertexCountToDraw / patchSize;
     // The caller is responsible to pad each contour so it ends on a multiple of the patch size.
     assert(instanceCount * patchSize == vertexCountToDraw);
     m_drawList.tail().vertexOrInstanceCount += instanceCount;
 
-    RIVE_DEBUG_CODE(m_expectedTessVertexCountAtEndOfPath = m_tessVertexCount + tessVertexCount);
-
     // The first curve of the path will be pre-padded with 'paddingVertexCount' tessellation
     // vertices, colocated at T=0. The caller must use this argument align the beginning of the path
     // on a boundary of the patch size. (See PLSRenderContext::TessVertexCounter.)
@@ -670,6 +677,19 @@
     {
         pushPaddingVertices(paddingVertexCount);
     }
+
+    size_t tessVertexCountWithoutPadding = tessVertexCount - paddingVertexCount;
+    if (m_currentPathNeedsMirroredContours)
+    {
+        m_tessVertexCount = m_mirroredTessLocation =
+            m_tessVertexCount + tessVertexCountWithoutPadding;
+        RIVE_DEBUG_CODE(m_expectedMirroredTessLocationAtEndOfPath =
+                            m_mirroredTessLocation - tessVertexCountWithoutPadding);
+    }
+    RIVE_DEBUG_CODE(m_expectedTessVertexCountAtEndOfPath =
+                        m_tessVertexCount + tessVertexCountWithoutPadding);
+    assert(m_expectedTessVertexCountAtEndOfPath <= m_expectedTessVertexCountAtNextReserve);
+    assert(m_expectedTessVertexCountAtEndOfPath <= m_currentResourceLimits.maxTessellationVertices);
 }
 
 void PLSRenderContext::pushContour(Vec2D midpoint, bool closed, uint32_t paddingVertexCount)
@@ -719,13 +739,26 @@
     // Only the first curve of a contour gets padding vertices.
     m_currentContourPaddingVertexCount = 0;
 
-    pushTessellationSpans(pts,
-                          joinTangent,
-                          totalVertexCount,
-                          parametricSegmentCount,
-                          polarSegmentCount,
-                          joinSegmentCount,
-                          m_currentContourID | additionalPLSFlags);
+    if (m_currentPathNeedsMirroredContours)
+    {
+        pushMirroredTessellationSpans(pts,
+                                      joinTangent,
+                                      totalVertexCount,
+                                      parametricSegmentCount,
+                                      polarSegmentCount,
+                                      joinSegmentCount,
+                                      m_currentContourID | additionalPLSFlags);
+    }
+    else
+    {
+        pushTessellationSpans(pts,
+                              joinTangent,
+                              totalVertexCount,
+                              parametricSegmentCount,
+                              polarSegmentCount,
+                              joinSegmentCount,
+                              m_currentContourID | additionalPLSFlags);
+    }
 }
 
 void PLSRenderContext::pushPaddingVertices(uint32_t count)
@@ -733,9 +766,11 @@
     constexpr static Vec2D kEmptyCubic[4]{};
     // This is guaranteed to not collide with a neighboring contour ID.
     constexpr static uint32_t kInvalidContourID = 0;
-    RIVE_DEBUG_CODE(size_t startingVertexCount = m_tessVertexCount;)
+    assert(m_tessVertexCount == m_expectedTessVertexCountAtEndOfPath);
+    RIVE_DEBUG_CODE(m_expectedTessVertexCountAtEndOfPath = m_tessVertexCount + count;)
+    assert(m_expectedTessVertexCountAtEndOfPath <= m_currentResourceLimits.maxTessellationVertices);
     pushTessellationSpans(kEmptyCubic, {0, 0}, count, 0, 0, 1, kInvalidContourID);
-    assert(m_tessVertexCount == startingVertexCount + count);
+    assert(m_tessVertexCount == m_expectedTessVertexCountAtEndOfPath);
 }
 
 RIVE_ALWAYS_INLINE void PLSRenderContext::pushTessellationSpans(const Vec2D pts[4],
@@ -746,35 +781,91 @@
                                                                 uint32_t joinSegmentCount,
                                                                 uint32_t contourIDWithFlags)
 {
-    int32_t y = m_tessVertexCount / kTessTextureWidth;
+    uint32_t y = m_tessVertexCount / kTessTextureWidth;
     int32_t x0 = m_tessVertexCount % kTessTextureWidth;
     int32_t x1 = x0 + totalVertexCount;
     for (;;)
     {
         m_tessSpanBuffer.set_back(pts,
                                   joinTangent,
+                                  static_cast<float>(y),
                                   x0,
                                   x1,
-                                  y,
                                   parametricSegmentCount,
                                   polarSegmentCount,
                                   joinSegmentCount,
                                   contourIDWithFlags);
-        if (x1 > kTessTextureWidth)
+        if (x1 > static_cast<int32_t>(kTessTextureWidth))
         {
             // The span was too long to fit on the current line. Wrap and draw it again, this
             // time behind the left edge of the texture so we capture what got clipped off last
             // time.
+            ++y;
             x0 -= kTessTextureWidth;
             x1 -= kTessTextureWidth;
+            continue;
+        }
+        break;
+    }
+    assert(y == (m_tessVertexCount + totalVertexCount - 1) / kTessTextureWidth);
+
+    m_tessVertexCount += totalVertexCount;
+    assert(m_tessVertexCount <= m_expectedTessVertexCountAtEndOfPath);
+}
+
+RIVE_ALWAYS_INLINE void PLSRenderContext::pushMirroredTessellationSpans(
+    const Vec2D pts[4],
+    Vec2D joinTangent,
+    uint32_t totalVertexCount,
+    uint32_t parametricSegmentCount,
+    uint32_t polarSegmentCount,
+    uint32_t joinSegmentCount,
+    uint32_t contourIDWithFlags)
+{
+    int32_t y = m_tessVertexCount / kTessTextureWidth;
+    int32_t x0 = m_tessVertexCount % kTessTextureWidth;
+    int32_t x1 = x0 + totalVertexCount;
+
+    uint32_t reflectionY = (m_mirroredTessLocation - 1) / kTessTextureWidth;
+    int32_t reflectionX0 = (m_mirroredTessLocation - 1) % kTessTextureWidth + 1;
+    int32_t reflectionX1 = reflectionX0 - totalVertexCount;
+
+    for (;;)
+    {
+        m_tessSpanBuffer.set_back(pts,
+                                  joinTangent,
+                                  static_cast<float>(y),
+                                  x0,
+                                  x1,
+                                  static_cast<float>(reflectionY),
+                                  reflectionX0,
+                                  reflectionX1,
+                                  parametricSegmentCount,
+                                  polarSegmentCount,
+                                  joinSegmentCount,
+                                  contourIDWithFlags);
+        if (x1 > static_cast<int32_t>(kTessTextureWidth) || reflectionX1 < 0)
+        {
+            // Either the span or its reflection was too long to fit on the current line. Wrap and
+            // draw both of them again, this time behind the opposite edge of the texture
+            // so we capture what got clipped off last time.
             ++y;
+            x0 -= kTessTextureWidth;
+            x1 -= kTessTextureWidth;
+
+            --reflectionY;
+            reflectionX0 += kTessTextureWidth;
+            reflectionX1 += kTessTextureWidth;
             continue;
         }
         break;
     }
 
     m_tessVertexCount += totalVertexCount;
-    assert(m_tessVertexCount <= m_currentResourceLimits.maxTessellationVertices);
+    assert(m_tessVertexCount <= m_expectedTessVertexCountAtEndOfPath);
+
+    m_mirroredTessLocation -= totalVertexCount;
+    assert(m_mirroredTessLocation >= m_expectedMirroredTessLocationAtEndOfPath);
 }
 
 void PLSRenderContext::pushInteriorTriangulation(GrInnerFanTriangulator* triangulator,
@@ -830,7 +921,6 @@
 void PLSRenderContext::flush(FlushType flushType)
 {
     assert(m_didBeginFrame);
-    assert(m_tessVertexCount == m_expectedTessVertexCountAtEndOfPath);
     if (flushType == FlushType::intermediate)
     {
         // We might not have pushed as many tessellation vertices as expected if we ran out of room
@@ -841,6 +931,8 @@
     {
         assert(m_tessVertexCount == m_expectedTessVertexCountAtNextReserve);
     }
+    assert(m_tessVertexCount == m_expectedTessVertexCountAtEndOfPath);
+    assert(m_mirroredTessLocation == m_expectedMirroredTessLocationAtEndOfPath);
 
     // The final vertex of the final patch of each contour crosses over into the next contour. (This
     // is how we wrap around back to the beginning.) Therefore, the final contour of the flush needs
@@ -973,8 +1065,10 @@
     m_complexGradients.clear();
 
     m_tessVertexCount = 0;
+    m_mirroredTessLocation = 0;
     RIVE_DEBUG_CODE(m_expectedTessVertexCountAtNextReserve = 0);
     RIVE_DEBUG_CODE(m_expectedTessVertexCountAtEndOfPath = 0);
+    RIVE_DEBUG_CODE(m_expectedMirroredTessLocationAtEndOfPath = 0);
 
     m_maxTriangleVertexCount = 0;
 
diff --git a/renderer/pls_renderer.cpp b/renderer/pls_renderer.cpp
index e53314d..d45abe2 100644
--- a/renderer/pls_renderer.cpp
+++ b/renderer/pls_renderer.cpp
@@ -101,6 +101,12 @@
         return;
     }
 
+    // A stroke width of zero means a path is filled in PLS.
+    if (stroked && paint->getThickness() <= 0)
+    {
+        return;
+    }
+
     // Make (up to) two attempts to draw the path plus any necessary clip updates in a single batch.
     // If the first attempt fails, flush to make room and try again.
     for (size_t i = 0; i < 2; ++i)
@@ -557,6 +563,7 @@
             assert(!path->triangulator);
             path->triangulator =
                 context->make<GrInnerFanTriangulator>(*scratchPath,
+                                                      *path->matrix,
                                                       path->pathBounds,
                                                       path->fillRule,
                                                       context->trivialPerFlushAllocator());
@@ -951,6 +958,8 @@
             continue;
         }
 
+        bool stroked = currentPathIdx == strokeIdx;
+
         // (If we used interior triangulation, interiorTriHelper already counted the path's vertices
         // for us.)
         if (path.triangulator != nullptr)
@@ -960,8 +969,10 @@
             // a multiple of the patch size.
             if (path.tessVertexCount > 0)
             {
+                assert(!stroked);
                 path.paddingVertexCount =
-                    tessVertexCounter.countPath<kOuterCurvePatchSegmentSpan>(path.tessVertexCount);
+                    tessVertexCounter.countPath<kOuterCurvePatchSegmentSpan>(path.tessVertexCount,
+                                                                             false);
                 path.tessVertexCount += path.paddingVertexCount;
             }
         }
@@ -998,7 +1009,6 @@
                     contourVertexCount -= m_parametricSegmentCounts[j];
                 }
 
-                bool stroked = currentPathIdx == strokeIdx;
                 if (stroked)
                 {
                     // Finish calculating and counting polar segments for each stroked curve and
@@ -1136,7 +1146,8 @@
             if (path.tessVertexCount > 0)
             {
                 path.paddingVertexCount =
-                    tessVertexCounter.countPath<kMidpointFanPatchSegmentSpan>(path.tessVertexCount);
+                    tessVertexCounter.countPath<kMidpointFanPatchSegmentSpan>(path.tessVertexCount,
+                                                                              stroked);
                 path.tessVertexCount += path.paddingVertexCount;
             }
         }
@@ -1151,7 +1162,7 @@
     if (!m_context->reservePathData(m_pathBatch.size(),
                                     contourCount,
                                     curveReserveCount,
-                                    tessVertexCounter.totalVertexCount()))
+                                    tessVertexCounter))
     {
         // The paths don't fit. Give up and let the caller flush and try again.
         return false;
diff --git a/renderer/shaders/color_ramp.glsl b/renderer/shaders/color_ramp.glsl
index d099f4d..d8415d2 100644
--- a/renderer/shaders/color_ramp.glsl
+++ b/renderer/shaders/color_ramp.glsl
@@ -12,7 +12,7 @@
 #endif
 
 VARYING_BLOCK_BEGIN(Varyings)
-NO_PERSPECTIVE VARYING(half4, v_rampColor);
+NO_PERSPECTIVE VARYING(0, half4, v_rampColor);
 VARYING_BLOCK_END(_pos)
 
 #ifdef @VERTEX
@@ -42,10 +42,16 @@
     VARYING_INIT(varyings, v_rampColor, half4);
 
     float x = float((_vertexID & 1) == 0 ? @a_span.x & 0xffffu : @a_span.x >> 16) / 65536.;
-    float y = float(@a_span.y) + ((_vertexID & 2) == 0 ? .0 : 1.);
+    float offsetY = (_vertexID & 2) == 0 ? 1. : .0;
+    if (uniforms.gradInverseViewportY < .0)
+    {
+        // Make sure we always emit clockwise triangles. Swap the top and bottom vertices.
+        offsetY = 1. - offsetY;
+    }
     v_rampColor = unpackColorInt((_vertexID & 1) == 0 ? @a_span.z : @a_span.w);
     _pos.x = x * 2. - 1.;
-    _pos.y = y * uniforms.gradInverseViewportY - sign(uniforms.gradInverseViewportY);
+    _pos.y = (float(@a_span.y) + offsetY) * uniforms.gradInverseViewportY -
+             sign(uniforms.gradInverseViewportY);
     _pos.zw = float2(0, 1);
 
     VARYING_PACK(varyings, v_rampColor);
diff --git a/renderer/shaders/common.glsl b/renderer/shaders/common.glsl
index 1c9d0f9..bf1c4e8 100644
--- a/renderer/shaders/common.glsl
+++ b/renderer/shaders/common.glsl
@@ -7,6 +7,7 @@
 #define TESS_TEXTURE_WIDTH 2048.
 #define TESS_TEXTURE_WIDTH_LOG2 11
 
+// Flags that must stay in sync with pls.hpp.
 #define RETROFITTED_TRIANGLE_FLAG (1u << 31)
 #define CULL_EXCESS_TESSELLATION_SEGMENTS_FLAG (1u << 30)
 #define JOIN_TYPE_MASK (3u << 28)
@@ -14,10 +15,13 @@
 #define MITER_REVERT_JOIN (2u << 28)
 #define BEVEL_JOIN (1u << 28)
 #define EMULATED_STROKE_CAP_FLAG (1u << 27)
-#define JOIN_TANGENT_0_FLAG (1u << 26)
-#define JOIN_TANGENT_INNER_FLAG (1u << 25)
-#define LEFT_JOIN_FLAG (1u << 24)
-#define RIGHT_JOIN_FLAG (1u << 23)
+
+// Internal flags.
+#define MIRRORED_CONTOUR_FLAG (1u << 26)
+#define JOIN_TANGENT_0_FLAG (1u << 25)
+#define JOIN_TANGENT_INNER_FLAG (1u << 24)
+#define LEFT_JOIN_FLAG (1u << 23)
+#define RIGHT_JOIN_FLAG (1u << 22)
 #define CONTOUR_ID_MASK 0xffffu
 
 #define PI 3.141592653589793238
diff --git a/renderer/shaders/draw.glsl b/renderer/shaders/draw.glsl
index 90c9a6a..3b35db5 100644
--- a/renderer/shaders/draw.glsl
+++ b/renderer/shaders/draw.glsl
@@ -14,23 +14,24 @@
 ATTR(0, packed_float3, @a_triangleVertex);
 #else
 ATTR(0, float4, @a_patchVertexData); // [localVertexID, outset, fillCoverage, vertexType]
+ATTR(1, float4, @a_mirroredVertexData);
 #endif
 ATTR_BLOCK_END
 #endif
 
 VARYING_BLOCK_BEGIN(Varyings)
-NO_PERSPECTIVE VARYING(float4, v_paint);
+NO_PERSPECTIVE VARYING(0, float4, v_paint);
 #ifdef @DRAW_INTERIOR_TRIANGLES
-@OPTIONALLY_FLAT VARYING(half, v_windingWeight);
+@OPTIONALLY_FLAT VARYING(1, half, v_windingWeight);
 #else
-NO_PERSPECTIVE VARYING(half2, v_edgeDistance);
+NO_PERSPECTIVE VARYING(2, half2, v_edgeDistance);
 #endif
-@OPTIONALLY_FLAT VARYING(half, v_pathID);
+@OPTIONALLY_FLAT VARYING(3, half, v_pathID);
 #ifdef @ENABLE_PATH_CLIPPING
-@OPTIONALLY_FLAT VARYING(half, v_clipID);
+@OPTIONALLY_FLAT VARYING(4, half, v_clipID);
 #endif
 #ifdef @ENABLE_ADVANCED_BLEND
-@OPTIONALLY_FLAT VARYING(half, v_blendMode);
+@OPTIONALLY_FLAT VARYING(5, half, v_blendMode);
 #endif
 VARYING_BLOCK_END(_pos)
 
@@ -77,6 +78,7 @@
     ATTR_UNPACK(_vertexID, attrs, @a_triangleVertex, float3);
 #else
     ATTR_UNPACK(_vertexID, attrs, @a_patchVertexData, float4);
+    ATTR_UNPACK(_vertexID, attrs, @a_mirroredVertexData, float4);
 #endif
 
     VARYING_INIT(varyings, v_paint, float4);
@@ -139,9 +141,19 @@
     float strokeRadius = uintBitsToFloat(pathData.z);
 
     // Fix the tessellation vertex if we fetched the wrong one in order to guarantee we got the
-    // correct contour ID and flags.
+    // correct contour ID and flags, or if we belong to a mirrored contour and this vertex has an
+    // alternate position when mirrored.
+    uint mirroredContourFlag = contourIDWithFlags & MIRRORED_CONTOUR_FLAG;
+    if (mirroredContourFlag != 0u)
+    {
+        localVertexID = int(@a_mirroredVertexData.x);
+        outset = @a_mirroredVertexData.y;
+        fillCoverage = @a_mirroredVertexData.z;
+    }
     if (localVertexID != vertexIDOnContour)
     {
+        // This can peek one vertex before or after the contour, but the tessellator guarantees
+        // there is always at least one padding vertex at the beginning and end of the data.
         tessVertexIdx += localVertexID - vertexIDOnContour;
         uint4 replacementTessVertexData =
             TEXEL_FETCH(textures, @tessVertexTexture, tessTexelCoord(tessVertexIdx));
@@ -161,7 +173,10 @@
         {
             tessVertexData = replacementTessVertexData;
         }
-        contourIDWithFlags = tessVertexData.w;
+        // MIRRORED_CONTOUR_FLAG is not preserved at vertexIndex0. Preserve it here. By not
+        // preserving this flag, the normal and mirrored contour can both share the same contour
+        // record.
+        contourIDWithFlags = tessVertexData.w | mirroredContourFlag;
     }
 
     // Finish unpacking tessVertexData.
@@ -172,6 +187,9 @@
 
     if (strokeRadius != .0) // Is this a stroke?
     {
+        // Ensure strokes always emit clockwise triangles.
+        outset *= sign(determinant(mat));
+
         // Joins only emanate from the outer side of the stroke.
         if ((contourIDWithFlags & LEFT_JOIN_FLAG) != 0u)
             outset = min(outset, .0);
@@ -202,7 +220,11 @@
             // This vertex belongs to a miter or bevel join. Begin by finding the bisector, which is
             // the same as the miter line. The first two vertices in the join peek forward to figure
             // out the bisector, and the final two peek backward.
-            int peekDir = (contourIDWithFlags & JOIN_TANGENT_0_FLAG) != 0u ? 2 : -2;
+            int peekDir = 2;
+            if ((contourIDWithFlags & JOIN_TANGENT_0_FLAG) == 0u)
+                peekDir = -peekDir;
+            if ((contourIDWithFlags & MIRRORED_CONTOUR_FLAG) != 0u)
+                peekDir = -peekDir;
             int2 otherJoinTexelCoord = tessTexelCoord(tessVertexIdx + peekDir);
             uint4 otherJoinData = TEXEL_FETCH(textures, @tessVertexTexture, otherJoinTexelCoord);
             float otherJoinTheta = uintBitsToFloat(otherJoinData.z);
@@ -299,6 +321,9 @@
         // Offset the vertex for Manhattan AA.
         postTransformVertexOffset = sign(MUL(mat, outset * norm)) * AA_RADIUS;
 
+        if ((contourIDWithFlags & MIRRORED_CONTOUR_FLAG) != 0u)
+            fillCoverage = -fillCoverage;
+
         // "v_edgeDistance.y < 0" indicates to the fragment shader that this is a fill.
         v_edgeDistance = make_half2(fillCoverage, -1);
 
@@ -416,7 +441,7 @@
 PLS_DECL2F(3, clipBuffer);
 PLS_BLOCK_END
 
-PLS_MAIN(@drawFragmentMain, Varyings, varyings, FragmentTextures, textures, _pos, _clockwise)
+PLS_MAIN(@drawFragmentMain, Varyings, varyings, FragmentTextures, textures, _pos)
 {
     VARYING_UNPACK(varyings, v_paint, float4);
 #ifdef @DRAW_INTERIOR_TRIANGLES
@@ -465,12 +490,10 @@
 #ifdef @DRAW_INTERIOR_TRIANGLES
     coverageCount += v_windingWeight;
 #else
-    if (v_edgeDistance.y >= .0 /*stroke*/)
+    if (v_edgeDistance.y >= .0) // Stroke.
         coverageCount = max(min(v_edgeDistance.x, v_edgeDistance.y), coverageCount);
-    else if (_clockwise /*clockwise fill*/)
+    else // Fill. (Back-face culling ensures v_edgeDistance.x is appropriately signed.)
         coverageCount += v_edgeDistance.x;
-    else /*counterclockwise fill*/
-        coverageCount -= v_edgeDistance.x;
 
     // Save the updated coverage.
     PLS_STORE2F(coverageCountBuffer, v_pathID, coverageCount);
diff --git a/renderer/shaders/glsl.glsl b/renderer/shaders/glsl.glsl
index 3945131..109fb55 100644
--- a/renderer/shaders/glsl.glsl
+++ b/renderer/shaders/glsl.glsl
@@ -77,9 +77,9 @@
 #define ATTR_UNPACK(ID, attrs, NAME, TYPE)
 
 #ifdef @VERTEX
-#define VARYING(TYPE, NAME) out TYPE NAME
+#define VARYING(IDX, TYPE, NAME) out TYPE NAME
 #else
-#define VARYING(TYPE, NAME) in TYPE NAME
+#define VARYING(IDX, TYPE, NAME) in TYPE NAME
 #endif
 #define FLAT flat
 #define VARYING_BLOCK_BEGIN(NAME)
@@ -251,16 +251,14 @@
 #define EMIT_FRAG_DATA(VALUE) _fd = VALUE
 
 #ifdef @PLS_IMPL_RW_TEXTURE
-#define PLS_MAIN(NAME, Varyings, varyings, FragmentTextures, textures, _pos, _clockwise)           \
+#define PLS_MAIN(NAME, Varyings, varyings, FragmentTextures, textures, _pos)                       \
     void main()                                                                                    \
     {                                                                                              \
-        bool _clockwise = gl_FrontFacing;                                                          \
         highp ivec2 plsCoord = ivec2(floor(gl_FragCoord.xy));
 #else
-#define PLS_MAIN(NAME, Varyings, varyings, FragmentTextures, textures, _pos, _clockwise)           \
+#define PLS_MAIN(NAME, Varyings, varyings, FragmentTextures, textures, _pos)                       \
     void main()                                                                                    \
-    {                                                                                              \
-        bool _clockwise = gl_FrontFacing;
+    {
 #endif
 
 #define EMIT_PLS }
diff --git a/renderer/shaders/hlsl.glsl b/renderer/shaders/hlsl.glsl
index afe51c4..f31e58c 100644
--- a/renderer/shaders/hlsl.glsl
+++ b/renderer/shaders/hlsl.glsl
@@ -90,7 +90,7 @@
 #define NO_PERSPECTIVE noperspective
 #define @OPTIONALLY_FLAT nointerpolation
 #define FLAT nointerpolation
-#define VARYING(TYPE, NAME) TYPE NAME : NAME
+#define VARYING(IDX, TYPE, NAME) TYPE NAME : TEXCOORD##IDX
 
 #define VARYING_BLOCK_END(_pos)                                                                    \
     float4 _pos : SV_Position;                                                                     \
@@ -185,8 +185,8 @@
     return VALUE;                                                                                  \
     }
 
-#define PLS_MAIN(NAME, Varyings, varyings, FragmentTextures, textures, _pos, _clockwise)           \
-    [earlydepthstencil] void NAME(Varyings varyings, bool _clockwise : SV_IsFrontFace) {           \
+#define PLS_MAIN(NAME, Varyings, varyings, FragmentTextures, textures, _pos)                       \
+    [earlydepthstencil] void NAME(Varyings varyings) {                                             \
         int2 _plsCoord = int2(floor(varyings._pos.xy));
 
 #define EMIT_PLS }
diff --git a/renderer/shaders/metal.glsl b/renderer/shaders/metal.glsl
index 022cbab..14f8b90 100644
--- a/renderer/shaders/metal.glsl
+++ b/renderer/shaders/metal.glsl
@@ -76,7 +76,7 @@
 #define VARYING_BLOCK_BEGIN(N)                                                                     \
     struct N                                                                                       \
     {
-#define VARYING(TYPE, NAME) TYPE NAME
+#define VARYING(IDX, TYPE, NAME) TYPE NAME
 #define FLAT [[flat]]
 #define NO_PERSPECTIVE [[center_no_perspective]]
 // No-persective interpolation appears to break the guarantee that a varying == "x" when all
@@ -167,10 +167,9 @@
     return VALUE;                                                                                  \
     }
 
-#define PLS_MAIN(NAME, Varyings, varyings, FragmentTextures, textures, _pos, _clockwise)           \
+#define PLS_MAIN(NAME, Varyings, varyings, FragmentTextures, textures, _pos)                       \
     __attribute__((visibility("default"))) PLS fragment NAME(PLS _inpls,                           \
                                                              Varyings varyings [[stage_in]],       \
-                                                             bool _clockwise [[front_facing]],     \
                                                              FragmentTextures textures)            \
     {                                                                                              \
         PLS _pls;
diff --git a/renderer/shaders/minify.py b/renderer/shaders/minify.py
index 4c09129..5445c5b 100644
--- a/renderer/shaders/minify.py
+++ b/renderer/shaders/minify.py
@@ -286,10 +286,14 @@
 # renaming to names like, e.g., "rg".
 xyzw_pattern = re.compile(r"^[xyzw]{1,4}$")
 
+# HLSL registers (e.g., t0, u1) can't be overwritten by a #define.
+hlsl_register_pattern = re.compile(r"^[TtSsUu]\d+$")
+
 # can we rename to or from 'name'?
 def is_reserved_keyword(name):
     return name in glsl_reserved\
            or xyzw_pattern.match(name)\
+           or hlsl_register_pattern.match(name)\
            or name.startswith("$")\
            or name.startswith("gl_")\
            or name.startswith("__pixel_local")\
@@ -323,7 +327,7 @@
 # Exported variables only use upper case letters in their names. HLSL semantics are not case
 # sensitive and may also assign special meaning to numbers.
 upper_case_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-upper_case_name_generator = NameGenerator(upper_case_chars, upper_case_chars + '_')
+upper_case_name_generator = NameGenerator(upper_case_chars, upper_case_chars + "_")
 
 # Don't begin new names with the the '_' character. Internal code can begin names with '_' without
 # fear of renaming collisions.
diff --git a/renderer/shaders/tessellate.glsl b/renderer/shaders/tessellate.glsl
index 66c39c7..2e89ae7 100644
--- a/renderer/shaders/tessellate.glsl
+++ b/renderer/shaders/tessellate.glsl
@@ -13,20 +13,20 @@
 
 #ifdef @VERTEX
 ATTR_BLOCK_BEGIN(Attrs)
-ATTR(0, float4, @a_p0p1_);
+ATTR(0, float4, @a_p0p1_); // End in '_' because D3D interprets the '1' as a semantic index.
 ATTR(1, float4, @a_p2p3_);
-ATTR(2, float4, @a_joinTangent);
-ATTR(3, uint4, @a_args); // [x0, x1, y, polarSegmentCount, contourIDWithFlags]
+ATTR(2, float4, @a_joinTan_and_ys); // [joinTangent.xy, y, reflectionY]
+ATTR(3, uint4, @a_args);            // [x0x1, reflectionX0X1, segmentCounts, contourIDWithFlags]
 ATTR_BLOCK_END
 #endif
 
 VARYING_BLOCK_BEGIN(Varyings)
-NO_PERSPECTIVE VARYING(float4, v_p0p1);
-NO_PERSPECTIVE VARYING(float4, v_p2p3);
-NO_PERSPECTIVE VARYING(float4, v_args);     // [vertexIdx, totalVertexCount, joinSegmentCount,
-                                            //  parametricSegmentCount, radsPerPolarSegment]
-NO_PERSPECTIVE VARYING(float3, v_joinArgs); // [joinTangent, radsPerJoinSegment]
-FLAT VARYING(uint, v_contourIDWithFlags);
+NO_PERSPECTIVE VARYING(0, float4, v_p0p1);
+NO_PERSPECTIVE VARYING(1, float4, v_p2p3);
+NO_PERSPECTIVE VARYING(2, float4, v_args);     // [vertexIdx, totalVertexCount, joinSegmentCount,
+                                               //  parametricSegmentCount, radsPerPolarSegment]
+NO_PERSPECTIVE VARYING(3, float3, v_joinArgs); // [joinTangent, radsPerJoinSegment]
+FLAT VARYING(4, uint, v_contourIDWithFlags);
 VARYING_BLOCK_END(_pos)
 
 // Tangent of the curve at T=0 and T=1.
@@ -65,10 +65,11 @@
             _instanceID,
             _pos)
 {
-    ATTR_UNPACK(_instanceID, attrs, @a_p0p1_, float4);
-    ATTR_UNPACK(_instanceID, attrs, @a_p2p3_, float4);
-    ATTR_UNPACK(_instanceID, attrs, @a_joinTangent, float4);
-    ATTR_UNPACK(_instanceID, attrs, @a_args, uint4);
+    // Instance data is shared by two instances: even = normal patch(es), odd = reflection(s).
+    ATTR_UNPACK(_instanceID >> 1, attrs, @a_p0p1_, float4);
+    ATTR_UNPACK(_instanceID >> 1, attrs, @a_p2p3_, float4);
+    ATTR_UNPACK(_instanceID >> 1, attrs, @a_joinTan_and_ys, float4);
+    ATTR_UNPACK(_instanceID >> 1, attrs, @a_args, uint4);
 
     VARYING_INIT(varyings, v_p0p1, float4);
     VARYING_INIT(varyings, v_p2p3, float4);
@@ -80,15 +81,26 @@
     float2 p1 = @a_p0p1_.zw;
     float2 p2 = @a_p2p3_.xy;
     float2 p3 = @a_p2p3_.zw;
-    float x0 = float(int(@a_args.x << 16) >> 16);
-    float x1 = float(int(@a_args.x) >> 16);
-    float y = float(@a_args.y);
-    float2 coord = float2((_vertexID & 1) == 0 ? x0 : x1, (_vertexID & 2) == 0 ? y : y + 1.);
+    // Odd-numbered instances are reflections.
+    float y = (_instanceID & 1) == 0 ? @a_joinTan_and_ys.z : @a_joinTan_and_ys.w;
+    int x0x1 = int((_instanceID & 1) == 0 ? @a_args.x : @a_args.y);
+    float x0 = float(x0x1 << 16 >> 16);
+    float x1 = float(x0x1 >> 16);
+    float2 coord = float2((_vertexID & 1) == 0 ? x0 : x1, (_vertexID & 2) == 0 ? y + 1. : y);
 
     uint parametricSegmentCount = @a_args.z & 0x3ffu;
     uint polarSegmentCount = (@a_args.z >> 10) & 0x3ffu;
     uint joinSegmentCount = @a_args.z >> 20;
     uint contourIDWithFlags = @a_args.w;
+    if (x1 < x0) // Are we a reflection?
+    {
+        contourIDWithFlags |= MIRRORED_CONTOUR_FLAG;
+    }
+    if ((x1 - x0) * uniforms.tessInverseViewportY < .0)
+    {
+        // Make sure we always emit clockwise triangles. Swap the top and bottom vertices.
+        coord.y = 2. * y + 1. - coord.y;
+    }
     if ((contourIDWithFlags & CULL_EXCESS_TESSELLATION_SEGMENTS_FLAG) != 0u)
     {
         // This span may have more tessellation vertices allocated to it than necessary (e.g.,
@@ -124,13 +136,13 @@
 
     v_p0p1 = float4(p0, p1);
     v_p2p3 = float4(p2, p3);
-    v_args = float4(float(totalVertexCount) + coord.x - x1, // vertexIdx
-                    float(totalVertexCount),                // totalVertexCount
+    v_args = float4(float(totalVertexCount) - abs(x1 - coord.x), // vertexIdx
+                    float(totalVertexCount),                     // totalVertexCount
                     (joinSegmentCount << 10) | parametricSegmentCount,
                     radsPerPolarSegment);
     if (joinSegmentCount > 1u)
     {
-        float2x2 joinTangents = float2x2(tangents[1], @a_joinTangent.xy);
+        float2x2 joinTangents = float2x2(tangents[1], @a_joinTan_and_ys.xy);
         float joinTheta = acos(cosine_between_vectors(joinTangents[0], joinTangents[1]));
         float joinSpan = float(joinSegmentCount);
         if ((contourIDWithFlags & (JOIN_TYPE_MASK | EMULATED_STROKE_CAP_FLAG)) ==
@@ -145,7 +157,7 @@
         float radsPerJoinSegment = joinTheta / joinSpan;
         if (determinant(joinTangents) < .0)
             radsPerJoinSegment = -radsPerJoinSegment;
-        v_joinArgs.xy = @a_joinTangent.xy;
+        v_joinArgs.xy = @a_joinTan_and_ys.xy;
         v_joinArgs.z = radsPerJoinSegment;
     }
     v_contourIDWithFlags = contourIDWithFlags;