/*
* Copyright 2022 Rive
*/
#include "rive/pls/pls_render_context.hpp"
#include "gr_inner_fan_triangulator.hpp"
#include "intersection_board.hpp"
#include "pls_paint.hpp"
#include "rive/pls/pls_draw.hpp"
#include "rive/pls/pls_image.hpp"
#include "rive/pls/pls_render_context_impl.hpp"
#include "shaders/constants.glsl"
#include <string_view>
namespace rive::pls
{
constexpr size_t kDefaultSimpleGradientCapacity = 512;
constexpr size_t kDefaultComplexGradientCapacity = 1024;
constexpr size_t kDefaultDrawCapacity = 2048;
constexpr uint32_t kMaxTextureHeight = 2048; // TODO: Move this variable to PlatformFeatures.
constexpr size_t kMaxTessellationVertexCount = kMaxTextureHeight * kTessTextureWidth;
constexpr size_t kMaxTessellationPaddingVertexCount =
pls::kMidpointFanPatchSegmentSpan + // Padding at the beginning of the tess texture
(pls::kOuterCurvePatchSegmentSpan - 1) + // Max padding between patch types in the tess texture
1; // Padding at the end of the tessellation texture
constexpr size_t kMaxTessellationVertexCountBeforePadding =
kMaxTessellationVertexCount - kMaxTessellationPaddingVertexCount;
// Metal requires vertex buffers to be 256-byte aligned.
constexpr size_t kMaxTessellationAlignmentVertices = pls::kTessVertexBufferAlignmentInElements - 1;
// We can only reorder 32767 draws at a time since the one-based groupIndex returned by
// IntersectionBoard is a signed 16-bit integer.
constexpr size_t kMaxReorderedDrawCount = std::numeric_limits<int16_t>::max();
// How tall to make a resource texture in order to support the given number of items.
template <size_t WidthInItems> constexpr static size_t resource_texture_height(size_t itemCount)
{
return (itemCount + WidthInItems - 1) / WidthInItems;
}
constexpr static size_t gradient_data_height(size_t simpleRampCount, size_t complexRampCount)
{
return resource_texture_height<pls::kGradTextureWidthInSimpleRamps>(simpleRampCount) +
complexRampCount;
}
inline GradientContentKey::GradientContentKey(rcp<const PLSGradient> gradient) :
m_gradient(std::move(gradient))
{}
inline GradientContentKey::GradientContentKey(GradientContentKey&& other) :
m_gradient(std::move(other.m_gradient))
{}
bool GradientContentKey::operator==(const GradientContentKey& other) const
{
if (m_gradient.get() == other.m_gradient.get())
{
return true;
}
else
{
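// Distinct gradient objects can still compare equal if their stop positions and colors match
// byte-for-byte.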
return m_gradient->count() == other.m_gradient->count() &&
!memcmp(m_gradient->stops(),
other.m_gradient->stops(),
m_gradient->count() * sizeof(float)) &&
!memcmp(m_gradient->colors(),
other.m_gradient->colors(),
m_gradient->count() * sizeof(ColorInt));
}
}
size_t DeepHashGradient::operator()(const GradientContentKey& key) const
{
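// Hash the raw bytes of the stop and color arrays separately, then combine the two hashes.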
const PLSGradient* grad = key.gradient();
std::hash<std::string_view> hash;
size_t x = hash(std::string_view(reinterpret_cast<const char*>(grad->stops()),
grad->count() * sizeof(float)));
size_t y = hash(std::string_view(reinterpret_cast<const char*>(grad->colors()),
grad->count() * sizeof(ColorInt)));
return x ^ y;
}
PLSRenderContext::PLSRenderContext(std::unique_ptr<PLSRenderContextImpl> impl) :
m_impl(std::move(impl)),
// Subtract 1 from MaxPathID so we can reserve a path record for the clearColor paint (used by
// atomic mode). This also allows us to index the storage buffers directly by pathID.
m_maxPathID(MaxPathID(m_impl->platformFeatures().pathIDGranularity) - 1)
{
setResourceSizes(ResourceAllocationCounts(), /*forceRealloc =*/true);
releaseResources();
}
PLSRenderContext::~PLSRenderContext()
{
// Always call flush() to avoid deadlock.
assert(!m_didBeginFrame);
// Delete the logical flushes before the block allocators let go of their allocations.
m_logicalFlushes.clear();
}
const pls::PlatformFeatures& PLSRenderContext::platformFeatures() const
{
return m_impl->platformFeatures();
}
rcp<RenderBuffer> PLSRenderContext::makeRenderBuffer(RenderBufferType type,
RenderBufferFlags flags,
size_t sizeInBytes)
{
return m_impl->makeRenderBuffer(type, flags, sizeInBytes);
}
rcp<RenderImage> PLSRenderContext::decodeImage(Span<const uint8_t> encodedBytes)
{
rcp<PLSTexture> texture = m_impl->decodeImageTexture(encodedBytes);
return texture != nullptr ? make_rcp<PLSImage>(std::move(texture)) : nullptr;
}
void PLSRenderContext::releaseResources()
{
assert(!m_didBeginFrame);
resetContainers();
setResourceSizes(ResourceAllocationCounts());
m_maxRecentResourceRequirements = ResourceAllocationCounts();
m_lastResourceTrimTimeInSeconds = m_impl->secondsNow();
}
void PLSRenderContext::resetContainers()
{
assert(!m_didBeginFrame);
if (!m_logicalFlushes.empty())
{
assert(m_logicalFlushes.size() == 1); // Should get reset to 1 after flush().
m_logicalFlushes.resize(1);
m_logicalFlushes.front()->resetContainers();
}
m_indirectDrawList.clear();
m_indirectDrawList.shrink_to_fit();
m_intersectionBoard = nullptr;
}
PLSRenderContext::LogicalFlush::LogicalFlush(PLSRenderContext* parent) : m_ctx(parent) { rewind(); }
void PLSRenderContext::LogicalFlush::rewind()
{
m_resourceCounts = PLSDraw::ResourceCounters();
m_simpleGradients.clear();
m_pendingSimpleGradientWrites.clear();
m_complexGradients.clear();
m_pendingComplexColorRampDraws.clear();
m_clips.clear();
m_plsDraws.clear();
m_combinedDrawBounds = {std::numeric_limits<int32_t>::max(),
std::numeric_limits<int32_t>::max(),
std::numeric_limits<int32_t>::min(),
std::numeric_limits<int32_t>::min()};
m_pathPaddingCount = 0;
m_paintPaddingCount = 0;
m_paintAuxPaddingCount = 0;
m_contourPaddingCount = 0;
m_gradSpanPaddingCount = 0;
m_midpointFanTessEndLocation = 0;
m_outerCubicTessEndLocation = 0;
m_outerCubicTessVertexIdx = 0;
m_midpointFanTessVertexIdx = 0;
m_flushDesc = FlushDescriptor();
m_drawList.reset();
m_combinedShaderFeatures = pls::ShaderFeatures::NONE;
m_currentPathIsStroked = false;
m_currentPathContourDirections = pls::ContourDirections::none;
m_currentPathID = 0;
m_currentContourID = 0;
m_currentContourPaddingVertexCount = 0;
m_pathTessLocation = 0;
m_pathMirroredTessLocation = 0;
RIVE_DEBUG_CODE(m_expectedPathTessLocationAtEndOfPath = 0;)
RIVE_DEBUG_CODE(m_expectedPathMirroredTessLocationAtEndOfPath = 0;)
RIVE_DEBUG_CODE(m_pathCurveCount = 0;)
m_currentZIndex = 0;
RIVE_DEBUG_CODE(m_hasDoneLayout = false;)
}
void PLSRenderContext::LogicalFlush::resetContainers()
{
m_clips.clear();
m_clips.shrink_to_fit();
m_plsDraws.clear();
m_plsDraws.shrink_to_fit();
m_plsDraws.reserve(kDefaultDrawCapacity);
m_simpleGradients.rehash(0);
m_simpleGradients.reserve(kDefaultSimpleGradientCapacity);
m_pendingSimpleGradientWrites.clear();
m_pendingSimpleGradientWrites.shrink_to_fit();
m_pendingSimpleGradientWrites.reserve(kDefaultSimpleGradientCapacity);
m_complexGradients.rehash(0);
m_complexGradients.reserve(kDefaultComplexGradientCapacity);
m_pendingComplexColorRampDraws.clear();
m_pendingComplexColorRampDraws.shrink_to_fit();
m_pendingComplexColorRampDraws.reserve(kDefaultComplexGradientCapacity);
}
void PLSRenderContext::beginFrame(const FrameDescriptor& frameDescriptor)
{
assert(!m_didBeginFrame);
assert(frameDescriptor.renderTargetWidth > 0);
assert(frameDescriptor.renderTargetHeight > 0);
m_frameDescriptor = frameDescriptor;
if (!platformFeatures().supportsPixelLocalStorage)
{
// Use 4x MSAA if we don't have pixel local storage and MSAA wasn't specified.
m_frameDescriptor.msaaSampleCount =
m_frameDescriptor.msaaSampleCount > 0 ? m_frameDescriptor.msaaSampleCount : 4;
}
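// Select the frame's interlock mode: depthStencil whenever MSAA is in play, atomics when raster
// ordering is unavailable or explicitly disabled, and rasterOrdering otherwise.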
if (m_frameDescriptor.msaaSampleCount > 0)
{
m_frameInterlockMode = pls::InterlockMode::depthStencil;
}
else if (m_frameDescriptor.disableRasterOrdering || !platformFeatures().supportsRasterOrdering)
{
m_frameInterlockMode = pls::InterlockMode::atomics;
}
else
{
m_frameInterlockMode = pls::InterlockMode::rasterOrdering;
}
m_frameShaderFeaturesMask = pls::ShaderFeaturesMaskFor(m_frameInterlockMode);
if (m_logicalFlushes.empty())
{
m_logicalFlushes.emplace_back(new LogicalFlush(this));
}
RIVE_DEBUG_CODE(m_didBeginFrame = true);
}
bool PLSRenderContext::isOutsideCurrentFrame(const IAABB& pixelBounds)
{
assert(m_didBeginFrame);
int4 bounds = simd::load4i(&pixelBounds);
auto renderTargetSize = simd::cast<int32_t>(
uint2{m_frameDescriptor.renderTargetWidth, m_frameDescriptor.renderTargetHeight});
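// The draw is outside the frame if its bounds lie entirely off-screen or are empty
// (left >= right or top >= bottom).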
return simd::any(bounds.xy >= renderTargetSize || bounds.zw <= 0 || bounds.xy >= bounds.zw);
}
bool PLSRenderContext::frameSupportsClipRects() const
{
assert(m_didBeginFrame);
return m_frameInterlockMode != pls::InterlockMode::depthStencil ||
platformFeatures().supportsClipPlanes;
}
bool PLSRenderContext::frameSupportsImagePaintForPaths() const
{
assert(m_didBeginFrame);
return m_frameInterlockMode != pls::InterlockMode::atomics ||
platformFeatures().supportsBindlessTextures;
}
uint32_t PLSRenderContext::generateClipID(const IAABB& contentBounds)
{
assert(m_didBeginFrame);
assert(!m_logicalFlushes.empty());
return m_logicalFlushes.back()->generateClipID(contentBounds);
}
uint32_t PLSRenderContext::LogicalFlush::generateClipID(const IAABB& contentBounds)
{
if (m_clips.size() < m_ctx->m_maxPathID) // maxClipID == maxPathID.
{
m_clips.emplace_back(contentBounds);
assert(m_ctx->m_clipContentID != m_clips.size());
return math::lossless_numeric_cast<uint32_t>(m_clips.size());
}
return 0; // There are no available clip IDs. The caller should flush and try again.
}
PLSRenderContext::LogicalFlush::ClipInfo& PLSRenderContext::LogicalFlush::getWritableClipInfo(
uint32_t clipID)
{
assert(clipID > 0);
assert(clipID <= m_clips.size());
return m_clips[clipID - 1];
}
void PLSRenderContext::LogicalFlush::addClipReadBounds(uint32_t clipID, const IAABB& bounds)
{
assert(clipID > 0);
assert(clipID <= m_clips.size());
ClipInfo& clipInfo = getWritableClipInfo(clipID);
clipInfo.readBounds = clipInfo.readBounds.join(bounds);
}
bool PLSRenderContext::pushDrawBatch(PLSDrawUniquePtr draws[], size_t drawCount)
{
assert(m_didBeginFrame);
assert(!m_logicalFlushes.empty());
return m_logicalFlushes.back()->pushDrawBatch(draws, drawCount);
}
bool PLSRenderContext::LogicalFlush::pushDrawBatch(PLSDrawUniquePtr draws[], size_t drawCount)
{
assert(!m_hasDoneLayout);
if (m_flushDesc.interlockMode == pls::InterlockMode::atomics &&
m_drawList.count() + drawCount > kMaxReorderedDrawCount)
{
// We can only reorder kMaxReorderedDrawCount (32767) draws at a time since the one-based
// groupIndex returned by IntersectionBoard is a signed 16-bit integer.
return false;
}
auto countsVector = m_resourceCounts.toVec();
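// Accumulate the new draws' resource requirements on top of this flush's existing counts.
// (toVec() packs the counters into a SIMD vector so they can all be summed at once.)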
for (size_t i = 0; i < drawCount; ++i)
{
assert(!draws[i]->pixelBounds().empty());
assert(m_ctx->frameSupportsClipRects() || draws[i]->clipRectInverseMatrix() == nullptr);
countsVector += draws[i]->resourceCounts().toVec();
}
PLSDraw::ResourceCounters countsWithNewBatch = countsVector;
// Textures have hard size limits. If the new batch doesn't fit in one of the textures, the
// caller needs to flush and try again.
if (countsWithNewBatch.pathCount > m_ctx->m_maxPathID ||
countsWithNewBatch.contourCount > kMaxContourID ||
countsWithNewBatch.midpointFanTessVertexCount +
countsWithNewBatch.outerCubicTessVertexCount >
kMaxTessellationVertexCountBeforePadding)
{
return false;
}
// Allocate spans in the gradient texture.
for (size_t i = 0; i < drawCount; ++i)
{
if (!draws[i]->allocateGradientIfNeeded(this, &countsWithNewBatch))
{
// The gradient doesn't fit. Give up and let the caller flush and try again.
return false;
}
}
for (size_t i = 0; i < drawCount; ++i)
{
m_plsDraws.push_back(std::move(draws[i]));
m_combinedDrawBounds = m_combinedDrawBounds.join(m_plsDraws.back()->pixelBounds());
}
m_resourceCounts = countsWithNewBatch;
return true;
}
bool PLSRenderContext::LogicalFlush::allocateGradient(const PLSGradient* gradient,
PLSDraw::ResourceCounters* counters,
pls::ColorRampLocation* colorRampLocation)
{
assert(!m_hasDoneLayout);
const float* stops = gradient->stops();
size_t stopCount = gradient->count();
if (stopCount == 2 && stops[0] == 0 && stops[1] == 1)
{
// This is a simple gradient that can be implemented by a two-texel color ramp.
uint64_t simpleKey;
static_assert(sizeof(simpleKey) == sizeof(ColorInt) * 2);
RIVE_INLINE_MEMCPY(&simpleKey, gradient->colors(), sizeof(ColorInt) * 2);
uint32_t rampTexelsIdx;
auto iter = m_simpleGradients.find(simpleKey);
if (iter != m_simpleGradients.end())
{
rampTexelsIdx = iter->second; // This gradient is already in the texture.
}
else
{
if (gradient_data_height(m_simpleGradients.size() + 1, m_complexGradients.size()) >
kMaxTextureHeight)
{
// We ran out of rows in the gradient texture. Caller has to flush and try again.
return false;
}
rampTexelsIdx = math::lossless_numeric_cast<uint32_t>(m_simpleGradients.size() * 2);
m_simpleGradients.insert({simpleKey, rampTexelsIdx});
m_pendingSimpleGradientWrites.emplace_back().set(gradient->colors());
}
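// Simple ramps are two texels wide, so convert the flat texel index into a row/column within the
// gradient texture.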
colorRampLocation->row = rampTexelsIdx / kGradTextureWidth;
colorRampLocation->col = rampTexelsIdx % kGradTextureWidth;
}
else
{
// This is a complex gradient. Render it to an entire row of the gradient texture.
GradientContentKey key(ref_rcp(gradient));
auto iter = m_complexGradients.find(key);
uint16_t row;
if (iter != m_complexGradients.end())
{
row = iter->second; // This gradient is already in the texture.
}
else
{
if (gradient_data_height(m_simpleGradients.size(), m_complexGradients.size() + 1) >
kMaxTextureHeight)
{
// We ran out of rows in the gradient texture. Caller has to flush and try again.
return false;
}
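// One GradientSpan per stop (including a leading half-pixel cap), plus a final span that
// extends the last color to the right edge of the row.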
size_t spanCount = stopCount + 1;
counters->complexGradientSpanCount += spanCount;
row = static_cast<uint16_t>(m_complexGradients.size());
m_complexGradients.emplace(std::move(key), row);
m_pendingComplexColorRampDraws.push_back(gradient);
}
colorRampLocation->row = row;
colorRampLocation->col = ColorRampLocation::kComplexGradientMarker;
}
return true;
}
void PLSRenderContext::logicalFlush()
{
assert(m_didBeginFrame);
// Reset clipping state after every logical flush because the clip buffer is not preserved
// between render passes.
m_clipContentID = 0;
// Don't issue any GPU commands between logical flushes. Instead, build up a list of flushes
// that we will submit all at once at the end of the frame.
m_logicalFlushes.emplace_back(new LogicalFlush(this));
}
void PLSRenderContext::flush(const FlushResources& flushResources)
{
assert(m_didBeginFrame);
assert(flushResources.renderTarget->width() == m_frameDescriptor.renderTargetWidth);
assert(flushResources.renderTarget->height() == m_frameDescriptor.renderTargetHeight);
m_clipContentID = 0;
// Layout this frame's resource buffers and textures.
LogicalFlush::ResourceCounters totalFrameResourceCounts;
LogicalFlush::LayoutCounters layoutCounts;
for (size_t i = 0; i < m_logicalFlushes.size(); ++i)
{
m_logicalFlushes[i]->layoutResources(flushResources,
i,
i == m_logicalFlushes.size() - 1,
&totalFrameResourceCounts,
&layoutCounts);
}
assert(layoutCounts.maxGradTextureHeight <= kMaxTextureHeight);
assert(layoutCounts.maxTessTextureHeight <= kMaxTextureHeight);
// Determine the minimum required resource allocation sizes to service this flush.
ResourceAllocationCounts allocs;
allocs.flushUniformBufferCount = m_logicalFlushes.size();
allocs.imageDrawUniformBufferCount = totalFrameResourceCounts.imageDrawCount;
allocs.pathBufferCount = totalFrameResourceCounts.pathCount + layoutCounts.pathPaddingCount;
allocs.paintBufferCount = totalFrameResourceCounts.pathCount + layoutCounts.paintPaddingCount;
allocs.paintAuxBufferCount =
totalFrameResourceCounts.pathCount + layoutCounts.paintAuxPaddingCount;
allocs.contourBufferCount =
totalFrameResourceCounts.contourCount + layoutCounts.contourPaddingCount;
// The gradient texture needs to be updated in entire rows at a time. Extend its
// texture-transfer buffer's length in order to be able to serve a worst-case scenario.
allocs.simpleGradientBufferCount =
layoutCounts.simpleGradCount + pls::kGradTextureWidthInSimpleRamps - 1;
allocs.complexGradSpanBufferCount =
totalFrameResourceCounts.complexGradientSpanCount + layoutCounts.gradSpanPaddingCount;
allocs.tessSpanBufferCount = totalFrameResourceCounts.maxTessellatedSegmentCount;
allocs.triangleVertexBufferCount = totalFrameResourceCounts.maxTriangleVertexCount;
allocs.gradTextureHeight = layoutCounts.maxGradTextureHeight;
allocs.tessTextureHeight = layoutCounts.maxTessTextureHeight;
// Track m_maxRecentResourceRequirements so we can trim GPU allocations when steady-state usage
// goes down.
m_maxRecentResourceRequirements =
simd::max(allocs.toVec(), m_maxRecentResourceRequirements.toVec());
// Grow resources enough to handle this flush.
// If "allocs" already fits in our current allocations, then don't change them.
// If they don't fit, overallocate by 25% in order to create some slack for growth.
allocs = simd::if_then_else(allocs.toVec() <= m_currentResourceAllocations.toVec(),
m_currentResourceAllocations.toVec(),
allocs.toVec() * size_t(5) / size_t(4));
// Additionally, every 5 seconds, trim resources down to the most recent steady-state usage.
double flushTime = m_impl->secondsNow();
bool needsResourceTrim = flushTime - m_lastResourceTrimTimeInSeconds >= 5;
if (needsResourceTrim)
{
// Trim GPU resource allocations to 125% of their maximum recent usage, and only if the
// recent usage is 2/3 or less of the current allocation.
allocs = simd::if_then_else(m_maxRecentResourceRequirements.toVec() <=
allocs.toVec() * size_t(2) / size_t(3),
m_maxRecentResourceRequirements.toVec() * size_t(5) / size_t(4),
allocs.toVec());
// Zero out m_maxRecentResourceRequirements for the next interval.
m_maxRecentResourceRequirements = ResourceAllocationCounts();
m_lastResourceTrimTimeInSeconds = flushTime;
}
setResourceSizes(allocs);
// Write out the GPU buffers for this frame.
mapResourceBuffers(allocs);
for (const auto& flush : m_logicalFlushes)
{
flush->writeResources();
}
assert(m_flushUniformData.elementsWritten() == m_logicalFlushes.size());
assert(m_imageDrawUniformData.elementsWritten() == totalFrameResourceCounts.imageDrawCount);
assert(m_pathData.elementsWritten() ==
totalFrameResourceCounts.pathCount + layoutCounts.pathPaddingCount);
assert(m_paintData.elementsWritten() ==
totalFrameResourceCounts.pathCount + layoutCounts.paintPaddingCount);
assert(m_paintAuxData.elementsWritten() ==
totalFrameResourceCounts.pathCount + layoutCounts.paintAuxPaddingCount);
assert(m_contourData.elementsWritten() ==
totalFrameResourceCounts.contourCount + layoutCounts.contourPaddingCount);
assert(m_simpleColorRampsData.elementsWritten() == layoutCounts.simpleGradCount);
assert(m_gradSpanData.elementsWritten() ==
totalFrameResourceCounts.complexGradientSpanCount + layoutCounts.gradSpanPaddingCount);
assert(m_tessSpanData.elementsWritten() <= totalFrameResourceCounts.maxTessellatedSegmentCount);
assert(m_triangleVertexData.elementsWritten() <=
totalFrameResourceCounts.maxTriangleVertexCount);
unmapResourceBuffers();
// Issue logical flushes to the backend.
for (const auto& flush : m_logicalFlushes)
{
m_impl->flush(flush->desc());
}
if (!m_logicalFlushes.empty())
{
m_logicalFlushes.resize(1);
m_logicalFlushes.front()->rewind();
}
// Drop all memory that was allocated for this frame using TrivialBlockAllocator.
m_perFrameAllocator.reset();
m_numChopsAllocator.reset();
m_chopVerticesAllocator.reset();
m_tangentPairsAllocator.reset();
m_polarSegmentCountsAllocator.reset();
m_parametricSegmentCountsAllocator.reset();
m_frameDescriptor = FrameDescriptor();
RIVE_DEBUG_CODE(m_didBeginFrame = false;)
// Wait to reset CPU-side containers until after the flush has finished.
if (needsResourceTrim)
{
resetContainers();
}
}
void PLSRenderContext::LogicalFlush::layoutResources(const FlushResources& flushResources,
size_t logicalFlushIdx,
bool isFinalFlushOfFrame,
ResourceCounters* runningFrameResourceCounts,
LayoutCounters* runningFrameLayoutCounts)
{
assert(!m_hasDoneLayout);
const FrameDescriptor& frameDescriptor = m_ctx->frameDescriptor();
// Reserve a path record for the clearColor paint (used by atomic mode).
// This also allows us to index the storage buffers directly by pathID.
++m_resourceCounts.pathCount;
// Storage buffer offsets are required to be aligned on multiples of 256.
m_pathPaddingCount =
pls::PaddingToAlignUp<pls::kPathBufferAlignmentInElements>(m_resourceCounts.pathCount);
m_paintPaddingCount =
pls::PaddingToAlignUp<pls::kPaintBufferAlignmentInElements>(m_resourceCounts.pathCount);
m_paintAuxPaddingCount =
pls::PaddingToAlignUp<pls::kPaintAuxBufferAlignmentInElements>(m_resourceCounts.pathCount);
m_contourPaddingCount = pls::PaddingToAlignUp<pls::kContourBufferAlignmentInElements>(
m_resourceCounts.contourCount);
// Metal requires vertex buffers to be 256-byte aligned.
m_gradSpanPaddingCount = pls::PaddingToAlignUp<pls::kGradSpanBufferAlignmentInElements>(
m_resourceCounts.complexGradientSpanCount);
size_t totalTessVertexCountWithPadding = 0;
if ((m_resourceCounts.midpointFanTessVertexCount |
m_resourceCounts.outerCubicTessVertexCount) != 0)
{
// midpointFan tessellation vertices reside at the beginning of the tessellation texture,
// after 1 patch of padding vertices.
constexpr uint32_t kPrePadding = pls::kMidpointFanPatchSegmentSpan;
m_midpointFanTessVertexIdx = kPrePadding;
m_midpointFanTessEndLocation =
m_midpointFanTessVertexIdx +
math::lossless_numeric_cast<uint32_t>(m_resourceCounts.midpointFanTessVertexCount);
// outerCubic tessellation vertices reside after the midpointFan vertices, aligned on a
// multiple of the outerCubic patch size.
uint32_t interiorPadding =
PaddingToAlignUp<pls::kOuterCurvePatchSegmentSpan>(m_midpointFanTessEndLocation);
m_outerCubicTessVertexIdx = m_midpointFanTessEndLocation + interiorPadding;
m_outerCubicTessEndLocation =
m_outerCubicTessVertexIdx +
math::lossless_numeric_cast<uint32_t>(m_resourceCounts.outerCubicTessVertexCount);
// We need one more padding vertex after all the tessellation vertices.
constexpr uint32_t kPostPadding = 1;
totalTessVertexCountWithPadding = m_outerCubicTessEndLocation + kPostPadding;
assert(kPrePadding + interiorPadding + kPostPadding <= kMaxTessellationPaddingVertexCount);
assert(totalTessVertexCountWithPadding <= kMaxTessellationVertexCount);
}
uint32_t tessDataHeight = math::lossless_numeric_cast<uint32_t>(
resource_texture_height<kTessTextureWidth>(totalTessVertexCountWithPadding));
if (m_resourceCounts.maxTessellatedSegmentCount != 0)
{
// Conservatively account for line breaks and padding in the tessellation span count.
// Line breaks potentially introduce a new span. Count the maximum number of line breaks we
// might encounter, which is at most TWO for every line in the tessellation texture (one for
// a forward span, and one for its reflection).
size_t maxSpanBreakCount = tessDataHeight * 2;
// The tessellation texture requires 3 separate spans of padding vertices (see above and
// below).
constexpr size_t kPaddingSpanCount = 3;
m_resourceCounts.maxTessellatedSegmentCount +=
maxSpanBreakCount + kPaddingSpanCount + kMaxTessellationAlignmentVertices;
}
m_flushDesc.renderTarget = flushResources.renderTarget;
m_flushDesc.interlockMode = m_ctx->frameInterlockMode();
m_flushDesc.msaaSampleCount = frameDescriptor.msaaSampleCount;
// In atomic mode, we may be able to skip the explicit clear of the color buffer and fold it
// into the atomic "resolve" operation instead.
bool doClearDuringAtomicResolve = false;
if (logicalFlushIdx != 0)
{
// We always have to preserve the renderTarget between logical flushes.
m_flushDesc.colorLoadAction = pls::LoadAction::preserveRenderTarget;
}
else if (frameDescriptor.loadAction == pls::LoadAction::clear)
{
// In atomic mode, we can clear during the resolve operation if the clearColor is opaque
// (because we neither want nor have a "source only" blend mode).
doClearDuringAtomicResolve = m_ctx->frameInterlockMode() == pls::InterlockMode::atomics &&
colorAlpha(frameDescriptor.clearColor) == 255;
m_flushDesc.colorLoadAction =
doClearDuringAtomicResolve ? pls::LoadAction::dontCare : pls::LoadAction::clear;
}
else
{
m_flushDesc.colorLoadAction = frameDescriptor.loadAction;
}
m_flushDesc.clearColor = frameDescriptor.clearColor;
if (doClearDuringAtomicResolve)
{
// In atomic mode we can accomplish a clear of the color buffer while the shader resolves
// coverage, instead of actually clearing. writeResources() will configure the fill for
// pathID=0 to be a solid fill matching the clearColor, so if we just initialize coverage
// buffer to solid coverage with pathID=0, the resolve step will write out the correct clear
// color.
assert(m_flushDesc.interlockMode == pls::InterlockMode::atomics);
m_flushDesc.coverageClearValue = static_cast<uint32_t>(FIXED_COVERAGE_ONE);
}
else if (m_flushDesc.interlockMode == pls::InterlockMode::atomics)
{
// When we don't skip the initial clear in atomic mode, clear the coverage buffer to
// pathID=0 and a transparent coverage value.
// pathID=0 meets the requirement that pathID is always monotonically increasing.
// Transparent coverage makes sure the clearColor doesn't get written out while resolving.
m_flushDesc.coverageClearValue = static_cast<uint32_t>(FIXED_COVERAGE_ZERO);
}
else
{
// In non-atomic mode, the coverage buffer just needs to be initialized with "pathID=0" to
// avoid collisions with any pathIDs being rendered.
m_flushDesc.coverageClearValue = 0;
}
if (doClearDuringAtomicResolve || m_flushDesc.colorLoadAction == pls::LoadAction::clear)
{
// If we're clearing then we always update the entire render target.
m_flushDesc.renderTargetUpdateBounds = m_flushDesc.renderTarget->bounds();
}
else
{
// When we don't clear, we only update the draw bounds.
m_flushDesc.renderTargetUpdateBounds =
m_flushDesc.renderTarget->bounds().intersect(m_combinedDrawBounds);
}
if (m_flushDesc.renderTargetUpdateBounds.empty())
{
// If this is empty it means there are no draws and no clear.
m_flushDesc.renderTargetUpdateBounds = {0, 0, 0, 0};
}
m_flushDesc.flushUniformDataOffsetInBytes = logicalFlushIdx * sizeof(pls::FlushUniforms);
m_flushDesc.pathCount = math::lossless_numeric_cast<uint32_t>(m_resourceCounts.pathCount);
m_flushDesc.firstPath =
runningFrameResourceCounts->pathCount + runningFrameLayoutCounts->pathPaddingCount;
m_flushDesc.firstPaint =
runningFrameResourceCounts->pathCount + runningFrameLayoutCounts->paintPaddingCount;
m_flushDesc.firstPaintAux =
runningFrameResourceCounts->pathCount + runningFrameLayoutCounts->paintAuxPaddingCount;
m_flushDesc.contourCount = math::lossless_numeric_cast<uint32_t>(m_resourceCounts.contourCount);
m_flushDesc.firstContour =
runningFrameResourceCounts->contourCount + runningFrameLayoutCounts->contourPaddingCount;
m_flushDesc.complexGradSpanCount =
math::lossless_numeric_cast<uint32_t>(m_resourceCounts.complexGradientSpanCount);
m_flushDesc.firstComplexGradSpan = runningFrameResourceCounts->complexGradientSpanCount +
runningFrameLayoutCounts->gradSpanPaddingCount;
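// Simple gradients are packed two texels per ramp; the reported width covers at most one full row.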
m_flushDesc.simpleGradTexelsWidth =
std::min<uint32_t>(math::lossless_numeric_cast<uint32_t>(m_simpleGradients.size()),
pls::kGradTextureWidthInSimpleRamps) *
2;
m_flushDesc.simpleGradTexelsHeight = static_cast<uint32_t>(
resource_texture_height<pls::kGradTextureWidthInSimpleRamps>(m_simpleGradients.size()));
m_flushDesc.simpleGradDataOffsetInBytes =
runningFrameLayoutCounts->simpleGradCount * sizeof(pls::TwoTexelRamp);
m_flushDesc.complexGradRowsTop = m_flushDesc.simpleGradTexelsHeight;
m_flushDesc.complexGradRowsHeight =
math::lossless_numeric_cast<uint32_t>(m_complexGradients.size());
m_flushDesc.tessDataHeight = tessDataHeight;
m_flushDesc.externalCommandBuffer = flushResources.externalCommandBuffer;
if (isFinalFlushOfFrame)
{
m_flushDesc.frameCompletionFence = flushResources.frameCompletionFence;
}
m_flushDesc.wireframe = frameDescriptor.wireframe;
m_flushDesc.isFinalFlushOfFrame = isFinalFlushOfFrame;
*runningFrameResourceCounts = runningFrameResourceCounts->toVec() + m_resourceCounts.toVec();
runningFrameLayoutCounts->pathPaddingCount += m_pathPaddingCount;
runningFrameLayoutCounts->paintPaddingCount += m_paintPaddingCount;
runningFrameLayoutCounts->paintAuxPaddingCount += m_paintAuxPaddingCount;
runningFrameLayoutCounts->contourPaddingCount += m_contourPaddingCount;
runningFrameLayoutCounts->simpleGradCount += m_simpleGradients.size();
runningFrameLayoutCounts->gradSpanPaddingCount += m_gradSpanPaddingCount;
runningFrameLayoutCounts->maxGradTextureHeight =
std::max(m_flushDesc.simpleGradTexelsHeight + m_flushDesc.complexGradRowsHeight,
runningFrameLayoutCounts->maxGradTextureHeight);
runningFrameLayoutCounts->maxTessTextureHeight =
std::max(m_flushDesc.tessDataHeight, runningFrameLayoutCounts->maxTessTextureHeight);
assert(m_flushDesc.firstPath % pls::kPathBufferAlignmentInElements == 0);
assert(m_flushDesc.firstPaint % pls::kPaintBufferAlignmentInElements == 0);
assert(m_flushDesc.firstPaintAux % pls::kPaintAuxBufferAlignmentInElements == 0);
assert(m_flushDesc.firstContour % pls::kContourBufferAlignmentInElements == 0);
assert(m_flushDesc.firstComplexGradSpan % pls::kGradSpanBufferAlignmentInElements == 0);
RIVE_DEBUG_CODE(m_hasDoneLayout = true;)
}
void PLSRenderContext::LogicalFlush::writeResources()
{
const pls::PlatformFeatures& platformFeatures = m_ctx->platformFeatures();
assert(m_hasDoneLayout);
assert(m_flushDesc.firstPath == m_ctx->m_pathData.elementsWritten());
assert(m_flushDesc.firstPaint == m_ctx->m_paintData.elementsWritten());
assert(m_flushDesc.firstPaintAux == m_ctx->m_paintAuxData.elementsWritten());
// Wait until here to lay out the gradient texture because the final gradient texture height is
// not decided until after all LogicalFlushes have run layoutResources().
m_gradTextureLayout.inverseHeight = 1.f / m_ctx->m_currentResourceAllocations.gradTextureHeight;
m_gradTextureLayout.complexOffsetY = m_flushDesc.complexGradRowsTop;
// Exact tessSpan/triangleVertex counts aren't known until after their data is written out.
size_t firstTessVertexSpan = m_ctx->m_tessSpanData.elementsWritten();
size_t initialTriangleVertexDataSize = m_ctx->m_triangleVertexData.bytesWritten();
// Metal requires vertex buffers to be 256-byte aligned.
size_t tessAlignmentPadding =
pls::PaddingToAlignUp<pls::kTessVertexBufferAlignmentInElements>(firstTessVertexSpan);
assert(tessAlignmentPadding <= kMaxTessellationAlignmentVertices);
m_ctx->m_tessSpanData.push_back_n(nullptr, tessAlignmentPadding);
m_flushDesc.firstTessVertexSpan = firstTessVertexSpan + tessAlignmentPadding;
assert(m_flushDesc.firstTessVertexSpan == m_ctx->m_tessSpanData.elementsWritten());
// Write out the uniforms for this flush.
m_ctx->m_flushUniformData.emplace_back(m_flushDesc, platformFeatures);
// Write out the simple gradient data.
assert(m_simpleGradients.size() == m_pendingSimpleGradientWrites.size());
if (!m_pendingSimpleGradientWrites.empty())
{
m_ctx->m_simpleColorRampsData.push_back_n(m_pendingSimpleGradientWrites.data(),
m_pendingSimpleGradientWrites.size());
}
// Write out the vertex data for rendering complex gradients.
assert(m_complexGradients.size() == m_pendingComplexColorRampDraws.size());
if (!m_pendingComplexColorRampDraws.empty())
{
// The viewport will start at simpleGradDataHeight when rendering color ramps.
for (uint32_t y = 0; y < m_pendingComplexColorRampDraws.size(); ++y)
{
const PLSGradient* gradient = m_pendingComplexColorRampDraws[y];
const ColorInt* colors = gradient->colors();
const float* stops = gradient->stops();
size_t stopCount = gradient->count();
// Push "GradientSpan" instances that will render each section of the color ramp.
ColorInt lastColor = colors[0];
uint32_t lastXFixed = 0;
// "stop * w + .5" converts a stop position to an x-coordinate in the gradient texture.
// Stops should be aligned (ideally) on pixel centers to prevent bleed.
// Render half-pixel-wide caps at the beginning and end to ensure the boundary pixels
// get filled.
float w = kGradTextureWidth - 1.f;
for (size_t i = 0; i < stopCount; ++i)
{
float x = stops[i] * w + .5f;
uint32_t xFixed = static_cast<uint32_t>(x * (65536.f / kGradTextureWidth));
assert(lastXFixed <= xFixed && xFixed < 65536); // stops[] must be ordered.
m_ctx->m_gradSpanData.set_back(lastXFixed, xFixed, y, lastColor, colors[i]);
lastColor = colors[i];
lastXFixed = xFixed;
}
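// Extend the final color all the way to the right edge of the row.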
m_ctx->m_gradSpanData.set_back(lastXFixed, 65535u, y, lastColor, lastColor);
}
}
// Write a path record for the clearColor paint (used by atomic mode).
// This also allows us to index the storage buffers directly by pathID.
pls::SimplePaintValue clearColorValue;
clearColorValue.color = m_ctx->frameDescriptor().clearColor;
m_ctx->m_pathData.skip_back();
m_ctx->m_paintData.set_back(FillRule::nonZero,
PaintType::solidColor,
clearColorValue,
GradTextureLayout(),
/*clipID =*/0,
/*hasClipRect =*/false,
BlendMode::srcOver);
m_ctx->m_paintAuxData.skip_back();
// Render padding vertices in the tessellation texture.
if (m_flushDesc.tessDataHeight > 0)
{
// Padding at the beginning of the tessellation texture.
pushPaddingVertices(0, pls::kMidpointFanPatchSegmentSpan);
// Padding between patch types in the tessellation texture.
if (m_outerCubicTessVertexIdx > m_midpointFanTessEndLocation)
{
pushPaddingVertices(m_midpointFanTessEndLocation,
m_outerCubicTessVertexIdx - m_midpointFanTessEndLocation);
}
// The final vertex of the final patch of each contour crosses over into the next contour.
// (This is how we wrap around back to the beginning.) Therefore, the final contour of the
// flush needs an out-of-contour vertex to cross into as well, so we emit a padding vertex
// here at the end.
pushPaddingVertices(m_outerCubicTessEndLocation, 1);
}
// Write out all the data for our high level draws, and build up a low-level draw list.
if (m_ctx->frameInterlockMode() == pls::InterlockMode::rasterOrdering)
{
for (const PLSDrawUniquePtr& draw : m_plsDraws)
{
draw->pushToRenderContext(this);
}
}
else
{
assert(m_plsDraws.size() <= kMaxReorderedDrawCount);
// Sort the draw list to optimize batching, since we can only batch non-overlapping draws.
std::vector<int64_t>& indirectDrawList = m_ctx->m_indirectDrawList;
indirectDrawList.resize(m_plsDraws.size());
if (m_ctx->m_intersectionBoard == nullptr)
{
m_ctx->m_intersectionBoard = std::make_unique<IntersectionBoard>();
}
IntersectionBoard* intersectionBoard = m_ctx->m_intersectionBoard.get();
intersectionBoard->resizeAndReset(m_flushDesc.renderTarget->width(),
m_flushDesc.renderTarget->height());
// Build a list of sort keys that determine the final draw order.
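// 64-bit sort key, from high bits to low:
//   [63:48] draw group index (negated for reverse-sorted opaque draws in depthStencil mode)
//   [47:45] draw type
//   [44:26] image texture hash
//   [25:22] blend mode
//   [21:16] draw contents
//   [15:0]  index of the draw in m_plsDraws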
constexpr static int kDrawGroupShift = 48; // Where in the key does the draw group begin?
constexpr static int64_t kDrawGroupMask = 0xffffllu << kDrawGroupShift;
constexpr static int kDrawTypeShift = 45;
constexpr static int64_t kDrawTypeMask RIVE_MAYBE_UNUSED = 7llu << kDrawTypeShift;
constexpr static int kTextureHashShift = 26;
constexpr static int64_t kTextureHashMask = 0x7ffffllu << kTextureHashShift;
constexpr static int kBlendModeShift = 22;
constexpr static int kBlendModeMask = 0xf << kBlendModeShift;
constexpr static int kDrawContentsShift = 16;
constexpr static int64_t kDrawContentsMask = 0x3fllu << kDrawContentsShift;
constexpr static int64_t kDrawIndexMask = 0xffff;
for (size_t i = 0; i < m_plsDraws.size(); ++i)
{
PLSDraw* draw = m_plsDraws[i].get();
int4 drawBounds = simd::load4i(&m_plsDraws[i]->pixelBounds());
// Add one extra pixel of padding to the draw bounds to make absolutely certain we get
// no overlapping pixels, which would break the atomic shader.
const int32_t kMax32i = std::numeric_limits<int32_t>::max();
const int32_t kMin32i = std::numeric_limits<int32_t>::min();
drawBounds = simd::if_then_else(drawBounds != int4{kMin32i, kMin32i, kMax32i, kMax32i},
drawBounds + int4{-1, -1, 1, 1},
drawBounds);
// Our top priority in re-ordering is to group non-overlapping draws together, in order
// to maximize batching while preserving correctness.
int64_t drawGroupIdx = intersectionBoard->addRectangle(drawBounds);
assert(drawGroupIdx > 0);
if (m_flushDesc.interlockMode == pls::InterlockMode::depthStencil && draw->isOpaque())
{
// In depthStencil mode we can reverse-sort opaque paths front to back, draw them
// first, and take advantage of early Z culling.
//
// To keep things simple initially, we don't reverse-sort draws that use clipping.
// (Otherwise if a clip affects both opaque and transparent content, we would have
// to apply it twice.)
bool usesClipping = draw->drawContents() &
(pls::DrawContents::activeClip | pls::DrawContents::clipUpdate);
if (!usesClipping)
{
drawGroupIdx = -drawGroupIdx;
}
}
int64_t key = drawGroupIdx << kDrawGroupShift;
// Within sub-groups of non-overlapping draws, sort similar draw types together.
int64_t drawType = static_cast<int64_t>(draw->type());
assert(drawType <= kDrawTypeMask >> kDrawTypeShift);
key |= drawType << kDrawTypeShift;
// Within sub-groups of matching draw type, sort by texture binding.
int64_t textureHash = draw->imageTexture() != nullptr
? draw->imageTexture()->textureResourceHash() &
(kTextureHashMask >> kTextureHashShift)
: 0;
key |= textureHash << kTextureHashShift;
// If using KHR_blend_equation_advanced, we need a batching barrier between draws with
// different blend modes.
// If not using KHR_blend_equation_advanced, sorting by blend mode may still give us
// better branching on the GPU.
int64_t blendMode = pls::ConvertBlendModeToPLSBlendMode(draw->blendMode());
assert(blendMode <= kBlendModeMask >> kBlendModeShift);
key |= blendMode << kBlendModeShift;
// depthStencil mode draws strokes, fills, and even/odd with different stencil settings.
int64_t drawContents = static_cast<int64_t>(draw->drawContents());
assert(drawContents <= kDrawContentsMask >> kDrawContentsShift);
key |= drawContents << kDrawContentsShift;
// Draw index goes at the bottom of the key so we know which PLSDraw it corresponds to.
assert(i <= kDrawIndexMask);
key |= i;
assert((key & kDrawGroupMask) >> kDrawGroupShift == drawGroupIdx);
assert((key & kDrawTypeMask) >> kDrawTypeShift == drawType);
assert((key & kTextureHashMask) >> kTextureHashShift == textureHash);
assert((key & kBlendModeMask) >> kBlendModeShift == blendMode);
assert((key & kDrawContentsMask) >> kDrawContentsShift == drawContents);
assert((key & kDrawIndexMask) == i);
indirectDrawList[i] = key;
}
// Re-order the draws!!
std::sort(indirectDrawList.begin(), indirectDrawList.end());
// Atomic mode sometimes needs to initialize PLS with a draw when the backend can't do it
// with typical clear/load APIs.
if (m_ctx->frameInterlockMode() == pls::InterlockMode::atomics &&
platformFeatures.atomicPLSMustBeInitializedAsDraw)
{
m_drawList.emplace_back(m_ctx->perFrameAllocator(),
DrawType::plsAtomicInitialize,
nullptr,
1,
0);
pushBarrier();
}
// Draws with the same drawGroupIdx don't overlap, but once we cross into a new draw group,
// we need to insert a barrier because the new group may overlap the previous one.
int64_t needsBarrierMask = kDrawGroupMask;
if (m_flushDesc.interlockMode == pls::InterlockMode::depthStencil)
{
// depthStencil mode also draws clips, strokes, fills, and even/odd with different
// stencil settings, so these also need a barrier.
needsBarrierMask |= kDrawContentsMask;
if (platformFeatures.supportsKHRBlendEquations)
{
// If using KHR_blend_equation_advanced, we also need a barrier between blend modes
// in order to change the blend equation.
needsBarrierMask |= kBlendModeMask;
}
}
// Write out the draw data from the sorted draw list, and build up a condensed/batched list
// of low-level draws.
int64_t priorKey = !indirectDrawList.empty() ? indirectDrawList[0] : 0;
for (int64_t key : indirectDrawList)
{
if ((priorKey & needsBarrierMask) != (key & needsBarrierMask))
{
pushBarrier();
}
// We negate drawGroupIdx on opaque paths in order to draw them first and in reverse
// order, but their z index should still remain positive.
m_currentZIndex = math::lossless_numeric_cast<uint32_t>(
abs(key >> static_cast<int64_t>(kDrawGroupShift)));
m_plsDraws[key & kDrawIndexMask]->pushToRenderContext(this);
priorKey = key;
}
// Atomic mode needs one more draw to resolve all the pixels.
if (m_ctx->frameInterlockMode() == pls::InterlockMode::atomics)
{
pushBarrier();
m_drawList.emplace_back(m_ctx->perFrameAllocator(),
DrawType::plsAtomicResolve,
nullptr,
1,
0);
m_drawList.tail().shaderFeatures = m_combinedShaderFeatures;
}
}
// Pad our buffers to 256-byte alignment.
m_ctx->m_pathData.push_back_n(nullptr, m_pathPaddingCount);
m_ctx->m_paintData.push_back_n(nullptr, m_paintPaddingCount);
m_ctx->m_paintAuxData.push_back_n(nullptr, m_paintAuxPaddingCount);
m_ctx->m_contourData.push_back_n(nullptr, m_contourPaddingCount);
m_ctx->m_gradSpanData.push_back_n(nullptr, m_gradSpanPaddingCount);
assert(m_ctx->m_pathData.elementsWritten() ==
m_flushDesc.firstPath + m_resourceCounts.pathCount + m_pathPaddingCount);
assert(m_ctx->m_paintData.elementsWritten() ==
m_flushDesc.firstPaint + m_resourceCounts.pathCount + m_paintPaddingCount);
assert(m_ctx->m_paintAuxData.elementsWritten() ==
m_flushDesc.firstPaintAux + m_resourceCounts.pathCount + m_paintAuxPaddingCount);
assert(m_ctx->m_contourData.elementsWritten() ==
m_flushDesc.firstContour + m_resourceCounts.contourCount + m_contourPaddingCount);
assert(m_ctx->m_gradSpanData.elementsWritten() ==
m_flushDesc.firstComplexGradSpan + m_resourceCounts.complexGradientSpanCount +
m_gradSpanPaddingCount);
assert(m_pathTessLocation == m_expectedPathTessLocationAtEndOfPath);
assert(m_pathMirroredTessLocation == m_expectedPathMirroredTessLocationAtEndOfPath);
assert(m_midpointFanTessVertexIdx == m_midpointFanTessEndLocation);
assert(m_outerCubicTessVertexIdx == m_outerCubicTessEndLocation);
// Update the flush descriptor's data counts that aren't known until after the data is written out.
m_flushDesc.tessVertexSpanCount = math::lossless_numeric_cast<uint32_t>(
m_ctx->m_tessSpanData.elementsWritten() - m_flushDesc.firstTessVertexSpan);
m_flushDesc.hasTriangleVertices =
m_ctx->m_triangleVertexData.bytesWritten() != initialTriangleVertexDataSize;
m_flushDesc.drawList = &m_drawList;
m_flushDesc.combinedShaderFeatures = m_combinedShaderFeatures;
}
void PLSRenderContext::setResourceSizes(ResourceAllocationCounts allocs, bool forceRealloc)
{
#if 0
class Logger
{
public:
void logSize(const char* name, size_t oldSize, size_t newSize, size_t newSizeInBytes)
{
m_totalSizeInBytes += newSizeInBytes;
if (oldSize == newSize)
{
return;
}
if (!m_hasChanged)
{
printf("PLSRenderContext::setResourceSizes():\n");
m_hasChanged = true;
}
printf(" resize %s: %zu -> %zu (%zu KiB)\n",
name,
oldSize,
newSize,
newSizeInBytes >> 10);
}
~Logger()
{
if (!m_hasChanged)
{
return;
}
printf(" TOTAL GPU resource usage: %zu KiB\n", m_totalSizeInBytes >> 10);
}
private:
size_t m_totalSizeInBytes = 0;
bool m_hasChanged = false;
} logger;
#define LOG_BUFFER_RING_SIZE(NAME, ITEM_SIZE_IN_BYTES) \
logger.logSize(#NAME, \
m_currentResourceAllocations.NAME, \
allocs.NAME, \
allocs.NAME* ITEM_SIZE_IN_BYTES* pls::kBufferRingSize)
#define LOG_TEXTURE_HEIGHT(NAME, BYTES_PER_ROW) \
logger.logSize(#NAME, \
m_currentResourceAllocations.NAME, \
allocs.NAME, \
allocs.NAME* BYTES_PER_ROW)
#else
#define LOG_BUFFER_RING_SIZE(NAME, ITEM_SIZE_IN_BYTES)
#define LOG_TEXTURE_HEIGHT(NAME, BYTES_PER_ROW)
#endif
LOG_BUFFER_RING_SIZE(flushUniformBufferCount, sizeof(pls::FlushUniforms));
if (allocs.flushUniformBufferCount != m_currentResourceAllocations.flushUniformBufferCount ||
forceRealloc)
{
m_impl->resizeFlushUniformBuffer(allocs.flushUniformBufferCount *
sizeof(pls::FlushUniforms));
}
LOG_BUFFER_RING_SIZE(imageDrawUniformBufferCount, sizeof(pls::ImageDrawUniforms));
if (allocs.imageDrawUniformBufferCount !=
m_currentResourceAllocations.imageDrawUniformBufferCount ||
forceRealloc)
{
m_impl->resizeImageDrawUniformBuffer(allocs.imageDrawUniformBufferCount *
sizeof(pls::ImageDrawUniforms));
}
LOG_BUFFER_RING_SIZE(pathBufferCount, sizeof(pls::PathData));
if (allocs.pathBufferCount != m_currentResourceAllocations.pathBufferCount || forceRealloc)
{
m_impl->resizePathBuffer(allocs.pathBufferCount * sizeof(pls::PathData),
pls::PathData::kBufferStructure);
}
LOG_BUFFER_RING_SIZE(paintBufferCount, sizeof(pls::PaintData));
if (allocs.paintBufferCount != m_currentResourceAllocations.paintBufferCount || forceRealloc)
{
m_impl->resizePaintBuffer(allocs.paintBufferCount * sizeof(pls::PaintData),
pls::PaintData::kBufferStructure);
}
LOG_BUFFER_RING_SIZE(paintAuxBufferCount, sizeof(pls::PaintAuxData));
if (allocs.paintAuxBufferCount != m_currentResourceAllocations.paintAuxBufferCount ||
forceRealloc)
{
m_impl->resizePaintAuxBuffer(allocs.paintAuxBufferCount * sizeof(pls::PaintAuxData),
pls::PaintAuxData::kBufferStructure);
}
LOG_BUFFER_RING_SIZE(contourBufferCount, sizeof(pls::ContourData));
if (allocs.contourBufferCount != m_currentResourceAllocations.contourBufferCount ||
forceRealloc)
{
m_impl->resizeContourBuffer(allocs.contourBufferCount * sizeof(pls::ContourData),
pls::ContourData::kBufferStructure);
}
LOG_BUFFER_RING_SIZE(simpleGradientBufferCount, sizeof(pls::TwoTexelRamp));
if (allocs.simpleGradientBufferCount !=
m_currentResourceAllocations.simpleGradientBufferCount ||
forceRealloc)
{
m_impl->resizeSimpleColorRampsBuffer(allocs.simpleGradientBufferCount *
sizeof(pls::TwoTexelRamp));
}
LOG_BUFFER_RING_SIZE(complexGradSpanBufferCount, sizeof(pls::GradientSpan));
if (allocs.complexGradSpanBufferCount !=
m_currentResourceAllocations.complexGradSpanBufferCount ||
forceRealloc)
{
m_impl->resizeGradSpanBuffer(allocs.complexGradSpanBufferCount * sizeof(pls::GradientSpan));
}
LOG_BUFFER_RING_SIZE(tessSpanBufferCount, sizeof(pls::TessVertexSpan));
if (allocs.tessSpanBufferCount != m_currentResourceAllocations.tessSpanBufferCount ||
forceRealloc)
{
m_impl->resizeTessVertexSpanBuffer(allocs.tessSpanBufferCount *
sizeof(pls::TessVertexSpan));
}
LOG_BUFFER_RING_SIZE(triangleVertexBufferCount, sizeof(pls::TriangleVertex));
if (allocs.triangleVertexBufferCount !=
m_currentResourceAllocations.triangleVertexBufferCount ||
forceRealloc)
{
m_impl->resizeTriangleVertexBuffer(allocs.triangleVertexBufferCount *
sizeof(pls::TriangleVertex));
}
allocs.gradTextureHeight = std::min<size_t>(allocs.gradTextureHeight, kMaxTextureHeight);
LOG_TEXTURE_HEIGHT(gradTextureHeight, pls::kGradTextureWidth * 4);
if (allocs.gradTextureHeight != m_currentResourceAllocations.gradTextureHeight || forceRealloc)
{
m_impl->resizeGradientTexture(
pls::kGradTextureWidth,
math::lossless_numeric_cast<uint32_t>(allocs.gradTextureHeight));
}
allocs.tessTextureHeight = std::min<size_t>(allocs.tessTextureHeight, kMaxTextureHeight);
LOG_TEXTURE_HEIGHT(tessTextureHeight, pls::kTessTextureWidth * 4 * 4);
if (allocs.tessTextureHeight != m_currentResourceAllocations.tessTextureHeight || forceRealloc)
{
m_impl->resizeTessellationTexture(
pls::kTessTextureWidth,
math::lossless_numeric_cast<uint32_t>(allocs.tessTextureHeight));
}
m_currentResourceAllocations = allocs;
}
void PLSRenderContext::mapResourceBuffers(const ResourceAllocationCounts& mapCounts)
{
m_impl->prepareToMapBuffers();
if (mapCounts.flushUniformBufferCount > 0)
{
m_flushUniformData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapFlushUniformBuffer,
mapCounts.flushUniformBufferCount);
}
assert(m_flushUniformData.hasRoomFor(mapCounts.flushUniformBufferCount));
if (mapCounts.imageDrawUniformBufferCount > 0)
{
m_imageDrawUniformData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapImageDrawUniformBuffer,
mapCounts.imageDrawUniformBufferCount);
}
assert(m_imageDrawUniformData.hasRoomFor(mapCounts.imageDrawUniformBufferCount));
if (mapCounts.pathBufferCount > 0)
{
m_pathData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapPathBuffer,
mapCounts.pathBufferCount);
}
assert(m_pathData.hasRoomFor(mapCounts.pathBufferCount));
if (mapCounts.paintBufferCount > 0)
{
m_paintData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapPaintBuffer,
mapCounts.paintBufferCount);
}
assert(m_paintData.hasRoomFor(mapCounts.paintBufferCount));
if (mapCounts.paintAuxBufferCount > 0)
{
m_paintAuxData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapPaintAuxBuffer,
mapCounts.paintAuxBufferCount);
}
assert(m_paintAuxData.hasRoomFor(mapCounts.paintAuxBufferCount));
if (mapCounts.contourBufferCount > 0)
{
m_contourData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapContourBuffer,
mapCounts.contourBufferCount);
}
assert(m_contourData.hasRoomFor(mapCounts.contourBufferCount));
if (mapCounts.simpleGradientBufferCount > 0)
{
m_simpleColorRampsData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapSimpleColorRampsBuffer,
mapCounts.simpleGradientBufferCount);
}
assert(m_simpleColorRampsData.hasRoomFor(mapCounts.simpleGradientBufferCount));
if (mapCounts.complexGradSpanBufferCount > 0)
{
m_gradSpanData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapGradSpanBuffer,
mapCounts.complexGradSpanBufferCount);
}
assert(m_gradSpanData.hasRoomFor(mapCounts.complexGradSpanBufferCount));
if (mapCounts.tessSpanBufferCount > 0)
{
m_tessSpanData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapTessVertexSpanBuffer,
mapCounts.tessSpanBufferCount);
}
assert(m_tessSpanData.hasRoomFor(mapCounts.tessSpanBufferCount));
if (mapCounts.triangleVertexBufferCount > 0)
{
m_triangleVertexData.mapElements(m_impl.get(),
&PLSRenderContextImpl::mapTriangleVertexBuffer,
mapCounts.triangleVertexBufferCount);
}
assert(m_triangleVertexData.hasRoomFor(mapCounts.triangleVertexBufferCount));
}
void PLSRenderContext::unmapResourceBuffers()
{
if (m_flushUniformData)
{
m_impl->unmapFlushUniformBuffer();
m_flushUniformData.reset();
}
if (m_imageDrawUniformData)
{
m_impl->unmapImageDrawUniformBuffer();
m_imageDrawUniformData.reset();
}
if (m_pathData)
{
m_impl->unmapPathBuffer();
m_pathData.reset();
}
if (m_paintData)
{
m_impl->unmapPaintBuffer();
m_paintData.reset();
}
if (m_paintAuxData)
{
m_impl->unmapPaintAuxBuffer();
m_paintAuxData.reset();
}
if (m_contourData)
{
m_impl->unmapContourBuffer();
m_contourData.reset();
}
if (m_simpleColorRampsData)
{
m_impl->unmapSimpleColorRampsBuffer();
m_simpleColorRampsData.reset();
}
if (m_gradSpanData)
{
m_impl->unmapGradSpanBuffer();
m_gradSpanData.reset();
}
if (m_tessSpanData)
{
m_impl->unmapTessVertexSpanBuffer();
m_tessSpanData.reset();
}
if (m_triangleVertexData)
{
m_impl->unmapTriangleVertexBuffer();
m_triangleVertexData.reset();
}
}
void PLSRenderContext::LogicalFlush::pushPaddingVertices(uint32_t tessLocation, uint32_t count)
{
assert(m_hasDoneLayout);
assert(count > 0);
constexpr static Vec2D kEmptyCubic[4]{};
// This is guaranteed to not collide with a neighboring contour ID.
constexpr static uint32_t kInvalidContourID = 0;
assert(m_pathTessLocation == m_expectedPathTessLocationAtEndOfPath);
assert(m_pathMirroredTessLocation == m_expectedPathMirroredTessLocationAtEndOfPath);
m_pathTessLocation = tessLocation;
RIVE_DEBUG_CODE(m_expectedPathTessLocationAtEndOfPath = m_pathTessLocation + count;)
assert(m_expectedPathTessLocationAtEndOfPath <= kMaxTessellationVertexCount);
pushTessellationSpans(kEmptyCubic, {0, 0}, count, 0, 0, 1, kInvalidContourID);
assert(m_pathTessLocation == m_expectedPathTessLocationAtEndOfPath);
}
void PLSRenderContext::LogicalFlush::pushPath(PLSPathDraw* draw,
pls::PatchType patchType,
uint32_t tessVertexCount)
{
assert(m_hasDoneLayout);
assert(m_pathTessLocation == m_expectedPathTessLocationAtEndOfPath);
assert(m_pathMirroredTessLocation == m_expectedPathMirroredTessLocationAtEndOfPath);
m_currentPathIsStroked = draw->strokeRadius() != 0;
m_currentPathContourDirections = draw->contourDirections();
++m_currentPathID;
assert(0 < m_currentPathID && m_currentPathID <= m_ctx->m_maxPathID);
m_ctx->m_pathData.set_back(draw->matrix(), draw->strokeRadius(), m_currentZIndex);
m_ctx->m_paintData.set_back(draw->fillRule(),
draw->paintType(),
draw->simplePaintValue(),
m_gradTextureLayout,
draw->clipID(),
draw->hasClipRect(),
draw->blendMode());
m_ctx->m_paintAuxData.set_back(draw->matrix(),
draw->paintType(),
draw->simplePaintValue(),
draw->gradient(),
draw->imageTexture(),
draw->clipRectInverseMatrix(),
m_flushDesc.renderTarget,
m_ctx->platformFeatures());
assert(m_flushDesc.firstPath + m_currentPathID + 1 == m_ctx->m_pathData.elementsWritten());
assert(m_flushDesc.firstPaint + m_currentPathID + 1 == m_ctx->m_paintData.elementsWritten());
assert(m_flushDesc.firstPaintAux + m_currentPathID + 1 ==
m_ctx->m_paintAuxData.elementsWritten());
pls::DrawType drawType;
uint32_t tessLocation;
if (patchType == PatchType::midpointFan)
{
drawType = DrawType::midpointFanPatches;
tessLocation = m_midpointFanTessVertexIdx;
m_midpointFanTessVertexIdx += tessVertexCount;
}
else
{
drawType = DrawType::outerCurvePatches;
tessLocation = m_outerCubicTessVertexIdx;
m_outerCubicTessVertexIdx += tessVertexCount;
}
RIVE_DEBUG_CODE(m_expectedPathTessLocationAtEndOfPath = tessLocation + tessVertexCount);
RIVE_DEBUG_CODE(m_expectedPathMirroredTessLocationAtEndOfPath = tessLocation);
assert(m_expectedPathTessLocationAtEndOfPath <= kMaxTessellationVertexCount);
uint32_t patchSize = PatchSegmentSpan(drawType);
uint32_t baseInstance = math::lossless_numeric_cast<uint32_t>(tessLocation / patchSize);
assert(baseInstance * patchSize == tessLocation); // flush() is responsible for alignment.
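// Position the forward and mirrored tessellation writers according to the path's contour
// directions: reverseAndForward splits the allocation in half (mirrored vertices fill the first
// half back-to-front, forward vertices fill the second half), forward writes from the start, and
// reverse writes backward from the end.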
if (m_currentPathContourDirections == pls::ContourDirections::reverseAndForward)
{
assert(tessVertexCount % 2 == 0);
m_pathTessLocation = m_pathMirroredTessLocation = tessLocation + tessVertexCount / 2;
}
else if (m_currentPathContourDirections == pls::ContourDirections::forward)
{
m_pathTessLocation = m_pathMirroredTessLocation = tessLocation;
}
else
{
assert(m_currentPathContourDirections == pls::ContourDirections::reverse);
m_pathTessLocation = m_pathMirroredTessLocation = tessLocation + tessVertexCount;
}
uint32_t instanceCount = tessVertexCount / patchSize;
assert(instanceCount * patchSize == tessVertexCount); // flush() is responsible for alignment.
pushPathDraw(draw, drawType, instanceCount, baseInstance);
}
void PLSRenderContext::LogicalFlush::pushContour(Vec2D midpoint,
bool closed,
uint32_t paddingVertexCount)
{
assert(m_hasDoneLayout);
assert(m_ctx->m_pathData.bytesWritten() > 0);
assert(m_currentPathIsStroked || closed);
assert(m_currentPathID != 0); // pathID can't be zero.
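// Strokes don't use the fan midpoint; its x component is repurposed as a flag that tells the
// shader whether the contour is closed.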
if (m_currentPathIsStroked)
{
midpoint.x = closed ? 1 : 0;
}
// If the contour is closed, the shader needs a vertex to wrap back around to at the end of it.
uint32_t vertexIndex0 = m_currentPathContourDirections & pls::ContourDirections::forward
? m_pathTessLocation
: m_pathMirroredTessLocation - 1;
m_ctx->m_contourData.emplace_back(midpoint, m_currentPathID, vertexIndex0);
++m_currentContourID;
assert(0 < m_currentContourID && m_currentContourID <= pls::kMaxContourID);
assert(m_flushDesc.firstContour + m_currentContourID == m_ctx->m_contourData.elementsWritten());
// The first curve of the contour will be pre-padded with 'paddingVertexCount' tessellation
// vertices, all colocated at T=0. The caller must use this argument to align the end of the
// contour on a boundary of the patch size. (See pls::PaddingToAlignUp().)
m_currentContourPaddingVertexCount = paddingVertexCount;
}
void PLSRenderContext::LogicalFlush::pushCubic(const Vec2D pts[4],
Vec2D joinTangent,
uint32_t additionalContourFlags,
uint32_t parametricSegmentCount,
uint32_t polarSegmentCount,
uint32_t joinSegmentCount)
{
assert(m_hasDoneLayout);
assert(0 <= parametricSegmentCount && parametricSegmentCount <= kMaxParametricSegments);
assert(0 <= polarSegmentCount && polarSegmentCount <= kMaxPolarSegments);
assert(joinSegmentCount > 0);
assert(m_currentContourID != 0); // contourID can't be zero.
// Polar and parametric segments share the same beginning and ending vertices, so the merged
// *vertex* count is equal to the sum of polar and parametric *segment* counts.
uint32_t curveMergedVertexCount = parametricSegmentCount + polarSegmentCount;
// -1 because the curve and join share an ending/beginning vertex.
uint32_t totalVertexCount =
m_currentContourPaddingVertexCount + curveMergedVertexCount + joinSegmentCount - 1;
// Only the first curve of a contour gets padding vertices.
m_currentContourPaddingVertexCount = 0;
if (m_currentPathContourDirections == pls::ContourDirections::reverseAndForward)
{
pushMirroredAndForwardTessellationSpans(pts,
joinTangent,
totalVertexCount,
parametricSegmentCount,
polarSegmentCount,
joinSegmentCount,
m_currentContourID | additionalContourFlags);
}
else if (m_currentPathContourDirections == pls::ContourDirections::forward)
{
pushTessellationSpans(pts,
joinTangent,
totalVertexCount,
parametricSegmentCount,
polarSegmentCount,
joinSegmentCount,
m_currentContourID | additionalContourFlags);
}
else
{
assert(m_currentPathContourDirections == pls::ContourDirections::reverse);
pushMirroredTessellationSpans(pts,
joinTangent,
totalVertexCount,
parametricSegmentCount,
polarSegmentCount,
joinSegmentCount,
m_currentContourID | additionalContourFlags);
}
RIVE_DEBUG_CODE(++m_pathCurveCount;)
}
RIVE_ALWAYS_INLINE void PLSRenderContext::LogicalFlush::pushTessellationSpans(
const Vec2D pts[4],
Vec2D joinTangent,
uint32_t totalVertexCount,
uint32_t parametricSegmentCount,
uint32_t polarSegmentCount,
uint32_t joinSegmentCount,
uint32_t contourIDWithFlags)
{
assert(m_hasDoneLayout);
assert(totalVertexCount > 0);
uint32_t y = m_pathTessLocation / kTessTextureWidth;
int32_t x0 = m_pathTessLocation % kTessTextureWidth;
int32_t x1 = x0 + totalVertexCount;
for (;;)
{
m_ctx->m_tessSpanData.set_back(pts,
joinTangent,
static_cast<float>(y),
x0,
x1,
parametricSegmentCount,
polarSegmentCount,
joinSegmentCount,
contourIDWithFlags);
if (x1 > static_cast<int32_t>(kTessTextureWidth))
{
// The span was too long to fit on the current line. Wrap and draw it again, this
// time behind the left edge of the texture so we capture what got clipped off last
// time.
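// (Illustrative numbers only: on a 2048-wide tess texture, a span at x0 = 2040 spanning 20
// vertices has x1 = 2060; re-emitting it at x0 = -8, x1 = 12 on the next row writes the 12
// vertices that spilled past the right edge at the start of that row.)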
++y;
x0 -= kTessTextureWidth;
x1 -= kTessTextureWidth;
continue;
}
break;
}
assert(y == (m_pathTessLocation + totalVertexCount - 1) / kTessTextureWidth);
m_pathTessLocation += totalVertexCount;
assert(m_pathTessLocation <= m_expectedPathTessLocationAtEndOfPath);
}
RIVE_ALWAYS_INLINE void PLSRenderContext::LogicalFlush::pushMirroredTessellationSpans(
const Vec2D pts[4],
Vec2D joinTangent,
uint32_t totalVertexCount,
uint32_t parametricSegmentCount,
uint32_t polarSegmentCount,
uint32_t joinSegmentCount,
uint32_t contourIDWithFlags)
{
assert(m_hasDoneLayout);
assert(totalVertexCount > 0);
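// Mirrored spans run right-to-left: this span ends just before m_pathMirroredTessLocation and
// counts down, so reflectionX1 < reflectionX0 and m_pathMirroredTessLocation decreases as spans
// are pushed.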
uint32_t reflectionY = (m_pathMirroredTessLocation - 1) / kTessTextureWidth;
int32_t reflectionX0 = (m_pathMirroredTessLocation - 1) % kTessTextureWidth + 1;
int32_t reflectionX1 = reflectionX0 - totalVertexCount;
for (;;)
{
m_ctx->m_tessSpanData.set_back(pts,
joinTangent,
static_cast<float>(reflectionY),
reflectionX0,
reflectionX1,
parametricSegmentCount,
polarSegmentCount,
joinSegmentCount,
contourIDWithFlags);
if (reflectionX1 < 0)
{
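// The mirrored span ran off the left edge of the current line. Wrap and draw it again, this
// time beyond the right edge of the texture on the row above, so we capture what got clipped
// off last time.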
--reflectionY;
reflectionX0 += kTessTextureWidth;
reflectionX1 += kTessTextureWidth;
continue;
}
break;
}
m_pathMirroredTessLocation -= totalVertexCount;
assert(m_pathMirroredTessLocation >= m_expectedPathMirroredTessLocationAtEndOfPath);
}
RIVE_ALWAYS_INLINE void PLSRenderContext::LogicalFlush::pushMirroredAndForwardTessellationSpans(
const Vec2D pts[4],
Vec2D joinTangent,
uint32_t totalVertexCount,
uint32_t parametricSegmentCount,
uint32_t polarSegmentCount,
uint32_t joinSegmentCount,
uint32_t contourIDWithFlags)
{
assert(m_hasDoneLayout);
assert(totalVertexCount > 0);
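// Emit the span twice in lockstep: a forward copy starting at m_pathTessLocation and a mirrored
// (right-to-left) copy ending just before m_pathMirroredTessLocation.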
int32_t y = m_pathTessLocation / kTessTextureWidth;
int32_t x0 = m_pathTessLocation % kTessTextureWidth;
int32_t x1 = x0 + totalVertexCount;
uint32_t reflectionY = (m_pathMirroredTessLocation - 1) / kTessTextureWidth;
int32_t reflectionX0 = (m_pathMirroredTessLocation - 1) % kTessTextureWidth + 1;
int32_t reflectionX1 = reflectionX0 - totalVertexCount;
for (;;)
{
m_ctx->m_tessSpanData.set_back(pts,
joinTangent,
static_cast<float>(y),
x0,
x1,
static_cast<float>(reflectionY),
reflectionX0,
reflectionX1,
parametricSegmentCount,
polarSegmentCount,
joinSegmentCount,
contourIDWithFlags);
if (x1 > static_cast<int32_t>(kTessTextureWidth) || reflectionX1 < 0)
{
// Either the span or its reflection was too long to fit on the current line. Wrap and
// draw both of them again, this time beyond the opposite edge of the texture so we
// capture what got clipped off last time.
++y;
x0 -= kTessTextureWidth;
x1 -= kTessTextureWidth;
--reflectionY;
reflectionX0 += kTessTextureWidth;
reflectionX1 += kTessTextureWidth;
continue;
}
break;
}
m_pathTessLocation += totalVertexCount;
assert(m_pathTessLocation <= m_expectedPathTessLocationAtEndOfPath);
m_pathMirroredTessLocation -= totalVertexCount;
assert(m_pathMirroredTessLocation >= m_expectedPathMirroredTessLocationAtEndOfPath);
}
void PLSRenderContext::LogicalFlush::pushInteriorTriangulation(InteriorTriangulationDraw* draw)
{
assert(m_hasDoneLayout);
assert(m_ctx->m_triangleVertexData.hasRoomFor(draw->triangulator()->maxVertexCount()));
uint32_t baseVertex =
math::lossless_numeric_cast<uint32_t>(m_ctx->m_triangleVertexData.elementsWritten());
size_t actualVertexCount =
draw->triangulator()->polysToTriangles(&m_ctx->m_triangleVertexData, m_currentPathID);
assert(actualVertexCount <= draw->triangulator()->maxVertexCount());
DrawBatch& batch = pushPathDraw(draw,
DrawType::interiorTriangulation,
math::lossless_numeric_cast<uint32_t>(actualVertexCount),
baseVertex);
// Interior triangulations are allowed to disable raster ordering since they are guaranteed to
// not overlap.
batch.needsBarrier = true;
}
void PLSRenderContext::LogicalFlush::pushImageRect(ImageRectDraw* draw)
{
assert(m_hasDoneLayout);
// When the frame supports image paints for paths, the client should use pushPath() with an
// image paint instead of calling this method.
assert(!m_ctx->frameSupportsImagePaintForPaths());
size_t imageDrawDataOffset = m_ctx->m_imageDrawUniformData.bytesWritten();
m_ctx->m_imageDrawUniformData.emplace_back(draw->matrix(),
draw->opacity(),
draw->clipRectInverseMatrix(),
draw->clipID(),
draw->blendMode(),
m_currentZIndex);
DrawBatch& batch = pushDraw(draw, DrawType::imageRect, PaintType::image, 1, 0);
batch.imageDrawDataOffset = math::lossless_numeric_cast<uint32_t>(imageDrawDataOffset);
}
void PLSRenderContext::LogicalFlush::pushImageMesh(ImageMeshDraw* draw)
{
assert(m_hasDoneLayout);
size_t imageDrawDataOffset = m_ctx->m_imageDrawUniformData.bytesWritten();
m_ctx->m_imageDrawUniformData.emplace_back(draw->matrix(),
draw->opacity(),
draw->clipRectInverseMatrix(),
draw->clipID(),
draw->blendMode(),
m_currentZIndex);
DrawBatch& batch = pushDraw(draw, DrawType::imageMesh, PaintType::image, draw->indexCount(), 0);
batch.vertexBuffer = draw->vertexBuffer();
batch.uvBuffer = draw->uvBuffer();
batch.indexBuffer = draw->indexBuffer();
batch.imageDrawDataOffset = math::lossless_numeric_cast<uint32_t>(imageDrawDataOffset);
}
void PLSRenderContext::LogicalFlush::pushStencilClipReset(StencilClipReset* draw)
{
assert(m_hasDoneLayout);
uint32_t baseVertex =
math::lossless_numeric_cast<uint32_t>(m_ctx->m_triangleVertexData.elementsWritten());
auto [L, T, R, B] = AABB(getClipInfo(draw->previousClipID()).contentBounds);
uint32_t Z = m_currentZIndex;
assert(AABB(L, T, R, B).round() == draw->pixelBounds());
assert(draw->resourceCounts().maxTriangleVertexCount == 6);
assert(m_ctx->m_triangleVertexData.hasRoomFor(6));
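// Cover the clip's content bounds with two triangles: {L,B  L,T  R,B} and {R,B  L,T  R,T}.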
m_ctx->m_triangleVertexData.emplace_back(Vec2D{L, B}, 0, Z);
m_ctx->m_triangleVertexData.emplace_back(Vec2D{L, T}, 0, Z);
m_ctx->m_triangleVertexData.emplace_back(Vec2D{R, B}, 0, Z);
m_ctx->m_triangleVertexData.emplace_back(Vec2D{R, B}, 0, Z);
m_ctx->m_triangleVertexData.emplace_back(Vec2D{L, T}, 0, Z);
m_ctx->m_triangleVertexData.emplace_back(Vec2D{R, T}, 0, Z);
pushDraw(draw, DrawType::stencilClipReset, PaintType::clipUpdate, 6, baseVertex);
}
void PLSRenderContext::LogicalFlush::pushBarrier()
{
assert(m_hasDoneLayout);
assert(m_flushDesc.interlockMode != pls::InterlockMode::rasterOrdering);
if (!m_drawList.empty())
{
m_drawList.tail().needsBarrier = true;
}
}
pls::DrawBatch& PLSRenderContext::LogicalFlush::pushPathDraw(PLSPathDraw* draw,
DrawType drawType,
uint32_t vertexCount,
uint32_t baseVertex)
{
assert(m_hasDoneLayout);
DrawBatch& batch = pushDraw(draw, drawType, draw->paintType(), vertexCount, baseVertex);
auto pathShaderFeatures = pls::ShaderFeatures::NONE;
if (draw->fillRule() == FillRule::evenOdd)
{
pathShaderFeatures |= ShaderFeatures::ENABLE_EVEN_ODD;
}
if (draw->paintType() == PaintType::clipUpdate && draw->simplePaintValue().outerClipID != 0)
{
pathShaderFeatures |= ShaderFeatures::ENABLE_NESTED_CLIPPING;
}
batch.shaderFeatures |= pathShaderFeatures & m_ctx->m_frameShaderFeaturesMask;
m_combinedShaderFeatures |= batch.shaderFeatures;
assert((batch.shaderFeatures &
pls::ShaderFeaturesMaskFor(drawType, m_ctx->frameInterlockMode())) ==
batch.shaderFeatures);
return batch;
}
RIVE_ALWAYS_INLINE static bool can_combine_draw_images(const PLSTexture* currentDrawTexture,
const PLSTexture* nextDrawTexture)
{
if (currentDrawTexture == nullptr || nextDrawTexture == nullptr)
{
// We can always combine two draws if one or both do not use an image paint.
return true;
}
// Since the image paint's texture must be bound to a specific slot, we can't combine draws that
// use different textures.
return currentDrawTexture == nextDrawTexture;
}
pls::DrawBatch& PLSRenderContext::LogicalFlush::pushDraw(PLSDraw* draw,
DrawType drawType,
pls::PaintType paintType,
uint32_t elementCount,
uint32_t baseElement)
{
assert(m_hasDoneLayout);
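// Decide whether this draw can be merged into the current tail batch or needs a batch of its own.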
bool needsNewBatch;
switch (drawType)
{
case DrawType::midpointFanPatches:
case DrawType::outerCurvePatches:
case DrawType::plsAtomicInitialize:
case DrawType::plsAtomicResolve:
case DrawType::stencilClipReset:
needsNewBatch =
m_drawList.empty() || m_drawList.tail().drawType != drawType ||
m_drawList.tail().needsBarrier ||
!can_combine_draw_images(m_drawList.tail().imageTexture, draw->imageTexture());
break;
case DrawType::interiorTriangulation:
case DrawType::imageRect:
case DrawType::imageMesh:
// We can't combine interior triangulations or image draws yet.
needsNewBatch = true;
break;
}
DrawBatch& batch = needsNewBatch ? m_drawList.emplace_back(m_ctx->perFrameAllocator(),
drawType,
draw,
elementCount,
baseElement)
: m_drawList.tail();
if (!needsNewBatch)
{
assert(batch.drawType == drawType);
assert(can_combine_draw_images(batch.imageTexture, draw->imageTexture()));
assert(!batch.needsBarrier);
if (m_flushDesc.interlockMode == pls::InterlockMode::depthStencil)
{
// depthStencil can't mix drawContents in a batch.
assert(batch.drawContents == draw->drawContents());
assert((batch.shaderFeatures & pls::ShaderFeatures::ENABLE_ADVANCED_BLEND) ==
(draw->blendMode() != BlendMode::srcOver));
// If using KHR_blend_equation_advanced, we can't mix blend modes in a batch.
assert(!m_ctx->platformFeatures().supportsKHRBlendEquations ||
batch.internalDrawList->blendMode() == draw->blendMode());
}
assert(batch.baseElement + batch.elementCount == baseElement);
draw->setBatchInternalNeighbor(batch.internalDrawList);
batch.internalDrawList = draw;
batch.elementCount += elementCount;
}
if (paintType == PaintType::image)
{
assert(draw->imageTexture() != nullptr);
if (batch.imageTexture == nullptr)
{
batch.imageTexture = draw->imageTexture();
}
assert(batch.imageTexture == draw->imageTexture());
}
auto shaderFeatures = ShaderFeatures::NONE;
if (draw->clipID() != 0)
{
shaderFeatures |= ShaderFeatures::ENABLE_CLIPPING;
}
if (draw->hasClipRect() && paintType != PaintType::clipUpdate)
{
shaderFeatures |= ShaderFeatures::ENABLE_CLIP_RECT;
}
if (paintType != PaintType::clipUpdate)
{
switch (draw->blendMode())
{
case BlendMode::hue:
case BlendMode::saturation:
case BlendMode::color:
case BlendMode::luminosity:
shaderFeatures |= ShaderFeatures::ENABLE_HSL_BLEND_MODES;
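// HSL blend modes also require the advanced blend feature, so fall through and enable it too.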
[[fallthrough]];
case BlendMode::screen:
case BlendMode::overlay:
case BlendMode::darken:
case BlendMode::lighten:
case BlendMode::colorDodge:
case BlendMode::colorBurn:
case BlendMode::hardLight:
case BlendMode::softLight:
case BlendMode::difference:
case BlendMode::exclusion:
case BlendMode::multiply:
shaderFeatures |= ShaderFeatures::ENABLE_ADVANCED_BLEND;
break;
case BlendMode::srcOver:
break;
}
}
batch.shaderFeatures |= shaderFeatures & m_ctx->m_frameShaderFeaturesMask;
m_combinedShaderFeatures |= batch.shaderFeatures;
batch.drawContents |= draw->drawContents();
assert((batch.shaderFeatures &
pls::ShaderFeaturesMaskFor(drawType, m_ctx->frameInterlockMode())) ==
batch.shaderFeatures);
return batch;
}
} // namespace rive::pls