/*
 * Copyright 2022 Rive
 */

#include "rive/pls/pls_render_context.hpp"

#include "gr_inner_fan_triangulator.hpp"
#include "intersection_board.hpp"
#include "pls_paint.hpp"
#include "rive/pls/pls_draw.hpp"
#include "rive/pls/pls_image.hpp"
#include "rive/pls/pls_render_context_impl.hpp"
#include "shaders/constants.glsl"

#include <string_view>

namespace rive::pls
{
// Initial capacities that LogicalFlush::resetContainers() reserves for the gradient and draw
// containers.
constexpr size_t kDefaultSimpleGradientCapacity = 512;
constexpr size_t kDefaultComplexGradientCapacity = 1024;
constexpr size_t kDefaultDrawCapacity = 2048;

// Upper bound enforced on the heights of the gradient and tessellation textures.
constexpr size_t kMaxTextureHeight = 2048; // TODO: Move this variable to PlatformFeatures.
// Number of vertices that fit in a tessellation texture of maximum height.
constexpr size_t kMaxTessellationVertexCount = kMaxTextureHeight * kTessTextureWidth;
// Worst-case number of padding vertices that layout adds to the tessellation texture.
constexpr size_t kMaxTessellationPaddingVertexCount =
    pls::kMidpointFanPatchSegmentSpan +      // Padding at the beginning of the tess texture
    (pls::kOuterCurvePatchSegmentSpan - 1) + // Max padding between patch types in the tess texture
    1;                                       // Padding at the end of the tessellation texture
// Number of non-padding tessellation vertices a flush may hold and still be guaranteed to fit once
// the worst-case padding is added.
constexpr size_t kMaxTessellationVertexCountBeforePadding =
    kMaxTessellationVertexCount - kMaxTessellationPaddingVertexCount;

// We can only reorder 32767 draws at a time since the one-based groupIndex returned by
// IntersectionBoard is a signed 16-bit integer.
constexpr size_t kMaxReorderedDrawCount = std::numeric_limits<int16_t>::max();

// Number of rows a resource texture needs in order to hold "itemCount" items when each row holds
// "WidthInItems" of them (i.e. itemCount / WidthInItems, rounded up).
template <size_t WidthInItems> constexpr static size_t resource_texture_height(size_t itemCount)
{
    // Bias the count by one-less-than-a-row so the integer division below rounds up.
    const size_t biasedItemCount = itemCount + (WidthInItems - 1);
    return biasedItemCount / WidthInItems;
}

// Total number of gradient texture rows needed for the given ramp counts: simple ramps are packed
// pls::kGradTextureWidthInSimpleRamps to a row, and every complex ramp adds one row.
constexpr static size_t gradient_data_height(size_t simpleRampCount, size_t complexRampCount)
{
    const size_t simpleRampRows =
        resource_texture_height<pls::kGradTextureWidthInSimpleRamps>(simpleRampCount);
    return simpleRampRows + complexRampCount;
}

inline GradientContentKey::GradientContentKey(rcp<const PLSGradient> gradient) :
    m_gradient(std::move(gradient))
{}

inline GradientContentKey::GradientContentKey(GradientContentKey&& other) :
    m_gradient(std::move(other.m_gradient))
{}

// Two keys are equal when they refer to the same gradient object, or when their gradients have the
// same number of stops and byte-identical stop and color arrays.
bool GradientContentKey::operator==(const GradientContentKey& other) const
{
    const auto* lhs = m_gradient.get();
    const auto* rhs = other.m_gradient.get();
    if (lhs == rhs)
    {
        // Same object; no need to look at the contents.
        return true;
    }
    if (lhs->count() != rhs->count())
    {
        return false;
    }
    if (memcmp(lhs->stops(), rhs->stops(), lhs->count() * sizeof(float)) != 0)
    {
        return false;
    }
    return memcmp(lhs->colors(), rhs->colors(), lhs->count() * sizeof(ColorInt)) == 0;
}

// Hashes a gradient by content: the raw bytes of its stop array and of its color array are hashed
// separately (as string_views) and the two results are XOR'd together.
size_t DeepHashGradient::operator()(const GradientContentKey& key) const
{
    const auto* gradient = key.gradient();
    const auto hashBytes = [](const void* bytes, size_t sizeInBytes) {
        return std::hash<std::string_view>()(
            std::string_view(static_cast<const char*>(bytes), sizeInBytes));
    };
    const size_t stopsHash = hashBytes(gradient->stops(), gradient->count() * sizeof(float));
    const size_t colorsHash = hashBytes(gradient->colors(), gradient->count() * sizeof(ColorInt));
    return stopsHash ^ colorsHash;
}

// Takes ownership of the backend implementation and puts the context's resources in their initial
// state.
PLSRenderContext::PLSRenderContext(std::unique_ptr<PLSRenderContextImpl> impl) :
    m_impl(std::move(impl)),
    // One path ID is held back (hence the -1) so a path record can be reserved for the clearColor
    // paint in atomic mode. It also lets the storage buffers be indexed directly by pathID.
    m_maxPathID(MaxPathID(m_impl->platformFeatures().pathIDGranularity) - 1)
{
    // Size the resources for a default-constructed set of allocation counts, forcing the
    // reallocation, then run the regular release path.
    const auto initialAllocations = ResourceAllocationCounts();
    setResourceSizes(initialAllocations, /*forceRealloc =*/true);
    releaseResources();
}

// Destroys the context. No frame may be in progress at this point: every beginFrame() must have
// been matched by a flush() (checked by the assert in debug builds).
PLSRenderContext::~PLSRenderContext()
{
    // Always call flush() to avoid deadlock.
    assert(!m_didBeginFrame);
    // Delete the logical flushes before the block allocators let go of their allocations.
    m_logicalFlushes.clear();
}

// Exposes the platform features reported by the backend implementation.
const pls::PlatformFeatures& PLSRenderContext::platformFeatures() const
{
    const pls::PlatformFeatures& implFeatures = m_impl->platformFeatures();
    return implFeatures;
}

// Creates a render buffer of the requested type, flags and size. The work is delegated entirely to
// the backend implementation.
rcp<RenderBuffer> PLSRenderContext::makeRenderBuffer(RenderBufferType type,
                                                     RenderBufferFlags flags,
                                                     size_t sizeInBytes)
{
    rcp<RenderBuffer> renderBuffer = m_impl->makeRenderBuffer(type, flags, sizeInBytes);
    return renderBuffer;
}

// Asks the backend to decode "encodedBytes" into a texture and wraps the result in a PLSImage.
// Returns null when the backend does not produce a texture.
rcp<RenderImage> PLSRenderContext::decodeImage(Span<const uint8_t> encodedBytes)
{
    rcp<PLSTexture> decodedTexture = m_impl->decodeImageTexture(encodedBytes);
    if (decodedTexture != nullptr)
    {
        return make_rcp<PLSImage>(std::move(decodedTexture));
    }
    return nullptr;
}

// Resets the CPU-side containers, resizes the resources for a default-constructed set of allocation
// counts, and restarts the resource-trimming bookkeeping. Must not be called during a frame.
void PLSRenderContext::releaseResources()
{
    assert(!m_didBeginFrame);

    resetContainers();
    setResourceSizes(ResourceAllocationCounts());

    // Start a fresh trim interval: the timer restarts now, with no recorded requirements.
    const auto now = m_impl->secondsNow();
    m_lastResourceTrimTimeInSeconds = now;
    m_maxRecentResourceRequirements = ResourceAllocationCounts();
}

// Releases the storage held by the context's CPU-side containers. Must not be called during a
// frame.
void PLSRenderContext::resetContainers()
{
    assert(!m_didBeginFrame);

    const bool hasLogicalFlush = !m_logicalFlushes.empty();
    if (hasLogicalFlush)
    {
        assert(m_logicalFlushes.size() == 1); // Should get reset to 1 after flush().
        m_logicalFlushes.resize(1);
        m_logicalFlushes[0]->resetContainers();
    }

    // Empty the indirect draw list and give back its storage.
    m_indirectDrawList.clear();
    m_indirectDrawList.shrink_to_fit();

    m_intersectionBoard = nullptr;
}

// Creates a logical flush owned by "parent". rewind() puts every member in its initial state.
PLSRenderContext::LogicalFlush::LogicalFlush(PLSRenderContext* parent) : m_ctx(parent)
{
    rewind();
}

// Returns this logical flush to its initial, empty state so it can be reused. Containers are
// cleared, but their storage is not released here.
void PLSRenderContext::LogicalFlush::rewind()
{
    // Accumulated resource counts and the containers that were filled while draws were pushed.
    m_resourceCounts = PLSDraw::ResourceCounters();
    m_simpleGradients.clear();
    m_pendingSimpleGradientWrites.clear();
    m_complexGradients.clear();
    m_pendingComplexColorRampDraws.clear();
    m_clips.clear();
    m_plsDraws.clear();

    // Start from inverted (empty) bounds so that the first join() yields the first draw's bounds.
    constexpr int32_t kInt32Max = std::numeric_limits<int32_t>::max();
    constexpr int32_t kInt32Min = std::numeric_limits<int32_t>::min();
    m_combinedDrawBounds = {kInt32Max, kInt32Max, kInt32Min, kInt32Min};

    // Storage buffer padding computed during layout.
    m_pathPaddingCount = 0;
    m_paintPaddingCount = 0;
    m_paintAuxPaddingCount = 0;
    m_contourPaddingCount = 0;

    // Tessellation texture locations computed during layout.
    m_midpointFanTessVertexIdx = 0;
    m_midpointFanTessEndLocation = 0;
    m_outerCubicTessVertexIdx = 0;
    m_outerCubicTessEndLocation = 0;

    m_flushDesc = FlushDescriptor();

    m_drawList.reset();
    m_combinedShaderFeatures = pls::ShaderFeatures::NONE;

    // State describing the current path.
    m_currentPathIsStroked = false;
    m_currentPathContourDirections = pls::ContourDirections::none;
    m_currentPathID = 0;
    m_currentContourID = 0;
    m_currentContourPaddingVertexCount = 0;
    m_pathTessLocation = 0;
    m_pathMirroredTessLocation = 0;
    RIVE_DEBUG_CODE(m_expectedPathTessLocationAtEndOfPath = 0;)
    RIVE_DEBUG_CODE(m_expectedPathMirroredTessLocationAtEndOfPath = 0;)
    RIVE_DEBUG_CODE(m_pathCurveCount = 0;)

    m_currentZIndex = 0;

    RIVE_DEBUG_CODE(m_hasDoneLayout = false;)
}

// Shrinks this flush's containers back down and re-reserves their default capacities, so memory
// that grew during a heavy frame is not held onto indefinitely.
void PLSRenderContext::LogicalFlush::resetContainers()
{
    // Empties a vector, gives back its storage, then reserves a fresh default capacity.
    const auto clearShrinkAndReserve = [](auto& vec, size_t capacity) {
        vec.clear();
        vec.shrink_to_fit();
        vec.reserve(capacity);
    };
    // Lets a hash table drop down to its minimum bucket count, then reserves a default capacity.
    const auto rehashAndReserve = [](auto& table, size_t capacity) {
        table.rehash(0);
        table.reserve(capacity);
    };

    // The clip list is not given a default capacity.
    m_clips.clear();
    m_clips.shrink_to_fit();

    clearShrinkAndReserve(m_plsDraws, kDefaultDrawCapacity);

    rehashAndReserve(m_simpleGradients, kDefaultSimpleGradientCapacity);
    clearShrinkAndReserve(m_pendingSimpleGradientWrites, kDefaultSimpleGradientCapacity);

    rehashAndReserve(m_complexGradients, kDefaultComplexGradientCapacity);
    clearShrinkAndReserve(m_pendingComplexColorRampDraws, kDefaultComplexGradientCapacity);
}

// Begins a new frame: stores the frame descriptor, selects the interlock mode (and matching shader
// feature mask) the frame will use, and makes sure there is a logical flush to push draws into.
void PLSRenderContext::beginFrame(const FrameDescriptor& frameDescriptor)
{
    assert(!m_didBeginFrame);
    assert(frameDescriptor.renderTargetWidth > 0);
    assert(frameDescriptor.renderTargetHeight > 0);
    m_frameDescriptor = frameDescriptor;

    if (m_frameDescriptor.msaaSampleCount > 0 || !platformFeatures().supportsPixelLocalStorage)
    {
        // MSAA was requested, or pixel local storage is unavailable: use depthStencil mode, which
        // always runs with at least one sample.
        m_frameInterlockMode = pls::InterlockMode::depthStencil;
        if (m_frameDescriptor.msaaSampleCount < 1)
        {
            m_frameDescriptor.msaaSampleCount = 1;
        }
    }
    else
    {
        // Use atomics when raster ordering is disabled for the frame or unsupported.
        const bool useAtomics = m_frameDescriptor.disableRasterOrdering ||
                                !platformFeatures().supportsRasterOrdering;
        m_frameInterlockMode =
            useAtomics ? pls::InterlockMode::atomics : pls::InterlockMode::rasterOrdering;
    }
    m_frameShaderFeaturesMask = pls::ShaderFeaturesMaskFor(m_frameInterlockMode);

    if (m_logicalFlushes.empty())
    {
        m_logicalFlushes.emplace_back(new LogicalFlush(this));
    }
    RIVE_DEBUG_CODE(m_didBeginFrame = true);
}

// Returns true when "pixelBounds" cannot touch the current frame's render target, either because
// it lies completely off the target or because it encloses no area.
// NOTE(review): this reads .xy as the min corner and .zw as the max corner, i.e. it assumes IAABB
// is laid out as {left, top, right, bottom} -- confirm against the IAABB definition.
bool PLSRenderContext::isOutsideCurrentFrame(const IAABB& pixelBounds)
{
    assert(m_didBeginFrame);
    int4 bounds = simd::load4i(&pixelBounds);
    auto renderTargetSize = simd::cast<int32_t>(
        uint2{m_frameDescriptor.renderTargetWidth, m_frameDescriptor.renderTargetHeight});
    // Outside if, in either dimension: the bounds start at or beyond the render target's size, end
    // at or before 0, or start at or beyond where they end.
    return simd::any(bounds.xy >= renderTargetSize || bounds.zw <= 0 || bounds.xy >= bounds.zw);
}

// Whether clip rects can be used in the current frame. They are always available outside of
// depthStencil mode; in depthStencil mode they require platform support for clip planes.
bool PLSRenderContext::frameSupportsClipRects() const
{
    assert(m_didBeginFrame);
    if (m_frameInterlockMode != pls::InterlockMode::depthStencil)
    {
        return true;
    }
    return platformFeatures().supportsClipPlanes;
}

// Whether paths can be painted with images in the current frame. This is always available outside
// of atomic mode; in atomic mode it requires platform support for bindless textures.
bool PLSRenderContext::frameSupportsImagePaintForPaths() const
{
    assert(m_didBeginFrame);
    if (m_frameInterlockMode != pls::InterlockMode::atomics)
    {
        return true;
    }
    return platformFeatures().supportsBindlessTextures;
}

// Requests a clip ID for "contentBounds" from the most recent logical flush. See
// LogicalFlush::generateClipID() for the meaning of the return value.
uint32_t PLSRenderContext::generateClipID(const IAABB& contentBounds)
{
    assert(m_didBeginFrame);
    assert(!m_logicalFlushes.empty());
    const auto& currentFlush = m_logicalFlushes.back();
    return currentFlush->generateClipID(contentBounds);
}

// Records a new clip with the given content bounds and returns its one-based ID, or returns 0 when
// this flush has no clip IDs left (in which case the caller should flush and try again).
uint32_t PLSRenderContext::LogicalFlush::generateClipID(const IAABB& contentBounds)
{
    const bool outOfClipIDs = m_clips.size() >= m_ctx->m_maxPathID; // maxClipID == maxPathID.
    if (outOfClipIDs)
    {
        return 0;
    }

    // IDs are one-based, so the new clip's ID is the size of the list after appending it.
    m_clips.emplace_back(contentBounds);
    assert(m_ctx->m_clipContentID != m_clips.size());
    return m_clips.size();
}

// Looks up the mutable ClipInfo record for a one-based clip ID previously handed out by this flush.
PLSRenderContext::LogicalFlush::ClipInfo& PLSRenderContext::LogicalFlush::getWritableClipInfo(
    uint32_t clipID)
{
    assert(clipID > 0);
    assert(clipID <= m_clips.size());
    // Clip IDs start at 1; the backing list is zero-based.
    const size_t clipIndex = clipID - 1;
    return m_clips[clipIndex];
}

void PLSRenderContext::LogicalFlush::addClipReadBounds(uint32_t clipID, const IAABB& bounds)
{
    assert(clipID > 0);
    assert(clipID <= m_clips.size());
    ClipInfo& clipInfo = getWritableClipInfo(clipID);
    clipInfo.readBounds = clipInfo.readBounds.join(bounds);
}

// Hands a batch of draws to the most recent logical flush. Returns false when the batch does not
// fit; see LogicalFlush::pushDrawBatch().
bool PLSRenderContext::pushDrawBatch(PLSDrawUniquePtr draws[], size_t drawCount)
{
    assert(m_didBeginFrame);
    assert(!m_logicalFlushes.empty());
    const auto& currentFlush = m_logicalFlushes.back();
    return currentFlush->pushDrawBatch(draws, drawCount);
}

// Tries to append a batch of draws to this logical flush.
//
// draws:     array of "drawCount" draws. On success, ownership of every draw is moved into this
//            flush. On failure, none of the draws are moved from.
// drawCount: number of elements in "draws".
//
// Returns true if the whole batch was accepted. Returns false if the batch would exceed one of the
// flush's hard limits (reorderable draw count in atomic mode, path/contour IDs, tessellation
// vertices, or gradient texture rows), in which case the caller needs to flush and try again.
bool PLSRenderContext::LogicalFlush::pushDrawBatch(PLSDrawUniquePtr draws[], size_t drawCount)
{
    assert(!m_hasDoneLayout);

    // NOTE: This check has to consult the frame's interlock mode and m_plsDraws.
    // m_flushDesc.interlockMode is reset by rewind() and isn't assigned until layoutResources(),
    // which only runs once draws are no longer being pushed, and this function appends its draws
    // to m_plsDraws, not to m_drawList.
    if (m_ctx->frameInterlockMode() == pls::InterlockMode::atomics &&
        m_plsDraws.size() + drawCount > kMaxReorderedDrawCount)
    {
        // We can only reorder kMaxReorderedDrawCount (32767) draws at a time since the one-based
        // groupIndex returned by IntersectionBoard is a signed 16-bit integer.
        return false;
    }

    // Tally what the flush's resource counts would be with the new batch included.
    auto countsVector = m_resourceCounts.toVec();
    for (size_t i = 0; i < drawCount; ++i)
    {
        assert(!draws[i]->pixelBounds().empty());
        assert(m_ctx->frameSupportsClipRects() || draws[i]->clipRectInverseMatrix() == nullptr);
        countsVector += draws[i]->resourceCounts().toVec();
    }
    PLSDraw::ResourceCounters countsWithNewBatch = countsVector;

    // Textures have hard size limits. If new batch doesn't fit in one of the textures, the caller
    // needs to flush and try again.
    if (countsWithNewBatch.pathCount > m_ctx->m_maxPathID ||
        countsWithNewBatch.contourCount > kMaxContourID ||
        countsWithNewBatch.midpointFanTessVertexCount +
                countsWithNewBatch.outerCubicTessVertexCount >
            kMaxTessellationVertexCountBeforePadding)
    {
        return false;
    }

    // Allocate spans in the gradient texture.
    for (size_t i = 0; i < drawCount; ++i)
    {
        if (!draws[i]->allocateGradientIfNeeded(this, &countsWithNewBatch))
        {
            // The gradient doesn't fit. Give up and let the caller flush and try again.
            return false;
        }
    }

    // Everything fits. Take ownership of the draws and grow the combined bounds to include them.
    for (size_t i = 0; i < drawCount; ++i)
    {
        m_plsDraws.push_back(std::move(draws[i]));
        m_combinedDrawBounds = m_combinedDrawBounds.join(m_plsDraws.back()->pixelBounds());
    }

    m_resourceCounts = countsWithNewBatch;
    return true;
}

// Finds or reserves a location in the gradient texture for "gradient" and writes it to
// "colorRampLocation".
//
// Gradients with exactly two stops, the first at 0, are "simple" and occupy two texels. Every other
// gradient is "complex" and occupies a whole row; the number of spans needed to render it is added
// to "counters". Identical gradients share a location.
//
// Returns false when the gradient texture has no room left, in which case the caller has to flush
// and try again.
bool PLSRenderContext::LogicalFlush::allocateGradient(const PLSGradient* gradient,
                                                      PLSDraw::ResourceCounters* counters,
                                                      pls::ColorRampLocation* colorRampLocation)
{
    assert(!m_hasDoneLayout);

    const float* stops = gradient->stops();
    size_t stopCount = gradient->count();

    if (stopCount == 2 && stops[0] == 0)
    {
        // Simple gradient: a two-texel color ramp, keyed on its pair of colors.
        assert(stops[1] == 1); // PLSGradient transforms the stops so that the final stop == 1.
        uint64_t simpleKey;
        static_assert(sizeof(simpleKey) == sizeof(ColorInt) * 2);
        RIVE_INLINE_MEMCPY(&simpleKey, gradient->colors(), sizeof(ColorInt) * 2);

        uint32_t rampTexelsIdx;
        auto existing = m_simpleGradients.find(simpleKey);
        if (existing == m_simpleGradients.end())
        {
            // New ramp. Make sure adding it does not push the texture past its maximum height.
            if (gradient_data_height(m_simpleGradients.size() + 1, m_complexGradients.size()) >
                kMaxTextureHeight)
            {
                return false;
            }
            rampTexelsIdx = m_simpleGradients.size() * 2;
            m_simpleGradients.insert({simpleKey, rampTexelsIdx});
            m_pendingSimpleGradientWrites.emplace_back().set(gradient->colors());
        }
        else
        {
            // An identical ramp is already in the texture.
            rampTexelsIdx = existing->second;
        }
        colorRampLocation->row = rampTexelsIdx / kGradTextureWidth;
        colorRampLocation->col = rampTexelsIdx % kGradTextureWidth;
        return true;
    }

    // Complex gradient: rendered to an entire row of the gradient texture, keyed on its content.
    GradientContentKey key(ref_rcp(gradient));
    uint16_t row;
    auto existing = m_complexGradients.find(key);
    if (existing == m_complexGradients.end())
    {
        // New gradient. Make sure adding a row does not push the texture past its maximum height.
        if (gradient_data_height(m_simpleGradients.size(), m_complexGradients.size() + 1) >
            kMaxTextureHeight)
        {
            return false;
        }

        // One more span than there are stops.
        counters->complexGradientSpanCount += stopCount + 1;

        row = static_cast<uint16_t>(m_complexGradients.size());
        m_complexGradients.emplace(std::move(key), row);
        m_pendingComplexColorRampDraws.push_back(gradient);
    }
    else
    {
        // An identical gradient is already in the texture.
        row = existing->second;
    }
    colorRampLocation->row = row;
    colorRampLocation->col = ColorRampLocation::kComplexGradientMarker;
    return true;
}

// Ends the current logical flush and starts a new one, without submitting anything to the backend.
// Draws pushed after this call go into the new LogicalFlush; all logical flushes are submitted
// together by flush().
void PLSRenderContext::logicalFlush()
{
    assert(m_didBeginFrame);

    // Reset clipping state after every logical flush because the clip buffer is not preserved
    // between render passes.
    m_clipContentID = 0;

    // Don't issue any GPU commands between logical flushes. Instead, build up a list of flushes
    // that we will submit all at once at the end of the frame.
    m_logicalFlushes.emplace_back(new LogicalFlush(this));
}

// Ends the frame started by beginFrame() and submits it to the backend. In order, this:
//   1. Lays out the resources of every logical flush and totals up their requirements.
//   2. Grows (and, every 5 seconds, possibly trims) the resource allocations to fit.
//   3. Maps the resource buffers, has every logical flush write its data, and unmaps them.
//   4. Issues each logical flush to the backend implementation.
//   5. Rewinds to a single empty logical flush and resets the per-frame allocators.
//
// flushResources: the render target (whose dimensions must match the frame descriptor's) and the
//                 other per-flush resources that get forwarded to the logical flushes.
void PLSRenderContext::flush(const FlushResources& flushResources)
{
    assert(m_didBeginFrame);
    assert(flushResources.renderTarget->width() == m_frameDescriptor.renderTargetWidth);
    assert(flushResources.renderTarget->height() == m_frameDescriptor.renderTargetHeight);

    m_clipContentID = 0;

    // Layout this frame's resource buffers and textures.
    LogicalFlush::ResourceCounters totalFrameResourceCounts;
    LogicalFlush::LayoutCounters layoutCounts;
    for (size_t i = 0; i < m_logicalFlushes.size(); ++i)
    {
        m_logicalFlushes[i]->layoutResources(flushResources,
                                             i,
                                             i == m_logicalFlushes.size() - 1,
                                             &totalFrameResourceCounts,
                                             &layoutCounts);
    }
    assert(layoutCounts.maxGradTextureHeight <= kMaxTextureHeight);
    assert(layoutCounts.maxTessTextureHeight <= kMaxTextureHeight);

    // Determine the minimum required resource allocation sizes to service this flush.
    ResourceAllocationCounts allocs;
    allocs.flushUniformBufferCount = m_logicalFlushes.size();
    allocs.imageDrawUniformBufferCount = totalFrameResourceCounts.imageDrawCount;
    allocs.pathBufferCount = totalFrameResourceCounts.pathCount + layoutCounts.pathPaddingCount;
    allocs.paintBufferCount = totalFrameResourceCounts.pathCount + layoutCounts.paintPaddingCount;
    allocs.paintAuxBufferCount =
        totalFrameResourceCounts.pathCount + layoutCounts.paintAuxPaddingCount;
    allocs.contourBufferCount =
        totalFrameResourceCounts.contourCount + layoutCounts.contourPaddingCount;
    // The gradient texture needs to be updated in entire rows at a time. Extend its
    // texture-transfer buffer's length in order to be able to serve a worst-case scenario.
    allocs.simpleGradientBufferCount =
        layoutCounts.simpleGradCount + pls::kGradTextureWidthInSimpleRamps - 1;
    allocs.complexGradSpanBufferCount = totalFrameResourceCounts.complexGradientSpanCount;
    allocs.tessSpanBufferCount = totalFrameResourceCounts.maxTessellatedSegmentCount;
    allocs.triangleVertexBufferCount = totalFrameResourceCounts.maxTriangleVertexCount;
    allocs.gradTextureHeight = layoutCounts.maxGradTextureHeight;
    allocs.tessTextureHeight = layoutCounts.maxTessTextureHeight;

    // Track m_maxRecentResourceRequirements so we can trim GPU allocations when steady-state usage
    // goes down.
    m_maxRecentResourceRequirements =
        simd::max(allocs.toVec(), m_maxRecentResourceRequirements.toVec());

    // Grow resources enough to handle this flush.
    // If "allocs" already fits in our current allocations, then don't change them.
    // If they don't fit, overallocate by 25% in order to create some slack for growth.
    allocs = simd::if_then_else(allocs.toVec() <= m_currentResourceAllocations.toVec(),
                                m_currentResourceAllocations.toVec(),
                                allocs.toVec() * size_t(5) / size_t(4));

    // Additionally, every 5 seconds, trim resources down to the most recent steady-state usage.
    double flushTime = m_impl->secondsNow();
    bool needsResourceTrim = flushTime - m_lastResourceTrimTimeInSeconds >= 5;
    if (needsResourceTrim)
    {
        // Trim GPU resource allocations to 125% of their maximum recent usage, and only if the
        // recent usage is 2/3 or less of the current allocation.
        allocs = simd::if_then_else(m_maxRecentResourceRequirements.toVec() <=
                                        allocs.toVec() * size_t(2) / size_t(3),
                                    m_maxRecentResourceRequirements.toVec() * size_t(5) / size_t(4),
                                    allocs.toVec());

        // Zero out m_maxRecentResourceRequirements for the next interval.
        m_maxRecentResourceRequirements = ResourceAllocationCounts();
        m_lastResourceTrimTimeInSeconds = flushTime;
    }

    setResourceSizes(allocs);

    // Write out the GPU buffers for this frame.
    mapResourceBuffers(allocs);

    for (const auto& flush : m_logicalFlushes)
    {
        flush->writeResources();
    }

    // Verify that what was written agrees with what layout reserved. The tessellation span and
    // triangle vertex counts are upper bounds, so those may legitimately come in under.
    assert(m_flushUniformData.elementsWritten() == m_logicalFlushes.size());
    assert(m_imageDrawUniformData.elementsWritten() == totalFrameResourceCounts.imageDrawCount);
    assert(m_pathData.elementsWritten() ==
           totalFrameResourceCounts.pathCount + layoutCounts.pathPaddingCount);
    assert(m_paintData.elementsWritten() ==
           totalFrameResourceCounts.pathCount + layoutCounts.paintPaddingCount);
    assert(m_paintAuxData.elementsWritten() ==
           totalFrameResourceCounts.pathCount + layoutCounts.paintAuxPaddingCount);
    assert(m_contourData.elementsWritten() ==
           totalFrameResourceCounts.contourCount + layoutCounts.contourPaddingCount);
    assert(m_simpleColorRampsData.elementsWritten() == layoutCounts.simpleGradCount);
    assert(m_gradSpanData.elementsWritten() == totalFrameResourceCounts.complexGradientSpanCount);
    assert(m_tessSpanData.elementsWritten() <= totalFrameResourceCounts.maxTessellatedSegmentCount);
    assert(m_triangleVertexData.elementsWritten() <=
           totalFrameResourceCounts.maxTriangleVertexCount);

    unmapResourceBuffers();

    // Issue logical flushes to the backend.
    for (const auto& flush : m_logicalFlushes)
    {
        m_impl->flush(flush->desc());
    }

    // Keep a single, rewound LogicalFlush around for the next frame to reuse.
    if (!m_logicalFlushes.empty())
    {
        m_logicalFlushes.resize(1);
        m_logicalFlushes.front()->rewind();
    }

    // Drop all memory that was allocated for this frame using TrivialBlockAllocator.
    m_perFrameAllocator.reset();
    m_numChopsAllocator.reset();
    m_chopVerticesAllocator.reset();
    m_tangentPairsAllocator.reset();
    m_polarSegmentCountsAllocator.reset();
    m_parametricSegmentCountsAllocator.reset();

    m_frameDescriptor = FrameDescriptor();

    RIVE_DEBUG_CODE(m_didBeginFrame = false;)

    // Wait to reset CPU-side containers until after the flush has finished.
    if (needsResourceTrim)
    {
        resetContainers();
    }
}

// Decides where this logical flush's data will live within the frame's shared resource buffers and
// textures, and fills out m_flushDesc accordingly. No resource data is written here.
//
// flushResources:             supplies the render target and external command buffer.
// logicalFlushIdx:            index of this logical flush within the frame. Only the first flush
//                             (index 0) honors the frame's load action; later ones always preserve
//                             the render target.
// isFinalFlushOfFrame:        recorded in the flush descriptor as-is.
// runningFrameResourceCounts: in/out. On entry, the totals of the logical flushes laid out before
//                             this one (used as this flush's starting offsets). On exit, this
//                             flush's counts have been added.
// runningFrameLayoutCounts:   in/out. Same idea, for padding counts, simple gradient counts and
//                             the maximum gradient/tessellation texture heights.
void PLSRenderContext::LogicalFlush::layoutResources(const FlushResources& flushResources,
                                                     size_t logicalFlushIdx,
                                                     bool isFinalFlushOfFrame,
                                                     ResourceCounters* runningFrameResourceCounts,
                                                     LayoutCounters* runningFrameLayoutCounts)
{
    assert(!m_hasDoneLayout);

    const FrameDescriptor& frameDescriptor = m_ctx->frameDescriptor();

    // Reserve a path record for the clearColor paint (used by atomic mode).
    // This also allows us to index the storage buffers directly by pathID.
    ++m_resourceCounts.pathCount;

    // Storage buffer offsets are required to be aligned on multiples of 256, so add padding
    // elements to our storage buffers.
    m_pathPaddingCount =
        pls::PaddingToAlignUp<pls::kPathBufferAlignmentInElements>(m_resourceCounts.pathCount);
    m_paintPaddingCount =
        pls::PaddingToAlignUp<pls::kPaintBufferAlignmentInElements>(m_resourceCounts.pathCount);
    m_paintAuxPaddingCount =
        pls::PaddingToAlignUp<pls::kPaintAuxBufferAlignmentInElements>(m_resourceCounts.pathCount);
    m_contourPaddingCount = pls::PaddingToAlignUp<pls::kContourBufferAlignmentInElements>(
        m_resourceCounts.contourCount);

    size_t totalTessVertexCountWithPadding = 0;
    if ((m_resourceCounts.midpointFanTessVertexCount |
         m_resourceCounts.outerCubicTessVertexCount) != 0)
    {
        // midpointFan tessellation vertices reside at the beginning of the tessellation texture,
        // after 1 patch of padding vertices.
        constexpr uint32_t kPrePadding = pls::kMidpointFanPatchSegmentSpan;
        m_midpointFanTessVertexIdx = kPrePadding;
        m_midpointFanTessEndLocation =
            m_midpointFanTessVertexIdx + m_resourceCounts.midpointFanTessVertexCount;

        // outerCubic tessellation vertices reside after the midpointFan vertices, aligned on a
        // multiple of the outerCubic patch size.
        uint32_t interiorPadding =
            PaddingToAlignUp<pls::kOuterCurvePatchSegmentSpan>(m_midpointFanTessEndLocation);
        m_outerCubicTessVertexIdx = m_midpointFanTessEndLocation + interiorPadding;
        m_outerCubicTessEndLocation =
            m_outerCubicTessVertexIdx + m_resourceCounts.outerCubicTessVertexCount;

        // We need one more padding vertex after all the tessellation vertices.
        constexpr uint32_t kPostPadding = 1;
        totalTessVertexCountWithPadding = m_outerCubicTessEndLocation + kPostPadding;

        assert(kPrePadding + interiorPadding + kPostPadding <= kMaxTessellationPaddingVertexCount);
        assert(totalTessVertexCountWithPadding <= kMaxTessellationVertexCount);
    }

    uint32_t tessDataHeight =
        resource_texture_height<kTessTextureWidth>(totalTessVertexCountWithPadding);
    if (m_resourceCounts.maxTessellatedSegmentCount != 0)
    {
        // Conservatively account for line breaks and padding in the tessellation span count.
        // Line breaks potentially introduce a new span. Count the maximum number of line breaks we
        // might encounter, which is at most TWO for every line in the tessellation texture (one for
        // a forward span, and one for its reflection.)
        size_t maxSpanBreakCount = tessDataHeight * 2;
        // The tessellation texture requires 3 separate spans of padding vertices (see above and
        // below).
        constexpr size_t kPaddingSpanCount = 3;
        m_resourceCounts.maxTessellatedSegmentCount += maxSpanBreakCount + kPaddingSpanCount;
    }

    m_flushDesc.renderTarget = flushResources.renderTarget;
    m_flushDesc.interlockMode = m_ctx->frameInterlockMode();
    m_flushDesc.msaaSampleCount = frameDescriptor.msaaSampleCount;

    // In atomic mode, we may be able to skip the explicit clear of the color buffer and fold it
    // into the atomic "resolve" operation instead.
    bool doClearDuringAtomicResolve = false;

    if (logicalFlushIdx != 0)
    {
        // We always have to preserve the renderTarget between logical flushes.
        m_flushDesc.colorLoadAction = pls::LoadAction::preserveRenderTarget;
    }
    else if (frameDescriptor.loadAction == pls::LoadAction::clear)
    {
        // In atomic mode, we can clear during the resolve operation if the clearColor is opaque
        // (because we don't want or have a "source only" blend mode).
        doClearDuringAtomicResolve = m_ctx->frameInterlockMode() == pls::InterlockMode::atomics &&
                                     colorAlpha(frameDescriptor.clearColor) == 255;
        m_flushDesc.colorLoadAction =
            doClearDuringAtomicResolve ? pls::LoadAction::dontCare : pls::LoadAction::clear;
    }
    else
    {
        m_flushDesc.colorLoadAction = frameDescriptor.loadAction;
    }
    m_flushDesc.clearColor = frameDescriptor.clearColor;

    if (doClearDuringAtomicResolve)
    {
        // In atomic mode we can accomplish a clear of the color buffer while the shader resolves
        // coverage, instead of actually clearing. writeResources() will configure the fill for
        // pathID=0 to be a solid fill matching the clearColor, so if we just initialize coverage
        // buffer to solid coverage with pathID=0, the resolve step will write out the correct clear
        // color.
        assert(m_flushDesc.interlockMode == pls::InterlockMode::atomics);
        m_flushDesc.coverageClearValue = static_cast<uint32_t>(FIXED_COVERAGE_ONE);
    }
    else if (m_flushDesc.interlockMode == pls::InterlockMode::atomics)
    {
        // When we don't skip the initial clear in atomic mode, clear the coverage buffer to
        // pathID=0 and a transparent coverage value.
        // pathID=0 meets the requirement that pathID is always monotonically increasing.
        // Transparent coverage makes sure the clearColor doesn't get written out while resolving.
        m_flushDesc.coverageClearValue = static_cast<uint32_t>(FIXED_COVERAGE_ZERO);
    }
    else
    {
        // In non-atomic mode, the coverage buffer just needs to be initialized with "pathID=0" to
        // avoid collisions with any pathIDs being rendered.
        m_flushDesc.coverageClearValue = 0;
    }

    if (doClearDuringAtomicResolve || m_flushDesc.colorLoadAction == pls::LoadAction::clear)
    {
        // If we're clearing then we always update the entire render target.
        m_flushDesc.renderTargetUpdateBounds = m_flushDesc.renderTarget->bounds();
    }
    else
    {
        // When we don't clear, we only update the draw bounds.
        m_flushDesc.renderTargetUpdateBounds =
            m_flushDesc.renderTarget->bounds().intersect(m_combinedDrawBounds);
    }
    if (m_flushDesc.renderTargetUpdateBounds.empty())
    {
        // If this is empty it means there are no draws and no clear.
        m_flushDesc.renderTargetUpdateBounds = {0, 0, 0, 0};
    }

    // Each "first*" offset below is the running frame total (including padding) accumulated by the
    // logical flushes that were laid out before this one.
    m_flushDesc.flushUniformDataOffsetInBytes = logicalFlushIdx * sizeof(pls::FlushUniforms);
    m_flushDesc.pathCount = m_resourceCounts.pathCount;
    m_flushDesc.firstPath =
        runningFrameResourceCounts->pathCount + runningFrameLayoutCounts->pathPaddingCount;
    m_flushDesc.firstPaint =
        runningFrameResourceCounts->pathCount + runningFrameLayoutCounts->paintPaddingCount;
    m_flushDesc.firstPaintAux =
        runningFrameResourceCounts->pathCount + runningFrameLayoutCounts->paintAuxPaddingCount;
    m_flushDesc.contourCount = m_resourceCounts.contourCount;
    m_flushDesc.firstContour =
        runningFrameResourceCounts->contourCount + runningFrameLayoutCounts->contourPaddingCount;
    m_flushDesc.complexGradSpanCount = m_resourceCounts.complexGradientSpanCount;
    m_flushDesc.firstComplexGradSpan = runningFrameResourceCounts->complexGradientSpanCount;
    m_flushDesc.simpleGradTexelsWidth =
        std::min<uint32_t>(m_simpleGradients.size(), pls::kGradTextureWidthInSimpleRamps) * 2;
    m_flushDesc.simpleGradTexelsHeight =
        resource_texture_height<pls::kGradTextureWidthInSimpleRamps>(m_simpleGradients.size());
    m_flushDesc.simpleGradDataOffsetInBytes =
        runningFrameLayoutCounts->simpleGradCount * sizeof(pls::TwoTexelRamp);
    // Complex gradient rows begin right below the rows holding the simple ramps.
    m_flushDesc.complexGradRowsTop = m_flushDesc.simpleGradTexelsHeight;
    m_flushDesc.complexGradRowsHeight = m_complexGradients.size();
    m_flushDesc.tessDataHeight = tessDataHeight;

    m_flushDesc.wireframe = frameDescriptor.wireframe;
    m_flushDesc.externalCommandBuffer = flushResources.externalCommandBuffer;
    m_flushDesc.isFinalFlushOfFrame = isFinalFlushOfFrame;

    // Fold this flush's counts into the running frame totals for the next logical flush.
    *runningFrameResourceCounts = runningFrameResourceCounts->toVec() + m_resourceCounts.toVec();
    runningFrameLayoutCounts->pathPaddingCount += m_pathPaddingCount;
    runningFrameLayoutCounts->paintPaddingCount += m_paintPaddingCount;
    runningFrameLayoutCounts->paintAuxPaddingCount += m_paintAuxPaddingCount;
    runningFrameLayoutCounts->contourPaddingCount += m_contourPaddingCount;
    runningFrameLayoutCounts->simpleGradCount += m_simpleGradients.size();
    runningFrameLayoutCounts->maxGradTextureHeight =
        std::max(m_flushDesc.simpleGradTexelsHeight + m_flushDesc.complexGradRowsHeight,
                 runningFrameLayoutCounts->maxGradTextureHeight);
    runningFrameLayoutCounts->maxTessTextureHeight =
        std::max(m_flushDesc.tessDataHeight, runningFrameLayoutCounts->maxTessTextureHeight);

    RIVE_DEBUG_CODE(m_hasDoneLayout = true;)
}

// Writes this logical flush's resource data into the context's mapped buffers and builds the
// low-level draw list (m_drawList). Layout must already have been done (asserts m_hasDoneLayout).
//
// In order, this function:
//   1. Finalizes the gradient texture layout and emplaces this flush's uniforms.
//   2. Copies the pending simple gradient ramps and emits spans for the complex gradient ramps.
//   3. Writes the path/paint/paintAux entries reserved for the clear color.
//   4. Pushes the padding vertices of the tessellation texture.
//   5. Pushes every PLSDraw: in m_plsDraws order for rasterOrdering mode, otherwise re-sorted by a
//      packed 64-bit key with barriers inserted between incompatible neighbors.
//   6. Appends the storage buffer padding and records the final counts in m_flushDesc.
void PLSRenderContext::LogicalFlush::writeResources()
{
    const pls::PlatformFeatures& platformFeatures = m_ctx->platformFeatures();
    assert(m_hasDoneLayout);
    // The context's write cursors must sit exactly where layout said this flush begins.
    assert(m_flushDesc.firstPath == m_ctx->m_pathData.elementsWritten());
    assert(m_flushDesc.firstPaint == m_ctx->m_paintData.elementsWritten());
    assert(m_flushDesc.firstPaintAux == m_ctx->m_paintAuxData.elementsWritten());

    // Wait until here to layout the gradient texture because the final gradient texture height is
    // not decided until after all LogicalFlushes have run layoutResources().
    m_gradTextureLayout.inverseHeight = 1.f / m_ctx->m_currentResourceAllocations.gradTextureHeight;
    m_gradTextureLayout.complexOffsetY = m_flushDesc.complexGradRowsTop;

    // Exact tessSpan/triangleVertex counts aren't known until after their data is written out.
    m_flushDesc.firstTessVertexSpan = m_ctx->m_tessSpanData.elementsWritten();
    size_t initialTriangleVertexDataSize = m_ctx->m_triangleVertexData.bytesWritten();

    m_ctx->m_flushUniformData.emplace_back(m_flushDesc, platformFeatures);

    // Write out the simple gradient data.
    assert(m_simpleGradients.size() == m_pendingSimpleGradientWrites.size());
    if (!m_pendingSimpleGradientWrites.empty())
    {
        m_ctx->m_simpleColorRampsData.push_back_n(m_pendingSimpleGradientWrites.data(),
                                                  m_pendingSimpleGradientWrites.size());
    }

    // Write out the vertex data for rendering complex gradients.
    assert(m_complexGradients.size() == m_pendingComplexColorRampDraws.size());
    if (!m_pendingComplexColorRampDraws.empty())
    {
        // The viewport will start at simpleGradDataHeight when rendering color ramps.
        // Each pending complex gradient gets its own row, 'y'.
        for (uint32_t y = 0; y < m_pendingComplexColorRampDraws.size(); ++y)
        {
            const PLSGradient* gradient = m_pendingComplexColorRampDraws[y];
            const ColorInt* colors = gradient->colors();
            const float* stops = gradient->stops();
            size_t stopCount = gradient->count();

            // Push "GradientSpan" instances that will render each section of the color ramp.
            ColorInt lastColor = colors[0];
            uint32_t lastXFixed = 0;
            // "stop * w + .5" converts a stop position to an x-coordinate in the gradient texture.
            // Stops should be aligned (ideally) on pixel centers to prevent bleed.
            // Render half-pixel-wide caps at the beginning and end to ensure the boundary pixels
            // get filled.
            float w = kGradTextureWidth - 1.f;
            for (size_t i = 0; i < stopCount; ++i)
            {
                float x = stops[i] * w + .5f;
                // Rescale x so that the full texture width maps onto the range [0, 65536).
                uint32_t xFixed = static_cast<uint32_t>(x * (65536.f / kGradTextureWidth));
                assert(lastXFixed <= xFixed && xFixed < 65536); // stops[] must be ordered.
                m_ctx->m_gradSpanData.set_back(lastXFixed, xFixed, y, lastColor, colors[i]);
                lastColor = colors[i];
                lastXFixed = xFixed;
            }
            // A final span carries the last color from the last stop out to 65535.
            m_ctx->m_gradSpanData.set_back(lastXFixed, 65535u, y, lastColor, lastColor);
        }
    }

    // Write a path record for the clearColor paint (used by atomic mode).
    // This also allows us to index the storage buffers directly by pathID.
    // Only the paint entry is given values here; the path and paintAux entries go through
    // skip_back() (presumably advancing one element without writing it -- confirm).
    pls::SimplePaintValue clearColorValue;
    clearColorValue.color = m_ctx->frameDescriptor().clearColor;
    m_ctx->m_pathData.skip_back();
    m_ctx->m_paintData.set_back(FillRule::nonZero,
                                PaintType::solidColor,
                                clearColorValue,
                                GradTextureLayout(),
                                /*clipID =*/0,
                                /*hasClipRect =*/false,
                                BlendMode::srcOver);
    m_ctx->m_paintAuxData.skip_back();

    // Render padding vertices in the tessellation texture.
    if (m_flushDesc.tessDataHeight > 0)
    {
        // Padding at the beginning of the tessellation texture.
        pushPaddingVertices(0, pls::kMidpointFanPatchSegmentSpan);
        // Padding between patch types in the tessellation texture.
        if (m_outerCubicTessVertexIdx > m_midpointFanTessEndLocation)
        {
            pushPaddingVertices(m_midpointFanTessEndLocation,
                                m_outerCubicTessVertexIdx - m_midpointFanTessEndLocation);
        }
        // The final vertex of the final patch of each contour crosses over into the next contour.
        // (This is how we wrap around back to the beginning.) Therefore, the final contour of the
        // flush needs an out-of-contour vertex to cross into as well, so we emit a padding vertex
        // here at the end.
        pushPaddingVertices(m_outerCubicTessEndLocation, 1);
    }

    // Write out all the data for our high level draws, and build up a low-level draw list.
    if (m_ctx->frameInterlockMode() == pls::InterlockMode::rasterOrdering)
    {
        // rasterOrdering mode pushes the draws in m_plsDraws order, with no re-sorting.
        for (const PLSDrawUniquePtr& draw : m_plsDraws)
        {
            draw->pushToRenderContext(this);
        }
    }
    else
    {
        assert(m_plsDraws.size() <= kMaxReorderedDrawCount);

        // Sort the draw list to optimize batching, since we can only batch non-overlapping draws.
        // The key vector and the intersection board both live on the context.
        std::vector<int64_t>& indirectDrawList = m_ctx->m_indirectDrawList;
        indirectDrawList.resize(m_plsDraws.size());

        // The intersection board is created lazily, the first time a re-sorted flush needs it.
        if (m_ctx->m_intersectionBoard == nullptr)
        {
            m_ctx->m_intersectionBoard = std::make_unique<IntersectionBoard>();
        }
        IntersectionBoard* intersectionBoard = m_ctx->m_intersectionBoard.get();
        intersectionBoard->resizeAndReset(m_flushDesc.renderTarget->width(),
                                          m_flushDesc.renderTarget->height());

        // Build a list of sort keys that determine the final draw order.
        //
        // Key layout, from most to least significant bit:
        //   [63:48] draw group index (signed; negated for reverse-sorted opaque draws)
        //   [47:45] draw type
        //   [44:26] texture resource hash (low 19 bits)
        //   [25:22] blend mode
        //   [21:16] draw contents
        //   [15:0]  index of the draw in m_plsDraws
        constexpr static int kDrawGroupShift = 48; // Where in the key does the draw group begin?
        constexpr static int64_t kDrawGroupMask = 0xffffllu << kDrawGroupShift;
        constexpr static int kDrawTypeShift = 45;
        constexpr static int64_t kDrawTypeMask RIVE_MAYBE_UNUSED = 7llu << kDrawTypeShift;
        constexpr static int kTextureHashShift = 26;
        constexpr static int64_t kTextureHashMask = 0x7ffffllu << kTextureHashShift;
        constexpr static int kBlendModeShift = 22;
        constexpr static int kBlendModeMask = 0xf << kBlendModeShift;
        constexpr static int kDrawContentsShift = 16;
        constexpr static int64_t kDrawContentsMask = 0x3fllu << kDrawContentsShift;
        constexpr static int64_t kDrawIndexMask = 0xffff;
        for (size_t i = 0; i < m_plsDraws.size(); ++i)
        {
            PLSDraw* draw = m_plsDraws[i].get();

            int4 drawBounds = simd::load4i(&m_plsDraws[i]->pixelBounds());

            // Add one extra pixel of padding to the draw bounds to make absolutely certain we get
            // no overlapping pixels, which destroy the atomic shader.
            // Any component already at its int32 extreme is left alone, so the +/-1 can't overflow.
            const int32_t kMax32i = std::numeric_limits<int32_t>::max();
            const int32_t kMin32i = std::numeric_limits<int32_t>::min();
            drawBounds = simd::if_then_else(drawBounds != int4{kMin32i, kMin32i, kMax32i, kMax32i},
                                            drawBounds + int4{-1, -1, 1, 1},
                                            drawBounds);

            // Our top priority in re-ordering is to group non-overlapping draws together, in order
            // to maximize batching while preserving correctness.
            int64_t drawGroupIdx = intersectionBoard->addRectangle(drawBounds);
            assert(drawGroupIdx > 0);
            if (m_flushDesc.interlockMode == pls::InterlockMode::depthStencil && draw->isOpaque())
            {
                // In depthStencil mode we can reverse-sort opaque paths front to back, draw them
                // first, and take advantage of early Z culling.
                //
                // To keep things simple initially, we don't reverse-sort draws that use clipping.
                // (Otherwise if a clip affects both opaque and transparent content, we would have
                // to apply it twice.)
                bool usesClipping = draw->drawContents() &
                                    (pls::DrawContents::activeClip | pls::DrawContents::clipUpdate);
                if (!usesClipping)
                {
                    drawGroupIdx = -drawGroupIdx;
                }
            }
            // NOTE(review): drawGroupIdx may be negative here, and left-shifting a negative signed
            // value is undefined behavior before C++20. Confirm the targeted toolchains define it.
            int64_t key = drawGroupIdx << kDrawGroupShift;

            // Within sub-groups of non-overlapping draws, sort similar draw types together.
            int64_t drawType = static_cast<int64_t>(draw->type());
            assert(drawType <= kDrawTypeMask >> kDrawTypeShift);
            key |= drawType << kDrawTypeShift;

            // Within sub-groups of matching draw type, sort by texture binding.
            // Draws without an image texture use a hash of 0.
            int64_t textureHash = draw->imageTexture() != nullptr
                                      ? draw->imageTexture()->textureResourceHash() &
                                            (kTextureHashMask >> kTextureHashShift)
                                      : 0;
            key |= textureHash << kTextureHashShift;

            // If using KHR_blend_equation_advanced, we need a batching barrier between draws with
            // different blend modes.
            // If not using KHR_blend_equation_advanced, sorting by blend mode may still give us
            // better branching on the GPU.
            int64_t blendMode = pls::ConvertBlendModeToPLSBlendMode(draw->blendMode());
            assert(blendMode <= kBlendModeMask >> kBlendModeShift);
            key |= blendMode << kBlendModeShift;

            // depthStencil mode draws strokes, fills, and even/odd with different stencil settings.
            int64_t drawContents = static_cast<int64_t>(draw->drawContents());
            assert(drawContents <= kDrawContentsMask >> kDrawContentsShift);
            key |= drawContents << kDrawContentsShift;

            // Draw index goes at the bottom of the key so we know which PLSDraw it corresponds to.
            assert(i <= kDrawIndexMask);
            key |= i;

            // Verify that every field can be read back out of the packed key unchanged.
            assert((key & kDrawGroupMask) >> kDrawGroupShift == drawGroupIdx);
            assert((key & kDrawTypeMask) >> kDrawTypeShift == drawType);
            assert((key & kTextureHashMask) >> kTextureHashShift == textureHash);
            assert((key & kBlendModeMask) >> kBlendModeShift == blendMode);
            assert((key & kDrawContentsMask) >> kDrawContentsShift == drawContents);
            assert((key & kDrawIndexMask) == i);

            indirectDrawList[i] = key;
        }

        // Re-order the draws!!
        // Keys sort ascending as signed 64-bit integers, so keys with a negated draw group index
        // come before all keys with a positive one.
        std::sort(indirectDrawList.begin(), indirectDrawList.end());

        // Atomic mode sometimes needs to initialize PLS with a draw when the backend can't do it
        // with typical clear/load APIs.
        if (m_ctx->frameInterlockMode() == pls::InterlockMode::atomics &&
            platformFeatures.atomicPLSMustBeInitializedAsDraw)
        {
            m_drawList.emplace_back(m_ctx->perFrameAllocator(),
                                    DrawType::plsAtomicInitialize,
                                    nullptr,
                                    1,
                                    0);
            pushBarrier();
        }

        // Draws with the same drawGroupIdx don't overlap, but once we cross into a new draw group,
        // we need to insert a barrier between the overlaps.
        int64_t needsBarrierMask = kDrawGroupMask;
        if (m_flushDesc.interlockMode == pls::InterlockMode::depthStencil)
        {
            // depthStencil mode also draws clips, strokes, fills, and even/odd with different
            // stencil settings, so these also need a barrier.
            needsBarrierMask |= kDrawContentsMask;
            if (platformFeatures.supportsKHRBlendEquations)
            {
                // If using KHR_blend_equation_advanced, we also need a barrier between blend modes
                // in order to change the blend equation.
                needsBarrierMask |= kBlendModeMask;
            }
        }

        // Write out the draw data from the sorted draw list, and build up a condensed/batched list
        // of low-level draws.
        // Seeding priorKey with the first key means no barrier is pushed ahead of the first draw.
        int64_t priorKey = !indirectDrawList.empty() ? indirectDrawList[0] : 0;
        for (int64_t key : indirectDrawList)
        {
            if ((priorKey & needsBarrierMask) != (key & needsBarrierMask))
            {
                pushBarrier();
            }
            // We negate drawGroupIdx on opaque paths in order to draw them first and in reverse
            // order, but their z index should still remain positive.
            m_currentZIndex = abs(key >> kDrawGroupShift);
            m_plsDraws[key & kDrawIndexMask]->pushToRenderContext(this);
            priorKey = key;
        }

        // Atomic mode needs one more draw to resolve all the pixels.
        if (m_ctx->frameInterlockMode() == pls::InterlockMode::atomics)
        {
            pushBarrier();
            m_drawList.emplace_back(m_ctx->perFrameAllocator(),
                                    DrawType::plsAtomicResolve,
                                    nullptr,
                                    1,
                                    0);
            m_drawList.tail().shaderFeatures = m_combinedShaderFeatures;
        }
    }

    // Pad our storage buffers to 256-byte alignment.
    // (Passing nullptr presumably appends the elements without copying any source data -- confirm.)
    m_ctx->m_pathData.push_back_n(nullptr, m_pathPaddingCount);
    m_ctx->m_paintData.push_back_n(nullptr, m_paintPaddingCount);
    m_ctx->m_paintAuxData.push_back_n(nullptr, m_paintAuxPaddingCount);
    m_ctx->m_contourData.push_back_n(nullptr, m_contourPaddingCount);

    // Each buffer must now hold exactly what layout predicted for this flush: its start offset, plus
    // its element count, plus its padding.
    assert(m_ctx->m_pathData.elementsWritten() ==
           m_flushDesc.firstPath + m_resourceCounts.pathCount + m_pathPaddingCount);
    assert(m_ctx->m_paintData.elementsWritten() ==
           m_flushDesc.firstPaint + m_resourceCounts.pathCount + m_paintPaddingCount);
    assert(m_ctx->m_paintAuxData.elementsWritten() ==
           m_flushDesc.firstPaintAux + m_resourceCounts.pathCount + m_paintAuxPaddingCount);
    assert(m_ctx->m_contourData.elementsWritten() ==
           m_flushDesc.firstContour + m_resourceCounts.contourCount + m_contourPaddingCount);

    // Likewise, every tessellation cursor must have landed on its expected end location.
    assert(m_pathTessLocation == m_expectedPathTessLocationAtEndOfPath);
    assert(m_pathMirroredTessLocation == m_expectedPathMirroredTessLocationAtEndOfPath);
    assert(m_midpointFanTessVertexIdx == m_midpointFanTessEndLocation);
    assert(m_outerCubicTessVertexIdx == m_outerCubicTessEndLocation);

    // Update the flush descriptor's data counts that aren't known until it's written out.
    m_flushDesc.tessVertexSpanCount =
        m_ctx->m_tessSpanData.elementsWritten() - m_flushDesc.firstTessVertexSpan;
    m_flushDesc.hasTriangleVertices =
        m_ctx->m_triangleVertexData.bytesWritten() != initialTriangleVertexDataSize;

    m_flushDesc.drawList = &m_drawList;
    m_flushDesc.combinedShaderFeatures = m_combinedShaderFeatures;
}

// Brings the GPU resource allocations in line with 'allocs'.
//
// A resource is resized through m_impl when its requested size differs from the size recorded in
// m_currentResourceAllocations, or unconditionally when 'forceRealloc' is true. Buffer sizes are
// handed to the impl in bytes (element count * element size). The gradient and tessellation
// texture heights are clamped to kMaxTextureHeight first. On return, m_currentResourceAllocations
// holds the (clamped) values from 'allocs'.
void PLSRenderContext::setResourceSizes(ResourceAllocationCounts allocs, bool forceRealloc)
{
#if 0
    class Logger
    {
    public:
        void logSize(const char* name, size_t oldSize, size_t newSize, size_t newSizeInBytes)
        {
            m_totalSizeInBytes += newSizeInBytes;
            if (oldSize == newSize)
            {
                return;
            }
            if (!m_hasChanged)
            {
                printf("PLSRenderContext::setResourceSizes():\n");
                m_hasChanged = true;
            }
            printf("  resize %s: %zu -> %zu (%zu KiB)\n",
                   name,
                   oldSize,
                   newSize,
                   newSizeInBytes >> 10);
        }

        ~Logger()
        {
            if (!m_hasChanged)
            {
                return;
            }
            printf("  TOTAL GPU resource usage: %zu KiB\n", m_totalSizeInBytes >> 10);
        }

    private:
        size_t m_totalSizeInBytes = 0;
        bool m_hasChanged = false;
    } logger;
#define LOG_BUFFER_RING_SIZE(NAME, ITEM_SIZE_IN_BYTES)                                             \
    logger.logSize(#NAME,                                                                          \
                   m_currentResourceAllocations.NAME,                                              \
                   allocs.NAME,                                                                    \
                   allocs.NAME* ITEM_SIZE_IN_BYTES* pls::kBufferRingSize)
#define LOG_TEXTURE_HEIGHT(NAME, BYTES_PER_ROW)                                                    \
    logger.logSize(#NAME,                                                                          \
                   m_currentResourceAllocations.NAME,                                              \
                   allocs.NAME,                                                                    \
                   allocs.NAME* BYTES_PER_ROW)
#else
#define LOG_BUFFER_RING_SIZE(NAME, ITEM_SIZE_IN_BYTES)
#define LOG_TEXTURE_HEIGHT(NAME, BYTES_PER_ROW)
#endif

    const auto& current = m_currentResourceAllocations;

    // A resource gets reallocated when the caller forces it or when its requested size changed.
    const auto mustResize = [forceRealloc](auto requested, auto existing) {
        return forceRealloc || requested != existing;
    };

    // ---- Uniform buffers ----
    LOG_BUFFER_RING_SIZE(flushUniformBufferCount, sizeof(pls::FlushUniforms));
    if (mustResize(allocs.flushUniformBufferCount, current.flushUniformBufferCount))
    {
        const size_t sizeInBytes = allocs.flushUniformBufferCount * sizeof(pls::FlushUniforms);
        m_impl->resizeFlushUniformBuffer(sizeInBytes);
    }

    LOG_BUFFER_RING_SIZE(imageDrawUniformBufferCount, sizeof(pls::ImageDrawUniforms));
    if (mustResize(allocs.imageDrawUniformBufferCount, current.imageDrawUniformBufferCount))
    {
        const size_t sizeInBytes =
            allocs.imageDrawUniformBufferCount * sizeof(pls::ImageDrawUniforms);
        m_impl->resizeImageDrawUniformBuffer(sizeInBytes);
    }

    // ---- Buffers that are resized along with a kBufferStructure ----
    LOG_BUFFER_RING_SIZE(pathBufferCount, sizeof(pls::PathData));
    if (mustResize(allocs.pathBufferCount, current.pathBufferCount))
    {
        const size_t sizeInBytes = allocs.pathBufferCount * sizeof(pls::PathData);
        m_impl->resizePathBuffer(sizeInBytes, pls::PathData::kBufferStructure);
    }

    LOG_BUFFER_RING_SIZE(paintBufferCount, sizeof(pls::PaintData));
    if (mustResize(allocs.paintBufferCount, current.paintBufferCount))
    {
        const size_t sizeInBytes = allocs.paintBufferCount * sizeof(pls::PaintData);
        m_impl->resizePaintBuffer(sizeInBytes, pls::PaintData::kBufferStructure);
    }

    LOG_BUFFER_RING_SIZE(paintAuxBufferCount, sizeof(pls::PaintAuxData));
    if (mustResize(allocs.paintAuxBufferCount, current.paintAuxBufferCount))
    {
        const size_t sizeInBytes = allocs.paintAuxBufferCount * sizeof(pls::PaintAuxData);
        m_impl->resizePaintAuxBuffer(sizeInBytes, pls::PaintAuxData::kBufferStructure);
    }

    LOG_BUFFER_RING_SIZE(contourBufferCount, sizeof(pls::ContourData));
    if (mustResize(allocs.contourBufferCount, current.contourBufferCount))
    {
        const size_t sizeInBytes = allocs.contourBufferCount * sizeof(pls::ContourData);
        m_impl->resizeContourBuffer(sizeInBytes, pls::ContourData::kBufferStructure);
    }

    // ---- Gradient, tessellation span and triangle vertex buffers ----
    LOG_BUFFER_RING_SIZE(simpleGradientBufferCount, sizeof(pls::TwoTexelRamp));
    if (mustResize(allocs.simpleGradientBufferCount, current.simpleGradientBufferCount))
    {
        const size_t sizeInBytes = allocs.simpleGradientBufferCount * sizeof(pls::TwoTexelRamp);
        m_impl->resizeSimpleColorRampsBuffer(sizeInBytes);
    }

    LOG_BUFFER_RING_SIZE(complexGradSpanBufferCount, sizeof(pls::GradientSpan));
    if (mustResize(allocs.complexGradSpanBufferCount, current.complexGradSpanBufferCount))
    {
        const size_t sizeInBytes = allocs.complexGradSpanBufferCount * sizeof(pls::GradientSpan);
        m_impl->resizeGradSpanBuffer(sizeInBytes);
    }

    LOG_BUFFER_RING_SIZE(tessSpanBufferCount, sizeof(pls::TessVertexSpan));
    if (mustResize(allocs.tessSpanBufferCount, current.tessSpanBufferCount))
    {
        const size_t sizeInBytes = allocs.tessSpanBufferCount * sizeof(pls::TessVertexSpan);
        m_impl->resizeTessVertexSpanBuffer(sizeInBytes);
    }

    LOG_BUFFER_RING_SIZE(triangleVertexBufferCount, sizeof(pls::TriangleVertex));
    if (mustResize(allocs.triangleVertexBufferCount, current.triangleVertexBufferCount))
    {
        const size_t sizeInBytes = allocs.triangleVertexBufferCount * sizeof(pls::TriangleVertex);
        m_impl->resizeTriangleVertexBuffer(sizeInBytes);
    }

    // ---- Textures: clamp the requested height before comparing against the current one ----
    allocs.gradTextureHeight = std::min(allocs.gradTextureHeight, kMaxTextureHeight);
    LOG_TEXTURE_HEIGHT(gradTextureHeight, pls::kGradTextureWidth * 4);
    if (mustResize(allocs.gradTextureHeight, current.gradTextureHeight))
    {
        m_impl->resizeGradientTexture(pls::kGradTextureWidth, allocs.gradTextureHeight);
    }

    allocs.tessTextureHeight = std::min(allocs.tessTextureHeight, kMaxTextureHeight);
    LOG_TEXTURE_HEIGHT(tessTextureHeight, pls::kTessTextureWidth * 4 * 4);
    if (mustResize(allocs.tessTextureHeight, current.tessTextureHeight))
    {
        m_impl->resizeTessellationTexture(pls::kTessTextureWidth, allocs.tessTextureHeight);
    }

    // Remember what is now allocated (including the clamped texture heights).
    m_currentResourceAllocations = allocs;
}

// Maps each resource buffer that has a nonzero element count in 'mapCounts', passing that count to
// the matching write-data member's mapElements(). Buffers whose count is zero are not mapped.
//
// m_impl->prepareToMapBuffers() is called once, before any buffer is mapped. After each buffer's
// (possible) mapping, we assert that its write-data member has room for the requested number of
// elements.
void PLSRenderContext::mapResourceBuffers(const ResourceAllocationCounts& mapCounts)
{
    m_impl->prepareToMapBuffers();

    if (mapCounts.flushUniformBufferCount > 0)
    {
        m_flushUniformData.mapElements(m_impl.get(),
                                       &PLSRenderContextImpl::mapFlushUniformBuffer,
                                       mapCounts.flushUniformBufferCount);
    }
    assert(m_flushUniformData.hasRoomFor(mapCounts.flushUniformBufferCount));

    if (mapCounts.imageDrawUniformBufferCount > 0)
    {
        m_imageDrawUniformData.mapElements(m_impl.get(),
                                           &PLSRenderContextImpl::mapImageDrawUniformBuffer,
                                           mapCounts.imageDrawUniformBufferCount);
    }
    assert(m_imageDrawUniformData.hasRoomFor(mapCounts.imageDrawUniformBufferCount));

    if (mapCounts.pathBufferCount > 0)
    {
        m_pathData.mapElements(m_impl.get(),
                               &PLSRenderContextImpl::mapPathBuffer,
                               mapCounts.pathBufferCount);
    }
    assert(m_pathData.hasRoomFor(mapCounts.pathBufferCount));

    if (mapCounts.paintBufferCount > 0)
    {
        m_paintData.mapElements(m_impl.get(),
                                &PLSRenderContextImpl::mapPaintBuffer,
                                mapCounts.paintBufferCount);
    }
    assert(m_paintData.hasRoomFor(mapCounts.paintBufferCount));

    if (mapCounts.paintAuxBufferCount > 0)
    {
        m_paintAuxData.mapElements(m_impl.get(),
                                   &PLSRenderContextImpl::mapPaintAuxBuffer,
                                   mapCounts.paintAuxBufferCount);
    }
    assert(m_paintAuxData.hasRoomFor(mapCounts.paintAuxBufferCount));

    if (mapCounts.contourBufferCount > 0)
    {
        m_contourData.mapElements(m_impl.get(),
                                  &PLSRenderContextImpl::mapContourBuffer,
                                  mapCounts.contourBufferCount);
    }
    assert(m_contourData.hasRoomFor(mapCounts.contourBufferCount));

    if (mapCounts.simpleGradientBufferCount > 0)
    {
        m_simpleColorRampsData.mapElements(m_impl.get(),
                                           &PLSRenderContextImpl::mapSimpleColorRampsBuffer,
                                           mapCounts.simpleGradientBufferCount);
    }
    assert(m_simpleColorRampsData.hasRoomFor(mapCounts.simpleGradientBufferCount));

    if (mapCounts.complexGradSpanBufferCount > 0)
    {
        m_gradSpanData.mapElements(m_impl.get(),
                                   &PLSRenderContextImpl::mapGradSpanBuffer,
                                   mapCounts.complexGradSpanBufferCount);
    }
    assert(m_gradSpanData.hasRoomFor(mapCounts.complexGradSpanBufferCount));

    if (mapCounts.tessSpanBufferCount > 0)
    {
        m_tessSpanData.mapElements(m_impl.get(),
                                   &PLSRenderContextImpl::mapTessVertexSpanBuffer,
                                   mapCounts.tessSpanBufferCount);
    }
    assert(m_tessSpanData.hasRoomFor(mapCounts.tessSpanBufferCount));

    if (mapCounts.triangleVertexBufferCount > 0)
    {
        m_triangleVertexData.mapElements(m_impl.get(),
                                         &PLSRenderContextImpl::mapTriangleVertexBuffer,
                                         mapCounts.triangleVertexBufferCount);
    }
    assert(m_triangleVertexData.hasRoomFor(mapCounts.triangleVertexBufferCount));
}

void PLSRenderContext::unmapResourceBuffers()
{
    if (m_flushUniformData)
    {
        m_impl->unmapFlushUniformBuffer();
        m_flushUniformData.reset();
    }
    if (m_imageDrawUniformData)
    {
        m_impl->unmapImageDrawUniformBuffer();
        m_imageDrawUniformData.reset();
    }
    if (m_pathData)
    {
        m_impl->unmapPathBuffer();
        m_pathData.reset();
    }
    if (m_paintData)
    {
        m_impl->unmapPaintBuffer();
        m_paintData.reset();
    }
    if (m_paintAuxData)
    {
        m_impl->unmapPaintAuxBuffer();
        m_paintAuxData.reset();
    }
    if (m_contourData)
    {
        m_impl->unmapContourBuffer();
        m_contourData.reset();
    }
    if (m_simpleColorRampsData)
    {
        m_impl->unmapSimpleColorRampsBuffer();
        m_simpleColorRampsData.reset();
    }
    if (m_gradSpanData)
    {
        m_impl->unmapGradSpanBuffer();
        m_gradSpanData.reset();
    }
    if (m_tessSpanData)
    {
        m_impl->unmapTessVertexSpanBuffer();
        m_tessSpanData.reset();
    }
    if (m_triangleVertexData)
    {
        m_impl->unmapTriangleVertexBuffer();
        m_triangleVertexData.reset();
    }
}

void PLSRenderContext::LogicalFlush::pushPaddingVertices(uint32_t tessLocation, uint32_t count)
{
    assert(m_hasDoneLayout);
    assert(count > 0);

    constexpr static Vec2D kEmptyCubic[4]{};
    // This is guaranteed to not collide with a neighboring contour ID.
    constexpr static uint32_t kInvalidContourID = 0;
    assert(m_pathTessLocation == m_expectedPathTessLocationAtEndOfPath);
    assert(m_pathMirroredTessLocation == m_expectedPathMirroredTessLocationAtEndOfPath);
    m_pathTessLocation = tessLocation;
    RIVE_DEBUG_CODE(m_expectedPathTessLocationAtEndOfPath = m_pathTessLocation + count;)
    assert(m_expectedPathTessLocationAtEndOfPath <= kMaxTessellationVertexCount);
    pushTessellationSpans(kEmptyCubic, {0, 0}, count, 0, 0, 1, kInvalidContourID);
    assert(m_pathTessLocation == m_expectedPathTessLocationAtEndOfPath);
}

// Begins a new path for 'draw'.
//
// Writes the path, paint and paintAux entries for the path, claims 'tessVertexCount' tessellation
// vertices from the region that matches 'patchType', positions the forward/mirrored tessellation
// cursors according to the path's contour directions, and finally pushes the instanced draw.
//
// 'tessVertexCount' and the claimed location must both be multiples of the patch size for the
// chosen draw type (asserted).
void PLSRenderContext::LogicalFlush::pushPath(PLSPathDraw* draw,
                                              pls::PatchType patchType,
                                              uint32_t tessVertexCount)
{
    assert(m_hasDoneLayout);
    // The previous path must have been written in full before a new one begins.
    assert(m_pathTessLocation == m_expectedPathTessLocationAtEndOfPath);
    assert(m_pathMirroredTessLocation == m_expectedPathMirroredTessLocationAtEndOfPath);

    // Per-path state that later contour/cubic pushes read.
    m_currentPathIsStroked = draw->strokeRadius() != 0;
    m_currentPathContourDirections = draw->contourDirections();
    ++m_currentPathID;
    assert(0 < m_currentPathID && m_currentPathID <= m_ctx->m_maxPathID);

    // One entry per path goes into each of the path, paint and paintAux buffers.
    m_ctx->m_pathData.set_back(draw->matrix(), draw->strokeRadius(), m_currentZIndex);
    m_ctx->m_paintData.set_back(draw->fillRule(),
                                draw->paintType(),
                                draw->simplePaintValue(),
                                m_gradTextureLayout,
                                draw->clipID(),
                                draw->hasClipRect(),
                                draw->blendMode());
    m_ctx->m_paintAuxData.set_back(draw->matrix(),
                                   draw->paintType(),
                                   draw->simplePaintValue(),
                                   draw->gradient(),
                                   draw->imageTexture(),
                                   draw->clipRectInverseMatrix(),
                                   m_flushDesc.renderTarget,
                                   m_ctx->platformFeatures());

    // The "+ 1" accounts for the entry written ahead of the first path in each buffer.
    assert(m_flushDesc.firstPath + m_currentPathID + 1 == m_ctx->m_pathData.elementsWritten());
    assert(m_flushDesc.firstPaint + m_currentPathID + 1 == m_ctx->m_paintData.elementsWritten());
    assert(m_flushDesc.firstPaintAux + m_currentPathID + 1 ==
           m_ctx->m_paintAuxData.elementsWritten());

    // Claim this path's vertices from the running index of the matching patch type.
    const bool isMidpointFan = patchType == PatchType::midpointFan;
    const pls::DrawType drawType =
        isMidpointFan ? DrawType::midpointFanPatches : DrawType::outerCurvePatches;
    size_t tessLocation;
    if (isMidpointFan)
    {
        tessLocation = m_midpointFanTessVertexIdx;
        m_midpointFanTessVertexIdx += tessVertexCount;
    }
    else
    {
        tessLocation = m_outerCubicTessVertexIdx;
        m_outerCubicTessVertexIdx += tessVertexCount;
    }

    RIVE_DEBUG_CODE(m_expectedPathTessLocationAtEndOfPath = tessLocation + tessVertexCount);
    RIVE_DEBUG_CODE(m_expectedPathMirroredTessLocationAtEndOfPath = tessLocation);
    assert(m_expectedPathTessLocationAtEndOfPath <= kMaxTessellationVertexCount);

    // Convert the vertex range into an instance range; each instance covers one patch.
    const uint32_t verticesPerPatch = PatchSegmentSpan(drawType);
    const uint32_t firstInstance = tessLocation / verticesPerPatch;
    assert(firstInstance * verticesPerPatch == tessLocation); // flush() is responsible for alignment.
    const uint32_t patchCount = tessVertexCount / verticesPerPatch;
    assert(patchCount * verticesPerPatch == tessVertexCount); // flush() is responsible for alignment.

    // Both cursors start at the same vertex. How far into the claimed range that vertex lies
    // depends on the contour directions: the middle for reverseAndForward, the beginning for
    // forward, and the end for reverse.
    uint32_t cursorOffset;
    if (m_currentPathContourDirections == pls::ContourDirections::reverseAndForward)
    {
        assert(tessVertexCount % 2 == 0);
        cursorOffset = tessVertexCount / 2;
    }
    else if (m_currentPathContourDirections == pls::ContourDirections::forward)
    {
        cursorOffset = 0;
    }
    else
    {
        assert(m_currentPathContourDirections == pls::ContourDirections::reverse);
        cursorOffset = tessVertexCount;
    }
    m_pathTessLocation = m_pathMirroredTessLocation = tessLocation + cursorOffset;

    pushPathDraw(draw, drawType, patchCount, firstInstance);
}

// Begins a new contour on the current path and appends its record to the contour buffer.
//
// 'midpoint' is stored with the contour. For stroked paths its x component is replaced by 1 when
// the contour is closed and 0 when it is open. Contours of non-stroked paths must be closed
// (asserted). 'paddingVertexCount' is saved so that the first curve of the contour gets pre-padded.
void PLSRenderContext::LogicalFlush::pushContour(Vec2D midpoint,
                                                 bool closed,
                                                 uint32_t paddingVertexCount)
{
    assert(m_hasDoneLayout);
    assert(m_ctx->m_pathData.bytesWritten() > 0);
    assert(m_currentPathIsStroked || closed);
    assert(m_currentPathID != 0); // pathID can't be zero.

    // The first curve of the contour will be pre-padded with 'paddingVertexCount' tessellation
    // vertices, colocated at T=0. The caller must use this argument to align the end of the contour
    // on a boundary of the patch size. (See pls::PaddingToAlignUp().)
    m_currentContourPaddingVertexCount = paddingVertexCount;

    if (m_currentPathIsStroked)
    {
        if (closed)
        {
            midpoint.x = 1;
        }
        else
        {
            midpoint.x = 0;
        }
    }

    // If the contour is closed, the shader needs a vertex to wrap back around to at the end of it.
    // Paths that include the forward direction use the forward cursor. Otherwise the vertex just
    // before the mirrored cursor is used.
    uint32_t wrapVertexIndex;
    if (m_currentPathContourDirections & pls::ContourDirections::forward)
    {
        wrapVertexIndex = m_pathTessLocation;
    }
    else
    {
        wrapVertexIndex = m_pathMirroredTessLocation - 1;
    }
    m_ctx->m_contourData.emplace_back(midpoint, m_currentPathID, wrapVertexIndex);

    // Contour IDs count up from 1 within the flush and track the number of contour records written.
    ++m_currentContourID;
    assert(0 < m_currentContourID && m_currentContourID <= pls::kMaxContourID);
    assert(m_flushDesc.firstContour + m_currentContourID == m_ctx->m_contourData.elementsWritten());
}

// Pushes the tessellation span(s) for a single cubic (and its trailing join) of the current
// contour, dispatching on the direction(s) in which the current path is being tessellated.
//
//   pts                     The cubic's four control points.
//   joinTangent             Forwarded unchanged to the tessellation span data.
//   additionalContourFlags  OR-ed into the current contour ID before it is written out.
//   parametricSegmentCount  At most kMaxParametricSegments.
//   polarSegmentCount       At most kMaxPolarSegments.
//   joinSegmentCount        Must be nonzero.
void PLSRenderContext::LogicalFlush::pushCubic(const Vec2D pts[4],
                                               Vec2D joinTangent,
                                               uint32_t additionalContourFlags,
                                               uint32_t parametricSegmentCount,
                                               uint32_t polarSegmentCount,
                                               uint32_t joinSegmentCount)
{
    assert(m_hasDoneLayout);
    // The counts are unsigned, so only their upper bounds need checking.
    assert(parametricSegmentCount <= kMaxParametricSegments);
    assert(polarSegmentCount <= kMaxPolarSegments);
    assert(joinSegmentCount > 0);
    assert(m_currentContourID != 0); // contourID can't be zero.

    // The polar and parametric segments begin and end on the same vertices, which makes the
    // merged *vertex* count equal to the sum of the two *segment* counts.
    uint32_t curveMergedVertexCount = parametricSegmentCount + polarSegmentCount;

    // Any pending contour padding goes in front of this curve. The -1 accounts for the vertex
    // shared between the end of the curve and the beginning of the join.
    uint32_t totalVertexCount =
        m_currentContourPaddingVertexCount + curveMergedVertexCount + joinSegmentCount - 1;

    // Padding is consumed by the first curve of the contour only.
    m_currentContourPaddingVertexCount = 0;

    const uint32_t contourIDWithFlags = m_currentContourID | additionalContourFlags;

    if (m_currentPathContourDirections == pls::ContourDirections::forward)
    {
        pushTessellationSpans(pts,
                              joinTangent,
                              totalVertexCount,
                              parametricSegmentCount,
                              polarSegmentCount,
                              joinSegmentCount,
                              contourIDWithFlags);
    }
    else if (m_currentPathContourDirections == pls::ContourDirections::reverseAndForward)
    {
        pushMirroredAndForwardTessellationSpans(pts,
                                                joinTangent,
                                                totalVertexCount,
                                                parametricSegmentCount,
                                                polarSegmentCount,
                                                joinSegmentCount,
                                                contourIDWithFlags);
    }
    else
    {
        // The only remaining possibility is a reverse-only tessellation.
        assert(m_currentPathContourDirections == pls::ContourDirections::reverse);
        pushMirroredTessellationSpans(pts,
                                      joinTangent,
                                      totalVertexCount,
                                      parametricSegmentCount,
                                      polarSegmentCount,
                                      joinSegmentCount,
                                      contourIDWithFlags);
    }

    RIVE_DEBUG_CODE(++m_pathCurveCount;)
}

// Writes the tessellation span(s) for one forward-direction curve, beginning at
// m_pathTessLocation, and then advances m_pathTessLocation by 'totalVertexCount'.
//
// A row of the tessellation texture holds kTessTextureWidth vertices. A span that runs past the
// right edge of its row is written again on the next row, shifted left by one texture width, so
// the part that got clipped off is captured there. This repeats until the span's end fits.
RIVE_ALWAYS_INLINE void PLSRenderContext::LogicalFlush::pushTessellationSpans(
    const Vec2D pts[4],
    Vec2D joinTangent,
    uint32_t totalVertexCount,
    uint32_t parametricSegmentCount,
    uint32_t polarSegmentCount,
    uint32_t joinSegmentCount,
    uint32_t contourIDWithFlags)
{
    assert(m_hasDoneLayout);
    assert(totalVertexCount > 0);

    // Break the linear tessellation location down into a texture row and a column range.
    uint32_t row = m_pathTessLocation / kTessTextureWidth;
    int32_t spanBegin = m_pathTessLocation % kTessTextureWidth;
    int32_t spanEnd = spanBegin + totalVertexCount;
    for (;;)
    {
        m_ctx->m_tessSpanData.set_back(pts,
                                       joinTangent,
                                       static_cast<float>(row),
                                       spanBegin,
                                       spanEnd,
                                       parametricSegmentCount,
                                       polarSegmentCount,
                                       joinSegmentCount,
                                       contourIDWithFlags);
        if (spanEnd <= static_cast<int32_t>(kTessTextureWidth))
        {
            // The remainder of the span fit on this row; nothing left to wrap.
            break;
        }
        // The span was too long for this row. Move down a row and emit it again, this time
        // starting behind the left edge of the texture.
        ++row;
        spanBegin -= kTessTextureWidth;
        spanEnd -= kTessTextureWidth;
    }
    // We must have finished on the row that holds the span's final vertex.
    assert(row == (m_pathTessLocation + totalVertexCount - 1) / kTessTextureWidth);

    m_pathTessLocation += totalVertexCount;
    assert(m_pathTessLocation <= m_expectedPathTessLocationAtEndOfPath);
}

// Writes the tessellation span(s) for one reverse-direction curve. The span ends at
// m_pathMirroredTessLocation and is written with its first x coordinate greater than its second;
// afterwards m_pathMirroredTessLocation is moved back by 'totalVertexCount'.
//
// A span that runs past the left edge of its row (x < 0) is written again one row up, shifted
// right by one texture width (kTessTextureWidth), until its far end is no longer negative.
RIVE_ALWAYS_INLINE void PLSRenderContext::LogicalFlush::pushMirroredTessellationSpans(
    const Vec2D pts[4],
    Vec2D joinTangent,
    uint32_t totalVertexCount,
    uint32_t parametricSegmentCount,
    uint32_t polarSegmentCount,
    uint32_t joinSegmentCount,
    uint32_t contourIDWithFlags)
{
    assert(m_hasDoneLayout);
    assert(totalVertexCount > 0);

    // Locate the last vertex the mirrored span occupies (mirrored location - 1), then use the
    // column just past it as the span's starting x.
    uint32_t mirrorRow = (m_pathMirroredTessLocation - 1) / kTessTextureWidth;
    int32_t mirrorBegin = (m_pathMirroredTessLocation - 1) % kTessTextureWidth + 1;
    int32_t mirrorEnd = mirrorBegin - totalVertexCount;

    for (;;)
    {
        m_ctx->m_tessSpanData.set_back(pts,
                                       joinTangent,
                                       static_cast<float>(mirrorRow),
                                       mirrorBegin,
                                       mirrorEnd,
                                       parametricSegmentCount,
                                       polarSegmentCount,
                                       joinSegmentCount,
                                       contourIDWithFlags);
        if (mirrorEnd >= 0)
        {
            // The remainder of the span fit on this row.
            break;
        }
        // The span spilled off the left edge. Move up a row and emit it again, shifted right by
        // one texture width.
        --mirrorRow;
        mirrorBegin += kTessTextureWidth;
        mirrorEnd += kTessTextureWidth;
    }

    m_pathMirroredTessLocation -= totalVertexCount;
    assert(m_pathMirroredTessLocation >= m_expectedPathMirroredTessLocationAtEndOfPath);
}

// Writes, as a single span record, both the forward span (starting at m_pathTessLocation) and its
// reflection (ending at m_pathMirroredTessLocation) for one curve. The forward cursor is then
// advanced and the mirrored cursor moved back, each by 'totalVertexCount'.
//
// If either half does not fit on its row of the kTessTextureWidth-wide tessellation texture, both
// halves are written again: the forward half one row down and shifted left, the reflection one
// row up and shifted right.
RIVE_ALWAYS_INLINE void PLSRenderContext::LogicalFlush::pushMirroredAndForwardTessellationSpans(
    const Vec2D pts[4],
    Vec2D joinTangent,
    uint32_t totalVertexCount,
    uint32_t parametricSegmentCount,
    uint32_t polarSegmentCount,
    uint32_t joinSegmentCount,
    uint32_t contourIDWithFlags)
{
    assert(m_hasDoneLayout);
    assert(totalVertexCount > 0);

    // Forward half: row and column range of the span in the tessellation texture.
    int32_t forwardRow = m_pathTessLocation / kTessTextureWidth;
    int32_t forwardBegin = m_pathTessLocation % kTessTextureWidth;
    int32_t forwardEnd = forwardBegin + totalVertexCount;

    // Mirrored half: begins just past the last vertex it occupies and extends leftward.
    uint32_t mirrorRow = (m_pathMirroredTessLocation - 1) / kTessTextureWidth;
    int32_t mirrorBegin = (m_pathMirroredTessLocation - 1) % kTessTextureWidth + 1;
    int32_t mirrorEnd = mirrorBegin - totalVertexCount;

    for (;;)
    {
        m_ctx->m_tessSpanData.set_back(pts,
                                       joinTangent,
                                       static_cast<float>(forwardRow),
                                       forwardBegin,
                                       forwardEnd,
                                       static_cast<float>(mirrorRow),
                                       mirrorBegin,
                                       mirrorEnd,
                                       parametricSegmentCount,
                                       polarSegmentCount,
                                       joinSegmentCount,
                                       contourIDWithFlags);
        if (forwardEnd <= static_cast<int32_t>(kTessTextureWidth) && mirrorEnd >= 0)
        {
            // Both the span and its reflection fit on their current rows.
            break;
        }
        // Either the span or its reflection was too long for its row. Wrap and emit both of them
        // again, this time beyond the opposite edge of the texture, to pick up what was clipped.
        ++forwardRow;
        forwardBegin -= kTessTextureWidth;
        forwardEnd -= kTessTextureWidth;

        --mirrorRow;
        mirrorBegin += kTessTextureWidth;
        mirrorEnd += kTessTextureWidth;
    }

    m_pathTessLocation += totalVertexCount;
    assert(m_pathTessLocation <= m_expectedPathTessLocationAtEndOfPath);

    m_pathMirroredTessLocation -= totalVertexCount;
    assert(m_pathMirroredTessLocation >= m_expectedPathMirroredTessLocationAtEndOfPath);
}

// Emits the triangles of the draw's interior triangulation into the triangle vertex buffer, tagged
// with the current path ID, and pushes an interiorTriangulation draw covering exactly the vertices
// that were written.
void PLSRenderContext::LogicalFlush::pushInteriorTriangulation(InteriorTriangulationDraw* draw)
{
    assert(m_hasDoneLayout);

    // Space for the worst case must already be available in the buffer.
    assert(m_ctx->m_triangleVertexData.hasRoomFor(draw->triangulator()->maxVertexCount()));

    // Record where our vertices begin *before* the triangulator writes any of them.
    uint32_t firstVertex = m_ctx->m_triangleVertexData.elementsWritten();
    size_t writtenVertexCount =
        draw->triangulator()->polysToTriangles(&m_ctx->m_triangleVertexData, m_currentPathID);
    assert(writtenVertexCount <= draw->triangulator()->maxVertexCount());

    DrawBatch& triangulationBatch =
        pushPathDraw(draw, DrawType::interiorTriangulation, writtenVertexCount, firstVertex);

    // Interior triangulations are allowed to disable raster ordering since they are guaranteed to
    // not overlap, so flag the batch as needing a barrier.
    triangulationBatch.needsBarrier = true;
}

// Writes the per-draw uniforms for an image rect and pushes a single-element imageRect draw that
// points back at them.
void PLSRenderContext::LogicalFlush::pushImageRect(ImageRectDraw* draw)
{
    assert(m_hasDoneLayout);

    // When image paints are supported for paths, the client is expected to go through pushPath()
    // with an image paint rather than calling this method.
    assert(!m_ctx->frameSupportsImagePaintForPaths());

    // Capture the byte offset at which this draw's uniforms will land before appending them.
    size_t uniformsOffset = m_ctx->m_imageDrawUniformData.bytesWritten();
    m_ctx->m_imageDrawUniformData.emplace_back(draw->matrix(),
                                               draw->opacity(),
                                               draw->clipRectInverseMatrix(),
                                               draw->clipID(),
                                               draw->blendMode(),
                                               m_currentZIndex);

    DrawBatch& imageBatch = pushDraw(draw, DrawType::imageRect, PaintType::image, 1, 0);
    imageBatch.imageDrawDataOffset = uniformsOffset;
}

// Writes the per-draw uniforms for an image mesh and pushes an imageMesh draw of
// draw->indexCount() elements that references the mesh's vertex, uv, and index buffers.
void PLSRenderContext::LogicalFlush::pushImageMesh(ImageMeshDraw* draw)
{
    assert(m_hasDoneLayout);

    // Capture the byte offset at which this draw's uniforms will land before appending them.
    size_t uniformsOffset = m_ctx->m_imageDrawUniformData.bytesWritten();
    m_ctx->m_imageDrawUniformData.emplace_back(draw->matrix(),
                                               draw->opacity(),
                                               draw->clipRectInverseMatrix(),
                                               draw->clipID(),
                                               draw->blendMode(),
                                               m_currentZIndex);

    DrawBatch& meshBatch =
        pushDraw(draw, DrawType::imageMesh, PaintType::image, draw->indexCount(), 0);
    meshBatch.imageDrawDataOffset = uniformsOffset;
    meshBatch.vertexBuffer = draw->vertexBuffer();
    meshBatch.uvBuffer = draw->uvBuffer();
    meshBatch.indexBuffer = draw->indexBuffer();
}

// Writes two triangles (six vertices) covering the content bounds of the draw's previous clip into
// the triangle vertex buffer, then pushes a stencilClipReset draw over them.
void PLSRenderContext::LogicalFlush::pushStencilClipReset(StencilClipReset* draw)
{
    assert(m_hasDoneLayout);

    // Two triangles make up the quad.
    constexpr uint32_t kQuadVertexCount = 6;

    // Record where our vertices begin before writing any of them.
    uint32_t firstVertex = m_ctx->m_triangleVertexData.elementsWritten();
    auto [L, T, R, B] = AABB(getClipInfo(draw->previousClipID()).contentBounds);
    uint32_t Z = m_currentZIndex;
    assert(AABB(L, T, R, B).round() == draw->pixelBounds());
    assert(draw->resourceCounts().maxTriangleVertexCount == kQuadVertexCount);
    assert(m_ctx->m_triangleVertexData.hasRoomFor(kQuadVertexCount));

    // Corners in emission order: triangle (LB, LT, RB) followed by triangle (RB, LT, RT).
    const Vec2D quadCorners[kQuadVertexCount] = {Vec2D{L, B},
                                                 Vec2D{L, T},
                                                 Vec2D{R, B},
                                                 Vec2D{R, B},
                                                 Vec2D{L, T},
                                                 Vec2D{R, T}};
    for (const Vec2D& corner : quadCorners)
    {
        m_ctx->m_triangleVertexData.emplace_back(corner, 0, Z);
    }

    pushDraw(draw,
             DrawType::stencilClipReset,
             PaintType::clipUpdate,
             kQuadVertexCount,
             firstVertex);
}

// Flags the most recently pushed batch, if there is one, as needing a barrier. Must not be called
// when the flush uses the rasterOrdering interlock mode.
void PLSRenderContext::LogicalFlush::pushBarrier()
{
    assert(m_hasDoneLayout);
    assert(m_flushDesc.interlockMode != pls::InterlockMode::rasterOrdering);

    if (m_drawList.empty())
    {
        // No batch has been pushed yet, so there is nothing to flag.
        return;
    }
    m_drawList.tail().needsBarrier = true;
}

// Pushes a path draw via pushDraw() and then adds the path-specific shader features (even-odd
// fill, nested clipping) to the batch it landed in. Returns that batch.
//
//   draw         The path draw being pushed; supplies paint type, fill rule, and paint value.
//   drawType     Forwarded to pushDraw().
//   vertexCount  Forwarded to pushDraw() as the element count.
//   baseVertex   Forwarded to pushDraw() as the base element.
pls::DrawBatch& PLSRenderContext::LogicalFlush::pushPathDraw(PLSPathDraw* draw,
                                                             DrawType drawType,
                                                             uint32_t vertexCount,
                                                             uint32_t baseVertex)
{
    assert(m_hasDoneLayout);

    DrawBatch& pathBatch = pushDraw(draw, drawType, draw->paintType(), vertexCount, baseVertex);

    const bool usesEvenOdd = draw->fillRule() == FillRule::evenOdd;
    // Only clip updates carry an outer clip ID; a nonzero one means the clip is nested.
    const bool isNestedClipUpdate =
        draw->paintType() == PaintType::clipUpdate && draw->simplePaintValue().outerClipID != 0;

    auto requestedFeatures = pls::ShaderFeatures::NONE;
    if (usesEvenOdd)
    {
        requestedFeatures |= ShaderFeatures::ENABLE_EVEN_ODD;
    }
    if (isNestedClipUpdate)
    {
        requestedFeatures |= ShaderFeatures::ENABLE_NESTED_CLIPPING;
    }

    // Drop anything the frame's feature mask excludes, then fold the result into both the batch
    // and the flush-wide running total.
    pathBatch.shaderFeatures |= requestedFeatures & m_ctx->m_frameShaderFeaturesMask;
    m_combinedShaderFeatures |= pathBatch.shaderFeatures;

    // Every feature on the batch has to be valid for this draw type and interlock mode.
    assert((pathBatch.shaderFeatures &
            pls::ShaderFeaturesMaskFor(drawType, m_ctx->frameInterlockMode())) ==
           pathBatch.shaderFeatures);
    return pathBatch;
}

// Returns whether two draws can share a batch as far as their image textures are concerned.
//
// A null texture means the corresponding draw doesn't use an image paint, and such a draw can
// always be combined. When both draws use an image paint they must use the very same texture,
// since the image paint's texture is bound to a specific slot.
RIVE_ALWAYS_INLINE static bool can_combine_draw_images(const PLSTexture* currentDrawTexture,
                                                       const PLSTexture* nextDrawTexture)
{
    return currentDrawTexture == nullptr || nextDrawTexture == nullptr ||
           currentDrawTexture == nextDrawTexture;
}

// Adds 'draw' to the flush's draw list, either by appending it to the tail batch or by starting a
// new batch, and accumulates the shader features the draw requires. Returns the batch the draw
// ended up in.
//
//   draw          The draw being pushed.
//   drawType      Decides whether the draw may be merged into the tail batch at all.
//   paintType     Paint type of the draw. Image paints bind the draw's texture to the batch;
//                 clip updates skip the clip-rect and blend-mode features.
//   elementCount  Number of elements this draw adds (callers pass instance, vertex, or index
//                 counts depending on the draw type).
//   baseElement   First element of the draw. When merging, it must directly follow the elements
//                 already in the batch.
pls::DrawBatch& PLSRenderContext::LogicalFlush::pushDraw(PLSDraw* draw,
                                                         DrawType drawType,
                                                         pls::PaintType paintType,
                                                         uint32_t elementCount,
                                                         uint32_t baseElement)
{
    assert(m_hasDoneLayout);

    // Default to the conservative answer (start a new batch) so this flag is never read
    // uninitialized if 'drawType' holds a value that isn't listed below. The switch deliberately
    // has no 'default' label, so compilers can still warn about unhandled enumerators.
    bool needsNewBatch = true;
    switch (drawType)
    {
        case DrawType::midpointFanPatches:
        case DrawType::outerCurvePatches:
        case DrawType::plsAtomicInitialize:
        case DrawType::plsAtomicResolve:
        case DrawType::stencilClipReset:
            // These can merge into the tail batch as long as it has the same draw type, isn't
            // waiting on a barrier, and doesn't use a conflicting image texture.
            needsNewBatch =
                m_drawList.empty() || m_drawList.tail().drawType != drawType ||
                m_drawList.tail().needsBarrier ||
                !can_combine_draw_images(m_drawList.tail().imageTexture, draw->imageTexture());
            break;
        case DrawType::interiorTriangulation:
        case DrawType::imageRect:
        case DrawType::imageMesh:
            // We can't combine interior triangulations or image draws yet.
            needsNewBatch = true;
            break;
    }

    DrawBatch& batch = needsNewBatch ? m_drawList.emplace_back(m_ctx->perFrameAllocator(),
                                                               drawType,
                                                               draw,
                                                               elementCount,
                                                               baseElement)
                                     : m_drawList.tail();
    if (!needsNewBatch)
    {
        // Merging into the existing tail batch: verify compatibility, then link the draw in.
        assert(batch.drawType == drawType);
        assert(can_combine_draw_images(batch.imageTexture, draw->imageTexture()));
        assert(!batch.needsBarrier);
        if (m_flushDesc.interlockMode == pls::InterlockMode::depthStencil)
        {
            // depthStencil can't mix drawContents in a batch.
            assert(batch.drawContents == draw->drawContents());
            assert((batch.shaderFeatures & pls::ShaderFeatures::ENABLE_ADVANCED_BLEND) ==
                   (draw->blendMode() != BlendMode::srcOver));
            // If using KHR_blend_equation_advanced, we can't mix blend modes in a batch.
            assert(!m_ctx->platformFeatures().supportsKHRBlendEquations ||
                   batch.internalDrawList->blendMode() == draw->blendMode());
        }
        // The new draw's elements must directly follow the ones already in the batch.
        assert(batch.baseElement + batch.elementCount == baseElement);
        // Make 'draw' the new head of the batch's internal draw list, pointing at the old head.
        draw->setBatchInternalNeighbor(batch.internalDrawList);
        batch.internalDrawList = draw;
        batch.elementCount += elementCount;
    }

    if (paintType == PaintType::image)
    {
        // The batch adopts the first image texture it sees; any later image draw merged into it
        // has to use that same texture.
        assert(draw->imageTexture() != nullptr);
        if (batch.imageTexture == nullptr)
        {
            batch.imageTexture = draw->imageTexture();
        }
        assert(batch.imageTexture == draw->imageTexture());
    }

    // Work out which shader features this draw needs.
    auto shaderFeatures = ShaderFeatures::NONE;
    if (draw->clipID() != 0)
    {
        shaderFeatures |= ShaderFeatures::ENABLE_CLIPPING;
    }
    if (draw->hasClipRect() && paintType != PaintType::clipUpdate)
    {
        shaderFeatures |= ShaderFeatures::ENABLE_CLIP_RECT;
    }
    if (paintType != PaintType::clipUpdate)
    {
        switch (draw->blendMode())
        {
            case BlendMode::hue:
            case BlendMode::saturation:
            case BlendMode::color:
            case BlendMode::luminosity:
                // HSL modes need their own feature in addition to advanced blend.
                shaderFeatures |= ShaderFeatures::ENABLE_HSL_BLEND_MODES;
                [[fallthrough]];
            case BlendMode::screen:
            case BlendMode::overlay:
            case BlendMode::darken:
            case BlendMode::lighten:
            case BlendMode::colorDodge:
            case BlendMode::colorBurn:
            case BlendMode::hardLight:
            case BlendMode::softLight:
            case BlendMode::difference:
            case BlendMode::exclusion:
            case BlendMode::multiply:
                shaderFeatures |= ShaderFeatures::ENABLE_ADVANCED_BLEND;
                break;
            case BlendMode::srcOver:
                break;
        }
    }

    // Drop anything the frame's feature mask excludes, then fold the result into both the batch
    // and the flush-wide running total.
    batch.shaderFeatures |= shaderFeatures & m_ctx->m_frameShaderFeaturesMask;
    m_combinedShaderFeatures |= batch.shaderFeatures;
    batch.drawContents |= draw->drawContents();

    // Every feature on the batch has to be valid for this draw type and interlock mode.
    assert((batch.shaderFeatures &
            pls::ShaderFeaturesMaskFor(drawType, m_ctx->frameInterlockMode())) ==
           batch.shaderFeatures);
    return batch;
}
} // namespace rive::pls
