/*
* Copyright 2022 Rive
*/
#pragma once
#include "rive/enum_bitset.hpp"
#include "rive/math/aabb.hpp"
#include "rive/math/mat2d.hpp"
#include "rive/math/path_types.hpp"
#include "rive/math/vec2d.hpp"
#include "rive/shapes/paint/blend_mode.hpp"
#include "rive/shapes/paint/color.hpp"
#include "rive/pls/trivial_block_allocator.hpp"
namespace rive
{
class GrInnerFanTriangulator;
class RenderBuffer;
} // namespace rive
// This header defines constants and data structures for Rive's pixel local storage path rendering
// algorithm.
//
// Main algorithm:
// https://docs.google.com/document/d/19Uk9eyFxav6dNSYsI2ZyiX9zHU1YOaJsMB2sdDFVz6s/edit
//
// Batching multiple unique paths:
// https://docs.google.com/document/d/1DLrQimS5pbNaJJ2sAW5oSOsH6_glwDPo73-mtG5_zns/edit
//
// Batching strokes as well:
// https://docs.google.com/document/d/1CRKihkFjbd1bwT08ErMCP4fwSR7D4gnHvgdw_esY9GM/edit
namespace rive::pls
{
class PLSDraw;
class PLSGradient;
class PLSRenderContextImpl;
class PLSRenderTarget;
class PLSTexture;
// Tessellate in parametric space until each segment is within 1/4 pixel of the true curve.
constexpr static int kParametricPrecision = 4;
// Tessellate in polar space until the outset edge is within 1/8 pixel of the true stroke.
constexpr static int kPolarPrecision = 8;
// Maximum supported numbers of tessellated segments in a single curve.
constexpr static uint32_t kMaxParametricSegments = 1023;
constexpr static uint32_t kMaxPolarSegments = 1023;
// We allocate all our GPU buffers in rings. This ensures the CPU can prepare frames in parallel
// while the GPU renders them.
constexpr static int kBufferRingSize = 3;
// Every coverage value in pixel local storage has an associated 16-bit path ID. This ID enables us
// to batch multiple paths together without having to clear the coverage buffer in between. Since
// the ID is encoded as an fp16, the largest usable path ID must not be NaN (or, conservatively,
// must not have all 5 exponent bits set). We also skip denormalized values (exp == 0) because they
// have been empirically unreliable on Android as ID values.
constexpr static int kLargestFP16BeforeExponentAll1s = (0x1f << 10) - 1;
constexpr static int kLargestDenormalizedFP16 = 1023;
constexpr static int MaxPathID(int granularity)
{
// Floating point equality gets funky when the exponent bits are all 1's, so the largest pathID
// we can support is kLargestFP16BeforeExponentAll1s.
//
// The shader converts an integer path ID to fp16 as:
//
// (id + kLargestDenormalizedFP16) * granularity
//
// So the largest path ID we can support is as follows.
return kLargestFP16BeforeExponentAll1s / granularity - kLargestDenormalizedFP16;
}
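// Illustrative sanity checks (these values follow directly from the constants above; the
// granularities shown are hypothetical examples):
static_assert(kLargestFP16BeforeExponentAll1s == 31743);
static_assert(MaxPathID(1) == 30720);
static_assert(MaxPathID(2) == 14848);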
// Each contour has its own unique ID, which it uses to index a data record containing per-contour
// information. This ID is currently 16 bits.
constexpr static size_t kMaxContourID = 65535;
constexpr static uint32_t kContourIDMask = 0xffff;
static_assert((kMaxContourID & kContourIDMask) == kMaxContourID);
// Tessellation is performed by rendering vertices into a data texture. These values define the
// dimensions of the tessellation data texture.
constexpr static size_t kTessTextureWidth = 2048; // GL_MAX_TEXTURE_SIZE spec minimum on ES3/WebGL2.
constexpr static size_t kTessTextureWidthLog2 = 11;
static_assert(1 << kTessTextureWidthLog2 == kTessTextureWidth);
// Gradients are implemented by sampling a horizontal ramp of pixels allocated in a global gradient
// texture.
constexpr static size_t kGradTextureWidth = 512;
constexpr static size_t kGradTextureWidthInSimpleRamps = kGradTextureWidth / 2;
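// (A simple ramp occupies two texels, so each 512-texel row of the gradient texture holds 256
// simple ramps.)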
// Backend-specific capabilities/workarounds and fine tuning.
struct PlatformFeatures
{
bool supportsPixelLocalStorage = true;
bool supportsRasterOrdering = true; // Can pixel local storage accesses be raster ordered?
bool supportsKHRBlendEquations = false; // Use KHR_blend_equation_advanced in depthStencil mode?
bool supportsClipPlanes = false; // Required for @ENABLE_CLIP_RECT in depthStencil mode.
bool supportsBindlessTextures = false;
bool avoidFlatVaryings = false;
bool invertOffscreenY = false; // Invert Y when drawing to offscreen render targets? (Gradient
// and tessellation textures.)
    bool uninvertOnScreenY = false; // Does the graphics layer append a negation of Y to
                                    // on-screen vertex shaders that we then need to undo?
bool fragCoordBottomUp = false; // Does the built-in pixel coordinate in the fragment shader go
// bottom-up or top-down?
bool atomicPLSMustBeInitializedAsDraw = false; // Backend cannot initialize PLS with typical
// clear/load APIs in atomic mode. Issue a
// "DrawType::plsAtomicInitialize" draw instead.
uint8_t pathIDGranularity = 1; // Workaround for precision issues. Determines how far apart we
// space unique path IDs.
};
// Gradient color stops are implemented as a horizontal span of pixels in a global gradient
// texture. They are rendered by "GradientSpan" instances.
struct GradientSpan
{
// x0Fixed and x1Fixed are normalized texel x coordinates, in the fixed-point range
// 0..65535.
RIVE_ALWAYS_INLINE void set(uint32_t x0Fixed,
uint32_t x1Fixed,
float y_,
ColorInt color0_,
ColorInt color1_)
{
assert(x0Fixed < 65536);
assert(x1Fixed < 65536);
horizontalSpan = (x1Fixed << 16) | x0Fixed;
y = y_;
color0 = color0_;
color1 = color1_;
}
uint32_t horizontalSpan;
uint32_t y;
uint32_t color0;
uint32_t color1;
};
static_assert(sizeof(GradientSpan) == sizeof(uint32_t) * 4);
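// A hypothetical usage sketch (values are illustrative only): one span covering the full width of
// row 3, ramping from opaque red to opaque blue:
//
//     GradientSpan span;
//     span.set(0, 65535, 3, 0xffff0000, 0xff0000ff);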
// Each curve gets tessellated into vertices. This is performed by rendering a horizontal span
// of positions and normals into the tessellation data texture, GP-GPU style. TessVertexSpan
// defines one instance of a horizontal tessellation span for rendering.
//
// Each span has an optional reflection, rendered right to left, with the same vertices in
// reverse order. These are used to draw mirrored patches with negative coverage when we have
// back-face culling enabled. This emits every triangle twice, once clockwise and once
// counterclockwise, and back-face culling naturally selects the triangle with the appropriately
// signed coverage (discarding the other).
struct TessVertexSpan
{
RIVE_ALWAYS_INLINE void set(const Vec2D pts_[4],
Vec2D joinTangent_,
float y_,
int32_t x0,
int32_t x1,
uint32_t parametricSegmentCount,
uint32_t polarSegmentCount,
uint32_t joinSegmentCount,
uint32_t contourIDWithFlags_)
{
set(pts_,
joinTangent_,
y_,
x0,
x1,
std::numeric_limits<float>::quiet_NaN(), // Discard the reflection.
-1,
-1,
parametricSegmentCount,
polarSegmentCount,
joinSegmentCount,
contourIDWithFlags_);
}
RIVE_ALWAYS_INLINE void set(const Vec2D pts_[4],
Vec2D joinTangent_,
float y_,
int32_t x0,
int32_t x1,
float reflectionY_,
int32_t reflectionX0,
int32_t reflectionX1,
uint32_t parametricSegmentCount,
uint32_t polarSegmentCount,
uint32_t joinSegmentCount,
uint32_t contourIDWithFlags_)
{
RIVE_INLINE_MEMCPY(pts, pts_, sizeof(pts));
joinTangent = joinTangent_;
y = y_;
reflectionY = reflectionY_;
x0x1 = (x1 << 16) | (x0 & 0xffff);
reflectionX0X1 = (reflectionX1 << 16) | (reflectionX0 & 0xffff);
segmentCounts =
(joinSegmentCount << 20) | (polarSegmentCount << 10) | parametricSegmentCount;
contourIDWithFlags = contourIDWithFlags_;
// Ensure we didn't lose any data from packing.
assert(x0 == x0x1 << 16 >> 16);
assert(x1 == x0x1 >> 16);
assert(reflectionX0 == reflectionX0X1 << 16 >> 16);
assert(reflectionX1 == reflectionX0X1 >> 16);
assert((segmentCounts & 0x3ff) == parametricSegmentCount);
assert(((segmentCounts >> 10) & 0x3ff) == polarSegmentCount);
assert(segmentCounts >> 20 == joinSegmentCount);
}
Vec2D pts[4]; // Cubic bezier curve.
Vec2D joinTangent; // Ending tangent of the join that follows the cubic.
float y;
float reflectionY;
int32_t x0x1;
int32_t reflectionX0X1;
uint32_t segmentCounts; // [joinSegmentCount, polarSegmentCount, parametricSegmentCount]
uint32_t contourIDWithFlags; // flags | contourID
};
static_assert(sizeof(TessVertexSpan) == sizeof(float) * 16);
// Tessellation spans are drawn as two distinct, 1px-tall rectangles: the span and its reflection.
constexpr uint16_t kTessSpanIndices[4 * 3] = {0, 1, 2, 2, 1, 3, 4, 5, 6, 6, 5, 7};
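// (Indices 0..3 reference the corners of the span rectangle; indices 4..7 reference the corners
// of its reflection. Each rectangle is drawn as two triangles.)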
// ImageRects are a special type of non-overlapping antialiased draw that we only use in atomic
// mode when bindless textures are unavailable. They allow us to bind a texture and draw it in
// its entirety in a single pass.
struct ImageRectVertex
{
float x;
float y;
float aaOffsetX;
float aaOffsetY;
};
constexpr ImageRectVertex kImageRectVertices[12] = {
{0, 0, .0, -1},
{1, 0, .0, -1},
{1, 0, +1, .0},
{1, 1, +1, .0},
{1, 1, .0, +1},
{0, 1, .0, +1},
{0, 1, -1, .0},
{0, 0, -1, .0},
{0, 0, +1, +1},
{1, 0, -1, +1},
{1, 1, -1, -1},
{0, 1, +1, -1},
};
constexpr uint16_t kImageRectIndices[14 * 3] = {
8, 0, 9, 9, 0, 1, 1, 2, 9, 9, 2, 10, 10, 2, 3, 3, 4, 10, 10, 4, 11,
11, 4, 5, 5, 6, 11, 11, 6, 8, 8, 6, 7, 7, 0, 8, 9, 10, 8, 10, 8, 11,
};
enum class PaintType : uint32_t
{
solidColor,
linearGradient,
radialGradient,
image,
clipUpdate, // Update the clip buffer instead of drawing to the framebuffer.
};
// Specifies the location of a simple or complex horizontal color ramp within the gradient texture.
// A simple color ramp is two texels wide, beginning at the specified row and column.
// A complex color ramp spans the entire width of the gradient texture, on the row:
// "GradTextureLayout::complexOffsetY + ColorRampLocation::row".
struct ColorRampLocation
{
constexpr static uint16_t kComplexGradientMarker = 0xffff;
bool isComplex() const { return col == kComplexGradientMarker; }
uint16_t row;
uint16_t col;
};
// Most of a paint's information can be described in a single value. Gradients and images reference
// an additional PLSGradient* and PLSTexture* respectively.
union SimplePaintValue
{
ColorInt color = 0xff000000; // PaintType::solidColor
    ColorRampLocation colorRampLocation; // PaintType::linearGradient, PaintType::radialGradient
float imageOpacity; // PaintType::image
    uint32_t outerClipID;                // PaintType::clipUpdate
};
static_assert(sizeof(SimplePaintValue) == 4);
// This class encapsulates a matrix that maps from _fragCoord to a space where the clipRect is the
// normalized rectangle: [-1, -1, +1, +1]
class ClipRectInverseMatrix
{
public:
// When the clipRect inverse matrix is singular (e.g., all 0 in scale and skew), the shader
// uses tx and ty as fixed clip coverage values instead of finding edge distances.
constexpr static ClipRectInverseMatrix WideOpen() { return Mat2D{0, 0, 0, 0, 1, 1}; }
constexpr static ClipRectInverseMatrix Empty() { return Mat2D{0, 0, 0, 0, 0, 0}; }
ClipRectInverseMatrix() = default;
ClipRectInverseMatrix(const Mat2D& clipMatrix, const AABB& clipRect)
{
reset(clipMatrix, clipRect);
}
void reset(const Mat2D& clipMatrix, const AABB& clipRect);
const Mat2D& inverseMatrix() const { return m_inverseMatrix; }
private:
constexpr ClipRectInverseMatrix(const Mat2D& inverseMatrix) : m_inverseMatrix(inverseMatrix) {}
Mat2D m_inverseMatrix;
};
// Specifies the height of the gradient texture, and the row at which we transition from simple
// color ramps to complex.
//
// This information is computed at flush time, once we know exactly how many color ramps of each
// type will be in the gradient texture.
struct GradTextureLayout
{
uint32_t complexOffsetY; // Row of the first complex gradient.
float inverseHeight; // 1 / textureHeight
};
// Once all curves in a contour have been tessellated, we render the tessellated vertices in
// "patches" (aka specific instanced geometry).
//
// See:
// https://docs.google.com/document/d/19Uk9eyFxav6dNSYsI2ZyiX9zHU1YOaJsMB2sdDFVz6s/edit#heading=h.fa4kubk3vimk
//
// With strokes:
// https://docs.google.com/document/d/1CRKihkFjbd1bwT08ErMCP4fwSR7D4gnHvgdw_esY9GM/edit#heading=h.dcd0c58pxfs5
//
// A single patch spans N tessellation segments, connecting N + 1 tessellation vertices. It is
// composed of an AA border and fan triangles. The specifics of the fan triangles depend on
// the PatchType.
enum class PatchType
{
// Patches fan around the contour midpoint. Outer edges are inset by ~1px, followed by a
// ~1px AA ramp.
midpointFan,
// Patches only cover the AA ramps and interiors of bezier curves. The interior path
// triangles that connect the outer curves are triangulated on the CPU to eliminate overlap,
// and are drawn in a separate call. AA ramps are split down the middle (on the same lines
// as the interior triangulation), and drawn with a ~1/2px outset AA ramp and a ~1/2px inset
// AA ramp that overlaps the inner tessellation and has negative coverage. A lone bowtie
// join is emitted at the end of the patch to tie the outer curves together.
outerCurves,
};
// When tessellating path vertices, we have the ability to generate the triangles wound in forward
// or reverse order.
// Depending on the path and the rendering algorithm, we will want the triangles wound forward,
// reverse, or BOTH.
enum class ContourDirections
{
none = 0,
forward = 1 << 0,
reverse = 1 << 1,
reverseAndForward = reverse | forward, // Generate two sets of triangles: reverse then forward.
};
RIVE_MAKE_ENUM_BITSET(ContourDirections)
struct PatchVertex
{
void set(float localVertexID_, float outset_, float fillCoverage_, float params_)
{
localVertexID = localVertexID_;
outset = outset_;
fillCoverage = fillCoverage_;
params = params_;
setMirroredPosition(localVertexID_, outset_, fillCoverage_);
}
// Patch vertices can have an optional, alternate position when mirrored. This is so we can
// ensure the diagonals inside the stroke line up on both versions of the patch (mirrored
// and not).
void setMirroredPosition(float localVertexID_, float outset_, float fillCoverage_)
{
mirroredVertexID = localVertexID_;
mirroredOutset = outset_;
mirroredFillCoverage = fillCoverage_;
}
float localVertexID; // 0 or 1 -- which tessellated vertex of the two that we are connecting?
float outset; // Outset from the tessellated position, in the direction of the normal.
float fillCoverage; // 0..1 for the stroke. 1 all around for the triangles.
// (Coverage will be negated later for counterclockwise triangles.)
int32_t params; // "(patchSize << 2) | [flags::kStrokeVertex,
// flags::kFanVertex,
// flags::kFanMidpointVertex]"
float mirroredVertexID;
float mirroredOutset;
float mirroredFillCoverage;
int32_t padding = 0;
};
static_assert(sizeof(PatchVertex) == sizeof(float) * 8);
// # of tessellation segments spanned by the midpoint fan patch.
constexpr static uint32_t kMidpointFanPatchSegmentSpan = 8;
// # of tessellation segments spanned by the outer curve patch. (In this particular instance,
// the final segment is a bowtie join with zero length and no fan triangle.)
constexpr static uint32_t kOuterCurvePatchSegmentSpan = 17;
// Define vertex and index buffers that contain all the triangles in every PatchType.
constexpr static uint32_t kMidpointFanPatchVertexCount =
kMidpointFanPatchSegmentSpan * 4 /*Stroke and/or AA outer ramp*/ +
(kMidpointFanPatchSegmentSpan + 1) /*Curve fan*/ + 1 /*Triangle from path midpoint*/;
constexpr static uint32_t kMidpointFanPatchBorderIndexCount =
kMidpointFanPatchSegmentSpan * 6 /*Stroke and/or AA outer ramp*/;
constexpr static uint32_t kMidpointFanPatchIndexCount =
kMidpointFanPatchBorderIndexCount /*Stroke and/or AA outer ramp*/ +
(kMidpointFanPatchSegmentSpan - 1) * 3 /*Curve fan*/ + 3 /*Triangle from path midpoint*/;
constexpr static uint32_t kMidpointFanPatchBaseIndex = 0;
static_assert((kMidpointFanPatchBaseIndex * sizeof(uint16_t)) % 4 == 0);
constexpr static uint32_t kOuterCurvePatchVertexCount =
kOuterCurvePatchSegmentSpan * 8 /*AA center ramp with bowtie*/ +
kOuterCurvePatchSegmentSpan /*Curve fan*/;
constexpr static uint32_t kOuterCurvePatchBorderIndexCount =
kOuterCurvePatchSegmentSpan * 12 /*AA center ramp with bowtie*/;
constexpr static uint32_t kOuterCurvePatchIndexCount =
kOuterCurvePatchBorderIndexCount /*AA center ramp with bowtie*/ +
(kOuterCurvePatchSegmentSpan - 2) * 3 /*Curve fan*/;
constexpr static uint32_t kOuterCurvePatchBaseIndex = kMidpointFanPatchIndexCount;
static_assert((kOuterCurvePatchBaseIndex * sizeof(uint16_t)) % 4 == 0);
constexpr static uint32_t kPatchVertexBufferCount =
kMidpointFanPatchVertexCount + kOuterCurvePatchVertexCount;
constexpr static uint32_t kPatchIndexBufferCount =
kMidpointFanPatchIndexCount + kOuterCurvePatchIndexCount;
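// Illustrative sanity checks: the patch arithmetic above works out to the following totals.
static_assert(kMidpointFanPatchVertexCount == 42);
static_assert(kMidpointFanPatchIndexCount == 72);
static_assert(kOuterCurvePatchVertexCount == 153);
static_assert(kOuterCurvePatchIndexCount == 249);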
void GeneratePatchBufferData(PatchVertex vertices[kPatchVertexBufferCount],
                             uint16_t indices[kPatchIndexBufferCount]);
enum class DrawType : uint8_t
{
midpointFanPatches, // Standard paths and/or strokes.
outerCurvePatches, // Just the outer curves of a path; the interior will be triangulated.
interiorTriangulation,
imageRect,
imageMesh,
plsAtomicInitialize, // Clear/init PLS data when we can't do it with existing clear/load APIs.
plsAtomicResolve, // Resolve PLS data to the final renderTarget color in atomic mode.
stencilClipReset, // Clear or intersect (based on DrawContents) the stencil clip bit.
};
constexpr static uint32_t PatchSegmentSpan(DrawType drawType)
{
switch (drawType)
{
case DrawType::midpointFanPatches:
return kMidpointFanPatchSegmentSpan;
case DrawType::outerCurvePatches:
return kOuterCurvePatchSegmentSpan;
default:
RIVE_UNREACHABLE();
}
}
constexpr static uint32_t PatchIndexCount(DrawType drawType)
{
switch (drawType)
{
case DrawType::midpointFanPatches:
return kMidpointFanPatchIndexCount;
case DrawType::outerCurvePatches:
return kOuterCurvePatchIndexCount;
default:
RIVE_UNREACHABLE();
}
}
constexpr static uint32_t PatchBorderIndexCount(DrawType drawType)
{
switch (drawType)
{
case DrawType::midpointFanPatches:
return kMidpointFanPatchBorderIndexCount;
case DrawType::outerCurvePatches:
return kOuterCurvePatchBorderIndexCount;
default:
RIVE_UNREACHABLE();
}
}
constexpr static uint32_t PatchFanIndexCount(DrawType drawType)
{
return PatchIndexCount(drawType) - PatchBorderIndexCount(drawType);
}
constexpr static uintptr_t PatchBaseIndex(DrawType drawType)
{
switch (drawType)
{
case DrawType::midpointFanPatches:
return kMidpointFanPatchBaseIndex;
case DrawType::outerCurvePatches:
return kOuterCurvePatchBaseIndex;
default:
RIVE_UNREACHABLE();
}
}
constexpr static uintptr_t PatchFanBaseIndex(DrawType drawType)
{
return PatchBaseIndex(drawType) + PatchBorderIndexCount(drawType);
}
// Specifies what to do with the render target at the beginning of a flush.
enum class LoadAction
{
clear,
preserveRenderTarget,
dontCare,
};
// Synchronization method for pixel local storage with overlapping fragments.
enum class InterlockMode
{
rasterOrdering,
atomics,
depthStencil,
};
// "Uber shader" features that can be #defined in a draw shader.
// This set is strictly limited to switches that don't *change* the behavior of the shader, i.e.,
// turning them all on will enable all types of Rive content, but simple content will still draw
// identically; for better performance, we can turn a feature off when we know a batch doesn't
// need it.
enum class ShaderFeatures
{
NONE = 0,
// Whole program features.
ENABLE_CLIPPING = 1 << 0,
ENABLE_CLIP_RECT = 1 << 1,
ENABLE_ADVANCED_BLEND = 1 << 2,
// Fragment-only features.
ENABLE_EVEN_ODD = 1 << 3,
ENABLE_NESTED_CLIPPING = 1 << 4,
ENABLE_HSL_BLEND_MODES = 1 << 5,
};
RIVE_MAKE_ENUM_BITSET(ShaderFeatures)
constexpr static size_t kShaderFeatureCount = 6;
constexpr static ShaderFeatures kAllShaderFeatures =
static_cast<pls::ShaderFeatures>((1 << kShaderFeatureCount) - 1);
constexpr static ShaderFeatures kVertexShaderFeaturesMask = ShaderFeatures::ENABLE_CLIPPING |
ShaderFeatures::ENABLE_CLIP_RECT |
ShaderFeatures::ENABLE_ADVANCED_BLEND;
constexpr static ShaderFeatures ShaderFeaturesMaskFor(InterlockMode interlockMode)
{
switch (interlockMode)
{
case InterlockMode::rasterOrdering:
return kAllShaderFeatures;
case InterlockMode::atomics:
return kAllShaderFeatures & ~ShaderFeatures::ENABLE_NESTED_CLIPPING;
case InterlockMode::depthStencil:
return ShaderFeatures::ENABLE_CLIP_RECT | ShaderFeatures::ENABLE_ADVANCED_BLEND |
ShaderFeatures::ENABLE_HSL_BLEND_MODES;
}
RIVE_UNREACHABLE();
}
constexpr static ShaderFeatures ShaderFeaturesMaskFor(DrawType drawType,
InterlockMode interlockMode)
{
ShaderFeatures mask = ShaderFeatures::NONE;
switch (drawType)
{
case DrawType::imageRect:
case DrawType::imageMesh:
if (interlockMode != pls::InterlockMode::atomics)
{
mask = ShaderFeatures::ENABLE_CLIPPING | ShaderFeatures::ENABLE_CLIP_RECT |
ShaderFeatures::ENABLE_ADVANCED_BLEND |
ShaderFeatures::ENABLE_HSL_BLEND_MODES;
break;
}
            // Since atomic mode has to resolve previous draws, images need to consider the same
            // shader features as path draws.
[[fallthrough]];
case DrawType::midpointFanPatches:
case DrawType::outerCurvePatches:
case DrawType::interiorTriangulation:
case DrawType::plsAtomicResolve:
mask = kAllShaderFeatures;
break;
case DrawType::plsAtomicInitialize:
assert(interlockMode == pls::InterlockMode::atomics);
mask = ShaderFeatures::ENABLE_CLIPPING | ShaderFeatures::ENABLE_ADVANCED_BLEND;
break;
case DrawType::stencilClipReset:
mask = ShaderFeatures::NONE;
break;
}
return mask & ShaderFeaturesMaskFor(interlockMode);
}
// Miscellaneous switches that *do* affect the behavior of a shader. A backend can add these to a
// shader key if it wants to implement the behavior.
enum class ShaderMiscFlags : uint32_t
{
none = 0,
// DrawType::plsAtomicInitialize only. Also store the color clear value to PLS when drawing a
// clear, in addition to clearing the other PLS planes.
storeColorClear = 1 << 0,
// DrawType::plsAtomicInitialize only. Swizzle the existing framebuffer contents from BGRA to
// RGBA. (For when this data had to get copied from a BGRA target.)
swizzleColorBGRAToRGBA = 1 << 1,
// DrawType::plsAtomicResolve only. Optimization for when rendering to an offscreen texture.
//
// It renders the final "resolve" operation directly to the renderTarget in a single pass,
    // instead of (1) resolving the offscreen texture, and then (2) copying the offscreen texture
    // back to the renderTarget.
coalescedResolveAndTransfer = 1 << 2,
};
RIVE_MAKE_ENUM_BITSET(ShaderMiscFlags)
// Returns a unique value that can be used to key a shader.
uint32_t ShaderUniqueKey(DrawType, ShaderFeatures, InterlockMode, ShaderMiscFlags);
extern const char* GetShaderFeatureGLSLName(ShaderFeatures feature);
// Flags indicating the contents of a draw. These don't affect shaders, but in depthStencil mode
// they are needed to break up batching. (depthStencil needs different stencil/blend state,
// depending on the DrawContents.)
//
// These also affect the draw sort order, so we attempt to associate more expensive shader branch
// misses with higher flags.
enum class DrawContents
{
none = 0,
opaquePaint = 1 << 0,
stroke = 1 << 1,
evenOddFill = 1 << 2,
activeClip = 1 << 3,
clipUpdate = 1 << 4,
advancedBlend = 1 << 5,
};
RIVE_MAKE_ENUM_BITSET(DrawContents)
// A nestedClip draw updates the clip buffer while simultaneously clipping against the outerClip
// that is currently in the clip buffer.
constexpr static pls::DrawContents kNestedClipUpdateMask =
(pls::DrawContents::activeClip | pls::DrawContents::clipUpdate);
// Low-level batch of geometry to submit to the GPU.
struct DrawBatch
{
DrawBatch(DrawType drawType_,
const PLSDraw* internalDrawList_,
uint32_t elementCount_,
uint32_t baseElement_) :
drawType(drawType_),
internalDrawList(internalDrawList_),
elementCount(elementCount_),
baseElement(baseElement_)
{}
const DrawType drawType;
const PLSDraw* internalDrawList;
uint32_t elementCount; // Vertex, index, or instance count.
uint32_t baseElement; // Base vertex, index, or instance.
DrawContents drawContents = DrawContents::none;
ShaderFeatures shaderFeatures = ShaderFeatures::NONE;
bool needsBarrier = false; // Pixel-local-storage barrier required after submitting this batch.
// DrawType::imageRect and DrawType::imageMesh.
uint32_t imageDrawDataOffset = 0;
const PLSTexture* imageTexture = nullptr;
// DrawType::imageMesh.
    const RenderBuffer* vertexBuffer = nullptr;
    const RenderBuffer* uvBuffer = nullptr;
    const RenderBuffer* indexBuffer = nullptr;
};
// Simple gradients only have 2 texels, so we write them to mapped texture memory from the CPU
// instead of rendering them.
struct TwoTexelRamp
{
void set(const ColorInt colors[2])
{
UnpackColorToRGBA8(colors[0], colorData);
UnpackColorToRGBA8(colors[1], colorData + 4);
}
uint8_t colorData[8];
};
static_assert(sizeof(TwoTexelRamp) == 8 * sizeof(uint8_t));
// Detailed description of exactly how a PLSRenderContextImpl should bind its buffers and draw a
// flush. A typical flush is done in 4 steps:
//
// 1. Render the complex gradients from the gradSpanBuffer to the gradient texture
// (complexGradSpanCount, firstComplexGradSpan, complexGradRowsTop, complexGradRowsHeight).
//
// 2. Transfer the simple gradient texels from the simpleColorRampsBuffer to the top of the
// gradient texture (simpleGradTexelsWidth, simpleGradTexelsHeight,
// simpleGradDataOffsetInBytes).
//
// 3. Render the tessellation texture from the tessVertexSpanBuffer (tessVertexSpanCount,
// firstTessVertexSpan, tessDataHeight).
//
// 4. Execute the drawList, reading from the newly rendered resource textures.
//
struct FlushDescriptor
{
PLSRenderTarget* renderTarget = nullptr;
ShaderFeatures combinedShaderFeatures = ShaderFeatures::NONE;
InterlockMode interlockMode = InterlockMode::rasterOrdering;
int msaaSampleCount = 0; // (0 unless interlockMode is depthStencil.)
LoadAction colorLoadAction = LoadAction::clear;
ColorInt clearColor = 0; // When loadAction == LoadAction::clear.
uint32_t coverageClearValue = 0;
IAABB renderTargetUpdateBounds; // drawBounds, or renderTargetBounds if loadAction ==
// LoadAction::clear.
size_t flushUniformDataOffsetInBytes = 0;
size_t pathCount = 0;
size_t firstPath = 0;
size_t firstPaint = 0;
size_t firstPaintAux = 0;
size_t contourCount = 0;
size_t firstContour = 0;
size_t complexGradSpanCount = 0;
size_t firstComplexGradSpan = 0;
size_t tessVertexSpanCount = 0;
size_t firstTessVertexSpan = 0;
uint32_t simpleGradTexelsWidth = 0;
uint32_t simpleGradTexelsHeight = 0;
size_t simpleGradDataOffsetInBytes = 0;
uint32_t complexGradRowsTop = 0;
uint32_t complexGradRowsHeight = 0;
uint32_t tessDataHeight = 0;
const BlockAllocatedLinkedList<DrawBatch>* drawList = nullptr;
bool hasTriangleVertices = false;
bool wireframe = false;
void* externalCommandBuffer = nullptr; // Required on Metal.
bool isFinalFlushOfFrame = false;
};
// Returns true if the PLS shaders emit color directly to the raster pipeline, instead of rendering
// via pixel local storage. In this case, the shaders will expect hardware blending to be enabled
// and configured for premultiplied "src-over". It is the backend's responsibility to check this
// method and configure the hardware blend state as needed.
constexpr RIVE_ALWAYS_INLINE bool ShadersEmitColorToRasterPipeline(
InterlockMode interlockMode,
ShaderFeatures combinedShaderFeatures)
{
return interlockMode == InterlockMode::atomics &&
!(combinedShaderFeatures & ShaderFeatures::ENABLE_ADVANCED_BLEND);
}
// Returns the smallest number that can be added to 'value' such that the sum is a multiple of
// 'Alignment'.
template <uint32_t Alignment> RIVE_ALWAYS_INLINE uint32_t PaddingToAlignUp(uint32_t value)
{
constexpr uint32_t maxMultipleOfAlignment =
std::numeric_limits<uint32_t>::max() / Alignment * Alignment;
uint32_t padding = (maxMultipleOfAlignment - value) % Alignment;
assert((value + padding) % Alignment == 0);
return padding;
}
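// For example (a hypothetical call): PaddingToAlignUp<256>(300) returns 212, because
// 300 + 212 == 512 is the next multiple of 256.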
// Returns the area of the (potentially non-rectangular) quadrilateral that results from
// transforming the given bounds by the given matrix.
float FindTransformedArea(const AABB& bounds, const Mat2D&);
// Convert a BlendMode to the tightly-packed range used by PLS shaders.
uint32_t ConvertBlendModeToPLSBlendMode(BlendMode riveMode);
// Swizzles the byte order of ColorInt to little-endian RGBA (the order expected by GLSL).
RIVE_ALWAYS_INLINE uint32_t SwizzleRiveColorToRGBA(ColorInt riveColor)
{
return (riveColor & 0xff00ff00) | (math::rotateleft32(riveColor, 16) & 0x00ff00ff);
}
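// For example, opaque red (0xffff0000 in Rive's 0xAARRGGBB layout) swizzles to 0xff0000ff
// (0xAABBGGRR), whose little-endian bytes read R, G, B, A.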
// Used for fields that lay out write-only mapped GPU memory.
// "volatile" to discourage the compiler from generating code that reads these values
// (e.g., don't let the compiler generate "x ^= x" instead of "x = 0").
// "RIVE_MAYBE_UNUSED" to suppress -Wunused-private-field.
#define WRITEONLY RIVE_MAYBE_UNUSED volatile
// Per-flush shared uniforms used by all shaders.
struct FlushUniforms
{
public:
FlushUniforms(const FlushDescriptor&, const PlatformFeatures&);
FlushUniforms(const FlushUniforms& other) { *this = other; }
void operator=(const FlushUniforms& rhs)
{
memcpy(this, &rhs, sizeof(*this) - sizeof(m_padTo256Bytes));
}
bool operator!=(const FlushUniforms& rhs) const
{
return memcmp(this, &rhs, sizeof(*this) - sizeof(m_padTo256Bytes)) != 0;
}
private:
class InverseViewports
{
public:
InverseViewports() = default;
InverseViewports(const FlushDescriptor&, const PlatformFeatures&);
private:
WRITEONLY float m_vals[4]; // [complexGradientsY, tessDataY, renderTargetX, renderTargetY]
};
WRITEONLY InverseViewports m_inverseViewports;
WRITEONLY uint32_t m_renderTargetWidth = 0;
WRITEONLY uint32_t m_renderTargetHeight = 0;
WRITEONLY uint32_t m_colorClearValue; // Only used if clears are implemented as draws.
WRITEONLY uint32_t m_coverageClearValue; // Only used if clears are implemented as draws.
WRITEONLY IAABB m_renderTargetUpdateBounds; // drawBounds, or renderTargetBounds if there is a
// clear. (Used by the "@RESOLVE_PLS" step in
// InterlockMode::atomics.)
WRITEONLY uint32_t m_pathIDGranularity = 0; // Spacing between adjacent path IDs
// (1 if IEEE compliant).
WRITEONLY float m_vertexDiscardValue = std::numeric_limits<float>::quiet_NaN();
WRITEONLY uint8_t m_padTo256Bytes[256 - 56]; // Uniform blocks must be multiples of 256 bytes in
// size.
};
static_assert(sizeof(FlushUniforms) == 256);
// Storage buffers are logically laid out as arrays of structs on the CPU, but the GPU shaders
// access them as arrays of basic types. We do it this way in order to be able to easily polyfill
// them with textures.
//
// This enum defines the underlying basic type that each storage buffer struct is laid on top of.
enum StorageBufferStructure
{
uint32x4,
uint32x2,
float32x4,
};
constexpr static uint32_t StorageBufferElementSizeInBytes(StorageBufferStructure bufferStructure)
{
switch (bufferStructure)
{
case StorageBufferStructure::uint32x4:
return sizeof(uint32_t) * 4;
case StorageBufferStructure::uint32x2:
return sizeof(uint32_t) * 2;
case StorageBufferStructure::float32x4:
return sizeof(float) * 4;
}
RIVE_UNREACHABLE();
}
// High level structure of the "path" storage buffer. Each path has a unique data record on the GPU
// that is accessed from the vertex shader.
struct PathData
{
public:
constexpr static StorageBufferStructure kBufferStructure = StorageBufferStructure::uint32x4;
void set(const Mat2D&, float strokeRadius, uint32_t zIndex);
private:
WRITEONLY float m_matrix[6];
WRITEONLY float m_strokeRadius; // "0" indicates that the path is filled, not stroked.
WRITEONLY uint32_t m_zIndex; // pls::InterlockMode::depthStencil only.
};
static_assert(sizeof(PathData) == StorageBufferElementSizeInBytes(PathData::kBufferStructure) * 2);
static_assert(256 % sizeof(PathData) == 0);
constexpr static size_t kPathBufferAlignmentInElements = 256 / sizeof(PathData);
// High level structure of the "paint" storage buffer. Each path also has a small data record
// describing its paint at a high level. Complex paints (gradients, images, or any path with a
// clipRect) store additional rendering info in the PaintAuxData buffer.
struct PaintData
{
public:
constexpr static StorageBufferStructure kBufferStructure = StorageBufferStructure::uint32x2;
void set(FillRule,
PaintType,
SimplePaintValue,
GradTextureLayout,
uint32_t clipID,
bool hasClipRect,
BlendMode);
private:
WRITEONLY uint32_t m_params; // [clipID, flags, paintType]
union
{
WRITEONLY uint32_t m_color; // PaintType::solidColor
        WRITEONLY float m_gradTextureY; // PaintType::linearGradient, PaintType::radialGradient
WRITEONLY float m_opacity; // PaintType::image
WRITEONLY uint32_t m_shiftedClipReplacementID; // PaintType::clipUpdate
};
};
static_assert(sizeof(PaintData) == StorageBufferElementSizeInBytes(PaintData::kBufferStructure));
static_assert(256 % sizeof(PaintData) == 0);
constexpr static size_t kPaintBufferAlignmentInElements = 256 / sizeof(PaintData);
// Structure of the "paintAux" storage buffer. Gradients, images, and clipRects store their details
// here, indexed by pathID.
struct PaintAuxData
{
public:
constexpr static StorageBufferStructure kBufferStructure = StorageBufferStructure::float32x4;
void set(const Mat2D& viewMatrix,
PaintType,
SimplePaintValue,
const PLSGradient*,
const PLSTexture*,
const ClipRectInverseMatrix*,
const PLSRenderTarget*,
const pls::PlatformFeatures&);
private:
WRITEONLY float m_matrix[6]; // Maps _fragCoord to paint coordinates.
union
{
        WRITEONLY float m_gradTextureHorizontalSpan[2]; // PaintType::linearGradient,
                                                        // PaintType::radialGradient
WRITEONLY uint32_t m_bindlessTextureHandle[2]; // PaintType::image
};
WRITEONLY float m_clipRectInverseMatrix[6]; // Maps _fragCoord to normalized clipRect coords.
WRITEONLY Vec2D m_inverseFwidth; // -1 / fwidth(matrix * _fragCoord) -- for antialiasing.
};
static_assert(sizeof(PaintAuxData) ==
StorageBufferElementSizeInBytes(PaintAuxData::kBufferStructure) * 4);
static_assert(256 % sizeof(PaintAuxData) == 0);
constexpr static size_t kPaintAuxBufferAlignmentInElements = 256 / sizeof(PaintAuxData);
// High level structure of the "contour" storage buffer. Each contour of every path has a data
// record describing its info.
struct ContourData
{
public:
constexpr static StorageBufferStructure kBufferStructure = StorageBufferStructure::uint32x4;
ContourData(Vec2D midpoint, uint32_t pathID, uint32_t vertexIndex0) :
m_midpoint(midpoint), m_pathID(pathID), m_vertexIndex0(vertexIndex0)
{}
private:
WRITEONLY Vec2D m_midpoint; // Midpoint of the curve endpoints in just this contour.
WRITEONLY uint32_t m_pathID; // ID of the path this contour belongs to.
WRITEONLY uint32_t m_vertexIndex0; // Index of the first tessellation vertex of the contour.
};
static_assert(sizeof(ContourData) ==
StorageBufferElementSizeInBytes(ContourData::kBufferStructure));
static_assert(256 % sizeof(ContourData) == 0);
constexpr static size_t kContourBufferAlignmentInElements = 256 / sizeof(ContourData);
// Per-vertex data for shaders that draw triangles.
struct TriangleVertex
{
public:
TriangleVertex() = default;
TriangleVertex(Vec2D point, int16_t weight, uint16_t pathID) :
m_point(point), m_weight_pathID((static_cast<int32_t>(weight) << 16) | pathID)
{}
#ifdef TESTING
Vec2D testing_point() const { return {m_point.x, m_point.y}; }
int32_t testing_weight_pathID() const { return m_weight_pathID; }
#endif
private:
WRITEONLY Vec2D m_point;
    WRITEONLY int32_t m_weight_pathID; // [weight << 16 | pathID]
};
static_assert(sizeof(TriangleVertex) == sizeof(float) * 3);
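// For example (illustrative values): weight == 1 and pathID == 5 pack to
// m_weight_pathID == 0x00010005.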
// Per-draw uniforms used by image meshes.
struct ImageDrawUniforms
{
public:
ImageDrawUniforms() = default;
ImageDrawUniforms(const Mat2D&,
float opacity,
const ClipRectInverseMatrix*,
uint32_t clipID,
BlendMode,
uint32_t zIndex);
private:
WRITEONLY float m_matrix[6];
WRITEONLY float m_opacity;
WRITEONLY float m_padding = 0;
WRITEONLY float m_clipRectInverseMatrix[6];
WRITEONLY uint32_t m_clipID;
WRITEONLY uint32_t m_blendMode;
WRITEONLY uint32_t m_zIndex; // pls::InterlockMode::depthStencil only.
// Uniform blocks must be multiples of 256 bytes in size.
WRITEONLY uint8_t m_padTo256Bytes[256 - 68];
constexpr void staticChecks()
{
static_assert(offsetof(ImageDrawUniforms, m_matrix) % 16 == 0);
static_assert(offsetof(ImageDrawUniforms, m_clipRectInverseMatrix) % 16 == 0);
static_assert(sizeof(ImageDrawUniforms) == 256);
}
};
#undef WRITEONLY
// The maximum number of storage buffers we will ever use in a vertex or fragment shader.
constexpr static size_t kMaxStorageBuffers = 4;
// If the backend doesn't support "kMaxStorageBuffers" in a shader, we polyfill with textures. This
// function returns the dimensions to use for these textures.
std::tuple<uint32_t, uint32_t> StorageTextureSize(size_t bufferSizeInBytes, StorageBufferStructure);
// If the backend doesn't support "kMaxStorageBuffers" in a shader, we polyfill with textures. The
// polyfill texture needs to be updated in entire rows at a time, meaning, its transfer buffer might
// need to be larger than requested. This function returns a size that is large enough to service a
// worst-case texture update.
size_t StorageTextureBufferSize(size_t bufferSizeInBytes, StorageBufferStructure);
// Represents a block of mapped GPU memory. Since it can be extremely expensive to read mapped
// memory, we use this class to enforce the write-only nature of this memory.
template <typename T> class WriteOnlyMappedMemory
{
public:
WriteOnlyMappedMemory() { reset(); }
WriteOnlyMappedMemory(T* ptr, size_t elementCount) { reset(ptr, elementCount); }
void reset() { reset(nullptr, 0); }
void reset(T* ptr, size_t elementCount)
{
m_mappedMemory = ptr;
m_nextMappedItem = ptr;
m_mappingEnd = ptr + elementCount;
}
using MapResourceBufferFn = void* (PLSRenderContextImpl::*)(size_t mapSizeInBytes);
void mapElements(PLSRenderContextImpl* impl, MapResourceBufferFn mapFn, size_t elementCount)
{
void* ptr = (impl->*mapFn)(elementCount * sizeof(T));
reset(reinterpret_cast<T*>(ptr), elementCount);
}
operator bool() const { return m_mappedMemory; }
// How many bytes have been written to the buffer?
size_t bytesWritten() const
{
return reinterpret_cast<uintptr_t>(m_nextMappedItem) -
reinterpret_cast<uintptr_t>(m_mappedMemory);
}
size_t elementsWritten() const { return bytesWritten() / sizeof(T); }
// Is there room to push() itemCount items to the buffer?
    bool hasRoomFor(size_t itemCount) const
    {
        return m_nextMappedItem + itemCount <= m_mappingEnd;
    }
// Append and write a new item to the buffer. In order to enforce the write-only requirement
// of a mapped buffer, these methods do not return any pointers to the client.
template <typename... Args> RIVE_ALWAYS_INLINE void emplace_back(Args&&... args)
{
new (&push()) T(std::forward<Args>(args)...);
}
template <typename... Args> RIVE_ALWAYS_INLINE void set_back(Args&&... args)
{
push().set(std::forward<Args>(args)...);
}
void push_back_n(const T* values, size_t count)
{
T* dst = push(count);
if (values != nullptr)
{
memcpy(dst, values, count * sizeof(T));
}
}
void skip_back() { push(); }
private:
RIVE_ALWAYS_INLINE T& push()
{
assert(hasRoomFor(1));
return *m_nextMappedItem++;
}
RIVE_ALWAYS_INLINE T* push(size_t count)
{
assert(hasRoomFor(count));
T* ret = m_nextMappedItem;
m_nextMappedItem += count;
return ret;
}
T* m_mappedMemory;
T* m_nextMappedItem;
const T* m_mappingEnd;
};
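// A hypothetical usage sketch (the mapped pointer and capacity below are illustrative):
//
//     WriteOnlyMappedMemory<GradientSpan> spans(mappedGPUPtr, spanCapacity);
//     if (spans.hasRoomFor(1))
//     {
//         spans.set_back(0, 65535, 3.f, 0xffff0000, 0xff0000ff);
//     }
//     size_t bytesToUpload = spans.bytesWritten();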
// Utility for tracking booleans that may be unknown (e.g., lazily computed values, GL state, etc.)
enum class TriState
{
no,
yes,
unknown
};
} // namespace rive::pls