/*
* Copyright 2022 Rive
*/
#pragma once
#include "rive/enum_bitset.hpp"
#include "rive/math/aabb.hpp"
#include "rive/math/mat2d.hpp"
#include "rive/math/vec2d.hpp"
#include "rive/math/simd.hpp"
#include "rive/shapes/paint/blend_mode.hpp"
#include "rive/shapes/paint/color.hpp"
#include "rive/renderer/trivial_block_allocator.hpp"
#include "rive/shapes/paint/image_sampler.hpp"
namespace rive
{
class GrInnerFanTriangulator;
class RenderBuffer;
} // namespace rive
// This header defines constants and data structures for Rive's pixel local
// storage path rendering algorithm.
//
// Main algorithm:
// https://docs.google.com/document/d/19Uk9eyFxav6dNSYsI2ZyiX9zHU1YOaJsMB2sdDFVz6s/edit
//
// Batching multiple unique paths:
// https://docs.google.com/document/d/1DLrQimS5pbNaJJ2sAW5oSOsH6_glwDPo73-mtG5_zns/edit
//
// Batching strokes as well:
// https://docs.google.com/document/d/1CRKihkFjbd1bwT08ErMCP4fwSR7D4gnHvgdw_esY9GM/edit
namespace rive::gpu
{
class Draw;
class Gradient;
class RenderContextImpl;
class RenderTarget;
class Texture;
// Tessellate in parametric space until each segment is within 1/4 pixel of the
// true curve.
constexpr static int kParametricPrecision = 4;
// Tessellate in polar space until the outset edge is within 1/8 pixel of the
// true stroke.
constexpr static int kPolarPrecision = 8;
// Maximum supported numbers of tessellated segments in a single curve.
constexpr static uint32_t kMaxParametricSegments = 1023;
constexpr static uint32_t kMaxPolarSegments = 1023;
// The Gaussian distribution is very blurry on the outer edges. Regardless of
// how wide a feather is, the polar segments never need to have a finer angle
// than this value.
constexpr static float FEATHER_POLAR_SEGMENT_MIN_ANGLE = math::PI / 16;
// cos(FEATHER_POLAR_SEGMENT_MIN_ANGLE / 2)
constexpr static float COS_FEATHER_POLAR_SEGMENT_MIN_ANGLE_OVER_2 =
0.99518472667f;
// We allocate all our GPU buffers in rings. This ensures the CPU can prepare
// frames in parallel while the GPU renders them.
constexpr static int kBufferRingSize = 3;
// Every coverage value in pixel local storage has an associated 16-bit path ID.
// This ID enables us to batch multiple paths together without having to clear
// the coverage buffer in between. Since the ID is stored as an fp16, it cannot
// be NaN (or, conservatively, its 5 exponent bits cannot all be 1's). We also
// skip denormalized values (exp == 0) because they have proven empirically
// unreliable as ID values on Android.
constexpr static int kLargestFP16BeforeExponentAll1s = (0x1f << 10) - 1;
constexpr static int kLargestDenormalizedFP16 = 1023;
constexpr static int MaxPathID(int granularity)
{
// Floating point equality gets funky when the exponent bits are all 1's, so
// the largest pathID we can support is kLargestFP16BeforeExponentAll1s.
//
// The shader converts an integer path ID to fp16 as:
//
// (id + kLargestDenormalizedFP16) * granularity
//
// So the largest path ID we can support is as follows.
return kLargestFP16BeforeExponentAll1s / granularity -
kLargestDenormalizedFP16;
}
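// For reference, a couple of compile-time sanity checks on the formula above
// (these values follow directly from the constants defined in this header):
static_assert(MaxPathID(1) == 30720);
static_assert(MaxPathID(2) == 14848);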
// Each contour has its own unique ID, which it uses to index a data record
// containing per-contour information. This value is currently 16 bit.
constexpr static size_t kMaxContourID = 65535;
constexpr static uint32_t kContourIDMask = 0xffff;
static_assert((kMaxContourID & kContourIDMask) == kMaxContourID);
// Tessellation is performed by rendering vertices into a data texture. These
// values define the dimensions of the tessellation data texture.
constexpr static size_t kTessTextureWidth =
2048; // GL_MAX_TEXTURE_SIZE spec minimum on ES3/WebGL2.
constexpr static size_t kTessTextureWidthLog2 = 11;
static_assert(1 << kTessTextureWidthLog2 == kTessTextureWidth);
// Gradients are implemented by sampling a horizontal ramp of pixels allocated
// in a global gradient texture.
constexpr static uint32_t kGradTextureWidth = 512;
constexpr static uint32_t kGradTextureWidthInSimpleRamps =
kGradTextureWidth / 2;
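// Since a simple color ramp is two texels wide, each gradient texture row can
// hold up to kGradTextureWidthInSimpleRamps (256) simple ramps.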
// Depth/stencil parameters
constexpr static float DEPTH_MIN = 0.0f;
constexpr static float DEPTH_MAX = 1.0f;
constexpr static uint8_t STENCIL_CLEAR = 0u;
// Backend-specific capabilities/workarounds and fine tuning.
struct PlatformFeatures
{
// InterlockMode::rasterOrdering.
bool supportsRasterOrdering = false;
// InterlockMode::atomics.
bool supportsFragmentShaderAtomics = false;
// Experimental rendering mode selected by InterlockMode::clockwiseAtomic.
bool supportsClockwiseAtomicRendering = false;
// Use KHR_blend_equation_advanced in msaa mode?
bool supportsBlendAdvancedKHR = false;
bool supportsBlendAdvancedCoherentKHR = false;
// Required for @ENABLE_CLIP_RECT in msaa mode.
bool supportsClipPlanes = false;
bool avoidFlatVaryings = false;
// Vivo Y21 (PowerVR Rogue GE8320; OpenGL ES 3.2 build 1.13@5776728a) seems
// to hit some sort of reset condition that corrupts pixel local storage
// when rendering a complex feather. Provide a workaround that allows the
// implementation to opt in to always feathering to the atlas instead of
// rendering directly to the screen.
bool alwaysFeatherToAtlas = false;
// clipSpaceBottomUp specifies whether the top of the viewport, in clip
// coordinates, is at Y=+1 (OpenGL, Metal, D3D, WebGPU) or Y=-1 (Vulkan).
//
// framebufferBottomUp specifies whether "row 0" of the framebuffer is the
// bottom of the image (OpenGL) or the top (Metal, D3D, WebGPU, Vulkan).
//
//
// OpenGL
// (clipSpaceBottomUp=true, framebufferBottomUp=true)
//
// Rive Pixel Space Clip Space Framebuffer
//
// 0 -----------> ^ +1 ^ height
// | width | |
// | -1 | +1 |
// | ===> <------|------> ===> |
// | | |
// | | | width
// v height v -1 0 ----------->
//
//
//
// Metal/D3D/WebGPU
// (clipSpaceBottomUp=true, framebufferBottomUp=false)
//
// Rive Pixel Space Clip Space Framebuffer
//
// 0 -----------> ^ +1 0 ----------->
// | width | | width
// | -1 | +1 |
// | ===> <------|------> ===> |
// | | |
// | | |
// v height v -1 v height
//
//
//
// Vulkan
// (clipSpaceBottomUp=false, framebufferBottomUp=false)
//
// Rive Pixel Space Clip Space Framebuffer
//
// 0 -----------> ^ -1 0 ----------->
// | width | | width
// | -1 | +1 |
// | ===> <------|------> ===> |
// | | |
// | | |
// v height v +1 v height
//
bool clipSpaceBottomUp = false;
bool framebufferBottomUp = false;
// Backend cannot initialize PLS with typical clear/load APIs in atomic
// mode. Issue a "DrawType::renderPassInitialize" draw instead.
bool atomicPLSInitNeedsDraw = false;
// Backend API does not support initializing our (transient) MSAA color
// buffer with the existing (non-MSAA) target texture at the beginning of a
// render pass. Draw the previous renderTarget contents into it manually via
// DrawType::renderPassInitialize when LoadAction::preserveRenderTarget is
// specified.
bool msaaColorPreserveNeedsDraw = false;
// Workaround for precision issues. Determines how far apart we space unique
// path IDs when they will be bit-casted to fp16.
uint8_t pathIDGranularity = 1;
// Maximum size (width or height) of a texture.
uint32_t maxTextureSize = 2048;
// Maximum length (in 32-bit uints) of the coverage buffer used for paths in
// clockwiseFill/atomic mode. 2^27 bytes is the minimum storage buffer size
// requirement in the Vulkan, GL, and D3D11 specs. Metal guarantees 256 MB.
size_t maxCoverageBufferLength = (1 << 27) / sizeof(uint32_t);
};
// Gradient color stops are implemented as a horizontal span of pixels in a
// global gradient texture. They are rendered by "GradientSpan" instances.
struct GradientSpan
{
// x0Fixed and x1Fixed are normalized texel x coordinates, in the
// fixed-point range 0..65535.
RIVE_ALWAYS_INLINE void set(uint32_t x0Fixed,
uint32_t x1Fixed,
uint32_t y,
uint32_t flags,
ColorInt color0_,
ColorInt color1_)
{
assert(x0Fixed < 65536);
assert(x1Fixed < 65536);
horizontalSpan = (x1Fixed << 16) | x0Fixed;
yWithFlags = flags | y;
color0 = color0_;
color1 = color1_;
}
uint32_t horizontalSpan;
uint32_t yWithFlags;
uint32_t color0;
uint32_t color1;
};
static_assert(sizeof(GradientSpan) == sizeof(uint32_t) * 4);
static_assert(256 % sizeof(GradientSpan) == 0);
// Metal requires vertex buffers to be 256-byte aligned.
constexpr static size_t kGradSpanBufferAlignmentInElements =
256 / sizeof(GradientSpan);
// Gradient spans are drawn as 1px-tall triangle strips with 3 sub-rectangles.
constexpr uint32_t GRAD_SPAN_TRI_STRIP_VERTEX_COUNT = 8;
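// Example usage (a sketch only; the exact fixed-point rounding the renderer
// uses when converting normalized coordinates may differ): emit a span
// covering the normalized x range [0.25, 0.75] on row 7, with no flags:
//
//   GradientSpan span;
//   span.set(static_cast<uint32_t>(0.25f * 65535.f + .5f), // x0Fixed
//            static_cast<uint32_t>(0.75f * 65535.f + .5f), // x1Fixed
//            /*y=*/7,
//            /*flags=*/0,
//            /*color0_=*/0xff0000ff,
//            /*color1_=*/0xffffffff);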
// Each curve gets tessellated into vertices. This is performed by rendering a
// horizontal span of positions and normals into the tessellation data texture,
// GP-GPU style. TessVertexSpan defines one instance of a horizontal
// tessellation span for rendering.
//
// Each span has an optional reflection, rendered right to left, with the same
// vertices in reverse order. These are used to draw mirrored patches with
// negative coverage when we have back-face culling enabled. This emits every
// triangle twice, once clockwise and once counterclockwise, and back-face
// culling naturally selects the triangle with the appropriately signed coverage
// (discarding the other).
struct TessVertexSpan
{
RIVE_ALWAYS_INLINE void set(const Vec2D pts_[4],
Vec2D joinTangent_,
float y_,
int32_t x0,
int32_t x1,
uint32_t parametricSegmentCount,
uint32_t polarSegmentCount,
uint32_t joinSegmentCount,
uint32_t contourIDWithFlags_)
{
set(pts_,
joinTangent_,
y_,
x0,
x1,
std::numeric_limits<float>::quiet_NaN(), // Discard the reflection.
-1,
-1,
parametricSegmentCount,
polarSegmentCount,
joinSegmentCount,
contourIDWithFlags_);
}
RIVE_ALWAYS_INLINE void set(const Vec2D pts_[4],
Vec2D joinTangent_,
float y_,
int32_t x0,
int32_t x1,
float reflectionY_,
int32_t reflectionX0,
int32_t reflectionX1,
uint32_t parametricSegmentCount,
uint32_t polarSegmentCount,
uint32_t joinSegmentCount,
uint32_t contourIDWithFlags_)
{
#ifndef NDEBUG
// Write to an intermediate local object in debug mode, so we can check
// its values. (Otherwise we can't read it because mapped memory is
// write-only.)
TessVertexSpan localCopy;
#define LOCAL(VAR) localCopy.VAR
#else
#define LOCAL(VAR) VAR
#endif
RIVE_INLINE_MEMCPY(LOCAL(pts), pts_, sizeof(LOCAL(pts)));
LOCAL(joinTangent) = joinTangent_;
LOCAL(y) = y_;
LOCAL(reflectionY) = reflectionY_;
LOCAL(x0x1) = (x1 << 16 | (x0 & 0xffff));
LOCAL(reflectionX0X1) = (reflectionX1 << 16 | (reflectionX0 & 0xffff));
LOCAL(segmentCounts) = (joinSegmentCount << 20) |
(polarSegmentCount << 10) |
parametricSegmentCount;
LOCAL(contourIDWithFlags) = contourIDWithFlags_;
#undef LOCAL
// Ensure we didn't lose any data from packing.
assert(localCopy.x0x1 << 16 >> 16 == x0);
assert(localCopy.x0x1 >> 16 == x1);
assert(localCopy.reflectionX0X1 << 16 >> 16 == reflectionX0);
assert(localCopy.reflectionX0X1 >> 16 == reflectionX1);
assert((localCopy.segmentCounts & 0x3ff) == parametricSegmentCount);
assert(((localCopy.segmentCounts >> 10) & 0x3ff) == polarSegmentCount);
assert(localCopy.segmentCounts >> 20 == joinSegmentCount);
#ifndef NDEBUG
memcpy(this, &localCopy, sizeof(*this));
#endif
}
Vec2D pts[4]; // Cubic bezier curve.
Vec2D joinTangent; // Ending tangent of the join that follows the cubic.
float y;
float reflectionY;
int32_t x0x1;
int32_t reflectionX0X1;
uint32_t segmentCounts; // [joinSegmentCount, polarSegmentCount,
// parametricSegmentCount]
uint32_t contourIDWithFlags; // flags | contourID
};
static_assert(sizeof(TessVertexSpan) == sizeof(float) * 16);
static_assert(256 % sizeof(TessVertexSpan) == 0);
// Metal requires vertex buffers to be 256-byte aligned.
constexpr static size_t kTessVertexBufferAlignmentInElements =
256 / sizeof(TessVertexSpan);
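// The 10-bit packed fields in TessVertexSpan::segmentCounts must be able to
// hold the maximum supported segment counts.
static_assert(kMaxParametricSegments <= 0x3ff);
static_assert(kMaxPolarSegments <= 0x3ff);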
// Tessellation spans are drawn as two distinct, 1px-tall rectangles: the span
// and its reflection.
constexpr uint16_t kTessSpanIndices[4 * 3] =
{0, 1, 2, 2, 1, 3, 4, 5, 6, 6, 5, 7};
// ImageRects are a special type of non-overlapping antialiased draw that we
// only have to use in atomic mode. They allow us to bind a texture and draw it
// in its entirety in a single pass.
struct ImageRectVertex
{
float x;
float y;
float aaOffsetX;
float aaOffsetY;
};
constexpr ImageRectVertex kImageRectVertices[12] = {
{0, 0, .0, -1},
{1, 0, .0, -1},
{1, 0, +1, .0},
{1, 1, +1, .0},
{1, 1, .0, +1},
{0, 1, .0, +1},
{0, 1, -1, .0},
{0, 0, -1, .0},
{0, 0, +1, +1},
{1, 0, -1, +1},
{1, 1, -1, -1},
{0, 1, +1, -1},
};
constexpr uint16_t kImageRectIndices[14 * 3] = {
8, 0, 9, 9, 0, 1, 1, 2, 9, 9, 2, 10, 10, 2, 3, 3, 4, 10, 10, 4, 11,
11, 4, 5, 5, 6, 11, 11, 6, 8, 8, 6, 7, 7, 0, 8, 9, 10, 8, 10, 8, 11,
};
enum class PaintType : uint32_t
{
clipUpdate, // Update the clip buffer instead of drawing to the framebuffer.
solidColor,
linearGradient,
radialGradient,
image,
};
// Specifies the location of a simple or complex horizontal color ramp within
// the gradient texture. A simple color ramp is two texels wide, beginning at
// the specified row and column. A complex color ramp spans the entire width of
// the gradient texture, on the row:
// "GradTextureLayout::complexOffsetY + ColorRampLocation::row".
struct ColorRampLocation
{
constexpr static uint16_t kComplexGradientMarker = 0xffff;
bool isComplex() const { return col == kComplexGradientMarker; }
uint16_t row;
uint16_t col;
};
// Most of a paint's information can be described in a single value. Gradients
// and images reference an additional Gradient* and Texture* respectively.
union SimplePaintValue
{
ColorInt color = 0xff000000; // PaintType::solidColor
ColorRampLocation colorRampLocation; // PaintType::linear/radialGradient
float imageOpacity; // PaintType::image
uint32_t outerClipID; // PaintType::clipUpdate
};
static_assert(sizeof(SimplePaintValue) == 4);
// This class encapsulates a matrix that maps from _fragCoord to a space where
// the clipRect is the normalized rectangle: [-1, -1, +1, +1]
class ClipRectInverseMatrix
{
public:
// When the clipRect inverse matrix is singular (e.g., all 0 in scale and
// skew), the shader uses tx and ty as fixed clip coverage values instead of
// finding edge distances.
constexpr static ClipRectInverseMatrix WideOpen()
{
return Mat2D{0, 0, 0, 0, 1, 1};
}
constexpr static ClipRectInverseMatrix Empty()
{
return Mat2D{0, 0, 0, 0, 0, 0};
}
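// (WideOpen() therefore resolves to constant full coverage (tx = ty = 1),
// and Empty() to constant zero coverage (tx = ty = 0).)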
ClipRectInverseMatrix() = default;
ClipRectInverseMatrix(const Mat2D& clipMatrix, const AABB& clipRect)
{
reset(clipMatrix, clipRect);
}
void reset(const Mat2D& clipMatrix, const AABB& clipRect);
const Mat2D& inverseMatrix() const { return m_inverseMatrix; }
private:
constexpr ClipRectInverseMatrix(const Mat2D& inverseMatrix) :
m_inverseMatrix(inverseMatrix)
{}
Mat2D m_inverseMatrix;
};
// Specifies the height of the gradient texture, and the row at which we
// transition from simple color ramps to complex.
//
// This information is computed at flush time, once we know exactly how many
// color ramps of each type will be in the gradient texture.
struct GradTextureLayout
{
uint32_t complexOffsetY; // Row of the first complex gradient.
float inverseHeight; // 1 / textureHeight
};
// Once all curves in a contour have been tessellated, we render the tessellated
// vertices in "patches" (aka specific instanced geometry).
//
// See:
// https://docs.google.com/document/d/19Uk9eyFxav6dNSYsI2ZyiX9zHU1YOaJsMB2sdDFVz6s/edit#heading=h.fa4kubk3vimk
//
// With strokes:
// https://docs.google.com/document/d/1CRKihkFjbd1bwT08ErMCP4fwSR7D4gnHvgdw_esY9GM/edit#heading=h.dcd0c58pxfs5
//
// A single patch spans N tessellation segments, connecting N + 1 tessellation
// vertices. It is composed of an AA border and fan triangles. The specifics
// of the fan triangles depend on the PatchType.
enum class PatchType
{
// Patches fan around the contour midpoint. Outer edges are inset by ~1px,
// followed by a ~1px AA ramp.
midpointFan,
// Similar to midpointFan, except AA ramps are split down the center and
// drawn with a ~1/2px outset AA ramp and a ~1/2px inset AA ramp that
// overlaps the inner tessellation and has negative coverage.
midpointFanCenterAA,
// Patches only cover the AA ramps and interiors of bezier curves. The
// interior path triangles that connect the outer curves are triangulated on
// the CPU to eliminate overlap, and are drawn in a separate call. AA ramps
// are split down the center (on the same lines as the interior
// triangulation), and drawn with a ~1/2px outset AA ramp and a ~1/2px inset
// AA ramp that overlaps the inner tessellation and has negative coverage. A
// lone bowtie join is emitted at the end of the patch to tie the outer
// curves together.
outerCurves,
};
// When tessellating path vertices, we have the ability to generate the
// triangles wound in forward or reverse order. Depending on the path and the
// rendering algorithm, we will either want the triangles wound forward,
// reverse, or BOTH.
enum class ContourDirections
{
forward,
reverse,
// Generate two tessellations of the contour: reverse first, then forward.
reverseThenForward,
// Generate two tessellations of the contour: forward first, then reverse.
forwardThenReverse,
};
constexpr static bool ContourDirectionsAreDoubleSided(
ContourDirections contourDirections)
{
return contourDirections >= ContourDirections::reverseThenForward;
}
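// Compile-time check that the two double-tessellation modes, and only those,
// report as double sided.
static_assert(!ContourDirectionsAreDoubleSided(ContourDirections::forward));
static_assert(!ContourDirectionsAreDoubleSided(ContourDirections::reverse));
static_assert(ContourDirectionsAreDoubleSided(ContourDirections::reverseThenForward));
static_assert(ContourDirectionsAreDoubleSided(ContourDirections::forwardThenReverse));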
struct PatchVertex
{
void set(float localVertexID_,
float outset_,
float fillCoverage_,
float params_)
{
localVertexID = localVertexID_;
outset = outset_;
fillCoverage = fillCoverage_;
params = params_;
setMirroredPosition(localVertexID_, outset_, fillCoverage_);
}
// Patch vertices can have an optional, alternate position when mirrored.
// This is so we can ensure the diagonals inside the stroke line up on both
// versions of the patch (mirrored and not).
void setMirroredPosition(float localVertexID_,
float outset_,
float fillCoverage_)
{
mirroredVertexID = localVertexID_;
mirroredOutset = outset_;
mirroredFillCoverage = fillCoverage_;
}
float localVertexID; // 0 or 1 -- which tessellated vertex of the two that
// we are connecting?
float outset; // Outset from the tessellated position, in the direction of
// the normal.
float fillCoverage; // 0..1 for the stroke. 1 all around for the triangles.
// (Coverage will be negated later for counterclockwise
// triangles.)
int32_t params; // "(patchSize << 2) | [flags::kStrokeVertex,
// flags::kFanVertex,
// flags::kFanMidpointVertex]"
float mirroredVertexID;
float mirroredOutset;
float mirroredFillCoverage;
int32_t padding = 0;
};
static_assert(sizeof(PatchVertex) == sizeof(float) * 8);
// # of tessellation segments spanned by the midpoint fan patch.
constexpr static uint32_t kMidpointFanPatchSegmentSpan = 8;
// # of tessellation segments spanned by the outer curve patch. (In this
// particular instance, the final segment is a bowtie join with zero length and
// no fan triangle.)
constexpr static uint32_t kOuterCurvePatchSegmentSpan = 17;
// Define vertex and index buffers that contain all the triangles in every
// PatchType.
constexpr static uint32_t kMidpointFanPatchVertexCount =
kMidpointFanPatchSegmentSpan * 4 /*Stroke and/or AA outer ramp*/ +
(kMidpointFanPatchSegmentSpan + 1) /*Curve fan*/ +
1 /*Triangle from path midpoint*/;
constexpr static uint32_t kMidpointFanPatchBorderIndexCount =
kMidpointFanPatchSegmentSpan * 6 /*Stroke and/or AA outer ramp*/;
constexpr static uint32_t kMidpointFanPatchIndexCount =
kMidpointFanPatchBorderIndexCount /*Stroke and/or AA outer ramp*/ +
(kMidpointFanPatchSegmentSpan - 1) * 3 /*Curve fan*/ +
3 /*Triangle from path midpoint*/;
constexpr static uint32_t kMidpointFanPatchBaseIndex = 0;
static_assert((kMidpointFanPatchBaseIndex * sizeof(uint16_t)) % 4 == 0);
constexpr static uint32_t kMidpointFanCenterAAPatchVertexCount =
kMidpointFanPatchSegmentSpan * 4 * 2 /*Stroke and/or AA outer ramp*/ +
(kMidpointFanPatchSegmentSpan + 1) /*Curve fan*/ +
1 /*Triangle from path midpoint*/;
constexpr static uint32_t kMidpointFanCenterAAPatchBorderIndexCount =
kMidpointFanPatchSegmentSpan * 12 /*Stroke and/or AA outer ramp*/;
constexpr static uint32_t kMidpointFanCenterAAPatchIndexCount =
kMidpointFanCenterAAPatchBorderIndexCount /*Stroke and/or AA outer ramp*/ +
(kMidpointFanPatchSegmentSpan - 1) * 3 /*Curve fan*/ +
3 /*Triangle from path midpoint*/;
constexpr static uint32_t kMidpointFanCenterAAPatchBaseIndex =
kMidpointFanPatchBaseIndex + kMidpointFanPatchIndexCount;
static_assert((kMidpointFanCenterAAPatchBaseIndex * sizeof(uint16_t)) % 4 == 0);
constexpr static uint32_t kOuterCurvePatchVertexCount =
kOuterCurvePatchSegmentSpan * 8 /*AA center ramp with bowtie*/ +
kOuterCurvePatchSegmentSpan /*Curve fan*/;
constexpr static uint32_t kOuterCurvePatchBorderIndexCount =
kOuterCurvePatchSegmentSpan * 12 /*AA center ramp with bowtie*/;
constexpr static uint32_t kOuterCurvePatchIndexCount =
kOuterCurvePatchBorderIndexCount /*AA center ramp with bowtie*/ +
(kOuterCurvePatchSegmentSpan - 2) * 3 /*Curve fan*/;
constexpr static uint32_t kOuterCurvePatchBaseIndex =
kMidpointFanCenterAAPatchBaseIndex + kMidpointFanCenterAAPatchIndexCount;
static_assert((kOuterCurvePatchBaseIndex * sizeof(uint16_t)) % 4 == 0);
constexpr static uint32_t kPatchVertexBufferCount =
kMidpointFanPatchVertexCount + kMidpointFanCenterAAPatchVertexCount +
kOuterCurvePatchVertexCount;
constexpr static uint32_t kPatchIndexBufferCount =
kMidpointFanPatchIndexCount + kMidpointFanCenterAAPatchIndexCount +
kOuterCurvePatchIndexCount;
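// For reference, with the current patch segment spans the above evaluate to:
static_assert(kMidpointFanPatchVertexCount == 42);
static_assert(kMidpointFanPatchIndexCount == 72);
static_assert(kMidpointFanCenterAAPatchVertexCount == 74);
static_assert(kMidpointFanCenterAAPatchIndexCount == 120);
static_assert(kOuterCurvePatchVertexCount == 153);
static_assert(kOuterCurvePatchIndexCount == 249);
static_assert(kPatchVertexBufferCount == 269);
static_assert(kPatchIndexBufferCount == 441);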
void GeneratePatchBufferData(PatchVertex[kPatchVertexBufferCount],
uint16_t indices[kPatchIndexBufferCount]);
enum class DrawType : uint8_t
{
// Fills, strokes, feathered strokes.
midpointFanPatches,
// Feathered fills.
midpointFanCenterAAPatches,
// Just the outer curves of a path; the interior will be triangulated.
outerCurvePatches,
interiorTriangulation,
atlasBlit,
imageRect,
imageMesh,
// MSAA strokes can't be merged with fills because they require their own
// dedicated stencil settings.
msaaStrokes,
// MSAA "fast" path: (effectively) single pass rendering.
msaaMidpointFanBorrowedCoverage,
msaaMidpointFans,
msaaMidpointFanStencilReset,
// MSAA "slow" path: stencil-then-cover.
msaaMidpointFanPathsStencil,
msaaMidpointFanPathsCover,
// MSAA interior triangulation is not currently supported, but this one draw
// type is included in order to support the "retrofittedcubictriangles" GM.
msaaOuterCubics,
// Clear or intersect (based on DrawContents) the stencil clip bit.
msaaStencilClipReset,
// Clear/init render pass data with a fullscreen draw when we can't do it
// with existing clear/load APIs. (e.g., for pixel local storage in buffers
// that don't have copy/clear commands, or preserving existing color data in
// a transient MSAA attachment).
renderPassInitialize,
// Resolve render pass data (e.g., by applying the final deferred color in
// atomic mode, or copying an offscreen attachment to the final
// renderTarget).
renderPassResolve,
};
constexpr static bool DrawTypeIsImageDraw(DrawType drawType)
{
switch (drawType)
{
case DrawType::imageRect:
case DrawType::imageMesh:
return true;
case DrawType::midpointFanPatches:
case DrawType::midpointFanCenterAAPatches:
case DrawType::outerCurvePatches:
case DrawType::interiorTriangulation:
case DrawType::atlasBlit:
case DrawType::msaaStrokes:
case DrawType::msaaMidpointFanBorrowedCoverage:
case DrawType::msaaMidpointFans:
case DrawType::msaaMidpointFanStencilReset:
case DrawType::msaaMidpointFanPathsStencil:
case DrawType::msaaMidpointFanPathsCover:
case DrawType::msaaOuterCubics:
case DrawType::msaaStencilClipReset:
case DrawType::renderPassInitialize:
case DrawType::renderPassResolve:
return false;
}
RIVE_UNREACHABLE();
}
constexpr static uint32_t PatchIndexCount(DrawType drawType)
{
switch (drawType)
{
case DrawType::midpointFanPatches:
return kMidpointFanPatchIndexCount;
case DrawType::midpointFanCenterAAPatches:
return kMidpointFanCenterAAPatchIndexCount;
case DrawType::outerCurvePatches:
return kOuterCurvePatchIndexCount;
case DrawType::msaaStrokes:
return kMidpointFanPatchBorderIndexCount;
case DrawType::msaaMidpointFanBorrowedCoverage:
case DrawType::msaaMidpointFans:
case DrawType::msaaMidpointFanStencilReset:
case DrawType::msaaMidpointFanPathsStencil:
case DrawType::msaaMidpointFanPathsCover:
return kMidpointFanPatchIndexCount -
kMidpointFanPatchBorderIndexCount;
case DrawType::msaaOuterCubics:
return kOuterCurvePatchIndexCount -
kOuterCurvePatchBorderIndexCount;
case DrawType::interiorTriangulation:
case DrawType::atlasBlit:
case DrawType::imageRect:
case DrawType::imageMesh:
case DrawType::msaaStencilClipReset:
case DrawType::renderPassInitialize:
case DrawType::renderPassResolve:
RIVE_UNREACHABLE();
}
RIVE_UNREACHABLE();
}
constexpr static uint32_t PatchBaseIndex(DrawType drawType)
{
switch (drawType)
{
case DrawType::midpointFanPatches:
case DrawType::msaaStrokes:
return kMidpointFanPatchBaseIndex;
case DrawType::midpointFanCenterAAPatches:
return kMidpointFanCenterAAPatchBaseIndex;
case DrawType::outerCurvePatches:
return kOuterCurvePatchBaseIndex;
case DrawType::msaaMidpointFanBorrowedCoverage:
case DrawType::msaaMidpointFans:
case DrawType::msaaMidpointFanStencilReset:
case DrawType::msaaMidpointFanPathsStencil:
case DrawType::msaaMidpointFanPathsCover:
return kMidpointFanPatchBaseIndex +
kMidpointFanPatchBorderIndexCount;
case DrawType::msaaOuterCubics:
return kOuterCurvePatchBaseIndex + kOuterCurvePatchBorderIndexCount;
case DrawType::interiorTriangulation:
case DrawType::atlasBlit:
case DrawType::imageRect:
case DrawType::imageMesh:
case DrawType::msaaStencilClipReset:
case DrawType::renderPassInitialize:
case DrawType::renderPassResolve:
RIVE_UNREACHABLE();
}
RIVE_UNREACHABLE();
}
// Specifies what to do with the render target at the beginning of a flush.
enum class LoadAction
{
clear,
preserveRenderTarget,
dontCare,
};
// Synchronization method for pixel local storage with overlapping fragments.
enum class InterlockMode
{
rasterOrdering,
atomics,
// Use an experimental path rendering algorithm that utilizes atomics
// without barriers. This requires that we override all paths' fill rules
// (winding or even/odd) with a "clockwise" fill rule, where only regions
// with a positive winding number get filled.
clockwiseAtomic,
msaa,
};
constexpr static size_t kInterlockModeCount = 4;
// Low-level batch of scissored geometry for rendering to the offscreen atlas.
struct AtlasDrawBatch
{
TAABB<uint16_t> scissor;
uint32_t patchCount;
uint32_t basePatch;
};
// "Uber shader" features that can be #defined in a draw shader.
// This set is strictly limited to switches that don't *change* the behavior of
// the shader, i.e., turning them all on will enable all types of Rive content,
// but simple content will still draw identically; for better performance, we
// can turn a feature off when we know a batch doesn't need it.
enum class ShaderFeatures
{
NONE = 0,
// Whole program features.
ENABLE_CLIPPING = 1 << 0,
ENABLE_CLIP_RECT = 1 << 1,
ENABLE_ADVANCED_BLEND = 1 << 2,
ENABLE_FEATHER = 1 << 3,
// Fragment-only features.
ENABLE_EVEN_ODD = 1 << 4,
ENABLE_NESTED_CLIPPING = 1 << 5,
ENABLE_HSL_BLEND_MODES = 1 << 6,
};
RIVE_MAKE_ENUM_BITSET(ShaderFeatures)
constexpr static size_t kShaderFeatureCount = 7;
constexpr static ShaderFeatures kAllShaderFeatures =
static_cast<gpu::ShaderFeatures>((1 << kShaderFeatureCount) - 1);
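// kShaderFeatureCount must stay in sync with the highest feature bit above.
static_assert(static_cast<uint32_t>(ShaderFeatures::ENABLE_HSL_BLEND_MODES) ==
1u << (kShaderFeatureCount - 1));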
constexpr static ShaderFeatures kVertexShaderFeaturesMask =
ShaderFeatures::ENABLE_CLIPPING | ShaderFeatures::ENABLE_CLIP_RECT |
ShaderFeatures::ENABLE_ADVANCED_BLEND | ShaderFeatures::ENABLE_FEATHER;
// These shader features change the way atomic pipelines are set up (or cause
// validation failures when enabled but not used)
constexpr static ShaderFeatures kExclusiveAtomicUbershaderFeaturesMask =
ShaderFeatures::ENABLE_ADVANCED_BLEND;
constexpr static ShaderFeatures ShaderFeaturesMaskFor(
InterlockMode interlockMode)
{
switch (interlockMode)
{
case InterlockMode::rasterOrdering:
return kAllShaderFeatures;
case InterlockMode::atomics:
return kAllShaderFeatures & ~ShaderFeatures::ENABLE_NESTED_CLIPPING;
case InterlockMode::clockwiseAtomic:
// TODO: shader features aren't fully implemented yet in
// clockwiseAtomic mode.
return ShaderFeatures::ENABLE_FEATHER;
case InterlockMode::msaa:
return ShaderFeatures::ENABLE_CLIP_RECT |
ShaderFeatures::ENABLE_ADVANCED_BLEND |
ShaderFeatures::ENABLE_HSL_BLEND_MODES;
}
RIVE_UNREACHABLE();
}
// Miscellaneous switches that *do* affect the behavior of the fragment shader.
// The renderContext may add some of these, and a backend may also add them to a
// shader key if it wants to implement the behavior.
enum class ShaderMiscFlags : uint32_t
{
none = 0,
// InterlockMode::atomics only (without advanced blend). Render color to a
// standard attachment instead of PLS. The backend implementation is
// responsible for enabling src-over blending.
fixedFunctionColorOutput = 1 << 0,
// Override all paths' fill rules (winding or even/odd) with an experimental
// "clockwise" fill rule, where only regions with a positive winding number
// get filled.
clockwiseFill = 1 << 1,
// clockwiseAtomic mode only. This shader is a prepass that only subtracts
// (counterclockwise) borrowed coverage from the coverage buffer. It doesn't
// output color or clip.
borrowedCoveragePrepass = 1 << 2,
// DrawType::renderPassInitialize only. Also store the color clear value to
// PLS when drawing a clear, in addition to clearing the other PLS planes.
storeColorClear = 1 << 3,
// DrawType::renderPassInitialize only. Swizzle the existing framebuffer
// contents from BGRA to RGBA. (For when this data had to get copied from a
// BGRA target.)
swizzleColorBGRAToRGBA = 1 << 4,
// DrawType::renderPassResolve only. Optimization for when rendering to an
// offscreen texture.
//
// It renders the final "resolve" operation directly to the renderTarget in
// a single pass, instead of (1) resolving the offscreen texture, and then
// (2) copying the offscreen texture back to the renderTarget.
coalescedResolveAndTransfer = 1 << 5,
};
RIVE_MAKE_ENUM_BITSET(ShaderMiscFlags)
constexpr static ShaderFeatures ShaderFeaturesMaskFor(
DrawType drawType,
InterlockMode interlockMode)
{
ShaderFeatures mask = ShaderFeatures::NONE;
switch (drawType)
{
case DrawType::imageRect:
case DrawType::imageMesh:
case DrawType::atlasBlit:
if (interlockMode != gpu::InterlockMode::atomics)
{
mask = ShaderFeatures::ENABLE_CLIPPING |
ShaderFeatures::ENABLE_CLIP_RECT |
ShaderFeatures::ENABLE_ADVANCED_BLEND |
ShaderFeatures::ENABLE_HSL_BLEND_MODES;
break;
}
// Since atomic mode has to resolve previous draws, images need to
// consider the same shader features as path draws.
[[fallthrough]];
case DrawType::midpointFanPatches:
case DrawType::midpointFanCenterAAPatches:
case DrawType::outerCurvePatches:
case DrawType::interiorTriangulation:
case DrawType::msaaStrokes:
case DrawType::msaaMidpointFanBorrowedCoverage:
case DrawType::msaaMidpointFans:
case DrawType::msaaMidpointFanStencilReset:
case DrawType::msaaMidpointFanPathsStencil:
case DrawType::msaaMidpointFanPathsCover:
case DrawType::msaaOuterCubics:
mask = kAllShaderFeatures;
break;
case DrawType::msaaStencilClipReset:
mask = ShaderFeatures::NONE;
break;
case DrawType::renderPassInitialize:
if (interlockMode == gpu::InterlockMode::atomics)
{
// Atomic mode initializes clipping and color (when advanced
// blend is active).
mask = ShaderFeatures::ENABLE_CLIPPING |
ShaderFeatures::ENABLE_ADVANCED_BLEND;
}
else
{
assert(interlockMode == gpu::InterlockMode::msaa);
// MSAA mode only needs to initialize color, and only when
// preserving the render target but using a transient MSAA
// attachment.
mask = ShaderFeatures::NONE;
}
break;
case DrawType::renderPassResolve:
assert(interlockMode == gpu::InterlockMode::atomics);
mask = kAllShaderFeatures;
break;
}
return mask & ShaderFeaturesMaskFor(interlockMode);
}
// Returns the flags that are valid for an ubershader version of the currently-
// requested shader feature set. There are some shader features that change
// how the render passes are set up in atomic mode that need to be accounted
// for beyond just using ShaderFeaturesMaskFor.
constexpr static ShaderFeatures UbershaderFeaturesMaskFor(
ShaderFeatures requestedFeatures,
DrawType drawType,
InterlockMode interlockMode,
const PlatformFeatures& platformFeatures)
{
ShaderFeatures outFeatures = ShaderFeaturesMaskFor(drawType, interlockMode);
if (interlockMode == InterlockMode::atomics)
{
// Turn off the exclusive atomic features unless they're set in our
// requested feature flags.
outFeatures &=
(requestedFeatures | ~kExclusiveAtomicUbershaderFeaturesMask);
}
// Ensure that we haven't dropped features we care about somehow
assert((requestedFeatures & outFeatures) == requestedFeatures);
// ENABLE_CLIP_RECT shouldn't be set if we're in MSAA mode without clip
// plane support.
if (interlockMode == InterlockMode::msaa &&
!platformFeatures.supportsClipPlanes)
{
outFeatures &= ~ShaderFeatures::ENABLE_CLIP_RECT;
}
return outFeatures;
}
// Returns a unique value that can be used to key a shader.
uint32_t ShaderUniqueKey(DrawType,
ShaderFeatures,
InterlockMode,
ShaderMiscFlags);
extern const char* GetShaderFeatureGLSLName(ShaderFeatures feature);
// Flags indicating the contents of a draw. These don't affect shaders, but in
// msaa mode they are needed to break up batching. (msaa needs different
// stencil/blend state, depending on the DrawContents.)
//
// These also affect the draw sort order, so we attempt to associate more
// expensive shader branch misses with higher flags.
enum class DrawContents
{
none = 0,
opaquePaint = 1 << 0,
// Put feathered fills down low because they only need to draw different
// geometry, which isn't really a context switch at all.
featheredFill = 1 << 1,
stroke = 1 << 2,
clockwiseFill = 1 << 3,
nonZeroFill = 1 << 4,
evenOddFill = 1 << 5,
activeClip = 1 << 6,
clipUpdate = 1 << 7,
advancedBlend = 1 << 8,
};
RIVE_MAKE_ENUM_BITSET(DrawContents)
// A nestedClip draw updates the clip buffer while simultaneously clipping
// against the outerClip that is currently in the clip buffer.
constexpr static gpu::DrawContents kNestedClipUpdateMask =
(gpu::DrawContents::activeClip | gpu::DrawContents::clipUpdate);
// Types of barriers that may be required between DrawBatches.
enum class BarrierFlags : uint8_t
{
none = 0,
// Pixel-local dependency in the PLS planes. (Atomic mode only.) Ensure
// prior draws complete at each pixel before beginning new ones.
plsAtomic = 1 << 0,
plsAtomicPreResolve = 1 << 1, // Once before the final resolve.
// MSAA needs a special barrier (e.g., subpass transition) after manually
// loading the render target into the transient MSAA attachment.
msaaPostInit = 1 << 2,
// Pixel-local dependency in the coverage buffer. (clockwiseAtomic mode
// only.) All "borrowed coverage" draws have now been issued. Ensure they
// complete at each pixel before beginning the "forward coverage" draws.
clockwiseBorrowedCoverage = 1 << 3,
// The next DrawBatch needs to perform an advanced blend, but the current
// hardware requires an implementation-dependent barrier before reading the
// dstColor (pipeline barrier for input attachments, KHR blend barrier, or
// even a full MSAA resolve & blit into a separate texture.)
dstBlend = 1 << 4,
// Only prevent future DrawBatches from being combined with the current
// drawList. (No GPU dependencies.)
drawBatchBreak = 1 << 5,
};
RIVE_MAKE_ENUM_BITSET(BarrierFlags);
// Low-level batch of geometry to submit to the GPU.
struct DrawBatch
{
DrawBatch(DrawType drawType_,
gpu::ShaderMiscFlags shaderMiscFlags_,
uint32_t elementCount_,
uint32_t baseElement_,
rive::BlendMode blendMode_,
rive::ImageSampler imageSampler_,
BarrierFlags barrierFlags_) :
drawType(drawType_),
shaderMiscFlags(shaderMiscFlags_),
elementCount(elementCount_),
baseElement(baseElement_),
firstBlendMode(blendMode_),
barriers(barrierFlags_),
imageSampler(imageSampler_)
{}
const DrawType drawType;
const ShaderMiscFlags shaderMiscFlags;
uint32_t elementCount; // Vertex, index, or instance count.
uint32_t baseElement; // Base vertex, index, or instance.
rive::BlendMode firstBlendMode;
BarrierFlags barriers; // Barriers to execute before drawing this batch.
DrawContents drawContents = DrawContents::none;
ShaderFeatures shaderFeatures = ShaderFeatures::NONE;
// DrawType::imageRect and DrawType::imageMesh.
uint32_t imageDrawDataOffset = 0;
Texture* imageTexture = nullptr;
const ImageSampler imageSampler = ImageSampler::LinearClamp();
// DrawType::imageMesh.
RenderBuffer* vertexBuffer;
RenderBuffer* uvBuffer;
RenderBuffer* indexBuffer;
// When shaders don't have a mechanism to read the framebuffer (e.g.,
// WebGL msaa), this is a linked list of all the draws in the batch whose
// bounding boxes need to be blitted to the "dstRead" texture before
// drawing.
const Draw* dstReadList = nullptr;
};
// Simple gradients only have 2 texels, so we write them to mapped texture
// memory from the CPU instead of rendering them.
struct TwoTexelRamp
{
ColorInt color0, color1;
};
static_assert(sizeof(TwoTexelRamp) == 8 * sizeof(uint8_t));
#ifdef WITH_RIVE_TOOLS
enum class SynthesizedFailureType
{
none,
ubershaderLoad,
shaderCompilation,
pipelineCreation,
};
#endif
// Detailed description of exactly how a RenderContextImpl should bind its
// buffers and draw a flush. A typical flush is done in 4 steps:
//
// 1. Render the complex gradients from the gradSpanBuffer to the gradient
// texture (gradSpanCount, firstComplexGradSpan, complexGradRowsTop,
// complexGradRowsHeight).
//
// 2. Transfer the simple gradient texels from the simpleColorRampsBuffer to
// the top of the gradient texture (simpleGradTexelsWidth,
// simpleGradTexelsHeight, simpleGradDataOffsetInBytes, tessDataHeight).
//
// 3. Render the tessellation texture from the tessVertexSpanBuffer
// (tessVertexSpanCount, firstTessVertexSpan).
//
// 4. Execute the drawList, reading from the newly rendered resource textures.
//
struct FlushDescriptor
{
RenderTarget* renderTarget = nullptr;
ShaderFeatures combinedShaderFeatures = ShaderFeatures::NONE;
InterlockMode interlockMode = InterlockMode::rasterOrdering;
// Atomic mode only: there are no advanced blend modes, so we can render
// directly to the main target with fixed function (src-over) blending.
bool atomicFixedFunctionColorOutput = false;
int msaaSampleCount = 0; // (0 unless interlockMode is msaa.)
LoadAction colorLoadAction = LoadAction::clear;
ColorInt colorClearValue = 0; // When loadAction == LoadAction::clear.
uint32_t coverageClearValue = 0;
float depthClearValue = DEPTH_MAX;
uint8_t stencilClearValue = STENCIL_CLEAR;
IAABB renderTargetUpdateBounds; // drawBounds, or renderTargetBounds if
// loadAction == LoadAction::clear.
// Physical size of the atlas texture.
uint16_t atlasTextureWidth;
uint16_t atlasTextureHeight;
// Boundaries of the content for this specific flush within the atlas
// texture.
uint16_t atlasContentWidth;
uint16_t atlasContentHeight;
// Monotonically increasing prefix that gets appended to the most
// significant "32 - CLOCKWISE_COVERAGE_BIT_COUNT" bits of coverage buffer
// values.
//
// The coverage buffer is used in clockwiseAtomic mode.
//
// Increasing this prefix implicitly clears the entire coverage buffer to
// zero.
uint32_t coverageBufferPrefix = 0;
// (clockwiseAtomic mode only.) We usually don't have to clear the coverage
// buffer because of coverageBufferPrefix, but when this value is true, the
// entire coverage buffer must be cleared to zero before rendering.
bool needsCoverageBufferClear = false;
size_t flushUniformDataOffsetInBytes = 0;
uint32_t pathCount = 0;
size_t firstPath = 0;
size_t firstPaint = 0;
size_t firstPaintAux = 0;
uint32_t contourCount = 0;
size_t firstContour = 0;
uint32_t gradSpanCount = 0;
size_t firstGradSpan = 0;
uint32_t tessVertexSpanCount = 0;
size_t firstTessVertexSpan = 0;
uint32_t gradDataHeight = 0;
uint32_t tessDataHeight = 0;
// Override path fill rules with "clockwise".
bool clockwiseFillOverride = false;
bool hasTriangleVertices = false;
bool wireframe = false;
#ifdef WITH_RIVE_TOOLS
// Synthesize compilation failures to make sure the device handles them
// gracefully. (e.g., by falling back on an uber shader or at least not
// crashing.) Valid compilations may fail in the real world if the device is
// pressed for resources or in a bad state.
SynthesizedFailureType synthesizedFailureType =
SynthesizedFailureType::none;
#endif
// Command buffer that rendering commands will be added to.
// - VkCommandBuffer on Vulkan.
// - id<MTLCommandBuffer> on Metal.
// - Unused otherwise.
void* externalCommandBuffer = nullptr;
// List of feathered fills (if any) that must be rendered to the atlas
// before the main render pass.
const AtlasDrawBatch* atlasFillBatches = nullptr;
size_t atlasFillBatchCount = 0;
// List of feathered strokes (if any) that must be rendered to the atlas
// before the main render pass.
const AtlasDrawBatch* atlasStrokeBatches = nullptr;
size_t atlasStrokeBatchCount = 0;
// List of draws in the main render pass. These are rendered directly to the
// renderTarget.
const BlockAllocatedLinkedList<DrawBatch>* drawList = nullptr;
};
// Returns the area of the (potentially non-rectangular) quadrilateral that
// results from transforming the given bounds by the given matrix.
float find_transformed_area(const AABB& bounds, const Mat2D&);
// Convert a BlendMode to the tightly-packed range used by PLS shaders.
uint32_t ConvertBlendModeToPLSBlendMode(BlendMode riveMode);
// Swizzles the byte order of ColorInt to little-endian RGBA (the order expected
// by GLSL).
RIVE_ALWAYS_INLINE uint32_t SwizzleRiveColorToRGBA(ColorInt riveColor)
{
return (riveColor & 0xff00ff00) |
(math::rotateleft32(riveColor, 16) & 0x00ff00ff);
}
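// For example, an ARGB ColorInt 0xAARRGGBB swizzles to 0xAABBGGRR, whose
// little-endian byte order in memory is R, G, B, A.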
// Swizzles the byte order of ColorInt to little-endian RGBA (the order expected
// by GLSL), and premultiplies alpha.
uint32_t SwizzleRiveColorToRGBAPremul(ColorInt riveColor);
// Used for fields that lay out write-only mapped GPU memory.
// "volatile" to discourage the compiler from generating code that reads these
// values (e.g., don't let the compiler generate "x ^= x" instead of "x = 0").
// "RIVE_MAYBE_UNUSED" to suppress -Wunused-private-field.
#define WRITEONLY RIVE_MAYBE_UNUSED volatile
// Per-flush shared uniforms used by all shaders.
struct FlushUniforms
{
public:
FlushUniforms(const FlushDescriptor&, const PlatformFeatures&);
FlushUniforms(const FlushUniforms& other) { *this = other; }
void operator=(const FlushUniforms& rhs)
{
memcpy(static_cast<void*>(this),
&rhs,
sizeof(*this) - sizeof(m_padTo256Bytes));
}
bool operator!=(const FlushUniforms& rhs) const
{
return memcmp(this, &rhs, sizeof(*this) - sizeof(m_padTo256Bytes)) != 0;
}
private:
class InverseViewports
{
public:
InverseViewports() = default;
InverseViewports(const FlushDescriptor&, const PlatformFeatures&);
private:
// [complexGradientsY, tessDataY, renderTargetX, renderTargetY]
WRITEONLY float m_vals[4];
};
WRITEONLY InverseViewports m_inverseViewports;
WRITEONLY uint32_t m_renderTargetWidth;
WRITEONLY uint32_t m_renderTargetHeight;
// Only used if clears are implemented as draws.
WRITEONLY uint32_t m_colorClearValue;
// Only used if clears are implemented as draws.
WRITEONLY uint32_t m_coverageClearValue;
// drawBounds, or renderTargetBounds if there is a clear. (Used by the
// "@RESOLVE_PLS" step in InterlockMode::atomics.)
WRITEONLY IAABB m_renderTargetUpdateBounds;
WRITEONLY Vec2D m_atlasTextureInverseSize; // 1 / [atlasWidth, atlasHeight]
WRITEONLY Vec2D m_atlasContentInverseViewport; // 2 / atlasContentBounds
// Monotonically increasing prefix that gets appended to the most
// significant "32 - CLOCKWISE_COVERAGE_BIT_COUNT" bits of coverage buffer
// values. (clockwiseAtomic mode only.)
WRITEONLY uint32_t m_coverageBufferPrefix;
// Spacing between adjacent path IDs (1 if IEEE compliant).
WRITEONLY uint32_t m_pathIDGranularity;
WRITEONLY float m_vertexDiscardValue;
WRITEONLY uint32_t m_wireframeEnabled; // Forces coverage to solid.
// Uniform blocks must be multiples of 256 bytes in size.
WRITEONLY uint8_t m_padTo256Bytes[256 - 80];
};
static_assert(sizeof(FlushUniforms) == 256);
// Storage buffers are logically laid out as arrays of structs on the CPU, but
// the GPU shaders access them as arrays of basic types. We do it this way in
// order to be able to easily polyfill them with textures.
//
// This enum defines the underlying basic type that each storage buffer struct
// is laid on top of.
enum StorageBufferStructure
{
uint32x4,
uint32x2,
float32x4,
};
constexpr static uint32_t StorageBufferElementSizeInBytes(
StorageBufferStructure bufferStructure)
{
switch (bufferStructure)
{
case StorageBufferStructure::uint32x4:
return sizeof(uint32_t) * 4;
case StorageBufferStructure::uint32x2:
return sizeof(uint32_t) * 2;
case StorageBufferStructure::float32x4:
return sizeof(float) * 4;
}
RIVE_UNREACHABLE();
}
// Defines a transform from screen space into a region of the atlas.
// The atlas may have a different scale factor than the screen.
struct AtlasTransform
{
float scaleFactor;
float translateX;
float translateY;
};
// Defines a sub-allocation for a path's coverage data within the
// renderContext's coverage buffer. (clockwiseAtomic mode only.)
struct CoverageBufferRange
{
// Index of the first pixel of this allocation within the coverage buffer.
// Must be a multiple of 32*32.
uint32_t offset;
// Line width in pixels of the image in this coverage allocation.
// Must be a multiple of 32.
uint32_t pitch;
// Offset from screen space to image coords within the coverage allocation.
float offsetX;
float offsetY;
};
// High level structure of the "path" storage buffer. Each path has a unique
// data record on the GPU that is accessed from the vertex shader.
struct PathData
{
public:
constexpr static StorageBufferStructure kBufferStructure =
StorageBufferStructure::uint32x4;
void set(const Mat2D&,
float strokeRadius,
float featherRadius,
uint32_t zIndex,
const AtlasTransform&,
const CoverageBufferRange&);
private:
WRITEONLY float m_matrix[6];
// "0" indicates that the path is filled, not stroked.
WRITEONLY float m_strokeRadius;
WRITEONLY float m_featherRadius;
// InterlockMode::msaa.
WRITEONLY uint32_t m_zIndex;
// Only used when rendering coverage via the atlas.
WRITEONLY AtlasTransform m_atlasTransform;
// InterlockMode::clockwiseAtomic.
WRITEONLY CoverageBufferRange m_coverageBufferRange;
};
static_assert(sizeof(PathData) ==
StorageBufferElementSizeInBytes(PathData::kBufferStructure) * 4);
static_assert(256 % sizeof(PathData) == 0);
constexpr static size_t kPathBufferAlignmentInElements = 256 / sizeof(PathData);
// High level structure of the "paint" storage buffer. Each path also has a
// small data record describing its paint at a high level. Complex paints
// (gradients, images, or any path with a clipRect) store additional rendering
// info in the PaintAuxData buffer.
struct PaintData
{
public:
constexpr static StorageBufferStructure kBufferStructure =
StorageBufferStructure::uint32x2;
void set(DrawContents singleDrawContents,
PaintType,
SimplePaintValue,
GradTextureLayout,
uint32_t clipID,
bool hasClipRect,
BlendMode);
private:
WRITEONLY uint32_t m_params; // [clipID, flags, paintType]
union
{
WRITEONLY uint32_t m_color; // PaintType::solidColor
WRITEONLY float m_gradTextureY; // PaintType::linearGradient,
// PaintType::radialGradient
WRITEONLY float m_opacity; // PaintType::image
WRITEONLY uint32_t m_shiftedClipReplacementID; // PaintType::clipUpdate
};
};
static_assert(sizeof(PaintData) ==
StorageBufferElementSizeInBytes(PaintData::kBufferStructure));
static_assert(256 % sizeof(PaintData) == 0);
constexpr static size_t kPaintBufferAlignmentInElements =
256 / sizeof(PaintData);
// Structure of the "paintAux" storage buffer. Gradients, images, and clipRects
// store their details here, indexed by pathID.
struct PaintAuxData
{
public:
constexpr static StorageBufferStructure kBufferStructure =
StorageBufferStructure::float32x4;
void set(const Mat2D& viewMatrix,
PaintType,
SimplePaintValue,
const Gradient*,
const Texture*,
const ClipRectInverseMatrix*,
const RenderTarget*,
const gpu::PlatformFeatures&);
private:
WRITEONLY float m_matrix[6]; // Maps _fragCoord to paint coordinates.
union
{
WRITEONLY float
m_gradTextureHorizontalSpan[2]; // PaintType::linearGradient,
// PaintType::radialGradient
WRITEONLY float m_imageTextureLOD; // PaintType::image
};
WRITEONLY float m_clipRectInverseMatrix[6]; // Maps _fragCoord to normalized
// clipRect coords.
WRITEONLY Vec2D m_inverseFwidth; // -1 / fwidth(matrix * _fragCoord) -- for
// antialiasing.
};
static_assert(sizeof(PaintAuxData) ==
StorageBufferElementSizeInBytes(PaintAuxData::kBufferStructure) *
4);
static_assert(256 % sizeof(PaintAuxData) == 0);
constexpr static size_t kPaintAuxBufferAlignmentInElements =
256 / sizeof(PaintAuxData);
// High level structure of the "contour" storage buffer. Each contour of every
// path has a data record describing its info.
struct ContourData
{
public:
constexpr static StorageBufferStructure kBufferStructure =
StorageBufferStructure::uint32x4;
ContourData(Vec2D midpoint, uint32_t pathID, uint32_t vertexIndex0) :
m_midpoint(midpoint), m_pathID(pathID), m_vertexIndex0(vertexIndex0)
{}
private:
WRITEONLY Vec2D
m_midpoint; // Midpoint of the curve endpoints in just this contour.
WRITEONLY uint32_t m_pathID; // ID of the path this contour belongs to.
WRITEONLY uint32_t m_vertexIndex0; // Index of the first tessellation vertex
// of the contour.
};
static_assert(sizeof(ContourData) ==
StorageBufferElementSizeInBytes(ContourData::kBufferStructure));
static_assert(256 % sizeof(ContourData) == 0);
constexpr static size_t kContourBufferAlignmentInElements =
256 / sizeof(ContourData);
// Per-vertex data for shaders that draw triangles.
struct TriangleVertex
{
public:
TriangleVertex() = default;
TriangleVertex(Vec2D point, int16_t weight, uint16_t pathID) :
m_point(point),
m_weight_pathID((static_cast<int32_t>(weight) << 16) | pathID)
{}
#ifdef TESTING
Vec2D testing_point() const { return {m_point.x, m_point.y}; }
int32_t testing_weight_pathID() const { return m_weight_pathID; }
#endif
private:
WRITEONLY Vec2D m_point;
WRITEONLY int32_t m_weight_pathID; // [weight << 16 | pathID]
};
static_assert(sizeof(TriangleVertex) == sizeof(float) * 3);
// Per-draw uniforms used by image meshes.
struct ImageDrawUniforms
{
public:
ImageDrawUniforms() = default;
ImageDrawUniforms(const Mat2D&,
float opacity,
const ClipRectInverseMatrix*,
uint32_t clipID,
BlendMode,
uint32_t zIndex);
private:
WRITEONLY float m_matrix[6];
WRITEONLY float m_opacity;
WRITEONLY float m_padding = 0;
WRITEONLY float m_clipRectInverseMatrix[6];
WRITEONLY uint32_t m_clipID;
WRITEONLY uint32_t m_blendMode;
WRITEONLY uint32_t m_zIndex; // gpu::InterlockMode::msaa only.
// Uniform blocks must be multiples of 256 bytes in size.
WRITEONLY uint8_t m_padTo256Bytes[256 - 68];
constexpr void staticChecks()
{
static_assert(offsetof(ImageDrawUniforms, m_matrix) % 16 == 0);
static_assert(
offsetof(ImageDrawUniforms, m_clipRectInverseMatrix) % 16 == 0);
static_assert(sizeof(ImageDrawUniforms) == 256);
}
};
#undef WRITEONLY
// The maximum number of storage buffers we will ever use in a vertex or
// fragment shader.
constexpr static size_t kMaxStorageBuffers = 4;
// If the backend doesn't support "kMaxStorageBuffers" in a shader, we polyfill
// with textures. This function returns the dimensions to use for these
// textures.
std::tuple<uint32_t, uint32_t> StorageTextureSize(size_t bufferSizeInBytes,
StorageBufferStructure);
// If the backend doesn't support "kMaxStorageBuffers" in a shader, we polyfill
// with textures. The polyfill texture needs to be updated in entire rows at a
// time, meaning, its transfer buffer might need to be larger than requested.
// This function returns a size that is large enough to service a worst-case
// texture update.
size_t StorageTextureBufferSize(size_t bufferSizeInBytes,
StorageBufferStructure);
// Should the triangulator emit triangles with negative winding, positive
// winding, or both?
enum class WindingFaces
{
negative = 1 << 0,
positive = 1 << 1,
all = negative | positive,
};
RIVE_MAKE_ENUM_BITSET(WindingFaces)
// Represents a block of mapped GPU memory. Since it can be extremely expensive
// to read mapped memory, we use this class to enforce the write-only nature of
// this memory.
template <typename T> class WriteOnlyMappedMemory
{
public:
WriteOnlyMappedMemory() { reset(); }
WriteOnlyMappedMemory(T* ptr, size_t elementCount)
{
reset(ptr, elementCount);
}
void reset() { reset(nullptr, 0); }
void reset(T* ptr, size_t elementCount)
{
m_mappedMemory = ptr;
m_nextMappedItem = ptr;
m_mappingEnd = ptr + elementCount;
}
using MapResourceBufferFn =
void* (RenderContextImpl::*)(size_t mapSizeInBytes);
void mapElements(RenderContextImpl* impl,
MapResourceBufferFn mapFn,
size_t elementCount)
{
assert(m_mappedMemory == nullptr);
void* ptr = (impl->*mapFn)(elementCount * sizeof(T));
reset(reinterpret_cast<T*>(ptr), elementCount);
}
using UnmapResourceBufferFn =
void (RenderContextImpl::*)(size_t mapSizeInBytes);
void unmapElements(RenderContextImpl* impl,
UnmapResourceBufferFn unmapFn,
size_t elementCount)
{
assert(m_mappedMemory != nullptr);
assert(m_mappingEnd - m_mappedMemory == elementCount);
(impl->*unmapFn)(elementCount * sizeof(T));
reset();
}
operator bool() const { return m_mappedMemory; }
// How many bytes have been written to the buffer?
size_t bytesWritten() const
{
return reinterpret_cast<uintptr_t>(m_nextMappedItem) -
reinterpret_cast<uintptr_t>(m_mappedMemory);
}
size_t elementsWritten() const { return bytesWritten() / sizeof(T); }
// Is there room to push() itemCount items to the buffer?
bool hasRoomFor(size_t itemCount)
{
return m_nextMappedItem + itemCount <= m_mappingEnd;
}
// Append and write a new item to the buffer. In order to enforce the
// write-only requirement of a mapped buffer, these methods do not return
// any pointers to the client.
template <typename... Args>
RIVE_ALWAYS_INLINE void emplace_back(Args&&... args)
{
new (&push()) T(std::forward<Args>(args)...);
}
template <typename... Args> RIVE_ALWAYS_INLINE void set_back(Args&&... args)
{
push().set(std::forward<Args>(args)...);
}
void push_back_n(const T* values, size_t count)
{
T* dst = push(count);
if (values != nullptr)
{
memcpy(static_cast<void*>(dst), values, count * sizeof(T));
}
}
void skip_back() { push(); }
private:
RIVE_ALWAYS_INLINE T& push()
{
assert(hasRoomFor(1));
return *m_nextMappedItem++;
}
RIVE_ALWAYS_INLINE T* push(size_t count)
{
assert(hasRoomFor(count));
T* ret = m_nextMappedItem;
m_nextMappedItem += count;
return ret;
}
T* m_mappedMemory;
T* m_nextMappedItem;
const T* m_mappingEnd;
};
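// Typical usage (a sketch; the specific map/unmap member-function names on
// RenderContextImpl below are illustrative placeholders, not necessarily the
// real API):
//
//   WriteOnlyMappedMemory<TessVertexSpan> spans;
//   spans.mapElements(impl, &RenderContextImpl::mapTessVertexSpanBuffer, n);
//   while (spans.hasRoomFor(1) && haveMoreSpans())
//       spans.set_back(/* TessVertexSpan::set() arguments... */);
//   spans.unmapElements(impl,
//                       &RenderContextImpl::unmapTessVertexSpanBuffer,
//                       n);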
// Utility for tracking booleans that may be unknown (e.g., lazily computed
// values, GL state, etc.)
enum class TriState
{
no,
yes,
unknown
};
enum class StencilOp : uint8_t
{
keep,
replace,
zero,
decrClamp,
incrWrap,
decrWrap
};
enum class StencilCompareOp : uint8_t
{
less,
equal,
lessOrEqual,
notEqual,
always,
};
struct StencilFaceOps
{
StencilOp failOp;
StencilOp passOp;
StencilOp depthFailOp;
StencilCompareOp compareOp;
};
enum class CullFace : uint8_t
{
none,
clockwise,
counterclockwise,
};
// Blend equation to select for the fixed-function GPU pipeline (not our own
// in-shader blending). For now, the backend is free to decide whether it will
// use premultiplied alpha or not.
enum class BlendEquation : uint8_t
{
// Hardware blend is disabled.
none = 0,
// Core hardware blend equations supported on all platforms.
srcOver = static_cast<int>(rive::BlendMode::srcOver),
plus = srcOver + 1,
max = plus + 1,
// "Advanced" hardware blend equations.
// PlatformFeatures::supportsKHRBlendEquations is required.
screen = static_cast<int>(rive::BlendMode::screen),
overlay = static_cast<int>(rive::BlendMode::overlay),
darken = static_cast<int>(rive::BlendMode::darken),
lighten = static_cast<int>(rive::BlendMode::lighten),
colorDodge = static_cast<int>(rive::BlendMode::colorDodge),
colorBurn = static_cast<int>(rive::BlendMode::colorBurn),
hardLight = static_cast<int>(rive::BlendMode::hardLight),
softLight = static_cast<int>(rive::BlendMode::softLight),
difference = static_cast<int>(rive::BlendMode::difference),
exclusion = static_cast<int>(rive::BlendMode::exclusion),
multiply = static_cast<int>(rive::BlendMode::multiply),
hue = static_cast<int>(rive::BlendMode::hue),
saturation = static_cast<int>(rive::BlendMode::saturation),
color = static_cast<int>(rive::BlendMode::color),
luminosity = static_cast<int>(rive::BlendMode::luminosity),
};
// Common pipeline state that applies to every Rive draw and every backend.
struct PipelineState
{
// Depth.
bool depthTestEnabled;
bool depthWriteEnabled;
// Stencil.
bool stencilTestEnabled;
uint8_t stencilCompareMask;
uint8_t stencilWriteMask;
uint8_t stencilReference;
StencilFaceOps stencilFrontOps;
StencilFaceOps stencilBackOps;
bool stencilDoubleSided; // If true, use stencilFrontOps for both faces.
CullFace cullFace;
BlendEquation blendEquation;
bool colorWriteEnabled;
// 18-bit key that uniquely identifies the pipeline state.
constexpr static int UNIQUE_KEY_BIT_COUNT = 18;
uint32_t uniqueKey;
};
void get_pipeline_state(const DrawBatch&,
const FlushDescriptor&,
const PlatformFeatures&,
PipelineState*);
constexpr static PipelineState COLOR_ONLY_PIPELINE_STATE = {
.depthTestEnabled = false,
.depthWriteEnabled = false,
.stencilTestEnabled = false,
.stencilWriteMask = 0,
.cullFace = CullFace::none,
.blendEquation = BlendEquation::none,
.colorWriteEnabled = true,
};
constexpr static PipelineState ATLAS_FILL_PIPELINE_STATE = {
.depthTestEnabled = false,
.depthWriteEnabled = false,
.stencilTestEnabled = false,
.stencilWriteMask = 0,
.cullFace = CullFace::counterclockwise,
.blendEquation = BlendEquation::plus,
.colorWriteEnabled = true,
};
constexpr static PipelineState ATLAS_STROKE_PIPELINE_STATE = {
.depthTestEnabled = false,
.depthWriteEnabled = false,
.stencilTestEnabled = false,
.stencilWriteMask = 0,
.cullFace = CullFace::counterclockwise,
.blendEquation = BlendEquation::max,
.colorWriteEnabled = true,
};
float4 cast_f16_to_f32(uint16x4 x);
uint16x4 cast_f32_to_f16(float4);
// These tables integrate the Gaussian function, and its inverse, covering a
// spread of -FEATHER_TEXTURE_STDDEVS to +FEATHER_TEXTURE_STDDEVS.
constexpr static uint32_t GAUSSIAN_TABLE_SIZE = 512;
extern const uint16_t g_gaussianIntegralTableF16[GAUSSIAN_TABLE_SIZE];
extern const uint16_t g_inverseGaussianIntegralTableF16[GAUSSIAN_TABLE_SIZE];
// Code to generate g_gaussianIntegralTableF16 and
// g_inverseGaussianIntegralTableF16. This is left in the codebase but #ifdef'd
// out in case we ever want to change any parameters of the built-in tables.
#if 0
void generate_gausian_integral_table(float (&)[GAUSSIAN_TABLE_SIZE]);
void generate_inverse_gausian_integral_table(float (&)[GAUSSIAN_TABLE_SIZE]);
#endif
} // namespace rive::gpu