/*
* Copyright 2023 Rive
*/
#include "rive/renderer/metal/render_context_metal_impl.h"
#include "background_shader_compiler.h"
#include "rive/renderer/buffer_ring.hpp"
#include "rive/renderer/texture.hpp"
#include "rive/renderer/rive_render_buffer.hpp"
#include "shaders/constants.glsl"
#include <sstream>
#include "generated/shaders/color_ramp.exports.h"
#include "generated/shaders/tessellate.exports.h"
#if defined(RIVE_IOS_SIMULATOR)
#import <mach-o/arch.h>
#endif
namespace rive::gpu
{
#if defined(RIVE_IOS)
#include "generated/shaders/rive_pls_ios.metallib.c"
#elif defined(RIVE_IOS_SIMULATOR)
#include "generated/shaders/rive_pls_ios_simulator.metallib.c"
#elif defined(RIVE_XROS)
#include "generated/shaders/rive_renderer_xros.metallib.c"
#elif defined(RIVE_XROS_SIMULATOR)
#include "generated/shaders/rive_renderer_xros_simulator.metallib.c"
#elif defined(RIVE_APPLETVOS)
#include "generated/shaders/rive_renderer_appletvos.metallib.c"
#elif defined(RIVE_APPLETVOS_SIMULATOR)
#include "generated/shaders/rive_renderer_appletvsimulator.metallib.c"
#else
#include "generated/shaders/rive_pls_macosx.metallib.c"
#endif
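// Compiles an MTLRenderPipelineState from the given descriptor. Pipeline
// creation failures are not recoverable here, so we log the error and abort.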
static id<MTLRenderPipelineState> make_pipeline_state(
id<MTLDevice> gpu, MTLRenderPipelineDescriptor* desc)
{
NSError* err = [NSError errorWithDomain:@"pipeline_create"
code:201
userInfo:nil];
id<MTLRenderPipelineState> state =
[gpu newRenderPipelineStateWithDescriptor:desc error:&err];
if (!state)
{
fprintf(stderr, "%s\n", err.localizedDescription.UTF8String);
abort();
}
return state;
}
// Renders color ramps to the gradient texture.
class RenderContextMetalImpl::ColorRampPipeline
{
public:
ColorRampPipeline(id<MTLDevice> gpu, id<MTLLibrary> plsLibrary)
{
MTLRenderPipelineDescriptor* desc =
[[MTLRenderPipelineDescriptor alloc] init];
desc.vertexFunction =
[plsLibrary newFunctionWithName:@GLSL_colorRampVertexMain];
desc.fragmentFunction =
[plsLibrary newFunctionWithName:@GLSL_colorRampFragmentMain];
desc.colorAttachments[0].pixelFormat = MTLPixelFormatRGBA8Unorm;
m_pipelineState = make_pipeline_state(gpu, desc);
}
id<MTLRenderPipelineState> pipelineState() const { return m_pipelineState; }
private:
id<MTLRenderPipelineState> m_pipelineState;
};
// Renders tessellated vertices to the tessellation texture.
class RenderContextMetalImpl::TessellatePipeline
{
public:
TessellatePipeline(id<MTLDevice> gpu, id<MTLLibrary> plsLibrary)
{
MTLRenderPipelineDescriptor* desc =
[[MTLRenderPipelineDescriptor alloc] init];
desc.vertexFunction =
[plsLibrary newFunctionWithName:@GLSL_tessellateVertexMain];
desc.fragmentFunction =
[plsLibrary newFunctionWithName:@GLSL_tessellateFragmentMain];
desc.colorAttachments[0].pixelFormat = MTLPixelFormatRGBA32Uint;
m_pipelineState = make_pipeline_state(gpu, desc);
}
id<MTLRenderPipelineState> pipelineState() const { return m_pipelineState; }
private:
id<MTLRenderPipelineState> m_pipelineState;
};
// Renders paths to the main render target.
class RenderContextMetalImpl::DrawPipeline
{
public:
// Precompiled functions are embedded in namespaces. Return the fully
// qualified name of the desired function, including its namespace.
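// e.g., a midpoint-fan draw with only ENABLE_CLIPPING (and no clockwise
// fill) maps to a name of the form "p10000000::<functionBaseName>".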
static NSString* GetPrecompiledFunctionName(
DrawType drawType,
gpu::ShaderFeatures shaderFeatures,
gpu::ShaderMiscFlags shaderMiscFlags,
id<MTLLibrary> precompiledLibrary,
const char* functionBaseName)
{
// Each feature corresponds to a specific index in the namespaceID.
// These must stay in sync with generate_draw_combinations.py.
char namespaceID[] = "00000000";
static_assert(sizeof(namespaceID) == gpu::kShaderFeatureCount +
1 /*DRAW_INTERIOR_TRIANGLES*/ +
1 /*null terminator*/);
for (size_t i = 0; i < gpu::kShaderFeatureCount; ++i)
{
ShaderFeatures feature = static_cast<ShaderFeatures>(1 << i);
if (shaderFeatures & feature)
{
namespaceID[i] = '1';
}
static_assert((int)ShaderFeatures::ENABLE_CLIPPING == 1 << 0);
static_assert((int)ShaderFeatures::ENABLE_CLIP_RECT == 1 << 1);
static_assert((int)ShaderFeatures::ENABLE_ADVANCED_BLEND == 1 << 2);
static_assert((int)ShaderFeatures::ENABLE_FEATHER == 1 << 3);
static_assert((int)ShaderFeatures::ENABLE_EVEN_ODD == 1 << 4);
static_assert((int)ShaderFeatures::ENABLE_NESTED_CLIPPING ==
1 << 5);
static_assert((int)ShaderFeatures::ENABLE_HSL_BLEND_MODES ==
1 << 6);
}
if (drawType == DrawType::interiorTriangulation)
{
namespaceID[gpu::kShaderFeatureCount] = '1';
}
char namespacePrefix;
switch (drawType)
{
case DrawType::midpointFanPatches:
case DrawType::midpointFanCenterAAPatches:
case DrawType::outerCurvePatches:
case DrawType::interiorTriangulation:
namespacePrefix =
(shaderMiscFlags & gpu::ShaderMiscFlags::clockwiseFill)
? 'c'
: 'p';
break;
case DrawType::imageRect:
RIVE_UNREACHABLE();
case DrawType::imageMesh:
namespacePrefix = 'm';
break;
case DrawType::atomicInitialize:
case DrawType::atomicResolve:
case DrawType::stencilClipReset:
RIVE_UNREACHABLE();
}
return [NSString stringWithFormat:@"%c%s::%s",
namespacePrefix,
namespaceID,
functionBaseName];
}
DrawPipeline(id<MTLDevice> gpu,
id<MTLLibrary> library,
NSString* vertexFunctionName,
NSString* fragmentFunctionName,
gpu::DrawType drawType,
gpu::InterlockMode interlockMode,
gpu::ShaderFeatures shaderFeatures,
gpu::ShaderMiscFlags shaderMiscFlags)
{
auto makePipelineState = [=](id<MTLFunction> vertexMain,
id<MTLFunction> fragmentMain,
MTLPixelFormat pixelFormat) {
MTLRenderPipelineDescriptor* desc =
[[MTLRenderPipelineDescriptor alloc] init];
desc.vertexFunction = vertexMain;
desc.fragmentFunction = fragmentMain;
auto* framebuffer = desc.colorAttachments[COLOR_PLANE_IDX];
framebuffer.pixelFormat = pixelFormat;
switch (interlockMode)
{
case gpu::InterlockMode::rasterOrdering:
// In rasterOrdering mode, the PLS planes are accessed as
// color attachments.
desc.colorAttachments[CLIP_PLANE_IDX].pixelFormat =
MTLPixelFormatR32Uint;
desc.colorAttachments[SCRATCH_COLOR_PLANE_IDX].pixelFormat =
pixelFormat;
desc.colorAttachments[COVERAGE_PLANE_IDX].pixelFormat =
MTLPixelFormatR32Uint;
break;
case gpu::InterlockMode::atomics:
// In atomic mode, the PLS planes are accessed as device
// buffers. We only use the "framebuffer" attachment
// configured above.
if (shaderMiscFlags &
gpu::ShaderMiscFlags::fixedFunctionColorOutput)
{
// The shader expects a "src-over" blend function in
// order to implement antialiasing and opacity.
framebuffer.blendingEnabled = TRUE;
framebuffer.sourceRGBBlendFactor = MTLBlendFactorOne;
framebuffer.destinationRGBBlendFactor =
MTLBlendFactorOneMinusSourceAlpha;
framebuffer.rgbBlendOperation = MTLBlendOperationAdd;
framebuffer.sourceAlphaBlendFactor = MTLBlendFactorOne;
framebuffer.destinationAlphaBlendFactor =
MTLBlendFactorOneMinusSourceAlpha;
framebuffer.alphaBlendOperation = MTLBlendOperationAdd;
framebuffer.writeMask = MTLColorWriteMaskAll;
}
else if (drawType == gpu::DrawType::atomicResolve)
{
// We're resolving from the offscreen color buffer to
// the framebuffer attachment. Write out the final color
// directly without any blend modes.
framebuffer.blendingEnabled = FALSE;
framebuffer.writeMask = MTLColorWriteMaskAll;
}
else
{
// This pipeline renders by storing to the offscreen
// color buffer; disable writes to the framebuffer
// attachment.
framebuffer.blendingEnabled = FALSE;
framebuffer.writeMask = MTLColorWriteMaskNone;
}
break;
case gpu::InterlockMode::clockwiseAtomic:
case gpu::InterlockMode::msaa:
RIVE_UNREACHABLE();
}
return make_pipeline_state(gpu, desc);
};
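// Compile both an RGBA8 and a BGRA8 variant up front so render targets of
// either component order can be drawn without building a new pipeline.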
id<MTLFunction> vertexMain =
[library newFunctionWithName:vertexFunctionName];
id<MTLFunction> fragmentMain =
[library newFunctionWithName:fragmentFunctionName];
m_pipelineStateRGBA8 = makePipelineState(
vertexMain, fragmentMain, MTLPixelFormatRGBA8Unorm);
m_pipelineStateBGRA8 = makePipelineState(
vertexMain, fragmentMain, MTLPixelFormatBGRA8Unorm);
}
id<MTLRenderPipelineState> pipelineState(MTLPixelFormat pixelFormat) const
{
assert(pixelFormat == MTLPixelFormatRGBA8Unorm ||
pixelFormat == MTLPixelFormatRGBA16Float ||
pixelFormat == MTLPixelFormatRGBA8Unorm_sRGB ||
pixelFormat == MTLPixelFormatBGRA8Unorm ||
pixelFormat == MTLPixelFormatBGRA8Unorm_sRGB);
switch (pixelFormat)
{
case MTLPixelFormatRGBA8Unorm_sRGB:
case MTLPixelFormatRGBA8Unorm:
case MTLPixelFormatRGBA16Float:
return m_pipelineStateRGBA8;
default:
return m_pipelineStateBGRA8;
}
}
private:
id<MTLRenderPipelineState> m_pipelineStateRGBA8;
id<MTLRenderPipelineState> m_pipelineStateBGRA8;
};
#if defined(RIVE_IOS) || defined(RIVE_XROS) || defined(RIVE_APPLETVOS)
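// MTLGPUFamilyApple4 roughly corresponds to the A11 GPU and newer (Apple's
// fully in-house GPU designs); older devices use PowerVR-derived GPUs.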
static bool is_apple_silicon(id<MTLDevice> gpu)
{
if (@available(iOS 13, tvOS 13, visionOS 1, *))
{
return [gpu supportsFamily:MTLGPUFamilyApple4];
}
return false;
}
#endif
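// A ring of kBufferRingSize shared-storage MTLBuffers. Since the storage mode
// is shared, "mapping" just returns the buffer contents pointer and unmapping
// is a no-op.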
class BufferRingMetalImpl : public BufferRing
{
public:
static std::unique_ptr<BufferRingMetalImpl> Make(id<MTLDevice> gpu,
size_t capacityInBytes)
{
return capacityInBytes != 0
? std::make_unique<BufferRingMetalImpl>(gpu, capacityInBytes)
: nullptr;
}
BufferRingMetalImpl(id<MTLDevice> gpu, size_t capacityInBytes) :
BufferRing(capacityInBytes)
{
for (int i = 0; i < kBufferRingSize; ++i)
{
m_buffers[i] =
[gpu newBufferWithLength:capacityInBytes
options:MTLResourceStorageModeShared];
}
}
id<MTLBuffer> submittedBuffer() const
{
return m_buffers[submittedBufferIdx()];
}
protected:
void* onMapBuffer(int bufferIdx, size_t mapSizeInBytes) override
{
return m_buffers[bufferIdx].contents;
}
void onUnmapAndSubmitBuffer(int bufferIdx, size_t mapSizeInBytes) override
{}
private:
id<MTLBuffer> m_buffers[kBufferRingSize];
};
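// Minimal usage sketch (assuming ContextOptions is default-constructible;
// error handling omitted):
//
//   id<MTLDevice> gpu = MTLCreateSystemDefaultDevice();
//   std::unique_ptr<rive::gpu::RenderContext> context =
//       rive::gpu::RenderContextMetalImpl::MakeContext(gpu, {});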
std::unique_ptr<RenderContext> RenderContextMetalImpl::MakeContext(
id<MTLDevice> gpu, const ContextOptions& contextOptions)
{
auto renderContextImpl = std::unique_ptr<RenderContextMetalImpl>(
new RenderContextMetalImpl(gpu, contextOptions));
return std::make_unique<RenderContext>(std::move(renderContextImpl));
}
RenderContextMetalImpl::RenderContextMetalImpl(
id<MTLDevice> gpu, const ContextOptions& contextOptions) :
m_contextOptions(contextOptions), m_gpu(gpu)
{
// It appears, so far, that we don't need to use flat interpolation for path
// IDs on any Apple device, and it's faster not to.
m_platformFeatures.avoidFlatVaryings = true;
m_platformFeatures.invertOffscreenY = true;
#if defined(RIVE_IOS) || defined(RIVE_XROS) || defined(RIVE_APPLETVOS)
m_platformFeatures.supportsRasterOrdering = true;
m_platformFeatures.supportsFragmentShaderAtomics = false;
if (!is_apple_silicon(m_gpu))
{
// The PowerVR GPU, at least on A10, has fp16 precision issues. We can't
// use the bottom 3 bits of the path and clip IDs in order for our
// equality testing to work.
m_platformFeatures.pathIDGranularity = 8;
}
#elif defined(RIVE_IOS_SIMULATOR) || defined(RIVE_XROS_SIMULATOR) || \
defined(RIVE_APPLETVOS_SIMULATOR)
// The simulator does not support framebuffer reads. Fall back on atomic
// mode.
m_platformFeatures.supportsRasterOrdering = false;
m_platformFeatures.supportsFragmentShaderAtomics = true;
#else
m_platformFeatures.supportsRasterOrdering =
[m_gpu supportsFamily:MTLGPUFamilyApple1] &&
!contextOptions.disableFramebufferReads;
m_platformFeatures.supportsFragmentShaderAtomics = true;
#endif
m_platformFeatures.atomicPLSMustBeInitializedAsDraw = true;
#if defined(RIVE_IOS) || defined(RIVE_XROS) || defined(RIVE_XROS_SIMULATOR) || \
defined(RIVE_APPLETVOS) || defined(RIVE_APPLETVOS_SIMULATOR)
// Atomic barriers are never used on iOS, but if we ever did need them, we
// would use rasterOrderGroups.
m_metalFeatures.atomicBarrierType = AtomicBarrierType::rasterOrderGroup;
#elif defined(RIVE_IOS_SIMULATOR)
const NXArchInfo* hostArchitecture = NXGetLocalArchInfo();
if (strncmp(hostArchitecture->name, "arm64", 5) == 0)
{
// The simulator doesn't advertise support for raster order groups, but
// they appear to work anyway on an Apple-Silicon-hosted simulator. Use
// rasterOrderGroup in this case because it's much faster than
// renderPassBreak. (On Intel/AMD this doesn't matter anyway because
// renderPassBreaks are cheap and actually faster than
// rasterOrderGroups.)
m_metalFeatures.atomicBarrierType = AtomicBarrierType::rasterOrderGroup;
}
else
{
m_metalFeatures.atomicBarrierType = AtomicBarrierType::renderPassBreak;
}
#else
// Use real memory barriers for atomic mode if they're available.
// "GPU devices in Apple3 through Apple9 families don’t support memory
// barriers that include the MTLRenderStages.fragment or .tile stages in the
// after argument..."
if (([m_gpu supportsFamily:MTLGPUFamilyCommon2] ||
[m_gpu supportsFamily:MTLGPUFamilyMac2]) &&
![m_gpu supportsFamily:MTLGPUFamilyApple3])
{
m_metalFeatures.atomicBarrierType = AtomicBarrierType::memoryBarrier;
}
else if (m_gpu.rasterOrderGroupsSupported)
{
m_metalFeatures.atomicBarrierType = AtomicBarrierType::rasterOrderGroup;
}
else
{
m_metalFeatures.atomicBarrierType = AtomicBarrierType::renderPassBreak;
}
#endif
m_backgroundShaderCompiler =
std::make_unique<BackgroundShaderCompiler>(m_gpu, m_metalFeatures);
// Load the precompiled shaders.
dispatch_data_t metallibData = dispatch_data_create(
#if defined(RIVE_IOS)
rive_pls_ios_metallib,
rive_pls_ios_metallib_len,
#elif defined(RIVE_IOS_SIMULATOR)
rive_pls_ios_simulator_metallib,
rive_pls_ios_simulator_metallib_len,
#elif defined(RIVE_XROS)
rive_renderer_xros_metallib,
rive_renderer_xros_metallib_len,
#elif defined(RIVE_XROS_SIMULATOR)
rive_renderer_xros_simulator_metallib,
rive_renderer_xros_simulator_metallib_len,
#elif defined(RIVE_APPLETVOS)
rive_renderer_appletvos_metallib,
rive_renderer_appletvos_metallib_len,
#elif defined(RIVE_APPLETVOS_SIMULATOR)
rive_renderer_appletvsimulator_metallib,
rive_renderer_appletvsimulator_metallib_len,
#else
rive_pls_macosx_metallib,
rive_pls_macosx_metallib_len,
#endif
nil,
nil);
NSError* err = [NSError errorWithDomain:@"metallib_load"
code:200
userInfo:nil];
m_plsPrecompiledLibrary = [m_gpu newLibraryWithData:metallibData
error:&err];
if (m_plsPrecompiledLibrary == nil)
{
fprintf(stderr, "Failed to load pls metallib.\n");
fprintf(stderr, "%s\n", err.localizedDescription.UTF8String);
abort();
}
m_colorRampPipeline =
std::make_unique<ColorRampPipeline>(m_gpu, m_plsPrecompiledLibrary);
MTLTextureDescriptor* desc = [[MTLTextureDescriptor alloc] init];
desc.pixelFormat = MTLPixelFormatR16Float;
desc.width = gpu::GAUSSIAN_TABLE_SIZE;
desc.height = 1;
desc.usage = MTLTextureUsageShaderRead;
desc.textureType = MTLTextureType2D;
desc.mipmapLevelCount = 1;
m_featherTexture = [m_gpu newTextureWithDescriptor:desc];
[m_featherTexture
replaceRegion:MTLRegionMake2D(0, 0, gpu::GAUSSIAN_TABLE_SIZE, 1)
mipmapLevel:0
withBytes:gpu::g_gaussianIntegralTableF16
bytesPerRow:sizeof(gpu::g_gaussianIntegralTableF16)];
m_tessPipeline =
std::make_unique<TessellatePipeline>(m_gpu, m_plsPrecompiledLibrary);
m_tessSpanIndexBuffer =
[m_gpu newBufferWithBytes:gpu::kTessSpanIndices
length:sizeof(gpu::kTessSpanIndices)
options:MTLResourceStorageModeShared];
// The precompiled static library has a fully-featured shader for each
// drawType in "rasterOrdering" mode. We load these at initialization and
// use them while waiting for the background compiler to generate more
// specialized, higher-performance shaders.
if (m_platformFeatures.supportsRasterOrdering)
{
for (auto drawType : {DrawType::midpointFanPatches,
DrawType::interiorTriangulation,
DrawType::imageMesh})
{
for (auto shaderMiscFlags : {gpu::ShaderMiscFlags::none,
gpu::ShaderMiscFlags::clockwiseFill})
{
gpu::ShaderFeatures allShaderFeatures =
gpu::ShaderFeaturesMaskFor(
drawType, gpu::InterlockMode::rasterOrdering);
uint32_t pipelineKey =
ShaderUniqueKey(drawType,
allShaderFeatures,
gpu::InterlockMode::rasterOrdering,
shaderMiscFlags);
m_drawPipelines[pipelineKey] = std::make_unique<DrawPipeline>(
m_gpu,
m_plsPrecompiledLibrary,
DrawPipeline::GetPrecompiledFunctionName(
drawType,
allShaderFeatures & gpu::kVertexShaderFeaturesMask,
gpu::ShaderMiscFlags::none,
m_plsPrecompiledLibrary,
GLSL_drawVertexMain),
DrawPipeline::GetPrecompiledFunctionName(
drawType,
allShaderFeatures,
shaderMiscFlags,
m_plsPrecompiledLibrary,
GLSL_drawFragmentMain),
drawType,
gpu::InterlockMode::rasterOrdering,
allShaderFeatures,
shaderMiscFlags);
}
}
}
// Create vertex and index buffers for the different PLS patches.
m_pathPatchVertexBuffer =
[m_gpu newBufferWithLength:kPatchVertexBufferCount * sizeof(PatchVertex)
options:MTLResourceStorageModeShared];
m_pathPatchIndexBuffer =
[m_gpu newBufferWithLength:kPatchIndexBufferCount * sizeof(uint16_t)
options:MTLResourceStorageModeShared];
GeneratePatchBufferData(
reinterpret_cast<PatchVertex*>(m_pathPatchVertexBuffer.contents),
reinterpret_cast<uint16_t*>(m_pathPatchIndexBuffer.contents));
// Set up the imageRect rendering buffers. (gpu::InterlockMode::atomics
// only.)
m_imageRectVertexBuffer =
[m_gpu newBufferWithBytes:gpu::kImageRectVertices
length:sizeof(gpu::kImageRectVertices)
options:MTLResourceStorageModeShared];
m_imageRectIndexBuffer =
[m_gpu newBufferWithBytes:gpu::kImageRectIndices
length:sizeof(gpu::kImageRectIndices)
options:MTLResourceStorageModeShared];
}
RenderContextMetalImpl::~RenderContextMetalImpl() {}
// If the GPU supports framebuffer reads (called "programmable blending" in the
// feature tables), PLS planes besides the main framebuffer can exist in
// ephemeral "memoryless" storage. This means their contents are never actually
// written to main memory, and they only exist in fast tiled memory.
static id<MTLTexture> make_pls_memoryless_texture(id<MTLDevice> gpu,
MTLPixelFormat pixelFormat,
uint32_t width,
uint32_t height)
{
MTLTextureDescriptor* desc = [[MTLTextureDescriptor alloc] init];
desc.pixelFormat = pixelFormat;
desc.width = width;
desc.height = height;
desc.usage = MTLTextureUsageRenderTarget;
desc.textureType = MTLTextureType2D;
desc.mipmapLevelCount = 1;
desc.storageMode = MTLStorageModeMemoryless;
return [gpu newTextureWithDescriptor:desc];
}
RenderTargetMetal::RenderTargetMetal(id<MTLDevice> gpu,
MTLPixelFormat pixelFormat,
uint32_t width,
uint32_t height,
const PlatformFeatures& platformFeatures) :
RenderTarget(width, height), m_gpu(gpu), m_pixelFormat(pixelFormat)
{
m_targetTexture = nil; // Will be configured later by setTargetTexture().
if (platformFeatures.supportsRasterOrdering)
{
m_coverageMemorylessTexture = make_pls_memoryless_texture(
gpu, MTLPixelFormatR32Uint, width, height);
m_clipMemorylessTexture = make_pls_memoryless_texture(
gpu, MTLPixelFormatR32Uint, width, height);
m_scratchColorMemorylessTexture =
make_pls_memoryless_texture(gpu, m_pixelFormat, width, height);
}
}
void RenderTargetMetal::setTargetTexture(id<MTLTexture> texture)
{
assert(!texture || compatibleWith(texture));
m_targetTexture = texture;
}
rcp<RenderTargetMetal> RenderContextMetalImpl::makeRenderTarget(
MTLPixelFormat pixelFormat, uint32_t width, uint32_t height)
{
return rcp(new RenderTargetMetal(
m_gpu, pixelFormat, width, height, m_platformFeatures));
}
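// Rive render buffer backed by one shared-storage MTLBuffer per ring slot, or
// a single buffer when the contents are mapped only once at initialization.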
class RenderBufferMetalImpl
: public LITE_RTTI_OVERRIDE(RiveRenderBuffer, RenderBufferMetalImpl)
{
public:
RenderBufferMetalImpl(RenderBufferType renderBufferType,
RenderBufferFlags renderBufferFlags,
size_t sizeInBytes,
id<MTLDevice> gpu) :
lite_rtti_override(renderBufferType, renderBufferFlags, sizeInBytes),
m_gpu(gpu)
{
int bufferCount =
flags() & RenderBufferFlags::mappedOnceAtInitialization
? 1
: gpu::kBufferRingSize;
for (int i = 0; i < bufferCount; ++i)
{
m_buffers[i] =
[gpu newBufferWithLength:sizeInBytes
options:MTLResourceStorageModeShared];
}
}
id<MTLBuffer> submittedBuffer() { return m_buffers[frontBufferIdx()]; }
protected:
void* onMap() override
{
assert(m_buffers[backBufferIdx()] != nil);
return m_buffers[backBufferIdx()].contents;
}
void onUnmap() override {}
private:
id<MTLDevice> m_gpu;
id<MTLBuffer> m_buffers[gpu::kBufferRingSize];
int m_submittedBufferIdx = -1;
};
rcp<RenderBuffer> RenderContextMetalImpl::makeRenderBuffer(
RenderBufferType type, RenderBufferFlags flags, size_t sizeInBytes)
{
return make_rcp<RenderBufferMetalImpl>(type, flags, sizeInBytes, m_gpu);
}
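// RGBA8 image texture whose mipmaps are generated lazily on the GPU via a blit
// encoder the first time the texture is used in a flush.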
class TextureMetalImpl : public Texture
{
public:
TextureMetalImpl(id<MTLDevice> gpu,
uint32_t width,
uint32_t height,
uint32_t mipLevelCount,
const uint8_t imageDataRGBA[]) :
Texture(width, height)
{
// Create the texture.
MTLTextureDescriptor* desc = [[MTLTextureDescriptor alloc] init];
desc.pixelFormat = MTLPixelFormatRGBA8Unorm;
desc.width = width;
desc.height = height;
desc.mipmapLevelCount = mipLevelCount;
desc.usage = MTLTextureUsageShaderRead;
desc.textureType = MTLTextureType2D;
m_texture = [gpu newTextureWithDescriptor:desc];
// Specify the top-level image in the mipmap chain.
MTLRegion region = MTLRegionMake2D(0, 0, width, height);
[m_texture replaceRegion:region
mipmapLevel:0
withBytes:imageDataRGBA
bytesPerRow:width * 4];
}
void ensureMipmaps(id<MTLCommandBuffer> commandBuffer) const
{
if (m_mipsDirty)
{
// Generate mipmaps.
id<MTLBlitCommandEncoder> mipEncoder =
[commandBuffer blitCommandEncoder];
[mipEncoder generateMipmapsForTexture:m_texture];
[mipEncoder endEncoding];
m_mipsDirty = false;
}
}
id<MTLTexture> texture() const { return m_texture; }
private:
id<MTLTexture> m_texture;
mutable bool m_mipsDirty = true;
};
rcp<Texture> RenderContextMetalImpl::makeImageTexture(
uint32_t width,
uint32_t height,
uint32_t mipLevelCount,
const uint8_t imageDataRGBA[])
{
return make_rcp<TextureMetalImpl>(
m_gpu, width, height, mipLevelCount, imageDataRGBA);
}
std::unique_ptr<BufferRing> RenderContextMetalImpl::makeUniformBufferRing(
size_t capacityInBytes)
{
return BufferRingMetalImpl::Make(m_gpu, capacityInBytes);
}
std::unique_ptr<BufferRing> RenderContextMetalImpl::makeStorageBufferRing(
size_t capacityInBytes, gpu::StorageBufferStructure)
{
return BufferRingMetalImpl::Make(m_gpu, capacityInBytes);
}
std::unique_ptr<BufferRing> RenderContextMetalImpl::makeVertexBufferRing(
size_t capacityInBytes)
{
return BufferRingMetalImpl::Make(m_gpu, capacityInBytes);
}
void RenderContextMetalImpl::resizeGradientTexture(uint32_t width,
uint32_t height)
{
if (width == 0 || height == 0)
{
m_gradientTexture = nil;
return;
}
MTLTextureDescriptor* desc = [[MTLTextureDescriptor alloc] init];
desc.pixelFormat = MTLPixelFormatRGBA8Unorm;
desc.width = width;
desc.height = height;
desc.usage = MTLTextureUsageRenderTarget | MTLTextureUsageShaderRead;
desc.textureType = MTLTextureType2D;
desc.mipmapLevelCount = 1;
desc.storageMode = MTLStorageModePrivate;
m_gradientTexture = [m_gpu newTextureWithDescriptor:desc];
}
void RenderContextMetalImpl::resizeTessellationTexture(uint32_t width,
uint32_t height)
{
if (width == 0 || height == 0)
{
m_tessVertexTexture = nil;
return;
}
MTLTextureDescriptor* desc = [[MTLTextureDescriptor alloc] init];
desc.pixelFormat = MTLPixelFormatRGBA32Uint;
desc.width = width;
desc.height = height;
desc.usage = MTLTextureUsageRenderTarget | MTLTextureUsageShaderRead;
desc.textureType = MTLTextureType2D;
desc.mipmapLevelCount = 1;
desc.storageMode = MTLStorageModePrivate;
m_tessVertexTexture = [m_gpu newTextureWithDescriptor:desc];
}
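// Returns a pipeline that can draw the requested feature set. If the exact
// pipeline hasn't been compiled yet, this schedules a background compile and
// falls back on a fully-featured pipeline in the meantime.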
const RenderContextMetalImpl::DrawPipeline* RenderContextMetalImpl::
findCompatibleDrawPipeline(gpu::DrawType drawType,
gpu::ShaderFeatures shaderFeatures,
gpu::InterlockMode interlockMode,
gpu::ShaderMiscFlags shaderMiscFlags)
{
uint32_t pipelineKey = gpu::ShaderUniqueKey(
drawType, shaderFeatures, interlockMode, shaderMiscFlags);
auto pipelineIter = m_drawPipelines.find(pipelineKey);
if (pipelineIter == m_drawPipelines.end())
{
// The shader for this pipeline hasn't been scheduled for compiling yet.
// Schedule it to compile in the background.
m_backgroundShaderCompiler->pushJob({
.drawType = drawType,
.shaderFeatures = shaderFeatures,
.interlockMode = interlockMode,
.shaderMiscFlags = shaderMiscFlags,
});
pipelineIter = m_drawPipelines.insert({pipelineKey, nullptr}).first;
}
if (pipelineIter->second != nullptr)
{
// The pipeline is fully compiled and loaded.
return pipelineIter->second.get();
}
// The shader for this pipeline hasn't finished compiling yet. Start by
// finding a fully-featured superset of features whose pipeline we can fall
// back on while waiting for it to compile.
ShaderFeatures fullyFeaturedPipelineFeatures =
gpu::ShaderFeaturesMaskFor(drawType, interlockMode);
if (interlockMode == gpu::InterlockMode::atomics)
{
// Never add ENABLE_ADVANCED_BLEND to an atomic pipeline that doesn't
// use advanced blend, since in atomic mode, the shaders behave
// differently depending on whether advanced blend is enabled.
fullyFeaturedPipelineFeatures &=
shaderFeatures | ~ShaderFeatures::ENABLE_ADVANCED_BLEND;
// Never add ENABLE_CLIPPING to an atomic pipeline that doesn't use
// clipping; it will cause a "missing buffer binding" validation error
// because the shader will define an (unused) clipBuffer, but we won't
// bind anything to it.
fullyFeaturedPipelineFeatures &=
shaderFeatures | ~ShaderFeatures::ENABLE_CLIPPING;
}
shaderFeatures &= fullyFeaturedPipelineFeatures;
// Fully-featured "rasterOrdering" pipelines should have already been
// pre-loaded from the static library.
assert(shaderFeatures != fullyFeaturedPipelineFeatures ||
interlockMode != gpu::InterlockMode::rasterOrdering);
// Poll to see if the shader is actually done compiling, but only wait if
// it's a fully-featured pipeline. Otherwise, we can fall back on the
// fully-featured pipeline while we wait for compilation.
BackgroundCompileJob job;
bool shouldWaitForBackgroundCompilation =
shaderFeatures == fullyFeaturedPipelineFeatures ||
m_contextOptions.synchronousShaderCompilations;
while (m_backgroundShaderCompiler->popFinishedJob(
&job, shouldWaitForBackgroundCompilation))
{
uint32_t jobKey = gpu::ShaderUniqueKey(job.drawType,
job.shaderFeatures,
job.interlockMode,
job.shaderMiscFlags);
m_drawPipelines[jobKey] =
std::make_unique<DrawPipeline>(m_gpu,
job.compiledLibrary,
@GLSL_drawVertexMain,
@GLSL_drawFragmentMain,
job.drawType,
job.interlockMode,
job.shaderFeatures,
job.shaderMiscFlags);
if (jobKey == pipelineKey)
{
// The shader we wanted had actually finished compiling and was
// just waiting to be built into a pipeline.
return pipelineIter->second.get();
}
}
// The shader for this feature set hasn't finished compiling. Use the
// pipeline that has all features enabled while we wait for it to finish.
assert(shaderFeatures != fullyFeaturedPipelineFeatures);
return findCompatibleDrawPipeline(drawType,
fullyFeaturedPipelineFeatures,
interlockMode,
shaderMiscFlags);
}
void RenderContextMetalImpl::prepareToMapBuffers()
{
// Wait until the GPU finishes rendering flush "N + 1 - kBufferRingSize".
// This ensures it is safe for the CPU to begin modifying the next buffers
// in our rings.
m_bufferRingIdx = (m_bufferRingIdx + 1) % kBufferRingSize;
m_bufferRingLocks[m_bufferRingIdx].lock();
}
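// Fetches the MTLBuffer backing the given ring's currently-submitted slot.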
static id<MTLBuffer> mtl_buffer(const BufferRing* bufferRing)
{
assert(bufferRing != nullptr);
return static_cast<const BufferRingMetalImpl*>(bufferRing)
->submittedBuffer();
}
static MTLViewport make_viewport(uint32_t x,
uint32_t y,
uint32_t width,
uint32_t height)
{
return {
static_cast<double>(x),
static_cast<double>(y),
static_cast<double>(width),
static_cast<double>(height),
0,
1,
};
}
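// Creates the render command encoder for the main draw pass and binds the
// resources shared by every draw: flush uniforms, the tessellation, gradient,
// and feather textures, and (when present) the path, paint, contour, and
// atomic-mode PLS buffers.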
id<MTLRenderCommandEncoder> RenderContextMetalImpl::makeRenderPassForDraws(
const gpu::FlushDescriptor& flushDesc,
MTLRenderPassDescriptor* passDesc,
id<MTLCommandBuffer> commandBuffer,
gpu::ShaderMiscFlags baselineShaderMiscFlags)
{
auto* renderTarget =
static_cast<RenderTargetMetal*>(flushDesc.renderTarget);
id<MTLRenderCommandEncoder> encoder =
[commandBuffer renderCommandEncoderWithDescriptor:passDesc];
[encoder
setViewport:make_viewport(
0, 0, renderTarget->width(), renderTarget->height())];
[encoder setVertexBuffer:mtl_buffer(flushUniformBufferRing())
offset:flushDesc.flushUniformDataOffsetInBytes
atIndex:FLUSH_UNIFORM_BUFFER_IDX];
[encoder setFragmentBuffer:mtl_buffer(flushUniformBufferRing())
offset:flushDesc.flushUniformDataOffsetInBytes
atIndex:FLUSH_UNIFORM_BUFFER_IDX];
[encoder setVertexTexture:m_tessVertexTexture
atIndex:TESS_VERTEX_TEXTURE_IDX];
[encoder setFragmentTexture:m_gradientTexture atIndex:GRAD_TEXTURE_IDX];
[encoder setFragmentTexture:m_featherTexture atIndex:FEATHER_TEXTURE_IDX];
if (flushDesc.pathCount > 0)
{
[encoder setVertexBuffer:mtl_buffer(pathBufferRing())
offset:flushDesc.firstPath * sizeof(gpu::PathData)
atIndex:PATH_BUFFER_IDX];
if (flushDesc.interlockMode == gpu::InterlockMode::atomics)
{
[encoder
setFragmentBuffer:mtl_buffer(paintBufferRing())
offset:flushDesc.firstPaint * sizeof(gpu::PaintData)
atIndex:PAINT_BUFFER_IDX];
[encoder setFragmentBuffer:mtl_buffer(paintAuxBufferRing())
offset:flushDesc.firstPaintAux *
sizeof(gpu::PaintAuxData)
atIndex:PAINT_AUX_BUFFER_IDX];
}
else
{
[encoder
setVertexBuffer:mtl_buffer(paintBufferRing())
offset:flushDesc.firstPaint * sizeof(gpu::PaintData)
atIndex:PAINT_BUFFER_IDX];
[encoder setVertexBuffer:mtl_buffer(paintAuxBufferRing())
offset:flushDesc.firstPaintAux *
sizeof(gpu::PaintAuxData)
atIndex:PAINT_AUX_BUFFER_IDX];
}
}
if (flushDesc.contourCount > 0)
{
[encoder
setVertexBuffer:mtl_buffer(contourBufferRing())
offset:flushDesc.firstContour * sizeof(gpu::ContourData)
atIndex:CONTOUR_BUFFER_IDX];
}
if (flushDesc.interlockMode == gpu::InterlockMode::atomics)
{
// In atomic mode, the PLS planes are buffers that we need to bind
// separately. Since the PLS plane indices collide with other buffer
// bindings, offset the binding indices of these buffers by
// DEFAULT_BINDINGS_SET_SIZE.
if (!(baselineShaderMiscFlags &
gpu::ShaderMiscFlags::fixedFunctionColorOutput))
{
[encoder
setFragmentBuffer:renderTarget->colorAtomicBuffer()
offset:0
atIndex:COLOR_PLANE_IDX + DEFAULT_BINDINGS_SET_SIZE];
}
if (flushDesc.combinedShaderFeatures &
gpu::ShaderFeatures::ENABLE_CLIPPING)
{
[encoder
setFragmentBuffer:renderTarget->clipAtomicBuffer()
offset:0
atIndex:CLIP_PLANE_IDX + DEFAULT_BINDINGS_SET_SIZE];
}
[encoder
setFragmentBuffer:renderTarget->coverageAtomicBuffer()
offset:0
atIndex:COVERAGE_PLANE_IDX + DEFAULT_BINDINGS_SET_SIZE];
}
if (flushDesc.wireframe)
{
[encoder setTriangleFillMode:MTLTriangleFillModeLines];
}
return encoder;
}
void RenderContextMetalImpl::flush(const FlushDescriptor& desc)
{
assert(desc.interlockMode != gpu::InterlockMode::clockwiseAtomic);
assert(desc.interlockMode != gpu::InterlockMode::msaa); // TODO: msaa.
auto* renderTarget = static_cast<RenderTargetMetal*>(desc.renderTarget);
id<MTLCommandBuffer> commandBuffer =
(__bridge id<MTLCommandBuffer>)desc.externalCommandBuffer;
// Render the complex color ramps to the gradient texture.
if (desc.gradSpanCount > 0)
{
MTLRenderPassDescriptor* gradPass =
[MTLRenderPassDescriptor renderPassDescriptor];
gradPass.renderTargetWidth = kGradTextureWidth;
gradPass.renderTargetHeight = desc.gradDataHeight;
gradPass.colorAttachments[0].loadAction = MTLLoadActionDontCare;
gradPass.colorAttachments[0].storeAction = MTLStoreActionStore;
gradPass.colorAttachments[0].texture = m_gradientTexture;
id<MTLRenderCommandEncoder> gradEncoder =
[commandBuffer renderCommandEncoderWithDescriptor:gradPass];
[gradEncoder
setViewport:make_viewport(0,
0,
kGradTextureWidth,
static_cast<float>(desc.gradDataHeight))];
[gradEncoder
setRenderPipelineState:m_colorRampPipeline->pipelineState()];
[gradEncoder setVertexBuffer:mtl_buffer(flushUniformBufferRing())
offset:desc.flushUniformDataOffsetInBytes
atIndex:FLUSH_UNIFORM_BUFFER_IDX];
[gradEncoder
setVertexBuffer:mtl_buffer(gradSpanBufferRing())
offset:desc.firstGradSpan * sizeof(gpu::GradientSpan)
atIndex:0];
[gradEncoder setCullMode:MTLCullModeBack];
[gradEncoder drawPrimitives:MTLPrimitiveTypeTriangleStrip
vertexStart:0
vertexCount:gpu::GRAD_SPAN_TRI_STRIP_VERTEX_COUNT
instanceCount:desc.gradSpanCount];
[gradEncoder endEncoding];
}
// Tessellate all curves into vertices in the tessellation texture.
if (desc.tessVertexSpanCount > 0)
{
MTLRenderPassDescriptor* tessPass =
[MTLRenderPassDescriptor renderPassDescriptor];
tessPass.renderTargetWidth = kTessTextureWidth;
tessPass.renderTargetHeight = desc.tessDataHeight;
tessPass.colorAttachments[0].loadAction = MTLLoadActionDontCare;
tessPass.colorAttachments[0].storeAction = MTLStoreActionStore;
tessPass.colorAttachments[0].texture = m_tessVertexTexture;
id<MTLRenderCommandEncoder> tessEncoder =
[commandBuffer renderCommandEncoderWithDescriptor:tessPass];
[tessEncoder
setViewport:make_viewport(
0, 0, kTessTextureWidth, desc.tessDataHeight)];
[tessEncoder setRenderPipelineState:m_tessPipeline->pipelineState()];
[tessEncoder setVertexBuffer:mtl_buffer(flushUniformBufferRing())
offset:desc.flushUniformDataOffsetInBytes
atIndex:FLUSH_UNIFORM_BUFFER_IDX];
[tessEncoder setVertexBuffer:mtl_buffer(tessSpanBufferRing())
offset:desc.firstTessVertexSpan *
sizeof(gpu::TessVertexSpan)
atIndex:0];
assert(desc.pathCount > 0);
[tessEncoder setVertexBuffer:mtl_buffer(pathBufferRing())
offset:desc.firstPath * sizeof(gpu::PathData)
atIndex:PATH_BUFFER_IDX];
assert(desc.contourCount > 0);
[tessEncoder
setVertexBuffer:mtl_buffer(contourBufferRing())
offset:desc.firstContour * sizeof(gpu::ContourData)
atIndex:CONTOUR_BUFFER_IDX];
[tessEncoder setCullMode:MTLCullModeBack];
[tessEncoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle
indexCount:std::size(gpu::kTessSpanIndices)
indexType:MTLIndexTypeUInt16
indexBuffer:m_tessSpanIndexBuffer
indexBufferOffset:0
instanceCount:desc.tessVertexSpanCount];
[tessEncoder endEncoding];
}
// Generate mipmaps if needed.
for (const DrawBatch& batch : *desc.drawList)
{
if (auto imageTextureMetal =
static_cast<const TextureMetalImpl*>(batch.imageTexture))
{
imageTextureMetal->ensureMipmaps(commandBuffer);
}
}
// Set up a render pass to do the final rendering using (some form of) pixel
// local storage.
MTLRenderPassDescriptor* pass =
[MTLRenderPassDescriptor renderPassDescriptor];
pass.renderTargetWidth = desc.renderTargetUpdateBounds.right;
pass.renderTargetHeight = desc.renderTargetUpdateBounds.bottom;
pass.colorAttachments[COLOR_PLANE_IDX].texture =
renderTarget->targetTexture();
switch (desc.colorLoadAction)
{
case gpu::LoadAction::clear:
{
float cc[4];
UnpackColorToRGBA32FPremul(desc.clearColor, cc);
pass.colorAttachments[COLOR_PLANE_IDX].loadAction =
MTLLoadActionClear;
pass.colorAttachments[COLOR_PLANE_IDX].clearColor =
MTLClearColorMake(cc[0], cc[1], cc[2], cc[3]);
break;
}
case gpu::LoadAction::preserveRenderTarget:
pass.colorAttachments[COLOR_PLANE_IDX].loadAction =
MTLLoadActionLoad;
break;
case gpu::LoadAction::dontCare:
pass.colorAttachments[COLOR_PLANE_IDX].loadAction =
MTLLoadActionDontCare;
break;
}
pass.colorAttachments[COLOR_PLANE_IDX].storeAction = MTLStoreActionStore;
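// Flags that apply to every batch in this flush; per-batch flags are OR'd on
// top of these in the draw loop below.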
auto baselineShaderMiscFlags = gpu::ShaderMiscFlags::none;
if (desc.interlockMode == gpu::InterlockMode::rasterOrdering)
{
// In rasterOrdering mode, the PLS planes are accessed as color
// attachments.
pass.colorAttachments[CLIP_PLANE_IDX].texture =
renderTarget->m_clipMemorylessTexture;
pass.colorAttachments[CLIP_PLANE_IDX].loadAction = MTLLoadActionClear;
pass.colorAttachments[CLIP_PLANE_IDX].clearColor =
MTLClearColorMake(0, 0, 0, 0);
pass.colorAttachments[CLIP_PLANE_IDX].storeAction =
MTLStoreActionDontCare;
pass.colorAttachments[SCRATCH_COLOR_PLANE_IDX].texture =
renderTarget->m_scratchColorMemorylessTexture;
pass.colorAttachments[SCRATCH_COLOR_PLANE_IDX].loadAction =
MTLLoadActionDontCare;
pass.colorAttachments[SCRATCH_COLOR_PLANE_IDX].storeAction =
MTLStoreActionDontCare;
pass.colorAttachments[COVERAGE_PLANE_IDX].texture =
renderTarget->m_coverageMemorylessTexture;
pass.colorAttachments[COVERAGE_PLANE_IDX].loadAction =
MTLLoadActionClear;
pass.colorAttachments[COVERAGE_PLANE_IDX].clearColor =
MTLClearColorMake(desc.coverageClearValue, 0, 0, 0);
pass.colorAttachments[COVERAGE_PLANE_IDX].storeAction =
MTLStoreActionDontCare;
}
else if (!(desc.combinedShaderFeatures &
gpu::ShaderFeatures::ENABLE_ADVANCED_BLEND))
{
assert(desc.interlockMode == gpu::InterlockMode::atomics);
baselineShaderMiscFlags |=
gpu::ShaderMiscFlags::fixedFunctionColorOutput;
}
else if (desc.colorLoadAction == gpu::LoadAction::preserveRenderTarget)
{
// Since we need to preserve the renderTarget during load, and since
// we're rendering to an offscreen color buffer, we have to literally
// copy the renderTarget into the color buffer.
assert(desc.interlockMode == gpu::InterlockMode::atomics);
id<MTLBlitCommandEncoder> copyEncoder =
[commandBuffer blitCommandEncoder];
auto updateOrigin = MTLOriginMake(desc.renderTargetUpdateBounds.left,
desc.renderTargetUpdateBounds.top,
0);
auto updateSize = MTLSizeMake(desc.renderTargetUpdateBounds.width(),
desc.renderTargetUpdateBounds.height(),
1);
[copyEncoder copyFromTexture:renderTarget->targetTexture()
sourceSlice:0
sourceLevel:0
sourceOrigin:updateOrigin
sourceSize:updateSize
toBuffer:renderTarget->colorAtomicBuffer()
destinationOffset:(updateOrigin.y * renderTarget->width() +
updateOrigin.x) *
sizeof(uint32_t)
destinationBytesPerRow:renderTarget->width() * sizeof(uint32_t)
destinationBytesPerImage:renderTarget->height() *
renderTarget->width() * sizeof(uint32_t)];
[copyEncoder endEncoding];
}
// Execute the DrawList.
id<MTLRenderCommandEncoder> encoder = makeRenderPassForDraws(
desc, pass, commandBuffer, baselineShaderMiscFlags);
for (const DrawBatch& batch : *desc.drawList)
{
if (batch.elementCount == 0)
{
continue;
}
// Set up the pipeline for this specific drawType and shaderFeatures.
gpu::ShaderFeatures shaderFeatures =
desc.interlockMode == gpu::InterlockMode::atomics
? desc.combinedShaderFeatures
: batch.shaderFeatures;
gpu::ShaderMiscFlags batchMiscFlags = baselineShaderMiscFlags;
if (desc.interlockMode == gpu::InterlockMode::rasterOrdering &&
(batch.drawContents & gpu::DrawContents::clockwiseFill))
{
batchMiscFlags |= gpu::ShaderMiscFlags::clockwiseFill;
}
if (!(batchMiscFlags & gpu::ShaderMiscFlags::fixedFunctionColorOutput))
{
if (batch.drawType == gpu::DrawType::atomicResolve)
{
// Atomic mode can always do a coalesced resolve when rendering
// to an offscreen color buffer.
batchMiscFlags |=
gpu::ShaderMiscFlags::coalescedResolveAndTransfer;
}
else if (batch.drawType == gpu::DrawType::atomicInitialize)
{
if (desc.colorLoadAction == gpu::LoadAction::clear)
{
batchMiscFlags |= gpu::ShaderMiscFlags::storeColorClear;
}
else if (desc.colorLoadAction ==
gpu::LoadAction::preserveRenderTarget &&
renderTarget->pixelFormat() ==
MTLPixelFormatBGRA8Unorm)
{
// We already copied the renderTarget to our color buffer,
// but since the target is BGRA, we also need to swizzle it
// to RGBA before it's ready for PLS.
batchMiscFlags |=
gpu::ShaderMiscFlags::swizzleColorBGRAToRGBA;
}
}
}
id<MTLRenderPipelineState> drawPipelineState =
findCompatibleDrawPipeline(batch.drawType,
shaderFeatures,
desc.interlockMode,
batchMiscFlags)
->pipelineState(renderTarget->pixelFormat());
// Bind the appropriate image texture, if any.
if (auto imageTextureMetal =
static_cast<const TextureMetalImpl*>(batch.imageTexture))
{
[encoder setFragmentTexture:imageTextureMetal->texture()
atIndex:IMAGE_TEXTURE_IDX];
}
DrawType drawType = batch.drawType;
switch (drawType)
{
case DrawType::midpointFanPatches:
case DrawType::midpointFanCenterAAPatches:
case DrawType::outerCurvePatches:
{
// Draw PLS patches that connect the tessellation vertices.
[encoder setRenderPipelineState:drawPipelineState];
[encoder setVertexBuffer:m_pathPatchVertexBuffer
offset:0
atIndex:0];
[encoder setCullMode:MTLCullModeBack];
// Don't use baseInstance in order to run on Apple GPU Family 2.
// TODO: Use baseInstance instead once we deprecate Apple2.
[encoder setVertexBytes:&batch.baseElement
length:sizeof(uint32_t)
atIndex:PATH_BASE_INSTANCE_UNIFORM_BUFFER_IDX];
[encoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle
indexCount:gpu::PatchIndexCount(drawType)
indexType:MTLIndexTypeUInt16
indexBuffer:m_pathPatchIndexBuffer
indexBufferOffset:gpu::PatchBaseIndex(drawType) *
sizeof(uint16_t)
instanceCount:batch.elementCount];
break;
}
case DrawType::interiorTriangulation:
{
[encoder setRenderPipelineState:drawPipelineState];
[encoder setVertexBuffer:mtl_buffer(triangleBufferRing())
offset:0
atIndex:0];
[encoder setCullMode:MTLCullModeBack];
[encoder drawPrimitives:MTLPrimitiveTypeTriangle
vertexStart:batch.baseElement
vertexCount:batch.elementCount];
break;
}
case DrawType::imageRect:
case DrawType::imageMesh:
{
[encoder setRenderPipelineState:drawPipelineState];
[encoder
setVertexBuffer:mtl_buffer(imageDrawUniformBufferRing())
offset:batch.imageDrawDataOffset
atIndex:IMAGE_DRAW_UNIFORM_BUFFER_IDX];
[encoder
setFragmentBuffer:mtl_buffer(imageDrawUniformBufferRing())
offset:batch.imageDrawDataOffset
atIndex:IMAGE_DRAW_UNIFORM_BUFFER_IDX];
[encoder setCullMode:MTLCullModeNone];
if (drawType == DrawType::imageRect)
{
assert(desc.interlockMode == gpu::InterlockMode::atomics);
[encoder setVertexBuffer:m_imageRectVertexBuffer
offset:0
atIndex:0];
[encoder
drawIndexedPrimitives:MTLPrimitiveTypeTriangle
indexCount:std::size(gpu::kImageRectIndices)
indexType:MTLIndexTypeUInt16
indexBuffer:m_imageRectIndexBuffer
indexBufferOffset:0];
}
else
{
LITE_RTTI_CAST_OR_BREAK(vertexBuffer,
RenderBufferMetalImpl*,
batch.vertexBuffer);
LITE_RTTI_CAST_OR_BREAK(
uvBuffer, RenderBufferMetalImpl*, batch.uvBuffer);
LITE_RTTI_CAST_OR_BREAK(
indexBuffer, RenderBufferMetalImpl*, batch.indexBuffer);
[encoder setVertexBuffer:vertexBuffer->submittedBuffer()
offset:0
atIndex:0];
[encoder setVertexBuffer:uvBuffer->submittedBuffer()
offset:0
atIndex:1];
[encoder
drawIndexedPrimitives:MTLPrimitiveTypeTriangle
indexCount:batch.elementCount
indexType:MTLIndexTypeUInt16
indexBuffer:indexBuffer->submittedBuffer()
indexBufferOffset:batch.baseElement *
sizeof(uint16_t)];
}
break;
}
case DrawType::atomicInitialize:
case DrawType::atomicResolve:
{
assert(desc.interlockMode == gpu::InterlockMode::atomics);
[encoder setRenderPipelineState:drawPipelineState];
[encoder drawPrimitives:MTLPrimitiveTypeTriangleStrip
vertexStart:0
vertexCount:4];
break;
}
case DrawType::stencilClipReset:
{
RIVE_UNREACHABLE();
}
}
if (batch.needsBarrier)
{
assert(desc.interlockMode == gpu::InterlockMode::atomics);
switch (m_metalFeatures.atomicBarrierType)
{
case AtomicBarrierType::memoryBarrier:
{
#if defined(RIVE_MACOSX)
if (@available(macOS 10.14, *))
{
[encoder
memoryBarrierWithScope:MTLBarrierScopeBuffers |
MTLBarrierScopeRenderTargets
afterStages:MTLRenderStageFragment
beforeStages:MTLRenderStageFragment];
break;
}
#endif
// atomicBarrierType shouldn't be "memoryBarrier" in this
// case.
RIVE_UNREACHABLE();
}
case AtomicBarrierType::rasterOrderGroup:
break;
case AtomicBarrierType::renderPassBreak:
// On very old hardware that can't support barriers, we just
// take a sledge hammer and break the entire render pass
// between overlapping draws.
// TODO: Is there a lighter way to achieve this?
[encoder endEncoding];
pass.colorAttachments[COLOR_PLANE_IDX].loadAction =
MTLLoadActionLoad;
encoder = makeRenderPassForDraws(
desc, pass, commandBuffer, baselineShaderMiscFlags);
break;
}
}
}
[encoder endEncoding];
if (desc.isFinalFlushOfFrame)
{
// Schedule a callback that will unlock the buffers used by this flush,
// after the GPU has finished rendering with them. This unblocks the CPU
// from reusing them in a future flush.
std::mutex& thisFlushLock = m_bufferRingLocks[m_bufferRingIdx];
[commandBuffer addCompletedHandler:^(id<MTLCommandBuffer>) {
assert(
!thisFlushLock.try_lock()); // The mutex should already be locked.
thisFlushLock.unlock();
}];
}
}
} // namespace rive::gpu