experimental/graphite/src/DrawPass.cpp - skia - Git at Google

 /*
  * Copyright 2021 Google LLC
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #include "experimental/graphite/src/DrawPass.h"

 #include "experimental/graphite/include/GraphiteTypes.h"
 #include "experimental/graphite/include/Recorder.h"
 #include "experimental/graphite/src/Buffer.h"
 #include "experimental/graphite/src/ContextPriv.h"
 #include "experimental/graphite/src/DrawBufferManager.h"
 #include "experimental/graphite/src/DrawContext.h"
 #include "experimental/graphite/src/DrawList.h"
 #include "experimental/graphite/src/DrawWriter.h"
 #include "experimental/graphite/src/GlobalCache.h"
 #include "experimental/graphite/src/GraphicsPipeline.h"
 #include "experimental/graphite/src/GraphicsPipelineDesc.h"
 #include "experimental/graphite/src/Renderer.h"
 #include "experimental/graphite/src/ResourceProvider.h"
 #include "experimental/graphite/src/TextureProxy.h"
 #include "experimental/graphite/src/UniformCache.h"
 #include "experimental/graphite/src/UniformManager.h"
 #include "experimental/graphite/src/geom/BoundsManager.h"

 #include "src/core/SkMathPriv.h"
 #include "src/core/SkTBlockList.h"
 #include "src/core/SkUniformData.h"
 #include "src/gpu/BufferWriter.h"

 #include <algorithm>
 #include <unordered_map>

 namespace skgpu {

 /**
  * Helper describing one packed field inside a uint64_t.
  *
  * 'Bits' is the width of the field and 'Offset' is the position of its least-significant bit
  * within the 64-bit word. set() truncates a value to the field width and shifts it into place so
  * several fields can be OR'ed together; get() extracts the field back out of a packed word.
  */
 template <uint64_t Bits, uint64_t Offset>
 struct Bitfield {
     // get()/set() exchange values as uint32_t, so a field can be at most 32 bits wide, and it
     // must lie entirely inside the 64-bit word. These checks also keep the shift used to build
     // kMask well-defined.
     static_assert(Bits > 0 && Bits <= 32, "Bitfield width must be in [1, 32]");
     static_assert(Offset + Bits <= 64, "Bitfield must fit within a uint64_t");

     static constexpr uint64_t kMask = (uint64_t{1} << Bits) - 1;
     static constexpr uint64_t kOffset = Offset;
     static constexpr uint64_t kBits = Bits;

     // Extracts this field from the packed word 'v'.
     static constexpr uint32_t get(uint64_t v) {
         return static_cast<uint32_t>((v >> kOffset) & kMask);
     }

     // Returns 'v' truncated to kBits and shifted into this field's position.
     static constexpr uint64_t set(uint32_t v) { return (v & kMask) << kOffset; }
 };

 /**
  * Each Draw in a DrawList might be processed by multiple RenderSteps (determined by the Draw's
  * Renderer), which can be sorted independently. Each (step, draw) pair produces its own SortKey.
  *
  * The goal of sorting draws for the DrawPass is to minimize pipeline transitions and dynamic binds
  * within a pipeline, while still respecting the overall painter's order. This decreases the number
  * of low-level draw commands in a command buffer and increases the size of those, allowing the GPU
  * to operate more efficiently and have fewer bubbles within its own instruction stream.
  *
  * The Draw's CompressedPaintersOrder and DisjointStencilIndex represent the most significant bits
  * of the key, and are shared by all SortKeys produced by the same draw. Next, the pipeline
  * description is encoded in two steps:
  *  1. The index of the RenderStep packed in the high bits to ensure each step for a draw is
  *     ordered correctly.
  *  2. An index into a cache of pipeline descriptions is used to encode the identity of the
  *     pipeline (SortKeys that differ in the bits from #1 necessarily would have different
  *     descriptions, but then the specific ordering of the RenderSteps isn't enforced).
  * Last, the SortKey encodes an index into the set of uniform bindings accumulated for a DrawPass.
  * This allows the SortKey to cluster draw steps that have both a compatible pipeline and do not
  * require rebinding uniform data or other state (e.g. scissor). Since the uniform data index and
  * the pipeline description index are packed into indices and not actual pointers, a given SortKey
  * is only valid for a specific DrawList->DrawPass conversion.
  */
 class DrawPass::SortKey {
 public:
     // Packs the sort fields for one (draw, render step) pair.
     //  - draw:                the originating draw; supplies the painter's order and stencil
     //                         index bits and is kept as a non-owning back-pointer.
     //  - renderStep:          index of the step within draw->fRenderer.steps().
     //  - pipelineIndex:       caller-assigned index identifying the pipeline description.
     //  - geomUniformIndex:    caller-assigned index identifying the step's uniform data.
     //  - shadingUniformIndex: caller-assigned index identifying the paint's uniform data.
     // Every value is truncated to the width of its field (see the Bitfield aliases below).
     SortKey(const DrawList::Draw* draw,
             int renderStep,
             uint32_t pipelineIndex,
             uint32_t geomUniformIndex,
             uint32_t shadingUniformIndex)
         : fPipelineKey(ColorDepthOrderField::set(draw->fOrder.paintOrder().bits()) |
                        StencilIndexField::set(draw->fOrder.stencilIndex().bits())  |
                        RenderStepField::set(static_cast<uint32_t>(renderStep))     |
                        PipelineField::set(pipelineIndex))
         , fUniformKey(GeometryUniformField::set(geomUniformIndex) |
                       ShadingUniformField::set(shadingUniformIndex))
         , fDraw(draw) {
         // renderStep() below uses this value to index draw->fRenderer.steps(), so it has to be
         // a valid index; a value equal to numRenderSteps() is already out of range.
         SkASSERT(renderStep >= 0 && renderStep < draw->fRenderer.numRenderSteps());
     }

     // Orders by the pipeline word first and falls back to the uniform word on ties, i.e. the
     // two words are compared as a single 128-bit value with fPipelineKey most significant.
     bool operator<(const SortKey& k) const {
         return fPipelineKey < k.fPipelineKey ||
                (fPipelineKey == k.fPipelineKey && fUniformKey < k.fUniformKey);
     }

     // The RenderStep of the draw's Renderer selected by the packed step index.
     const RenderStep& renderStep() const {
         return *fDraw->fRenderer.steps()[RenderStepField::get(fPipelineKey)];
     }

     // The draw that produced this key.
     const DrawList::Draw* draw() const { return fDraw; }

     // Accessors for the indices that were packed by the constructor.
     uint32_t pipeline()          const { return PipelineField::get(fPipelineKey);       }
     uint32_t geometryUniforms()  const { return GeometryUniformField::get(fUniformKey); }
     uint32_t shadingUniforms()   const { return ShadingUniformField::get(fUniformKey);  }

 private:
     // Fields are ordered from most-significant to least when sorting by 128-bit value.
     // NOTE: We don't use bit fields because field ordering is implementation defined and we need
     // to sort consistently.
     using ColorDepthOrderField = Bitfield<16, 48>; // sizeof(CompressedPaintersOrder)
     using StencilIndexField    = Bitfield<16, 32>; // sizeof(DisjointStencilIndex)
     using RenderStepField      = Bitfield<2,  30>; // bits >= log2(Renderer::kMaxRenderSteps)
     using PipelineField        = Bitfield<30, 0>;  // bits >= log2(max steps*DrawList::kMaxDraws)
     uint64_t fPipelineKey;

     using GeometryUniformField = Bitfield<32, 32>; // bits >= log2(max steps * max draw count)
     using ShadingUniformField  = Bitfield<32, 0>;  //  ""
     uint64_t fUniformKey;

     // Backpointer to the draw that produced the sort key
     const DrawList::Draw* fDraw;

     // NOTE(review): the first two asserts compare a bit count against sizeof(), which is in
     // bytes, so they are weaker than they read; presumably 8 * sizeof(...) was intended.
     // Confirm against the definitions of the two order types before tightening.
     static_assert(ColorDepthOrderField::kBits >= sizeof(CompressedPaintersOrder));
     static_assert(StencilIndexField::kBits    >= sizeof(DisjointStencilIndex));
     static_assert(RenderStepField::kBits      >= SkNextLog2_portable(Renderer::kMaxRenderSteps));
     static_assert(PipelineField::kBits        >=
                         SkNextLog2_portable(Renderer::kMaxRenderSteps * DrawList::kMaxDraws));
     static_assert(GeometryUniformField::kBits >= PipelineField::kBits);
     static_assert(ShadingUniformField::kBits  >= PipelineField::kBits);
 };

 // DrawDispatcher implementation that records every dispatched call as a command appended to the
 // command list of the DrawPass it was constructed with. The pass is referenced, not owned.
 class DrawPass::Drawer final : public DrawDispatcher {
 public:
     // 'explicit' so that a DrawPass* is never silently converted into a Drawer.
     explicit Drawer(DrawPass* drawPass) : fPass(drawPass) {}
     ~Drawer() override = default;

     // Appends a BindDrawBuffers command carrying the three buffer bindings.
     void bindDrawBuffers(BindBufferInfo vertexAttribs,
                          BindBufferInfo instanceAttribs,
                          BindBufferInfo indices) override {
         fPass->fCommands.emplace_back(BindDrawBuffers{vertexAttribs, instanceAttribs, indices});
     }

     // Appends a Draw command.
     void draw(PrimitiveType type, unsigned int baseVertex, unsigned int vertexCount) override {
         fPass->fCommands.emplace_back(Draw{type, baseVertex, vertexCount});
     }

     // Appends a DrawIndexed command.
     void drawIndexed(PrimitiveType type, unsigned int baseIndex,
                      unsigned int indexCount, unsigned int baseVertex) override {
         fPass->fCommands.emplace_back(DrawIndexed{type, baseIndex, indexCount, baseVertex});
     }

     // Appends a DrawInstanced command.
     void drawInstanced(PrimitiveType type,
                        unsigned int baseVertex, unsigned int vertexCount,
                        unsigned int baseInstance, unsigned int instanceCount) override {
         fPass->fCommands.emplace_back(DrawInstanced{type, baseVertex, vertexCount,
                                                     baseInstance, instanceCount});
     }

     // Appends a DrawIndexedInstanced command.
     void drawIndexedInstanced(PrimitiveType type,
                               unsigned int baseIndex, unsigned int indexCount,
                               unsigned int baseVertex, unsigned int baseInstance,
                               unsigned int instanceCount) override {
         fPass->fCommands.emplace_back(DrawIndexedInstanced{type, baseIndex, indexCount, baseVertex,
                                                            baseInstance, instanceCount});
     }

 private:
     // Pass whose command list receives the recorded commands.
     DrawPass* fPass;
 };

 ///////////////////////////////////////////////////////////////////////////////////////////////////

 namespace {

 class UniformBindingCache {
 public:
     UniformBindingCache(DrawBufferManager* bufferMgr, UniformCache* cache)
             : fBufferMgr(bufferMgr), fCache(cache) {}

     uint32_t addUniforms(std::unique_ptr<SkUniformBlock> uniformBlock) {
         if (!uniformBlock || uniformBlock->empty()) {
             return UniformCache::kInvalidUniformID;
         }

         uint32_t index = fCache->insert(std::move(uniformBlock));
         if (fBindings.find(index) == fBindings.end()) {
             SkUniformBlock* tmp = fCache->lookup(index);
             // First time encountering this data, so upload to the GPU
             size_t totalDataSize = tmp->totalSize();
             auto [writer, bufferInfo] = fBufferMgr->getUniformWriter(totalDataSize);
             for (auto& u : *tmp) {
                 writer.write(u->data(), u->dataSize());
             }

             fBindings.insert({index, bufferInfo});
         }

         return index;
     }

     BindBufferInfo getBinding(uint32_t uniformIndex) {
         auto lookup = fBindings.find(uniformIndex);
         SkASSERT(lookup != fBindings.end());
         return lookup->second;
     }

 private:
     DrawBufferManager* fBufferMgr;
     UniformCache*      fCache;

     std::unordered_map<uint32_t, BindBufferInfo> fBindings;
 };

 // Hasher for std::unordered_map keyed by GraphicsPipelineDesc*: hashes the description the
 // pointer refers to (via GraphicsPipelineDesc::Hash) rather than the pointer value.
 struct Hash {
     size_t operator()(const skgpu::GraphicsPipelineDesc* desc) const noexcept {
         skgpu::GraphicsPipelineDesc::Hash pointeeHasher;
         return pointeeHasher(*desc);
     }
 };

 // Key-equality for std::unordered_map keyed by GraphicsPipelineDesc*: two keys match when the
 // descriptions they point to compare equal, regardless of the addresses.
 struct Eq {
     bool operator()(const skgpu::GraphicsPipelineDesc* a,
                     const skgpu::GraphicsPipelineDesc* b) const noexcept {
         const skgpu::GraphicsPipelineDesc& lhs = *a;
         const skgpu::GraphicsPipelineDesc& rhs = *b;
         return lhs == rhs;
     }
 };

 } // anonymous namespace

 // Constructs an empty pass for 'target' with the given load/store ops and clear color.
 // 'renderStepCount' is only used as a sizing hint: it determines the first argument passed to
 // the fCommands constructor (at least 1, otherwise a quarter of the count) and the amount
 // reserved in fPipelineDescs and fCommands. The bounds start out empty.
 DrawPass::DrawPass(sk_sp<TextureProxy> target,
                    std::pair<LoadOp, StoreOp> ops,
                    std::array<float, 4> clearColor,
                    int renderStepCount)
         : fCommands(std::max(1, renderStepCount / 4), SkBlockAllocator::GrowthPolicy::kFibonacci)
         , fTarget(std::move(target))
         , fBounds(SkIRect::MakeEmpty())
         , fOps(ops)
         , fClearColor(clearColor) {
     // TODO: Tune this estimate and the above "itemPerBlock" value for the command buffer sequence
     // After merging, etc. one pipeline per recorded draw+step combo is likely unnecessary.
     fPipelineDescs.reserve(renderStepCount);
     fCommands.reserve(renderStepCount);
 }

 // Defaulted out-of-line in this file.
 DrawPass::~DrawPass() = default;

 /**
  * Converts a DrawList into a DrawPass that renders to 'target'.
  *
  * The conversion proceeds in phases:
  *  1. For each draw that is not rejected by 'occlusionCuller' (when one is provided), the paint's
  *     uniforms are extracted and registered, and one SortKey is created per RenderStep of the
  *     draw's Renderer. The draw's bounds, depth/stencil flags and MSAA requirement are
  *     accumulated on the pass.
  *  2. The keys are sorted with std::sort.
  *  3. The sorted keys are walked in order. Pipeline binds, uniform buffer binds and scissor
  *     changes are appended to the pass's command list when they differ from the previously
  *     recorded state, and each RenderStep writes its vertex data through a DrawWriter.
  *  4. The accumulated bounds are rounded out and stored as the pass's integer bounds.
  *
  * 'ops' and 'clearColor' are forwarded to the DrawPass constructor. 'recorder' supplies the
  * DrawBufferManager, the uniform cache used for shading uniforms, and the shader code
  * dictionary. 'draws' is consumed by value; the returned pass does not keep it.
  */
 std::unique_ptr<DrawPass> DrawPass::Make(Recorder* recorder,
                                          std::unique_ptr<DrawList> draws,
                                          sk_sp<TextureProxy> target,
                                          std::pair<LoadOp, StoreOp> ops,
                                          std::array<float, 4> clearColor,
                                          const BoundsManager* occlusionCuller) {
     // NOTE: This assert is here to ensure SortKey is as tightly packed as possible. Any change to
     // its size should be done with care and good reason. The performance of sorting the keys is
     // heavily tied to the total size.
     //
     // At 24 bytes (current), sorting is about 30% slower than if SortKey could be packed into just
     // 16 bytes. There are several ways this could be done if necessary:
     //  - Restricting the max draw count to 16k (14-bits) and only using a single index to refer to
     //    the uniform data => 8 bytes of key, 8 bytes of pointer.
     //  - Restrict the max draw count to 32k (15-bits), use a single uniform index, and steal the
     //    4 low bits from the Draw* pointer since it's 16 byte aligned.
     //  - Compact the Draw* to an index into the original collection, although that has extra
     //    indirection and does not work as well with SkTBlockList.
     // In pseudo tests, manipulating the pointer or having to mask out indices was about 15% slower
     // than an 8 byte key and unmodified pointer.
     static_assert(sizeof(DrawPass::SortKey) == 16 + sizeof(void*));

     // The DrawList is converted directly into the DrawPass' data structures, but once the DrawPass
     // is returned from Make(), it is considered immutable.
     std::unique_ptr<DrawPass> drawPass(new DrawPass(std::move(target), ops, clearColor,
                                                     draws->renderStepCount()));

     // Grown by join() with the bounds of every draw that survives culling.
     Rect passBounds = Rect::InfiniteInverted();

     // Geometry uniforms are tracked in a cache local to this conversion, whereas shading uniforms
     // go through the Recorder's uniform cache. Both upload through the same buffer manager.
     DrawBufferManager* bufferMgr = recorder->drawBufferManager();
     UniformCache geometryUniforms;
     UniformBindingCache geometryUniformBindings(bufferMgr, &geometryUniforms);
     UniformBindingCache shadingUniformBindings(bufferMgr, recorder->uniformCache());

     // Maps a pipeline description (compared by value through Hash/Eq) to its index in
     // drawPass->fPipelineDescs. The stored keys point at elements of that list.
     std::unordered_map<const GraphicsPipelineDesc*, uint32_t, Hash, Eq> pipelineDescToIndex;

     std::vector<SortKey> keys;
     keys.reserve(draws->renderStepCount()); // will not exceed but may use less with occluded draws

     for (const DrawList::Draw& draw : draws->fDraws.items()) {
         if (occlusionCuller && occlusionCuller->isOccluded(draw.fClip.drawBounds(),
                                                            draw.fOrder.depth())) {
             continue;
         }

         // If we have two different descriptors, such that the uniforms from the PaintParams can be
         // bound independently of those used by the rest of the RenderStep, then we can upload now
         // and remember the location for re-use on any RenderStep that does shading.
         SkUniquePaintParamsID shaderID;
         std::unique_ptr<SkUniformBlock> shadingUniforms;
         uint32_t shadingIndex = UniformCache::kInvalidUniformID;
         if (draw.fPaintParams.has_value()) {
             SkShaderCodeDictionary* dict = recorder->resourceProvider()->shaderCodeDictionary();
             std::tie(shaderID, shadingUniforms) = ExtractPaintData(dict, draw.fPaintParams.value());
             shadingIndex = shadingUniformBindings.addUniforms(std::move(shadingUniforms));
         } // else depth-only

         // One SortKey is produced per RenderStep of the draw's Renderer.
         for (int stepIndex = 0; stepIndex < draw.fRenderer.numRenderSteps(); ++stepIndex) {
             const RenderStep* const step = draw.fRenderer.steps()[stepIndex];
             const bool performsShading = draw.fPaintParams.has_value() && step->performsShading();

             SkUniquePaintParamsID stepShaderID;
             uint32_t stepShadingIndex = UniformCache::kInvalidUniformID;
             if (performsShading) {
                 stepShaderID = shaderID;
                 stepShadingIndex = shadingIndex;
             } // else depth-only draw or stencil-only step of renderer so no shading is needed

             uint32_t geometryIndex = UniformCache::kInvalidUniformID;
             if (step->numUniforms() > 0) {
                 // TODO: Get layout from the GPU
                 auto uniforms = step->writeUniforms(Layout::kMetal,
                                                     draw.fClip.scissor(),
                                                     draw.fTransform,
                                                     draw.fShape);

                 geometryIndex = geometryUniformBindings.addUniforms(
                         std::make_unique<SkUniformBlock>(std::move(uniforms)));
             }

             // 'desc' is a stack temporary used as the lookup key. On a miss it is pushed into the
             // pass's list and the map is keyed by the address of the stored element instead.
             // NOTE(review): this relies on fPipelineDescs keeping element addresses stable as it
             // grows - confirm against the container's guarantees.
             GraphicsPipelineDesc desc;
             desc.setProgram(step, stepShaderID);
             uint32_t pipelineIndex = 0;
             auto pipelineLookup = pipelineDescToIndex.find(&desc);
             if (pipelineLookup == pipelineDescToIndex.end()) {
                 // Assign new index to first appearance of this pipeline description
                 pipelineIndex = SkTo<uint32_t>(drawPass->fPipelineDescs.count());
                 const GraphicsPipelineDesc& finalDesc = drawPass->fPipelineDescs.push_back(desc);
                 pipelineDescToIndex.insert({&finalDesc, pipelineIndex});
             } else {
                 // Reuse the existing pipeline description for better batching after sorting
                 pipelineIndex = pipelineLookup->second;
             }

             keys.push_back({&draw, stepIndex, pipelineIndex, geometryIndex, stepShadingIndex});
         }

         // Per-draw (not per-step) accumulation of pass-wide properties.
         passBounds.join(draw.fClip.drawBounds());
         drawPass->fDepthStencilFlags |= draw.fRenderer.depthStencilFlags();
         drawPass->fRequiresMSAA |= draw.fRenderer.requiresMSAA();
     }

     // TODO: Explore sorting algorithms; in all likelihood this will be mostly sorted already, so
     // algorithms that approach O(n) in that condition may be favorable. Alternatively, could
     // explore radix sort that is always O(n). Brief testing suggested std::sort was faster than
     // std::stable_sort and SkTQSort on my [ml]'s Windows desktop. Also worth considering in-place
     // vs. algorithms that require an extra O(n) storage.
     // TODO: It's not strictly necessary, but would a stable sort be useful or just end up hiding
     // bugs in the DrawOrder determination code?
     std::sort(keys.begin(), keys.end());

     // Used to record vertex/instance data, buffer binds, and draw calls
     Drawer drawer(drawPass.get());
     DrawWriter drawWriter(&drawer, bufferMgr);

     // Used to track when a new pipeline or dynamic state needs recording between draw steps.
     // Setting to # render steps ensures the very first time through the loop will bind a pipeline.
     uint32_t lastPipeline = draws->renderStepCount();
     uint32_t lastShadingUniforms = UniformCache::kInvalidUniformID;
     uint32_t lastGeometryUniforms = UniformCache::kInvalidUniformID;
     SkIRect lastScissor = SkIRect::MakeSize(drawPass->fTarget->dimensions());

     for (const SortKey& key : keys) {
         const DrawList::Draw& draw = *key.draw();
         const RenderStep& renderStep = key.renderStep();

         // NOTE(review): stateChange is computed before the pipeline-change branch below resets
         // the last*Uniforms trackers. If the pipeline changes while this key's uniform indices
         // and scissor match the previous key's, no uniform binds are recorded after the new
         // pipeline bind - confirm that previously bound buffers are meant to carry over.
         const bool pipelineChange = key.pipeline() != lastPipeline;
         const bool stateChange = key.geometryUniforms() != lastGeometryUniforms ||
                                  key.shadingUniforms() != lastShadingUniforms ||
                                  draw.fClip.scissor() != lastScissor;

         // Update DrawWriter *before* we actually change any state so that accumulated draws from
         // the previous state use the proper state.
         if (pipelineChange) {
             drawWriter.newPipelineState(renderStep.primitiveType(),
                                         renderStep.vertexStride(),
                                         renderStep.instanceStride());
         } else if (stateChange) {
             drawWriter.newDynamicState();
         }

         // Make state changes before accumulating new draw data
         if (pipelineChange) {
             drawPass->fCommands.emplace_back(BindGraphicsPipeline{key.pipeline()});
             lastPipeline = key.pipeline();
             lastShadingUniforms = UniformCache::kInvalidUniformID;
             lastGeometryUniforms = UniformCache::kInvalidUniformID;
         }
         if (stateChange) {
             // An invalid uniform index records no bind command but still updates the tracker.
             if (key.geometryUniforms() != lastGeometryUniforms) {
                 if (key.geometryUniforms() != UniformCache::kInvalidUniformID) {
                     auto binding = geometryUniformBindings.getBinding(key.geometryUniforms());
                     drawPass->fCommands.emplace_back(
                             BindUniformBuffer{binding, UniformSlot::kRenderStep});
                 }
                 lastGeometryUniforms = key.geometryUniforms();
             }
             if (key.shadingUniforms() != lastShadingUniforms) {
                 if (key.shadingUniforms() != UniformCache::kInvalidUniformID) {
                     auto binding = shadingUniformBindings.getBinding(key.shadingUniforms());
                     drawPass->fCommands.emplace_back(
                             BindUniformBuffer{binding, UniformSlot::kPaint});
                 }
                 lastShadingUniforms = key.shadingUniforms();
             }
             if (draw.fClip.scissor() != lastScissor) {
                 drawPass->fCommands.emplace_back(SetScissor{draw.fClip.scissor()});
                 lastScissor = draw.fClip.scissor();
             }
         }

         renderStep.writeVertices(&drawWriter, draw.fClip.scissor(), draw.fTransform, draw.fShape);
     }
     // Finish recording draw calls for any collected data at the end of the loop
     drawWriter.flush();

     // NOTE(review): when every draw was culled (or the list was empty) passBounds is still the
     // value returned by Rect::InfiniteInverted() - confirm that rounding it out and casting the
     // edges to int is well-defined in that case.
     passBounds.roundOut();
     drawPass->fBounds = SkIRect::MakeLTRB((int) passBounds.left(), (int) passBounds.top(),
                                           (int) passBounds.right(), (int) passBounds.bot());
     return drawPass;
 }

 // Replays this pass's recorded commands into 'buffer'. Each GraphicsPipelineDesc of the pass is
 // first resolved into a GraphicsPipeline through 'resourceProvider' using 'renderPassDesc', the
 // viewport is set to the target's full dimensions, and then every command is forwarded to the
 // matching CommandBuffer call in recording order.
 void DrawPass::addCommands(ResourceProvider* resourceProvider,
                            CommandBuffer* buffer,
                            const RenderPassDesc& renderPassDesc) const {
     // TODO: Validate RenderPass state against DrawPass's target and requirements?

     // Resolved pipelines live in a vector (rather than an SkTBlockList) so the index stored in a
     // BindGraphicsPipeline command can be looked up with fast random access.
     std::vector<sk_sp<GraphicsPipeline>> resolvedPipelines;
     resolvedPipelines.reserve(fPipelineDescs.count());
     for (const GraphicsPipelineDesc& desc : fPipelineDescs.items()) {
         auto pipeline = resourceProvider->findOrCreateGraphicsPipeline(desc, renderPassDesc);
         resolvedPipelines.push_back(std::move(pipeline));
     }

     // The viewport covers the entire texture for now (eventually, there may be logically smaller
     // bounds within an approx-sized texture). It is assumed that this also configures the
     // sk_rtAdjust intrinsic for programs (however the backend chooses to do so).
     const auto targetSize = fTarget->dimensions();
     buffer->setViewport(0, 0, targetSize.width(), targetSize.height());

     for (const Command& cmd : fCommands.items()) {
         switch (cmd.fType) {
             case CommandType::kBindGraphicsPipeline: {
                 const auto& bind = cmd.fBindGraphicsPipeline;
                 buffer->bindGraphicsPipeline(resolvedPipelines[bind.fPipelineIndex]);
                 break;
             }
             case CommandType::kBindUniformBuffer: {
                 const auto& bind = cmd.fBindUniformBuffer;
                 buffer->bindUniformBuffer(bind.fSlot,
                                           sk_ref_sp(bind.fInfo.fBuffer),
                                           bind.fInfo.fOffset);
                 break;
             }
             case CommandType::kBindDrawBuffers: {
                 const auto& bind = cmd.fBindDrawBuffers;
                 buffer->bindDrawBuffers(bind.fVertices, bind.fInstances, bind.fIndices);
                 break;
             }
             case CommandType::kDraw: {
                 const auto& call = cmd.fDraw;
                 buffer->draw(call.fType, call.fBaseVertex, call.fVertexCount);
                 break;
             }
             case CommandType::kDrawIndexed: {
                 const auto& call = cmd.fDrawIndexed;
                 buffer->drawIndexed(call.fType,
                                     call.fBaseIndex,
                                     call.fIndexCount,
                                     call.fBaseVertex);
                 break;
             }
             case CommandType::kDrawInstanced: {
                 const auto& call = cmd.fDrawInstanced;
                 buffer->drawInstanced(call.fType,
                                       call.fBaseVertex,
                                       call.fVertexCount,
                                       call.fBaseInstance,
                                       call.fInstanceCount);
                 break;
             }
             case CommandType::kDrawIndexedInstanced: {
                 const auto& call = cmd.fDrawIndexedInstanced;
                 buffer->drawIndexedInstanced(call.fType,
                                              call.fBaseIndex,
                                              call.fIndexCount,
                                              call.fBaseVertex,
                                              call.fBaseInstance,
                                              call.fInstanceCount);
                 break;
             }
             case CommandType::kSetScissor: {
                 const SkIRect& rect = cmd.fSetScissor.fScissor;
                 buffer->setScissor(rect.fLeft, rect.fTop, rect.fRight, rect.fBottom);
                 break;
             }
         }
     }
 }

 } // namespace skgpu
	/*
	* Copyright 2021 Google LLC
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/

	#include "experimental/graphite/src/DrawPass.h"

	#include "experimental/graphite/include/GraphiteTypes.h"
	#include "experimental/graphite/include/Recorder.h"
	#include "experimental/graphite/src/Buffer.h"
	#include "experimental/graphite/src/ContextPriv.h"
	#include "experimental/graphite/src/DrawBufferManager.h"
	#include "experimental/graphite/src/DrawContext.h"
	#include "experimental/graphite/src/DrawList.h"
	#include "experimental/graphite/src/DrawWriter.h"
	#include "experimental/graphite/src/GlobalCache.h"
	#include "experimental/graphite/src/GraphicsPipeline.h"
	#include "experimental/graphite/src/GraphicsPipelineDesc.h"
	#include "experimental/graphite/src/Renderer.h"
	#include "experimental/graphite/src/ResourceProvider.h"
	#include "experimental/graphite/src/TextureProxy.h"
	#include "experimental/graphite/src/UniformCache.h"
	#include "experimental/graphite/src/UniformManager.h"
	#include "experimental/graphite/src/geom/BoundsManager.h"

	#include "src/core/SkMathPriv.h"
	#include "src/core/SkTBlockList.h"
	#include "src/core/SkUniformData.h"
	#include "src/gpu/BufferWriter.h"

	#include <algorithm>
	#include <unordered_map>

	namespace skgpu {

	/**
	 * Helper describing one packed field inside a uint64_t.
	 *
	 * 'Bits' is the width of the field and 'Offset' is the position of its least-significant bit
	 * within the 64-bit word. set() truncates a value to the field width and shifts it into place
	 * so several fields can be OR'ed together; get() extracts the field back out of a packed word.
	 */
	template <uint64_t Bits, uint64_t Offset>
	struct Bitfield {
	    // get()/set() exchange values as uint32_t, so a field can be at most 32 bits wide, and it
	    // must lie entirely inside the 64-bit word. These checks also keep the shift used to build
	    // kMask well-defined.
	    static_assert(Bits > 0 && Bits <= 32, "Bitfield width must be in [1, 32]");
	    static_assert(Offset + Bits <= 64, "Bitfield must fit within a uint64_t");

	    static constexpr uint64_t kMask = (uint64_t{1} << Bits) - 1;
	    static constexpr uint64_t kOffset = Offset;
	    static constexpr uint64_t kBits = Bits;

	    // Extracts this field from the packed word 'v'.
	    static constexpr uint32_t get(uint64_t v) {
	        return static_cast<uint32_t>((v >> kOffset) & kMask);
	    }

	    // Returns 'v' truncated to kBits and shifted into this field's position.
	    static constexpr uint64_t set(uint32_t v) { return (v & kMask) << kOffset; }
	};

	/**
	* Each Draw in a DrawList might be processed by multiple RenderSteps (determined by the Draw's
	* Renderer), which can be sorted independently. Each (step, draw) pair produces its own SortKey.
	*
	* The goal of sorting draws for the DrawPass is to minimize pipeline transitions and dynamic binds
	* within a pipeline, while still respecting the overall painter's order. This decreases the number
	* of low-level draw commands in a command buffer and increases the size of those, allowing the GPU
	* to operate more efficiently and have fewer bubbles within its own instruction stream.
	*
	* The Draw's CompressedPaintersOrder and DisjointStencilIndex represent the most significant bits
	* of the key, and are shared by all SortKeys produced by the same draw. Next, the pipeline
	* description is encoded in two steps:
	* 1. The index of the RenderStep packed in the high bits to ensure each step for a draw is
	* ordered correctly.
	* 2. An index into a cache of pipeline descriptions is used to encode the identity of the
	* pipeline (SortKeys that differ in the bits from #1 necessarily would have different
	* descriptions, but then the specific ordering of the RenderSteps isn't enforced).
	* Last, the SortKey encodes an index into the set of uniform bindings accumulated for a DrawPass.
	* This allows the SortKey to cluster draw steps that have both a compatible pipeline and do not
	* require rebinding uniform data or other state (e.g. scissor). Since the uniform data index and
	* the pipeline description index are packed into indices and not actual pointers, a given SortKey
	* is only valid for a specific DrawList->DrawPass conversion.
	*/
	class DrawPass::SortKey {
	public:
	    // Packs the sort fields for one (draw, render step) pair.
	    //  - draw:                the originating draw; supplies the painter's order and stencil
	    //                         index bits and is kept as a non-owning back-pointer.
	    //  - renderStep:          index of the step within draw->fRenderer.steps().
	    //  - pipelineIndex:       caller-assigned index identifying the pipeline description.
	    //  - geomUniformIndex:    caller-assigned index identifying the step's uniform data.
	    //  - shadingUniformIndex: caller-assigned index identifying the paint's uniform data.
	    // Every value is truncated to the width of its field (see the Bitfield aliases below).
	    SortKey(const DrawList::Draw* draw,
	            int renderStep,
	            uint32_t pipelineIndex,
	            uint32_t geomUniformIndex,
	            uint32_t shadingUniformIndex)
	        : fPipelineKey(ColorDepthOrderField::set(draw->fOrder.paintOrder().bits()) |
	                       StencilIndexField::set(draw->fOrder.stencilIndex().bits())  |
	                       RenderStepField::set(static_cast<uint32_t>(renderStep))     |
	                       PipelineField::set(pipelineIndex))
	        , fUniformKey(GeometryUniformField::set(geomUniformIndex) |
	                      ShadingUniformField::set(shadingUniformIndex))
	        , fDraw(draw) {
	        // renderStep() below uses this value to index draw->fRenderer.steps(), so it has to
	        // be a valid index; a value equal to numRenderSteps() is already out of range.
	        SkASSERT(renderStep >= 0 && renderStep < draw->fRenderer.numRenderSteps());
	    }

	    // Orders by the pipeline word first and falls back to the uniform word on ties, i.e. the
	    // two words are compared as a single 128-bit value with fPipelineKey most significant.
	    bool operator<(const SortKey& k) const {
	        return fPipelineKey < k.fPipelineKey ||
	               (fPipelineKey == k.fPipelineKey && fUniformKey < k.fUniformKey);
	    }

	    // The RenderStep of the draw's Renderer selected by the packed step index.
	    const RenderStep& renderStep() const {
	        return *fDraw->fRenderer.steps()[RenderStepField::get(fPipelineKey)];
	    }

	    // The draw that produced this key.
	    const DrawList::Draw* draw() const { return fDraw; }

	    // Accessors for the indices that were packed by the constructor.
	    uint32_t pipeline() const { return PipelineField::get(fPipelineKey); }
	    uint32_t geometryUniforms() const { return GeometryUniformField::get(fUniformKey); }
	    uint32_t shadingUniforms() const { return ShadingUniformField::get(fUniformKey); }

	private:
	    // Fields are ordered from most-significant to least when sorting by 128-bit value.
	    // NOTE: We don't use bit fields because field ordering is implementation defined and we
	    // need to sort consistently.
	    using ColorDepthOrderField = Bitfield<16, 48>; // sizeof(CompressedPaintersOrder)
	    using StencilIndexField = Bitfield<16, 32>; // sizeof(DisjointStencilIndex)
	    using RenderStepField = Bitfield<2, 30>; // bits >= log2(Renderer::kMaxRenderSteps)
	    using PipelineField = Bitfield<30, 0>; // bits >= log2(max steps*DrawList::kMaxDraws)
	    uint64_t fPipelineKey;

	    using GeometryUniformField = Bitfield<32, 32>; // bits >= log2(max steps * max draw count)
	    using ShadingUniformField = Bitfield<32, 0>; // ""
	    uint64_t fUniformKey;

	    // Backpointer to the draw that produced the sort key
	    const DrawList::Draw* fDraw;

	    // NOTE(review): the first two asserts compare a bit count against sizeof(), which is in
	    // bytes, so they are weaker than they read; presumably 8 * sizeof(...) was intended.
	    // Confirm against the definitions of the two order types before tightening.
	    static_assert(ColorDepthOrderField::kBits >= sizeof(CompressedPaintersOrder));
	    static_assert(StencilIndexField::kBits >= sizeof(DisjointStencilIndex));
	    static_assert(RenderStepField::kBits >= SkNextLog2_portable(Renderer::kMaxRenderSteps));
	    static_assert(PipelineField::kBits >=
	                  SkNextLog2_portable(Renderer::kMaxRenderSteps * DrawList::kMaxDraws));
	    static_assert(GeometryUniformField::kBits >= PipelineField::kBits);
	    static_assert(ShadingUniformField::kBits >= PipelineField::kBits);
	};

	// DrawDispatcher implementation that records every dispatched call as a command appended to
	// the command list of the DrawPass it was constructed with. The pass is referenced, not owned.
	class DrawPass::Drawer final : public DrawDispatcher {
	public:
	    // 'explicit' so that a DrawPass* is never silently converted into a Drawer.
	    explicit Drawer(DrawPass* drawPass) : fPass(drawPass) {}
	    ~Drawer() override = default;

	    // Appends a BindDrawBuffers command carrying the three buffer bindings.
	    void bindDrawBuffers(BindBufferInfo vertexAttribs,
	                         BindBufferInfo instanceAttribs,
	                         BindBufferInfo indices) override {
	        fPass->fCommands.emplace_back(BindDrawBuffers{vertexAttribs, instanceAttribs, indices});
	    }

	    // Appends a Draw command.
	    void draw(PrimitiveType type, unsigned int baseVertex, unsigned int vertexCount) override {
	        fPass->fCommands.emplace_back(Draw{type, baseVertex, vertexCount});
	    }

	    // Appends a DrawIndexed command.
	    void drawIndexed(PrimitiveType type, unsigned int baseIndex,
	                     unsigned int indexCount, unsigned int baseVertex) override {
	        fPass->fCommands.emplace_back(DrawIndexed{type, baseIndex, indexCount, baseVertex});
	    }

	    // Appends a DrawInstanced command.
	    void drawInstanced(PrimitiveType type,
	                       unsigned int baseVertex, unsigned int vertexCount,
	                       unsigned int baseInstance, unsigned int instanceCount) override {
	        fPass->fCommands.emplace_back(DrawInstanced{type, baseVertex, vertexCount,
	                                                    baseInstance, instanceCount});
	    }

	    // Appends a DrawIndexedInstanced command.
	    void drawIndexedInstanced(PrimitiveType type,
	                              unsigned int baseIndex, unsigned int indexCount,
	                              unsigned int baseVertex, unsigned int baseInstance,
	                              unsigned int instanceCount) override {
	        fPass->fCommands.emplace_back(DrawIndexedInstanced{type, baseIndex, indexCount,
	                                                           baseVertex, baseInstance,
	                                                           instanceCount});
	    }

	private:
	    // Pass whose command list receives the recorded commands.
	    DrawPass* fPass;
	};

	///////////////////////////////////////////////////////////////////////////////////////////////////

	namespace {

	// Writes uniform blocks into buffers obtained from a DrawBufferManager the first time they are
	// seen and remembers the resulting binding. Bindings are keyed by the index that
	// UniformCache::insert() returns for the block. Neither pointer is owned.
	class UniformBindingCache {
	public:
	    UniformBindingCache(DrawBufferManager* bufferMgr, UniformCache* cache)
	            : fBufferMgr(bufferMgr), fCache(cache) {}

	    // Registers 'uniformBlock' with the cache and returns its index. The block's data is
	    // written to a uniform buffer only if no binding has been recorded for that index yet.
	    // A null or empty block returns UniformCache::kInvalidUniformID without touching the cache.
	    uint32_t addUniforms(std::unique_ptr<SkUniformBlock> uniformBlock) {
	        if (!uniformBlock || uniformBlock->empty()) {
	            return UniformCache::kInvalidUniformID;
	        }

	        uint32_t index = fCache->insert(std::move(uniformBlock));
	        if (fBindings.find(index) == fBindings.end()) {
	            SkUniformBlock* tmp = fCache->lookup(index);
	            // First time encountering this data, so upload to the GPU
	            size_t totalDataSize = tmp->totalSize();
	            auto [writer, bufferInfo] = fBufferMgr->getUniformWriter(totalDataSize);
	            for (auto& u : *tmp) {
	                writer.write(u->data(), u->dataSize());
	            }

	            fBindings.insert({index, bufferInfo});
	        }

	        return index;
	    }

	    // Returns the binding recorded by addUniforms() for 'uniformIndex'; asserts it exists.
	    BindBufferInfo getBinding(uint32_t uniformIndex) {
	        auto lookup = fBindings.find(uniformIndex);
	        SkASSERT(lookup != fBindings.end());
	        return lookup->second;
	    }

	private:
	    DrawBufferManager* fBufferMgr;
	    UniformCache* fCache;

	    // Uniform index -> buffer range holding that block's data.
	    std::unordered_map<uint32_t, BindBufferInfo> fBindings;
	};

	// std::unordered_map hash and equality functors for GraphicsPipelineDesc* keys that de-reference
	// the pointers, so keys are hashed and compared by the descriptions they point to.
	// Hash functor for GraphicsPipelineDesc* keys: hashes the pointed-to description by delegating
	// to GraphicsPipelineDesc::Hash. 'desc' is dereferenced unconditionally.
	struct Hash {
	    size_t operator()(const skgpu::GraphicsPipelineDesc* desc) const noexcept {
	        const skgpu::GraphicsPipelineDesc::Hash valueHasher;
	        return valueHasher(*desc);
	    }
	};

	// Equality functor for GraphicsPipelineDesc* keys, paired with Hash. It compares the pointed-to
	// descriptions, not the pointer addresses: the map is probed with the address of a temporary
	// description, which never equals the address of a stored one, so address comparison would make
	// every lookup miss. Both pointers are dereferenced unconditionally.
	struct Eq {
	    bool operator()(const skgpu::GraphicsPipelineDesc* a,
	                    const skgpu::GraphicsPipelineDesc* b) const noexcept {
	        return *a == *b;
	    }
	};

	} // anonymous namespace

	// Sets up an empty pass for 'target'. 'renderStepCount' is only used as a sizing hint: it picks
	// the command list's initial block size (a quarter of the count, never less than one) and the
	// amount of storage reserved up front for commands and pipeline descriptions.
	DrawPass::DrawPass(sk_sp<TextureProxy> target,
	                   std::pair<LoadOp, StoreOp> ops,
	                   std::array<float, 4> clearColor,
	                   int renderStepCount)
	        : fCommands(std::max(renderStepCount / 4, 1), SkBlockAllocator::GrowthPolicy::kFibonacci)
	        , fTarget(std::move(target))
	        , fBounds(SkIRect::MakeEmpty())
	        , fOps(ops)
	        , fClearColor(clearColor) {
	    // TODO: Tune this estimate and the above "itemPerBlock" value for the command buffer sequence
	    // After merging, etc. one pipeline per recorded draw+step combo is likely unnecessary.
	    fCommands.reserve(renderStepCount);
	    fPipelineDescs.reserve(renderStepCount);
	}

	// Defaulted destructor, defined out-of-line in this file.
	// NOTE(review): presumably kept here so member types only need to be complete in this
	// translation unit -- confirm against DrawPass.h.
	DrawPass::~DrawPass() = default;

	/**
	 * Converts 'draws' into a DrawPass that renders into 'target'.
	 *
	 * The conversion runs in three phases:
	 *  1. Every draw not reported as occluded by 'occlusionCuller' (which may be null) is expanded
	 *     into one SortKey per RenderStep of its Renderer. Uniform data is registered with the
	 *     binding caches and each distinct GraphicsPipelineDesc is assigned an index into the pass's
	 *     fPipelineDescs.
	 *  2. The keys are sorted with std::sort.
	 *  3. The sorted keys are walked in order, recording pipeline binds, uniform buffer binds and
	 *     scissor changes whenever they differ from the previous key, and writing each step's
	 *     vertex data through a DrawWriter that appends draw commands to the pass.
	 *
	 * The returned pass's bounds are the rounded-out union of the draw bounds of all draws that were
	 * not culled; its depth/stencil flags and MSAA requirement are OR-ed together from those draws'
	 * Renderers.
	 */
	std::unique_ptr<DrawPass> DrawPass::Make(Recorder* recorder,
	                                         std::unique_ptr<DrawList> draws,
	                                         sk_sp<TextureProxy> target,
	                                         std::pair<LoadOp, StoreOp> ops,
	                                         std::array<float, 4> clearColor,
	                                         const BoundsManager* occlusionCuller) {
	    // NOTE: This assert is here to ensure SortKey is as tightly packed as possible. Any change to
	    // its size should be done with care and good reason. The performance of sorting the keys is
	    // heavily tied to the total size.
	    //
	    // At 24 bytes (current), sorting is about 30% slower than if SortKey could be packed into just
	    // 16 bytes. There are several ways this could be done if necessary:
	    //  - Restricting the max draw count to 16k (14-bits) and only using a single index to refer to
	    //    the uniform data => 8 bytes of key, 8 bytes of pointer.
	    //  - Restrict the max draw count to 32k (15-bits), use a single uniform index, and steal the
	    //    4 low bits from the Draw* pointer since it's 16 byte aligned.
	    //  - Compact the Draw* to an index into the original collection, although that has extra
	    //    indirection and does not work as well with SkTBlockList.
	    // In pseudo tests, manipulating the pointer or having to mask out indices was about 15% slower
	    // than an 8 byte key and unmodified pointer.
	    static_assert(sizeof(DrawPass::SortKey) == 16 + sizeof(void*));

	    // The DrawList is converted directly into the DrawPass' data structures, but once the DrawPass
	    // is returned from Make(), it is considered immutable.
	    std::unique_ptr<DrawPass> drawPass(new DrawPass(std::move(target), ops, clearColor,
	                                                    draws->renderStepCount()));

	    Rect passBounds = Rect::InfiniteInverted();

	    DrawBufferManager* bufferMgr = recorder->drawBufferManager();
	    // Geometry uniforms are cached only for the duration of this call, while shading uniforms go
	    // through the cache owned by the Recorder.
	    UniformCache geometryUniforms;
	    UniformBindingCache geometryUniformBindings(bufferMgr, &geometryUniforms);
	    UniformBindingCache shadingUniformBindings(bufferMgr, recorder->uniformCache());

	    // Keys point at descriptions stored in drawPass->fPipelineDescs; Hash/Eq look through the
	    // pointers so a stack-allocated description can be used to probe the map.
	    std::unordered_map<const GraphicsPipelineDesc*, uint32_t, Hash, Eq> pipelineDescToIndex;

	    std::vector<SortKey> keys;
	    keys.reserve(draws->renderStepCount()); // will not exceed but may use less with occluded draws

	    for (const DrawList::Draw& draw : draws->fDraws.items()) {
	        if (occlusionCuller && occlusionCuller->isOccluded(draw.fClip.drawBounds(),
	                                                           draw.fOrder.depth())) {
	            continue;
	        }

	        // If we have two different descriptors, such that the uniforms from the PaintParams can be
	        // bound independently of those used by the rest of the RenderStep, then we can upload now
	        // and remember the location for re-use on any RenderStep that does shading.
	        SkUniquePaintParamsID shaderID;
	        std::unique_ptr<SkUniformBlock> shadingUniforms;
	        uint32_t shadingIndex = UniformCache::kInvalidUniformID;
	        if (draw.fPaintParams.has_value()) {
	            SkShaderCodeDictionary* dict = recorder->resourceProvider()->shaderCodeDictionary();
	            std::tie(shaderID, shadingUniforms) = ExtractPaintData(dict, draw.fPaintParams.value());
	            shadingIndex = shadingUniformBindings.addUniforms(std::move(shadingUniforms));
	        } // else depth-only

	        for (int stepIndex = 0; stepIndex < draw.fRenderer.numRenderSteps(); ++stepIndex) {
	            const RenderStep* const step = draw.fRenderer.steps()[stepIndex];
	            const bool performsShading = draw.fPaintParams.has_value() && step->performsShading();

	            SkUniquePaintParamsID stepShaderID;
	            uint32_t stepShadingIndex = UniformCache::kInvalidUniformID;
	            if (performsShading) {
	                stepShaderID = shaderID;
	                stepShadingIndex = shadingIndex;
	            } // else depth-only draw or stencil-only step of renderer so no shading is needed

	            uint32_t geometryIndex = UniformCache::kInvalidUniformID;
	            if (step->numUniforms() > 0) {
	                // TODO: Get layout from the GPU
	                auto uniforms = step->writeUniforms(Layout::kMetal,
	                                                    draw.fClip.scissor(),
	                                                    draw.fTransform,
	                                                    draw.fShape);

	                geometryIndex = geometryUniformBindings.addUniforms(
	                        std::make_unique<SkUniformBlock>(std::move(uniforms)));
	            }

	            GraphicsPipelineDesc desc;
	            desc.setProgram(step, stepShaderID);
	            uint32_t pipelineIndex = 0;
	            auto pipelineLookup = pipelineDescToIndex.find(&desc);
	            if (pipelineLookup == pipelineDescToIndex.end()) {
	                // Assign new index to first appearance of this pipeline description
	                pipelineIndex = SkTo<uint32_t>(drawPass->fPipelineDescs.count());
	                // The map key must be the copy stored in the pass, not the local 'desc', which
	                // goes out of scope at the end of this iteration.
	                const GraphicsPipelineDesc& finalDesc = drawPass->fPipelineDescs.push_back(desc);
	                pipelineDescToIndex.insert({&finalDesc, pipelineIndex});
	            } else {
	                // Reuse the existing pipeline description for better batching after sorting
	                pipelineIndex = pipelineLookup->second;
	            }

	            keys.push_back({&draw, stepIndex, pipelineIndex, geometryIndex, stepShadingIndex});
	        }

	        passBounds.join(draw.fClip.drawBounds());
	        drawPass->fDepthStencilFlags |= draw.fRenderer.depthStencilFlags();
	        drawPass->fRequiresMSAA |= draw.fRenderer.requiresMSAA();
	    }

	    // TODO: Explore sorting algorithms; in all likelihood this will be mostly sorted already, so
	    // algorithms that approach O(n) in that condition may be favorable. Alternatively, could
	    // explore radix sort that is always O(n). Brief testing suggested std::sort was faster than
	    // std::stable_sort and SkTQSort on my [ml]'s Windows desktop. Also worth considering in-place
	    // vs. algorithms that require an extra O(n) storage.
	    // TODO: It's not strictly necessary, but would a stable sort be useful or just end up hiding
	    // bugs in the DrawOrder determination code?
	    std::sort(keys.begin(), keys.end());

	    // Used to record vertex/instance data, buffer binds, and draw calls
	    Drawer drawer(drawPass.get());
	    DrawWriter drawWriter(&drawer, bufferMgr);

	    // Used to track when a new pipeline or dynamic state needs recording between draw steps.
	    // Setting to # render steps ensures the very first time through the loop will bind a pipeline.
	    uint32_t lastPipeline = draws->renderStepCount();
	    uint32_t lastShadingUniforms = UniformCache::kInvalidUniformID;
	    uint32_t lastGeometryUniforms = UniformCache::kInvalidUniformID;
	    SkIRect lastScissor = SkIRect::MakeSize(drawPass->fTarget->dimensions());

	    for (const SortKey& key : keys) {
	        const DrawList::Draw& draw = *key.draw();
	        const RenderStep& renderStep = key.renderStep();

	        const bool pipelineChange = key.pipeline() != lastPipeline;
	        const bool stateChange = key.geometryUniforms() != lastGeometryUniforms ||
	                                 key.shadingUniforms() != lastShadingUniforms ||
	                                 draw.fClip.scissor() != lastScissor;

	        // Update DrawWriter before we actually change any state so that accumulated draws from
	        // the previous state use the proper state.
	        if (pipelineChange) {
	            drawWriter.newPipelineState(renderStep.primitiveType(),
	                                        renderStep.vertexStride(),
	                                        renderStep.instanceStride());
	        } else if (stateChange) {
	            drawWriter.newDynamicState();
	        }

	        // Make state changes before accumulating new draw data
	        if (pipelineChange) {
	            drawPass->fCommands.emplace_back(BindGraphicsPipeline{key.pipeline()});
	            lastPipeline = key.pipeline();
	            // 'stateChange' was computed before these resets, so on their own they only influence
	            // the comparisons made for later keys (or the inner checks below when 'stateChange'
	            // is already true for this key).
	            lastShadingUniforms = UniformCache::kInvalidUniformID;
	            lastGeometryUniforms = UniformCache::kInvalidUniformID;
	        }
	        if (stateChange) {
	            if (key.geometryUniforms() != lastGeometryUniforms) {
	                // An invalid ID means this step has no geometry uniforms, so nothing is bound, but
	                // the tracker is still updated.
	                if (key.geometryUniforms() != UniformCache::kInvalidUniformID) {
	                    auto binding = geometryUniformBindings.getBinding(key.geometryUniforms());
	                    drawPass->fCommands.emplace_back(
	                            BindUniformBuffer{binding, UniformSlot::kRenderStep});
	                }
	                lastGeometryUniforms = key.geometryUniforms();
	            }
	            if (key.shadingUniforms() != lastShadingUniforms) {
	                if (key.shadingUniforms() != UniformCache::kInvalidUniformID) {
	                    auto binding = shadingUniformBindings.getBinding(key.shadingUniforms());
	                    drawPass->fCommands.emplace_back(
	                            BindUniformBuffer{binding, UniformSlot::kPaint});
	                }
	                lastShadingUniforms = key.shadingUniforms();
	            }
	            if (draw.fClip.scissor() != lastScissor) {
	                drawPass->fCommands.emplace_back(SetScissor{draw.fClip.scissor()});
	                lastScissor = draw.fClip.scissor();
	            }
	        }

	        renderStep.writeVertices(&drawWriter, draw.fClip.scissor(), draw.fTransform, draw.fShape);
	    }
	    // Finish recording draw calls for any collected data at the end of the loop
	    drawWriter.flush();

	    passBounds.roundOut();
	    drawPass->fBounds = SkIRect::MakeLTRB((int) passBounds.left(), (int) passBounds.top(),
	                                          (int) passBounds.right(), (int) passBounds.bot());
	    return drawPass;
	}

	/**
	 * Replays this pass's recorded commands into 'buffer'.
	 *
	 * Each GraphicsPipelineDesc stored in the pass is first turned into a GraphicsPipeline through
	 * 'resourceProvider' using 'renderPassDesc'. The viewport is then set to the full target size and
	 * the commands in fCommands are issued in their recorded order.
	 */
	void DrawPass::addCommands(ResourceProvider* resourceProvider,
	                           CommandBuffer* buffer,
	                           const RenderPassDesc& renderPassDesc) const {
	    // TODO: Validate RenderPass state against DrawPass's target and requirements?

	    // Build the actual GraphicsPipeline objects by combining the target-level properties with
	    // every GraphicsPipelineDesc referenced in this DrawPass. A vector is used rather than
	    // SkTBlockList so that the indexed lookups in the replay loop are fast.
	    std::vector<sk_sp<GraphicsPipeline>> pipelines;
	    pipelines.reserve(fPipelineDescs.count());
	    for (const GraphicsPipelineDesc& desc : fPipelineDescs.items()) {
	        pipelines.push_back(resourceProvider->findOrCreateGraphicsPipeline(desc, renderPassDesc));
	    }

	    // Set viewport to the entire texture for now (eventually, we may have logically smaller bounds
	    // within an approx-sized texture). It is assumed that this also configures the sk_rtAdjust
	    // intrinsic for programs (however the backend chooses to do so).
	    const auto targetSize = fTarget->dimensions();
	    buffer->setViewport(0, 0, targetSize.width(), targetSize.height());

	    for (const Command& cmd : fCommands.items()) {
	        switch (cmd.fType) {
	            case CommandType::kBindGraphicsPipeline: {
	                const auto& bind = cmd.fBindGraphicsPipeline;
	                // The recorded index refers to fPipelineDescs, which 'pipelines' mirrors.
	                buffer->bindGraphicsPipeline(pipelines[bind.fPipelineIndex]);
	                break;
	            }
	            case CommandType::kBindUniformBuffer: {
	                const auto& bind = cmd.fBindUniformBuffer;
	                buffer->bindUniformBuffer(bind.fSlot,
	                                          sk_ref_sp(bind.fInfo.fBuffer),
	                                          bind.fInfo.fOffset);
	                break;
	            }
	            case CommandType::kBindDrawBuffers: {
	                const auto& bind = cmd.fBindDrawBuffers;
	                buffer->bindDrawBuffers(bind.fVertices, bind.fInstances, bind.fIndices);
	                break;
	            }
	            case CommandType::kDraw: {
	                const auto& call = cmd.fDraw;
	                buffer->draw(call.fType, call.fBaseVertex, call.fVertexCount);
	                break;
	            }
	            case CommandType::kDrawIndexed: {
	                const auto& call = cmd.fDrawIndexed;
	                buffer->drawIndexed(call.fType, call.fBaseIndex, call.fIndexCount,
	                                    call.fBaseVertex);
	                break;
	            }
	            case CommandType::kDrawInstanced: {
	                const auto& call = cmd.fDrawInstanced;
	                buffer->drawInstanced(call.fType, call.fBaseVertex, call.fVertexCount,
	                                      call.fBaseInstance, call.fInstanceCount);
	                break;
	            }
	            case CommandType::kDrawIndexedInstanced: {
	                const auto& call = cmd.fDrawIndexedInstanced;
	                buffer->drawIndexedInstanced(call.fType, call.fBaseIndex, call.fIndexCount,
	                                             call.fBaseVertex, call.fBaseInstance,
	                                             call.fInstanceCount);
	                break;
	            }
	            case CommandType::kSetScissor: {
	                const auto& scissor = cmd.fSetScissor.fScissor;
	                // NOTE(review): the right and bottom edges are forwarded here; confirm that
	                // CommandBuffer::setScissor expects edges and not a width/height pair.
	                buffer->setScissor(scissor.fLeft, scissor.fTop, scissor.fRight, scissor.fBottom);
	                break;
	            }
	        }
	    }
	}

	} // namespace skgpu