| /* |
| * Copyright 2021 Google LLC |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "experimental/graphite/src/DrawPass.h" |
| |
| #include "experimental/graphite/include/GraphiteTypes.h" |
| #include "experimental/graphite/include/Recorder.h" |
| #include "experimental/graphite/src/Buffer.h" |
| #include "experimental/graphite/src/ContextPriv.h" |
| #include "experimental/graphite/src/ContextUtils.h" |
| #include "experimental/graphite/src/DrawBufferManager.h" |
| #include "experimental/graphite/src/DrawContext.h" |
| #include "experimental/graphite/src/DrawList.h" |
| #include "experimental/graphite/src/DrawWriter.h" |
| #include "experimental/graphite/src/GraphicsPipeline.h" |
| #include "experimental/graphite/src/GraphicsPipelineDesc.h" |
| #include "experimental/graphite/src/Renderer.h" |
| #include "experimental/graphite/src/ResourceProvider.h" |
| #include "experimental/graphite/src/TextureProxy.h" |
| #include "experimental/graphite/src/UniformCache.h" |
| #include "experimental/graphite/src/UniformManager.h" |
| #include "experimental/graphite/src/geom/BoundsManager.h" |
| |
| #include "src/core/SkMathPriv.h" |
| #include "src/core/SkTBlockList.h" |
| #include "src/gpu/BufferWriter.h" |
| |
| #include <algorithm> |
| #include <unordered_map> |
| |
| namespace skgpu { |
| |
/**
 * Helper to manage a packed field within a uint64_t.
 *
 * Bitfield<Bits, Offset> describes a run of 'Bits' contiguous bits whose least-significant bit is
 * at bit position 'Offset' of the 64-bit word. Field values are exchanged as uint32_t, so a field
 * can be at most 32 bits wide and has to lie completely inside the 64-bit word; both constraints
 * are enforced at compile time.
 */
template <uint64_t Bits, uint64_t Offset>
struct Bitfield {
    // A 64-bit wide field would make the mask computation below shift by the full word width
    // (undefined), and anything wider than 32 bits could not round-trip through get()/set().
    static_assert(Bits >= 1 && Bits <= 32, "Bitfield values are exchanged as uint32_t");
    static_assert(Offset < 64 && Bits + Offset <= 64, "Bitfield must fit inside a uint64_t");

    static constexpr uint64_t kMask = (static_cast<uint64_t>(1) << Bits) - 1;
    static constexpr uint64_t kOffset = Offset;
    static constexpr uint64_t kBits = Bits;

    // Extracts this field's value from the packed word 'v'.
    static constexpr uint32_t get(uint64_t v) {
        return static_cast<uint32_t>((v >> kOffset) & kMask);
    }

    // Returns 'v' moved into this field's position, ready to be OR'ed with other fields. Bits of
    // 'v' that do not fit into the field are discarded.
    static constexpr uint64_t set(uint32_t v) {
        return (static_cast<uint64_t>(v) & kMask) << kOffset;
    }
};
| |
| /** |
| * Each Draw in a DrawList might be processed by multiple RenderSteps (determined by the Draw's |
| * Renderer), which can be sorted independently. Each (step, draw) pair produces its own SortKey. |
| * |
| * The goal of sorting draws for the DrawPass is to minimize pipeline transitions and dynamic binds |
| * within a pipeline, while still respecting the overall painter's order. This decreases the number |
| * of low-level draw commands in a command buffer and increases the size of those, allowing the GPU |
| * to operate more efficiently and have fewer bubbles within its own instruction stream. |
| * |
| * The Draw's CompresssedPaintersOrder and DisjointStencilINdex represent the most significant bits |
| * of the key, and are shared by all SortKeys produced by the same draw. Next, the pipeline |
| * description is encoded in two steps: |
| * 1. The index of the RenderStep packed in the high bits to ensure each step for a draw is |
| * ordered correctly. |
| * 2. An index into a cache of pipeline descriptions is used to encode the identity of the |
| * pipeline (SortKeys that differ in the bits from #1 necessarily would have different |
| * descriptions, but then the specific ordering of the RenderSteps isn't enforced). |
| * Last, the SortKey encodes an index into the set of uniform bindings accumulated for a DrawPass. |
| * This allows the SortKey to cluster draw steps that have both a compatible pipeline and do not |
| * require rebinding uniform data or other state (e.g. scissor). Since the uniform data index and |
| * the pipeline description index are packed into indices and not actual pointers, a given SortKey |
| * is only valid for the a specific DrawList->DrawPass conversion. |
| */ |
| class DrawPass::SortKey { |
| public: |
| SortKey(const DrawList::Draw* draw, |
| int renderStep, |
| uint32_t pipelineIndex, |
| uint32_t geomUniformIndex, |
| uint32_t shadingUniformIndex) |
| : fPipelineKey(ColorDepthOrderField::set(draw->fOrder.paintOrder().bits()) | |
| StencilIndexField::set(draw->fOrder.stencilIndex().bits()) | |
| RenderStepField::set(static_cast<uint32_t>(renderStep)) | |
| PipelineField::set(pipelineIndex)) |
| , fUniformKey(GeometryUniformField::set(geomUniformIndex) | |
| ShadingUniformField::set(shadingUniformIndex)) |
| , fDraw(draw) { |
| SkASSERT(renderStep <= draw->fRenderer.numRenderSteps()); |
| } |
| |
| bool operator<(const SortKey& k) const { |
| return fPipelineKey < k.fPipelineKey || |
| (fPipelineKey == k.fPipelineKey && fUniformKey < k.fUniformKey); |
| } |
| |
| const RenderStep& renderStep() const { |
| return *fDraw->fRenderer.steps()[RenderStepField::get(fPipelineKey)]; |
| } |
| |
| const DrawList::Draw* draw() const { return fDraw; } |
| |
| uint32_t pipeline() const { return PipelineField::get(fPipelineKey); } |
| uint32_t geometryUniforms() const { return GeometryUniformField::get(fUniformKey); } |
| uint32_t shadingUniforms() const { return ShadingUniformField::get(fUniformKey); } |
| |
| private: |
| // Fields are ordered from most-significant to least when sorting by 128-bit value. |
| // NOTE: We don't use bit fields because field ordering is implementation defined and we need |
| // to sort consistently. |
| using ColorDepthOrderField = Bitfield<16, 48>; // sizeof(CompressedPaintersOrder) |
| using StencilIndexField = Bitfield<16, 32>; // sizeof(DisjointStencilIndex) |
| using RenderStepField = Bitfield<2, 30>; // bits >= log2(Renderer::kMaxRenderSteps) |
| using PipelineField = Bitfield<30, 0>; // bits >= log2(max steps*DrawList::kMaxDraws) |
| uint64_t fPipelineKey; |
| |
| using GeometryUniformField = Bitfield<32, 32>; // bits >= log2(max steps * max draw count) |
| using ShadingUniformField = Bitfield<32, 0>; // "" |
| uint64_t fUniformKey; |
| |
| // Backpointer to the draw that produced the sort key |
| const DrawList::Draw* fDraw; |
| |
| static_assert(ColorDepthOrderField::kBits >= sizeof(CompressedPaintersOrder)); |
| static_assert(StencilIndexField::kBits >= sizeof(DisjointStencilIndex)); |
| static_assert(RenderStepField::kBits >= SkNextLog2_portable(Renderer::kMaxRenderSteps)); |
| static_assert(PipelineField::kBits >= |
| SkNextLog2_portable(Renderer::kMaxRenderSteps * DrawList::kMaxDraws)); |
| static_assert(GeometryUniformField::kBits >= PipelineField::kBits); |
| static_assert(ShadingUniformField::kBits >= PipelineField::kBits); |
| }; |
| |
| class DrawPass::Drawer final : public DrawDispatcher { |
| public: |
| Drawer(DrawPass* drawPass) : fPass(drawPass) {} |
| ~Drawer() override = default; |
| |
| void bindDrawBuffers(BindBufferInfo vertexAttribs, |
| BindBufferInfo instanceAttribs, |
| BindBufferInfo indices) override { |
| fPass->fCommands.emplace_back(BindDrawBuffers{vertexAttribs, instanceAttribs, indices}); |
| } |
| |
| void draw(PrimitiveType type, unsigned int baseVertex, unsigned int vertexCount) override { |
| fPass->fCommands.emplace_back(Draw{type, baseVertex, vertexCount}); |
| } |
| |
| void drawIndexed(PrimitiveType type, unsigned int baseIndex, |
| unsigned int indexCount, unsigned int baseVertex) override { |
| fPass->fCommands.emplace_back(DrawIndexed{type, baseIndex, indexCount, baseVertex}); |
| } |
| |
| void drawInstanced(PrimitiveType type, |
| unsigned int baseVertex, unsigned int vertexCount, |
| unsigned int baseInstance, unsigned int instanceCount) override { |
| fPass->fCommands.emplace_back(DrawInstanced{type, baseVertex, vertexCount, |
| baseInstance, instanceCount}); |
| } |
| |
| void drawIndexedInstanced(PrimitiveType type, |
| unsigned int baseIndex, unsigned int indexCount, |
| unsigned int baseVertex, unsigned int baseInstance, |
| unsigned int instanceCount) override { |
| fPass->fCommands.emplace_back(DrawIndexedInstanced{type, baseIndex, indexCount, baseVertex, |
| baseInstance, instanceCount}); |
| } |
| |
| private: |
| DrawPass* fPass; |
| }; |
| |
| /////////////////////////////////////////////////////////////////////////////////////////////////// |
| |
| namespace { |
| |
| class UniformBindingCache { |
| public: |
| UniformBindingCache(DrawBufferManager* bufferMgr, UniformCache* cache) |
| : fBufferMgr(bufferMgr), fCache(cache) {} |
| |
| uint32_t addUniforms(sk_sp<UniformData> data) { |
| if (!data) { |
| return UniformCache::kInvalidUniformID; |
| } |
| |
| uint32_t index = fCache->insert(data); |
| if (fBindings.find(index) == fBindings.end()) { |
| // First time encountering this data, so upload to the GPU |
| auto [writer, bufferInfo] = fBufferMgr->getUniformWriter(data->dataSize()); |
| writer.write(data->data(), data->dataSize()); |
| fBindings.insert({index, bufferInfo}); |
| } |
| |
| return index; |
| } |
| |
| BindBufferInfo getBinding(uint32_t uniformIndex) { |
| auto lookup = fBindings.find(uniformIndex); |
| SkASSERT(lookup != fBindings.end()); |
| return lookup->second; |
| } |
| |
| private: |
| DrawBufferManager* fBufferMgr; |
| UniformCache* fCache; |
| |
| std::unordered_map<uint32_t, BindBufferInfo> fBindings; |
| }; |
| |
| // std::unordered_map implementation for GraphicsPipelineDesc* that de-reference the pointers. |
| struct Hash { |
| size_t operator()(const skgpu::GraphicsPipelineDesc* desc) const noexcept { |
| return skgpu::GraphicsPipelineDesc::Hash()(*desc); |
| } |
| }; |
| |
| struct Eq { |
| bool operator()(const skgpu::GraphicsPipelineDesc* a, |
| const skgpu::GraphicsPipelineDesc* b) const noexcept { |
| return *a == *b; |
| } |
| }; |
| |
| } // anonymous namespace |
| |
| DrawPass::DrawPass(sk_sp<TextureProxy> target, |
| std::pair<LoadOp, StoreOp> ops, |
| std::array<float, 4> clearColor, |
| int renderStepCount) |
| : fCommands(std::max(1, renderStepCount / 4), SkBlockAllocator::GrowthPolicy::kFibonacci) |
| , fTarget(std::move(target)) |
| , fBounds(SkIRect::MakeEmpty()) |
| , fOps(ops) |
| , fClearColor(clearColor) { |
| // TODO: Tune this estimate and the above "itemPerBlock" value for the command buffer sequence |
| // After merging, etc. one pipeline per recorded draw+step combo is likely unnecessary. |
| fPipelineDescs.reserve(renderStepCount); |
| fCommands.reserve(renderStepCount); |
| } |
| |
| DrawPass::~DrawPass() = default; |
| |
| std::unique_ptr<DrawPass> DrawPass::Make(Recorder* recorder, |
| std::unique_ptr<DrawList> draws, |
| sk_sp<TextureProxy> target, |
| std::pair<LoadOp, StoreOp> ops, |
| std::array<float, 4> clearColor, |
| const BoundsManager* occlusionCuller) { |
| // NOTE: This assert is here to ensure SortKey is as tightly packed as possible. Any change to |
| // its size should be done with care and good reason. The performance of sorting the keys is |
| // heavily tied to the total size. |
| // |
| // At 24 bytes (current), sorting is about 30% slower than if SortKey could be packed into just |
| // 16 bytes. There are several ways this could be done if necessary: |
| // - Restricting the max draw count to 16k (14-bits) and only using a single index to refer to |
| // the uniform data => 8 bytes of key, 8 bytes of pointer. |
| // - Restrict the max draw count to 32k (15-bits), use a single uniform index, and steal the |
| // 4 low bits from the Draw* pointer since it's 16 byte aligned. |
| // - Compact the Draw* to an index into the original collection, although that has extra |
| // indirection and does not work as well with SkTBlockList. |
| // In pseudo tests, manipulating the pointer or having to mask out indices was about 15% slower |
| // than an 8 byte key and unmodified pointer. |
| static_assert(sizeof(DrawPass::SortKey) == 16 + sizeof(void*)); |
| |
| // The DrawList is converted directly into the DrawPass' data structures, but once the DrawPass |
| // is returned from Make(), it is considered immutable. |
| std::unique_ptr<DrawPass> drawPass(new DrawPass(std::move(target), ops, clearColor, |
| draws->renderStepCount())); |
| |
| Rect passBounds = Rect::InfiniteInverted(); |
| |
| DrawBufferManager* bufferMgr = recorder->drawBufferManager(); |
| UniformCache geometryUniforms; |
| UniformBindingCache geometryUniformBindings(bufferMgr, &geometryUniforms); |
| UniformBindingCache shadingUniformBindings(bufferMgr, recorder->uniformCache()); |
| |
| std::unordered_map<const GraphicsPipelineDesc*, uint32_t, Hash, Eq> pipelineDescToIndex; |
| |
| std::vector<SortKey> keys; |
| keys.reserve(draws->renderStepCount()); // will not exceed but may use less with occluded draws |
| |
| for (const DrawList::Draw& draw : draws->fDraws.items()) { |
| if (occlusionCuller && occlusionCuller->isOccluded(draw.fClip.drawBounds(), |
| draw.fOrder.depth())) { |
| continue; |
| } |
| |
| // If we have two different descriptors, such that the uniforms from the PaintParams can be |
| // bound independently of those used by the rest of the RenderStep, then we can upload now |
| // and remember the location for re-use on any RenderStep that does shading. |
| SkUniquePaintParamsID shaderID; |
| sk_sp<UniformData> shadingUniforms = nullptr; |
| uint32_t shadingIndex = UniformCache::kInvalidUniformID; |
| if (draw.fPaintParams.has_value()) { |
| std::tie(shaderID, shadingUniforms) = ExtractPaintData(recorder->context(), |
| draw.fPaintParams.value()); |
| shadingIndex = shadingUniformBindings.addUniforms(shadingUniforms); |
| } // else depth-only |
| |
| for (int stepIndex = 0; stepIndex < draw.fRenderer.numRenderSteps(); ++stepIndex) { |
| const RenderStep* const step = draw.fRenderer.steps()[stepIndex]; |
| const bool performsShading = draw.fPaintParams.has_value() && step->performsShading(); |
| |
| SkUniquePaintParamsID stepShaderID; |
| uint32_t stepShadingIndex = UniformCache::kInvalidUniformID; |
| if (performsShading) { |
| stepShaderID = shaderID; |
| stepShadingIndex = shadingIndex; |
| } // else depth-only draw or stencil-only step of renderer so no shading is needed |
| |
| uint32_t geometryIndex = UniformCache::kInvalidUniformID; |
| if (step->numUniforms() > 0) { |
| // TODO: Get layout from the GPU |
| auto uniforms = step->writeUniforms(Layout::kMetal, |
| draw.fClip.scissor(), |
| draw.fTransform, |
| draw.fShape); |
| geometryIndex = geometryUniformBindings.addUniforms(std::move(uniforms)); |
| } |
| |
| GraphicsPipelineDesc desc; |
| desc.setProgram(step, stepShaderID); |
| uint32_t pipelineIndex = 0; |
| auto pipelineLookup = pipelineDescToIndex.find(&desc); |
| if (pipelineLookup == pipelineDescToIndex.end()) { |
| // Assign new index to first appearance of this pipeline description |
| pipelineIndex = SkTo<uint32_t>(drawPass->fPipelineDescs.count()); |
| const GraphicsPipelineDesc& finalDesc = drawPass->fPipelineDescs.push_back(desc); |
| pipelineDescToIndex.insert({&finalDesc, pipelineIndex}); |
| } else { |
| // Reuse the existing pipeline description for better batching after sorting |
| pipelineIndex = pipelineLookup->second; |
| } |
| |
| keys.push_back({&draw, stepIndex, pipelineIndex, geometryIndex, stepShadingIndex}); |
| } |
| |
| passBounds.join(draw.fClip.drawBounds()); |
| drawPass->fDepthStencilFlags |= draw.fRenderer.depthStencilFlags(); |
| drawPass->fRequiresMSAA |= draw.fRenderer.requiresMSAA(); |
| } |
| |
| // TODO: Explore sorting algorithms; in all likelihood this will be mostly sorted already, so |
| // algorithms that approach O(n) in that condition may be favorable. Alternatively, could |
| // explore radix sort that is always O(n). Brief testing suggested std::sort was faster than |
| // std::stable_sort and SkTQSort on my [ml]'s Windows desktop. Also worth considering in-place |
| // vs. algorithms that require an extra O(n) storage. |
| // TODO: It's not strictly necessary, but would a stable sort be useful or just end up hiding |
| // bugs in the DrawOrder determination code? |
| std::sort(keys.begin(), keys.end()); |
| |
| // Used to record vertex/instance data, buffer binds, and draw calls |
| Drawer drawer(drawPass.get()); |
| DrawWriter drawWriter(&drawer, bufferMgr); |
| |
| // Used to track when a new pipeline or dynamic state needs recording between draw steps. |
| // Setting to # render steps ensures the very first time through the loop will bind a pipeline. |
| uint32_t lastPipeline = draws->renderStepCount(); |
| uint32_t lastShadingUniforms = UniformCache::kInvalidUniformID; |
| uint32_t lastGeometryUniforms = UniformCache::kInvalidUniformID; |
| SkIRect lastScissor = SkIRect::MakeSize(drawPass->fTarget->dimensions()); |
| |
| for (const SortKey& key : keys) { |
| const DrawList::Draw& draw = *key.draw(); |
| const RenderStep& renderStep = key.renderStep(); |
| |
| const bool pipelineChange = key.pipeline() != lastPipeline; |
| const bool stateChange = key.geometryUniforms() != lastGeometryUniforms || |
| key.shadingUniforms() != lastShadingUniforms || |
| draw.fClip.scissor() != lastScissor; |
| |
| // Update DrawWriter *before* we actually change any state so that accumulated draws from |
| // the previous state use the proper state. |
| if (pipelineChange) { |
| drawWriter.newPipelineState(renderStep.primitiveType(), |
| renderStep.vertexStride(), |
| renderStep.instanceStride()); |
| } else if (stateChange) { |
| drawWriter.newDynamicState(); |
| } |
| |
| // Make state changes before accumulating new draw data |
| if (pipelineChange) { |
| drawPass->fCommands.emplace_back(BindGraphicsPipeline{key.pipeline()}); |
| lastPipeline = key.pipeline(); |
| lastShadingUniforms = UniformCache::kInvalidUniformID; |
| lastGeometryUniforms = UniformCache::kInvalidUniformID; |
| } |
| if (stateChange) { |
| if (key.geometryUniforms() != lastGeometryUniforms) { |
| if (key.geometryUniforms() != UniformCache::kInvalidUniformID) { |
| auto binding = geometryUniformBindings.getBinding(key.geometryUniforms()); |
| drawPass->fCommands.emplace_back( |
| BindUniformBuffer{binding, UniformSlot::kRenderStep}); |
| } |
| lastGeometryUniforms = key.geometryUniforms(); |
| } |
| if (key.shadingUniforms() != lastShadingUniforms) { |
| if (key.shadingUniforms() != UniformCache::kInvalidUniformID) { |
| auto binding = shadingUniformBindings.getBinding(key.shadingUniforms()); |
| drawPass->fCommands.emplace_back( |
| BindUniformBuffer{binding, UniformSlot::kPaint}); |
| } |
| lastShadingUniforms = key.shadingUniforms(); |
| } |
| if (draw.fClip.scissor() != lastScissor) { |
| drawPass->fCommands.emplace_back(SetScissor{draw.fClip.scissor()}); |
| lastScissor = draw.fClip.scissor(); |
| } |
| } |
| |
| renderStep.writeVertices(&drawWriter, draw.fClip.scissor(), draw.fTransform, draw.fShape); |
| } |
| // Finish recording draw calls for any collected data at the end of the loop |
| drawWriter.flush(); |
| |
| passBounds.roundOut(); |
| drawPass->fBounds = SkIRect::MakeLTRB((int) passBounds.left(), (int) passBounds.top(), |
| (int) passBounds.right(), (int) passBounds.bot()); |
| return drawPass; |
| } |
| |
| void DrawPass::addCommands(Context* context, CommandBuffer* buffer, |
| const RenderPassDesc& renderPassDesc) const { |
| auto resourceProvider = context->priv().resourceProvider(); |
| |
| // TODO: Validate RenderPass state against DrawPass's target and requirements? |
| // Generate actual GraphicsPipeline objects combining the target-level properties and each of |
| // the GraphicsPipelineDesc's referenced in this DrawPass. |
| |
| // Use a vector instead of SkTBlockList for the full pipelines so that random access is fast. |
| std::vector<sk_sp<GraphicsPipeline>> fullPipelines; |
| fullPipelines.reserve(fPipelineDescs.count()); |
| for (const GraphicsPipelineDesc& pipelineDesc : fPipelineDescs.items()) { |
| fullPipelines.push_back(resourceProvider->findOrCreateGraphicsPipeline( |
| context, pipelineDesc, renderPassDesc)); |
| } |
| |
| // Set viewport to the entire texture for now (eventually, we may have logically smaller bounds |
| // within an approx-sized texture). It is assumed that this also configures the sk_rtAdjust |
| // intrinsic for programs (however the backend chooses to do so). |
| buffer->setViewport(0, 0, fTarget->dimensions().width(), fTarget->dimensions().height()); |
| |
| for (const Command& c : fCommands.items()) { |
| switch(c.fType) { |
| case CommandType::kBindGraphicsPipeline: { |
| auto& d = c.fBindGraphicsPipeline; |
| buffer->bindGraphicsPipeline(fullPipelines[d.fPipelineIndex]); |
| break; } |
| case CommandType::kBindUniformBuffer: { |
| auto& d = c.fBindUniformBuffer; |
| buffer->bindUniformBuffer(d.fSlot, sk_ref_sp(d.fInfo.fBuffer), d.fInfo.fOffset); |
| break; } |
| case CommandType::kBindDrawBuffers: { |
| auto& d = c.fBindDrawBuffers; |
| buffer->bindDrawBuffers(d.fVertices, d.fInstances, d.fIndices); |
| break; } |
| case CommandType::kDraw: { |
| auto& d = c.fDraw; |
| buffer->draw(d.fType, d.fBaseVertex, d.fVertexCount); |
| break; } |
| case CommandType::kDrawIndexed: { |
| auto& d = c.fDrawIndexed; |
| buffer->drawIndexed(d.fType, d.fBaseIndex, d.fIndexCount, d.fBaseVertex); |
| break; } |
| case CommandType::kDrawInstanced: { |
| auto& d = c.fDrawInstanced; |
| buffer->drawInstanced(d.fType, d.fBaseVertex, d.fVertexCount, |
| d.fBaseInstance, d.fInstanceCount); |
| break; } |
| case CommandType::kDrawIndexedInstanced: { |
| auto& d = c.fDrawIndexedInstanced; |
| buffer->drawIndexedInstanced(d.fType, d.fBaseIndex, d.fIndexCount, d.fBaseVertex, |
| d.fBaseInstance, d.fInstanceCount); |
| break; } |
| case CommandType::kSetScissor: { |
| auto& d = c.fSetScissor; |
| buffer->setScissor(d.fScissor.fLeft, d.fScissor.fTop, |
| d.fScissor.fRight, d.fScissor.fBottom); |
| break; |
| } |
| } |
| } |
| } |
| |
| } // namespace skgpu |