[Graphite] Bin Uniform Buffer binding sizes into powers of 2. The goal here is to reduce the number of uniform descriptor sets that we need to create which are keyed on the size of the binding. Change-Id: I04768e97a446571f6d48b31ce8e71ca1b802bbd1 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/1203836 Reviewed-by: Michael Ludwig <michaelludwig@google.com> Commit-Queue: Greg Daniel <egdaniel@google.com>
diff --git a/src/gpu/graphite/BufferManager.cpp b/src/gpu/graphite/BufferManager.cpp index c041879..bd726251 100644 --- a/src/gpu/graphite/BufferManager.cpp +++ b/src/gpu/graphite/BufferManager.cpp
@@ -45,21 +45,26 @@ // are addressed, we can tighten this and decide on the transfer buffer sizing as well. [[maybe_unused]] static constexpr uint32_t kMaxStaticDataSize = 6 << 10; -uint32_t validate_count_and_stride(size_t count, size_t stride, uint32_t alignment) { +uint32_t validate_count_and_stride(size_t count, size_t stride, size_t headroom, + uint32_t alignment) { // size_t may just be uint32_t, so this ensures we have enough bits to // compute the required byte product. - uint64_t count64 = SkTo<uint64_t>(count); - uint64_t stride64 = SkTo<uint64_t>(stride); - uint64_t bytes64 = count64*stride64; + const uint64_t count64 = SkTo<uint64_t>(count); + const uint64_t stride64 = SkTo<uint64_t>(stride); + const uint64_t bytes64 = count64*stride64; + const uint64_t headroom64 = SkTo<uint64_t>(headroom); + const uint64_t bytesWithHeadroom64 = std::max(headroom64, bytes64); if (count64 > std::numeric_limits<uint32_t>::max() || stride64 > std::numeric_limits<uint32_t>::max() || - bytes64 > std::numeric_limits<uint32_t>::max() - (alignment + 1)) { + bytes64 > std::numeric_limits<uint32_t>::max() || + headroom64 > std::numeric_limits<uint32_t>::max() || + bytesWithHeadroom64 > std::numeric_limits<uint32_t>::max() - (alignment + 1)) { // Return 0 to skip further allocation attempts. return 0; } // Since count64 and stride64 fit into 32-bits, their product won't overflow a 64-bit multiply, // and we've confirmed product fits into 32-bits with head room to be aligned w/o overflow. - return SkTo<uint32_t>(bytes64); + return SkTo<uint32_t>(bytesWithHeadroom64); } // Calculates the LCM of `alignMaybePow2` and `alignProbNonPow2`. Neither value needs to be a @@ -209,7 +214,8 @@ return *this; } -void BufferSubAllocator::prepForStride(size_t stride, size_t align, size_t minCount) { +void BufferSubAllocator::prepForStride(size_t stride, size_t align, size_t minCount, + size_t headroom) { SkASSERT(stride > 0 && align > 0); // Expect valid inputs if (fBuffer) { if (fStride == stride && (align == 1 || align == stride)) { @@ -231,12 +237,15 @@ align32 = lcm_alignment(minAlignment, align32); } + const uint32_t stride32 = SkTo<uint32_t>(stride); + const uint32_t headroom32 = SkTo<uint32_t>(headroom); + const uint32_t reserveForHeadroom = headroom32 > stride32 ? headroom32 - stride32 : 0; // Ensures we won't overflow fOffset past buffer size once we align it - if (this->remainingBytes() >= align32 - 1) { + if (this->remainingBytes() >= align32 - 1 + reserveForHeadroom) { const uint32_t offset = SkAlignNonPow2(fOffset, align32); - SkASSERT(offset <= fBuffer->size()); - fStride = SkTo<uint32_t>(stride); - fRemaining = (SkTo<uint32_t>(fBuffer->size()) - offset) / fStride; + SkASSERT(offset + reserveForHeadroom <= fBuffer->size()); + fStride = stride32; + fRemaining = (SkTo<uint32_t>(fBuffer->size()) - offset - reserveForHeadroom) / fStride; if (fRemaining > 0 && fRemaining >= minCount) { // Successful prep, so preserve the aligned offset fOffset = offset; @@ -449,12 +458,14 @@ size_t count, size_t stride, size_t xtraAlignment, + size_t headroom, ClearBuffer cleared, Shareable shareable) { BufferState& state = fCurrentBuffers[stateIndex]; // The size for a buffer is aligned to the minimum block size for better resource reuse, which // is more conservative than fMinAlignment. - uint32_t requiredBytes32 = validate_count_and_stride(count, stride, state.fMinBlockSize); + uint32_t requiredBytes32 = validate_count_and_stride(count, stride, headroom, + state.fMinBlockSize); if (fMappingFailed || !requiredBytes32) { return {}; } @@ -469,7 +480,7 @@ // managed by the caller, so always create a new BufferSubAllocator. if (shareable == Shareable::kNo) { state.fAvailableBuffer.resetForNewBinding(); // ensure we include min binding alignment - state.fAvailableBuffer.prepForStride(stride, xtraAlignment, count); + state.fAvailableBuffer.prepForStride(stride, xtraAlignment, count, headroom); if (state.fAvailableBuffer.availableWithStride() >= count) { SkASSERT(state.fAvailableBuffer.fBuffer); SkASSERT(state.fAvailableBuffer.fBuffer->shareable() == shareable); @@ -587,7 +598,8 @@ SkASSERT(target); *target = {nullptr, 0}; - uint32_t size32 = validate_count_and_stride(requiredBytes, /*stride=*/1, align32); + uint32_t size32 = validate_count_and_stride(requiredBytes, /*stride=*/1, /*headroom=*/0, + align32); if (!size32 || fMappingFailed) { return nullptr; }
diff --git a/src/gpu/graphite/BufferManager.h b/src/gpu/graphite/BufferManager.h index 96f36d3..192ceeb 100644 --- a/src/gpu/graphite/BufferManager.h +++ b/src/gpu/graphite/BufferManager.h
@@ -115,10 +115,26 @@ size_t stride, size_t align=1) { SkASSERT(fMappedPtr || !fBuffer); // Writing should have checked validity of allocator first - this->prepForStride(stride, align, count); + this->prepForStride(stride, align, count, /*headroom=*/0); return this->reserve(count, &BufferSubAllocator::getWriterAndBinding); } + /** + * Similar to getMappedSubrange above but count and align = 1. + * + * Additionally it adds another paramenter headroom. Headroom is used to make sure there are at + * least headroom bytes from the beginning of the returned subrange to the end of the buffer. + * This is useful for when you want the bind buffer size to be larger than the actual data size + * of stride. This call will still return a BindBufferInfo that has a size of stride, so the + * caller will need to manually adjust the binding size if they want a larger value. + */ + std::pair<BufferWriter, BindBufferInfo> getMappedSubrangeWithHeadroom(size_t stride, + size_t headroom) { + SkASSERT(fMappedPtr || !fBuffer); // Writing should have checked validity of allocator first + this->prepForStride(stride, /*align=*/1, /*minCount=*/1, headroom); + return this->reserve(/*count=*/1, &BufferSubAllocator::getWriterAndBinding); + } + // Sub-allocate a slice within the scratch buffer object. This variation should be used when the // returned range will be written to by the GPU as part of executing a command buffer. // @@ -126,7 +142,7 @@ // suballocation behaves identically to getMappedSubrange(). BindBufferInfo getSubrange(size_t count, size_t stride, size_t align=1) { SkASSERT(!fMappedPtr); // Should not be used when data is intended to be written by CPU - this->prepForStride(stride, align, count); + this->prepForStride(stride, align, count, /*headroom=*/0); return this->reserve(count, &BufferSubAllocator::binding); } @@ -164,7 +180,7 @@ // binding alignment (when fStride == 0), then fOffset is updated and fRemaining is set to the // number of stride units that fit in the rest of the buffer after fOffset. If not, fRemaining // is set to 0 and fOffset is unmodified. - void prepForStride(size_t stride, size_t align, size_t minCount); + void prepForStride(size_t stride, size_t align, size_t minCount, size_t headroom); template <typename T> T reserve(size_t count, T (BufferSubAllocator::*create)(uint32_t offset, uint32_t size) const) { @@ -275,8 +291,16 @@ MappedAllocationInfo getMappedIndexBuffer(size_t count) { return this->getMappedBuffer(kIndexBufferIndex, count, sizeof(uint16_t)); } - MappedAllocationInfo getMappedUniformBuffer(size_t count, size_t stride) { - return this->getMappedBuffer(kUniformBufferIndex, count, stride); + MappedAllocationInfo getMappedUniformBuffer(size_t stride, size_t headroom) { + BufferSubAllocator buffer = this->getBuffer(kUniformBufferIndex, + /*count=*/1, + stride, + /*xtraAlignment=*/1, + headroom, + ClearBuffer::kNo, + Shareable::kNo); + auto [writer, binding] = buffer.getMappedSubrangeWithHeadroom(stride, headroom); + return {std::move(writer), binding, std::move(buffer)}; } MappedAllocationInfo getMappedStorageBuffer(size_t count, size_t stride) { return this->getMappedBuffer(kStorageBufferIndex, count, stride); @@ -316,7 +340,7 @@ // This type of usage is currently limited to GPU-only storage buffers. BufferSubAllocator getScratchStorage(size_t requiredBytes) { return this->getBuffer(kGpuOnlyStorageBufferIndex, requiredBytes, - /*stride=*/1, /*xtraAlignment=*/1, + /*stride=*/1, /*xtraAlignment=*/1, /*headroom=*/0, ClearBuffer::kNo, Shareable::kScratch); } @@ -362,6 +386,7 @@ size_t count, size_t stride, size_t xtraAlignment, + size_t headroom, ClearBuffer cleared, Shareable shareable); @@ -371,6 +396,7 @@ std::max(count, reservedCount), stride, xtraAlignment, + /*headroom=*/0, ClearBuffer::kNo, Shareable::kNo); auto [writer, binding] = buffer.getMappedSubrange(count, stride); @@ -380,7 +406,7 @@ // Helper method for the public GPU-only BufferBindInfo methods BindBufferInfo getBinding(int stateIndex, size_t requiredBytes, ClearBuffer cleared) { auto alloc = this->getBuffer(stateIndex, requiredBytes, - /*stride=*/1, /*xtraAlignment=*/1, + /*stride=*/1, /*xtraAlignment=*/1, /*headroom=*/0, cleared, Shareable::kNo); // `alloc` goes out of scope when this returns, but that is okay because it is only used // for GPU-only, non-shareable buffers. The returned BindBufferInfo will be unique still.
diff --git a/src/gpu/graphite/DrawListBase.h b/src/gpu/graphite/DrawListBase.h index a132e03..a747a57 100644 --- a/src/gpu/graphite/DrawListBase.h +++ b/src/gpu/graphite/DrawListBase.h
@@ -154,6 +154,8 @@ UniformDataCache::Entry& uniformData = uniformCache.lookup(index); const size_t uniformDataSize = uniformData.fCpuData.size(); + size_t uniformBindingSize = + fUseStorageBuffers ? uniformDataSize : SkNextSizePow2(uniformDataSize); // Upload the uniform data if we haven't already. // Alternatively, re-upload the uniform data to avoid a rebind if we're using storage @@ -163,13 +165,15 @@ uniformData.fBufferBinding.fBuffer != fLastBinding.fBuffer)) { BufferWriter writer; std::tie(writer, uniformData.fBufferBinding) = - fCurrentBuffer.getMappedSubrange(1, uniformDataSize); + fCurrentBuffer.getMappedSubrangeWithHeadroom(uniformDataSize, + uniformBindingSize); if (!writer) { // Allocate a new buffer std::tie(writer, uniformData.fBufferBinding, fCurrentBuffer) = fUseStorageBuffers ? bufferMgr->getMappedStorageBuffer(1, uniformDataSize) - : bufferMgr->getMappedUniformBuffer(1, uniformDataSize); + : bufferMgr->getMappedUniformBuffer(uniformDataSize, + uniformBindingSize); if (!writer) { return {}; // Allocation failed so early out } @@ -186,6 +190,7 @@ } else { // Every new set of uniform data has to be bound, this ensures its aligned // correctly + uniformData.fBufferBinding.fSize = SkTo<uint32_t>(uniformBindingSize); fCurrentBuffer.resetForNewBinding(); } }
diff --git a/src/gpu/graphite/compute/DispatchGroup.cpp b/src/gpu/graphite/compute/DispatchGroup.cpp index c5da0d9..844a6f0 100644 --- a/src/gpu/graphite/compute/DispatchGroup.cpp +++ b/src/gpu/graphite/compute/DispatchGroup.cpp
@@ -370,7 +370,7 @@ SkASSERT(!dataBlock.empty()); auto [writer, bufInfo, _] = - bufferMgr->getMappedUniformBuffer(dataBlock.size(), /*stride=*/1); + bufferMgr->getMappedUniformBuffer(dataBlock.size(), /*headroom=*/0); if (bufInfo) { writer.write(dataBlock.data(), dataBlock.size()); result = bufInfo;