Merge pull request #727 from billhollings/master
Improve performance of vkCmdFillBuffer().
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index c9338d4..1d4a4b0 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -29,9 +29,10 @@
- Fix crash when `VkDeviceCreateInfo` specifies queue families out of numerical order.
- Fix crash in `vkDestroyPipelineLayout()`.
- Fix crash when signalling swapchain semaphore using `MTLEvent`.
-- `vkCmdBlitImage()` support format component swizzling.
-- `vkCmdClearImage()` set error if attempt made to clear 1D image, and fix validation of depth attachment formats.
-- `vkCreateRenderPass()` return `VK_ERROR_FORMAT_NOT_SUPPORTED` if format not supported.
+- `vkCmdBlitImage():` Support format component swizzling.
+- `vkCmdClearImage():` Set error if attempt made to clear 1D image, and fix validation of depth attachment formats.
+- `vkCreateRenderPass():` Return `VK_ERROR_FORMAT_NOT_SUPPORTED` if format not supported.
+- `vkCmdFillBuffer():` Improve performance 150x by using parallelism more effectively.
- Remove error logging on `VK_TIMEOUT` of `VkSemaphore` and `VkFence`.
- Consolidate the various linkable objects into a `MVKLinkableMixin` template base class.
- Use `MVKVector` whenever possible in MoltenVK, especially within render loop.
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.h b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.h
index 49812c8..eaea4c9 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.h
@@ -304,7 +304,7 @@
protected:
MVKBuffer* _dstBuffer;
VkDeviceSize _dstOffset;
- VkDeviceSize _size;
+ uint32_t _wordCount;
uint32_t _dataValue;
};
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index a18ecd1..d0ac059 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -1151,43 +1151,60 @@
#pragma mark -
#pragma mark MVKCmdFillBuffer
-// Matches shader struct
-typedef struct {
- uint32_t size;
- uint32_t data;
-} MVKCmdFillBufferInfo;
-
void MVKCmdFillBuffer::setContent(VkBuffer dstBuffer,
VkDeviceSize dstOffset,
VkDeviceSize size,
uint32_t data) {
_dstBuffer = (MVKBuffer*)dstBuffer;
_dstOffset = dstOffset;
- _size = size;
_dataValue = data;
+
+ // Round up in case of VK_WHOLE_SIZE on a buffer size which is not aligned to 4 bytes.
+ VkDeviceSize byteCnt = (size == VK_WHOLE_SIZE) ? (_dstBuffer->getByteCount() - _dstOffset) : size;
+ VkDeviceSize wdCnt = (byteCnt + 3) >> 2;
+ if (mvkFits<uint32_t>(wdCnt)) {
+ _wordCount = (uint32_t)wdCnt;
+ } else {
+ setConfigurationResult(reportError(VK_ERROR_FEATURE_NOT_PRESENT, "vkCmdFillBuffer(): Buffer fill size must fit into a 32-bit unsigned integer. Fill size %llu is too large.", wdCnt));
+ _wordCount = std::numeric_limits<uint32_t>::max();
+ }
}
void MVKCmdFillBuffer::encode(MVKCommandEncoder* cmdEncoder) {
- id<MTLBuffer> dstMTLBuff = _dstBuffer->getMTLBuffer();
- VkDeviceSize dstMTLBuffOffset = _dstBuffer->getMTLBufferOffset();
- VkDeviceSize byteCnt = (_size == VK_WHOLE_SIZE) ? (_dstBuffer->getByteCount() - _dstOffset) : _size;
+ if (_wordCount == 0) { return; }
- // Round up in case of VK_WHOLE_SIZE on a buffer size which is not aligned to 4 bytes.
- VkDeviceSize wordCnt = (byteCnt + 3) >> 2;
+ id<MTLBuffer> dstMTLBuff = _dstBuffer->getMTLBuffer();
+ NSUInteger dstMTLBuffOffset = _dstBuffer->getMTLBufferOffset() + _dstOffset;
- MVKAssert(mvkFits<uint32_t>(wordCnt),
- "Buffer fill size must fit into a 32-bit unsigned integer.");
+ // Determine the number of full threadgroups we can dispatch to cover the buffer content efficiently.
+ id<MTLComputePipelineState> cps = getCommandEncodingPool()->getCmdFillBufferMTLComputePipelineState();
+ NSUInteger tgWidth = cps.maxTotalThreadsPerThreadgroup;
+ NSUInteger tgCount = _wordCount / tgWidth;
- MVKCmdFillBufferInfo fillInfo;
- fillInfo.size = (uint32_t)wordCnt;
- fillInfo.data = _dataValue;
-
- id<MTLComputeCommandEncoder> mtlComputeEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyBuffer);
+ id<MTLComputeCommandEncoder> mtlComputeEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseFillBuffer);
[mtlComputeEnc pushDebugGroup: @"vkCmdFillBuffer"];
- [mtlComputeEnc setComputePipelineState: getCommandEncodingPool()->getCmdFillBufferMTLComputePipelineState()];
- [mtlComputeEnc setBuffer: dstMTLBuff offset: dstMTLBuffOffset+_dstOffset atIndex: 0];
- [mtlComputeEnc setBytes: &fillInfo length: sizeof(fillInfo) atIndex: 1];
- [mtlComputeEnc dispatchThreadgroups: MTLSizeMake(1, 1, 1) threadsPerThreadgroup: MTLSizeMake(1, 1, 1)];
+ [mtlComputeEnc setComputePipelineState: cps];
+ [mtlComputeEnc setBytes: &_dataValue length: sizeof(_dataValue) atIndex: 1];
+ [mtlComputeEnc setBuffer: dstMTLBuff offset: dstMTLBuffOffset atIndex: 0];
+
+ // Run as many full threadgroups as will fit into the buffer content.
+ if (tgCount > 0) {
+ [mtlComputeEnc dispatchThreadgroups: MTLSizeMake(tgCount, 1, 1)
+ threadsPerThreadgroup: MTLSizeMake(tgWidth, 1, 1)];
+ }
+
+ // If there is left-over buffer content after running full threadgroups, or if the buffer content
+ // fits within a single threadgroup, run a single partial threadgroup of the appropriate size.
+ uint32_t remainderWordCount = _wordCount % tgWidth;
+ if (remainderWordCount > 0) {
+ if (tgCount > 0) { // If we've already written full threadgroups, skip ahead to unwritten content
+ dstMTLBuffOffset += tgCount * tgWidth * sizeof(_dataValue);
+ [mtlComputeEnc setBufferOffset: dstMTLBuffOffset atIndex: 0];
+ }
+ [mtlComputeEnc dispatchThreadgroups: MTLSizeMake(1, 1, 1)
+ threadsPerThreadgroup: MTLSizeMake(remainderWordCount, 1, 1)];
+ }
+
[mtlComputeEnc popDebugGroup];
}
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandPipelineStateFactoryShaderSource.h b/MoltenVK/MoltenVK/Commands/MVKCommandPipelineStateFactoryShaderSource.h
index 6726212..4e5065d 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandPipelineStateFactoryShaderSource.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandPipelineStateFactoryShaderSource.h
@@ -93,16 +93,10 @@
} \n\
}; \n\
\n\
-typedef struct { \n\
- uint32_t size; \n\
- uint32_t data; \n\
-} FillInfo; \n\
- \n\
kernel void cmdFillBuffer(device uint32_t* dst [[ buffer(0) ]], \n\
- constant FillInfo& info [[ buffer(1) ]]) { \n\
- for (uint32_t i = 0; i < info.size; i++) { \n\
- dst[i] = info.data; \n\
- } \n\
+ constant uint32_t& fillValue [[ buffer(1) ]], \n\
+ uint pos [[thread_position_in_grid]]) { \n\
+ dst[pos] = fillValue; \n\
}; \n\
\n\
typedef struct { \n\