Merge pull request #727 from billhollings/master

Improve performance of vkCmdFillBuffer().
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index c9338d4..1d4a4b0 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -29,9 +29,10 @@
 - Fix crash when `VkDeviceCreateInfo` specifies queue families out of numerical order.
 - Fix crash in `vkDestroyPipelineLayout()`.
 - Fix crash when signalling swapchain semaphore using `MTLEvent`.
-- `vkCmdBlitImage()` support format component swizzling.
-- `vkCmdClearImage()` set error if attempt made to clear 1D image, and fix validation of depth attachment formats.
-- `vkCreateRenderPass()` return `VK_ERROR_FORMAT_NOT_SUPPORTED` if format not supported.
+- `vkCmdBlitImage():` Support format component swizzling.
+- `vkCmdClearImage():` Set error if attempt made to clear 1D image, and fix validation of depth attachment formats.
+- `vkCreateRenderPass():` Return `VK_ERROR_FORMAT_NOT_SUPPORTED` if format not supported.
+- `vkCmdFillBuffer():` Improve performance 150x by using parallelism more effectively.
 - Remove error logging on `VK_TIMEOUT` of `VkSemaphore` and `VkFence`.
 - Consolidate the various linkable objects into a `MVKLinkableMixin` template base class.
 - Use `MVKVector` whenever possible in MoltenVK, especially within render loop.
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.h b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.h
index 49812c8..eaea4c9 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.h
@@ -304,7 +304,7 @@
 protected:
     MVKBuffer* _dstBuffer;
     VkDeviceSize _dstOffset;
-    VkDeviceSize _size;
+    uint32_t _wordCount;
     uint32_t _dataValue;
 };
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index a18ecd1..d0ac059 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -1151,43 +1151,60 @@
 #pragma mark -
 #pragma mark MVKCmdFillBuffer
 
-// Matches shader struct
-typedef struct {
-	uint32_t size;
-	uint32_t data;
-} MVKCmdFillBufferInfo;
-
 void MVKCmdFillBuffer::setContent(VkBuffer dstBuffer,
                                   VkDeviceSize dstOffset,
                                   VkDeviceSize size,
                                   uint32_t data) {
     _dstBuffer = (MVKBuffer*)dstBuffer;
     _dstOffset = dstOffset;
-    _size = size;
     _dataValue = data;
+
+	// Round up in case of VK_WHOLE_SIZE on a buffer size which is not aligned to 4 bytes.
+	VkDeviceSize byteCnt = (size == VK_WHOLE_SIZE) ? (_dstBuffer->getByteCount() - _dstOffset) : size;
+	VkDeviceSize wdCnt = (byteCnt + 3) >> 2;
+	if (mvkFits<uint32_t>(wdCnt)) {
+		_wordCount = (uint32_t)wdCnt;
+	} else {
+		setConfigurationResult(reportError(VK_ERROR_FEATURE_NOT_PRESENT, "vkCmdFillBuffer(): Buffer fill size must fit into a 32-bit unsigned integer. Fill size %llu is too large.", wdCnt));
+		_wordCount = std::numeric_limits<uint32_t>::max();
+	}
 }
 
 void MVKCmdFillBuffer::encode(MVKCommandEncoder* cmdEncoder) {
-    id<MTLBuffer> dstMTLBuff = _dstBuffer->getMTLBuffer();
-    VkDeviceSize dstMTLBuffOffset = _dstBuffer->getMTLBufferOffset();
-    VkDeviceSize byteCnt = (_size == VK_WHOLE_SIZE) ? (_dstBuffer->getByteCount() - _dstOffset) : _size;
+	if (_wordCount == 0) { return; }
 
-    // Round up in case of VK_WHOLE_SIZE on a buffer size which is not aligned to 4 bytes.
-    VkDeviceSize wordCnt = (byteCnt + 3) >> 2;
+	id<MTLBuffer> dstMTLBuff = _dstBuffer->getMTLBuffer();
+	NSUInteger dstMTLBuffOffset = _dstBuffer->getMTLBufferOffset() + _dstOffset;
 
-	MVKAssert(mvkFits<uint32_t>(wordCnt),
-			  "Buffer fill size must fit into a 32-bit unsigned integer.");
+	// Determine the number of full threadgroups we can dispatch to cover the buffer content efficiently.
+	id<MTLComputePipelineState> cps = getCommandEncodingPool()->getCmdFillBufferMTLComputePipelineState();
+	NSUInteger tgWidth = cps.maxTotalThreadsPerThreadgroup;
+	NSUInteger tgCount = _wordCount / tgWidth;
 
-	MVKCmdFillBufferInfo fillInfo;
-	fillInfo.size = (uint32_t)wordCnt;
-	fillInfo.data = _dataValue;
-
-	id<MTLComputeCommandEncoder> mtlComputeEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyBuffer);
+	id<MTLComputeCommandEncoder> mtlComputeEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseFillBuffer);
 	[mtlComputeEnc pushDebugGroup: @"vkCmdFillBuffer"];
-	[mtlComputeEnc setComputePipelineState: getCommandEncodingPool()->getCmdFillBufferMTLComputePipelineState()];
-	[mtlComputeEnc setBuffer: dstMTLBuff offset: dstMTLBuffOffset+_dstOffset atIndex: 0];
-	[mtlComputeEnc setBytes: &fillInfo length: sizeof(fillInfo) atIndex: 1];
-	[mtlComputeEnc dispatchThreadgroups: MTLSizeMake(1, 1, 1) threadsPerThreadgroup: MTLSizeMake(1, 1, 1)];
+	[mtlComputeEnc setComputePipelineState: cps];
+	[mtlComputeEnc setBytes: &_dataValue length: sizeof(_dataValue) atIndex: 1];
+	[mtlComputeEnc setBuffer: dstMTLBuff offset: dstMTLBuffOffset atIndex: 0];
+
+	// Run as many full threadgroups as will fit into the buffer content.
+	if (tgCount > 0) {
+		[mtlComputeEnc dispatchThreadgroups: MTLSizeMake(tgCount, 1, 1)
+					  threadsPerThreadgroup: MTLSizeMake(tgWidth, 1, 1)];
+	}
+
+	// If there is left-over buffer content after running full threadgroups, or if the buffer content
+	// fits within a single threadgroup, run a single partial threadgroup of the appropriate size.
+	uint32_t remainderWordCount = _wordCount % tgWidth;
+	if (remainderWordCount > 0) {
+		if (tgCount > 0) {		// If we've already written full threadgroups, skip ahead to unwritten content
+			dstMTLBuffOffset += tgCount * tgWidth * sizeof(_dataValue);
+			[mtlComputeEnc setBufferOffset: dstMTLBuffOffset atIndex: 0];
+		}
+		[mtlComputeEnc dispatchThreadgroups: MTLSizeMake(1, 1, 1)
+					  threadsPerThreadgroup: MTLSizeMake(remainderWordCount, 1, 1)];
+	}
+
 	[mtlComputeEnc popDebugGroup];
 }
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandPipelineStateFactoryShaderSource.h b/MoltenVK/MoltenVK/Commands/MVKCommandPipelineStateFactoryShaderSource.h
index 6726212..4e5065d 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandPipelineStateFactoryShaderSource.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandPipelineStateFactoryShaderSource.h
@@ -93,16 +93,10 @@
     }                                                                                                           \n\
 };                                                                                                              \n\
                                                                                                                 \n\
-typedef struct {                                                                                                \n\
-    uint32_t size;                                                                                              \n\
-    uint32_t data;                                                                                              \n\
-} FillInfo;                                                                                                     \n\
-                                                                                                                \n\
 kernel void cmdFillBuffer(device uint32_t* dst [[ buffer(0) ]],                                                 \n\
-                          constant FillInfo& info [[ buffer(1) ]]) {                                            \n\
-    for (uint32_t i = 0; i < info.size; i++) {                                                                  \n\
-        dst[i] = info.data;                                                                                     \n\
-    }                                                                                                           \n\
+                          constant uint32_t& fillValue [[ buffer(1) ]],                                         \n\
+                          uint pos [[thread_position_in_grid]]) {                                               \n\
+    dst[pos] = fillValue;                                                                                       \n\
 };                                                                                                              \n\
                                                                                                                 \n\
 typedef struct {                                                                                                \n\