vkCmdFillBuffer(): Avoid threadgroup Metal validation error on Intel devices.

On Intel devices, Metal reports different values for max threadgroup width
between the pipeline state and device, so conservatively use the minimum of
these two reported values.
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index d0ac059..1c8e882 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -1177,8 +1177,10 @@
 	NSUInteger dstMTLBuffOffset = _dstBuffer->getMTLBufferOffset() + _dstOffset;
 
 	// Determine the number of full threadgroups we can dispatch to cover the buffer content efficiently.
+	// Some GPUs report different values for max threadgroup width between the pipeline state and device,
+	// so conservatively use the minimum of these two reported values.
 	id<MTLComputePipelineState> cps = getCommandEncodingPool()->getCmdFillBufferMTLComputePipelineState();
-	NSUInteger tgWidth = cps.maxTotalThreadsPerThreadgroup;
+	NSUInteger tgWidth = std::min(cps.maxTotalThreadsPerThreadgroup, getMTLDevice().maxThreadsPerThreadgroup.width);
 	NSUInteger tgCount = _wordCount / tgWidth;
 
 	id<MTLComputeCommandEncoder> mtlComputeEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseFillBuffer);