Merge pull request #104 from billhollings/master

 Derive vkCmdCopyBuffer() alignment requirement at runtime.
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index 234a51d..c90f657 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -84,6 +84,7 @@
     VkDeviceSize maxMTLBufferSize;              /**< The max size of a MTLBuffer (in bytes). */
     VkDeviceSize mtlBufferAlignment;            /**< The alignment used when allocating memory for MTLBuffers. Must be PoT. */
     VkDeviceSize maxQueryBufferSize;            /**< The maximum size of an occlusion query buffer (in bytes). */
+	VkDeviceSize mtlCopyBufferAlignment;        /**< The alignment required during buffer copy operations (in bytes). */
     VkSampleCountFlags supportedSampleCounts;   /**< A bitmask identifying the sample counts supported by the device. */
 } MVKPhysicalDeviceMetalFeatures;
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdDispatch.mm b/MoltenVK/MoltenVK/Commands/MVKCmdDispatch.mm
index 68d19a8..0ab0d68 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdDispatch.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdDispatch.mm
@@ -37,8 +37,8 @@
 //    MVKLogDebug("vkCmdDispatch() dispatching (%d, %d, %d) threadgroups.", _x, _y, _z);
 
 	cmdEncoder->finalizeDispatchState();	// Ensure all updated state has been submitted to Metal
-    [cmdEncoder->getMTLComputeEncoder() dispatchThreadgroups: _mtlThreadgroupCount
-                                       threadsPerThreadgroup: cmdEncoder->_mtlThreadgroupSize];
+    [cmdEncoder->getMTLComputeEncoder(kMVKCommandUseDispatch) dispatchThreadgroups: _mtlThreadgroupCount
+															 threadsPerThreadgroup: cmdEncoder->_mtlThreadgroupSize];
 }
 
 MVKCmdDispatch::MVKCmdDispatch(MVKCommandTypePool<MVKCmdDispatch>* pool)
@@ -59,9 +59,9 @@
 //    MVKLogDebug("vkCmdDispatchIndirect() dispatching indirectly.");
 
     cmdEncoder->finalizeDispatchState();	// Ensure all updated state has been submitted to Metal
-    [cmdEncoder->getMTLComputeEncoder() dispatchThreadgroupsWithIndirectBuffer: _mtlIndirectBuffer
-                                                          indirectBufferOffset: _mtlIndirectBufferOffset
-                                                         threadsPerThreadgroup: cmdEncoder->_mtlThreadgroupSize];
+    [cmdEncoder->getMTLComputeEncoder(kMVKCommandUseDispatch) dispatchThreadgroupsWithIndirectBuffer: _mtlIndirectBuffer
+																				indirectBufferOffset: _mtlIndirectBufferOffset
+																			   threadsPerThreadgroup: cmdEncoder->_mtlThreadgroupSize];
 }
 
 MVKCmdDispatchIndirect::MVKCmdDispatchIndirect(MVKCommandTypePool<MVKCmdDispatchIndirect>* pool)
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index 9b3e81c..ec628d1 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -555,45 +555,41 @@
 }
 
 void MVKCmdCopyBuffer::encode(MVKCommandEncoder* cmdEncoder) {
-    id<MTLBuffer> srcMTLBuff = _srcBuffer->getMTLBuffer();
-    NSUInteger srcMTLBuffOffset = _srcBuffer->getMTLBufferOffset();
+	id<MTLBuffer> srcMTLBuff = _srcBuffer->getMTLBuffer();
+	NSUInteger srcMTLBuffOffset = _srcBuffer->getMTLBufferOffset();
 
-    id<MTLBuffer> dstMTLBuff = _dstBuffer->getMTLBuffer();
-    NSUInteger dstMTLBuffOffset = _dstBuffer->getMTLBufferOffset();
+	id<MTLBuffer> dstMTLBuff = _dstBuffer->getMTLBuffer();
+	NSUInteger dstMTLBuffOffset = _dstBuffer->getMTLBufferOffset();
 
-    for (auto& cpyRgn : _mtlBuffCopyRegions) {
-#if MVK_MACOS
-        const bool useComputeCopy = cpyRgn.srcOffset % 4 != 0
-            || cpyRgn.dstOffset % 4 != 0
-            || cpyRgn.size % 4 != 0;
-#else
-        const bool useComputeCopy = false;
-#endif
-        if (useComputeCopy)
-        {
-            MVKAssert(
-                cpyRgn.srcOffset <= UINT32_MAX || cpyRgn.dstOffset <= UINT32_MAX || cpyRgn.size <= UINT32_MAX,
-                "Compute buffer copy region offsets and size must fit into a 32-bit unsigned integer.");
+	VkDeviceSize buffAlign = getDevice()->_pMetalFeatures->mtlCopyBufferAlignment;
 
-            id<MTLComputeCommandEncoder> mtlComputeEnc = cmdEncoder->getMTLComputeEncoder();
-            id<MTLComputePipelineState> pipelineState = cmdEncoder->getCommandEncodingPool()->getCopyBufferBytesComputePipelineState();
-            [mtlComputeEnc setComputePipelineState:pipelineState];
-            [mtlComputeEnc setBuffer:srcMTLBuff offset:srcMTLBuffOffset atIndex:0];
-            [mtlComputeEnc setBuffer:dstMTLBuff offset:dstMTLBuffOffset atIndex:1];
-            uint32_t copyInfo[3] = { (uint32_t)cpyRgn.srcOffset,  (uint32_t)cpyRgn.dstOffset, (uint32_t)cpyRgn.size };
-            [mtlComputeEnc setBytes:copyInfo length:sizeof(copyInfo) atIndex:2];
-            [mtlComputeEnc dispatchThreads:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-        }
-        else
-        {
-            id<MTLBlitCommandEncoder> mtlBlitEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyBuffer);
-            [mtlBlitEnc copyFromBuffer: srcMTLBuff
-                          sourceOffset: (srcMTLBuffOffset + cpyRgn.srcOffset)
-                              toBuffer: dstMTLBuff
-                     destinationOffset: (dstMTLBuffOffset + cpyRgn.dstOffset)
-                                  size: cpyRgn.size];
-        }
-    }
+	for (auto& cpyRgn : _mtlBuffCopyRegions) {
+		const bool useComputeCopy = buffAlign > 1 && (cpyRgn.srcOffset % buffAlign != 0 ||
+													  cpyRgn.dstOffset % buffAlign != 0 ||
+													  cpyRgn.size      % buffAlign != 0);
+		if (useComputeCopy) {
+			MVKAssert(cpyRgn.srcOffset <= UINT32_MAX || cpyRgn.dstOffset <= UINT32_MAX || cpyRgn.size <= UINT32_MAX,
+					  "Compute buffer copy region offsets and size must fit into a 32-bit unsigned integer.");
+
+			id<MTLComputeCommandEncoder> mtlComputeEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyBuffer);
+			[mtlComputeEnc pushDebugGroup: @"vkCmdCopyBuffer"];
+			id<MTLComputePipelineState> pipelineState = cmdEncoder->getCommandEncodingPool()->getCopyBufferBytesComputePipelineState();
+			[mtlComputeEnc setComputePipelineState:pipelineState];
+			[mtlComputeEnc setBuffer:srcMTLBuff offset:srcMTLBuffOffset atIndex:0];
+			[mtlComputeEnc setBuffer:dstMTLBuff offset:dstMTLBuffOffset atIndex:1];
+			uint32_t copyInfo[3] = { (uint32_t)cpyRgn.srcOffset,  (uint32_t)cpyRgn.dstOffset, (uint32_t)cpyRgn.size };
+			[mtlComputeEnc setBytes:copyInfo length:sizeof(copyInfo) atIndex:2];
+			[mtlComputeEnc dispatchThreads: MTLSizeMake(1, 1, 1) threadsPerThreadgroup: MTLSizeMake(1, 1, 1)];
+			[mtlComputeEnc popDebugGroup];
+		} else {
+			id<MTLBlitCommandEncoder> mtlBlitEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyBuffer);
+			[mtlBlitEnc copyFromBuffer: srcMTLBuff
+						  sourceOffset: (srcMTLBuffOffset + cpyRgn.srcOffset)
+							  toBuffer: dstMTLBuff
+					 destinationOffset: (dstMTLBuffOffset + cpyRgn.dstOffset)
+								  size: cpyRgn.size];
+		}
+	}
 }
 
 MVKCmdCopyBuffer::MVKCmdCopyBuffer(MVKCommandTypePool<MVKCmdCopyBuffer>* pool)
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
index 86d8841..e002162 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
@@ -294,12 +294,13 @@
 	void flush();
 
 	/** 
-	 * The current Metal compute encoder.
+	 * The current Metal compute encoder for the specified use,
+	 * which determines the label assigned to the returned encoder.
 	 *
 	 * If the current encoder is not a compute encoder, this function ends current before 
 	 * beginning compute encoding.
 	 */
-	id<MTLComputeCommandEncoder> getMTLComputeEncoder();
+	id<MTLComputeCommandEncoder> getMTLComputeEncoder(MVKCommandUse cmdUse);
 
 	/**
 	 * The current Metal BLIT encoder for the specified use,
@@ -308,7 +309,7 @@
 	 * If the current encoder is not a BLIT encoder, this function ends 
      * the current encoder before beginning BLIT encoding.
 	 */
-	id<MTLBlitCommandEncoder> getMTLBlitEncoder(MVKCommandUse cmdBlitEncUse);
+	id<MTLBlitCommandEncoder> getMTLBlitEncoder(MVKCommandUse cmdUse);
 
 	/** Returns the push constants associated with the specified shader stage. */
 	MVKPushConstantsCommandEncoderState* getPushConstants(VkShaderStageFlagBits shaderStage);
@@ -424,6 +425,7 @@
     MVKActivatedQueries* _pActivatedQueries;
 	std::vector<VkClearValue> _clearValues;
 	id<MTLComputeCommandEncoder> _mtlComputeEncoder;
+	MVKCommandUse _mtlComputeEncoderUse;
 	id<MTLBlitCommandEncoder> _mtlBlitEncoder;
     MVKCommandUse _mtlBlitEncoderUse;
 	MVKPushConstantsCommandEncoderState _vertexPushConstants;
@@ -499,11 +501,14 @@
 #pragma mark Support functions
 
 /** Returns a name, suitable for use as a MTLCommandBuffer label, based on the MVKCommandUse. */
-NSString* mvkMTLCommandBufferLabel(MVKCommandUse cmdBuffUse);
+NSString* mvkMTLCommandBufferLabel(MVKCommandUse cmdUse);
 
 /** Returns a name, suitable for use as a MTLRenderCommandEncoder label, based on the MVKCommandUse. */
-NSString* mvkMTLRenderCommandEncoderLabel(MVKCommandUse cmdBlitEncUse);
+NSString* mvkMTLRenderCommandEncoderLabel(MVKCommandUse cmdUse);
 
 /** Returns a name, suitable for use as a MTLBlitCommandEncoder label, based on the MVKCommandUse. */
-NSString* mvkMTLBlitCommandEncoderLabel(MVKCommandUse cmdBlitEncUse);
+NSString* mvkMTLBlitCommandEncoderLabel(MVKCommandUse cmdUse);
+
+/** Returns a name, suitable for use as a MTLComputeCommandEncoder label, based on the MVKCommandUse. */
+NSString* mvkMTLComputeCommandEncoderLabel(MVKCommandUse cmdUse);
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
index d56b5f6..19e1187 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
@@ -342,6 +342,7 @@
 
 	[_mtlComputeEncoder endEncoding];
 	_mtlComputeEncoder = nil;       // not retained
+	_mtlComputeEncoderUse = kMVKCommandUseNone;
 
 	[_mtlBlitEncoder endEncoding];
 	_mtlBlitEncoder = nil;          // not retained
@@ -362,22 +363,26 @@
 	}
 }
 
-id<MTLComputeCommandEncoder> MVKCommandEncoder::getMTLComputeEncoder() {
+id<MTLComputeCommandEncoder> MVKCommandEncoder::getMTLComputeEncoder(MVKCommandUse cmdUse) {
 	if ( !_mtlComputeEncoder ) {
 		endCurrentMetalEncoding();
 		_mtlComputeEncoder = [_mtlCmdBuffer computeCommandEncoder];		// not retained
 	}
+	if (_mtlComputeEncoderUse != cmdUse) {
+		_mtlComputeEncoderUse = cmdUse;
+		_mtlComputeEncoder.label = mvkMTLComputeCommandEncoderLabel(cmdUse);
+	}
 	return _mtlComputeEncoder;
 }
 
-id<MTLBlitCommandEncoder> MVKCommandEncoder::getMTLBlitEncoder(MVKCommandUse cmdBlitEncUse) {
+id<MTLBlitCommandEncoder> MVKCommandEncoder::getMTLBlitEncoder(MVKCommandUse cmdUse) {
 	if ( !_mtlBlitEncoder ) {
 		endCurrentMetalEncoding();
 		_mtlBlitEncoder = [_mtlCmdBuffer blitCommandEncoder];   // not retained
 	}
-    if (_mtlBlitEncoderUse != cmdBlitEncUse) {
-        _mtlBlitEncoderUse = cmdBlitEncUse;
-        _mtlBlitEncoder.label = mvkMTLBlitCommandEncoderLabel(cmdBlitEncUse);
+    if (_mtlBlitEncoderUse != cmdUse) {
+        _mtlBlitEncoderUse = cmdUse;
+        _mtlBlitEncoder.label = mvkMTLBlitCommandEncoderLabel(cmdUse);
     }
 	return _mtlBlitEncoder;
 }
@@ -513,6 +518,7 @@
             _mtlCmdBuffer = nil;
             _mtlRenderEncoder = nil;
             _mtlComputeEncoder = nil;
+			_mtlComputeEncoderUse = kMVKCommandUseNone;
             _mtlBlitEncoder = nil;
             _mtlBlitEncoderUse = kMVKCommandUseNone;
 }
@@ -550,8 +556,8 @@
 #pragma mark -
 #pragma mark Support functions
 
-NSString* mvkMTLCommandBufferLabel(MVKCommandUse cmdBuffUse) {
-    switch (cmdBuffUse) {
+NSString* mvkMTLCommandBufferLabel(MVKCommandUse cmdUse) {
+    switch (cmdUse) {
         case kMVKCommandUseQueueSubmit:     return @"vkQueueSubmit CommandBuffer";
         case kMVKCommandUseQueuePresent:    return @"vkQueuePresentKHR CommandBuffer";
         case kMVKCommandUseQueueWaitIdle:   return @"vkQueueWaitIdle CommandBuffer";
@@ -560,8 +566,8 @@
     }
 }
 
-NSString* mvkMTLRenderCommandEncoderLabel(MVKCommandUse cmdRendEncUse) {
-    switch (cmdRendEncUse) {
+NSString* mvkMTLRenderCommandEncoderLabel(MVKCommandUse cmdUse) {
+    switch (cmdUse) {
         case kMVKCommandUseBeginRenderPass:         return @"vkCmdBeginRenderPass RenderEncoder";
         case kMVKCommandUseNextSubpass:             return @"vkCmdNextSubpass RenderEncoder";
         case kMVKCommandUseBlitImage:               return @"vkCmdBlitImage RenderEncoder";
@@ -573,8 +579,8 @@
     }
 }
 
-NSString* mvkMTLBlitCommandEncoderLabel(MVKCommandUse cmdBlitEncUse) {
-    switch (cmdBlitEncUse) {
+NSString* mvkMTLBlitCommandEncoderLabel(MVKCommandUse cmdUse) {
+    switch (cmdUse) {
         case kMVKCommandUsePipelineBarrier:     return @"vkCmdPipelineBarrier BlitEncoder";
         case kMVKCommandUseCopyImage:           return @"vkCmdCopyImage BlitEncoder";
         case kMVKCommandUseResolveCopyImage:    return @"vkCmdResolveImage (copy stage) RenderEncoder";
@@ -588,3 +594,12 @@
     }
 }
 
+NSString* mvkMTLComputeCommandEncoderLabel(MVKCommandUse cmdUse) {
+	switch (cmdUse) {
+		case kMVKCommandUseDispatch:          	return @"vkCmdDispatch ComputeEncoder";
+		case kMVKCommandUseCopyBuffer:          return @"vkCmdCopyBuffer ComputeEncoder";
+		case kMVKCommandUseFillBuffer:          return @"vkCmdFillBuffer ComputeEncoder";
+		default:                                return @"Unknown Use ComputeEncoder";
+	}
+}
+
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
index 65b3d51..3fa8ca4 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
@@ -140,7 +140,7 @@
                                           _mtlBufferIndex);
             break;
         case VK_SHADER_STAGE_COMPUTE_BIT:
-            _cmdEncoder->setComputeBytes(_cmdEncoder->getMTLComputeEncoder(),
+            _cmdEncoder->setComputeBytes(_cmdEncoder->getMTLComputeEncoder(kMVKCommandUseDispatch),
                                          _pushConstants.data(),
                                          _pushConstants.size(),
                                          _mtlBufferIndex);
@@ -496,21 +496,21 @@
 
     encodeBinding<MVKMTLBufferBinding>(_bufferBindings, _areBufferBindingsDirty,
                                        [](MVKCommandEncoder* cmdEncoder, MVKMTLBufferBinding& b)->void {
-                                           [cmdEncoder->getMTLComputeEncoder() setBuffer: b.mtlBuffer
-                                                                                  offset: b.offset
-                                                                                 atIndex: b.index];
+                                           [cmdEncoder->getMTLComputeEncoder(kMVKCommandUseDispatch) setBuffer: b.mtlBuffer
+																										offset: b.offset
+																									   atIndex: b.index];
                                        });
 
     encodeBinding<MVKMTLTextureBinding>(_textureBindings, _areTextureBindingsDirty,
                                         [](MVKCommandEncoder* cmdEncoder, MVKMTLTextureBinding& b)->void {
-                                            [cmdEncoder->getMTLComputeEncoder() setTexture: b.mtlTexture
-                                                                                   atIndex: b.index];
+                                            [cmdEncoder->getMTLComputeEncoder(kMVKCommandUseDispatch) setTexture: b.mtlTexture
+																										 atIndex: b.index];
                                         });
 
     encodeBinding<MVKMTLSamplerStateBinding>(_samplerStateBindings, _areSamplerStateBindingsDirty,
                                              [](MVKCommandEncoder* cmdEncoder, MVKMTLSamplerStateBinding& b)->void {
-                                                 [cmdEncoder->getMTLComputeEncoder() setSamplerState: b.mtlSamplerState
-                                                                                             atIndex: b.index];
+                                                 [cmdEncoder->getMTLComputeEncoder(kMVKCommandUseDispatch) setSamplerState: b.mtlSamplerState
+																												   atIndex: b.index];
                                              });
 }
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index 77dd83d..2209fcc 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -306,6 +306,7 @@
     _metalFeatures.mslVersion = SPIRVToMSLConverterOptions::makeMSLVersion(1);
     _metalFeatures.maxPerStageTextureCount = 31;
     _metalFeatures.mtlBufferAlignment = 64;
+	_metalFeatures.mtlCopyBufferAlignment = 1;
     _metalFeatures.texelBuffers = true;
 
     if ( [_mtlDevice supportsFeatureSet: MTLFeatureSet_iOS_GPUFamily3_v1] ) {
@@ -333,6 +334,7 @@
     _metalFeatures.mslVersion = SPIRVToMSLConverterOptions::makeMSLVersion(1, 1);
     _metalFeatures.maxPerStageTextureCount = 128;
     _metalFeatures.mtlBufferAlignment = 256;
+	_metalFeatures.mtlCopyBufferAlignment = 4;
 	_metalFeatures.indirectDrawing = true;
 	_metalFeatures.baseVertexInstanceDrawing = true;
     _metalFeatures.ioSurfaces = true;
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
index dc3740d..84af75e 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
@@ -395,9 +395,7 @@
 #pragma mark MVKComputePipeline
 
 void MVKComputePipeline::encode(MVKCommandEncoder* cmdEncoder) {
-    id<MTLComputeCommandEncoder> mtlCmdEnc = cmdEncoder->getMTLComputeEncoder();
-    [mtlCmdEnc setComputePipelineState: _mtlPipelineState];
-
+    [cmdEncoder->getMTLComputeEncoder(kMVKCommandUseDispatch) setComputePipelineState: _mtlPipelineState];
     cmdEncoder->_mtlThreadgroupSize = _mtlThreadgroupSize;
 }
 
diff --git a/MoltenVK/MoltenVK/Utility/MVKFoundation.h b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
index 9f1a4df..5066df8 100644
--- a/MoltenVK/MoltenVK/Utility/MVKFoundation.h
+++ b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
@@ -88,6 +88,7 @@
     kMVKCommandUseClearColorImage,          /**< vkCmdClearColorImage. */
     kMVKCommandUseClearDepthStencilImage,   /**< vkCmdClearDepthStencilImage. */
     kMVKCommandUseResetQueryPool,           /**< vkCmdResetQueryPool. */
+	kMVKCommandUseDispatch,                 /**< vkCmdDispatch. */
 } MVKCommandUse;
 
 /**
diff --git a/README.md b/README.md
index 28f8a97..3e08899 100644
--- a/README.md
+++ b/README.md
@@ -38,6 +38,9 @@
 To learn how to integrate the **MoltenVK** runtime into a game or application, see the 
 [`Docs/MoltenVK_Runtime_UserGuide.md `](Docs/MoltenVK_Runtime_UserGuide.md) document in the `Docs` directory. 
 
+If you are just looking for a pre-built **MoltenVK** runtime binary, you can download it as part of the 
+[*LunarG SDK*](https://vulkan.lunarg.com).
+
 
 
 <a name="intro"></a>