Merge pull request #715 from billhollings/master

Add ability to automatically cause an Xcode GPU capture without developer intervention.
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index fd3c8e8..3239b8d 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -21,6 +21,7 @@
 - Add support for extensions:
 	- `VK_KHR_device_group`
 - Add support for `VkEvent`, using either native `MTLEvent` or emulation when `MTLEvent` not available.
+- `vkInvalidateMappedMemoryRanges()` synchronizes managed device memory to CPU.
 - Revert to supporting host-coherent memory for linear images on macOS.
 - Ensure Vulkan loader magic number is set every time before returning any dispatchable Vulkan handle.
 - Fix crash when `VkDeviceCreateInfo` specifies queue families out of numerical order.
@@ -32,6 +33,7 @@
 - No longer prefer dedicated allocations for buffer memory, including buffer-backed images.
 - Handle the `compositeAlpha` member of `VkSwapchainCreateInfoKHR`.
 - `VkPhysicalDevicePortabilitySubsetFeaturesEXTX::events` set to `true`.
+- Add ability to automatically cause an *Xcode* GPU capture without developer intervention.
 
 
 
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index eef5e65..352b732 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -102,8 +102,8 @@
  *      2: Log errors and informational messages.
  *    If neither is set, errors and informational messages are logged.
  *
- * 2. Setting the MVK_CONFIG_TRACE_VULKAN_CALLS runtime environment variable or MoltenVK compile-time build
- *    setting will cause MoltenVK to log the name of each Vulkan call made by the application. The logging
+ * 2. The MVK_CONFIG_TRACE_VULKAN_CALLS runtime environment variable or MoltenVK compile-time build
+ *    setting causes MoltenVK to log the name of each Vulkan call made by the application. The logging
  *    format options can be controlled by setting the value of MVK_CONFIG_TRACE_VULKAN_CALLS as follows:
  *        0: No Vulkan call logging.
  *        1: Log the name of each Vulkan call when the call is entered.
@@ -117,6 +117,18 @@
  * 4. Setting the MVK_ALLOW_METAL_EVENTS runtime environment variable or MoltenVK compile-time build
  *    setting to 1 will cause MoltenVK to use Metal events, if they are available on the device, for
  *    for VkSemaphore sychronization behaviour. This is disabled by default.
+ *
+ * 5. The MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE runtime environment variable or MoltenVK compile-time
+ *    build setting controls whether Xcode should run an automatic GPU capture without the user
+ *    having to trigger it manually via the Xcode user interface, and controls the scope under
+ *    which that GPU capture will occur. This is useful when trying to capture a one-shot GPU
+ *    trace, such as when running a Vulkan CTS test case. For the automatic GPU capture to occur,
+ *    the Xcode scheme under which the app is run must have the Metal GPU capture option turned on.
+ *    Do not set MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE if you intend to trigger the GPU capture
+ *    manually via the Xcode user interface.
+ *      0: No automatic GPU capture.
+ *      1: Capture all GPU commands issued during the lifetime of the VkDevice.
+ *    If neither is set, no automatic GPU capture will occur.
  */
 typedef struct {
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
index dcf566d..adbf683 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
@@ -110,10 +110,8 @@
 #endif
 }
 
-/**
- * Returns whether the specified buffer memory barrier requires a sync between this
- * buffer and host memory for the purpose of the host reading texture memory.
- */
+// Returns whether the specified buffer memory barrier requires a sync between this
+// buffer and host memory for the purpose of the host reading texture memory.
 bool MVKBuffer::needsHostReadSync(VkPipelineStageFlags srcStageMask,
 								  VkPipelineStageFlags dstStageMask,
 								  VkBufferMemoryBarrier* pBufferMemoryBarrier) {
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
index 5f267df..b4a7461 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
@@ -360,6 +360,11 @@
 #pragma mark -
 #pragma mark MVKDevice
 
+typedef struct {	// Holder pairing a Metal blit command encoder with the command buffer it was created from; both members start out nil.
+	id<MTLBlitCommandEncoder> mtlBlitEncoder = nil;
+	id<MTLCommandBuffer> mtlCmdBuffer = nil;
+} MVKMTLBlitEncoder;
+
 /** Represents a Vulkan logical GPU device, associated with a physical device. */
 class MVKDevice : public MVKDispatchableVulkanAPIObject {
 
@@ -387,7 +392,7 @@
 	PFN_vkVoidFunction getProcAddr(const char* pName);
 
 	/** Retrieves a queue at the specified index within the specified family. */
-	MVKQueue* getQueue(uint32_t queueFamilyIndex, uint32_t queueIndex);
+	MVKQueue* getQueue(uint32_t queueFamilyIndex = 0, uint32_t queueIndex = 0);
 
 	/** Block the current thread until all queues in this device are idle. */
 	VkResult waitIdle();
@@ -528,6 +533,9 @@
 	void freeMemory(MVKDeviceMemory* mvkDevMem,
 					const VkAllocationCallbacks* pAllocator);
 
+
+#pragma mark Operations
+
 	/** Applies the specified global memory barrier to all resource issued by this device. */
 	void applyMemoryBarrier(VkPipelineStageFlags srcStageMask,
 							VkPipelineStageFlags dstStageMask,
@@ -565,6 +573,9 @@
     /** Populates the specified statistics structure from the current activity performance statistics. */
     void getPerformanceStatistics(MVKPerformanceStatistics* pPerf);
 
+	/** Invalidates the specified mapped memory ranges, pulling their contents from the device so they are visible to the host. */
+	VkResult invalidateMappedMemoryRanges(uint32_t memRangeCount, const VkMappedMemoryRange* pMemRanges);
+
 
 #pragma mark Metal
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index e900a61..170a6c4 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -2122,6 +2122,9 @@
 	mvkDevMem->destroy();
 }
 
+
+#pragma mark Operations
+
 // Adds the specified resource for tracking, and returns the added resource.
 MVKResource* MVKDevice::addResource(MVKResource* rez) {
 	lock_guard<mutex> lock(_rezLock);
@@ -2197,6 +2200,25 @@
     if (pPerf) { *pPerf = _performanceStatistics; }
 }
 
+VkResult MVKDevice::invalidateMappedMemoryRanges(uint32_t memRangeCount, const VkMappedMemoryRange* pMemRanges) {	// Pulls each listed range from the device; returns the first non-success result, otherwise VK_SUCCESS.
+	@autoreleasepool {
+		VkResult rslt = VK_SUCCESS;
+		MVKMTLBlitEncoder mvkBlitEnc;	// One holder shared by every range; pullFromDevice() may populate it.
+		for (uint32_t i = 0; i < memRangeCount; i++) {
+			const VkMappedMemoryRange* pMem = &pMemRanges[i];
+			MVKDeviceMemory* mvkMem = (MVKDeviceMemory*)pMem->memory;
+			VkResult r = mvkMem->pullFromDevice(pMem->offset, pMem->size, false, &mvkBlitEnc);
+			if (rslt == VK_SUCCESS) { rslt = r; }	// Remember only the first failure; remaining ranges are still processed.
+		}
+		if (mvkBlitEnc.mtlBlitEncoder) { [mvkBlitEnc.mtlBlitEncoder endEncoding]; }	// Non-nil only if a pullFromDevice() call created an encoder.
+		if (mvkBlitEnc.mtlCmdBuffer) {
+			[mvkBlitEnc.mtlCmdBuffer commit];
+			[mvkBlitEnc.mtlCmdBuffer waitUntilCompleted];	// Blocks the calling thread until the committed command buffer has completed.
+		}
+		return rslt;
+	}
+}
+
 
 #pragma mark Metal
 
@@ -2279,6 +2301,10 @@
 
 	initQueues(pCreateInfo);
 
+	if (getInstance()->_autoGPUCaptureScope == MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE_DEVICE) {
+		[[MTLCaptureManager sharedCaptureManager] startCaptureWithDevice: getMTLDevice()];
+	}
+
 	MVKLogInfo("Created VkDevice to run on GPU %s with the following %d Vulkan extensions enabled:%s",
 			   _pProperties->deviceName,
 			   _enabledExtensions.getEnabledCount(),
@@ -2562,6 +2588,10 @@
 
 	[_mtlCompileOptions release];
     [_globalVisibilityResultMTLBuffer release];
+
+	if (getInstance()->_autoGPUCaptureScope == MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE_DEVICE) {
+		[[MTLCaptureManager sharedCaptureManager] stopCapture];
+	}
 }
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
index 9d468ef..557c51c 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
@@ -85,8 +85,18 @@
 	 * If this memory is host-visible, pulls the specified memory range from the device.
 	 * Normally, pulling will only occur if the device memory is non-coherent, but pulling
 	 * to coherent memory can be forced by setting evenIfCoherent to true.
+	 *
+	 * If pBlitEnc is not null, it points to a holder for a MTLBlitCommandEncoder and its
+	 * associated MTLCommandBuffer. If this instance has a MTLBuffer using managed memory,
+	 * this function may call synchronizeResource: on the MTLBlitCommandEncoder to
+	 * synchronize the GPU contents to the CPU. If the contents of the pBlitEnc do not
+	 * include a MTLBlitCommandEncoder and MTLCommandBuffer, this function will create
+	 * them and populate the contents into the MVKMTLBlitEncoder struct.
 	 */
-	VkResult pullFromDevice(VkDeviceSize offset, VkDeviceSize size, bool evenIfCoherent = false);
+	VkResult pullFromDevice(VkDeviceSize offset,
+							VkDeviceSize size,
+							bool evenIfCoherent = false,
+							MVKMTLBlitEncoder* pBlitEnc = nullptr);
 
 
 #pragma mark Metal
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
index ee4aedf..5d5dd09 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
@@ -19,6 +19,7 @@
 #include "MVKDeviceMemory.h"
 #include "MVKBuffer.h"
 #include "MVKImage.h"
+#include "MVKQueue.h"
 #include "MVKEnvironment.h"
 #include "mvk_datatypes.hpp"
 #include "MVKFoundation.h"
@@ -91,12 +92,24 @@
 	return VK_SUCCESS;
 }
 
-VkResult MVKDeviceMemory::pullFromDevice(VkDeviceSize offset, VkDeviceSize size, bool evenIfCoherent) {
+VkResult MVKDeviceMemory::pullFromDevice(VkDeviceSize offset,
+										 VkDeviceSize size,
+										 bool evenIfCoherent,
+										 MVKMTLBlitEncoder* pBlitEnc) {
 	// Coherent memory is flushed on unmap(), so it is only flushed if forced
     VkDeviceSize memSize = adjustMemorySize(size, offset);
 	if (memSize > 0 && isMemoryHostAccessible() && (evenIfCoherent || !isMemoryHostCoherent()) ) {
 		lock_guard<mutex> lock(_rezLock);
         for (auto& img : _images) { img->pullFromDevice(offset, memSize); }
+
+#if MVK_MACOS
+		if (pBlitEnc && _mtlBuffer && _mtlStorageMode == MTLStorageModeManaged) {	// Only when the caller supplied a holder and this memory has a managed-storage MTLBuffer.
+			if ( !pBlitEnc->mtlCmdBuffer) { pBlitEnc->mtlCmdBuffer = [_device->getQueue()->getMTLCommandQueue() commandBufferWithUnretainedReferences]; }	// Create the command buffer on first use, from the queue returned by getQueue() with its default arguments.
+			if ( !pBlitEnc->mtlBlitEncoder) { pBlitEnc->mtlBlitEncoder = [pBlitEnc->mtlCmdBuffer blitCommandEncoder]; }	// Create the encoder on first use; later calls reuse the one already in the holder.
+			[pBlitEnc->mtlBlitEncoder synchronizeResource: _mtlBuffer];	// Only encodes the sync; this function does not end encoding or commit the command buffer.
+		}
+#endif
+
 	}
 	return VK_SUCCESS;
 }
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKInstance.h b/MoltenVK/MoltenVK/GPUObjects/MVKInstance.h
index 4bbef89..301a4f0 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKInstance.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKInstance.h
@@ -196,6 +196,7 @@
 	bool _hasDebugUtilsMessengers;
 	bool _useCreationCallbacks;
 	const char* _debugReportCallbackLayerPrefix;
+	int32_t _autoGPUCaptureScope;
 };
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKInstance.mm b/MoltenVK/MoltenVK/GPUObjects/MVKInstance.mm
index 7a32e7a..c184f7d 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKInstance.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKInstance.mm
@@ -679,6 +679,8 @@
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL( _mvkConfig.fullImageViewSwizzle,                   MVK_CONFIG_FULL_IMAGE_VIEW_SWIZZLE);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL( _mvkConfig.defaultGPUCaptureScopeQueueFamilyIndex, MVK_CONFIG_DEFAULT_GPU_CAPTURE_SCOPE_QUEUE_FAMILY_INDEX);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL( _mvkConfig.defaultGPUCaptureScopeQueueIndex,       MVK_CONFIG_DEFAULT_GPU_CAPTURE_SCOPE_QUEUE_INDEX);
+
+	MVK_SET_FROM_ENV_OR_BUILD_INT32(_autoGPUCaptureScope, MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE);
 }
 
 VkResult MVKInstance::verifyLayers(uint32_t count, const char* const* names) {
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKSwapchain.mm b/MoltenVK/MoltenVK/GPUObjects/MVKSwapchain.mm
index e1e6467..b10aac6 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKSwapchain.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKSwapchain.mm
@@ -166,14 +166,10 @@
 		signal(signaler);
 		if (_device->_useMTLEventsForSemaphores) {
 			// Unfortunately, we can't assume we have an MTLSharedEvent here.
-			// This means we need to execute a command on the device to signal
-			// the semaphore. Alternatively, we could always use an MTLSharedEvent,
-			// but that might impose unacceptable performance costs just to handle
-			// this one case.
-			MVKQueue* queue = _device->getQueue(0, 0);	
-			id<MTLCommandQueue> mtlQ = queue->getMTLCommandQueue();
-			id<MTLCommandBuffer> mtlCmdBuff = [mtlQ commandBufferWithUnretainedReferences];
-			[mtlCmdBuff enqueue];
+			// This means we need to execute a command on the device to signal the semaphore.
+			// Alternatively, we could always use an MTLSharedEvent, but that might impose
+			// unacceptable performance costs just to handle this one case.
+			id<MTLCommandBuffer> mtlCmdBuff = [_device->getQueue()->getMTLCommandQueue() commandBufferWithUnretainedReferences];
 			signaler.first->encodeSignal(mtlCmdBuff);
 			[mtlCmdBuff commit];
 		}
diff --git a/MoltenVK/MoltenVK/Utility/MVKEnvironment.h b/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
index 56471ad..2114446 100644
--- a/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
+++ b/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
@@ -141,6 +141,17 @@
 #   define MVK_CONFIG_DEFAULT_GPU_CAPTURE_SCOPE_QUEUE_INDEX    0
 #endif
 
+/**
+ * The scope under which to automatically run a GPU capture within Xcode, without the
+ * developer having to trigger it manually via the Xcode UI. This is useful when trying
+ * to capture a one-shot trace, such as when running a Vulkan CTS test case.
+ */
+#define MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE_NONE		0
+#define MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE_DEVICE	1
+#ifndef MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE
+#   define MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE    	MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE_NONE
+#endif
+
 /** Force the use of a low-power GPU if it exists. Disabled by default. */
 #ifndef MVK_CONFIG_FORCE_LOW_POWER_GPU
 #   define MVK_CONFIG_FORCE_LOW_POWER_GPU    0
diff --git a/MoltenVK/MoltenVK/Vulkan/vulkan.mm b/MoltenVK/MoltenVK/Vulkan/vulkan.mm
index e44f368..a1e2fc9 100644
--- a/MoltenVK/MoltenVK/Vulkan/vulkan.mm
+++ b/MoltenVK/MoltenVK/Vulkan/vulkan.mm
@@ -429,15 +429,10 @@
     VkDevice                                    device,
     uint32_t                                    memRangeCount,
     const VkMappedMemoryRange*                  pMemRanges) {
-	
+
 	MVKTraceVulkanCallStart();
-	VkResult rslt = VK_SUCCESS;
-	for (uint32_t i = 0; i < memRangeCount; i++) {
-		const VkMappedMemoryRange* pMem = &pMemRanges[i];
-		MVKDeviceMemory* mvkMem = (MVKDeviceMemory*)pMem->memory;
-		VkResult r = mvkMem->pullFromDevice(pMem->offset, pMem->size);
-		if (rslt == VK_SUCCESS) { rslt = r; }
-	}
+	MVKDevice* mvkDev = MVKDevice::getMVKDevice(device);
+	VkResult rslt = mvkDev->invalidateMappedMemoryRanges(memRangeCount, pMemRanges);
 	MVKTraceVulkanCallEnd();
 	return rslt;
 }