Merge pull request #1367 from billhollings/device-loss-fixes

Device loss fixes
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index 41781fc..5dfa180 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -27,6 +27,9 @@
 - Rename `kMVKShaderStageMax` to `kMVKShaderStageCount`.
 - Fix crash when requesting `MTLCommandBuffer` logs in runtime debug mode on older OS versions.
 - Fix synchronization issue with locking `MTLArgumentEncoder` for Metal Argument Buffers.
+- Fix race condition on submission fence during device loss.
+- On command buffer submission failure, if `MVKConfiguration::resumeLostDevice` enabled,  do not release 
+  waits on `VkDevice`, and do not return `VK_ERROR_DEVICE_LOST`, unless `VkPhysicalDevice` is also lost.
 - Fix inconsistent handling of linear attachment decisions on Apple Silicon.
 - Protect against crash when retrieving `MTLTexture` when `VkImage` has no `VkDeviceMemory` bound.
 - Adjust some `VkPhysicalDeviceLimits` values for Vulkan and Metal compliance. 
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index a78e93c..c77d52d 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -780,24 +780,24 @@
 	 * Controls whether MoltenVK should treat a lost VkDevice as resumable, unless the
 	 * corresponding VkPhysicalDevice has also been lost. The VK_ERROR_DEVICE_LOST error has
 	 * a broad definitional range, and can mean anything from a GPU hiccup on the current
-	 * command buffer submission, to a phyically removed GPU. In the case where this error does
+	 * command buffer submission, to a physically removed GPU. In the case where this error does
 	 * not impact the VkPhysicalDevice, Vulkan requires that the app destroy and re-create a new
 	 * VkDevice. However, not all apps (including CTS) respect that requirement, leading to what
 	 * might be a transient command submission failure causing an unexpected catastophic app failure.
-	 * If this setting is enabled, in the case of a VK_ERROR_DEVICE_LOST error that does not
-	 * impact the VkPhysicalDevice, MoltenVK will remove the error condition on the VkDevice after
-	 * the current queue submission is finished, allowing the VkDevice to continue to be used.
-	 * If this setting is disabled, MoltenVK will maintain the VK_ERROR_DEVICE_LOST error condition
-	 * on the VkDevice, and subsequent use of that VkDevice will be reduced or prohibited.
 	 *
-	 * The value of this parameter should be changed before creating a VkDevice
-	 * that will use it, for the change to take effect.
+	 * If this setting is enabled, in the case of a VK_ERROR_DEVICE_LOST error that does NOT impact
+	 * the VkPhysicalDevice, MoltenVK will log the error, but will not mark the VkDevice as lost,
+	 * allowing the VkDevice to continue to be used. If this setting is disabled, MoltenVK will
+	 * mark the VkDevice as lost, and subsequent use of that VkDevice will be reduced or prohibited.
+	 *
+	 * The value of this parameter may be changed at any time during application runtime,
+	 * and the changed value will affect the error behavior of subsequent command submissions.
 	 *
 	 * The initial value or this parameter is set by the
 	 * MVK_CONFIG_RESUME_LOST_DEVICE
 	 * runtime environment variable or MoltenVK compile-time build setting.
-	 * If neither is set, this setting is disabled by default, and MoltenVK will not
-	 * resume a VkDevice that enters the VK_ERROR_DEVICE_LOST error state.
+	 * If neither is set, this setting is disabled by default, and MoltenVK
+	 * will mark the VkDevice as lost when a command submission failure occurs.
 	 */
 	VkBool32 resumeLostDevice;
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
index d2afd9e..b1bd357 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
@@ -447,8 +447,8 @@
 	/** Block the current thread until all queues in this device are idle. */
 	VkResult waitIdle();
 	
-	/** Mark this device as lost. Releases all waits for this device. */
-	VkResult markLost();
+	/** Mark this device (and optionally the physical device) as lost. Releases all waits for this device. */
+	VkResult markLost(bool alsoMarkPhysicalDevice = false);
 
 	/** Returns whether or not the given descriptor set layout is supported. */
 	void getDescriptorSetLayoutSupport(const VkDescriptorSetLayoutCreateInfo* pCreateInfo,
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index b46eea6..e70de7e 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -2861,9 +2861,12 @@
 	return VK_SUCCESS;
 }
 
-VkResult MVKDevice::markLost() {
+VkResult MVKDevice::markLost(bool alsoMarkPhysicalDevice) {
 	lock_guard<mutex> lock(_sem4Lock);
+
 	setConfigurationResult(VK_ERROR_DEVICE_LOST);
+	if (alsoMarkPhysicalDevice) { _physicalDevice->setConfigurationResult(VK_ERROR_DEVICE_LOST); }
+
 	for (auto* sem4 : _awaitingSemaphores) {
 		sem4->release();
 	}
@@ -2874,7 +2877,8 @@
 	}
 	_awaitingSemaphores.clear();
 	_awaitingTimelineSem4s.clear();
-	return VK_ERROR_DEVICE_LOST;
+
+	return getConfigurationResult();
 }
 
 void MVKDevice::getDescriptorSetLayoutSupport(const VkDescriptorSetLayoutCreateInfo* pCreateInfo,
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
index 68c5353..cc4d3c2 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
@@ -195,6 +195,8 @@
 
 	MVKQueueCommandBufferSubmission(MVKQueue* queue, const VkSubmitInfo* pSubmit, VkFence fence);
 
+	~MVKQueueCommandBufferSubmission() override;
+
 protected:
 	friend MVKCommandBuffer;
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
index fc6e06e..f9556af 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
@@ -139,10 +139,14 @@
 		.flags = 0,
 	};
 
-	MVKFence mvkFence(_device, &vkFenceInfo);
-	VkFence fence = (VkFence)&mvkFence;
-	submit(0, nullptr, fence);
-	return mvkWaitForFences(_device, 1, &fence, false);
+	// The MVKFence is retained by the command submission, and may outlive this function while
+	// the command submission finishes, so we can't allocate MVKFence locally on the stack.
+	MVKFence* mvkFence = new MVKFence(_device, &vkFenceInfo);
+	VkFence vkFence = (VkFence)mvkFence;
+	submit(0, nullptr, vkFence);
+	VkResult rslt = mvkWaitForFences(_device, 1, &vkFence, false);
+	mvkFence->destroy();
+	return rslt;
 }
 
 id<MTLCommandBuffer> MVKQueue::getMTLCommandBuffer(bool retainRefs) {
@@ -326,19 +330,21 @@
 	uint64_t startTime = mvkDev->getPerformanceTimestamp();
 	[mtlCmdBuff addCompletedHandler: ^(id<MTLCommandBuffer> mtlCB) {
 		if (mtlCB.status == MTLCommandBufferStatusError) {
-			getVulkanAPIObject()->reportError(mvkDev->markLost(), "Command buffer %p \"%s\" execution failed (code %li): %s", mtlCB, mtlCB.label ? mtlCB.label.UTF8String : "", mtlCB.error.code, mtlCB.error.localizedDescription.UTF8String);
-			// Some errors indicate we lost the physical device as well.
+			// If a command buffer error has occurred, report the error. If the error affects
+			// the physical device, always mark both the device and physical device as lost.
+			// If the error is local to this command buffer, optionally mark the device (but not the
+			// physical device) as lost, depending on the value of MVKConfiguration::resumeLostDevice.
+			getVulkanAPIObject()->reportError(VK_ERROR_DEVICE_LOST, "Command buffer %p \"%s\" execution failed (code %li): %s", mtlCB, mtlCB.label ? mtlCB.label.UTF8String : "", mtlCB.error.code, mtlCB.error.localizedDescription.UTF8String);
 			switch (mtlCB.error.code) {
 				case MTLCommandBufferErrorBlacklisted:
-				// XXX This may also be used for command buffers executed in the background without the right entitlement.
-				case MTLCommandBufferErrorNotPermitted:
+				case MTLCommandBufferErrorNotPermitted:	// May also be used for command buffers executed in the background without the right entitlement.
 #if MVK_MACOS && !MVK_MACCAT
 				case MTLCommandBufferErrorDeviceRemoved:
 #endif
-					mvkDev->getPhysicalDevice()->setConfigurationResult(VK_ERROR_DEVICE_LOST);
+					mvkDev->markLost(true);
 					break;
 				default:
-					if (mvkConfig().resumeLostDevice) { mvkDev->clearConfigurationResult(); }
+					if ( !mvkConfig().resumeLostDevice ) { mvkDev->markLost(); }
 					break;
 			}
 #if MVK_XCODE_12
@@ -399,6 +405,9 @@
 	this->destroy();
 }
 
+// On device loss, the fence and signal semaphores may be signalled early, and they might then
+// be destroyed on the waiting thread before this submission is done with them. We therefore
+// retain() each here to ensure they live long enough for this submission to finish using them.
 MVKQueueCommandBufferSubmission::MVKQueueCommandBufferSubmission(MVKQueue* queue,
 																 const VkSubmitInfo* pSubmit,
 																 VkFence fence)
@@ -427,20 +436,28 @@
         }
         uint32_t ssCnt = pSubmit->signalSemaphoreCount;
         _signalSemaphores.reserve(ssCnt);
-        for (uint32_t i = 0; i < ssCnt; i++) {
-			auto ss = make_pair((MVKSemaphore*)pSubmit->pSignalSemaphores[i], (uint64_t)0);
-            if (pTimelineSubmit) { ss.second = pTimelineSubmit->pSignalSemaphoreValues[i]; }
-            _signalSemaphores.push_back(ss);
-        }
+		for (uint32_t i = 0; i < ssCnt; i++) {
+			auto* sem4 = (MVKSemaphore*)pSubmit->pSignalSemaphores[i];
+			sem4->retain();
+			uint64_t sem4Val = pTimelineSubmit ? pTimelineSubmit->pSignalSemaphoreValues[i] : 0;
+			_signalSemaphores.emplace_back(sem4, sem4Val);
+		}
     }
 
 	_fence = (MVKFence*)fence;
+	if (_fence) { _fence->retain(); }
+
 	_activeMTLCommandBuffer = nil;
 
 //	static std::atomic<uint32_t> _subCount;
 //	MVKLogDebug("Creating submission %p. Submission count %u.", this, ++_subCount);
 }
 
+MVKQueueCommandBufferSubmission::~MVKQueueCommandBufferSubmission() {
+	if (_fence) { _fence->release(); }
+	for (auto s : _signalSemaphores) { s.first->release(); }
+}
+
 
 #pragma mark -
 #pragma mark MVKQueuePresentSurfaceSubmission