Changes to MVKConfiguration::resumeLostDevice behavior.
On command buffer submission failure, if MVKConfiguration::resumeLostDevice enabled,
do not release waits on VkDevice, and do not return VK_ERROR_DEVICE_LOST, unless
VkPhysicalDevice is also lost.
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index 41781fc..5dfa180 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -27,6 +27,9 @@
- Rename `kMVKShaderStageMax` to `kMVKShaderStageCount`.
- Fix crash when requesting `MTLCommandBuffer` logs in runtime debug mode on older OS versions.
- Fix synchronization issue with locking `MTLArgumentEncoder` for Metal Argument Buffers.
+- Fix race condition on submission fence during device loss.
+- On command buffer submission failure, if `MVKConfiguration::resumeLostDevice` enabled, do not release
+ waits on `VkDevice`, and do not return `VK_ERROR_DEVICE_LOST`, unless `VkPhysicalDevice` is also lost.
- Fix inconsistent handling of linear attachment decisions on Apple Silicon.
- Protect against crash when retrieving `MTLTexture` when `VkImage` has no `VkDeviceMemory` bound.
- Adjust some `VkPhysicalDeviceLimits` values for Vulkan and Metal compliance.
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index a78e93c..7a08836 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -780,24 +780,24 @@
* Controls whether MoltenVK should treat a lost VkDevice as resumable, unless the
* corresponding VkPhysicalDevice has also been lost. The VK_ERROR_DEVICE_LOST error has
* a broad definitional range, and can mean anything from a GPU hiccup on the current
- * command buffer submission, to a phyically removed GPU. In the case where this error does
+ * command buffer submission, to a phyisically removed GPU. In the case where this error does
* not impact the VkPhysicalDevice, Vulkan requires that the app destroy and re-create a new
* VkDevice. However, not all apps (including CTS) respect that requirement, leading to what
* might be a transient command submission failure causing an unexpected catastophic app failure.
- * If this setting is enabled, in the case of a VK_ERROR_DEVICE_LOST error that does not
- * impact the VkPhysicalDevice, MoltenVK will remove the error condition on the VkDevice after
- * the current queue submission is finished, allowing the VkDevice to continue to be used.
- * If this setting is disabled, MoltenVK will maintain the VK_ERROR_DEVICE_LOST error condition
- * on the VkDevice, and subsequent use of that VkDevice will be reduced or prohibited.
*
- * The value of this parameter should be changed before creating a VkDevice
- * that will use it, for the change to take effect.
+ * If this setting is enabled, in the case of a VK_ERROR_DEVICE_LOST error that does NOT impact
+ * the VkPhysicalDevice, MoltenVK will log the error, but will not mark the VkDevice as lost,
+ * allowing the VkDevice to continue to be used. If this setting is disabled, MoltenVK will
+ * mark the VkDevice as lost, and subsequent use of that VkDevice will be reduced or prohibited.
+ *
+ * The value of this parameter may be changed at any time during application runtime,
+ * and the changed value will affect the error behavior of subsequent command submissions.
*
* The initial value or this parameter is set by the
* MVK_CONFIG_RESUME_LOST_DEVICE
* runtime environment variable or MoltenVK compile-time build setting.
- * If neither is set, this setting is disabled by default, and MoltenVK will not
- * resume a VkDevice that enters the VK_ERROR_DEVICE_LOST error state.
+ * If neither is set, this setting is disabled by default, and MoltenVK
+ * will mark the VkDevice as lost when a command submission failure occurs.
*/
VkBool32 resumeLostDevice;
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
index d2afd9e..b1bd357 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
@@ -447,8 +447,8 @@
/** Block the current thread until all queues in this device are idle. */
VkResult waitIdle();
- /** Mark this device as lost. Releases all waits for this device. */
- VkResult markLost();
+ /** Mark this device (and optionally the physical device) as lost. Releases all waits for this device. */
+ VkResult markLost(bool alsoMarkPhysicalDevice = false);
/** Returns whether or not the given descriptor set layout is supported. */
void getDescriptorSetLayoutSupport(const VkDescriptorSetLayoutCreateInfo* pCreateInfo,
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index b46eea6..e70de7e 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -2861,9 +2861,12 @@
return VK_SUCCESS;
}
-VkResult MVKDevice::markLost() {
+VkResult MVKDevice::markLost(bool alsoMarkPhysicalDevice) {
lock_guard<mutex> lock(_sem4Lock);
+
setConfigurationResult(VK_ERROR_DEVICE_LOST);
+ if (alsoMarkPhysicalDevice) { _physicalDevice->setConfigurationResult(VK_ERROR_DEVICE_LOST); }
+
for (auto* sem4 : _awaitingSemaphores) {
sem4->release();
}
@@ -2874,7 +2877,8 @@
}
_awaitingSemaphores.clear();
_awaitingTimelineSem4s.clear();
- return VK_ERROR_DEVICE_LOST;
+
+ return getConfigurationResult();
}
void MVKDevice::getDescriptorSetLayoutSupport(const VkDescriptorSetLayoutCreateInfo* pCreateInfo,
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
index 7464ae1..f9556af 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
@@ -330,19 +330,21 @@
uint64_t startTime = mvkDev->getPerformanceTimestamp();
[mtlCmdBuff addCompletedHandler: ^(id<MTLCommandBuffer> mtlCB) {
if (mtlCB.status == MTLCommandBufferStatusError) {
- getVulkanAPIObject()->reportError(mvkDev->markLost(), "Command buffer %p \"%s\" execution failed (code %li): %s", mtlCB, mtlCB.label ? mtlCB.label.UTF8String : "", mtlCB.error.code, mtlCB.error.localizedDescription.UTF8String);
- // Some errors indicate we lost the physical device as well.
+ // If a command buffer error has occurred, report the error. If the error affects
+ // the physical device, always mark both the device and physical device as lost.
+ // If the error is local to this command buffer, optionally mark the device (but not the
+ // physical device) as lost, depending on the value of MVKConfiguration::resumeLostDevice.
+ getVulkanAPIObject()->reportError(VK_ERROR_DEVICE_LOST, "Command buffer %p \"%s\" execution failed (code %li): %s", mtlCB, mtlCB.label ? mtlCB.label.UTF8String : "", mtlCB.error.code, mtlCB.error.localizedDescription.UTF8String);
switch (mtlCB.error.code) {
case MTLCommandBufferErrorBlacklisted:
- // XXX This may also be used for command buffers executed in the background without the right entitlement.
- case MTLCommandBufferErrorNotPermitted:
+ case MTLCommandBufferErrorNotPermitted: // May also be used for command buffers executed in the background without the right entitlement.
#if MVK_MACOS && !MVK_MACCAT
case MTLCommandBufferErrorDeviceRemoved:
#endif
- mvkDev->getPhysicalDevice()->setConfigurationResult(VK_ERROR_DEVICE_LOST);
+ mvkDev->markLost(true);
break;
default:
- if (mvkConfig().resumeLostDevice) { mvkDev->clearConfigurationResult(); }
+ if ( !mvkConfig().resumeLostDevice ) { mvkDev->markLost(); }
break;
}
#if MVK_XCODE_12