Merge pull request #1693 from tellowkrinkle/SingleQueueSemaphore

Replace MTLFence semaphores with forcing a single queue
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index 12637c2..11aed6b 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -566,36 +566,34 @@
 	VkBool32 forceLowPowerGPU;
 
 	/**
-	 * Use MTLFence, if it is available on the device, for VkSemaphore synchronization behaviour.
+	 * Use Metal's implicit guarantees that all operations submitted to a queue will give the same result as
+	 * if they had been run in submission order to implement VkSemaphore synchronization as no-ops.
 	 *
-	 * This parameter interacts with semaphoreUseMTLEvent. If both are enabled, on GPUs other than
-	 * NVIDIA, semaphoreUseMTLEvent takes priority and MTLEvent will be used if it is available,
-	 * otherwise MTLFence will be used if it is available. On NVIDIA GPUs, MTLEvent is disabled
-	 * for VkSemaphores, so CPU-based synchronization will be used unless semaphoreUseMTLFence
-	 * is enabled and MTLFence is available.
+	 * This requires all submissions be made to the same queue, and to guarantee that, MoltenVK will expose
+	 * only one queue to the application.
 	 *
 	 * In the special case of VK_SEMAPHORE_TYPE_TIMELINE semaphores, MoltenVK will always
 	 * use MTLSharedEvent if it is available on the platform, regardless of the values of
 	 * semaphoreUseMTLEvent or semaphoreUseMTLFence.
 	 *
-	 * The value of this parameter must be changed before creating a VkDevice,
-	 * for the change to take effect.
+	 * The value of this parameter must be changed before creating a VkDevice for the change to take effect.
 	 *
 	 * The initial value or this parameter is set by the
-	 * MVK_ALLOW_METAL_FENCES
+	 * MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE
 	 * runtime environment variable or MoltenVK compile-time build setting.
-	 * If neither is set, this setting is disabled by default, and VkSemaphore will not use MTLFence.
+	 * If neither is set, this setting is enabled by default, and VkSemaphore will force a single queue
+	 * on NVIDIA GPUs, under Rosetta2, and whenever MTLEvent is unavailable or MVK_ALLOW_METAL_EVENTS is disabled.
 	 */
-	VkBool32 semaphoreUseMTLFence;
+	VkBool32 semaphoreUseSingleQueue;
 
 	/**
 	 * Use MTLEvent, if it is available on the device, for VkSemaphore synchronization behaviour.
 	 *
-	 * This parameter interacts with semaphoreUseMTLFence. If both are enabled, on GPUs other than
+	 * This parameter interacts with semaphoreUseSingleQueue. If both are enabled, on GPUs other than
 	 * NVIDIA, semaphoreUseMTLEvent takes priority and MTLEvent will be used if it is available,
 	 * otherwise MTLFence will be used if it is available. On NVIDIA GPUs, MTLEvent is disabled
-	 * for VkSemaphores, so CPU-based synchronization will be used unless semaphoreUseMTLFence
-	 * is enabled and MTLFence is available.
+	 * for VkSemaphores, so CPU-based synchronization will be used unless semaphoreUseSingleQueue
+	 * is enabled.
 	 *
 	 * In the special case of VK_SEMAPHORE_TYPE_TIMELINE semaphores, MoltenVK will always
 	 * use MTLSharedEvent if it is available on the platform, regardless of the values of
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
index e7fda7e..3ce2caa 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
@@ -90,6 +90,12 @@
 #pragma mark -
 #pragma mark MVKPhysicalDevice
 
+typedef enum {
+	MVKSemaphoreStyleUseMTLEvent,
+	MVKSemaphoreStyleUseEmulation,
+	MVKSemaphoreStyleSingleQueue,
+} MVKSemaphoreStyle;
+
 /** VkPhysicalDeviceVulkan12Features entries that did not originate in a prior extension. */
 typedef struct MVKPhysicalDeviceVulkan12FeaturesNoExt {
 	VkBool32 samplerMirrorClampToEdge;
@@ -407,6 +413,7 @@
 	void initExtensions();
 	void initCounterSets();
 	bool needsCounterSetRetained();
+	MVKSemaphoreStyle getSemaphoreStyle();
 	MVKArrayRef<MVKQueueFamily*> getQueueFamilies();
 	void initPipelineCacheUUID();
 	uint32_t getHighestGPUCapability();
@@ -445,12 +452,6 @@
 	id<MTLCommandBuffer> mtlCmdBuffer = nil;
 } MVKMTLBlitEncoder;
 
-typedef enum {
-	MVKSemaphoreStyleUseMTLEvent,
-	MVKSemaphoreStyleUseMTLFence,
-	MVKSemaphoreStyleUseEmulation
-} MVKSemaphoreStyle;
-
 /** Represents a Vulkan logical GPU device, associated with a physical device. */
 class MVKDevice : public MVKDispatchableVulkanAPIObject {
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index b2d2be8..dc13832 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -1309,17 +1309,22 @@
 		qfProps.queueFlags = (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT);
 		_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
 
-		// Dedicated graphics queue family...or another general-purpose queue family.
-		if (specialize) { qfProps.queueFlags = (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_TRANSFER_BIT); }
-		_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
+		// Single queue semaphore requires using a single queue for everything,
+		// so don't allow anyone to have more than one.
+		if (getSemaphoreStyle() != MVKSemaphoreStyleSingleQueue)
+		{
+			// Dedicated graphics queue family...or another general-purpose queue family.
+			if (specialize) { qfProps.queueFlags = (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_TRANSFER_BIT); }
+			_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
 
-		// Dedicated compute queue family...or another general-purpose queue family.
-		if (specialize) { qfProps.queueFlags = (VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT); }
-		_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
+			// Dedicated compute queue family...or another general-purpose queue family.
+			if (specialize) { qfProps.queueFlags = (VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT); }
+			_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
 
-		// Dedicated transfer queue family...or another general-purpose queue family.
-		if (specialize) { qfProps.queueFlags = VK_QUEUE_TRANSFER_BIT; }
-		_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
+			// Dedicated transfer queue family...or another general-purpose queue family.
+			if (specialize) { qfProps.queueFlags = VK_QUEUE_TRANSFER_BIT; }
+			_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
+		}
 
 		MVKAssert(kMVKQueueFamilyCount >= _queueFamilies.size(), "Adjust value of kMVKQueueFamilyCount.");
 	}
@@ -3110,6 +3115,23 @@
 	}
 }
 
+MVKSemaphoreStyle MVKPhysicalDevice::getSemaphoreStyle() {
+	// Decide whether Vulkan semaphores should use MTLEvent, a forced single queue, or CPU emulation.
+	// Prefer MTLEvent, because MTLEvent handles sync across MTLCommandBuffers and MTLCommandQueues.
+	// However, do not allow use of MTLEvents on Rosetta2 (x86 build on M1 runtime) or NVIDIA GPUs,
+	// which have demonstrated trouble with MTLEvents. In that case, a single queue will be used
+	// unless the option for it has been disabled, in which case CPU emulation will be used.
+	bool isNVIDIA = _properties.vendorID == kNVVendorId;
+	bool isRosetta2 = _properties.vendorID == kAppleVendorId && !MVK_APPLE_SILICON;
+	if (_metalFeatures.events && mvkConfig().semaphoreUseMTLEvent && !(isRosetta2 || isNVIDIA)) {
+		return MVKSemaphoreStyleUseMTLEvent;
+	}
+	if (mvkConfig().semaphoreUseSingleQueue) {
+		return MVKSemaphoreStyleSingleQueue;
+	}
+	return MVKSemaphoreStyleUseEmulation;
+}
+
 // Workaround for a bug in Intel Iris Plus Graphics driver where the counterSets array is
 // not properly retained internally, and becomes a zombie when counterSets is called more
 // than once, which occurs when an app creates more than one VkInstance. This workaround
@@ -3644,8 +3666,8 @@
 	} else {
 		switch (_vkSemaphoreStyle) {
 			case MVKSemaphoreStyleUseMTLEvent:  return new MVKSemaphoreMTLEvent(this, pCreateInfo, pExportInfo, pImportInfo);
-			case MVKSemaphoreStyleUseMTLFence:  return new MVKSemaphoreMTLFence(this, pCreateInfo, pExportInfo, pImportInfo);
 			case MVKSemaphoreStyleUseEmulation: return new MVKSemaphoreEmulated(this, pCreateInfo, pExportInfo, pImportInfo);
+			case MVKSemaphoreStyleSingleQueue:  return new MVKSemaphoreSingleQueue(this, pCreateInfo, pExportInfo, pImportInfo);
 		}
 	}
 }
@@ -4438,26 +4460,17 @@
 	_pProperties = &_physicalDevice->_properties;
 	_pMemoryProperties = &_physicalDevice->_memoryProperties;
 
-	// Decide whether Vulkan semaphores should use a MTLEvent or MTLFence if they are available.
-	// Prefer MTLEvent, because MTLEvent handles sync across MTLCommandBuffers and MTLCommandQueues.
-	// However, do not allow use of MTLEvents on Rosetta2 (x86 build on M1 runtime) or NVIDIA GPUs,
-	// which have demonstrated trouble with MTLEvents. In that case, since MTLFence use is disabled
-	// by default, unless MTLFence is deliberately enabled, CPU emulation will be used.
-	bool isNVIDIA = _pProperties->vendorID == kNVVendorId;
-	bool isRosetta2 = _pProperties->vendorID == kAppleVendorId && !MVK_APPLE_SILICON;
-	bool canUseMTLEventForSem4 = _pMetalFeatures->events && mvkConfig().semaphoreUseMTLEvent && !(isRosetta2 || isNVIDIA);
-	bool canUseMTLFenceForSem4 = _pMetalFeatures->fences && mvkConfig().semaphoreUseMTLFence;
-	_vkSemaphoreStyle = canUseMTLEventForSem4 ? MVKSemaphoreStyleUseMTLEvent : (canUseMTLFenceForSem4 ? MVKSemaphoreStyleUseMTLFence : MVKSemaphoreStyleUseEmulation);
+	_vkSemaphoreStyle = _physicalDevice->getSemaphoreStyle();
 	switch (_vkSemaphoreStyle) {
 		case MVKSemaphoreStyleUseMTLEvent:
 			MVKLogInfo("Using MTLEvent for Vulkan semaphores.");
 			break;
-		case MVKSemaphoreStyleUseMTLFence:
-			MVKLogInfo("Using MTLFence for Vulkan semaphores.");
-			break;
 		case MVKSemaphoreStyleUseEmulation:
 			MVKLogInfo("Using emulation for Vulkan semaphores.");
 			break;
+		case MVKSemaphoreStyleSingleQueue:
+			MVKLogInfo("Using Metal implicit guarantees within a single queue for Vulkan semaphores.");
+			break;
 	}
 }
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKSync.h b/MoltenVK/MoltenVK/GPUObjects/MVKSync.h
index 3562369..0a03499 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKSync.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKSync.h
@@ -201,27 +201,29 @@
 
 
 #pragma mark -
-#pragma mark MVKSemaphoreMTLFence
+#pragma mark MVKSemaphoreSingleQueue
 
-/** An MVKSemaphore that uses MTLFence to provide synchronization. */
-class MVKSemaphoreMTLFence : public MVKSemaphore {
+/**
+ * An MVKSemaphore that uses Metal's built-in guarantees on single-queue submission to provide semaphore-like guarantees.
+ *
+ * Relies on Metal's enabled-by-default hazard tracking. If resources are ever created with
+ * MTLHazardTrackingModeUntracked, explicit synchronization (e.g. MTLFence) will be needed.
+ */
+class MVKSemaphoreSingleQueue : public MVKSemaphore {
 
 public:
 	void encodeWait(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) override;
 	void encodeSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) override;
 	uint64_t deferSignal() override;
 	void encodeDeferredSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) override;
-	bool isUsingCommandEncoding() override { return true; }
+	bool isUsingCommandEncoding() override { return false; }
 
-	MVKSemaphoreMTLFence(MVKDevice* device,
-						 const VkSemaphoreCreateInfo* pCreateInfo,
-						 const VkExportMetalObjectCreateInfoEXT* pExportInfo,
-						 const VkImportMetalSharedEventInfoEXT* pImportInfo);
+	MVKSemaphoreSingleQueue(MVKDevice* device,
+	                        const VkSemaphoreCreateInfo* pCreateInfo,
+	                        const VkExportMetalObjectCreateInfoEXT* pExportInfo,
+	                        const VkImportMetalSharedEventInfoEXT* pImportInfo);
 
-	~MVKSemaphoreMTLFence() override;
-
-protected:
-	id<MTLFence> _mtlFence;
+	~MVKSemaphoreSingleQueue() override;
 };
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKSync.mm b/MoltenVK/MoltenVK/GPUObjects/MVKSync.mm
index 29d36d3..318e86a 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKSync.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKSync.mm
@@ -77,47 +77,34 @@
 
 
 #pragma mark -
-#pragma mark MVKSemaphoreMTLFence
+#pragma mark MVKSemaphoreSingleQueue
 
-// Could use any encoder. Assume BLIT is fastest and lightest.
-// Nil mtlCmdBuff will do nothing.
-void MVKSemaphoreMTLFence::encodeWait(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
-	id<MTLBlitCommandEncoder> mtlCmdEnc = mtlCmdBuff.blitCommandEncoder;
-	[mtlCmdEnc waitForFence: _mtlFence];
-	[mtlCmdEnc endEncoding];
+void MVKSemaphoreSingleQueue::encodeWait(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
+	// Metal will handle all synchronization for us automatically
 }
 
-// Could use any encoder. Assume BLIT is fastest and lightest.
-// Nil mtlCmdBuff will do nothing.
-void MVKSemaphoreMTLFence::encodeSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
-	id<MTLBlitCommandEncoder> mtlCmdEnc = mtlCmdBuff.blitCommandEncoder;
-	[mtlCmdEnc updateFence: _mtlFence];
-	[mtlCmdEnc endEncoding];
+void MVKSemaphoreSingleQueue::encodeSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
+	// Metal will handle all synchronization for us automatically
 }
 
-uint64_t MVKSemaphoreMTLFence::deferSignal() {
+uint64_t MVKSemaphoreSingleQueue::deferSignal() {
 	return 0;
 }
 
-void MVKSemaphoreMTLFence::encodeDeferredSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
+void MVKSemaphoreSingleQueue::encodeDeferredSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
 	encodeSignal(mtlCmdBuff, 0);
 }
 
-MVKSemaphoreMTLFence::MVKSemaphoreMTLFence(MVKDevice* device,
-										   const VkSemaphoreCreateInfo* pCreateInfo,
-										   const VkExportMetalObjectCreateInfoEXT* pExportInfo,
-										   const VkImportMetalSharedEventInfoEXT* pImportInfo) : MVKSemaphore(device, pCreateInfo) {
-
-	_mtlFence = [device->getMTLDevice() newFence];		//retained
-
+MVKSemaphoreSingleQueue::MVKSemaphoreSingleQueue(MVKDevice* device,
+                                                 const VkSemaphoreCreateInfo* pCreateInfo,
+                                                 const VkExportMetalObjectCreateInfoEXT* pExportInfo,
+                                                 const VkImportMetalSharedEventInfoEXT* pImportInfo) : MVKSemaphore(device, pCreateInfo) {
 	if ((pImportInfo && pImportInfo->mtlSharedEvent) || (pExportInfo && pExportInfo->exportObjectType == VK_EXPORT_METAL_OBJECT_TYPE_METAL_SHARED_EVENT_BIT_EXT)) {
-		setConfigurationResult(reportError(VK_ERROR_INITIALIZATION_FAILED, "vkCreateEvent(): MTLSharedEvent is not available with VkSemaphores that use MTLFence."));
+		setConfigurationResult(reportError(VK_ERROR_INITIALIZATION_FAILED, "vkCreateEvent(): MTLSharedEvent is not available with VkSemaphores that use implicit synchronization."));
 	}
 }
 
-MVKSemaphoreMTLFence::~MVKSemaphoreMTLFence() {
-	[_mtlFence release];
-}
+MVKSemaphoreSingleQueue::~MVKSemaphoreSingleQueue() = default;
 
 
 #pragma mark -
diff --git a/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp b/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
index 97cafd0..56876d3 100644
--- a/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
+++ b/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
@@ -50,7 +50,7 @@
 	MVK_SET_FROM_ENV_OR_BUILD_INT32 (evCfg.logLevel,                               MVK_CONFIG_LOG_LEVEL);
 	MVK_SET_FROM_ENV_OR_BUILD_INT32 (evCfg.traceVulkanCalls,                       MVK_CONFIG_TRACE_VULKAN_CALLS);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.forceLowPowerGPU,                       MVK_CONFIG_FORCE_LOW_POWER_GPU);
-	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.semaphoreUseMTLFence,                   MVK_ALLOW_METAL_FENCES);
+	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.semaphoreUseSingleQueue,                MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.semaphoreUseMTLEvent,                   MVK_ALLOW_METAL_EVENTS);
 	MVK_SET_FROM_ENV_OR_BUILD_INT32 (evCfg.autoGPUCaptureScope,                    MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE);
 	MVK_SET_FROM_ENV_OR_BUILD_STRING(evCfg.autoGPUCaptureOutputFilepath,           MVK_CONFIG_AUTO_GPU_CAPTURE_OUTPUT_FILE, evGPUCapFileStrObj);
diff --git a/MoltenVK/MoltenVK/Utility/MVKEnvironment.h b/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
index df8edf2..840381d 100644
--- a/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
+++ b/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
@@ -231,16 +231,16 @@
 #endif
 
 /**
- * Allow the use of MTLFence or MTLEvent for VkSemaphore synchronization behaviour.
+ * Allow the use of a single queue or MTLEvent for VkSemaphore synchronization behaviour.
  * By default:
  *   - MVK_ALLOW_METAL_EVENTS is enabled
- *   - MVK_ALLOW_METAL_FENCES is disabled
+ *   - MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE is enabled
  * */
 #ifndef MVK_ALLOW_METAL_EVENTS
 #   define MVK_ALLOW_METAL_EVENTS    1
 #endif
-#ifndef MVK_ALLOW_METAL_FENCES
-#   define MVK_ALLOW_METAL_FENCES    0
+#ifndef MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE
+#   define MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE    1
 #endif
 
 /** Substitute Metal 2D textures for Vulkan 1D images. Enabled by default. */