Add single-queue handling of VkSemaphore
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index bd316f8..4958a51 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -558,13 +558,34 @@
 	VkBool32 forceLowPowerGPU;
 
 	/**
+	 * Use Metal's implicit guarantees that all operations submitted to a queue will give the same result as
+	 * if they had been run in submission order to implement VkSemaphore synchronization as no-ops.
+	 *
+	 * This requires all submissions be made to the same queue, and to guarantee that, MoltenVK will expose
+	 * only one queue to the application.
+	 *
+	 * In the special case of VK_SEMAPHORE_TYPE_TIMELINE semaphores, MoltenVK will always
+	 * use MTLSharedEvent if it is available on the platform, regardless of the values of
+	 * semaphoreUseMTLEvent or semaphoreUseMTLFence.
+	 *
+	 * The value of this parameter must be changed before creating a VkDevice for the change to take effect.
+	 *
+	 * The initial value or this parameter is set by the
+	 * MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE
+	 * runtime environment variable or MoltenVK compile-time build setting.
+	 * If neither is set, this setting is enabled by default, and VkSemaphore will force a single queue
+	 * on NVIDIA GPUs and whenever MVK_ALLOW_METAL_EVENTS is not also set.
+	 */
+	VkBool32 semaphoreUseSingleQueue;
+
+	/**
 	 * Use MTLEvent, if it is available on the device, for VkSemaphore synchronization behaviour.
 	 *
-	 * This parameter interacts with semaphoreUseMTLFence. If both are enabled, on GPUs other than
+	 * This parameter interacts with semaphoreUseSingleQueue. If both are enabled, on GPUs other than
 	 * NVIDIA, semaphoreUseMTLEvent takes priority and MTLEvent will be used if it is available,
 	 * otherwise MTLFence will be used if it is available. On NVIDIA GPUs, MTLEvent is disabled
-	 * for VkSemaphores, so CPU-based synchronization will be used unless semaphoreUseMTLFence
-	 * is enabled and MTLFence is available.
+	 * for VkSemaphores, so CPU-based synchronization will be used unless semaphoreUseSingleQueue
+	 * is enabled.
 	 *
 	 * In the special case of VK_SEMAPHORE_TYPE_TIMELINE semaphores, MoltenVK will always
 	 * use MTLSharedEvent if it is available on the platform, regardless of the values of
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
index d5ffd76..40b6383 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
@@ -90,6 +90,12 @@
 #pragma mark -
 #pragma mark MVKPhysicalDevice
 
+typedef enum {
+	MVKSemaphoreStyleUseMTLEvent,
+	MVKSemaphoreStyleUseEmulation,
+	MVKSemaphoreStyleSingleQueue,
+} MVKSemaphoreStyle;
+
 /** Represents a Vulkan physical GPU device. */
 class MVKPhysicalDevice : public MVKDispatchableVulkanAPIObject {
 
@@ -394,6 +400,7 @@
 	void initExtensions();
 	void initCounterSets();
 	bool needsCounterSetRetained();
+	MVKSemaphoreStyle getSemaphoreStyle();
 	MVKArrayRef<MVKQueueFamily*> getQueueFamilies();
 	void initPipelineCacheUUID();
 	uint32_t getHighestGPUCapability();
@@ -431,11 +438,6 @@
 	id<MTLCommandBuffer> mtlCmdBuffer = nil;
 } MVKMTLBlitEncoder;
 
-typedef enum {
-	MVKSemaphoreStyleUseMTLEvent,
-	MVKSemaphoreStyleUseEmulation
-} MVKSemaphoreStyle;
-
 /** Represents a Vulkan logical GPU device, associated with a physical device. */
 class MVKDevice : public MVKDispatchableVulkanAPIObject {
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index 50e9dc9..46b1848 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -1290,17 +1290,22 @@
 		qfProps.queueFlags = (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT);
 		_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
 
-		// Dedicated graphics queue family...or another general-purpose queue family.
-		if (specialize) { qfProps.queueFlags = (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_TRANSFER_BIT); }
-		_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
+		// Single queue semaphore requires using a single queue for everything
+		// So don't allow anyone to have more than one
+		if (getSemaphoreStyle() != MVKSemaphoreStyleSingleQueue)
+		{
+			// Dedicated graphics queue family...or another general-purpose queue family.
+			if (specialize) { qfProps.queueFlags = (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_TRANSFER_BIT); }
+			_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
 
-		// Dedicated compute queue family...or another general-purpose queue family.
-		if (specialize) { qfProps.queueFlags = (VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT); }
-		_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
+			// Dedicated compute queue family...or another general-purpose queue family.
+			if (specialize) { qfProps.queueFlags = (VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT); }
+			_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
 
-		// Dedicated transfer queue family...or another general-purpose queue family.
-		if (specialize) { qfProps.queueFlags = VK_QUEUE_TRANSFER_BIT; }
-		_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
+			// Dedicated transfer queue family...or another general-purpose queue family.
+			if (specialize) { qfProps.queueFlags = VK_QUEUE_TRANSFER_BIT; }
+			_queueFamilies.push_back(new MVKQueueFamily(this, qfIdx++, &qfProps));
+		}
 
 		MVKAssert(kMVKQueueFamilyCount >= _queueFamilies.size(), "Adjust value of kMVKQueueFamilyCount.");
 	}
@@ -3082,6 +3087,23 @@
 	}
 }
 
+MVKSemaphoreStyle MVKPhysicalDevice::getSemaphoreStyle() {
+	// Decide whether Vulkan semaphores should use a MTLEvent or forcing a single queue if they are available.
+	// Prefer MTLEvent, because MTLEvent handles sync across MTLCommandBuffers and MTLCommandQueues.
+	// However, do not allow use of MTLEvents on Rosetta2 (x86 build on M1 runtime) or NVIDIA GPUs,
+	// which have demonstrated trouble with MTLEvents. In that case, since a single queue will be used
+	// unless the option for it it has been disabled, in which case CPU emulation will be used
+	bool isNVIDIA = _properties.vendorID == kNVVendorId;
+	bool isRosetta2 = _properties.vendorID == kAppleVendorId && !MVK_APPLE_SILICON;
+	if (_metalFeatures.events && mvkConfig().semaphoreUseMTLEvent && !(isRosetta2 || isNVIDIA)) {
+		return MVKSemaphoreStyleUseMTLEvent;
+	}
+	if (mvkConfig().semaphoreUseSingleQueue) {
+		return MVKSemaphoreStyleSingleQueue;
+	}
+	return MVKSemaphoreStyleUseEmulation;
+}
+
 // Workaround for a bug in Intel Iris Plus Graphics driver where the counterSets array is
 // not properly retained internally, and becomes a zombie when counterSets is called more
 // than once, which occurs when an app creates more than one VkInstance. This workaround
@@ -3617,6 +3639,7 @@
 		switch (_vkSemaphoreStyle) {
 			case MVKSemaphoreStyleUseMTLEvent:  return new MVKSemaphoreMTLEvent(this, pCreateInfo, pExportInfo, pImportInfo);
 			case MVKSemaphoreStyleUseEmulation: return new MVKSemaphoreEmulated(this, pCreateInfo, pExportInfo, pImportInfo);
+			case MVKSemaphoreStyleSingleQueue:  return new MVKSemaphoreSingleQueue(this, pCreateInfo, pExportInfo, pImportInfo);
 		}
 	}
 }
@@ -4406,15 +4429,7 @@
 	_pProperties = &_physicalDevice->_properties;
 	_pMemoryProperties = &_physicalDevice->_memoryProperties;
 
-	// Decide whether Vulkan semaphores should use a MTLEvent or MTLFence if they are available.
-	// Prefer MTLEvent, because MTLEvent handles sync across MTLCommandBuffers and MTLCommandQueues.
-	// However, do not allow use of MTLEvents on Rosetta2 (x86 build on M1 runtime) or NVIDIA GPUs,
-	// which have demonstrated trouble with MTLEvents. In that case, since MTLFence use is disabled
-	// by default, unless MTLFence is deliberately enabled, CPU emulation will be used.
-	bool isNVIDIA = _pProperties->vendorID == kNVVendorId;
-	bool isRosetta2 = _pProperties->vendorID == kAppleVendorId && !MVK_APPLE_SILICON;
-	bool canUseMTLEventForSem4 = _pMetalFeatures->events && mvkConfig().semaphoreUseMTLEvent && !(isRosetta2 || isNVIDIA);
-	_vkSemaphoreStyle = canUseMTLEventForSem4 ? MVKSemaphoreStyleUseMTLEvent : MVKSemaphoreStyleUseEmulation;
+	_vkSemaphoreStyle = _physicalDevice->getSemaphoreStyle();
 	switch (_vkSemaphoreStyle) {
 		case MVKSemaphoreStyleUseMTLEvent:
 			MVKLogInfo("Using MTLEvent for Vulkan semaphores.");
@@ -4422,6 +4437,9 @@
 		case MVKSemaphoreStyleUseEmulation:
 			MVKLogInfo("Using emulation for Vulkan semaphores.");
 			break;
+		case MVKSemaphoreStyleSingleQueue:
+			MVKLogInfo("Using Metal implicit guarantees within a single queue for Vulkan semaphores.");
+			break;
 	}
 }
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKSync.h b/MoltenVK/MoltenVK/GPUObjects/MVKSync.h
index 84b61ff..0a03499 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKSync.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKSync.h
@@ -201,6 +201,33 @@
 
 
 #pragma mark -
+#pragma mark MVKSemaphoreSingleQueue
+
+/**
+ * An MVKSemaphore that uses Metal's built-in guarantees on single-queue submission to provide semaphore-like guarantees.
+ *
+ * Relies on Metal's enabled-by-default hazard tracking, and will need to start doing things with MTLFences
+ * if we start using things with MTLHazardTrackingModeUntracked
+ */
+class MVKSemaphoreSingleQueue : public MVKSemaphore {
+
+public:
+	void encodeWait(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) override;
+	void encodeSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) override;
+	uint64_t deferSignal() override;
+	void encodeDeferredSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) override;
+	bool isUsingCommandEncoding() override { return false; }
+
+	MVKSemaphoreSingleQueue(MVKDevice* device,
+	                        const VkSemaphoreCreateInfo* pCreateInfo,
+	                        const VkExportMetalObjectCreateInfoEXT* pExportInfo,
+	                        const VkImportMetalSharedEventInfoEXT* pImportInfo);
+
+	~MVKSemaphoreSingleQueue() override;
+};
+
+
+#pragma mark -
 #pragma mark MVKSemaphoreMTLEvent
 
 /** An MVKSemaphore that uses MTLEvent to provide synchronization. */
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKSync.mm b/MoltenVK/MoltenVK/GPUObjects/MVKSync.mm
index d5f2a4e..318e86a 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKSync.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKSync.mm
@@ -77,6 +77,37 @@
 
 
 #pragma mark -
+#pragma mark MVKSemaphoreSingleQueue
+
+void MVKSemaphoreSingleQueue::encodeWait(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
+	// Metal will handle all synchronization for us automatically
+}
+
+void MVKSemaphoreSingleQueue::encodeSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
+	// Metal will handle all synchronization for us automatically
+}
+
+uint64_t MVKSemaphoreSingleQueue::deferSignal() {
+	return 0;
+}
+
+void MVKSemaphoreSingleQueue::encodeDeferredSignal(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
+	encodeSignal(mtlCmdBuff, 0);
+}
+
+MVKSemaphoreSingleQueue::MVKSemaphoreSingleQueue(MVKDevice* device,
+                                                 const VkSemaphoreCreateInfo* pCreateInfo,
+                                                 const VkExportMetalObjectCreateInfoEXT* pExportInfo,
+                                                 const VkImportMetalSharedEventInfoEXT* pImportInfo) : MVKSemaphore(device, pCreateInfo) {
+	if ((pImportInfo && pImportInfo->mtlSharedEvent) || (pExportInfo && pExportInfo->exportObjectType == VK_EXPORT_METAL_OBJECT_TYPE_METAL_SHARED_EVENT_BIT_EXT)) {
+		setConfigurationResult(reportError(VK_ERROR_INITIALIZATION_FAILED, "vkCreateEvent(): MTLSharedEvent is not available with VkSemaphores that use implicit synchronization."));
+	}
+}
+
+MVKSemaphoreSingleQueue::~MVKSemaphoreSingleQueue() = default;
+
+
+#pragma mark -
 #pragma mark MVKSemaphoreMTLEvent
 
 void MVKSemaphoreMTLEvent::encodeWait(id<MTLCommandBuffer> mtlCmdBuff, uint64_t) {
diff --git a/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp b/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
index 1f9a1b9..3c3cedc 100644
--- a/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
+++ b/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
@@ -50,6 +50,7 @@
 	MVK_SET_FROM_ENV_OR_BUILD_INT32 (evCfg.logLevel,                               MVK_CONFIG_LOG_LEVEL);
 	MVK_SET_FROM_ENV_OR_BUILD_INT32 (evCfg.traceVulkanCalls,                       MVK_CONFIG_TRACE_VULKAN_CALLS);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.forceLowPowerGPU,                       MVK_CONFIG_FORCE_LOW_POWER_GPU);
+	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.semaphoreUseSingleQueue,                MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.semaphoreUseMTLEvent,                   MVK_ALLOW_METAL_EVENTS);
 	MVK_SET_FROM_ENV_OR_BUILD_INT32 (evCfg.autoGPUCaptureScope,                    MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE);
 	MVK_SET_FROM_ENV_OR_BUILD_STRING(evCfg.autoGPUCaptureOutputFilepath,           MVK_CONFIG_AUTO_GPU_CAPTURE_OUTPUT_FILE, evGPUCapFileStrObj);
diff --git a/MoltenVK/MoltenVK/Utility/MVKEnvironment.h b/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
index 4c27b4c..92de02a 100644
--- a/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
+++ b/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
@@ -231,14 +231,17 @@
 #endif
 
 /**
- * Allow the use of MTLFence or MTLEvent for VkSemaphore synchronization behaviour.
+ * Allow the use of a single queue or MTLEvent for VkSemaphore synchronization behaviour.
  * By default:
  *   - MVK_ALLOW_METAL_EVENTS is enabled
- *   - MVK_ALLOW_METAL_FENCES is disabled
+ *   - MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE is enabled
  * */
 #ifndef MVK_ALLOW_METAL_EVENTS
 #   define MVK_ALLOW_METAL_EVENTS    1
 #endif
+#ifndef MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE
+#   define MVK_ALLOW_SINGLE_QUEUE_SEMAPHORE    1
+#endif
 
 /** Substitute Metal 2D textures for Vulkan 1D images. Enabled by default. */
 #ifndef MVK_CONFIG_TEXTURE_1D_AS_2D