MVKBuffer: Support texel buffers in "host-coherent" memory on Mac.

According to the Vulkan spec:

> * If `buffer` is a `VkBuffer` not created with the
>   `VK_BUFFER_CREATE_SPARSE_BINDING_BIT` bit set[...] then the
>   `memoryTypeBits` member always contains at least one bit set
>   corresponding to a `VkMemoryType` with a `propertyFlags` that has
>   both the `VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT` bit and the
>   `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT` bit set. In other words,
>   mappable coherent memory **can** always be attached to these
>   objects.

There is no exception for texel buffers. Although desktop Metal
disallows textures in shared memory, including linear textures created
from a buffer, we still have to advertise host-coherent memory for texel
buffers. Some applications actually depend on this behavior, so it's not
just a theoretical concern.

To support host-coherent texel buffers, we implicitly create a managed
buffer and copy data between the device memory and the managed buffer,
just like for a linear image.

Signed-off-by: Chip Davis <cdavis@codeweavers.com>
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
index fb59c4d..d3cbb05 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
@@ -72,7 +72,7 @@
 	id<MTLBuffer> getMTLBuffer();
 
 	/** Returns the offset at which the contents of this instance starts within the underlying Metal buffer. */
-	inline NSUInteger getMTLBufferOffset() { return _deviceMemory && _deviceMemory->getMTLHeap() ? 0 : _deviceMemoryOffset; }
+	inline NSUInteger getMTLBufferOffset() { return (_deviceMemory && _deviceMemory->getMTLHeap()) || _isHostCoherentTexelBuffer ? 0 : _deviceMemoryOffset; }	// an MTLBuffer created just for this instance holds its contents from offset zero
 
 
 #pragma mark Construction
@@ -82,14 +82,19 @@
 	~MVKBuffer() override;
 
 protected:
+	friend class MVKDeviceMemory;
 	using MVKResource::needsHostReadSync;
 
 	void propogateDebugName() override;
 	bool needsHostReadSync(VkPipelineStageFlags srcStageMask,
 						   VkPipelineStageFlags dstStageMask,
 						   VkBufferMemoryBarrier* pBufferMemoryBarrier);
+	bool shouldFlushHostMemory();
+	VkResult flushToDevice(VkDeviceSize offset, VkDeviceSize size);
+	VkResult pullFromDevice(VkDeviceSize offset, VkDeviceSize size);
 
 	VkBufferUsageFlags _usage;
+	bool _isHostCoherentTexelBuffer = false;
 	id<MTLBuffer> _mtlBuffer = nil;
 };
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
index 9fdf6af..728fa61 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
@@ -52,12 +52,6 @@
 		pMemoryRequirements->alignment = _byteAlignment;
 	}
 	pMemoryRequirements->memoryTypeBits = _device->getPhysicalDevice()->getAllMemoryTypes();
-#if MVK_MACOS
-	// Textures must not use shared memory
-	if (mvkIsAnyFlagEnabled(_usage, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)) {
-		mvkDisableFlags(pMemoryRequirements->memoryTypeBits, _device->getPhysicalDevice()->getHostCoherentMemoryTypes());
-	}
-#endif
 #if MVK_IOS
 	// Memoryless storage is not allowed for buffers
 	mvkDisableFlags(pMemoryRequirements->memoryTypeBits, _device->getPhysicalDevice()->getLazilyAllocatedMemoryTypes());
@@ -88,6 +82,12 @@
 
 	MVKResource::bindDeviceMemory(mvkMem, memOffset);
 
+#if MVK_MACOS
+	if (_deviceMemory) {
+		_isHostCoherentTexelBuffer = _deviceMemory->isMemoryHostCoherent() && mvkIsAnyFlagEnabled(_usage, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT);
+	}
+#endif
+
 	propogateDebugName();
 
 	return _deviceMemory ? _deviceMemory->addBuffer(this) : VK_SUCCESS;
@@ -117,6 +117,53 @@
 #endif
 }
 
+#if MVK_MACOS
+bool MVKBuffer::shouldFlushHostMemory() { return _isHostCoherentTexelBuffer; }
+#endif
+
+// Flushes the part of the specified device memory range that overlaps this buffer into the MTLBuffer.
+// The offset is relative to the device memory; byte zero of the MTLBuffer corresponds to _deviceMemoryOffset.
+VkResult MVKBuffer::flushToDevice(VkDeviceSize offset, VkDeviceSize size) {
+#if MVK_MACOS
+	if (shouldFlushHostMemory()) {
+		// Intersect the requested range with this buffer's region, without overflowing offset + size.
+		VkDeviceSize buffStart = _deviceMemoryOffset;
+		VkDeviceSize buffEnd = buffStart + getByteCount();
+		VkDeviceSize copyStart = offset > buffStart ? offset : buffStart;
+		VkDeviceSize copyEnd = (offset < buffEnd && size < buffEnd - offset) ? offset + size : buffEnd;
+		if (copyStart < copyEnd) {
+			id<MTLBuffer> mtlBuff = getMTLBuffer();
+			NSUInteger mtlOffset = copyStart - buffStart;
+			memcpy(reinterpret_cast<char *>(mtlBuff.contents) + mtlOffset,
+				   reinterpret_cast<const char *>(_deviceMemory->getHostMemoryAddress()) + copyStart,
+				   copyEnd - copyStart);
+			[mtlBuff didModifyRange: NSMakeRange(mtlOffset, copyEnd - copyStart)];
+		}
+	}
+#endif
+	return VK_SUCCESS;
+}
+
+// Pulls the part of the specified device memory range that overlaps this buffer from the MTLBuffer
+// into the device memory. The offset is relative to the device memory, as in flushToDevice().
+VkResult MVKBuffer::pullFromDevice(VkDeviceSize offset, VkDeviceSize size) {
+#if MVK_MACOS
+	if (shouldFlushHostMemory()) {
+		// Intersect the requested range with this buffer's region, without overflowing offset + size.
+		VkDeviceSize buffStart = _deviceMemoryOffset;
+		VkDeviceSize buffEnd = buffStart + getByteCount();
+		VkDeviceSize copyStart = offset > buffStart ? offset : buffStart;
+		VkDeviceSize copyEnd = (offset < buffEnd && size < buffEnd - offset) ? offset + size : buffEnd;
+		if (copyStart < copyEnd) {
+			memcpy(reinterpret_cast<char *>(_deviceMemory->getHostMemoryAddress()) + copyStart,
+				   reinterpret_cast<const char *>(getMTLBuffer().contents) + (copyStart - buffStart),
+				   copyEnd - copyStart);
+		}
+	}
+#endif
+	return VK_SUCCESS;
+}
+
 // Returns whether the specified buffer memory barrier requires a sync between this
 // buffer and host memory for the purpose of the host reading texture memory.
 bool MVKBuffer::needsHostReadSync(VkPipelineStageFlags srcStageMask,
@@ -128,7 +175,7 @@
 #if MVK_MACOS
 	return (mvkIsAnyFlagEnabled(dstStageMask, (VK_PIPELINE_STAGE_HOST_BIT)) &&
 			mvkIsAnyFlagEnabled(pBufferMemoryBarrier->dstAccessMask, (VK_ACCESS_HOST_READ_BIT)) &&
-			isMemoryHostAccessible() && !isMemoryHostCoherent());
+			isMemoryHostAccessible() && (!isMemoryHostCoherent() || _isHostCoherentTexelBuffer));
 #endif
 }
 
@@ -144,6 +191,16 @@
 																   offset: _deviceMemoryOffset];	// retained
 			propogateDebugName();
 			return _mtlBuffer;
+#if MVK_MACOS
+		} else if (_isHostCoherentTexelBuffer) {
+			// According to the Vulkan spec, buffers, like linear images, can always use host-coherent memory.
+			// But texel buffers on Mac cannot use shared memory, so we back this buffer with a separate
+			// managed-storage MTLBuffer and copy between it and the host-coherent device memory.
+			_mtlBuffer = [_device->getMTLDevice() newBufferWithLength: getByteCount()
+															  options: MTLResourceStorageModeManaged];	// retained
+			propogateDebugName();
+			return _mtlBuffer;
+#endif
 		} else {
 			return _deviceMemory->getMTLBuffer();
 		}
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
index 6f3a5d4..714a8f9 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
@@ -93,6 +93,7 @@
 		if (!_mtlHeap) {
 			lock_guard<mutex> lock(_rezLock);
 			for (auto& img : _images) { img->flushToDevice(offset, memSize); }
+			for (auto& buf : _buffers) { buf->flushToDevice(offset, memSize); }
 		}
 	}
 	return VK_SUCCESS;
@@ -107,6 +108,7 @@
 	if (memSize > 0 && isMemoryHostAccessible() && (evenIfCoherent || !isMemoryHostCoherent()) && !_mtlHeap) {
 		lock_guard<mutex> lock(_rezLock);
         for (auto& img : _images) { img->pullFromDevice(offset, memSize); }
+        for (auto& buf : _buffers) { buf->pullFromDevice(offset, memSize); }
 
 #if MVK_MACOS
 		if (pBlitEnc && _mtlBuffer && _mtlStorageMode == MTLStorageModeManaged) {