Merge pull request #711 from cdavis5e/metal3-placement-heaps

Use placement heaps for VkDeviceMemory when possible.
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index c9edf75..7a09f83 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -542,6 +542,7 @@
 	VkBool32 postDepthCoverage;					/**< If true, coverage masks in fragment shaders post-depth-test are supported. */
 	VkBool32 native3DCompressedTextures;		/**< If true, 3D compressed images are supported natively, without manual decompression. */
 	VkBool32 nativeTextureSwizzle;				/**< If true, component swizzle is supported natively, without manual swizzling in shaders. */
+	VkBool32 placementHeaps;					/**< If true, MTLHeap objects support placement of resources. */
 } MVKPhysicalDeviceMetalFeatures;
 
 /**
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
index 82815ca..67d8844 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
@@ -69,10 +69,10 @@
 #pragma mark Metal
 
 	/** Returns the Metal buffer underlying this memory allocation. */
-	inline id<MTLBuffer> getMTLBuffer() { return _deviceMemory ? _deviceMemory->getMTLBuffer() : nullptr; }
+	id<MTLBuffer> getMTLBuffer();
 
 	/** Returns the offset at which the contents of this instance starts within the underlying Metal buffer. */
-	inline NSUInteger getMTLBufferOffset() { return _deviceMemoryOffset; }
+	inline NSUInteger getMTLBufferOffset() { return _deviceMemory && _deviceMemory->getMTLHeap() ? 0 : _deviceMemoryOffset; }
 
 
 #pragma mark Construction
@@ -90,6 +90,7 @@
 						   VkBufferMemoryBarrier* pBufferMemoryBarrier);
 
 	VkBufferUsageFlags _usage;
+	id<MTLBuffer> _mtlBuffer = nil;
 };
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
index ee23a31..bf1be23 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
@@ -29,21 +29,28 @@
 #pragma mark MVKBuffer
 
 void MVKBuffer::propogateDebugName() {
-	if (_debugName &&
-		_deviceMemory &&
+	if (!_debugName) { return; }
+	if (_deviceMemory &&
 		_deviceMemory->isDedicatedAllocation() &&
 		_deviceMemory->_debugName.length == 0) {
 
 		_deviceMemory->setDebugName(_debugName.UTF8String);
 	}
+	setLabelIfNotNil(_mtlBuffer, _debugName);
 }
 
 
 #pragma mark Resource memory
 
 VkResult MVKBuffer::getMemoryRequirements(VkMemoryRequirements* pMemoryRequirements) {
-	pMemoryRequirements->size = getByteCount();
-	pMemoryRequirements->alignment = _byteAlignment;
+	if (_device->_pMetalFeatures->placementHeaps) {
+		MTLSizeAndAlign sizeAndAlign = [_device->getMTLDevice() heapBufferSizeAndAlignWithLength: getByteCount() options: MTLResourceStorageModePrivate];
+		pMemoryRequirements->size = sizeAndAlign.size;
+		pMemoryRequirements->alignment = sizeAndAlign.align;
+	} else {
+		pMemoryRequirements->size = getByteCount();
+		pMemoryRequirements->alignment = _byteAlignment;
+	}
 	pMemoryRequirements->memoryTypeBits = _device->getPhysicalDevice()->getAllMemoryTypes();
 #if MVK_MACOS
 	// Textures must not use shared memory
@@ -61,21 +68,15 @@
 VkResult MVKBuffer::getMemoryRequirements(const void*, VkMemoryRequirements2* pMemoryRequirements) {
 	pMemoryRequirements->sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
 	getMemoryRequirements(&pMemoryRequirements->memoryRequirements);
-	auto* next = (VkStructureType*)pMemoryRequirements->pNext;
-	while (next) {
-		switch (*next) {
+	for (auto* next = (VkBaseOutStructure*)pMemoryRequirements->pNext; next; next = next->pNext) {
+		switch (next->sType) {
 		case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
 			auto* dedicatedReqs = (VkMemoryDedicatedRequirements*)next;
-			// TODO: Maybe someday we could do something with MTLHeaps
-			// and allocate non-dedicated memory from them. For now, we
-			// always prefer dedicated allocations.
-			dedicatedReqs->prefersDedicatedAllocation = VK_TRUE;
+			dedicatedReqs->prefersDedicatedAllocation = VK_FALSE;
 			dedicatedReqs->requiresDedicatedAllocation = VK_FALSE;
-			next = (VkStructureType*)dedicatedReqs->pNext;
 			break;
 		}
 		default:
-			next = (VkStructureType*)((VkMemoryRequirements2*)next)->pNext;
 			break;
 		}
 	}
@@ -134,6 +135,25 @@
 }
 
 
+#pragma mark Metal
+
+id<MTLBuffer> MVKBuffer::getMTLBuffer() {
+	if (_mtlBuffer) { return _mtlBuffer; }
+	if (_deviceMemory) {
+		if (_deviceMemory->getMTLHeap()) {
+			_mtlBuffer = [_deviceMemory->getMTLHeap() newBufferWithLength: getByteCount()
+																  options: _deviceMemory->getMTLResourceOptions()
+																   offset: _deviceMemoryOffset];	// retained
+			propogateDebugName();
+			return _mtlBuffer;
+		} else {
+			return _deviceMemory->getMTLBuffer();
+		}
+	}
+	return nil;
+}
+
+
 #pragma mark Construction
 
 MVKBuffer::MVKBuffer(MVKDevice* device, const VkBufferCreateInfo* pCreateInfo) : MVKResource(device), _usage(pCreateInfo->usage) {
@@ -143,6 +163,7 @@
 
 MVKBuffer::~MVKBuffer() {
 	if (_deviceMemory) { _deviceMemory->removeBuffer(this); }
+	if (_mtlBuffer) { [_mtlBuffer release]; }
 }
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index 1360e4e..8873d69 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -840,6 +840,7 @@
 
 	if ( mvkOSVersion() >= 13.0 ) {
 		_metalFeatures.mslVersionEnum = MTLLanguageVersion2_2;
+		_metalFeatures.placementHeaps = true;
 		if ( getSupportsGPUFamily(MTLGPUFamilyApple4) ) {
 			_metalFeatures.nativeTextureSwizzle = true;
 		}
@@ -894,6 +895,7 @@
 		_metalFeatures.native3DCompressedTextures = true;
 		if ( getSupportsGPUFamily(MTLGPUFamilyMac2) ) {
 			_metalFeatures.nativeTextureSwizzle = true;
+			_metalFeatures.placementHeaps = true;
 		}
 	}
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
index c87b443..bd15e52 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
@@ -94,6 +94,9 @@
 	/** Returns the Metal buffer underlying this memory allocation. */
 	inline id<MTLBuffer> getMTLBuffer() { return _mtlBuffer; }
 
+	/** Returns the Metal heap underlying this memory allocation. */
+	inline id<MTLHeap> getMTLHeap() { return _mtlHeap; }
+
 	/** Returns the Metal storage mode used by this memory allocation. */
 	inline MTLStorageMode getMTLStorageMode() { return _mtlStorageMode; }
 
@@ -123,6 +126,7 @@
 	void removeBuffer(MVKBuffer* mvkBuff);
 	VkResult addImage(MVKImage* mvkImg);
 	void removeImage(MVKImage* mvkImg);
+	bool ensureMTLHeap();
 	bool ensureMTLBuffer();
 	bool ensureHostMemory();
 	void freeHostMemory();
@@ -135,6 +139,7 @@
 	VkDeviceSize _mapOffset = 0;
 	VkDeviceSize _mapSize = 0;
 	id<MTLBuffer> _mtlBuffer = nil;
+	id<MTLHeap> _mtlHeap = nil;
 	void* _pMemory = nullptr;
 	void* _pHostMemory = nullptr;
 	bool _isMapped = false;
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
index ee4aedf..c25500c 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
@@ -31,7 +31,10 @@
 
 #pragma mark MVKDeviceMemory
 
-void MVKDeviceMemory::propogateDebugName() { setLabelIfNotNil(_mtlBuffer, _debugName); }
+void MVKDeviceMemory::propogateDebugName() {
+	setLabelIfNotNil(_mtlHeap, _debugName);
+	setLabelIfNotNil(_mtlBuffer, _debugName);
+}
 
 VkResult MVKDeviceMemory::map(VkDeviceSize offset, VkDeviceSize size, VkMemoryMapFlags flags, void** ppData) {
 
@@ -85,8 +88,11 @@
 		}
 #endif
 
-		lock_guard<mutex> lock(_rezLock);
-        for (auto& img : _images) { img->flushToDevice(offset, memSize); }
+		// If we have an MTLHeap object, there's no need to sync memory manually between images and the buffer.
+		if (!_mtlHeap) {
+			lock_guard<mutex> lock(_rezLock);
+			for (auto& img : _images) { img->flushToDevice(offset, memSize); }
+		}
 	}
 	return VK_SUCCESS;
 }
@@ -94,7 +100,7 @@
 VkResult MVKDeviceMemory::pullFromDevice(VkDeviceSize offset, VkDeviceSize size, bool evenIfCoherent) {
 	// Coherent memory is flushed on unmap(), so it is only flushed if forced
     VkDeviceSize memSize = adjustMemorySize(size, offset);
-	if (memSize > 0 && isMemoryHostAccessible() && (evenIfCoherent || !isMemoryHostCoherent()) ) {
+	if (memSize > 0 && isMemoryHostAccessible() && (evenIfCoherent || !isMemoryHostCoherent()) && !_mtlHeap) {
 		lock_guard<mutex> lock(_rezLock);
         for (auto& img : _images) { img->pullFromDevice(offset, memSize); }
 	}
@@ -140,8 +146,7 @@
 		return reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "Could not bind VkImage %p to a VkDeviceMemory dedicated to resource %p. A dedicated allocation may only be used with the resource it was dedicated to.", mvkImg, getDedicatedResource() );
 	}
 
-	if (!_isDedicated)
-		_images.push_back(mvkImg);
+	if (!_isDedicated) { _images.push_back(mvkImg); }
 
 	return VK_SUCCESS;
 }
@@ -151,6 +156,36 @@
 	mvkRemoveAllOccurances(_images, mvkImg);
 }
 
+// Ensures that this instance is backed by a MTLHeap object,
+// creating the MTLHeap if needed, and returns whether it was successful.
+bool MVKDeviceMemory::ensureMTLHeap() {
+
+	if (_mtlHeap) { return true; }
+
+	// Don't bother if we don't have placement heaps.
+	if (!getDevice()->_pMetalFeatures->placementHeaps) { return true; }
+
+#if MVK_MACOS
+	// MTLHeaps on Mac must use private storage for now.
+	if (_mtlStorageMode != MTLStorageModePrivate) { return true; }
+#endif
+
+	MTLHeapDescriptor* heapDesc = [MTLHeapDescriptor new];
+	heapDesc.type = MTLHeapTypePlacement;
+	heapDesc.resourceOptions = getMTLResourceOptions();
+	// For now, use tracked resources. Later, we should probably default
+	// to untracked, since Vulkan uses explicit barriers anyway.
+	heapDesc.hazardTrackingMode = MTLHazardTrackingModeTracked;
+	heapDesc.size = _allocationSize;
+	_mtlHeap = [_device->getMTLDevice() newHeapWithDescriptor: heapDesc];	// retained
+	[heapDesc release];
+	if (!_mtlHeap) { return false; }
+
+	propogateDebugName();
+
+	return true;
+}
+
 // Ensures that this instance is backed by a MTLBuffer object,
 // creating the MTLBuffer if needed, and returns whether it was successful.
 bool MVKDeviceMemory::ensureMTLBuffer() {
@@ -162,12 +197,20 @@
 	if (memLen > _device->_pMetalFeatures->maxMTLBufferSize) { return false; }
 
 	// If host memory was already allocated, it is copied into the new MTLBuffer, and then released.
-	if (_pHostMemory) {
+	if (_mtlHeap) {
+		_mtlBuffer = [_mtlHeap newBufferWithLength: memLen options: getMTLResourceOptions() offset: 0];	// retained
+		if (_pHostMemory) {
+			memcpy(_mtlBuffer.contents, _pHostMemory, memLen);
+			freeHostMemory();
+		}
+		[_mtlBuffer makeAliasable];
+	} else if (_pHostMemory) {
 		_mtlBuffer = [getMTLDevice() newBufferWithBytes: _pHostMemory length: memLen options: getMTLResourceOptions()];     // retained
 		freeHostMemory();
 	} else {
 		_mtlBuffer = [getMTLDevice() newBufferWithLength: memLen options: getMTLResourceOptions()];     // retained
 	}
+	if (!_mtlBuffer) { return false; }
 	_pMemory = isMemoryHostAccessible() ? _mtlBuffer.contents : nullptr;
 
 	propogateDebugName();
@@ -254,6 +297,15 @@
 		return;
 	}
 
+	// If we can, create a MTLHeap. This should happen before creating the buffer,
+	// allowing us to map its contents.
+	if (!dedicatedImage && !dedicatedBuffer) {
+		if (!ensureMTLHeap()) {
+			setConfigurationResult(reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "Could not allocate VkDeviceMemory of size %llu bytes.", _allocationSize));
+			return;
+		}
+	}
+
 	// If memory needs to be coherent it must reside in an MTLBuffer, since an open-ended map() must work.
 	if (isMemoryHostCoherent() && !ensureMTLBuffer() ) {
 		setConfigurationResult(reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "Could not allocate a host-coherent VkDeviceMemory of size %llu bytes. The maximum memory-aligned size of a host-coherent VkDeviceMemory is %llu bytes.", _allocationSize, _device->_pMetalFeatures->maxMTLBufferSize));
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKImage.h b/MoltenVK/MoltenVK/GPUObjects/MVKImage.h
index 0cbf0d0..dfbde55 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKImage.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKImage.h
@@ -275,6 +275,7 @@
 	bool _usesTexelBuffer;
 	bool _isLinear;
 	bool _is3DCompressed;
+	bool _isAliasable;
 };
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm b/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
index 04a8862..bf51f65 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
@@ -192,21 +192,16 @@
 VkResult MVKImage::getMemoryRequirements(const void*, VkMemoryRequirements2* pMemoryRequirements) {
 	pMemoryRequirements->sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
 	getMemoryRequirements(&pMemoryRequirements->memoryRequirements);
-	auto* next = (VkStructureType*)pMemoryRequirements->pNext;
-	while (next) {
-		switch (*next) {
+	for (auto* next = (VkBaseOutStructure*)pMemoryRequirements->pNext; next; next = next->pNext) {
+		switch (next->sType) {
 		case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
 			auto* dedicatedReqs = (VkMemoryDedicatedRequirements*)next;
-			// TODO: Maybe someday we could do something with MTLHeaps
-			// and allocate non-dedicated memory from them. For now, we
-			// always prefer dedicated allocations.
-			dedicatedReqs->prefersDedicatedAllocation = VK_TRUE;
+			bool writable = mvkIsAnyFlagEnabled(_usage, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT);
+			dedicatedReqs->prefersDedicatedAllocation = !_usesTexelBuffer && (writable || !_device->_pMetalFeatures->placementHeaps);
 			dedicatedReqs->requiresDedicatedAllocation = VK_FALSE;
-			next = (VkStructureType*)dedicatedReqs->pNext;
 			break;
 		}
 		default:
-			next = (VkStructureType*)((VkMemoryRequirements2*)next)->pNext;
 			break;
 		}
 	}
@@ -231,7 +226,7 @@
 	bool isUncompressed = blockExt.width == 1 && blockExt.height == 1;
 
 	bool useTexelBuffer = _device->_pMetalFeatures->texelBuffers;								// Texel buffers available
-	useTexelBuffer = useTexelBuffer && isMemoryHostAccessible() && _isLinear && isUncompressed;	// Applicable memory layout
+	useTexelBuffer = useTexelBuffer && (isMemoryHostAccessible() || _device->_pMetalFeatures->placementHeaps) && _isLinear && isUncompressed;	// Applicable memory layout
 	useTexelBuffer = useTexelBuffer && _deviceMemory && _deviceMemory->_mtlBuffer;				// Buffer is available to overlay
 
 #if MVK_MACOS
@@ -352,6 +347,10 @@
 		mtlTex = [_deviceMemory->_mtlBuffer newTextureWithDescriptor: mtlTexDesc
 															  offset: getDeviceMemoryOffset()
 														 bytesPerRow: _subresources[0].layout.rowPitch];
+	} else if (_deviceMemory->_mtlHeap) {
+		mtlTex = [_deviceMemory->_mtlHeap newTextureWithDescriptor: mtlTexDesc
+															offset: getDeviceMemoryOffset()];
+		if (_isAliasable) [mtlTex makeAliasable];
 	} else {
 		mtlTex = [getMTLDevice() newTextureWithDescriptor: mtlTexDesc];
 	}
@@ -628,11 +627,20 @@
 	_canSupportMTLTextureView = !_isDepthStencilAttachment || _device->_pMetalFeatures->stencilViews;
 	_hasExpectedTexelSize = (mvkMTLPixelFormatBytesPerBlock(_mtlPixelFormat) == mvkVkFormatBytesPerBlock(pCreateInfo->format));
 
-	// Calc _byteCount after _byteAlignment
-	_byteAlignment = _isLinear ? _device->getVkFormatTexelBufferAlignment(pCreateInfo->format, this) : mvkEnsurePowerOfTwo(mvkVkFormatBytesPerBlock(pCreateInfo->format));
-    for (uint32_t mipLvl = 0; mipLvl < _mipLevels; mipLvl++) {
-        _byteCount += getBytesPerLayer(mipLvl) * _extent.depth * _arrayLayers;
-    }
+	if (!_isLinear && _device->_pMetalFeatures->placementHeaps) {
+		MTLTextureDescriptor *mtlTexDesc = newMTLTextureDescriptor();	// temp retain
+		MTLSizeAndAlign sizeAndAlign = [_device->getMTLDevice() heapTextureSizeAndAlignWithDescriptor: mtlTexDesc];
+		[mtlTexDesc release];
+		_byteCount = sizeAndAlign.size;
+		_byteAlignment = sizeAndAlign.align;
+		_isAliasable = mvkIsAnyFlagEnabled(pCreateInfo->flags, VK_IMAGE_CREATE_ALIAS_BIT);
+	} else {
+		// Calc _byteCount after _byteAlignment
+		_byteAlignment = _isLinear ? _device->getVkFormatTexelBufferAlignment(pCreateInfo->format, this) : mvkEnsurePowerOfTwo(mvkVkFormatBytesPerBlock(pCreateInfo->format));
+		for (uint32_t mipLvl = 0; mipLvl < _mipLevels; mipLvl++) {
+			_byteCount += getBytesPerLayer(mipLvl) * _extent.depth * _arrayLayers;
+		}
+	}
 
     initSubresources(pCreateInfo);
 }