Use placement heaps for VkDeviceMemory when possible.

All Apple GPUs support this, as does Mac GPU family 2. With this, we can
avoid expensive copies between buffers and textures allocated from the
same memory, and reduce memory usage to boot.

macOS is, unfortunately, constrained by the fact that `MTLHeap` objects
do not support any storage mode other than `MTLStorageModePrivate`--not
even `MTLStorageModeManaged`. As a result, placement heaps won't help
for managed textures on macOS. Perhaps Apple will lift this restriction
in a later version.
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index c9edf75..7a09f83 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -542,6 +542,7 @@
 	VkBool32 postDepthCoverage;					/**< If true, coverage masks in fragment shaders post-depth-test are supported. */
 	VkBool32 native3DCompressedTextures;		/**< If true, 3D compressed images are supported natively, without manual decompression. */
 	VkBool32 nativeTextureSwizzle;				/**< If true, component swizzle is supported natively, without manual swizzling in shaders. */
+	VkBool32 placementHeaps;					/**< If true, MTLHeap objects support placement of resources. */
 } MVKPhysicalDeviceMetalFeatures;
 
 /**
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
index 82815ca..67d8844 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.h
@@ -69,10 +69,10 @@
 #pragma mark Metal
 
 	/** Returns the Metal buffer underlying this memory allocation. */
-	inline id<MTLBuffer> getMTLBuffer() { return _deviceMemory ? _deviceMemory->getMTLBuffer() : nullptr; }
+	id<MTLBuffer> getMTLBuffer();
 
 	/** Returns the offset at which the contents of this instance starts within the underlying Metal buffer. */
-	inline NSUInteger getMTLBufferOffset() { return _deviceMemoryOffset; }
+	inline NSUInteger getMTLBufferOffset() { return _deviceMemory && _deviceMemory->getMTLHeap() ? 0 : _deviceMemoryOffset; }
 
 
 #pragma mark Construction
@@ -90,6 +90,7 @@
 						   VkBufferMemoryBarrier* pBufferMemoryBarrier);
 
 	VkBufferUsageFlags _usage;
+	id<MTLBuffer> _mtlBuffer = nil;
 };
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
index ee23a31..bf1be23 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
@@ -29,21 +29,28 @@
 #pragma mark MVKBuffer
 
 void MVKBuffer::propogateDebugName() {
-	if (_debugName &&
-		_deviceMemory &&
+	if (!_debugName) { return; }
+	if (_deviceMemory &&
 		_deviceMemory->isDedicatedAllocation() &&
 		_deviceMemory->_debugName.length == 0) {
 
 		_deviceMemory->setDebugName(_debugName.UTF8String);
 	}
+	setLabelIfNotNil(_mtlBuffer, _debugName);
 }
 
 
 #pragma mark Resource memory
 
 VkResult MVKBuffer::getMemoryRequirements(VkMemoryRequirements* pMemoryRequirements) {
-	pMemoryRequirements->size = getByteCount();
-	pMemoryRequirements->alignment = _byteAlignment;
+	if (_device->_pMetalFeatures->placementHeaps) {
+		MTLSizeAndAlign sizeAndAlign = [_device->getMTLDevice() heapBufferSizeAndAlignWithLength: getByteCount() options: MTLResourceStorageModePrivate];
+		pMemoryRequirements->size = sizeAndAlign.size;
+		pMemoryRequirements->alignment = sizeAndAlign.align;
+	} else {
+		pMemoryRequirements->size = getByteCount();
+		pMemoryRequirements->alignment = _byteAlignment;
+	}
 	pMemoryRequirements->memoryTypeBits = _device->getPhysicalDevice()->getAllMemoryTypes();
 #if MVK_MACOS
 	// Textures must not use shared memory
@@ -61,21 +68,15 @@
 VkResult MVKBuffer::getMemoryRequirements(const void*, VkMemoryRequirements2* pMemoryRequirements) {
 	pMemoryRequirements->sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
 	getMemoryRequirements(&pMemoryRequirements->memoryRequirements);
-	auto* next = (VkStructureType*)pMemoryRequirements->pNext;
-	while (next) {
-		switch (*next) {
+	for (auto* next = (VkBaseOutStructure*)pMemoryRequirements->pNext; next; next = next->pNext) {
+		switch (next->sType) {
 		case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
 			auto* dedicatedReqs = (VkMemoryDedicatedRequirements*)next;
-			// TODO: Maybe someday we could do something with MTLHeaps
-			// and allocate non-dedicated memory from them. For now, we
-			// always prefer dedicated allocations.
-			dedicatedReqs->prefersDedicatedAllocation = VK_TRUE;
+			dedicatedReqs->prefersDedicatedAllocation = VK_FALSE;
 			dedicatedReqs->requiresDedicatedAllocation = VK_FALSE;
-			next = (VkStructureType*)dedicatedReqs->pNext;
 			break;
 		}
 		default:
-			next = (VkStructureType*)((VkMemoryRequirements2*)next)->pNext;
 			break;
 		}
 	}
@@ -134,6 +135,25 @@
 }
 
 
+#pragma mark Metal
+
+id<MTLBuffer> MVKBuffer::getMTLBuffer() {
+	if (_mtlBuffer) { return _mtlBuffer; }
+	if (_deviceMemory) {
+		if (_deviceMemory->getMTLHeap()) {
+			_mtlBuffer = [_deviceMemory->getMTLHeap() newBufferWithLength: getByteCount()
+																  options: _deviceMemory->getMTLResourceOptions()
+																   offset: _deviceMemoryOffset];	// retained
+			propogateDebugName();
+			return _mtlBuffer;
+		} else {
+			return _deviceMemory->getMTLBuffer();
+		}
+	}
+	return nil;
+}
+
+
 #pragma mark Construction
 
 MVKBuffer::MVKBuffer(MVKDevice* device, const VkBufferCreateInfo* pCreateInfo) : MVKResource(device), _usage(pCreateInfo->usage) {
@@ -143,6 +163,7 @@
 
 MVKBuffer::~MVKBuffer() {
 	if (_deviceMemory) { _deviceMemory->removeBuffer(this); }
+	if (_mtlBuffer) { [_mtlBuffer release]; }
 }
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index 1360e4e..8873d69 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -840,6 +840,7 @@
 
 	if ( mvkOSVersion() >= 13.0 ) {
 		_metalFeatures.mslVersionEnum = MTLLanguageVersion2_2;
+		_metalFeatures.placementHeaps = true;
 		if ( getSupportsGPUFamily(MTLGPUFamilyApple4) ) {
 			_metalFeatures.nativeTextureSwizzle = true;
 		}
@@ -894,6 +895,7 @@
 		_metalFeatures.native3DCompressedTextures = true;
 		if ( getSupportsGPUFamily(MTLGPUFamilyMac2) ) {
 			_metalFeatures.nativeTextureSwizzle = true;
+			_metalFeatures.placementHeaps = true;
 		}
 	}
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
index c87b443..bd15e52 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.h
@@ -94,6 +94,9 @@
 	/** Returns the Metal buffer underlying this memory allocation. */
 	inline id<MTLBuffer> getMTLBuffer() { return _mtlBuffer; }
 
+	/** Returns the Metal heap underlying this memory allocation. */
+	inline id<MTLHeap> getMTLHeap() { return _mtlHeap; }
+
 	/** Returns the Metal storage mode used by this memory allocation. */
 	inline MTLStorageMode getMTLStorageMode() { return _mtlStorageMode; }
 
@@ -123,6 +126,7 @@
 	void removeBuffer(MVKBuffer* mvkBuff);
 	VkResult addImage(MVKImage* mvkImg);
 	void removeImage(MVKImage* mvkImg);
+	bool ensureMTLHeap();
 	bool ensureMTLBuffer();
 	bool ensureHostMemory();
 	void freeHostMemory();
@@ -135,6 +139,7 @@
 	VkDeviceSize _mapOffset = 0;
 	VkDeviceSize _mapSize = 0;
 	id<MTLBuffer> _mtlBuffer = nil;
+	id<MTLHeap> _mtlHeap = nil;
 	void* _pMemory = nullptr;
 	void* _pHostMemory = nullptr;
 	bool _isMapped = false;
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
index ee4aedf..c25500c 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
@@ -31,7 +31,10 @@
 
 #pragma mark MVKDeviceMemory
 
-void MVKDeviceMemory::propogateDebugName() { setLabelIfNotNil(_mtlBuffer, _debugName); }
+void MVKDeviceMemory::propogateDebugName() {
+	setLabelIfNotNil(_mtlHeap, _debugName);
+	setLabelIfNotNil(_mtlBuffer, _debugName);
+}
 
 VkResult MVKDeviceMemory::map(VkDeviceSize offset, VkDeviceSize size, VkMemoryMapFlags flags, void** ppData) {
 
@@ -85,8 +88,11 @@
 		}
 #endif
 
-		lock_guard<mutex> lock(_rezLock);
-        for (auto& img : _images) { img->flushToDevice(offset, memSize); }
+		// If we have an MTLHeap object, there's no need to sync memory manually between images and the buffer.
+		if (!_mtlHeap) {
+			lock_guard<mutex> lock(_rezLock);
+			for (auto& img : _images) { img->flushToDevice(offset, memSize); }
+		}
 	}
 	return VK_SUCCESS;
 }
@@ -94,7 +100,7 @@
 VkResult MVKDeviceMemory::pullFromDevice(VkDeviceSize offset, VkDeviceSize size, bool evenIfCoherent) {
 	// Coherent memory is flushed on unmap(), so it is only flushed if forced
     VkDeviceSize memSize = adjustMemorySize(size, offset);
-	if (memSize > 0 && isMemoryHostAccessible() && (evenIfCoherent || !isMemoryHostCoherent()) ) {
+	if (memSize > 0 && isMemoryHostAccessible() && (evenIfCoherent || !isMemoryHostCoherent()) && !_mtlHeap) {
 		lock_guard<mutex> lock(_rezLock);
         for (auto& img : _images) { img->pullFromDevice(offset, memSize); }
 	}
@@ -140,8 +146,7 @@
 		return reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "Could not bind VkImage %p to a VkDeviceMemory dedicated to resource %p. A dedicated allocation may only be used with the resource it was dedicated to.", mvkImg, getDedicatedResource() );
 	}
 
-	if (!_isDedicated)
-		_images.push_back(mvkImg);
+	if (!_isDedicated) { _images.push_back(mvkImg); }
 
 	return VK_SUCCESS;
 }
@@ -151,6 +156,36 @@
 	mvkRemoveAllOccurances(_images, mvkImg);
 }
 
+// Ensures that this instance is backed by a MTLHeap object,
+// creating the MTLHeap if needed, and returns whether it was successful.
+bool MVKDeviceMemory::ensureMTLHeap() {
+
+	if (_mtlHeap) { return true; }
+
+	// Don't bother if we don't have placement heaps.
+	if (!getDevice()->_pMetalFeatures->placementHeaps) { return true; }
+
+#if MVK_MACOS
+	// MTLHeaps on Mac must use private storage for now.
+	if (_mtlStorageMode != MTLStorageModePrivate) { return true; }
+#endif
+
+	MTLHeapDescriptor* heapDesc = [MTLHeapDescriptor new];
+	heapDesc.type = MTLHeapTypePlacement;
+	heapDesc.resourceOptions = getMTLResourceOptions();
+	// For now, use tracked resources. Later, we should probably default
+	// to untracked, since Vulkan uses explicit barriers anyway.
+	heapDesc.hazardTrackingMode = MTLHazardTrackingModeTracked;
+	heapDesc.size = _allocationSize;
+	_mtlHeap = [_device->getMTLDevice() newHeapWithDescriptor: heapDesc];	// retained
+	[heapDesc release];
+	if (!_mtlHeap) { return false; }
+
+	propogateDebugName();
+
+	return true;
+}
+
 // Ensures that this instance is backed by a MTLBuffer object,
 // creating the MTLBuffer if needed, and returns whether it was successful.
 bool MVKDeviceMemory::ensureMTLBuffer() {
@@ -162,12 +197,20 @@
 	if (memLen > _device->_pMetalFeatures->maxMTLBufferSize) { return false; }
 
 	// If host memory was already allocated, it is copied into the new MTLBuffer, and then released.
-	if (_pHostMemory) {
+	if (_mtlHeap) {
+		_mtlBuffer = [_mtlHeap newBufferWithLength: memLen options: getMTLResourceOptions() offset: 0];	// retained
+		if (_pHostMemory) {
+			memcpy(_mtlBuffer.contents, _pHostMemory, memLen);
+			freeHostMemory();
+		}
+		[_mtlBuffer makeAliasable];
+	} else if (_pHostMemory) {
 		_mtlBuffer = [getMTLDevice() newBufferWithBytes: _pHostMemory length: memLen options: getMTLResourceOptions()];     // retained
 		freeHostMemory();
 	} else {
 		_mtlBuffer = [getMTLDevice() newBufferWithLength: memLen options: getMTLResourceOptions()];     // retained
 	}
+	if (!_mtlBuffer) { return false; }
 	_pMemory = isMemoryHostAccessible() ? _mtlBuffer.contents : nullptr;
 
 	propogateDebugName();
@@ -254,6 +297,15 @@
 		return;
 	}
 
+	// If we can, create a MTLHeap. This must happen before creating the buffer,
+	// allowing us to map its contents.
+	if (!dedicatedImage && !dedicatedBuffer) {
+		if (!ensureMTLHeap()) {
+			setConfigurationResult(reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "Could not allocate VkDeviceMemory of size %llu bytes.", _allocationSize));
+			return;
+		}
+	}
+
 	// If memory needs to be coherent it must reside in an MTLBuffer, since an open-ended map() must work.
 	if (isMemoryHostCoherent() && !ensureMTLBuffer() ) {
 		setConfigurationResult(reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "Could not allocate a host-coherent VkDeviceMemory of size %llu bytes. The maximum memory-aligned size of a host-coherent VkDeviceMemory is %llu bytes.", _allocationSize, _device->_pMetalFeatures->maxMTLBufferSize));
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKImage.h b/MoltenVK/MoltenVK/GPUObjects/MVKImage.h
index 0cbf0d0..dfbde55 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKImage.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKImage.h
@@ -275,6 +275,7 @@
 	bool _usesTexelBuffer;
 	bool _isLinear;
 	bool _is3DCompressed;
+	bool _isAliasable;
 };
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm b/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
index 04a8862..bf51f65 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
@@ -192,21 +192,16 @@
 VkResult MVKImage::getMemoryRequirements(const void*, VkMemoryRequirements2* pMemoryRequirements) {
 	pMemoryRequirements->sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
 	getMemoryRequirements(&pMemoryRequirements->memoryRequirements);
-	auto* next = (VkStructureType*)pMemoryRequirements->pNext;
-	while (next) {
-		switch (*next) {
+	for (auto* next = (VkBaseOutStructure*)pMemoryRequirements->pNext; next; next = next->pNext) {
+		switch (next->sType) {
 		case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
 			auto* dedicatedReqs = (VkMemoryDedicatedRequirements*)next;
-			// TODO: Maybe someday we could do something with MTLHeaps
-			// and allocate non-dedicated memory from them. For now, we
-			// always prefer dedicated allocations.
-			dedicatedReqs->prefersDedicatedAllocation = VK_TRUE;
+			bool writable = mvkIsAnyFlagEnabled(_usage, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT);
+			dedicatedReqs->prefersDedicatedAllocation = !_usesTexelBuffer && (writable || !_device->_pMetalFeatures->placementHeaps);
 			dedicatedReqs->requiresDedicatedAllocation = VK_FALSE;
-			next = (VkStructureType*)dedicatedReqs->pNext;
 			break;
 		}
 		default:
-			next = (VkStructureType*)((VkMemoryRequirements2*)next)->pNext;
 			break;
 		}
 	}
@@ -231,7 +226,7 @@
 	bool isUncompressed = blockExt.width == 1 && blockExt.height == 1;
 
 	bool useTexelBuffer = _device->_pMetalFeatures->texelBuffers;								// Texel buffers available
-	useTexelBuffer = useTexelBuffer && isMemoryHostAccessible() && _isLinear && isUncompressed;	// Applicable memory layout
+	useTexelBuffer = useTexelBuffer && (isMemoryHostAccessible() || _device->_pMetalFeatures->placementHeaps) && _isLinear && isUncompressed;	// Applicable memory layout
 	useTexelBuffer = useTexelBuffer && _deviceMemory && _deviceMemory->_mtlBuffer;				// Buffer is available to overlay
 
 #if MVK_MACOS
@@ -352,6 +347,10 @@
 		mtlTex = [_deviceMemory->_mtlBuffer newTextureWithDescriptor: mtlTexDesc
 															  offset: getDeviceMemoryOffset()
 														 bytesPerRow: _subresources[0].layout.rowPitch];
+	} else if (_deviceMemory->_mtlHeap) {
+		mtlTex = [_deviceMemory->_mtlHeap newTextureWithDescriptor: mtlTexDesc
+															offset: getDeviceMemoryOffset()];
+		if (_isAliasable) [mtlTex makeAliasable];
 	} else {
 		mtlTex = [getMTLDevice() newTextureWithDescriptor: mtlTexDesc];
 	}
@@ -628,11 +627,20 @@
 	_canSupportMTLTextureView = !_isDepthStencilAttachment || _device->_pMetalFeatures->stencilViews;
 	_hasExpectedTexelSize = (mvkMTLPixelFormatBytesPerBlock(_mtlPixelFormat) == mvkVkFormatBytesPerBlock(pCreateInfo->format));
 
-	// Calc _byteCount after _byteAlignment
-	_byteAlignment = _isLinear ? _device->getVkFormatTexelBufferAlignment(pCreateInfo->format, this) : mvkEnsurePowerOfTwo(mvkVkFormatBytesPerBlock(pCreateInfo->format));
-    for (uint32_t mipLvl = 0; mipLvl < _mipLevels; mipLvl++) {
-        _byteCount += getBytesPerLayer(mipLvl) * _extent.depth * _arrayLayers;
-    }
+	if (!_isLinear && _device->_pMetalFeatures->placementHeaps) {
+		MTLTextureDescriptor *mtlTexDesc = newMTLTextureDescriptor();	// temp retain
+		MTLSizeAndAlign sizeAndAlign = [_device->getMTLDevice() heapTextureSizeAndAlignWithDescriptor: mtlTexDesc];
+		[mtlTexDesc release];
+		_byteCount = sizeAndAlign.size;
+		_byteAlignment = sizeAndAlign.align;
+		_isAliasable = mvkIsAnyFlagEnabled(pCreateInfo->flags, VK_IMAGE_CREATE_ALIAS_BIT);
+	} else {
+		// Calc _byteCount after _byteAlignment
+		_byteAlignment = _isLinear ? _device->getVkFormatTexelBufferAlignment(pCreateInfo->format, this) : mvkEnsurePowerOfTwo(mvkVkFormatBytesPerBlock(pCreateInfo->format));
+		for (uint32_t mipLvl = 0; mipLvl < _mipLevels; mipLvl++) {
+			_byteCount += getBytesPerLayer(mipLvl) * _extent.depth * _arrayLayers;
+		}
+	}
 
     initSubresources(pCreateInfo);
 }