MVKCmdBlitImage: Support depth/stencil blits with inversion and scaling.

Prior to this, we weren't even setting the `BLIT_DST` bit for
depth/stencil formats. Conforming apps would thus never pass
depth/stencil images at all to `vkCmdBlitImage()`. It is now possible to
do that, and even get scaling and inversion to boot.

Stencil blits require the use of stencil feedback, i.e. writing the
stencil reference value from the fragment shader. If this feature isn't
available, both stencil and packed depth/stencil formats have their
`BLIT_SRC` and `BLIT_DST` features turned off, to prevent apps from
attempting to blit the stencil aspect.

There are only a couple of failing tests, involving a 1D stencil blit
(really a 2D stencil with height 1). For some reason, the fragments
produced during a scaled blit get spread out over the rendering surface.
I think this is a bug in Metal; we can't do anything about it.
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index 817ed19..2813a91 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -278,23 +278,15 @@
 
 	_filter = filter;
 
-	bool isDepthStencil = _dstImage->getIsDepthStencil();
 	bool isDestUnwritableLinear = MVK_MACOS && _dstImage->getIsLinear();
 
 	_vkImageBlits.clear();		// Clear for reuse
 	for (uint32_t rIdx = 0; rIdx < regionCount; rIdx++) {
 		auto& vkIB = pRegions[rIdx];
 
-		// Validate - depth stencil formats and macOS linear images cannot be a scaling or inversion destination
-		if (isDepthStencil || isDestUnwritableLinear) {
-			if ( !(canCopyFormats(vkIB) && canCopy(vkIB)) ) {
-				if (isDepthStencil) {
-					return cmdBuff->reportError(VK_ERROR_FEATURE_NOT_PRESENT, "vkCmdBlitImage(): Scaling or inverting depth/stencil images is not supported.");
-				}
-				if (isDestUnwritableLinear) {
-					return cmdBuff->reportError(VK_ERROR_FEATURE_NOT_PRESENT, "vkCmdBlitImage(): Scaling or inverting to a linear destination image is not supported.");
-				}
-			}
+		// Validate - macOS linear images cannot be a scaling or inversion destination
+		if (isDestUnwritableLinear && !(canCopyFormats(vkIB) && canCopy(vkIB)) ) {
+			return cmdBuff->reportError(VK_ERROR_FEATURE_NOT_PRESENT, "vkCmdBlitImage(): Scaling or inverting to a linear destination image is not supported.");
 		}
 
 		_vkImageBlits.push_back(vkIB);
@@ -438,9 +430,31 @@
 
             MTLRenderPassDescriptor* mtlRPD = [MTLRenderPassDescriptor renderPassDescriptor];
             MTLRenderPassColorAttachmentDescriptor* mtlColorAttDesc = mtlRPD.colorAttachments[0];
-            mtlColorAttDesc.loadAction = MTLLoadActionLoad;
-            mtlColorAttDesc.storeAction = MTLStoreActionStore;
-            mtlColorAttDesc.texture = dstMTLTex;
+            MTLRenderPassDepthAttachmentDescriptor* mtlDepthAttDesc = mtlRPD.depthAttachment;
+            MTLRenderPassStencilAttachmentDescriptor* mtlStencilAttDesc = mtlRPD.stencilAttachment;
+            if (mvkIsAnyFlagEnabled(mvkIBR.region.dstSubresource.aspectMask, (VK_IMAGE_ASPECT_DEPTH_BIT))) {
+                mtlDepthAttDesc.loadAction = MTLLoadActionLoad;
+                mtlDepthAttDesc.storeAction = MTLStoreActionStore;
+                mtlDepthAttDesc.texture = dstMTLTex;
+            } else {
+                mtlDepthAttDesc.loadAction = MTLLoadActionDontCare;
+                mtlDepthAttDesc.storeAction = MTLStoreActionDontCare;
+                mtlDepthAttDesc.texture = nil;
+            }
+            if (mvkIsAnyFlagEnabled(mvkIBR.region.dstSubresource.aspectMask, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+                mtlStencilAttDesc.loadAction = MTLLoadActionLoad;
+                mtlStencilAttDesc.storeAction = MTLStoreActionStore;
+                mtlStencilAttDesc.texture = dstMTLTex;
+            } else {
+                mtlStencilAttDesc.loadAction = MTLLoadActionDontCare;
+                mtlStencilAttDesc.storeAction = MTLStoreActionDontCare;
+                mtlStencilAttDesc.texture = nil;
+            }
+            if (!mvkIsAnyFlagEnabled(mvkIBR.region.dstSubresource.aspectMask, (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
+                mtlColorAttDesc.loadAction = MTLLoadActionLoad;
+                mtlColorAttDesc.storeAction = MTLStoreActionStore;
+                mtlColorAttDesc.texture = dstMTLTex;
+            }
 
             MVKRPSKeyBlitImg blitKey;
             blitKey.srcMTLPixelFormat = _srcImage->getMTLPixelFormat(srcPlaneIndex);
@@ -449,7 +463,7 @@
                 // In this case, I'll use a temp 2D array view. That way, I don't have to
                 // deal with mapping the blit coordinates to a cube direction vector.
                 blitKey.srcMTLTextureType = MTLTextureType2DArray;
-                srcMTLTex = [srcMTLTex newTextureViewWithPixelFormat: (MTLPixelFormat)blitKey.srcMTLPixelFormat
+                srcMTLTex = [srcMTLTex newTextureViewWithPixelFormat: blitKey.getSrcMTLPixelFormat()
                                                          textureType: MTLTextureType2DArray
                                                               levels: NSMakeRange(0, srcMTLTex.mipmapLevelCount)
                                                               slices: NSMakeRange(0, srcMTLTex.arrayLength)];
@@ -459,13 +473,19 @@
             }
             blitKey.dstMTLPixelFormat = _dstImage->getMTLPixelFormat(dstPlaneIndex);
             blitKey.srcFilter = mvkMTLSamplerMinMagFilterFromVkFilter(_filter);
+            blitKey.srcAspect = mvkIBR.region.srcSubresource.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
             blitKey.dstSampleCount = mvkSampleCountFromVkSampleCountFlagBits(_dstImage->getSampleCount());
             id<MTLRenderPipelineState> mtlRPS = cmdEncoder->getCommandEncodingPool()->getCmdBlitImageMTLRenderPipelineState(blitKey);
-
+            bool isBlittingDepth = mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_DEPTH_BIT));
+            bool isBlittingStencil = mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT));
+            id<MTLDepthStencilState> mtlDSS = cmdEncoder->getCommandEncodingPool()->getMTLDepthStencilState(isBlittingDepth, isBlittingStencil);
+            
             uint32_t vtxBuffIdx = cmdEncoder->getDevice()->getMetalBufferIndexForVertexAttributeBinding(kMVKVertexContentBufferIndex);
             
             mtlColorAttDesc.level = mvkIBR.region.dstSubresource.mipLevel;
-
+            mtlDepthAttDesc.level = mvkIBR.region.dstSubresource.mipLevel;
+            mtlStencilAttDesc.level = mvkIBR.region.dstSubresource.mipLevel;
+            
             uint32_t layCnt = mvkIBR.region.srcSubresource.layerCount;
             if (_dstImage->getMTLTextureType() == MTLTextureType3D) {
                 layCnt = mvkAbsDiff(mvkIBR.region.dstOffsets[1].z, mvkIBR.region.dstOffsets[0].z);
@@ -473,9 +493,14 @@
             for (uint32_t layIdx = 0; layIdx < layCnt; layIdx++) {
                 // Update the render pass descriptor for the texture level and slice, and create a render encoder.
                 if (_dstImage->getMTLTextureType() == MTLTextureType3D) {
-                    mtlColorAttDesc.depthPlane = mvkIBR.region.dstOffsets[0].z + (mvkIBR.region.dstOffsets[1].z > mvkIBR.region.dstOffsets[0].z ? layIdx : -(layIdx + 1));
+                    uint32_t depthPlane = mvkIBR.region.dstOffsets[0].z + (mvkIBR.region.dstOffsets[1].z > mvkIBR.region.dstOffsets[0].z ? layIdx : -(layIdx + 1));
+                    mtlColorAttDesc.depthPlane = depthPlane;
+                    mtlDepthAttDesc.depthPlane = depthPlane;
+                    mtlStencilAttDesc.depthPlane = depthPlane;
                 } else {
                     mtlColorAttDesc.slice = mvkIBR.region.dstSubresource.baseArrayLayer + layIdx;
+                    mtlDepthAttDesc.slice = mvkIBR.region.dstSubresource.baseArrayLayer + layIdx;
+                    mtlStencilAttDesc.slice = mvkIBR.region.dstSubresource.baseArrayLayer + layIdx;
                 }
                 id<MTLRenderCommandEncoder> mtlRendEnc = [cmdEncoder->_mtlCmdBuffer renderCommandEncoderWithDescriptor: mtlRPD];
                 setLabelIfNotNil(mtlRendEnc, mvkMTLRenderCommandEncoderLabel(commandUse));
@@ -494,8 +519,34 @@
                 }
                 [mtlRendEnc pushDebugGroup: @"vkCmdBlitImage"];
                 [mtlRendEnc setRenderPipelineState: mtlRPS];
+                [mtlRendEnc setDepthStencilState: mtlDSS];
                 cmdEncoder->setVertexBytes(mtlRendEnc, mvkIBR.vertices, sizeof(mvkIBR.vertices), vtxBuffIdx);
-                [mtlRendEnc setFragmentTexture: srcMTLTex atIndex: 0];
+                if (!mvkIsOnlyAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+                    [mtlRendEnc setFragmentTexture: srcMTLTex atIndex: 0];
+                }
+                if (isBlittingStencil) {
+                    // For stencil blits of packed depth/stencil images, I need to use a stencil view. 
+                    MVKPixelFormats* pixFmts = cmdEncoder->getPixelFormats();
+                    if (pixFmts->isDepthFormat(blitKey.getSrcMTLPixelFormat()) &&
+                        pixFmts->isStencilFormat(blitKey.getSrcMTLPixelFormat())) {
+                        MTLPixelFormat stencilFmt = blitKey.getSrcMTLPixelFormat();
+                        if (stencilFmt == MTLPixelFormatDepth32Float_Stencil8) {
+
+                            stencilFmt = MTLPixelFormatX32_Stencil8;
+#if MVK_MACOS
+                        } else if (stencilFmt == MTLPixelFormatDepth24Unorm_Stencil8) {
+                            stencilFmt = MTLPixelFormatX24_Stencil8;
+#endif
+                        }
+                        id<MTLTexture> stencilMTLTex = [srcMTLTex newTextureViewWithPixelFormat: stencilFmt];
+                        [cmdEncoder->_mtlCmdBuffer addCompletedHandler: ^(id<MTLCommandBuffer>) {
+                            [stencilMTLTex release];
+                        }];
+                        [mtlRendEnc setFragmentTexture: stencilMTLTex atIndex: 1];
+                    } else {
+                        [mtlRendEnc setFragmentTexture: srcMTLTex atIndex: 1];
+                    }
+                }
 
                 struct {
                     uint slice;
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandResourceFactory.h b/MoltenVK/MoltenVK/Commands/MVKCommandResourceFactory.h
index 105483f..15584e1 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandResourceFactory.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandResourceFactory.h
@@ -38,15 +38,17 @@
  */
 typedef struct MVKRPSKeyBlitImg {
 	uint16_t srcMTLPixelFormat = 0;			/**< as MTLPixelFormat */
-	uint16_t srcMTLTextureType = 0;			/**< as MTLTextureType */
 	uint16_t dstMTLPixelFormat = 0;			/**< as MTLPixelFormat */
+	uint8_t srcMTLTextureType = 0;			/**< as MTLTextureType */
+	uint8_t srcAspect = 0;					/**< as VkImageAspectFlags */
 	uint8_t srcFilter = 0;					/**< as MTLSamplerMinMagFilter */
 	uint8_t dstSampleCount = 0;
 
 	bool operator==(const MVKRPSKeyBlitImg& rhs) const {
 		if (srcMTLPixelFormat != rhs.srcMTLPixelFormat) { return false; }
-		if (srcMTLTextureType != rhs.srcMTLTextureType) { return false; }
 		if (dstMTLPixelFormat != rhs.dstMTLPixelFormat) { return false; }
+		if (srcMTLTextureType != rhs.srcMTLTextureType) { return false; }
+		if (srcAspect != rhs.srcAspect) { return false; }
 		if (srcFilter != rhs.srcFilter) { return false; }
 		if (dstSampleCount != rhs.dstSampleCount) { return false; }
 		return true;
@@ -72,10 +74,13 @@
 		std::size_t hash = srcMTLPixelFormat;
 
 		hash <<= 16;
+		hash |= dstMTLPixelFormat;
+
+		hash <<= 8;
 		hash |= srcMTLTextureType;
 
-		hash <<= 16;
-		hash |= dstMTLPixelFormat;
+		hash <<= 8;
+		hash |= srcAspect;
 
 		hash <<= 8;
 		hash |= srcFilter;
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandResourceFactory.mm b/MoltenVK/MoltenVK/Commands/MVKCommandResourceFactory.mm
index 51640ae..0a42be0 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandResourceFactory.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandResourceFactory.mm
@@ -42,7 +42,15 @@
 	plDesc.fragmentFunction = fragFunc;
 	plDesc.sampleCount = blitKey.dstSampleCount;
 
-	plDesc.colorAttachments[0].pixelFormat = blitKey.getDstMTLPixelFormat();
+	if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_DEPTH_BIT))) {
+		plDesc.depthAttachmentPixelFormat = blitKey.getDstMTLPixelFormat();
+	}
+	if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+		plDesc.stencilAttachmentPixelFormat = blitKey.getDstMTLPixelFormat();
+	}
+	if (!mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
+		plDesc.colorAttachments[0].pixelFormat = blitKey.getDstMTLPixelFormat();
+	}
 
     MTLVertexDescriptor* vtxDesc = plDesc.vertexDescriptor;
 
@@ -158,8 +166,12 @@
 
 		bool isArrayType = blitKey.isSrcArrayType();
 		bool isLinearFilter = (blitKey.getSrcMTLSamplerMinMagFilter() == MTLSamplerMinMagFilterLinear);
+		NSString* typePrefix = @"texture";
 		NSString* typeSuffix;
 		NSString* coordArg;
+		if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_DEPTH_BIT))) {
+			typePrefix = @"depth";
+		}
 		switch (blitKey.getSrcMTLTextureType()) {
 			case MTLTextureType1D:
 				typeSuffix = @"1d";
@@ -199,22 +211,59 @@
 		[msl appendLineMVK: @"} VaryingsPosTex;"];
 		[msl appendLineMVK];
 		[msl appendLineMVK: @"typedef struct {"];
+		if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_DEPTH_BIT))) {
+			[msl appendFormat: @"    %@ depth [[depth(any)]];", typeStr];
+			[msl appendLineMVK];
+		}
+		if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+			[msl appendLineMVK: @"    uint stencil [[stencil]];"];
+		}
+		if (!mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
+			[msl appendFormat: @"    %@4 color [[color(0)]];", typeStr];
+			[msl appendLineMVK];
+		}
+		[msl appendLineMVK: @"} FragmentOutputs;"];
+		[msl appendLineMVK];
+		[msl appendLineMVK: @"typedef struct {"];
 		[msl appendLineMVK: @"    uint slice;"];
 		[msl appendLineMVK: @"    float lod;"];
 		[msl appendLineMVK: @"} TexSubrez;"];
 		[msl appendLineMVK];
 
-		[msl appendFormat: @"constexpr sampler ce_sampler(mip_filter::nearest, filter::%@);", srcFilter];
-		[msl appendLineMVK];
+		if (!mvkIsOnlyAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+			[msl appendFormat: @"constexpr sampler ce_sampler(mip_filter::nearest, filter::%@);", srcFilter];
+			[msl appendLineMVK];
+		}
+		if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+			[msl appendLineMVK: @"constexpr sampler ce_stencil_sampler(mip_filter::nearest);"];
+		}
 
 		NSString* funcName = @"fragCmdBlitImage";
-		[msl appendFormat: @"fragment %@4 %@(VaryingsPosTex varyings [[stage_in]],", typeStr, funcName];
+		[msl appendFormat: @"fragment FragmentOutputs %@(VaryingsPosTex varyings [[stage_in]],", funcName];
 		[msl appendLineMVK];
-		[msl appendFormat: @"                         texture%@<%@> tex [[texture(0)]],", typeSuffix, typeStr];
-		[msl appendLineMVK];
+		if (!mvkIsOnlyAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+			[msl appendFormat: @"                         %@%@<%@> tex [[texture(0)]],", typePrefix, typeSuffix, typeStr];
+			[msl appendLineMVK];
+		}
+		if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+			[msl appendFormat: @"                         texture%@<uint> stencilTex [[texture(1)]],", typeSuffix];
+			[msl appendLineMVK];
+		}
 		[msl appendLineMVK: @"                         constant TexSubrez& subRez [[buffer(0)]]) {"];
-		[msl appendFormat: @"    return tex.sample(ce_sampler, varyings.v_texCoord%@%@, level(subRez.lod));", coordArg, sliceArg];
-		[msl appendLineMVK];
+		[msl appendLineMVK: @"    FragmentOutputs out;"];
+		if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_DEPTH_BIT))) {
+			[msl appendFormat: @"    out.depth = tex.sample(ce_sampler, varyings.v_texCoord%@%@, level(subRez.lod));", coordArg, sliceArg];
+			[msl appendLineMVK];
+		}
+		if (mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_STENCIL_BIT))) {
+			[msl appendFormat: @"    out.stencil = stencilTex.sample(ce_stencil_sampler, varyings.v_texCoord%@%@, level(subRez.lod)).x;", coordArg, sliceArg];
+			[msl appendLineMVK];
+		}
+		if (!mvkIsAnyFlagEnabled(blitKey.srcAspect, (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
+			[msl appendFormat: @"    out.color = tex.sample(ce_sampler, varyings.v_texCoord%@%@, level(subRez.lod));", coordArg, sliceArg];
+			[msl appendLineMVK];
+		}
+		[msl appendLineMVK: @"    return out;"];
 		[msl appendLineMVK: @"}"];
 
 //		MVKLogDebug("\n%s", msl.UTF8String);
@@ -313,6 +362,7 @@
 		case kMVKFormatColorUInt16:		return @"ushort";
 		case kMVKFormatColorInt32:		return @"int";
 		case kMVKFormatColorUInt32:		return @"uint";
+		case kMVKFormatDepthStencil:	return @"float";
 		default:						return @"unexpected_type";
 	}
 }
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKPixelFormats.mm b/MoltenVK/MoltenVK/GPUObjects/MVKPixelFormats.mm
index bae44ce..f28d017 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKPixelFormats.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKPixelFormats.mm
@@ -1668,7 +1668,8 @@
 	kMVKVkFormatFeatureFlagsTexAtomic   = (VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT),
 	kMVKVkFormatFeatureFlagsTexColorAtt = (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
 										   VK_FORMAT_FEATURE_BLIT_DST_BIT),
-	kMVKVkFormatFeatureFlagsTexDSAtt    = (VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT),
+	kMVKVkFormatFeatureFlagsTexDSAtt    = (VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT |
+										   VK_FORMAT_FEATURE_BLIT_DST_BIT),
 	kMVKVkFormatFeatureFlagsTexBlend    = (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT),
     kMVKVkFormatFeatureFlagsTexTransfer          = (VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
                                                     VK_FORMAT_FEATURE_TRANSFER_DST_BIT),
@@ -1720,8 +1721,21 @@
 	enableFormatFeatures(DSAtt, Tex, mtlPixFmtCaps, vkProps.optimalTilingFeatures);
 	enableFormatFeatures(Blend, Tex, mtlPixFmtCaps, vkProps.optimalTilingFeatures);
 
-	if (chromaSubsamplingComponentBits > 0) {
+	if ( chromaSubsamplingComponentBits > 0 ||
+		// XXX We really want to use the device's Metal features instead of duplicating the
+		// logic from MVKPhysicalDevice, but those may not have been initialized yet.
+#if MVK_MACOS
+		 (isStencilFormat(vkDesc.mtlPixelFormat) && (!_physicalDevice || ![_physicalDevice->getMTLDevice() supportsFeatureSet: MTLFeatureSet_macOS_GPUFamily2_v1]))
+#endif
+#if MVK_IOS
+		 (isStencilFormat(vkDesc.mtlPixelFormat) && (!_physicalDevice || ![_physicalDevice->getMTLDevice() supportsFeatureSet: MTLFeatureSet_iOS_GPUFamily5_v1]))
+#endif
+#if MVK_TVOS
+		isStencilFormat(vkDesc.mtlPixelFormat)
+#endif
+		) {
 		// Vulkan forbids blits between chroma-subsampled formats.
+		// If we can't write the stencil reference from the shader, we can't blit stencil.
 		mvkDisableFlags(vkProps.optimalTilingFeatures, (VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT));
 	}