Align flattened shader inputs to previous stage output structs.

When shader inputs are flattened for stage_in and are read from a buffer that
an earlier stage populated as nested structs, those structs are aligned
according to C++ rules, which can affect the alignment of the first member of
the flattened input struct.

Add SPIRVShaderOutput::firstStructMemberAlignment to track the alignment
requirements of the first member of a nested structure, and recursively
determine the alignment of the first member of each nested output structure.

Move sizeOfOutput() from MVKPipeline.mm to SPIRVReflection.h,
rename to getShaderOutputSize(), and add getShaderOutputAlignment()
to extract member alignment.
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index edf3769..0c77a8c 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -20,6 +20,7 @@
 
 - Update *glslang* version, to use `python3` in *glslang* scripts, to replace missing `python` on *macOS 12.3*.
 - Remove logged warning if MoltenVK does not support `VkApplicationInfo::apiVersion` value.
+- Fix alignment of outputs and inputs between shader stages when using nested structures.
 
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
index 3d8d1ff..d9032f8 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
@@ -635,27 +635,6 @@
 	return plDesc;
 }
 
-static uint32_t sizeOfOutput(const SPIRVShaderOutput& output) {
-	if ( !output.isUsed ) { return 0; }		// Unused outputs consume no buffer space.
-
-	uint32_t vecWidth = output.vecWidth;
-	if (vecWidth == 3) { vecWidth = 4; }	// Metal 3-vectors consume same as 4-vectors.
-	switch (output.baseType) {
-		case SPIRType::SByte:
-		case SPIRType::UByte:
-			return 1 * vecWidth;
-		case SPIRType::Short:
-		case SPIRType::UShort:
-		case SPIRType::Half:
-			return 2 * vecWidth;
-		case SPIRType::Int:
-		case SPIRType::UInt:
-		case SPIRType::Float:
-		default:
-			return 4 * vecWidth;
-	}
-}
-
 static VkFormat mvkFormatFromOutput(const SPIRVShaderOutput& output) {
 	switch (output.baseType) {
 		case SPIRType::SByte:
@@ -818,10 +797,10 @@
 		if (!shaderConfig.isShaderInputLocationUsed(output.location)) {
 			if (output.perPatch && !(output.builtin == spv::BuiltInTessLevelOuter || output.builtin == spv::BuiltInTessLevelInner) ) {
 				if (!firstPatch) { firstPatch = &output; }
-				patchOffset += sizeOfOutput(output);
+				patchOffset += getShaderOutputSize(output);
 			} else if (!output.perPatch) {
 				if (!firstVertex) { firstVertex = &output; }
-				offset += sizeOfOutput(output);
+				offset += getShaderOutputSize(output);
 			}
 			continue;
 		}
@@ -853,30 +832,30 @@
 				plDesc.vertexDescriptor.attributes[location].format = MTLVertexFormatHalf2;	// FIXME Should use Float2
 			}
 		} else if (output.perPatch) {
-			patchOffset = (uint32_t)mvkAlignByteCount(patchOffset, sizeOfOutput(output));
+			patchOffset = (uint32_t)mvkAlignByteCount(patchOffset, getShaderOutputAlignment(output));
 			plDesc.vertexDescriptor.attributes[output.location].bufferIndex = kMVKTessEvalPatchInputBufferIndex;
 			plDesc.vertexDescriptor.attributes[output.location].format = getPixelFormats()->getMTLVertexFormat(mvkFormatFromOutput(output));
 			plDesc.vertexDescriptor.attributes[output.location].offset = patchOffset;
-			patchOffset += sizeOfOutput(output);
+			patchOffset += getShaderOutputSize(output);
 			if (!firstPatch) { firstPatch = &output; }
 			usedPerPatch = true;
 		} else {
-			offset = (uint32_t)mvkAlignByteCount(offset, sizeOfOutput(output));
+			offset = (uint32_t)mvkAlignByteCount(offset, getShaderOutputAlignment(output));
 			plDesc.vertexDescriptor.attributes[output.location].bufferIndex = kMVKTessEvalInputBufferIndex;
 			plDesc.vertexDescriptor.attributes[output.location].format = getPixelFormats()->getMTLVertexFormat(mvkFormatFromOutput(output));
 			plDesc.vertexDescriptor.attributes[output.location].offset = offset;
-			offset += sizeOfOutput(output);
+			offset += getShaderOutputSize(output);
 			if (!firstVertex) { firstVertex = &output; }
 			usedPerVertex = true;
 		}
 	}
 	if (usedPerVertex) {
 		plDesc.vertexDescriptor.layouts[kMVKTessEvalInputBufferIndex].stepFunction = MTLVertexStepFunctionPerPatchControlPoint;
-		plDesc.vertexDescriptor.layouts[kMVKTessEvalInputBufferIndex].stride = mvkAlignByteCount(offset, sizeOfOutput(*firstVertex));
+		plDesc.vertexDescriptor.layouts[kMVKTessEvalInputBufferIndex].stride = mvkAlignByteCount(offset, getShaderOutputAlignment(*firstVertex));
 	}
 	if (usedPerPatch) {
 		plDesc.vertexDescriptor.layouts[kMVKTessEvalPatchInputBufferIndex].stepFunction = MTLVertexStepFunctionPerPatch;
-		plDesc.vertexDescriptor.layouts[kMVKTessEvalPatchInputBufferIndex].stride = mvkAlignByteCount(patchOffset, sizeOfOutput(*firstPatch));
+		plDesc.vertexDescriptor.layouts[kMVKTessEvalPatchInputBufferIndex].stride = mvkAlignByteCount(patchOffset, getShaderOutputAlignment(*firstPatch));
 	}
 	if (outerLoc != (uint32_t)(-1) || innerLoc != (uint32_t)(-1)) {
 		plDesc.vertexDescriptor.layouts[kMVKTessEvalLevelBufferIndex].stepFunction = MTLVertexStepFunctionPerPatch;
diff --git a/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVReflection.h b/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVReflection.h
index 4e81bb8..06f78ec 100644
--- a/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVReflection.h
+++ b/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVReflection.h
@@ -32,7 +32,10 @@
 #pragma mark -
 #pragma mark SPIRVTessReflectionData
 
-	/** Reflection data for a pair of tessellation shaders. This contains the information needed to construct a tessellation pipeline. */
+	/**
+	 * Reflection data for a pair of tessellation shaders.
+	 * This contains the information needed to construct a tessellation pipeline.
+	 */
 	struct SPIRVTessReflectionData {
 		/** The partition mode, one of SpacingEqual, SpacingFractionalEven, or SpacingFractionalOdd. */
 		spv::ExecutionMode partitionMode = spv::ExecutionModeMax;
@@ -53,7 +56,11 @@
 #pragma mark -
 #pragma mark SPIRVShaderOutputData
 
-	/** Reflection data on a single output of a shader. This contains the information needed to construct a stage-input descriptor for the next stage of a pipeline. */
+	/**
+	 * Reflection data on a single output of a shader.
+	 * This contains the information needed to construct a
+	 * stage-input descriptor for the next stage of a pipeline.
+	 */
 	struct SPIRVShaderOutput {
 		/** The type of the output. */
 		SPIRV_CROSS_NAMESPACE::SPIRType::BaseType baseType;
@@ -67,6 +74,12 @@
 		/** The component index of the output. */
 		uint32_t component;
 
+		/**
+		 * If this is the first member of a struct, this will contain the alignment
+		 * of the struct containing this output, otherwise this will be zero.
+		 */
+		uint32_t firstStructMemberAlignment;
+
 		/** If this is a builtin, the kind of builtin this is. */
 		spv::BuiltIn builtin;
 
@@ -77,10 +90,14 @@
 		bool isUsed;
 	};
 
+
 #pragma mark -
 #pragma mark Functions
 
-	/** Given a tessellation control shader and a tessellation evaluation shader, both in SPIR-V format, returns tessellation reflection data. */
+	/**
+	 * Given a tessellation control shader and a tessellation evaluation shader,
+	 * both in SPIR-V format, returns tessellation reflection data.
+	 */
 	template<typename Vs>
 	static inline bool getTessReflectionData(const Vs& tesc, const std::string& tescEntryName,
 											 const Vs& tese, const std::string& teseEntryName,
@@ -173,14 +190,50 @@
 #endif
 	}
 
+	/** Returns the size in bytes of the output. */
+	static inline uint32_t getShaderOutputSize(const SPIRVShaderOutput& output) {
+		if ( !output.isUsed ) { return 0; }		// Unused outputs consume no buffer space.
+
+		uint32_t vecWidth = output.vecWidth;
+		if (vecWidth == 3) { vecWidth = 4; }	// Metal 3-vectors consume same as 4-vectors.
+		switch (output.baseType) {
+			case SPIRV_CROSS_NAMESPACE::SPIRType::SByte:
+			case SPIRV_CROSS_NAMESPACE::SPIRType::UByte:
+				return 1 * vecWidth;
+			case SPIRV_CROSS_NAMESPACE::SPIRType::Short:
+			case SPIRV_CROSS_NAMESPACE::SPIRType::UShort:
+			case SPIRV_CROSS_NAMESPACE::SPIRType::Half:
+				return 2 * vecWidth;
+			case SPIRV_CROSS_NAMESPACE::SPIRType::Int:
+			case SPIRV_CROSS_NAMESPACE::SPIRType::UInt:
+			case SPIRV_CROSS_NAMESPACE::SPIRType::Float:
+			default:
+				return 4 * vecWidth;
+		}
+	}
+
+	/**
+	 * Returns the alignment of the shader output, which typically matches the size of the output,
+	 * but the first member of a nested output struct may inherit special alignment from the struct.
+	 */
+	static inline uint32_t getShaderOutputAlignment(const SPIRVShaderOutput& output) {
+		if(output.firstStructMemberAlignment && output.isUsed) {
+			return output.firstStructMemberAlignment;
+		} else {
+			return getShaderOutputSize(output);
+		}
+	}
+
 	auto addSat = [](uint32_t a, uint32_t b) { return a == uint32_t(-1) ? a : a + b; };
 
 	template<typename Vo>
-	static inline uint32_t getShaderOutputStructMembers(const SPIRV_CROSS_NAMESPACE::CompilerReflection& reflect, Vo& outputs,
+	static inline uint32_t getShaderOutputStructMembers(const SPIRV_CROSS_NAMESPACE::CompilerReflection& reflect,
+														Vo& outputs, SPIRVShaderOutput* pParentFirstMember,
 														const SPIRV_CROSS_NAMESPACE::SPIRType* structType, spv::StorageClass storage,
 														bool patch, uint32_t loc) {
 		bool isUsed = true;
 		auto biType = spv::BuiltInMax;
+		SPIRVShaderOutput* pFirstMember = nullptr;
 		size_t mbrCnt = structType->member_types.size();
 		for (uint32_t mbrIdx = 0; mbrIdx < mbrCnt; mbrIdx++) {
 			// Each member may have a location decoration. If not, each member
@@ -197,15 +250,28 @@
 			}
 			const SPIRV_CROSS_NAMESPACE::SPIRType* type = &reflect.get_type(structType->member_types[mbrIdx]);
 			uint32_t elemCnt = (type->array.empty() ? 1 : type->array[0]) * type->columns;
-			for (uint32_t i = 0; i < elemCnt; i++) {
+			for (uint32_t elemIdx = 0; elemIdx < elemCnt; elemIdx++) {
 				if (type->basetype == SPIRV_CROSS_NAMESPACE::SPIRType::Struct)
-					loc = getShaderOutputStructMembers(reflect, outputs, type, storage, patch, loc);
+					loc = getShaderOutputStructMembers(reflect, outputs, pFirstMember, type, storage, patch, loc);
 				else {
-					outputs.push_back({type->basetype, type->vecsize, loc, cmp, biType, patch, isUsed});
+					// The alignment of a structure is the same as the largest member of the structure.
+					// Consequently, the first flattened member of a structure should align with structure itself.
+					outputs.push_back({type->basetype, type->vecsize, loc, cmp, 0, biType, patch, isUsed});
+					auto& currOutput = outputs.back();
+					if ( !pFirstMember ) { pFirstMember = &currOutput; }
+					pFirstMember->firstStructMemberAlignment = std::max(pFirstMember->firstStructMemberAlignment, getShaderOutputSize(currOutput));
 					loc = addSat(loc, 1);
 				}
 			}
 		}
+
+		// Set the parent's first member alignment to the largest alignment found so far.
+		if ( !pParentFirstMember ) {
+			pParentFirstMember = pFirstMember;
+		} else if (pParentFirstMember && pFirstMember) {
+			pParentFirstMember->firstStructMemberAlignment = std::max(pParentFirstMember->firstStructMemberAlignment, pFirstMember->firstStructMemberAlignment);
+		}
+
 		return loc;
 	}
 
@@ -252,10 +318,11 @@
 
 				uint32_t elemCnt = (type->array.empty() ? 1 : type->array[0]) * type->columns;
 				for (uint32_t i = 0; i < elemCnt; i++) {
-					if (type->basetype == SPIRV_CROSS_NAMESPACE::SPIRType::Struct)
-						loc = getShaderOutputStructMembers(reflect, outputs, type, storage, patch, loc);
-					else {
-						outputs.push_back({type->basetype, type->vecsize, loc, cmp, biType, patch, isUsed});
+					if (type->basetype == SPIRV_CROSS_NAMESPACE::SPIRType::Struct) {
+						SPIRVShaderOutput* pFirstMember = nullptr;
+						loc = getShaderOutputStructMembers(reflect, outputs, pFirstMember, type, storage, patch, loc);
+					} else {
+						outputs.push_back({type->basetype, type->vecsize, loc, cmp, 0, biType, patch, isUsed});
 						loc = addSat(loc, 1);
 					}
 				}