Merge pull request #785 from billhollings/master

Fix tessellation break when control stage declares but does not use position builtin.
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index a7298a3..071c359 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -19,11 +19,14 @@
 
 Released TBD
 
+- Add support for extensions:
+	- `VK_EXT_inline_uniform_block`
 - Support linear filtering when using `vkCmdBlitImage()`.
 - Clamp image copy extents to image extent.
 - Fix crash in `fetchDependencies` on build paths containing spaces.
 - Fix image subresource sizing calculations for heap-based textures.
 - Fix `MTLHeap` memory leak in `MVKDeviceMemory`.
+- Fix tessellation break when control stage declares but does not use position builtin.
 - Support *Xcode 11.2*.
 
 
diff --git a/ExternalRevisions/SPIRV-Cross_repo_revision b/ExternalRevisions/SPIRV-Cross_repo_revision
index 7ab941b..57d5a9f 100644
--- a/ExternalRevisions/SPIRV-Cross_repo_revision
+++ b/ExternalRevisions/SPIRV-Cross_repo_revision
@@ -1 +1 @@
-b10f5d48c959c5600c5e103a6b955b6b1b59cdff
+96a276c2ca86e81530313786e99e2b95623898ca
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm b/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
index 87e1128..a1cb774 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
@@ -110,7 +110,7 @@
     if (pipeline->isTessellationPipeline()) {
         inControlPointCount = pipeline->getInputControlPointCount();
         outControlPointCount = pipeline->getOutputControlPointCount();
-        patchCount = (uint32_t)mvkCeilingDivide(_vertexCount, inControlPointCount);
+        patchCount = mvkCeilingDivide(_vertexCount, inControlPointCount);
     }
     for (uint32_t s : stages) {
         auto stage = MVKGraphicsStage(s);
@@ -308,7 +308,7 @@
     if (pipeline->isTessellationPipeline()) {
         inControlPointCount = pipeline->getInputControlPointCount();
         outControlPointCount = pipeline->getOutputControlPointCount();
-        patchCount = (uint32_t)mvkCeilingDivide(_indexCount, inControlPointCount);
+        patchCount = mvkCeilingDivide(_indexCount, inControlPointCount);
     }
     for (uint32_t s : stages) {
         auto stage = MVKGraphicsStage(s);
@@ -544,7 +544,7 @@
         inControlPointCount = pipeline->getInputControlPointCount();
         outControlPointCount = pipeline->getOutputControlPointCount();
         vertexCount = kMVKDrawIndirectVertexCountUpperBound;
-        patchCount = (uint32_t)mvkCeilingDivide(vertexCount, inControlPointCount);
+        patchCount = mvkCeilingDivide(vertexCount, inControlPointCount);
         VkDeviceSize indirectSize = (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments)) * _drawCount;
         if (cmdEncoder->_pDeviceMetalFeatures->mslVersion >= 20100) {
             indirectSize += sizeof(MTLStageInRegionIndirectArguments) * _drawCount;
@@ -614,7 +614,7 @@
                                             &_drawCount,
                                             sizeof(_drawCount),
                                             5);
-                [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
+                [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
                                   threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
             }
 
@@ -783,7 +783,7 @@
         inControlPointCount = pipeline->getInputControlPointCount();
         outControlPointCount = pipeline->getOutputControlPointCount();
         vertexCount = kMVKDrawIndirectVertexCountUpperBound;
-        patchCount = (uint32_t)mvkCeilingDivide(vertexCount, inControlPointCount);
+        patchCount = mvkCeilingDivide(vertexCount, inControlPointCount);
         VkDeviceSize indirectSize = (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments)) * _drawCount;
         if (cmdEncoder->_pDeviceMetalFeatures->mslVersion >= 20100) {
             indirectSize += sizeof(MTLStageInRegionIndirectArguments) * _drawCount;
@@ -842,7 +842,7 @@
                                                 &_drawCount,
                                                 sizeof(_drawCount),
                                                 5);
-                    [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
+                    [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
                                       threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
                 }
                 // We actually need to make a copy of the index buffer, regardless of whether
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index e15467f..82ced58 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -825,15 +825,15 @@
             // One thread is run per block. Each block decompresses to an m x n array of texels.
             // So the size of the grid is (ceil(width/m), ceil(height/n), depth).
             VkExtent2D blockExtent = mvkMTLPixelFormatBlockTexelSize(mtlPixFmt);
-            MTLSize mtlGridSize = MTLSizeMake(mvkCeilingDivide(mtlTxtSize.width, blockExtent.width),
-                                              mvkCeilingDivide(mtlTxtSize.height, blockExtent.height),
+            MTLSize mtlGridSize = MTLSizeMake(mvkCeilingDivide<NSUInteger>(mtlTxtSize.width, blockExtent.width),
+                                              mvkCeilingDivide<NSUInteger>(mtlTxtSize.height, blockExtent.height),
                                               mtlTxtSize.depth);
             // Use four times the thread execution width as the threadgroup size.
             MTLSize mtlTgrpSize = MTLSizeMake(2, 2, mtlComputeState.threadExecutionWidth);
             // Then the number of threadgroups is (ceil(x/2), ceil(y/2), ceil(z/t)),
             // where 't' is the thread execution width.
-            mtlGridSize.width = mvkCeilingDivide(mtlGridSize.width, 2);
-            mtlGridSize.height = mvkCeilingDivide(mtlGridSize.height, 2);
+            mtlGridSize.width = mvkCeilingDivide(mtlGridSize.width, mtlTgrpSize.width);
+            mtlGridSize.height = mvkCeilingDivide(mtlGridSize.height, mtlTgrpSize.height);
             mtlGridSize.depth = mvkCeilingDivide(mtlGridSize.depth, mtlTgrpSize.depth);
             // There may be extra threads, but that's OK; the shader does bounds checking to
             // ensure it doesn't try to write out of bounds.
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
index 48a3f6e..5dfa27f 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
@@ -507,9 +507,10 @@
 }
 
 static uint32_t sizeOfOutput(const SPIRVShaderOutput& output) {
+	if ( !output.isUsed ) { return 0; }		// Unused outputs consume no buffer space.
+
 	uint32_t vecWidth = output.vecWidth;
-	// Round up to 4 elements for 3-vectors, since that reflects how Metal lays them out.
-	if (vecWidth == 3) { vecWidth = 4; }
+	if (vecWidth == 3) { vecWidth = 4; }	// Metal 3-vectors consume same as 4-vectors.
 	switch (output.baseType) {
 		case SPIRType::SByte:
 		case SPIRType::UByte:
diff --git a/MoltenVK/MoltenVK/Utility/MVKFoundation.h b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
index 0fddedd..4481344 100644
--- a/MoltenVK/MoltenVK/Utility/MVKFoundation.h
+++ b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
@@ -141,12 +141,6 @@
 #pragma mark -
 #pragma mark Alignment functions
 
-/** Returns the result of an unsigned integer division, rounded up. */
-static inline size_t mvkCeilingDivide(size_t numerator, size_t denominator) {
-	if (denominator == 1) { return numerator; }		// Short circuit for this very common usecase.
-	return (numerator + denominator - 1) / denominator;
-}
-
 /** Returns whether the specified value is a power-of-two. */
 static inline bool mvkIsPowerOfTwo(uintptr_t value) {
 	// Test POT:  (x != 0) && ((x & (x - 1)) == 0)
@@ -348,6 +342,13 @@
     return std::min(std::max(val, lower), upper);
 }
 
+/** Returns the result of a division, rounded up. */
+template<typename T>
+T mvkCeilingDivide(T numerator, T denominator) {
+	// Short circuit very common usecase of dividing by one.
+	return (denominator == 1) ? numerator : (numerator + denominator - 1) / denominator;
+}
+
 /**
  * Returns a hash value calculated from the specified array of numeric elements,
  * using the DJB2a algorithm:  hash = (hash * 33) ^ value.
diff --git a/MoltenVKShaderConverter/MoltenVKSPIRVToMSLConverter/SPIRVReflection.cpp b/MoltenVKShaderConverter/MoltenVKSPIRVToMSLConverter/SPIRVReflection.cpp
index d6eefce..4a58885 100644
--- a/MoltenVKShaderConverter/MoltenVKSPIRVToMSLConverter/SPIRVReflection.cpp
+++ b/MoltenVKShaderConverter/MoltenVKSPIRVToMSLConverter/SPIRVReflection.cpp
@@ -129,6 +129,7 @@
 			reflect.set_entry_point(entryName, model);
 		}
 		reflect.compile();
+		reflect.update_active_builtins();
 
 		outputs.clear();
 
@@ -136,11 +137,13 @@
 		parser.get_parsed_ir().for_each_typed_id<SPIRV_CROSS_NAMESPACE::SPIRVariable>([&reflect, &outputs, model, addSat](uint32_t varID, const SPIRV_CROSS_NAMESPACE::SPIRVariable& var) {
 			if (var.storage != spv::StorageClassOutput) { return; }
 
+			bool isUsed = true;
 			const auto* type = &reflect.get_type(reflect.get_type_from_variable(varID).parent_type);
 			bool patch = reflect.has_decoration(varID, spv::DecorationPatch);
 			auto biType = spv::BuiltInMax;
 			if (reflect.has_decoration(varID, spv::DecorationBuiltIn)) {
 				biType = (spv::BuiltIn)reflect.get_decoration(varID, spv::DecorationBuiltIn);
+				isUsed = reflect.has_active_builtin(biType, var.storage);
 			}
 			uint32_t loc = -1;
 			if (reflect.has_decoration(varID, spv::DecorationLocation)) {
@@ -160,30 +163,31 @@
 					patch = reflect.has_member_decoration(type->self, i, spv::DecorationPatch);
 					if (reflect.has_member_decoration(type->self, i, spv::DecorationBuiltIn)) {
 						biType = (spv::BuiltIn)reflect.get_member_decoration(type->self, i, spv::DecorationBuiltIn);
+						isUsed = reflect.has_active_builtin(biType, var.storage);
 					}
 					const SPIRV_CROSS_NAMESPACE::SPIRType& memberType = reflect.get_type(type->member_types[i]);
 					if (memberType.columns > 1) {
 						for (uint32_t i = 0; i < memberType.columns; i++) {
-							outputs.push_back({memberType.basetype, memberType.vecsize, addSat(memberLoc, i), patch, biType});
+							outputs.push_back({memberType.basetype, memberType.vecsize, addSat(memberLoc, i), biType, patch, isUsed});
 						}
 					} else if (!memberType.array.empty()) {
 						for (uint32_t i = 0; i < memberType.array[0]; i++) {
-							outputs.push_back({memberType.basetype, memberType.vecsize, addSat(memberLoc, i), patch, biType});
+							outputs.push_back({memberType.basetype, memberType.vecsize, addSat(memberLoc, i), biType, patch, isUsed});
 						}
 					} else {
-						outputs.push_back({memberType.basetype, memberType.vecsize, memberLoc, patch, biType});
+						outputs.push_back({memberType.basetype, memberType.vecsize, memberLoc, biType, patch, isUsed});
 					}
 				}
 			} else if (type->columns > 1) {
 				for (uint32_t i = 0; i < type->columns; i++) {
-					outputs.push_back({type->basetype, type->vecsize, addSat(loc, i), patch, biType});
+					outputs.push_back({type->basetype, type->vecsize, addSat(loc, i), biType, patch, isUsed});
 				}
 			} else if (!type->array.empty()) {
 				for (uint32_t i = 0; i < type->array[0]; i++) {
-					outputs.push_back({type->basetype, type->vecsize, addSat(loc, i), patch, biType});
+					outputs.push_back({type->basetype, type->vecsize, addSat(loc, i), biType, patch, isUsed});
 				}
 			} else {
-				outputs.push_back({type->basetype, type->vecsize, loc, patch, biType});
+				outputs.push_back({type->basetype, type->vecsize, loc, biType, patch, isUsed});
 			}
 		});
 		// Sort outputs by ascending location.
diff --git a/MoltenVKShaderConverter/MoltenVKSPIRVToMSLConverter/SPIRVReflection.h b/MoltenVKShaderConverter/MoltenVKSPIRVToMSLConverter/SPIRVReflection.h
index f8867c6..cb85311 100644
--- a/MoltenVKShaderConverter/MoltenVKSPIRVToMSLConverter/SPIRVReflection.h
+++ b/MoltenVKShaderConverter/MoltenVKSPIRVToMSLConverter/SPIRVReflection.h
@@ -61,11 +61,14 @@
 		/** The location number of the output. */
 		uint32_t location;
 
+		/** If this is a builtin, the kind of builtin this is. */
+		spv::BuiltIn builtin;
+
 		/** Whether this is a per-patch or per-vertex output. Only meaningful for tessellation control shaders. */
 		bool perPatch;
 
-		/** If this is a builtin, the kind of builtin this is. */
-		spv::BuiltIn builtin;
+		/** Whether this output is actually used (populated) by the shader. */
+		bool isUsed;
 	};
 
 #pragma mark -