Merge branch 'master' of https://github.com/KhronosGroup/MoltenVK
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm b/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
index 87e1128..a1cb774 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
@@ -110,7 +110,7 @@
     if (pipeline->isTessellationPipeline()) {
         inControlPointCount = pipeline->getInputControlPointCount();
         outControlPointCount = pipeline->getOutputControlPointCount();
-        patchCount = (uint32_t)mvkCeilingDivide(_vertexCount, inControlPointCount);
+        patchCount = mvkCeilingDivide(_vertexCount, inControlPointCount);
     }
     for (uint32_t s : stages) {
         auto stage = MVKGraphicsStage(s);
@@ -308,7 +308,7 @@
     if (pipeline->isTessellationPipeline()) {
         inControlPointCount = pipeline->getInputControlPointCount();
         outControlPointCount = pipeline->getOutputControlPointCount();
-        patchCount = (uint32_t)mvkCeilingDivide(_indexCount, inControlPointCount);
+        patchCount = mvkCeilingDivide(_indexCount, inControlPointCount);
     }
     for (uint32_t s : stages) {
         auto stage = MVKGraphicsStage(s);
@@ -544,7 +544,7 @@
         inControlPointCount = pipeline->getInputControlPointCount();
         outControlPointCount = pipeline->getOutputControlPointCount();
         vertexCount = kMVKDrawIndirectVertexCountUpperBound;
-        patchCount = (uint32_t)mvkCeilingDivide(vertexCount, inControlPointCount);
+        patchCount = mvkCeilingDivide(vertexCount, inControlPointCount);
         VkDeviceSize indirectSize = (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments)) * _drawCount;
         if (cmdEncoder->_pDeviceMetalFeatures->mslVersion >= 20100) {
             indirectSize += sizeof(MTLStageInRegionIndirectArguments) * _drawCount;
@@ -614,7 +614,7 @@
                                             &_drawCount,
                                             sizeof(_drawCount),
                                             5);
-                [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
+                [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
                                   threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
             }
 
@@ -783,7 +783,7 @@
         inControlPointCount = pipeline->getInputControlPointCount();
         outControlPointCount = pipeline->getOutputControlPointCount();
         vertexCount = kMVKDrawIndirectVertexCountUpperBound;
-        patchCount = (uint32_t)mvkCeilingDivide(vertexCount, inControlPointCount);
+        patchCount = mvkCeilingDivide(vertexCount, inControlPointCount);
         VkDeviceSize indirectSize = (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments)) * _drawCount;
         if (cmdEncoder->_pDeviceMetalFeatures->mslVersion >= 20100) {
             indirectSize += sizeof(MTLStageInRegionIndirectArguments) * _drawCount;
@@ -842,7 +842,7 @@
                                                 &_drawCount,
                                                 sizeof(_drawCount),
                                                 5);
-                    [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
+                    [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
                                       threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
                 }
                 // We actually need to make a copy of the index buffer, regardless of whether
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index e15467f..82ced58 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -825,15 +825,15 @@
             // One thread is run per block. Each block decompresses to an m x n array of texels.
             // So the size of the grid is (ceil(width/m), ceil(height/n), depth).
             VkExtent2D blockExtent = mvkMTLPixelFormatBlockTexelSize(mtlPixFmt);
-            MTLSize mtlGridSize = MTLSizeMake(mvkCeilingDivide(mtlTxtSize.width, blockExtent.width),
-                                              mvkCeilingDivide(mtlTxtSize.height, blockExtent.height),
+            MTLSize mtlGridSize = MTLSizeMake(mvkCeilingDivide<NSUInteger>(mtlTxtSize.width, blockExtent.width),
+                                              mvkCeilingDivide<NSUInteger>(mtlTxtSize.height, blockExtent.height),
                                               mtlTxtSize.depth);
             // Use four times the thread execution width as the threadgroup size.
             MTLSize mtlTgrpSize = MTLSizeMake(2, 2, mtlComputeState.threadExecutionWidth);
             // Then the number of threadgroups is (ceil(x/2), ceil(y/2), ceil(z/t)),
             // where 't' is the thread execution width.
-            mtlGridSize.width = mvkCeilingDivide(mtlGridSize.width, 2);
-            mtlGridSize.height = mvkCeilingDivide(mtlGridSize.height, 2);
+            mtlGridSize.width = mvkCeilingDivide(mtlGridSize.width, mtlTgrpSize.width);
+            mtlGridSize.height = mvkCeilingDivide(mtlGridSize.height, mtlTgrpSize.height);
             mtlGridSize.depth = mvkCeilingDivide(mtlGridSize.depth, mtlTgrpSize.depth);
             // There may be extra threads, but that's OK; the shader does bounds checking to
             // ensure it doesn't try to write out of bounds.
diff --git a/MoltenVK/MoltenVK/Utility/MVKFoundation.h b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
index 0fddedd..4481344 100644
--- a/MoltenVK/MoltenVK/Utility/MVKFoundation.h
+++ b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
@@ -141,12 +141,6 @@
 #pragma mark -
 #pragma mark Alignment functions
 
-/** Returns the result of an unsigned integer division, rounded up. */
-static inline size_t mvkCeilingDivide(size_t numerator, size_t denominator) {
-	if (denominator == 1) { return numerator; }		// Short circuit for this very common usecase.
-	return (numerator + denominator - 1) / denominator;
-}
-
 /** Returns whether the specified value is a power-of-two. */
 static inline bool mvkIsPowerOfTwo(uintptr_t value) {
 	// Test POT:  (x != 0) && ((x & (x - 1)) == 0)
@@ -348,6 +342,13 @@
     return std::min(std::max(val, lower), upper);
 }
 
+/** Returns numerator / denominator rounded up. Expects non-negative integers and a non-zero denominator. */
+template<typename T>
+T mvkCeilingDivide(T numerator, T denominator) {
+	// Truncated quotient, plus one if a remainder is left. Unlike (n + d - 1) / d, this cannot overflow T.
+	return numerator / denominator + ((numerator % denominator) ? 1 : 0);
+}
+
 /**
  * Returns a hash value calculated from the specified array of numeric elements,
  * using the DJB2a algorithm:  hash = (hash * 33) ^ value.