Merge branch 'master' of https://github.com/KhronosGroup/MoltenVK
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm b/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
index 87e1128..a1cb774 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdDraw.mm
@@ -110,7 +110,7 @@
if (pipeline->isTessellationPipeline()) {
inControlPointCount = pipeline->getInputControlPointCount();
outControlPointCount = pipeline->getOutputControlPointCount();
- patchCount = (uint32_t)mvkCeilingDivide(_vertexCount, inControlPointCount);
+ patchCount = mvkCeilingDivide(_vertexCount, inControlPointCount);
}
for (uint32_t s : stages) {
auto stage = MVKGraphicsStage(s);
@@ -308,7 +308,7 @@
if (pipeline->isTessellationPipeline()) {
inControlPointCount = pipeline->getInputControlPointCount();
outControlPointCount = pipeline->getOutputControlPointCount();
- patchCount = (uint32_t)mvkCeilingDivide(_indexCount, inControlPointCount);
+ patchCount = mvkCeilingDivide(_indexCount, inControlPointCount);
}
for (uint32_t s : stages) {
auto stage = MVKGraphicsStage(s);
@@ -544,7 +544,7 @@
inControlPointCount = pipeline->getInputControlPointCount();
outControlPointCount = pipeline->getOutputControlPointCount();
vertexCount = kMVKDrawIndirectVertexCountUpperBound;
- patchCount = (uint32_t)mvkCeilingDivide(vertexCount, inControlPointCount);
+ patchCount = mvkCeilingDivide(vertexCount, inControlPointCount);
VkDeviceSize indirectSize = (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments)) * _drawCount;
if (cmdEncoder->_pDeviceMetalFeatures->mslVersion >= 20100) {
indirectSize += sizeof(MTLStageInRegionIndirectArguments) * _drawCount;
@@ -614,7 +614,7 @@
&_drawCount,
sizeof(_drawCount),
5);
- [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
+ [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
}
@@ -783,7 +783,7 @@
inControlPointCount = pipeline->getInputControlPointCount();
outControlPointCount = pipeline->getOutputControlPointCount();
vertexCount = kMVKDrawIndirectVertexCountUpperBound;
- patchCount = (uint32_t)mvkCeilingDivide(vertexCount, inControlPointCount);
+ patchCount = mvkCeilingDivide(vertexCount, inControlPointCount);
VkDeviceSize indirectSize = (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments)) * _drawCount;
if (cmdEncoder->_pDeviceMetalFeatures->mslVersion >= 20100) {
indirectSize += sizeof(MTLStageInRegionIndirectArguments) * _drawCount;
@@ -842,7 +842,7 @@
&_drawCount,
sizeof(_drawCount),
5);
- [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
+ [mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
}
// We actually need to make a copy of the index buffer, regardless of whether
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index e15467f..82ced58 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -825,15 +825,15 @@
// One thread is run per block. Each block decompresses to an m x n array of texels.
// So the size of the grid is (ceil(width/m), ceil(height/n), depth).
VkExtent2D blockExtent = mvkMTLPixelFormatBlockTexelSize(mtlPixFmt);
- MTLSize mtlGridSize = MTLSizeMake(mvkCeilingDivide(mtlTxtSize.width, blockExtent.width),
- mvkCeilingDivide(mtlTxtSize.height, blockExtent.height),
+ MTLSize mtlGridSize = MTLSizeMake(mvkCeilingDivide<NSUInteger>(mtlTxtSize.width, blockExtent.width),
+ mvkCeilingDivide<NSUInteger>(mtlTxtSize.height, blockExtent.height),
mtlTxtSize.depth);
// Use four times the thread execution width as the threadgroup size.
MTLSize mtlTgrpSize = MTLSizeMake(2, 2, mtlComputeState.threadExecutionWidth);
// Then the number of threadgroups is (ceil(x/2), ceil(y/2), ceil(z/t)),
// where 't' is the thread execution width.
- mtlGridSize.width = mvkCeilingDivide(mtlGridSize.width, 2);
- mtlGridSize.height = mvkCeilingDivide(mtlGridSize.height, 2);
+ mtlGridSize.width = mvkCeilingDivide(mtlGridSize.width, mtlTgrpSize.width);
+ mtlGridSize.height = mvkCeilingDivide(mtlGridSize.height, mtlTgrpSize.height);
mtlGridSize.depth = mvkCeilingDivide(mtlGridSize.depth, mtlTgrpSize.depth);
// There may be extra threads, but that's OK; the shader does bounds checking to
// ensure it doesn't try to write out of bounds.
diff --git a/MoltenVK/MoltenVK/Utility/MVKFoundation.h b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
index 0fddedd..4481344 100644
--- a/MoltenVK/MoltenVK/Utility/MVKFoundation.h
+++ b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
@@ -141,12 +141,6 @@
#pragma mark -
#pragma mark Alignment functions
-/** Returns the result of an unsigned integer division, rounded up. */
-static inline size_t mvkCeilingDivide(size_t numerator, size_t denominator) {
- if (denominator == 1) { return numerator; } // Short circuit for this very common usecase.
- return (numerator + denominator - 1) / denominator;
-}
-
/** Returns whether the specified value is a power-of-two. */
static inline bool mvkIsPowerOfTwo(uintptr_t value) {
// Test POT: (x != 0) && ((x & (x - 1)) == 0)
@@ -348,6 +342,13 @@
return std::min(std::max(val, lower), upper);
}
+/** Returns numerator / denominator rounded up. T must be an integral type and denominator must be positive. */
+template<typename T>
+T mvkCeilingDivide(T numerator, T denominator) {
+ // Avoid (numerator + denominator - 1): it wraps when T is narrow (e.g. uint32_t) and numerator is near its max.
+ return (numerator / denominator) + (((numerator % denominator) > 0) ? 1 : 0);
+}
+
/**
* Returns a hash value calculated from the specified array of numeric elements,
* using the DJB2a algorithm: hash = (hash * 33) ^ value.