Merge pull request #1387 from billhollings/occlusion-query-fixes-for-M1

Occlusion query fixes for M1
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index 25c119f..64df3cc 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -33,6 +33,8 @@
 - Fix synchronization issue with locking `MTLArgumentEncoder` for Metal Argument Buffers.
 - Fix race condition on submission fence during device loss.
 - Fix crash using memoryless storage for input attachments on Apple Silicon.
+- Fix issue where M1 GPU does not support reusing Metal visibility buffer offsets
+  across separate render encoders within a single Metal command buffer (Vulkan submit).
 - On command buffer submission failure, if `MVKConfiguration::resumeLostDevice` enabled,  do not release 
   waits on `VkDevice`, and do not return `VK_ERROR_DEVICE_LOST`, unless `VkPhysicalDevice` is also lost.
 - Fix inconsistent handling of linear attachment decisions on Apple Silicon.
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
index 952c7ec..7d9c92f 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
@@ -47,6 +47,16 @@
 
 
 #pragma mark -
+#pragma mark MVKCommandEncodingContext
+
+/** Context for tracking information across multiple encodings. */
+typedef struct MVKCommandEncodingContext {
+	NSUInteger mtlVisibilityResultOffset = 0;
+	const MVKMTLBufferAllocation* visibilityResultBuffer = nullptr;
+} MVKCommandEncodingContext;
+
+
+#pragma mark -
 #pragma mark MVKCommandBuffer
 
 /** Represents a Vulkan command pool. */
@@ -83,7 +93,7 @@
 	inline MVKCommandPool* getCommandPool() { return _commandPool; }
 
 	/** Submit the commands in this buffer as part of the queue submission. */
-	void submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit);
+	void submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit, MVKCommandEncodingContext* pEncodingContext);
 
     /** Returns whether this command buffer can be submitted to a queue more than once. */
     inline bool getIsReusable() { return _isReusable; }
@@ -264,7 +274,7 @@
 	MVKVulkanAPIObject* getVulkanAPIObject() override { return _cmdBuffer->getVulkanAPIObject(); };
 
 	/** Encode commands from the command buffer onto the Metal command buffer. */
-	void encode(id<MTLCommandBuffer> mtlCmdBuff);
+	void encode(id<MTLCommandBuffer> mtlCmdBuff, MVKCommandEncodingContext* pEncodingContext);
 
 	/** Encode commands from the specified secondary command buffer onto the Metal command buffer. */
 	void encodeSecondary(MVKCommandBuffer* secondaryCmdBuffer);
@@ -407,6 +417,9 @@
 
 #pragma mark Dynamic encoding state accessed directly
 
+	/** Context for tracking information across multiple encodings. */
+	MVKCommandEncodingContext* _pEncodingContext;
+
     /** A reference to the Metal features supported by the device. */
     const MVKPhysicalDeviceMetalFeatures* _pDeviceMetalFeatures;
 
@@ -428,9 +441,6 @@
 	/** The current Metal render encoder. */
 	id<MTLRenderCommandEncoder> _mtlRenderEncoder;
 
-	/** The buffer used to hold occlusion query results in a render pass. */
-	const MVKMTLBufferAllocation* _visibilityResultMTLBuffer;
-
     /** Tracks the current graphics pipeline bound to the encoder. */
     MVKPipelineCommandEncoderState _graphicsPipelineState;
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
index 85f312b..18d2441 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
@@ -105,7 +105,8 @@
     _commandCount++;
 }
 
-void MVKCommandBuffer::submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit) {
+void MVKCommandBuffer::submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit,
+							  MVKCommandEncodingContext* pEncodingContext) {
 	if ( !canExecute() ) { return; }
 
 	if (_prefilledMTLCmdBuffer) {
@@ -113,7 +114,7 @@
 		clearPrefilledMTLCommandBuffer();
 	} else {
 		MVKCommandEncoder encoder(this);
-		encoder.encode(cmdBuffSubmit->getActiveMTLCommandBuffer());
+		encoder.encode(cmdBuffSubmit->getActiveMTLCommandBuffer(), pEncodingContext);
 	}
 
 	if ( !_supportsConcurrentExecution ) { _isExecutingNonConcurrently.clear(); }
@@ -150,8 +151,9 @@
 		uint32_t qIdx = 0;
 		_prefilledMTLCmdBuffer = _commandPool->newMTLCommandBuffer(qIdx);	// retain
 
+		MVKCommandEncodingContext encodingContext;
 		MVKCommandEncoder encoder(this);
-		encoder.encode(_prefilledMTLCmdBuffer);
+		encoder.encode(_prefilledMTLCmdBuffer, &encodingContext);
 
 		// Once encoded onto Metal, if this command buffer is not reusable, we don't need the
 		// MVKCommand instances anymore, so release them in order to reduce memory pressure.
@@ -246,13 +248,15 @@
 #pragma mark -
 #pragma mark MVKCommandEncoder
 
-void MVKCommandEncoder::encode(id<MTLCommandBuffer> mtlCmdBuff) {
+void MVKCommandEncoder::encode(id<MTLCommandBuffer> mtlCmdBuff,
+							   MVKCommandEncodingContext* pEncodingContext) {
 	_renderPass = nullptr;
 	_subpassContents = VK_SUBPASS_CONTENTS_INLINE;
 	_renderSubpassIndex = 0;
 	_multiviewPassIndex = 0;
 	_canUseLayeredRendering = false;
 
+	_pEncodingContext = pEncodingContext;
 	_mtlCmdBuffer = mtlCmdBuff;		// not retained
 
 	setLabelIfNotNil(_mtlCmdBuffer, _cmdBuffer->_debugName);
@@ -345,12 +349,12 @@
 												  _clearValues.contents(),
 												  _isRenderingEntireAttachment,
 												  loadOverride);
-    if (_cmdBuffer->_needsVisibilityResultMTLBuffer) {
-        if (!_visibilityResultMTLBuffer) {
-            _visibilityResultMTLBuffer = getTempMTLBuffer(_pDeviceMetalFeatures->maxQueryBufferSize, true, true);
-        }
-        mtlRPDesc.visibilityResultBuffer = _visibilityResultMTLBuffer->_mtlBuffer;
-    }
+	if (_cmdBuffer->_needsVisibilityResultMTLBuffer) {
+		if ( !_pEncodingContext->visibilityResultBuffer ) {
+			_pEncodingContext->visibilityResultBuffer = getTempMTLBuffer(_pDeviceMetalFeatures->maxQueryBufferSize, true, true);
+		}
+		mtlRPDesc.visibilityResultBuffer = _pEncodingContext->visibilityResultBuffer->_mtlBuffer;
+	}
 
 	VkExtent2D fbExtent = _framebufferExtent;
     mtlRPDesc.renderTargetWidthMVK = max(min(_renderArea.offset.x + _renderArea.extent.width, fbExtent.width), 1u);
@@ -770,7 +774,6 @@
 
 MVKCommandEncoder::MVKCommandEncoder(MVKCommandBuffer* cmdBuffer) : MVKBaseDeviceObject(cmdBuffer->getDevice()),
         _cmdBuffer(cmdBuffer),
-        _visibilityResultMTLBuffer(nil),
         _graphicsPipelineState(this),
         _computePipelineState(this),
         _viewportState(this),
@@ -799,6 +802,7 @@
 			_mtlComputeEncoderUse = kMVKCommandUseNone;
             _mtlBlitEncoder = nil;
             _mtlBlitEncoderUse = kMVKCommandUseNone;
+			_pEncodingContext = nullptr;
 }
 
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.h b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.h
index d18fa87..572a732 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.h
@@ -614,9 +614,18 @@
 protected:
     void encodeImpl(uint32_t) override;
 
+	typedef struct OcclusionQueryLocation {
+		MVKOcclusionQueryPool* queryPool = nullptr;
+		uint32_t query = 0;
+		NSUInteger visibilityBufferOffset = 0;
+
+		OcclusionQueryLocation(MVKOcclusionQueryPool* qPool, uint32_t qIdx, NSUInteger vbOfst)
+		: queryPool(qPool), query(qIdx), visibilityBufferOffset(vbOfst) {}
+
+	} OcclusionQueryLocation;
+
+	MVKSmallVector<OcclusionQueryLocation> _mtlRenderPassQueries;
     MTLVisibilityResultMode _mtlVisibilityResultMode = MTLVisibilityResultModeDisabled;
-    NSUInteger _mtlVisibilityResultOffset = 0;
-	MVKSmallVector<std::pair<MVKQuerySpec, NSUInteger>> _mtlRenderPassQueries;
 };
 
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
index 96ea826..8093cab 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
@@ -1073,20 +1073,19 @@
 #pragma mark MVKOcclusionQueryCommandEncoderState
 
 void MVKOcclusionQueryCommandEncoderState::endMetalRenderPass() {
+	const MVKMTLBufferAllocation* vizBuff = _cmdEncoder->_pEncodingContext->visibilityResultBuffer;
+    if ( !vizBuff || _mtlRenderPassQueries.empty() ) { return; }  // Nothing to do.
 
-    if (_mtlRenderPassQueries.empty()) { return; }  // Nothing to do.
-
-    id<MTLComputePipelineState> mtlAccumState = _cmdEncoder->getCommandEncodingPool()->getAccumulateOcclusionQueryResultsMTLComputePipelineState();
+	id<MTLComputePipelineState> mtlAccumState = _cmdEncoder->getCommandEncodingPool()->getAccumulateOcclusionQueryResultsMTLComputePipelineState();
     id<MTLComputeCommandEncoder> mtlAccumEncoder = _cmdEncoder->getMTLComputeEncoder(kMVKCommandUseAccumOcclusionQuery);
     [mtlAccumEncoder setComputePipelineState: mtlAccumState];
-    for (auto& query : _mtlRenderPassQueries) {
+    for (auto& qryLoc : _mtlRenderPassQueries) {
         // Accumulate the current results to the query pool's buffer.
-        auto* pQueryPool = (MVKOcclusionQueryPool*)query.first.queryPool;
-        [mtlAccumEncoder setBuffer: pQueryPool->getVisibilityResultMTLBuffer()
-                            offset: pQueryPool->getVisibilityResultOffset(query.first.query)
+        [mtlAccumEncoder setBuffer: qryLoc.queryPool->getVisibilityResultMTLBuffer()
+                            offset: qryLoc.queryPool->getVisibilityResultOffset(qryLoc.query)
                            atIndex: 0];
-        [mtlAccumEncoder setBuffer: _cmdEncoder->_visibilityResultMTLBuffer->_mtlBuffer
-                            offset: query.second
+        [mtlAccumEncoder setBuffer: vizBuff->_mtlBuffer
+                            offset: vizBuff->_offset + qryLoc.visibilityBufferOffset
                            atIndex: 1];
         [mtlAccumEncoder dispatchThreadgroups: MTLSizeMake(1, 1, 1)
                         threadsPerThreadgroup: MTLSizeMake(1, 1, 1)];
@@ -1095,24 +1094,28 @@
     _mtlRenderPassQueries.clear();
 }
 
+// The Metal visibility buffer has a finite size, and on some Metal platforms (looking at you M1),
+// query offsets cannnot be reused with the same MTLCommandBuffer. If enough occlusion queries are
+// begun within a single MTLCommandBuffer, it may exhaust the visibility buffer. If that occurs,
+// report an error and disable further visibility tracking for the remainder of the MTLCommandBuffer.
+// In most cases, a MTLCommandBuffer corresponds to a Vulkan command submit (VkSubmitInfo),
+// and so the error text is framed in terms of the Vulkan submit.
 void MVKOcclusionQueryCommandEncoderState::beginOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uint32_t query, VkQueryControlFlags flags) {
-
-    MVKQuerySpec querySpec;
-    querySpec.set(pQueryPool, query);
-    NSUInteger offset = _mtlRenderPassQueries.empty() ? 0 : _mtlVisibilityResultOffset + kMVKQuerySlotSizeInBytes;
-    NSUInteger maxOffset = _cmdEncoder->_pDeviceMetalFeatures->maxQueryBufferSize - kMVKQuerySlotSizeInBytes;
-    offset = min(offset, maxOffset);
-    _mtlRenderPassQueries.push_back(make_pair(querySpec, offset));
-
-    bool shouldCount = _cmdEncoder->_pDeviceFeatures->occlusionQueryPrecise && mvkAreAllFlagsEnabled(flags, VK_QUERY_CONTROL_PRECISE_BIT);
-    _mtlVisibilityResultMode = shouldCount ? MTLVisibilityResultModeCounting : MTLVisibilityResultModeBoolean;
-    _mtlVisibilityResultOffset = offset;
-
+	if (_cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset + kMVKQuerySlotSizeInBytes <= _cmdEncoder->_pDeviceMetalFeatures->maxQueryBufferSize) {
+		bool shouldCount = _cmdEncoder->_pDeviceFeatures->occlusionQueryPrecise && mvkAreAllFlagsEnabled(flags, VK_QUERY_CONTROL_PRECISE_BIT);
+		_mtlVisibilityResultMode = shouldCount ? MTLVisibilityResultModeCounting : MTLVisibilityResultModeBoolean;
+		_mtlRenderPassQueries.emplace_back(pQueryPool, query, _cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset);
+	} else {
+		reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "vkCmdBeginQuery(): The maximum number of queries in a single Vulkan command submission is %llu.", _cmdEncoder->_pDeviceMetalFeatures->maxQueryBufferSize / kMVKQuerySlotSizeInBytes);
+		_mtlVisibilityResultMode = MTLVisibilityResultModeDisabled;
+		_cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset -= kMVKQuerySlotSizeInBytes;
+	}
     markDirty();
 }
 
 void MVKOcclusionQueryCommandEncoderState::endOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uint32_t query) {
 	_mtlVisibilityResultMode = MTLVisibilityResultModeDisabled;
+	_cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset += kMVKQuerySlotSizeInBytes;
 	markDirty();
 }
 
@@ -1120,5 +1123,5 @@
 	if (stage != kMVKGraphicsStageRasterization) { return; }
 
 	[_cmdEncoder->_mtlRenderEncoder setVisibilityResultMode: _mtlVisibilityResultMode
-													 offset: _mtlVisibilityResultOffset];
+													 offset: _cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset];
 }
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
index e9c92ee..1379687 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
@@ -237,7 +237,7 @@
 		}
 
 protected:
-	void submitCommandBuffers() override { for (auto& cb : _cmdBuffers) { cb->submit(this); } }
+	void submitCommandBuffers() override;
 
 	MVKSmallVector<MVKCommandBuffer*, N> _cmdBuffers;
 };
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
index 22bd1b2..286946a 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
@@ -463,6 +463,13 @@
 }
 
 
+template <size_t N>
+void MVKQueueFullCommandBufferSubmission<N>::submitCommandBuffers() {
+	MVKCommandEncodingContext encodingContext;
+	for (auto& cb : _cmdBuffers) { cb->submit(this, &encodingContext); }
+}
+
+
 #pragma mark -
 #pragma mark MVKQueuePresentSurfaceSubmission