Merge pull request #1408 from billhollings/handle-bad-api-pointers

Properly ignore non-null pipeline creation pointers that should be ignored.
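
For reference, below is a minimal sketch of how an application exercises the timestamp-query path changed in this diff, using the standard Vulkan timestamp-query API plus the MoltenVK-specific capability query. It is illustrative only: handle creation, submission, and error handling are omitted, the helper name and include path are hypothetical, and in practice vkGetPhysicalDeviceMetalFeaturesMVK is usually retrieved via vkGetInstanceProcAddr.

    #include <vulkan/vulkan.h>
    #include "vk_mvk_moltenvk.h"   // illustrative include; the actual path depends on how MoltenVK is packaged

    // Records two timestamps around some GPU work and reads them back.
    // Where counterSamplingPoints is non-zero, MoltenVK now backs these queries
    // with a MTLCounterSampleBuffer instead of CPU-side timestamps.
    static void timeGPUWork(VkPhysicalDevice physDev, VkDevice dev, VkCommandBuffer cmdBuf) {
        // Query which Metal GPU counter sampling points are available (MoltenVK-specific).
        MVKPhysicalDeviceMetalFeatures mtlFeats = { 0 };
        size_t mtlFeatsSize = sizeof(mtlFeats);
        vkGetPhysicalDeviceMetalFeaturesMVK(physDev, &mtlFeats, &mtlFeatsSize);
        if (mtlFeats.counterSamplingPoints & MVK_COUNTER_SAMPLING_AT_PIPELINE_STAGE) {
            // Timestamps will be resolved through stage counters, deferred to a
            // lightweight BLIT pass at the end of each Metal encoder.
        }

        // Standard Vulkan timestamp queries; nothing here changes with this patch.
        VkQueryPoolCreateInfo qpInfo = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
        qpInfo.queryType  = VK_QUERY_TYPE_TIMESTAMP;
        qpInfo.queryCount = 2;
        VkQueryPool qPool;
        vkCreateQueryPool(dev, &qpInfo, NULL, &qPool);

        vkCmdResetQueryPool(cmdBuf, qPool, 0, 2);
        vkCmdWriteTimestamp(cmdBuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, qPool, 0);
        // ... record draw/dispatch/blit commands here ...
        vkCmdWriteTimestamp(cmdBuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, qPool, 1);

        // After the command buffer has been submitted and has completed (not shown):
        uint64_t ts[2];
        vkGetQueryPoolResults(dev, qPool, 0, 2, sizeof(ts), ts, sizeof(uint64_t),
                              VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
        vkDestroyQueryPool(dev, qPool, NULL);
    }
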
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index 4dbd388..26b7e76 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -18,11 +18,13 @@
 
 Released TBD
 
+- Vulkan timestamp query pools use Metal GPU counters when available.
 - Fix incorrect translation of clear color values on Apple Silicon.
 - Fix swizzle of depth and stencil values into RGBA (`float4`) variable in shaders.
 - Disable `VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT` for 
   `VK_FORMAT_E5B9G9R9_UFLOAT_PACK32` on macOS Apple Silicon.
 - Support alpha-to-coverage without a color attachment.
+- Update `VK_MVK_MOLTENVK_SPEC_VERSION` to `32`.
 - Update to latest SPIRV-Cross version:
 	- MSL: Adjust `gl_SampleMaskIn` for sample-shading and/or fixed sample mask.
 	- MSL: Fix setting `SPIRVCrossDecorationInterpolantComponentExpr` decoration.
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index 994eb4c..d9a6820 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -835,6 +835,16 @@
 	MVK_FLOAT_ROUNDING_UP_MAX_ENUM = 0x7FFFFFFF
 } MVKFloatRounding;
 
+/** Identifies the pipeline points where GPU counter sampling can occur. Maps to MTLCounterSamplingPoint. */
+typedef enum MVKCounterSamplingBits {
+	MVK_COUNTER_SAMPLING_AT_DRAW           = 0x00000001,
+	MVK_COUNTER_SAMPLING_AT_DISPATCH       = 0x00000002,
+	MVK_COUNTER_SAMPLING_AT_BLIT           = 0x00000004,
+	MVK_COUNTER_SAMPLING_AT_PIPELINE_STAGE = 0x00000008,
+	MVK_COUNTER_SAMPLING_MAX_ENUM          = 0x7FFFFFFF
+} MVKCounterSamplingBits;
+typedef VkFlags MVKCounterSamplingFlags;
+
 /**
  * Features provided by the current implementation of Metal on the current device. You can
  * retrieve a copy of this structure using the vkGetPhysicalDeviceMetalFeaturesMVK() function.
@@ -850,71 +860,72 @@
  * SHOULD NOT BE CHANGED.
  */
 typedef struct {
-    uint32_t mslVersion;                        /**< The version of the Metal Shading Language available on this device. The format of the integer is MMmmpp, with two decimal digts each for Major, minor, and patch version values (eg. MSL 1.2 would appear as 010200). */
-	VkBool32 indirectDrawing;                   /**< If true, draw calls support parameters held in a GPU buffer. */
-	VkBool32 baseVertexInstanceDrawing;         /**< If true, draw calls support specifiying the base vertex and instance. */
-    uint32_t dynamicMTLBufferSize;              /**< If greater than zero, dynamic MTLBuffers for setting vertex, fragment, and compute bytes are supported, and their content must be below this value. */
-    VkBool32 shaderSpecialization;              /**< If true, shader specialization (aka Metal function constants) is supported. */
-    VkBool32 ioSurfaces;                        /**< If true, VkImages can be underlaid by IOSurfaces via the vkUseIOSurfaceMVK() function, to support inter-process image transfers. */
-    VkBool32 texelBuffers;                      /**< If true, texel buffers are supported, allowing the contents of a buffer to be interpreted as an image via a VkBufferView. */
-	VkBool32 layeredRendering;                  /**< If true, layered rendering to multiple cube or texture array layers is supported. */
-	VkBool32 presentModeImmediate;              /**< If true, immediate surface present mode (VK_PRESENT_MODE_IMMEDIATE_KHR), allowing a swapchain image to be presented immediately, without waiting for the vertical sync period of the display, is supported. */
-	VkBool32 stencilViews;                      /**< If true, stencil aspect views are supported through the MTLPixelFormatX24_Stencil8 and MTLPixelFormatX32_Stencil8 formats. */
-	VkBool32 multisampleArrayTextures;          /**< If true, MTLTextureType2DMultisampleArray is supported. */
-	VkBool32 samplerClampToBorder;              /**< If true, the border color set when creating a sampler will be respected. */
-	uint32_t maxTextureDimension; 	     	  	/**< The maximum size of each texture dimension (width, height, or depth). */
-	uint32_t maxPerStageBufferCount;            /**< The total number of per-stage Metal buffers available for shader uniform content and attributes. */
-    uint32_t maxPerStageTextureCount;           /**< The total number of per-stage Metal textures available for shader uniform content. */
-    uint32_t maxPerStageSamplerCount;           /**< The total number of per-stage Metal samplers available for shader uniform content. */
-    VkDeviceSize maxMTLBufferSize;              /**< The max size of a MTLBuffer (in bytes). */
-    VkDeviceSize mtlBufferAlignment;            /**< The alignment used when allocating memory for MTLBuffers. Must be PoT. */
-    VkDeviceSize maxQueryBufferSize;            /**< The maximum size of an occlusion query buffer (in bytes). */
-	VkDeviceSize mtlCopyBufferAlignment;        /**< The alignment required during buffer copy operations (in bytes). */
-    VkSampleCountFlags supportedSampleCounts;   /**< A bitmask identifying the sample counts supported by the device. */
-	uint32_t minSwapchainImageCount;	 	  	/**< The minimum number of swapchain images that can be supported by a surface. */
-	uint32_t maxSwapchainImageCount;	 	  	/**< The maximum number of swapchain images that can be supported by a surface. */
-	VkBool32 combinedStoreResolveAction;		/**< If true, the device supports VK_ATTACHMENT_STORE_OP_STORE with a simultaneous resolve attachment. */
-	VkBool32 arrayOfTextures;			 	  	/**< If true, arrays of textures is supported. */
-	VkBool32 arrayOfSamplers;			 	  	/**< If true, arrays of texture samplers is supported. */
-	MTLLanguageVersion mslVersionEnum;			/**< The version of the Metal Shading Language available on this device, as a Metal enumeration. */
-	VkBool32 depthSampleCompare;				/**< If true, depth texture samplers support the comparison of the pixel value against a reference value. */
-	VkBool32 events;							/**< If true, Metal synchronization events (MTLEvent) are supported. */
-	VkBool32 memoryBarriers;					/**< If true, full memory barriers within Metal render passes are supported. */
-	VkBool32 multisampleLayeredRendering;       /**< If true, layered rendering to multiple multi-sampled cube or texture array layers is supported. */
-	VkBool32 stencilFeedback;					/**< If true, fragment shaders that write to [[stencil]] outputs are supported. */
-	VkBool32 textureBuffers;					/**< If true, textures of type MTLTextureTypeBuffer are supported. */
-	VkBool32 postDepthCoverage;					/**< If true, coverage masks in fragment shaders post-depth-test are supported. */
-	VkBool32 fences;							/**< If true, Metal synchronization fences (MTLFence) are supported. */
-	VkBool32 rasterOrderGroups;					/**< If true, Raster order groups in fragment shaders are supported. */
-	VkBool32 native3DCompressedTextures;		/**< If true, 3D compressed images are supported natively, without manual decompression. */
-	VkBool32 nativeTextureSwizzle;				/**< If true, component swizzle is supported natively, without manual swizzling in shaders. */
-	VkBool32 placementHeaps;					/**< If true, MTLHeap objects support placement of resources. */
-	VkDeviceSize pushConstantSizeAlignment;		/**< The alignment used internally when allocating memory for push constants. Must be PoT. */
-	uint32_t maxTextureLayers;					/**< The maximum number of layers in an array texture. */
-    uint32_t maxSubgroupSize;			        /**< The maximum number of threads in a SIMD-group. */
-	VkDeviceSize vertexStrideAlignment;         /**< The alignment used for the stride of vertex attribute bindings. */
-	VkBool32 indirectTessellationDrawing;		/**< If true, tessellation draw calls support parameters held in a GPU buffer. */
-	VkBool32 nonUniformThreadgroups;			/**< If true, the device supports arbitrary-sized grids in compute workloads. */
-	VkBool32 renderWithoutAttachments;          /**< If true, we don't have to create a dummy attachment for a render pass if there isn't one. */
-	VkBool32 deferredStoreActions;				/**< If true, render pass store actions can be specified after the render encoder is created. */
-	VkBool32 sharedLinearTextures;				/**< If true, linear textures and texture buffers can be created from buffers in Shared storage. */
-	VkBool32 depthResolve;						/**< If true, resolving depth textures with filters other than Sample0 is supported. */
-	VkBool32 stencilResolve;					/**< If true, resolving stencil textures with filters other than Sample0 is supported. */
-	uint32_t maxPerStageDynamicMTLBufferCount;	/**< The maximum number of inline buffers that can be set on a command buffer. */
-	uint32_t maxPerStageStorageTextureCount;    /**< The total number of per-stage Metal textures with read-write access available for writing to from a shader. */
-	VkBool32 astcHDRTextures;					/**< If true, ASTC HDR pixel formats are supported. */
-	VkBool32 renderLinearTextures;				/**< If true, linear textures are renderable. */
-	VkBool32 pullModelInterpolation;			/**< If true, explicit interpolation functions are supported. */
-	VkBool32 samplerMirrorClampToEdge;			/**< If true, the mirrored clamp to edge address mode is supported in samplers. */
-	VkBool32 quadPermute;						/**< If true, quadgroup permutation functions (vote, ballot, shuffle) are supported in shaders. */
-	VkBool32 simdPermute;						/**< If true, SIMD-group permutation functions (vote, ballot, shuffle) are supported in shaders. */
-	VkBool32 simdReduction;						/**< If true, SIMD-group reduction functions (arithmetic) are supported in shaders. */
-    uint32_t minSubgroupSize;			        /**< The minimum number of threads in a SIMD-group. */
-    VkBool32 textureBarriers;                   /**< If true, texture barriers are supported within Metal render passes. */
-    VkBool32 tileBasedDeferredRendering;        /**< If true, this device uses tile-based deferred rendering. */
-	VkBool32 argumentBuffers;					/**< If true, Metal argument buffers are supported. */
-	VkBool32 descriptorSetArgumentBuffers;		/**< If true, a Metal argument buffer can be assigned to a descriptor set, and used on any pipeline and pipeline stage. If false, a different Metal argument buffer must be used for each pipeline-stage/descriptor-set combination. */
-	MVKFloatRounding clearColorFloatRounding;	/**< Identifies the type of rounding Metal uses for MTLClearColor float to integer conversions. */
+    uint32_t mslVersion;                        	/**< The version of the Metal Shading Language available on this device. The format of the integer is MMmmpp, with two decimal digts each for Major, minor, and patch version values (eg. MSL 1.2 would appear as 010200). */
+	VkBool32 indirectDrawing;                   	/**< If true, draw calls support parameters held in a GPU buffer. */
+	VkBool32 baseVertexInstanceDrawing;         	/**< If true, draw calls support specifiying the base vertex and instance. */
+    uint32_t dynamicMTLBufferSize;              	/**< If greater than zero, dynamic MTLBuffers for setting vertex, fragment, and compute bytes are supported, and their content must be below this value. */
+    VkBool32 shaderSpecialization;              	/**< If true, shader specialization (aka Metal function constants) is supported. */
+    VkBool32 ioSurfaces;                        	/**< If true, VkImages can be underlaid by IOSurfaces via the vkUseIOSurfaceMVK() function, to support inter-process image transfers. */
+    VkBool32 texelBuffers;                      	/**< If true, texel buffers are supported, allowing the contents of a buffer to be interpreted as an image via a VkBufferView. */
+	VkBool32 layeredRendering;                  	/**< If true, layered rendering to multiple cube or texture array layers is supported. */
+	VkBool32 presentModeImmediate;              	/**< If true, immediate surface present mode (VK_PRESENT_MODE_IMMEDIATE_KHR), allowing a swapchain image to be presented immediately, without waiting for the vertical sync period of the display, is supported. */
+	VkBool32 stencilViews;                      	/**< If true, stencil aspect views are supported through the MTLPixelFormatX24_Stencil8 and MTLPixelFormatX32_Stencil8 formats. */
+	VkBool32 multisampleArrayTextures;          	/**< If true, MTLTextureType2DMultisampleArray is supported. */
+	VkBool32 samplerClampToBorder;              	/**< If true, the border color set when creating a sampler will be respected. */
+	uint32_t maxTextureDimension; 	     	  		/**< The maximum size of each texture dimension (width, height, or depth). */
+	uint32_t maxPerStageBufferCount;            	/**< The total number of per-stage Metal buffers available for shader uniform content and attributes. */
+    uint32_t maxPerStageTextureCount;           	/**< The total number of per-stage Metal textures available for shader uniform content. */
+    uint32_t maxPerStageSamplerCount;           	/**< The total number of per-stage Metal samplers available for shader uniform content. */
+    VkDeviceSize maxMTLBufferSize;              	/**< The max size of a MTLBuffer (in bytes). */
+    VkDeviceSize mtlBufferAlignment;            	/**< The alignment used when allocating memory for MTLBuffers. Must be PoT. */
+    VkDeviceSize maxQueryBufferSize;            	/**< The maximum size of an occlusion query buffer (in bytes). */
+	VkDeviceSize mtlCopyBufferAlignment;        	/**< The alignment required during buffer copy operations (in bytes). */
+    VkSampleCountFlags supportedSampleCounts;   	/**< A bitmask identifying the sample counts supported by the device. */
+	uint32_t minSwapchainImageCount;	 	  		/**< The minimum number of swapchain images that can be supported by a surface. */
+	uint32_t maxSwapchainImageCount;	 	  		/**< The maximum number of swapchain images that can be supported by a surface. */
+	VkBool32 combinedStoreResolveAction;			/**< If true, the device supports VK_ATTACHMENT_STORE_OP_STORE with a simultaneous resolve attachment. */
+	VkBool32 arrayOfTextures;			 	  		/**< If true, arrays of textures is supported. */
+	VkBool32 arrayOfSamplers;			 	  		/**< If true, arrays of texture samplers is supported. */
+	MTLLanguageVersion mslVersionEnum;				/**< The version of the Metal Shading Language available on this device, as a Metal enumeration. */
+	VkBool32 depthSampleCompare;					/**< If true, depth texture samplers support the comparison of the pixel value against a reference value. */
+	VkBool32 events;								/**< If true, Metal synchronization events (MTLEvent) are supported. */
+	VkBool32 memoryBarriers;						/**< If true, full memory barriers within Metal render passes are supported. */
+	VkBool32 multisampleLayeredRendering;       	/**< If true, layered rendering to multiple multi-sampled cube or texture array layers is supported. */
+	VkBool32 stencilFeedback;						/**< If true, fragment shaders that write to [[stencil]] outputs are supported. */
+	VkBool32 textureBuffers;						/**< If true, textures of type MTLTextureTypeBuffer are supported. */
+	VkBool32 postDepthCoverage;						/**< If true, coverage masks in fragment shaders post-depth-test are supported. */
+	VkBool32 fences;								/**< If true, Metal synchronization fences (MTLFence) are supported. */
+	VkBool32 rasterOrderGroups;						/**< If true, Raster order groups in fragment shaders are supported. */
+	VkBool32 native3DCompressedTextures;			/**< If true, 3D compressed images are supported natively, without manual decompression. */
+	VkBool32 nativeTextureSwizzle;					/**< If true, component swizzle is supported natively, without manual swizzling in shaders. */
+	VkBool32 placementHeaps;						/**< If true, MTLHeap objects support placement of resources. */
+	VkDeviceSize pushConstantSizeAlignment;			/**< The alignment used internally when allocating memory for push constants. Must be PoT. */
+	uint32_t maxTextureLayers;						/**< The maximum number of layers in an array texture. */
+    uint32_t maxSubgroupSize;			        	/**< The maximum number of threads in a SIMD-group. */
+	VkDeviceSize vertexStrideAlignment;         	/**< The alignment used for the stride of vertex attribute bindings. */
+	VkBool32 indirectTessellationDrawing;			/**< If true, tessellation draw calls support parameters held in a GPU buffer. */
+	VkBool32 nonUniformThreadgroups;				/**< If true, the device supports arbitrary-sized grids in compute workloads. */
+	VkBool32 renderWithoutAttachments;          	/**< If true, we don't have to create a dummy attachment for a render pass if there isn't one. */
+	VkBool32 deferredStoreActions;					/**< If true, render pass store actions can be specified after the render encoder is created. */
+	VkBool32 sharedLinearTextures;					/**< If true, linear textures and texture buffers can be created from buffers in Shared storage. */
+	VkBool32 depthResolve;							/**< If true, resolving depth textures with filters other than Sample0 is supported. */
+	VkBool32 stencilResolve;						/**< If true, resolving stencil textures with filters other than Sample0 is supported. */
+	uint32_t maxPerStageDynamicMTLBufferCount;		/**< The maximum number of inline buffers that can be set on a command buffer. */
+	uint32_t maxPerStageStorageTextureCount;    	/**< The total number of per-stage Metal textures with read-write access available for writing to from a shader. */
+	VkBool32 astcHDRTextures;						/**< If true, ASTC HDR pixel formats are supported. */
+	VkBool32 renderLinearTextures;					/**< If true, linear textures are renderable. */
+	VkBool32 pullModelInterpolation;				/**< If true, explicit interpolation functions are supported. */
+	VkBool32 samplerMirrorClampToEdge;				/**< If true, the mirrored clamp to edge address mode is supported in samplers. */
+	VkBool32 quadPermute;							/**< If true, quadgroup permutation functions (vote, ballot, shuffle) are supported in shaders. */
+	VkBool32 simdPermute;							/**< If true, SIMD-group permutation functions (vote, ballot, shuffle) are supported in shaders. */
+	VkBool32 simdReduction;							/**< If true, SIMD-group reduction functions (arithmetic) are supported in shaders. */
+    uint32_t minSubgroupSize;			        	/**< The minimum number of threads in a SIMD-group. */
+    VkBool32 textureBarriers;                   	/**< If true, texture barriers are supported within Metal render passes. */
+    VkBool32 tileBasedDeferredRendering;        	/**< If true, this device uses tile-based deferred rendering. */
+	VkBool32 argumentBuffers;						/**< If true, Metal argument buffers are supported. */
+	VkBool32 descriptorSetArgumentBuffers;			/**< If true, a Metal argument buffer can be assigned to a descriptor set, and used on any pipeline and pipeline stage. If false, a different Metal argument buffer must be used for each pipeline-stage/descriptor-set combination. */
+	MVKFloatRounding clearColorFloatRounding;		/**< Identifies the type of rounding Metal uses for MTLClearColor float to integer conversions. */
+	MVKCounterSamplingFlags counterSamplingPoints;	/**< Identifies the points where pipeline GPU counter sampling may occur. */
 } MVKPhysicalDeviceMetalFeatures;
 
 /** MoltenVK performance of a particular type of activity. */
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
index 7d9c92f..7bf8ebf 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
@@ -410,7 +410,7 @@
     void endOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uint32_t query);
 
     /** Marks a timestamp for the specified query. */
-    void markTimestamp(MVKQueryPool* pQueryPool, uint32_t query);
+    void markTimestamp(MVKTimestampQueryPool* pQueryPool, uint32_t query);
 
     /** Reset a range of queries. */
     void resetQueries(MVKQueryPool* pQueryPool, uint32_t firstQuery, uint32_t queryCount);
@@ -499,6 +499,13 @@
 	void setSubpass(MVKCommand* passCmd, VkSubpassContents subpassContents, uint32_t subpassIndex);
 	void clearRenderArea();
     NSString* getMTLRenderCommandEncoderName();
+	void encodeGPUCounterSample(MVKGPUCounterQueryPool* mvkQryPool, uint32_t sampleIndex, MVKCounterSamplingFlags samplingPoints);
+	void encodeTimestampStageCounterSamples();
+
+	typedef struct GPUCounterQuery {
+		MVKGPUCounterQueryPool* queryPool = nullptr;
+		uint32_t query = 0;
+	} GPUCounterQuery;
 
 	VkSubpassContents _subpassContents;
 	MVKRenderPass* _renderPass;
@@ -507,6 +514,7 @@
 	uint32_t _multiviewPassIndex;
 	VkRect2D _renderArea;
     MVKActivatedQueries* _pActivatedQueries;
+	MVKSmallVector<GPUCounterQuery, 16> _timestampStageCounterQueries;
 	MVKSmallVector<VkClearValue, kMVKDefaultAttachmentCount> _clearValues;
 	MVKSmallVector<MVKImageView*, kMVKDefaultAttachmentCount> _attachments;
 	id<MTLComputeCommandEncoder> _mtlComputeEncoder;
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
index 18d2441..d3a351c 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
@@ -614,6 +614,8 @@
 	[_mtlBlitEncoder endEncoding];
 	_mtlBlitEncoder = nil;          // not retained
     _mtlBlitEncoderUse = kMVKCommandUseNone;
+
+	encodeTimestampStageCounterSamples();
 }
 
 id<MTLComputeCommandEncoder> MVKCommandEncoder::getMTLComputeEncoder(MVKCommandUse cmdUse) {
@@ -720,6 +722,23 @@
 
 #pragma mark Queries
 
+// Only executes on immediate-mode GPUs. Encodes a GPU counter sample command on whichever Metal
+// encoder is currently in use, creating a temporary BLIT encoder if no encoder is currently active.
+// The GPU sample is only encoded if the platform allows sampling at the associated pipeline point.
+void MVKCommandEncoder::encodeGPUCounterSample(MVKGPUCounterQueryPool* mvkQryPool, uint32_t sampleIndex, MVKCounterSamplingFlags samplingPoints){
+	if (_mtlRenderEncoder) {
+		if (mvkIsAnyFlagEnabled(samplingPoints, MVK_COUNTER_SAMPLING_AT_DRAW)) {
+			[_mtlRenderEncoder sampleCountersInBuffer: mvkQryPool->getMTLCounterBuffer() atSampleIndex: sampleIndex withBarrier: NO];
+		}
+	} else if (_mtlComputeEncoder) {
+		if (mvkIsAnyFlagEnabled(samplingPoints, MVK_COUNTER_SAMPLING_AT_DISPATCH)) {
+			[_mtlComputeEncoder sampleCountersInBuffer: mvkQryPool->getMTLCounterBuffer() atSampleIndex: sampleIndex withBarrier: NO];
+		}
+	} else if (mvkIsAnyFlagEnabled(samplingPoints, MVK_COUNTER_SAMPLING_AT_BLIT)) {
+		[getMTLBlitEncoder(kMVKCommandUseRecordGPUCounterSample) sampleCountersInBuffer: mvkQryPool->getMTLCounterBuffer() atSampleIndex: sampleIndex withBarrier: NO];
+	}
+}
+
 void MVKCommandEncoder::beginOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uint32_t query, VkQueryControlFlags flags) {
     _occlusionQueryState.beginOcclusionQuery(pQueryPool, query, flags);
     uint32_t queryCount = 1;
@@ -733,14 +752,67 @@
     _occlusionQueryState.endOcclusionQuery(pQueryPool, query);
 }
 
-void MVKCommandEncoder::markTimestamp(MVKQueryPool* pQueryPool, uint32_t query) {
+void MVKCommandEncoder::markTimestamp(MVKTimestampQueryPool* pQueryPool, uint32_t query) {
     uint32_t queryCount = 1;
     if (_renderPass && getSubpass()->isMultiview()) {
         queryCount = getSubpass()->getViewCountInMetalPass(_multiviewPassIndex);
     }
-    addActivatedQueries(pQueryPool, query, queryCount);
+	addActivatedQueries(pQueryPool, query, queryCount);
+
+	MVKCounterSamplingFlags sampPts = _device->_pMetalFeatures->counterSamplingPoints;
+	if (sampPts) {
+		for (uint32_t qOfst = 0; qOfst < queryCount; qOfst++) {
+			if (mvkIsAnyFlagEnabled(sampPts, MVK_COUNTER_SAMPLING_AT_PIPELINE_STAGE)) {
+				_timestampStageCounterQueries.push_back({ pQueryPool, query + qOfst });
+			} else {
+				encodeGPUCounterSample(pQueryPool, query + qOfst, sampPts);
+			}
+		}
+	}
 }
 
+#if MVK_XCODE_12
+// Metal stage GPU counters must be configured on a Metal render, compute, or BLIT encoder, meaning the
+// Metal encoder needs to know up front about any Vulkan timestamp commands that will be executed during
+// a renderpass, or during a set of Vulkan dispatch or BLIT commands. In addition, only a very small number
+// of staged timestamps (4) can be tracked in any single render, compute, or BLIT pass, so a renderpass that
+// timestamps after each of many draw calls could not be tracked. Finally, stage counters are only available
+// on tile-based GPUs, which means draw or dispatch calls cannot be individually timestamped.
+// We avoid this complexity, and the mismatch between how Vulkan and Metal stage counters operate, by
+// deferring all timestamps to the end of any batch of Metal encoding, then adding a lightweight Metal encoder
+// that does minimal work (it won't timestamp if completely empty) and writes those timestamps into all of the
+// Vulkan timestamp queries that were executed during the previous Metal encoder.
+void MVKCommandEncoder::encodeTimestampStageCounterSamples() {
+	size_t qCnt = _timestampStageCounterQueries.size();
+	uint32_t qIdx = 0;
+	while (qIdx < qCnt) {
+
+		// With each BLIT pass, consume as many outstanding timestamp queries as possible.
+		// Attach a query result to each of the available sample buffer attachments in the BLIT pass descriptor.
+		auto* bpDesc = [[[MTLBlitPassDescriptor alloc] init] autorelease];
+		for (uint32_t attIdx = 0; attIdx < MTLMaxBlitPassSampleBuffers && qIdx < qCnt; attIdx++, qIdx++) {
+			auto* sbAttDesc = bpDesc.sampleBufferAttachments[attIdx];
+			auto& tsQry = _timestampStageCounterQueries[qIdx];
+
+			// We actually only need to use startOfEncoderSampleIndex but, contradicting
+			// the docs, Metal hits an unexpected validation error if
+			// endOfEncoderSampleIndex is left at MTLCounterDontSample.
+			sbAttDesc.startOfEncoderSampleIndex = tsQry.query;
+			sbAttDesc.endOfEncoderSampleIndex = tsQry.query;
+			sbAttDesc.sampleBuffer = tsQry.queryPool->getMTLCounterBuffer();
+		}
+
+		auto* mtlEnc = [_mtlCmdBuffer blitCommandEncoderWithDescriptor: bpDesc];
+		setLabelIfNotNil(mtlEnc, mvkMTLBlitCommandEncoderLabel(kMVKCommandUseRecordGPUCounterSample));
+		[mtlEnc fillBuffer: _device->getDummyBlitMTLBuffer() range: NSMakeRange(0, 1) value: 0];
+		[mtlEnc endEncoding];
+	}
+	_timestampStageCounterQueries.clear();
+}
+#else
+void MVKCommandEncoder::encodeTimestampStageCounterSamples() {}
+#endif
+
 void MVKCommandEncoder::resetQueries(MVKQueryPool* pQueryPool, uint32_t firstQuery, uint32_t queryCount) {
     addActivatedQueries(pQueryPool, firstQuery, queryCount);
 }
@@ -847,6 +919,7 @@
         case kMVKCommandUseUpdateBuffer:                    return @"vkCmdUpdateBuffer BlitEncoder";
         case kMVKCommandUseResetQueryPool:                  return @"vkCmdResetQueryPool BlitEncoder";
         case kMVKCommandUseCopyQueryPoolResults:            return @"vkCmdCopyQueryPoolResults BlitEncoder";
+		case kMVKCommandUseRecordGPUCounterSample:          return @"Record GPU Counter Sample BlitEncoder";
         default:                                            return @"Unknown Use BlitEncoder";
     }
 }
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
index 8eef871..d4d65d7 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
@@ -324,10 +324,10 @@
 	}
 
 	/** Returns whether the MSL version is supported on this device. */
-	inline bool mslVersionIsAtLeast(MTLLanguageVersion minVer) { return _metalFeatures.mslVersionEnum >= minVer; }
+	bool mslVersionIsAtLeast(MTLLanguageVersion minVer) { return _metalFeatures.mslVersionEnum >= minVer; }
 
 	/** Returns whether this device is using Metal argument buffers. */
-	inline bool isUsingMetalArgumentBuffers() const  { return _metalFeatures.argumentBuffers && mvkConfig().useMetalArgumentBuffers; };
+	bool isUsingMetalArgumentBuffers() const  { return _metalFeatures.argumentBuffers && mvkConfig().useMetalArgumentBuffers; };
 
 
 #pragma mark Construction
@@ -371,6 +371,7 @@
 	uint32_t getMaxSamplerCount();
 	void initExternalMemoryProperties();
 	void initExtensions();
+	void initCounterSets();
 	MVKArrayRef<MVKQueueFamily*> getQueueFamilies();
 	void initPipelineCacheUUID();
 	uint32_t getHighestMTLFeatureSet();
@@ -388,6 +389,7 @@
 	VkPhysicalDeviceMemoryProperties _memoryProperties;
 	MVKSmallVector<MVKQueueFamily*, kMVKQueueFamilyCount> _queueFamilies;
 	MVKPixelFormats _pixelFormats;
+	id<MTLCounterSet> _timestampMTLCounterSet;
 	uint32_t _allMemoryTypes;
 	uint32_t _hostVisibleMemoryTypes;
 	uint32_t _hostCoherentMemoryTypes;
@@ -684,6 +686,9 @@
      */
     uint32_t expandVisibilityResultMTLBuffer(uint32_t queryCount);
 
+	/** Returns the GPU sample counter used for timestamps. */
+	id<MTLCounterSet> getTimestampMTLCounterSet() { return _physicalDevice->_timestampMTLCounterSet; }
+
     /** Returns the memory type index corresponding to the specified Metal memory storage mode. */
     uint32_t getVulkanMemoryTypeIndex(MTLStorageMode mtlStorageMode);
 
@@ -691,6 +696,12 @@
 	id<MTLSamplerState> getDefaultMTLSamplerState();
 
 	/**
+	 * Returns a MTLBuffer of length one that can be used as a dummy to
+	 * create a no-op BLIT encoder based on filling this single-byte buffer.
+	 */
+	id<MTLBuffer> getDummyBlitMTLBuffer();
+
+	/**
 	 * Returns whether MTLCommandBuffers can be prefilled.
 	 *
 	 * This depends both on whether the app config has requested prefilling, and whether doing so will
@@ -818,6 +829,7 @@
     std::mutex _perfLock;
     id<MTLBuffer> _globalVisibilityResultMTLBuffer;
 	id<MTLSamplerState> _defaultMTLSamplerState;
+	id<MTLBuffer> _dummyBlitMTLBuffer;
     uint32_t _globalVisibilityQueryCount;
     std::mutex _vizLock;
 	bool _useMTLFenceForSemaphores;
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index 1e8146b..7ef37f5 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -1150,6 +1150,7 @@
 	initExtensions();
 	initMemoryProperties();
 	initExternalMemoryProperties();
+	initCounterSets();
 	logGPUInfo();
 }
 
@@ -1594,6 +1595,27 @@
 	// Currently, if we don't support descriptor set argument buffers, we can't support argument buffers.
 	_metalFeatures.argumentBuffers = _metalFeatures.descriptorSetArgumentBuffers;
 
+#define checkSupportsMTLCounterSamplingPoint(mtlSP, mvkSP)  \
+	if ([_mtlDevice respondsToSelector: @selector(supportsCounterSampling:)] &&  \
+		[_mtlDevice supportsCounterSampling: MTLCounterSamplingPointAt ##mtlSP ##Boundary]) {  \
+		_metalFeatures.counterSamplingPoints |= MVK_COUNTER_SAMPLING_AT_ ##mvkSP;  \
+	}
+
+#if MVK_XCODE_12
+	checkSupportsMTLCounterSamplingPoint(Draw, DRAW);
+	checkSupportsMTLCounterSamplingPoint(Dispatch, DISPATCH);
+	checkSupportsMTLCounterSamplingPoint(Blit, BLIT);
+	checkSupportsMTLCounterSamplingPoint(Stage, PIPELINE_STAGE);
+#endif
+
+#if !MVK_APPLE_SILICON
+	// On macOS, if we couldn't query the supported sample points (which requires macOS 11),
+	// but the platform can support immediate-mode sample points, indicate that here.
+	if (!_metalFeatures.counterSamplingPoints && mvkOSVersionIsAtLeast(10.15)) {
+		_metalFeatures.counterSamplingPoints = MVK_COUNTER_SAMPLING_AT_DRAW | MVK_COUNTER_SAMPLING_AT_DISPATCH | MVK_COUNTER_SAMPLING_AT_BLIT;
+	}
+#endif
+
 }
 
 // Initializes the physical device features of this instance.
@@ -2726,6 +2748,28 @@
 #endif
 }
 
+void MVKPhysicalDevice::initCounterSets() {
+	_timestampMTLCounterSet = nil;
+	@autoreleasepool {
+		if (_metalFeatures.counterSamplingPoints) {
+			NSArray<id<MTLCounterSet>>* counterSets = _mtlDevice.counterSets;
+			for (id<MTLCounterSet> cs in counterSets) {
+				NSString* csName = cs.name;
+				if ( [csName caseInsensitiveCompare: MTLCommonCounterSetTimestamp] == NSOrderedSame) {
+					NSArray<id<MTLCounter>>* countersInSet = cs.counters;
+					for (id<MTLCounter> ctr in countersInSet) {
+						if ( [ctr.name caseInsensitiveCompare: MTLCommonCounterTimestamp] == NSOrderedSame) {
+							_timestampMTLCounterSet = [cs retain];		// retained
+							break;
+						}
+					}
+					break;
+				}
+			}
+		}
+	}
+}
+
 void MVKPhysicalDevice::logGPUInfo() {
 	string devTypeStr;
 	switch (_properties.deviceType) {
@@ -2838,6 +2882,7 @@
 
 MVKPhysicalDevice::~MVKPhysicalDevice() {
 	mvkDestroyContainerContents(_queueFamilies);
+	[_timestampMTLCounterSet release];
 	[_mtlDevice release];
 }
 
@@ -3712,6 +3757,20 @@
 	return _defaultMTLSamplerState;
 }
 
+id<MTLBuffer> MVKDevice::getDummyBlitMTLBuffer() {
+	if ( !_dummyBlitMTLBuffer ) {
+
+		// Lock and check again in case another thread has created the buffer.
+		lock_guard<mutex> lock(_rezLock);
+		if ( !_dummyBlitMTLBuffer ) {
+			@autoreleasepool {
+				_dummyBlitMTLBuffer = [getMTLDevice() newBufferWithLength: 1 options: MTLResourceStorageModePrivate];
+			}
+		}
+	}
+	return _dummyBlitMTLBuffer;
+}
+
 MTLCompileOptions* MVKDevice::getMTLCompileOptions(bool useFastMath, bool preserveInvariance) {
 	MTLCompileOptions* mtlCompOpt = [MTLCompileOptions new];
 	mtlCompOpt.languageVersion = _pMetalFeatures->mslVersionEnum;
@@ -3833,6 +3892,7 @@
     _globalVisibilityQueryCount = 0;
 
 	_defaultMTLSamplerState = nil;
+	_dummyBlitMTLBuffer = nil;
 
 	_commandResourceFactory = new MVKCommandResourceFactory(this);
 
@@ -4200,6 +4260,7 @@
 
     [_globalVisibilityResultMTLBuffer release];
 	[_defaultMTLSamplerState release];
+	[_dummyBlitMTLBuffer release];
 
 	stopAutoGPUCapture(MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE_DEVICE);
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.h b/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.h
index 91dba09..79b6eed 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.h
@@ -38,7 +38,6 @@
 /** 
  * Abstract class representing a Vulkan query pool.
  * Subclasses are specialized for specific query types.
- * Subclasses will generally override the beginQuery(), endQuery(), and getResult(uint32_t, void*, bool) member functions.
  */
 class MVKQueryPool : public MVKVulkanAPIDeviceObject {
 
@@ -106,10 +105,12 @@
 
 protected:
 	bool areQueriesHostAvailable(uint32_t firstQuery, uint32_t endQuery);
-    VkResult getResult(uint32_t query, void* pQryData, VkQueryResultFlags flags);
-	virtual void getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) {}
+	virtual NSData* getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) { return nil; }
+    VkResult getResult(uint32_t query, NSData* srcData, uint32_t srcDataQueryOffset, void* pDstData, VkQueryResultFlags flags);
 	virtual id<MTLBuffer> getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) { return nil; }
-	virtual void encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) {}
+	virtual id<MTLComputeCommandEncoder> encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) { return nil; }
+	virtual void encodeDirectCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount,
+										 MVKBuffer* destBuffer, VkDeviceSize destOffset, VkDeviceSize stride);
 
 	struct DeferredCopy {
 		uint32_t firstQuery;
@@ -137,31 +138,6 @@
 
 
 #pragma mark -
-#pragma mark MVKTimestampQueryPool
-
-/** A Vulkan query pool for timestamp queries. */
-class MVKTimestampQueryPool : public MVKQueryPool {
-
-public:
-    void endQuery(uint32_t query, MVKCommandEncoder* cmdEncoder) override;
-    void finishQueries(const MVKArrayRef<uint32_t>& queries) override;
-
-
-#pragma mark Construction
-
-	MVKTimestampQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo);
-
-protected:
-	void propagateDebugName() override {}
-	void getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) override;
-	id<MTLBuffer> getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) override;
-	void encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) override;
-
-	MVKSmallVector<uint64_t, kMVKDefaultQueryCount> _timestamps;
-};
-
-
-#pragma mark -
 #pragma mark MVKOcclusionQueryPool
 
 /** A Vulkan query pool for occlusion queries. */
@@ -189,9 +165,9 @@
 
 protected:
 	void propagateDebugName() override;
-    void getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) override;
+	NSData* getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) override;
 	id<MTLBuffer> getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) override;
-	void encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) override;
+	id<MTLComputeCommandEncoder> encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) override;
 
     id<MTLBuffer> _visibilityResultMTLBuffer;
     uint32_t _queryIndexOffset;
@@ -199,10 +175,63 @@
 
 
 #pragma mark -
+#pragma mark MVKGPUCounterQueryPool
+
+/** An abstract parent class for query pools that use Metal GPU counters if they are supported on the platform. */
+class MVKGPUCounterQueryPool : public MVKQueryPool {
+
+public:
+
+	/**
+	 * Returns the MTLCounterSampleBuffer being used by this query pool,
+	 * or nil if GPU counters are not supported.
+	 */
+	id<MTLCounterSampleBuffer> getMTLCounterBuffer() { return _mtlCounterBuffer; }
+
+	MVKGPUCounterQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo);
+
+	~MVKGPUCounterQueryPool() override;
+
+protected:
+	void initMTLCounterSampleBuffer(const VkQueryPoolCreateInfo* pCreateInfo,
+									id<MTLCounterSet> mtlCounterSet,
+									const char* queryTypeName);
+
+	id<MTLCounterSampleBuffer> _mtlCounterBuffer;
+};
+
+
+#pragma mark -
+#pragma mark MVKTimestampQueryPool
+
+/** A Vulkan query pool for timestamp queries. */
+class MVKTimestampQueryPool : public MVKGPUCounterQueryPool {
+
+public:
+	void endQuery(uint32_t query, MVKCommandEncoder* cmdEncoder) override;
+	void finishQueries(const MVKArrayRef<uint32_t>& queries) override;
+
+#pragma mark Construction
+
+	MVKTimestampQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo);
+
+protected:
+	void propagateDebugName() override {}
+	NSData* getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) override;
+	id<MTLBuffer> getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) override;
+	id<MTLComputeCommandEncoder> encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) override;
+	void encodeDirectCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount,
+								 MVKBuffer* destBuffer, VkDeviceSize destOffset, VkDeviceSize stride) override;
+
+	MVKSmallVector<uint64_t> _timestamps;
+};
+
+
+#pragma mark -
 #pragma mark MVKPipelineStatisticsQueryPool
 
 /** A Vulkan query pool for a query pool type that tracks pipeline statistics. */
-class MVKPipelineStatisticsQueryPool : public MVKQueryPool {
+class MVKPipelineStatisticsQueryPool : public MVKGPUCounterQueryPool {
 
 public:
     MVKPipelineStatisticsQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo);
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm b/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
index cf3853f..b2e2cd7 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
@@ -89,10 +89,13 @@
 	}
 
 	VkResult rqstRslt = VK_SUCCESS;
-	uintptr_t pQryData = (uintptr_t)pData;
-	for (uint32_t query = firstQuery; query < endQuery; query++, pQryData += stride) {
-		VkResult qryRslt = getResult(query, (void*)pQryData, flags);
-		if (rqstRslt == VK_SUCCESS) { rqstRslt = qryRslt; }
+	@autoreleasepool {
+		NSData* srcData = getQuerySourceData(firstQuery, queryCount);
+		uintptr_t pDstData = (uintptr_t)pData;
+		for (uint32_t query = firstQuery; query < endQuery; query++, pDstData += stride) {
+			VkResult qryRslt = getResult(query, srcData, firstQuery, (void*)pDstData, flags);
+			if (rqstRslt == VK_SUCCESS) { rqstRslt = qryRslt; }
+		}
 	}
 	return rqstRslt;
 }
@@ -114,7 +117,7 @@
     return true;
 }
 
-VkResult MVKQueryPool::getResult(uint32_t query, void* pQryData, VkQueryResultFlags flags) {
+VkResult MVKQueryPool::getResult(uint32_t query, NSData* srcData, uint32_t srcDataQueryOffset, void* pDstData, VkQueryResultFlags flags) {
 
 	if (_device->getConfigurationResult() != VK_SUCCESS) { return _device->getConfigurationResult(); }
 
@@ -123,15 +126,22 @@
 	bool shouldOutput64Bit = mvkAreAllFlagsEnabled(flags, VK_QUERY_RESULT_64_BIT);
 
 	// Output the results of this query
-	if (shouldOutput) { getResult(query, pQryData, shouldOutput64Bit); }
+	if (shouldOutput) {
+		uint64_t rsltVal = ((uint64_t*)srcData.bytes)[query - srcDataQueryOffset];
+		if (shouldOutput64Bit) {
+			*(uint64_t*)pDstData = rsltVal;
+		} else {
+			*(uint32_t*)pDstData = (uint32_t)rsltVal;
+		}
+	}
 
 	// If requested, output the availability bit
 	if (mvkAreAllFlagsEnabled(flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
 		if (shouldOutput64Bit) {
-			uintptr_t pAvailability = (uintptr_t)pQryData + (_queryElementCount * sizeof(uint64_t));
+			uintptr_t pAvailability = (uintptr_t)pDstData + (_queryElementCount * sizeof(uint64_t));
 			*(uint64_t*)pAvailability = isAvailable;
 		} else {
-			uintptr_t pAvailability = (uintptr_t)pQryData + (_queryElementCount * sizeof(uint32_t));
+			uintptr_t pAvailability = (uintptr_t)pDstData + (_queryElementCount * sizeof(uint32_t));
 			*(uint32_t*)pAvailability = isAvailable;
 		}
 	}
@@ -154,20 +164,12 @@
 		stride == _queryElementCount * sizeof(uint64_t) &&
 		areQueriesDeviceAvailable(firstQuery, queryCount)) {
 
-		id<MTLBlitCommandEncoder> mtlBlitCmdEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyQueryPoolResults);
-		NSUInteger srcOffset;
-		id<MTLBuffer> srcBuff = getResultBuffer(cmdEncoder, firstQuery, queryCount, srcOffset);
-		[mtlBlitCmdEnc copyFromBuffer: srcBuff
-						 sourceOffset: srcOffset
-							 toBuffer: destBuffer->getMTLBuffer()
-					destinationOffset: destBuffer->getMTLBufferOffset() + destOffset
-								 size: stride * queryCount];
+		encodeDirectCopyResults(cmdEncoder, firstQuery, queryCount, destBuffer, destOffset, stride);
 		// TODO: In the case where none of the queries is ready, we can fill with 0.
 	} else {
-		id<MTLComputeCommandEncoder> mtlComputeCmdEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults);
 		id<MTLComputePipelineState> mtlCopyResultsState = cmdEncoder->getCommandEncodingPool()->getCmdCopyQueryPoolResultsMTLComputePipelineState();
+		id<MTLComputeCommandEncoder> mtlComputeCmdEnc = encodeComputeCopyResults(cmdEncoder, firstQuery, queryCount, 0);
 		[mtlComputeCmdEnc setComputePipelineState: mtlCopyResultsState];
-		encodeSetResultBuffer(cmdEncoder, firstQuery, queryCount, 0);
 		[mtlComputeCmdEnc setBuffer: destBuffer->getMTLBuffer()
 							 offset: destBuffer->getMTLBufferOffset() + destOffset
 							atIndex: 1];
@@ -183,6 +185,24 @@
 	}
 }
 
+// If the caller asked for 64-bit results with no availability bit and a packed stride, we can do a straight copy.
+void MVKQueryPool::encodeDirectCopyResults(MVKCommandEncoder* cmdEncoder,
+									 uint32_t firstQuery,
+									 uint32_t queryCount,
+									 MVKBuffer* destBuffer,
+									 VkDeviceSize destOffset,
+									 VkDeviceSize stride) {
+
+	id<MTLBlitCommandEncoder> mtlBlitCmdEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyQueryPoolResults);
+	NSUInteger srcOffset;
+	id<MTLBuffer> srcBuff = getResultBuffer(cmdEncoder, firstQuery, queryCount, srcOffset);
+	[mtlBlitCmdEnc copyFromBuffer: srcBuff
+					 sourceOffset: srcOffset
+						 toBuffer: destBuffer->getMTLBuffer()
+				destinationOffset: destBuffer->getMTLBufferOffset() + destOffset
+							 size: stride * queryCount];
+}
+
 void MVKQueryPool::deferCopyResults(uint32_t firstQuery,
 									uint32_t queryCount,
 									MVKBuffer* destBuffer,
@@ -196,53 +216,6 @@
 
 
 #pragma mark -
-#pragma mark MVKTimestampQueryPool
-
-void MVKTimestampQueryPool::endQuery(uint32_t query, MVKCommandEncoder* cmdEncoder) {
-    cmdEncoder->markTimestamp(this, query);
-    MVKQueryPool::endQuery(query, cmdEncoder);
-}
-
-// Update timestamp values, then mark queries as available
-void MVKTimestampQueryPool::finishQueries(const MVKArrayRef<uint32_t>& queries) {
-    uint64_t ts = mvkGetTimestamp();
-    for (uint32_t qry : queries) { _timestamps[qry] = ts; }
-
-    MVKQueryPool::finishQueries(queries);
-}
-
-void MVKTimestampQueryPool::getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) {
-	if (shouldOutput64Bit) {
-		*(uint64_t*)pQryData = _timestamps[query];
-	} else {
-		*(uint32_t*)pQryData = (uint32_t)_timestamps[query];
-	}
-}
-
-id<MTLBuffer> MVKTimestampQueryPool::getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) {
-	const MVKMTLBufferAllocation* tempBuff = cmdEncoder->getTempMTLBuffer(queryCount * _queryElementCount * sizeof(uint64_t));
-	void* pBuffData = tempBuff->getContents();
-	size_t size = queryCount * _queryElementCount * sizeof(uint64_t);
-	memcpy(pBuffData, &_timestamps[firstQuery], size);
-	offset = tempBuff->_offset;
-	return tempBuff->_mtlBuffer;
-}
-
-void MVKTimestampQueryPool::encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) {
-	// No need to create a temp buffer here.
-	cmdEncoder->setComputeBytes(cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults), &_timestamps[firstQuery], queryCount * _queryElementCount * sizeof(uint64_t), index);
-}
-
-
-#pragma mark Construction
-
-MVKTimestampQueryPool::MVKTimestampQueryPool(MVKDevice* device,
-											 const VkQueryPoolCreateInfo* pCreateInfo) :
-	MVKQueryPool(device, pCreateInfo, 1), _timestamps(pCreateInfo->queryCount, 0) {
-}
-
-
-#pragma mark -
 #pragma mark MVKOcclusionQueryPool
 
 void MVKOcclusionQueryPool::propagateDebugName() { setLabelIfNotNil(_visibilityResultMTLBuffer, _debugName); }
@@ -285,15 +258,11 @@
     }
 }
 
-void MVKOcclusionQueryPool::getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) {
-    NSUInteger mtlBuffOffset = getVisibilityResultOffset(query);
-    uint64_t* pData = (uint64_t*)((uintptr_t)getVisibilityResultMTLBuffer().contents + mtlBuffOffset);
-
-    if (shouldOutput64Bit) {
-        *(uint64_t*)pQryData = *pData;
-    } else {
-        *(uint32_t*)pQryData = (uint32_t)(*pData);
-    }
+NSData* MVKOcclusionQueryPool::getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) {
+	id<MTLBuffer> vizBuff = getVisibilityResultMTLBuffer();
+	return [NSData dataWithBytesNoCopy: (void*)((uintptr_t)vizBuff.contents + getVisibilityResultOffset(firstQuery))
+								length: queryCount * kMVKQuerySlotSizeInBytes
+						  freeWhenDone: false];
 }
 
 id<MTLBuffer> MVKOcclusionQueryPool::getResultBuffer(MVKCommandEncoder*, uint32_t firstQuery, uint32_t, NSUInteger& offset) {
@@ -301,10 +270,10 @@
 	return getVisibilityResultMTLBuffer();
 }
 
-void MVKOcclusionQueryPool::encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t, uint32_t index) {
-	[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults) setBuffer: getVisibilityResultMTLBuffer()
-																			 offset: getVisibilityResultOffset(firstQuery)
-																			atIndex: index];
+id<MTLComputeCommandEncoder> MVKOcclusionQueryPool::encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t, uint32_t index) {
+	id<MTLComputeCommandEncoder> mtlCmdEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults);
+	[mtlCmdEnc setBuffer: getVisibilityResultMTLBuffer() offset: getVisibilityResultOffset(firstQuery) atIndex: index];
+	return mtlCmdEnc;
 }
 
 void MVKOcclusionQueryPool::beginQueryAddedTo(uint32_t query, MVKCommandBuffer* cmdBuffer) {
@@ -332,14 +301,12 @@
         _queryIndexOffset = 0;
 
         // Ensure we don't overflow the maximum number of queries
-        uint32_t queryCount = pCreateInfo->queryCount;
-        VkDeviceSize reqBuffLen = (VkDeviceSize)queryCount * kMVKQuerySlotSizeInBytes;
+        VkDeviceSize reqBuffLen = (VkDeviceSize)pCreateInfo->queryCount * kMVKQuerySlotSizeInBytes;
         VkDeviceSize maxBuffLen = _device->_pMetalFeatures->maxQueryBufferSize;
         VkDeviceSize newBuffLen = min(reqBuffLen, maxBuffLen);
-        queryCount = uint32_t(newBuffLen / kMVKQuerySlotSizeInBytes);
 
         if (reqBuffLen > maxBuffLen) {
-            reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "vkCreateQueryPool(): Each query pool can support a maximum of %d queries.", queryCount);
+            reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "vkCreateQueryPool(): Each query pool can support a maximum of %d queries.", uint32_t(newBuffLen / kMVKQuerySlotSizeInBytes));
         }
 
         NSUInteger mtlBuffLen = mvkAlignByteCount(newBuffLen, _device->_pMetalFeatures->mtlBufferAlignment);
@@ -358,10 +325,132 @@
 
 
 #pragma mark -
+#pragma mark MVKGPUCounterQueryPool
+
+MVKGPUCounterQueryPool::MVKGPUCounterQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo) :
+	MVKQueryPool(device, pCreateInfo, 1), _mtlCounterBuffer(nil) {}
+
+// To establish the Metal counter sample buffer, this must be called from the constructors
+// of subclasses, because the type of MTLCounterSet is determined by the subclass.
+void MVKGPUCounterQueryPool::initMTLCounterSampleBuffer(const VkQueryPoolCreateInfo* pCreateInfo,
+														id<MTLCounterSet> mtlCounterSet,
+														const char* queryTypeName) {
+	if ( !_device->_pMetalFeatures->counterSamplingPoints ) { return; }
+
+	@autoreleasepool {
+		MTLCounterSampleBufferDescriptor* tsDesc = [[[MTLCounterSampleBufferDescriptor alloc] init] autorelease];
+		tsDesc.counterSet = mtlCounterSet;
+		tsDesc.storageMode = MTLStorageModeShared;
+		tsDesc.sampleCount = pCreateInfo->queryCount;
+
+		NSError* err = nil;
+		_mtlCounterBuffer = [getMTLDevice() newCounterSampleBufferWithDescriptor: tsDesc error: &err];
+		if (err) {
+			setConfigurationResult(reportError(VK_ERROR_INITIALIZATION_FAILED,
+											   "Could not create MTLCounterSampleBuffer for query pool of type %s. Reverting to emulated behavior. (Error code %li): %s",
+											   queryTypeName, (long)err.code, err.localizedDescription.UTF8String));
+		}
+	}
+};
+
+MVKGPUCounterQueryPool::~MVKGPUCounterQueryPool() {
+	[_mtlCounterBuffer release];
+}
+
+
+#pragma mark -
+#pragma mark MVKTimestampQueryPool
+
+void MVKTimestampQueryPool::endQuery(uint32_t query, MVKCommandEncoder* cmdEncoder) {
+	cmdEncoder->markTimestamp(this, query);
+	MVKQueryPool::endQuery(query, cmdEncoder);
+}
+
+// If not using MTLCounterSampleBuffer, update timestamp values, then mark queries as available
+void MVKTimestampQueryPool::finishQueries(const MVKArrayRef<uint32_t>& queries) {
+	if ( !_mtlCounterBuffer ) {
+		uint64_t ts = mvkGetTimestamp();
+		for (uint32_t qry : queries) { _timestamps[qry] = ts; }
+	}
+	MVKQueryPool::finishQueries(queries);
+}
+
+NSData* MVKTimestampQueryPool::getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) {
+	if (_mtlCounterBuffer) {
+		return [_mtlCounterBuffer resolveCounterRange: NSMakeRange(firstQuery, queryCount)];
+	} else {
+		return [NSData dataWithBytesNoCopy: (void*)&_timestamps[firstQuery]
+									length: queryCount * kMVKQuerySlotSizeInBytes
+							  freeWhenDone: false];
+	}
+}
+
+void MVKTimestampQueryPool::encodeDirectCopyResults(MVKCommandEncoder* cmdEncoder,
+													uint32_t firstQuery,
+													uint32_t queryCount,
+													MVKBuffer* destBuffer,
+													VkDeviceSize destOffset,
+													VkDeviceSize stride) {
+	if (_mtlCounterBuffer) {
+		id<MTLBlitCommandEncoder> mtlBlitCmdEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyQueryPoolResults);
+		[mtlBlitCmdEnc resolveCounters: _mtlCounterBuffer
+							   inRange: NSMakeRange(firstQuery,  queryCount)
+					 destinationBuffer: destBuffer->getMTLBuffer()
+					 destinationOffset: destBuffer->getMTLBufferOffset() + destOffset];
+	} else {
+		MVKQueryPool::encodeDirectCopyResults(cmdEncoder, firstQuery, queryCount, destBuffer, destOffset, stride);
+	}
+}
+
+id<MTLBuffer> MVKTimestampQueryPool::getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) {
+	const MVKMTLBufferAllocation* tempBuff = cmdEncoder->getTempMTLBuffer(queryCount * _queryElementCount * sizeof(uint64_t));
+	void* pBuffData = tempBuff->getContents();
+	size_t size = queryCount * _queryElementCount * sizeof(uint64_t);
+	memcpy(pBuffData, &_timestamps[firstQuery], size);
+	offset = tempBuff->_offset;
+	return tempBuff->_mtlBuffer;
+}
+
+id<MTLComputeCommandEncoder> MVKTimestampQueryPool::encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) {
+	if (_mtlCounterBuffer) {
+		// We first need to resolve from the MTLCounterSampleBuffer into a temp buffer using a
+		// MTLBlitCommandEncoder, before creating the compute encoder and setting that temp buffer on it.
+		const MVKMTLBufferAllocation* tempBuff = cmdEncoder->getTempMTLBuffer(queryCount * _queryElementCount * sizeof(uint64_t));
+		id<MTLBlitCommandEncoder> mtlBlitCmdEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyQueryPoolResults);
+		[mtlBlitCmdEnc resolveCounters: _mtlCounterBuffer
+							   inRange: NSMakeRange(firstQuery,  queryCount)
+					 destinationBuffer: tempBuff->_mtlBuffer
+					 destinationOffset: tempBuff->_offset];
+
+		id<MTLComputeCommandEncoder> mtlCmdEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults);
+		[mtlCmdEnc setBuffer: tempBuff->_mtlBuffer offset: tempBuff->_offset atIndex: index];
+		return mtlCmdEnc;
+	} else {
+		// We can set the timestamp bytes into the compute encoder.
+		id<MTLComputeCommandEncoder> mtlCmdEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults);
+		cmdEncoder->setComputeBytes(mtlCmdEnc, &_timestamps[firstQuery], queryCount * _queryElementCount * sizeof(uint64_t), index);
+		return mtlCmdEnc;
+	}
+}
+
+
+#pragma mark Construction
+
+MVKTimestampQueryPool::MVKTimestampQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo) :
+	MVKGPUCounterQueryPool(device, pCreateInfo) {
+
+		initMTLCounterSampleBuffer(pCreateInfo, _device->getTimestampMTLCounterSet(), "VK_QUERY_TYPE_TIMESTAMP");
+
+		// If we don't use a MTLCounterSampleBuffer, allocate memory to hold the timestamps.
+		if ( !_mtlCounterBuffer ) { _timestamps.resize(pCreateInfo->queryCount, 0); }
+}
+
+
+#pragma mark -
 #pragma mark MVKPipelineStatisticsQueryPool
 
 MVKPipelineStatisticsQueryPool::MVKPipelineStatisticsQueryPool(MVKDevice* device,
-															   const VkQueryPoolCreateInfo* pCreateInfo) : MVKQueryPool(device, pCreateInfo, 1) {
+															   const VkQueryPoolCreateInfo* pCreateInfo) : MVKGPUCounterQueryPool(device, pCreateInfo) {
 	if ( !_device->_enabledFeatures.pipelineStatisticsQuery ) {
 		setConfigurationResult(reportError(VK_ERROR_FEATURE_NOT_PRESENT, "vkCreateQueryPool: VK_QUERY_TYPE_PIPELINE_STATISTICS is not supported."));
 	}
diff --git a/MoltenVK/MoltenVK/Utility/MVKFoundation.h b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
index e061142..1fd226c 100644
--- a/MoltenVK/MoltenVK/Utility/MVKFoundation.h
+++ b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
@@ -91,7 +91,8 @@
     kMVKCommandUseTessellationVertexTessCtl,    /**< vkCmdDraw* - vertex and tessellation control stages. */
 	kMVKCommandUseMultiviewInstanceCountAdjust, /**< vkCmdDrawIndirect* - adjust instance count for multiview. */
     kMVKCommandUseCopyQueryPoolResults,         /**< vkCmdCopyQueryPoolResults. */
-    kMVKCommandUseAccumOcclusionQuery           /**< Any command terminating a Metal render pass with active visibility buffer. */
+    kMVKCommandUseAccumOcclusionQuery,          /**< Any command terminating a Metal render pass with active visibility buffer. */
+	kMVKCommandUseRecordGPUCounterSample        /**< Any command triggering the recording of a GPU counter sample. */
 } MVKCommandUse;
 
 /** Represents a given stage of a graphics pipeline. */