Merge pull request #1744 from billhollings/fix-prefilling-mem-leaks

Fix memory leaks when configured for prefilling Metal command buffers.
diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md
index 1801aaa..b7b0a5f 100644
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -29,6 +29,8 @@
 - Support config option to automatically use Metal argument buffers when `VK_EXT_descriptor_indexing` 
   extension is enabled. `MVKConfiguration::useMetalArgumentBuffers` (`MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS`) 
   is now an enum field. The use of Metal argument buffers is still disabled by default (`MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS_NEVER`).
+- Fix memory leaks when configured for prefilling Metal command buffers.
+- `MVKConfiguration` replace boolean `prefillMetalCommandBuffers` with enumeration.
 - `MVKPipeline`: Add builtins that are read but not written to tessellation pipelines.
 - Fix occassional crash from retention of `MVKSwapchain` for future drawable presentations.
 - Fix crash in `vkCreateSwapchainKHR()` on macOS 10.14 and earlier
diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
index c97b41c..fa4d957 100644
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -110,9 +110,18 @@
 	MVK_CONFIG_VK_SEMAPHORE_SUPPORT_STYLE_METAL_EVENTS_WHERE_SAFE = 1,	/**< Use Metal events (MTLEvent) when available on the platform, and where safe. This will revert to same as MVK_CONFIG_VK_SEMAPHORE_USE_SINGLE_QUEUE on some NVIDIA GPUs and Rosetta2, due to potential challenges with MTLEvents on those platforms, or in older environments where MTLEvents are not supported. */
 	MVK_CONFIG_VK_SEMAPHORE_SUPPORT_STYLE_METAL_EVENTS            = 2,	/**< Always use Metal events (MTLEvent) when available on the platform. This will revert to same as MVK_CONFIG_VK_SEMAPHORE_USE_SINGLE_QUEUE in older environments where MTLEvents are not supported. */
 	MVK_CONFIG_VK_SEMAPHORE_SUPPORT_STYLE_CALLBACK                = 3,	/**< Use CPU callbacks upon GPU submission completion. This is the slowest technique, but allows multiple queues, compared to MVK_CONFIG_VK_SEMAPHORE_USE_SINGLE_QUEUE. */
-	MVK_CONFIG_VK_SEMAPHORE_MAX_ENUM                              = 0x7FFFFFFF
+	MVK_CONFIG_VK_SEMAPHORE_SUPPORT_STYLE_MAX_ENUM                = 0x7FFFFFFF
 } MVKVkSemaphoreSupportStyle;
 
+/** Identifies the style of Metal command buffer pre-filling to be used. */
+typedef enum MVKPrefillMetalCommandBuffersStyle {
+	MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_NO_PREFILL                        = 0,	/**< During Vulkan command buffer filling, do not prefill a Metal command buffer for each Vulkan command buffer. A single Metal command buffer is created and encoded for all the Vulkan command buffers included when vkQueueSubmit() is called. MoltenVK automatically creates and drains a single Metal object autorelease pool when vkQueueSubmit() is called. This is the fastest option, but potentially has the largest memory footprint. */
+	MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_DEFERRED_ENCODING                 = 1,	/**< During Vulkan command buffer filling, encode to the Metal command buffer when vkEndCommandBuffer() is called. MoltenVK automatically creates and drains a single Metal object autorelease pool when vkEndCommandBuffer() is called. This option has the fastest performance, and the largest memory footprint, of the prefilling options using autorelease pools. */
+	MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_IMMEDIATE_ENCODING                = 2,	/**< During Vulkan command buffer filling, immediately encode to the Metal command buffer, as each command is submitted to the Vulkan command buffer, and do not retain any command content in the Vulkan command buffer. MoltenVK automatically creates and drains a Metal object autorelease pool for each and every command added to the Vulkan command buffer. This option has the smallest memory footprint, and the slowest performance, of the prefilling options using autorelease pools. */
+	MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_IMMEDIATE_ENCODING_NO_AUTORELEASE = 3,	/**< During Vulkan command buffer filling, immediately encode to the Metal command buffer, as each command is submitted to the Vulkan command buffer, do not retain any command content in the Vulkan command buffer, and assume the app will ensure that each thread that fills commands into a Vulkan command buffer has a Metal autorelease pool. MoltenVK will not create and drain any autorelease pools during encoding. This is the fastest prefilling option, and generally has a small memory footprint, depending on when the app-provided autorelease pool drains. */
+	MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_MAX_ENUM                          = 0x7FFFFFFF
+} MVKPrefillMetalCommandBuffersStyle;
+
 /**
  * MoltenVK configuration settings.
  *
@@ -206,25 +215,31 @@
 	VkBool32 synchronousQueueSubmits;
 
 	/**
-	 * If enabled, where possible, a Metal command buffer will be created and filled when each
-	 * Vulkan command buffer is filled. For applications that parallelize the filling of Vulkan
+	 * If set to MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_NO_PREFILL, a single Metal
+	 * command buffer will be created and filled when the Vulkan command buffers are submitted
+	 * to the Vulkan queue. This allows a single Metal command buffer to be used for all of the
+	 * Vulkan command buffers in a queue submission. The Metal command buffer is filled on the
+	 * thread that processes the command queue submission.
+	 *
+	 * If set to any value other than MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_NO_PREFILL,
+	 * where possible, a Metal command buffer will be created and filled when each Vulkan
+	 * command buffer is filled. For applications that parallelize the filling of Vulkan
 	 * commmand buffers across multiple threads, this allows the Metal command buffers to also
 	 * be filled on the same parallel thread. Because each command buffer is filled separately,
-	 * this requires that each Vulkan command buffer requires a dedicated Metal command buffer.
+	 * this requires that each Vulkan command buffer have a dedicated Metal command buffer.
 	 *
-	 * If disabled, a single Metal command buffer will be created and filled when the Vulkan
-	 * command buffers are submitted to the Vulkan queue. This allows a single Metal command
-	 * buffer to be used for all of the Vulkan command buffers in a queue submission. The
-	 * Metal command buffer is filled on the thread that processes the command queue submission.
+	 * See the definition of the MVKPrefillMetalCommandBuffersStyle enumeration above for
+	 * descriptions of the various values that can be used for this setting. The differences
+	 * are primarily distinguished by how memory recovery is handled for autoreleased Metal
+	 * objects that are created under the covers as the commands added to the Vulkan command
+	 * buffer are encoded into the corresponding Metal command buffer. You can decide whether
+	 * your app will recover all autoreleased Metal objects, or how agressively MoltenVK should
+	 * recover autoreleased Metal objects, based on your approach to command buffer filling.
 	 *
 	 * Depending on the nature of your application, you may find performance is improved by filling
 	 * the Metal command buffers on parallel threads, or you may find that performance is improved by
 	 * consolidating all Vulkan command buffers onto a single Metal command buffer during queue submission.
 	 *
-	 * Prefilling of a Metal command buffer will not occur during the filling of secondary command
-	 * buffers (VK_COMMAND_BUFFER_LEVEL_SECONDARY), or for primary command buffers that are intended
-	 * to be submitted to multiple queues concurrently (VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT).
-	 *
 	 * When enabling this feature, be aware that one Metal command buffer is required for each Vulkan
 	 * command buffer. Depending on the number of command buffers that you use, you may also need to
 	 * change the value of the maxActiveMetalCommandBuffersPerQueue setting.
@@ -235,6 +250,10 @@
 	 * the concept of being reset after being filled. Depending on when and how often you do this,
 	 * it may cause unexpected visual artifacts and unnecessary GPU load.
 	 *
+	 * Prefilling of a Metal command buffer will not occur during the filling of secondary command
+	 * buffers (VK_COMMAND_BUFFER_LEVEL_SECONDARY), or for primary command buffers that are intended
+	 * to be submitted to multiple queues concurrently (VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT).
+	 *
 	 * This feature is incompatible with updating descriptors after binding. If any of the
 	 * *UpdateAfterBind feature flags of VkPhysicalDeviceDescriptorIndexingFeatures or
 	 * VkPhysicalDeviceInlineUniformBlockFeatures have been enabled, the value of this
@@ -243,14 +262,15 @@
 	 * The value of this parameter may be changed at any time during application runtime,
 	 * and the changed value will immediately effect subsequent MoltenVK behaviour.
 	 * Specifically, this parameter can be enabled when filling some command buffers,
-	 * and disabled when filling others.
+	 * and disabled when later filling others.
 	 *
 	 * The initial value or this parameter is set by the
 	 * MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS
 	 * runtime environment variable or MoltenVK compile-time build setting.
-	 * If neither is set, the value of this parameter defaults to false.
+	 * If neither is set, the value of this parameter defaults to
+	 * MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_NO_PREFILL.
 	 */
-	VkBool32 prefillMetalCommandBuffers;
+	MVKPrefillMetalCommandBuffersStyle prefillMetalCommandBuffers;
 
 	/**
 	 * The maximum number of Metal command buffers that can be concurrently active per Vulkan queue.
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
index c5fb691..87ae599 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
@@ -96,16 +96,16 @@
 	void addCommand(MVKCommand* command);
 
 	/** Returns the number of commands currently in this command buffer. */
-	inline uint32_t getCommandCount() { return _commandCount; }
+	uint32_t getCommandCount() { return _commandCount; }
 
 	/** Returns the command pool backing this command buffer. */
-	inline MVKCommandPool* getCommandPool() { return _commandPool; }
+	MVKCommandPool* getCommandPool() { return _commandPool; }
 
 	/** Submit the commands in this buffer as part of the queue submission. */
 	void submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit, MVKCommandEncodingContext* pEncodingContext);
 
     /** Returns whether this command buffer can be submitted to a queue more than once. */
-    inline bool getIsReusable() { return _isReusable; }
+    bool getIsReusable() { return _isReusable; }
 
     /**
      * Metal requires that a visibility buffer is established when a render pass is created, 
@@ -159,13 +159,13 @@
      * Returns a reference to this object suitable for use as a Vulkan API handle.
      * This is the compliment of the getMVKCommandBuffer() method.
      */
-    inline VkCommandBuffer getVkCommandBuffer() { return (VkCommandBuffer)getVkHandle(); }
+	VkCommandBuffer getVkCommandBuffer() { return (VkCommandBuffer)getVkHandle(); }
 
     /**
      * Retrieves the MVKCommandBuffer instance referenced by the VkCommandBuffer handle.
      * This is the compliment of the getVkCommandBuffer() method.
      */
-    static inline MVKCommandBuffer* getMVKCommandBuffer(VkCommandBuffer vkCommandBuffer) {
+    static MVKCommandBuffer* getMVKCommandBuffer(VkCommandBuffer vkCommandBuffer) {
         return (MVKCommandBuffer*)getDispatchableObject(vkCommandBuffer);
     }
 
@@ -177,12 +177,11 @@
 	void propagateDebugName() override {}
 	void init(const VkCommandBufferAllocateInfo* pAllocateInfo);
 	bool canExecute();
-	bool canPrefill();
-	void prefill();
 	void clearPrefilledMTLCommandBuffer();
     void releaseCommands(MVKCommand* command);
 	void releaseRecordedCommands();
-    void flushImmediateCmdEncoder();
+	void flushImmediateCmdEncoder();
+	void checkDeferredEncoding();
 
 	MVKCommand* _head = nullptr;
 	MVKCommand* _tail = nullptr;
@@ -471,18 +470,24 @@
 
 #pragma mark Construction
 
-	MVKCommandEncoder(MVKCommandBuffer* cmdBuffer);
+	MVKCommandEncoder(MVKCommandBuffer* cmdBuffer,
+					  MVKPrefillMetalCommandBuffersStyle prefillStyle = MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_NO_PREFILL);
+
+	~MVKCommandEncoder() override;
 
 protected:
     void addActivatedQueries(MVKQueryPool* pQueryPool, uint32_t query, uint32_t queryCount);
     void finishQueries();
 	void setSubpass(MVKCommand* passCmd, VkSubpassContents subpassContents, uint32_t subpassIndex);
 	void clearRenderArea();
-    NSString* getMTLRenderCommandEncoderName(MVKCommandUse cmdUse);
+	void encodeCommandsImpl(MVKCommand* command);
 	void encodeGPUCounterSample(MVKGPUCounterQueryPool* mvkQryPool, uint32_t sampleIndex, MVKCounterSamplingFlags samplingPoints);
 	void encodeTimestampStageCounterSamples();
 	id<MTLFence> getStageCountersMTLFence();
 	MVKArrayRef<MTLSamplePosition> getCustomSamplePositions();
+	NSString* getMTLRenderCommandEncoderName(MVKCommandUse cmdUse);
+	template<typename T> void retainIfImmediatelyEncoding(T& mtlEnc);
+	template<typename T> void endMetalEncoding(T& mtlEnc);
 
 	typedef struct GPUCounterQuery {
 		MVKGPUCounterQueryPool* queryPool = nullptr;
@@ -506,12 +511,13 @@
 	MVKPushConstantsCommandEncoderState _fragmentPushConstants;
 	MVKPushConstantsCommandEncoderState _computePushConstants;
     MVKOcclusionQueryCommandEncoderState _occlusionQueryState;
+	MVKPrefillMetalCommandBuffersStyle _prefillStyle;
 	VkSubpassContents _subpassContents;
-	MVKCommandUse _mtlComputeEncoderUse;
-	MVKCommandUse _mtlBlitEncoderUse;
 	uint32_t _renderSubpassIndex;
 	uint32_t _multiviewPassIndex;
-    uint32_t _flushCount = 0;
+    uint32_t _flushCount;
+	MVKCommandUse _mtlComputeEncoderUse;
+	MVKCommandUse _mtlBlitEncoderUse;
 	bool _isRenderingEntireAttachment;
 };
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
index 0f4c5f5..32bb365 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
@@ -95,18 +95,19 @@
 		}
 	}
 
-    if(canPrefill()) {
-        @autoreleasepool {
-            uint32_t qIdx = 0;
-            _prefilledMTLCmdBuffer = _commandPool->newMTLCommandBuffer(qIdx);    // retain
-            
-            _immediateCmdEncodingContext = new MVKCommandEncodingContext;
-            
-            _immediateCmdEncoder = new MVKCommandEncoder(this);
-            _immediateCmdEncoder->beginEncoding(_prefilledMTLCmdBuffer, _immediateCmdEncodingContext);
-        }
+    if(_device->shouldPrefillMTLCommandBuffers() && !(_isSecondary || _supportsConcurrentExecution)) {
+		@autoreleasepool {
+			_prefilledMTLCmdBuffer = [_commandPool->getMTLCommandBuffer(0) retain];    // retained
+			auto prefillStyle = mvkConfig().prefillMetalCommandBuffers;
+			if (prefillStyle == MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_IMMEDIATE_ENCODING ||
+				prefillStyle == MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_IMMEDIATE_ENCODING_NO_AUTORELEASE ) {
+				_immediateCmdEncodingContext = new MVKCommandEncodingContext;
+				_immediateCmdEncoder = new MVKCommandEncoder(this, prefillStyle);
+				_immediateCmdEncoder->beginEncoding(_prefilledMTLCmdBuffer, _immediateCmdEncodingContext);
+			}
+		}
     }
-    
+
     return getConfigurationResult();
 }
 
@@ -163,10 +164,25 @@
 	_canAcceptCommands = false;
     
     flushImmediateCmdEncoder();
-    
+	checkDeferredEncoding();
+
 	return getConfigurationResult();
 }
 
+void MVKCommandBuffer::checkDeferredEncoding() {
+	if (mvkConfig().prefillMetalCommandBuffers == MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_DEFERRED_ENCODING) {
+		@autoreleasepool {
+			MVKCommandEncodingContext encodingContext;
+			MVKCommandEncoder encoder(this);
+			encoder.encode(_prefilledMTLCmdBuffer, &encodingContext);
+
+			// Once encoded onto Metal, if this command buffer is not reusable, we don't need the
+			// MVKCommand instances anymore, so release them in order to reduce memory pressure.
+			if ( !_isReusable ) { releaseRecordedCommands(); }
+		}
+	}
+}
+
 void MVKCommandBuffer::addCommand(MVKCommand* command) {
     if ( !_canAcceptCommands ) {
         setConfigurationResult(reportError(VK_NOT_READY, "Command buffer cannot accept commands before vkBeginCommandBuffer() is called."));
@@ -177,7 +193,6 @@
 
     if(_immediateCmdEncoder) {
         _immediateCmdEncoder->encodeCommands(command);
-        
         if( !_isReusable ) {
             releaseCommands(command);
             return;
@@ -225,11 +240,6 @@
 	return true;
 }
 
-bool MVKCommandBuffer::canPrefill() {
-	bool wantPrefill = _device->shouldPrefillMTLCommandBuffers();
-	return wantPrefill && !(_isSecondary || _supportsConcurrentExecution);
-}
-
 void MVKCommandBuffer::clearPrefilledMTLCommandBuffer() {
 
 	// Metal command buffers do not return to their pool on release, nor do they support the
@@ -333,7 +343,18 @@
     setLabelIfNotNil(_mtlCmdBuffer, _cmdBuffer->_debugName);
 }
 
+// Multithread autorelease prefill style uses a dedicated autorelease pool when encoding each command.
 void MVKCommandEncoder::encodeCommands(MVKCommand* command) {
+	if (_prefillStyle == MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_IMMEDIATE_ENCODING) {
+		@autoreleasepool {
+			encodeCommandsImpl(command);
+		}
+	} else {
+		encodeCommandsImpl(command);
+	}
+}
+
+void MVKCommandEncoder::encodeCommandsImpl(MVKCommand* command) {
     while(command) {
         uint32_t prevMVPassIdx = _multiviewPassIndex;
         command->encode(this);
@@ -466,6 +487,21 @@
 	_dynamicSamplePositions.assign(dynamicSamplePositions.begin(), dynamicSamplePositions.end());
 }
 
+// Retain encoders when prefilling, because prefilling may span multiple autorelease pools.
+template<typename T>
+void MVKCommandEncoder::retainIfImmediatelyEncoding(T& mtlEnc) {
+	if (_cmdBuffer->_immediateCmdEncoder) { [mtlEnc retain]; }
+}
+
+// End Metal encoder and release retained encoders when immediately encoding.
+template<typename T>
+void MVKCommandEncoder::endMetalEncoding(T& mtlEnc) {
+	[mtlEnc endEncoding];
+	if (_cmdBuffer->_immediateCmdEncoder) { [mtlEnc release]; }
+	mtlEnc = nil;
+}
+
+
 // Creates _mtlRenderEncoder and marks cached render state as dirty so it will be set into the _mtlRenderEncoder.
 void MVKCommandEncoder::beginMetalRenderPass(MVKCommandUse cmdUse) {
 
@@ -525,7 +561,8 @@
 		[mtlRPDesc setSamplePositions: cstmSampPosns.data count: cstmSampPosns.size];
 	}
 
-    _mtlRenderEncoder = [_mtlCmdBuffer renderCommandEncoderWithDescriptor: mtlRPDesc];     // not retained
+    _mtlRenderEncoder = [_mtlCmdBuffer renderCommandEncoderWithDescriptor: mtlRPDesc];
+	retainIfImmediatelyEncoding(_mtlRenderEncoder);
 	setLabelIfNotNil(_mtlRenderEncoder, getMTLRenderCommandEncoderName(cmdUse));
 
 	// We shouldn't clear the render area if we are restarting the Metal renderpass
@@ -748,8 +785,7 @@
     if (_mtlRenderEncoder == nil) { return; }
 
 	if (_cmdBuffer->_hasStageCounterTimestampCommand) { [_mtlRenderEncoder updateFence: getStageCountersMTLFence() afterStages: MTLRenderStageFragment]; }
-    [_mtlRenderEncoder endEncoding];
-	_mtlRenderEncoder = nil;    // not retained
+	endMetalEncoding(_mtlRenderEncoder);
 
 	getSubpass()->resolveUnresolvableAttachments(this, _attachments.contents());
 
@@ -776,13 +812,11 @@
 	_computePushConstants.markDirty();
 
 	if (_mtlComputeEncoder && _cmdBuffer->_hasStageCounterTimestampCommand) { [_mtlComputeEncoder updateFence: getStageCountersMTLFence()]; }
-	[_mtlComputeEncoder endEncoding];
-	_mtlComputeEncoder = nil;       // not retained
+	endMetalEncoding(_mtlComputeEncoder);
 	_mtlComputeEncoderUse = kMVKCommandUseNone;
 
 	if (_mtlBlitEncoder && _cmdBuffer->_hasStageCounterTimestampCommand) { [_mtlBlitEncoder updateFence: getStageCountersMTLFence()]; }
-	[_mtlBlitEncoder endEncoding];
-	_mtlBlitEncoder = nil;          // not retained
+	endMetalEncoding(_mtlBlitEncoder);
     _mtlBlitEncoderUse = kMVKCommandUseNone;
 
 	encodeTimestampStageCounterSamples();
@@ -791,7 +825,8 @@
 id<MTLComputeCommandEncoder> MVKCommandEncoder::getMTLComputeEncoder(MVKCommandUse cmdUse) {
 	if ( !_mtlComputeEncoder ) {
 		endCurrentMetalEncoding();
-		_mtlComputeEncoder = [_mtlCmdBuffer computeCommandEncoder];		// not retained
+		_mtlComputeEncoder = [_mtlCmdBuffer computeCommandEncoder];
+		retainIfImmediatelyEncoding(_mtlComputeEncoder);
 		beginMetalComputeEncoding(cmdUse);
 	}
 	if (_mtlComputeEncoderUse != cmdUse) {
@@ -804,7 +839,8 @@
 id<MTLBlitCommandEncoder> MVKCommandEncoder::getMTLBlitEncoder(MVKCommandUse cmdUse) {
 	if ( !_mtlBlitEncoder ) {
 		endCurrentMetalEncoding();
-		_mtlBlitEncoder = [_mtlCmdBuffer blitCommandEncoder];   // not retained
+		_mtlBlitEncoder = [_mtlCmdBuffer blitCommandEncoder];
+		retainIfImmediatelyEncoding(_mtlBlitEncoder);
 	}
     if (_mtlBlitEncoderUse != cmdUse) {
         _mtlBlitEncoderUse = cmdUse;
@@ -978,7 +1014,7 @@
 		// in Xcode 13 as inaccurate for all platforms. Leave this value at 1 until we can figure out how to
 		// accurately determine the length of sampleBufferAttachments on each platform.
 		uint32_t maxMTLBlitPassSampleBuffers = 1;		// Was MTLMaxBlitPassSampleBuffers API definition
-		auto* bpDesc = [[[MTLBlitPassDescriptor alloc] init] autorelease];
+		auto* bpDesc = [MTLBlitPassDescriptor new];		// temp retained
 		for (uint32_t attIdx = 0; attIdx < maxMTLBlitPassSampleBuffers && qIdx < qCnt; attIdx++, qIdx++) {
 			auto* sbAttDesc = bpDesc.sampleBufferAttachments[attIdx];
 			auto& tsQry = _timestampStageCounterQueries[qIdx];
@@ -993,6 +1029,7 @@
 
 		auto* mtlEnc = [_mtlCmdBuffer blitCommandEncoderWithDescriptor: bpDesc];
 		setLabelIfNotNil(mtlEnc, mvkMTLBlitCommandEncoderLabel(kMVKCommandUseRecordGPUCounterSample));
+		[bpDesc release];		// Release temp object
 		[mtlEnc waitForFence: getStageCountersMTLFence()];
 		[mtlEnc fillBuffer: _device->getDummyBlitMTLBuffer() range: NSMakeRange(0, 1) value: 0];
 		[mtlEnc endEncoding];
@@ -1046,7 +1083,8 @@
 
 #pragma mark Construction
 
-MVKCommandEncoder::MVKCommandEncoder(MVKCommandBuffer* cmdBuffer) : MVKBaseDeviceObject(cmdBuffer->getDevice()),
+MVKCommandEncoder::MVKCommandEncoder(MVKCommandBuffer* cmdBuffer,
+									 MVKPrefillMetalCommandBuffersStyle prefillStyle) : MVKBaseDeviceObject(cmdBuffer->getDevice()),
         _cmdBuffer(cmdBuffer),
         _graphicsPipelineState(this),
         _computePipelineState(this),
@@ -1063,7 +1101,8 @@
         _tessEvalPushConstants(this, VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT),
         _fragmentPushConstants(this, VK_SHADER_STAGE_FRAGMENT_BIT),
         _computePushConstants(this, VK_SHADER_STAGE_COMPUTE_BIT),
-        _occlusionQueryState(this) {
+        _occlusionQueryState(this),
+		_prefillStyle(prefillStyle){
 
             _pDeviceFeatures = &_device->_enabledFeatures;
             _pDeviceMetalFeatures = _device->_pMetalFeatures;
@@ -1078,6 +1117,14 @@
             _mtlBlitEncoderUse = kMVKCommandUseNone;
 			_pEncodingContext = nullptr;
 			_stageCountersMTLFence = nil;
+			_flushCount = 0;
+}
+
+MVKCommandEncoder::~MVKCommandEncoder() {
+	[_mtlRenderEncoder release];
+	[_mtlComputeEncoder release];
+	[_mtlBlitEncoder release];
+	// _stageCountersMTLFence is released after Metal command buffer completion
 }
 
 
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandPool.h b/MoltenVK/MoltenVK/Commands/MVKCommandPool.h
index 977f963..e7b7b0c 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandPool.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandPool.h
@@ -82,7 +82,7 @@
 	 * Returns a retained MTLCommandBuffer created from the indexed queue
 	 * within the queue family for which this command pool was created.
 	 */
-	id<MTLCommandBuffer> newMTLCommandBuffer(uint32_t queueIndex);
+	id<MTLCommandBuffer> getMTLCommandBuffer(uint32_t queueIndex);
 
 	/** Release any held but unused memory back to the system. */
 	void trim();
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandPool.mm b/MoltenVK/MoltenVK/Commands/MVKCommandPool.mm
index 0e0d1f3..17a8d63 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandPool.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandPool.mm
@@ -77,8 +77,8 @@
 	}
 }
 
-id<MTLCommandBuffer> MVKCommandPool::newMTLCommandBuffer(uint32_t queueIndex) {
-	return [_device->getQueue(_queueFamilyIndex, queueIndex)->getMTLCommandBuffer(kMVKCommandUseEndCommandBuffer, true) retain];
+id<MTLCommandBuffer> MVKCommandPool::getMTLCommandBuffer(uint32_t queueIndex) {
+	return _device->getQueue(_queueFamilyIndex, queueIndex)->getMTLCommandBuffer(kMVKCommandUseEndCommandBuffer, true);
 }
 
 // Clear the command type pool member variables.
diff --git a/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp b/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
index c3a8805..255fb05 100644
--- a/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
+++ b/MoltenVK/MoltenVK/Utility/MVKEnvironment.cpp
@@ -30,7 +30,7 @@
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.debugMode,                              MVK_DEBUG);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.shaderConversionFlipVertexY,            MVK_CONFIG_SHADER_CONVERSION_FLIP_VERTEX_Y);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.synchronousQueueSubmits,                MVK_CONFIG_SYNCHRONOUS_QUEUE_SUBMITS);
-	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.prefillMetalCommandBuffers,             MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS);
+	MVK_SET_FROM_ENV_OR_BUILD_INT32 (evCfg.prefillMetalCommandBuffers,             MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS);
 	MVK_SET_FROM_ENV_OR_BUILD_INT32 (evCfg.maxActiveMetalCommandBuffersPerQueue,   MVK_CONFIG_MAX_ACTIVE_METAL_COMMAND_BUFFERS_PER_QUEUE);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.supportLargeQueryPools,                 MVK_CONFIG_SUPPORT_LARGE_QUERY_POOLS);
 	MVK_SET_FROM_ENV_OR_BUILD_BOOL  (evCfg.presentWithCommandBuffer,               MVK_CONFIG_PRESENT_WITH_COMMAND_BUFFER);
diff --git a/MoltenVK/MoltenVK/Utility/MVKEnvironment.h b/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
index 06abb17..f7b36ff 100644
--- a/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
+++ b/MoltenVK/MoltenVK/Utility/MVKEnvironment.h
@@ -105,9 +105,9 @@
 #   define MVK_CONFIG_SYNCHRONOUS_QUEUE_SUBMITS    mvkOSVersionIsAtLeast(MVK_CONFIG_MTLEVENT_MIN_OS)
 #endif
 
-/** Fill a Metal command buffers when each Vulkan command buffer is filled. */
+/** Fill a Metal command buffer when each Vulkan command buffer is filled. */
 #ifndef MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS
-#   define MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS    0
+#   define MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS    MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS_STYLE_NO_PREFILL
 #endif
 
 /**
diff --git a/README.md b/README.md
index 46cc9d4..6b2e200 100644
--- a/README.md
+++ b/README.md
@@ -419,14 +419,14 @@
 ### Memory Management
 
 *Metal*, and other *Objective-C* objects in *Apple's SDK* frameworks, use reference counting for memory management. 
-When instantiating *Objective-C* objects, it is important that you do not rely on implied *autorelease pools* to do 
-memory management for you. Because many *Vulkan* games and apps may be ported from other platforms, they will 
-typically not include autorelease pools in their threading models.
+As a contributor to **MoltenVK**, when instantiating *Objective-C* objects, it is important that you do not rely on 
+the app providing *autorelease pools* to do memory management for you. Because many *Vulkan* games and apps may be 
+ported from other platforms, they will often not automatically include autorelease pools in their threading models.
 
-Avoid the use of the `autorelease` method, or any object creation methods that imply use of `autorelease`,
-(eg- `[NSString stringWithFormat: ]`, etc). Instead, favour object creation methods that return a retained object
-(eg- `[[NSString alloc] initWithFormat: ]`, etc), and manually track and release those objects. If you need to use 
-autoreleased objects, wrap code blocks in an `@autoreleasepool {...}` block.
+As a contributor to **MoltenVK**, avoid the use of the *Metal* `autorelease` method, or any object *Metal* creation 
+methods that imply internal use of `autorelease`, (eg- `[NSString stringWithFormat: ]`, etc). Instead, favor object 
+creation methods that return a retained object (eg- `[[NSString alloc] initWithFormat: ]`, etc), and manually track 
+and release those objects. If you need to use autoreleased objects, wrap your code in an `@autoreleasepool {...}` block.
 
 
 ### Code Formatting