| /* |
| * MVKCommandPipelineStateFactoryShaderSource.h |
| * |
| * Copyright (c) 2015-2021 The Brenwill Workshop Ltd. (http://www.brenwill.com) |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "MVKDevice.h" |
| |
| #import <Foundation/Foundation.h> |
| |
| |
| /** This file contains static MSL source code for the MoltenVK command shaders. */ |
| |
| static NSString* _MVKStaticCmdShaderSource = @" \n\ |
| #include <metal_stdlib> \n\ |
| using namespace metal; \n\ |
| \n\ |
| typedef struct { \n\ |
| float2 a_position [[attribute(0)]]; \n\ |
| float3 a_texCoord [[attribute(1)]]; \n\ |
| } AttributesPosTex; \n\ |
| \n\ |
| typedef struct { \n\ |
| float4 v_position [[position]]; \n\ |
| float3 v_texCoord; \n\ |
| } VaryingsPosTex; \n\ |
| \n\ |
| typedef struct { \n\ |
| float4 v_position [[position]]; \n\ |
| float3 v_texCoord; \n\ |
| uint v_layer [[render_target_array_index]]; \n\ |
| } VaryingsPosTexLayer; \n\ |
| \n\ |
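// Mirror the minimal set of Vulkan types and constants used by the kernels below,                              \n\
// since the Vulkan headers are not available when compiling MSL.                                               \n\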
| typedef size_t VkDeviceSize; \n\ |
| \n\ |
| typedef enum : uint32_t { \n\ |
| VK_FORMAT_BC1_RGB_UNORM_BLOCK = 131, \n\ |
| VK_FORMAT_BC1_RGB_SRGB_BLOCK = 132, \n\ |
| VK_FORMAT_BC1_RGBA_UNORM_BLOCK = 133, \n\ |
| VK_FORMAT_BC1_RGBA_SRGB_BLOCK = 134, \n\ |
| VK_FORMAT_BC2_UNORM_BLOCK = 135, \n\ |
| VK_FORMAT_BC2_SRGB_BLOCK = 136, \n\ |
| VK_FORMAT_BC3_UNORM_BLOCK = 137, \n\ |
| VK_FORMAT_BC3_SRGB_BLOCK = 138, \n\ |
| } VkFormat; \n\ |
| \n\ |
| typedef struct { \n\ |
| uint32_t width; \n\ |
| uint32_t height; \n\ |
| } VkExtent2D; \n\ |
| \n\ |
| typedef struct { \n\ |
| uint32_t width; \n\ |
| uint32_t height; \n\ |
| uint32_t depth; \n\ |
| } __attribute__((packed)) VkExtent3D; \n\ |
| \n\ |
| typedef struct { \n\ |
| int32_t x; \n\ |
| int32_t y; \n\ |
| int32_t z; \n\ |
| } __attribute__((packed)) VkOffset3D; \n\ |
| \n" |
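// Temporarily define MVK_DECOMPRESS_CODE() to stringify its argument, splicing the MSL
// decompression code in MVKDXTnCodec.def into this shader source string.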
| #define MVK_DECOMPRESS_CODE(...) #__VA_ARGS__ |
| #include "MVKDXTnCodec.def" |
| #undef MVK_DECOMPRESS_CODE |
| "\n\ |
| \n\ |
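// Vertex function for image blits. Passes the position and texture coordinate through to rasterization.        \n\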
| vertex VaryingsPosTex vtxCmdBlitImage(AttributesPosTex attributes [[stage_in]]) { \n\ |
| VaryingsPosTex varyings; \n\ |
| varyings.v_position = float4(attributes.a_position, 0.0, 1.0); \n\ |
| varyings.v_texCoord = attributes.a_texCoord; \n\ |
| return varyings; \n\ |
| } \n\ |
| \n\ |
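// Vertex function for layered image blits. Each instance renders one layer: zIncr steps the                    \n\
// texture coordinate's z component, and the instance ID selects the render target layer.                       \n\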
| vertex VaryingsPosTexLayer vtxCmdBlitImageLayered(AttributesPosTex attributes [[stage_in]], \n\ |
| uint instanceID [[instance_id]], \n\ |
| constant float &zIncr [[buffer(0)]]) { \n\ |
| VaryingsPosTexLayer varyings; \n\ |
| varyings.v_position = float4(attributes.a_position, 0.0, 1.0); \n\ |
| varyings.v_texCoord = float3(attributes.a_texCoord.xy, attributes.a_texCoord.z + (instanceID + 0.5) * zIncr);\n\ |
| varyings.v_layer = instanceID; \n\ |
| return varyings; \n\ |
| } \n\ |
| \n\ |
| typedef struct { \n\ |
| uint32_t srcOffset; \n\ |
| uint32_t dstOffset; \n\ |
| uint32_t size; \n\ |
| } CopyInfo; \n\ |
| \n\ |
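// Copy a range of bytes between buffers from a single thread, for copy regions that are                        \n\
// too small or misaligned to be copied in wider units.                                                         \n\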
| kernel void cmdCopyBufferBytes(device uint8_t* src [[ buffer(0) ]], \n\ |
| device uint8_t* dst [[ buffer(1) ]], \n\ |
| constant CopyInfo& info [[ buffer(2) ]]) { \n\ |
| for (size_t i = 0; i < info.size; i++) { \n\ |
| dst[i + info.dstOffset] = src[i + info.srcOffset]; \n\ |
| } \n\ |
| } \n\ |
| \n\ |
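// Fill a buffer with a repeated 32-bit value, one word per thread.                                             \n\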
| kernel void cmdFillBuffer(device uint32_t* dst [[ buffer(0) ]], \n\ |
| constant uint32_t& fillValue [[ buffer(1) ]], \n\ |
| uint pos [[thread_position_in_grid]]) { \n\ |
| dst[pos] = fillValue; \n\ |
| } \n\ |
| \n\ |
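// Clear a 2D texture to a solid color, one texel per thread, with one variant per texel component type.        \n\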
| kernel void cmdClearColorImage2DFloat(texture2d<float, access::write> dst [[ texture(0) ]], \n\ |
| constant float4& clearValue [[ buffer(0) ]], \n\ |
| uint2 pos [[thread_position_in_grid]]) { \n\ |
| dst.write(clearValue, pos); \n\ |
| } \n\ |
| \n\ |
| kernel void cmdClearColorImage2DUInt(texture2d<uint, access::write> dst [[ texture(0) ]], \n\ |
| constant uint4& clearValue [[ buffer(0) ]], \n\ |
| uint2 pos [[thread_position_in_grid]]) { \n\ |
| dst.write(clearValue, pos); \n\ |
| } \n\ |
| \n\ |
| kernel void cmdClearColorImage2DInt(texture2d<int, access::write> dst [[ texture(0) ]], \n\ |
| constant int4& clearValue [[ buffer(0) ]], \n\ |
| uint2 pos [[thread_position_in_grid]]) { \n\ |
| dst.write(clearValue, pos); \n\ |
| } \n\ |
| \n\ |
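// Parameters for buffer-to-image copies. Each stride is a 64-bit value split into low and                      \n\
// high 32-bit words; only the low words are used by the kernels here.                                          \n\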
| typedef struct { \n\ |
| uint32_t srcRowStride; \n\ |
| uint32_t srcRowStrideHigh; \n\ |
| uint32_t srcDepthStride; \n\ |
| uint32_t srcDepthStrideHigh; \n\ |
| uint32_t destRowStride; \n\ |
| uint32_t destRowStrideHigh; \n\ |
| uint32_t destDepthStride; \n\ |
| uint32_t destDepthStrideHigh; \n\ |
| VkFormat format; \n\ |
| VkOffset3D offset; \n\ |
| VkExtent3D extent; \n\ |
| } CmdCopyBufferToImageInfo; \n\ |
| \n\ |
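// Decompress one 4x4 BC1/BC2/BC3 block per thread and write the decoded texels directly into                   \n\
// a 3D texture. The grid indexes blocks, so x and y are texel coordinates in multiples of 4.                   \n\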
| kernel void cmdCopyBufferToImage3DDecompressDXTn(const device uint8_t* src [[buffer(0)]], \n\ |
| texture3d<float, access::write> dest [[texture(0)]], \n\ |
| constant CmdCopyBufferToImageInfo& info [[buffer(2)]], \n\ |
| uint3 pos [[thread_position_in_grid]]) { \n\ |
| uint x = pos.x * 4, y = pos.y * 4, z = pos.z; \n\ |
| VkDeviceSize blockByteCount = isBC1Format(info.format) ? 8 : 16; \n\ |
| \n\ |
| if (x >= info.extent.width || y >= info.extent.height || z >= info.extent.depth) { return; } \n\ |
| \n\ |
| src += z * info.srcDepthStride + y * info.srcRowStride / 4 + x * blockByteCount / 4; \n\ |
| VkExtent2D blockExtent; \n\ |
| blockExtent.width = min(info.extent.width - x, 4u); \n\ |
| blockExtent.height = min(info.extent.height - y, 4u); \n\ |
| uint pixels[16] = {0}; \n\ |
| decompressDXTnBlock(src, pixels, blockExtent, 4 * sizeof(uint), info.format); \n\ |
| for (uint j = 0; j < blockExtent.height; ++j) { \n\ |
| for (uint i = 0; i < blockExtent.width; ++i) { \n\ |
| // The pixel components are in BGRA order, but texture::write wants them \n\ |
| // in RGBA order. We can fix that (ironically) with a BGRA swizzle. \n\ |
| dest.write(unpack_unorm4x8_to_float(pixels[j * 4 + i]).bgra, \n\ |
| uint3(info.offset.x + x + i, info.offset.y + y + j, info.offset.z + z)); \n\ |
| } \n\ |
| } \n\ |
| } \n\ |
| \n\ |
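// As above, but write the decoded RGBA texels into an intermediate buffer rather than                          \n\
// directly into a texture.                                                                                     \n\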
| kernel void cmdCopyBufferToImage3DDecompressTempBufferDXTn(const device uint8_t* src [[buffer(0)]], \n\ |
| device uint8_t* dest [[buffer(1)]], \n\ |
| constant CmdCopyBufferToImageInfo& info [[buffer(2)]],\n\ |
| uint3 pos [[thread_position_in_grid]]) { \n\ |
| uint x = pos.x * 4, y = pos.y * 4, z = pos.z; \n\ |
| VkDeviceSize blockByteCount = isBC1Format(info.format) ? 8 : 16; \n\ |
| \n\ |
| if (x >= info.extent.width || y >= info.extent.height || z >= info.extent.depth) { return; } \n\ |
| \n\ |
| src += z * info.srcDepthStride + y * info.srcRowStride / 4 + x * blockByteCount / 4; \n\ |
| dest += z * info.destDepthStride + y * info.destRowStride + x * sizeof(uint); \n\ |
| VkExtent2D blockExtent; \n\ |
| blockExtent.width = min(info.extent.width - x, 4u); \n\ |
| blockExtent.height = min(info.extent.height - y, 4u); \n\ |
| uint pixels[16] = {0}; \n\ |
| decompressDXTnBlock(src, pixels, blockExtent, 4 * sizeof(uint), info.format); \n\ |
| device uint* destPixel = (device uint*)dest; \n\ |
| for (uint j = 0; j < blockExtent.height; ++j) { \n\ |
| for (uint i = 0; i < blockExtent.width; ++i) { \n\ |
| destPixel[j * info.destRowStride / sizeof(uint) + i] = pixels[j * 4 + i]; \n\ |
| } \n\ |
| } \n\ |
| } \n\ |
| \n\ |
| #if __METAL_VERSION__ >= 210 \n\ |
// This structure is defined in the Metal API headers, but is missing from the MSL headers. :/                  \n\
| struct MTLStageInRegionIndirectArguments { \n\ |
| uint32_t stageInOrigin[3]; \n\ |
| uint32_t stageInSize[3]; \n\ |
| }; \n\ |
| #endif \n\ |
| \n\ |
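// Convert indirect draw arguments for multiview rendering: copy each draw's arguments and                      \n\
// multiply its instance count by the view count, so each view is rendered as an instance.                      \n\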
| kernel void cmdDrawIndirectMultiviewConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\ |
| device MTLDrawPrimitivesIndirectArguments* destBuff [[buffer(1)]],\n\ |
| constant uint32_t& srcStride [[buffer(2)]], \n\ |
| constant uint32_t& drawCount [[buffer(3)]], \n\ |
| constant uint32_t& viewCount [[buffer(4)]], \n\ |
| uint idx [[thread_position_in_grid]]) { \n\ |
| if (idx >= drawCount) { return; } \n\ |
| const device auto& src = *reinterpret_cast<const device MTLDrawPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\ |
| destBuff[idx] = src; \n\ |
| destBuff[idx].instanceCount *= viewCount; \n\ |
| } \n\ |
| \n\ |
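// Indexed variant of the multiview indirect-argument conversion above.                                         \n\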
| kernel void cmdDrawIndexedIndirectMultiviewConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\ |
| device MTLDrawIndexedPrimitivesIndirectArguments* destBuff [[buffer(1)]],\n\ |
| constant uint32_t& srcStride [[buffer(2)]], \n\ |
| constant uint32_t& drawCount [[buffer(3)]], \n\ |
| constant uint32_t& viewCount [[buffer(4)]], \n\ |
| uint idx [[thread_position_in_grid]]) { \n\ |
| if (idx >= drawCount) { return; } \n\ |
| const device auto& src = *reinterpret_cast<const device MTLDrawIndexedPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\ |
| destBuff[idx] = src; \n\ |
| destBuff[idx].instanceCount *= viewCount; \n\ |
| } \n\ |
| \n\ |
| #if __METAL_VERSION__ >= 120 \n\ |
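// Convert each indirect draw into the arguments needed by the tessellation pipeline: compute                   \n\
// dispatch arguments for the vertex and tessellation-control stages, patch-draw arguments for                  \n\
// the tessellation-evaluation render stage, and, on MSL 2.1+, stage-in region arguments.                       \n\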
| kernel void cmdDrawIndirectTessConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\ |
| device char* destBuff [[buffer(1)]], \n\ |
| device char* paramsBuff [[buffer(2)]], \n\ |
| constant uint32_t& srcStride [[buffer(3)]], \n\ |
| constant uint32_t& inControlPointCount [[buffer(4)]], \n\ |
| constant uint32_t& outControlPointCount [[buffer(5)]], \n\ |
| constant uint32_t& drawCount [[buffer(6)]], \n\ |
| constant uint32_t& vtxThreadExecWidth [[buffer(7)]], \n\ |
| constant uint32_t& tcWorkgroupSize [[buffer(8)]], \n\ |
| uint idx [[thread_position_in_grid]]) { \n\ |
| if (idx >= drawCount) { return; } \n\ |
| const device auto& src = *reinterpret_cast<const device MTLDrawPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\ |
| device char* dest; \n\ |
| device auto* params = reinterpret_cast<device uint32_t*>(paramsBuff + idx * 256); \n\ |
| #if __METAL_VERSION__ >= 210 \n\ |
| dest = destBuff + idx * (sizeof(MTLStageInRegionIndirectArguments) + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\ |
| device auto& destSI = *(device MTLStageInRegionIndirectArguments*)dest; \n\ |
| dest += sizeof(MTLStageInRegionIndirectArguments); \n\ |
| #else \n\ |
| dest = destBuff + idx * (sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\ |
| #endif \n\ |
| device auto& destVtx = *(device MTLDispatchThreadgroupsIndirectArguments*)dest; \n\ |
| device auto& destTC = *(device MTLDispatchThreadgroupsIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments));\n\ |
| device auto& destTE = *(device MTLDrawPatchIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2);\n\ |
| uint32_t patchCount = (src.vertexCount * src.instanceCount + inControlPointCount - 1) / inControlPointCount;\n\ |
| params[0] = inControlPointCount; \n\ |
| params[1] = patchCount; \n\ |
| destVtx.threadgroupsPerGrid[0] = (src.vertexCount + vtxThreadExecWidth - 1) / vtxThreadExecWidth; \n\ |
| destVtx.threadgroupsPerGrid[1] = src.instanceCount; \n\ |
| destVtx.threadgroupsPerGrid[2] = 1; \n\ |
| destTC.threadgroupsPerGrid[0] = (patchCount * outControlPointCount + tcWorkgroupSize - 1) / tcWorkgroupSize;\n\ |
| destTC.threadgroupsPerGrid[1] = destTC.threadgroupsPerGrid[2] = 1; \n\ |
| destTE.patchCount = patchCount; \n\ |
| destTE.instanceCount = 1; \n\ |
| destTE.patchStart = destTE.baseInstance = 0; \n\ |
| #if __METAL_VERSION__ >= 210 \n\ |
| destSI.stageInOrigin[0] = src.vertexStart; \n\ |
| destSI.stageInOrigin[1] = src.baseInstance; \n\ |
| destSI.stageInOrigin[2] = 0; \n\ |
| destSI.stageInSize[0] = src.vertexCount; \n\ |
| destSI.stageInSize[1] = src.instanceCount; \n\ |
| destSI.stageInSize[2] = 1; \n\ |
| #endif \n\ |
| } \n\ |
| \n\ |
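// Indexed variant of the tessellation indirect-argument conversion above.                                      \n\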
| kernel void cmdDrawIndexedIndirectConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\ |
| device char* destBuff [[buffer(1)]], \n\ |
| device char* paramsBuff [[buffer(2)]], \n\ |
| constant uint32_t& srcStride [[buffer(3)]], \n\ |
| constant uint32_t& inControlPointCount [[buffer(4)]], \n\ |
| constant uint32_t& outControlPointCount [[buffer(5)]], \n\ |
| constant uint32_t& drawCount [[buffer(6)]], \n\ |
| constant uint32_t& vtxThreadExecWidth [[buffer(7)]], \n\ |
| constant uint32_t& tcWorkgroupSize [[buffer(8)]], \n\ |
| uint idx [[thread_position_in_grid]]) { \n\ |
| if (idx >= drawCount) { return; } \n\ |
| const device auto& src = *reinterpret_cast<const device MTLDrawIndexedPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\ |
| device char* dest; \n\ |
| device auto* params = reinterpret_cast<device uint32_t*>(paramsBuff + idx * 256); \n\ |
| #if __METAL_VERSION__ >= 210 \n\ |
| dest = destBuff + idx * (sizeof(MTLStageInRegionIndirectArguments) + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\ |
| device auto& destSI = *(device MTLStageInRegionIndirectArguments*)dest; \n\ |
| dest += sizeof(MTLStageInRegionIndirectArguments); \n\ |
| #else \n\ |
| dest = destBuff + idx * (sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\ |
| #endif \n\ |
| device auto& destVtx = *(device MTLDispatchThreadgroupsIndirectArguments*)dest; \n\ |
| device auto& destTC = *(device MTLDispatchThreadgroupsIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments));\n\ |
| device auto& destTE = *(device MTLDrawPatchIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2);\n\ |
| uint32_t patchCount = (src.indexCount * src.instanceCount + inControlPointCount - 1) / inControlPointCount;\n\ |
| params[0] = inControlPointCount; \n\ |
| params[1] = patchCount; \n\ |
| destVtx.threadgroupsPerGrid[0] = (src.indexCount + vtxThreadExecWidth - 1) / vtxThreadExecWidth; \n\ |
| destVtx.threadgroupsPerGrid[1] = src.instanceCount; \n\ |
| destVtx.threadgroupsPerGrid[2] = 1; \n\ |
| destTC.threadgroupsPerGrid[0] = (patchCount * outControlPointCount + tcWorkgroupSize - 1) / tcWorkgroupSize;\n\ |
| destTC.threadgroupsPerGrid[1] = destTC.threadgroupsPerGrid[2] = 1; \n\ |
| destTE.patchCount = patchCount; \n\ |
| destTE.instanceCount = 1; \n\ |
| destTE.patchStart = destTE.baseInstance = 0; \n\ |
| #if __METAL_VERSION__ >= 210 \n\ |
| destSI.stageInOrigin[0] = src.baseVertex; \n\ |
| destSI.stageInOrigin[1] = src.baseInstance; \n\ |
| destSI.stageInOrigin[2] = 0; \n\ |
| destSI.stageInSize[0] = src.indexCount; \n\ |
| destSI.stageInSize[1] = src.instanceCount; \n\ |
| destSI.stageInSize[2] = 1; \n\ |
| #endif \n\ |
| } \n\ |
| \n\ |
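// Copy the index range referenced by an indirect indexed draw to the start of a destination                    \n\
// buffer, one index per thread, in 16-bit and 32-bit variants, so the indexes can be consumed                  \n\
// from a known offset.                                                                                         \n\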
| kernel void cmdDrawIndexedCopyIndex16Buffer(const device uint16_t* srcBuff [[buffer(0)]], \n\ |
| device uint16_t* destBuff [[buffer(1)]], \n\ |
| const device MTLDrawIndexedPrimitivesIndirectArguments& params [[buffer(2)]],\n\ |
| uint i [[thread_position_in_grid]]) { \n\ |
| destBuff[i] = srcBuff[params.indexStart + i]; \n\ |
| } \n\ |
| \n\ |
| kernel void cmdDrawIndexedCopyIndex32Buffer(const device uint32_t* srcBuff [[buffer(0)]], \n\ |
| device uint32_t* destBuff [[buffer(1)]], \n\ |
| const device MTLDrawIndexedPrimitivesIndirectArguments& params [[buffer(2)]],\n\ |
| uint i [[thread_position_in_grid]]) { \n\ |
| destBuff[i] = srcBuff[params.indexStart + i]; \n\ |
| } \n\ |
| \n\ |
| #endif \n\ |
| \n\ |
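// A 64-bit visibility (occlusion) counter, split into low and high 32-bit words.                               \n\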
| typedef struct { \n\ |
| uint32_t count; \n\ |
| uint32_t countHigh; \n\ |
| } VisibilityBuffer; \n\ |
| \n\ |
| typedef enum { \n\ |
| Initial, \n\ |
| DeviceAvailable, \n\ |
| Available \n\ |
| } QueryStatus; \n\ |
| \n\ |
| typedef enum { \n\ |
| VK_QUERY_RESULT_64_BIT = 0x00000001, \n\ |
| VK_QUERY_RESULT_WAIT_BIT = 0x00000002, \n\ |
| VK_QUERY_RESULT_WITH_AVAILABILITY_BIT = 0x00000004, \n\ |
| VK_QUERY_RESULT_PARTIAL_BIT = 0x00000008, \n\ |
| } VkQueryResultFlagBits; \n\ |
| \n\ |
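// Copy query results into a buffer, one query per thread, honoring the 64-bit, partial,                        \n\
// and with-availability behaviors of VkQueryResultFlagBits.                                                    \n\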
| kernel void cmdCopyQueryPoolResultsToBuffer(const device VisibilityBuffer* src [[buffer(0)]], \n\ |
| device uint8_t* dest [[buffer(1)]], \n\ |
| constant uint& stride [[buffer(2)]], \n\ |
| constant uint& numQueries [[buffer(3)]], \n\ |
| constant uint& flags [[buffer(4)]], \n\ |
| constant QueryStatus* availability [[buffer(5)]], \n\ |
| uint query [[thread_position_in_grid]]) { \n\ |
| if (query >= numQueries) { return; } \n\ |
| device uint32_t* destCount = (device uint32_t*)(dest + stride * query); \n\ |
| if (availability[query] != Initial || flags & VK_QUERY_RESULT_PARTIAL_BIT) { \n\ |
| destCount[0] = src[query].count; \n\ |
| if (flags & VK_QUERY_RESULT_64_BIT) { destCount[1] = src[query].countHigh; } \n\ |
| } \n\ |
| if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { \n\ |
| if (flags & VK_QUERY_RESULT_64_BIT) { \n\ |
| destCount[2] = availability[query] != Initial ? 1 : 0; \n\ |
| destCount[3] = 0; \n\ |
| } else { \n\ |
| destCount[1] = availability[query] != Initial ? 1 : 0; \n\ |
| } \n\ |
| } \n\ |
| } \n\ |
| \n\ |
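// Add one 64-bit visibility count into another, carrying into the high word when the                           \n\
// low-word addition overflows.                                                                                  \n\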
| kernel void accumulateOcclusionQueryResults(device VisibilityBuffer& dest [[buffer(0)]], \n\ |
| const device VisibilityBuffer& src [[buffer(1)]]) { \n\ |
| uint32_t oldDestCount = dest.count; \n\ |
| dest.count += src.count; \n\ |
| dest.countHigh += src.countHigh; \n\ |
| if (dest.count < max(oldDestCount, src.count)) { dest.countHigh++; } \n\ |
| } \n\ |
| \n\ |
| "; |
| |