blob: 3730d71939248aaf2b7327fbe4479c8b752dd065 [file] [log] [blame]
/*
* MVKCommandPipelineStateFactoryShaderSource.h
*
* Copyright (c) 2015-2022 The Brenwill Workshop Ltd. (http://www.brenwill.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "MVKDevice.h"
#import <Foundation/Foundation.h>
/** This file contains static MSL source code for the MoltenVK command shaders. */
/**
 * Runtime-compiled MSL source for MoltenVK's internal command shaders:
 * blit vertex functions, buffer copy/fill kernels, color-image clear and
 * multisample-resolve kernels, DXTn (BC1-BC3) decompression kernels,
 * indirect-draw argument-buffer conversion for multiview and tessellation,
 * and occlusion-query result copy/accumulation kernels.
 *
 * The MSL is carried as a single NSString built from adjacent C string
 * literals; every MSL line ends in an embedded "\n" followed by a C line
 * continuation backslash. The Vk* typedefs and enums re-declared near the
 * top are minimal MSL-side mirrors of the corresponding Vulkan definitions
 * (note the __attribute__((packed)) on VkExtent3D/VkOffset3D), apparently
 * so that structs passed in Metal buffers match the host-side layout —
 * confirm against the host structs before changing them.
 */
static NSString* _MVKStaticCmdShaderSource = @" \n\
#include <metal_stdlib> \n\
using namespace metal; \n\
\n\
typedef struct { \n\
float2 a_position [[attribute(0)]]; \n\
float3 a_texCoord [[attribute(1)]]; \n\
} AttributesPosTex; \n\
\n\
typedef struct { \n\
float4 v_position [[position]]; \n\
float3 v_texCoord; \n\
} VaryingsPosTex; \n\
\n\
typedef struct { \n\
float4 v_position [[position]]; \n\
float3 v_texCoord; \n\
uint v_layer [[render_target_array_index]]; \n\
} VaryingsPosTexLayer; \n\
\n\
typedef size_t VkDeviceSize; \n\
\n\
typedef enum : uint32_t { \n\
VK_FORMAT_BC1_RGB_UNORM_BLOCK = 131, \n\
VK_FORMAT_BC1_RGB_SRGB_BLOCK = 132, \n\
VK_FORMAT_BC1_RGBA_UNORM_BLOCK = 133, \n\
VK_FORMAT_BC1_RGBA_SRGB_BLOCK = 134, \n\
VK_FORMAT_BC2_UNORM_BLOCK = 135, \n\
VK_FORMAT_BC2_SRGB_BLOCK = 136, \n\
VK_FORMAT_BC3_UNORM_BLOCK = 137, \n\
VK_FORMAT_BC3_SRGB_BLOCK = 138, \n\
} VkFormat; \n\
\n\
typedef struct { \n\
uint32_t width; \n\
uint32_t height; \n\
} VkExtent2D; \n\
\n\
typedef struct { \n\
uint32_t width; \n\
uint32_t height; \n\
uint32_t depth; \n\
} __attribute__((packed)) VkExtent3D; \n\
\n\
typedef struct { \n\
int32_t x; \n\
int32_t y; \n\
int32_t z; \n\
} __attribute__((packed)) VkOffset3D; \n\
\n"
// Splice the DXTn codec MSL from MVKDXTnCodec.def into the shader source:
// MVK_DECOMPRESS_CODE is temporarily redefined to #__VA_ARGS__, which
// stringifies the code wrapped by that macro inside the .def file; the
// resulting string literal merges with the segments above and below via
// adjacent-literal concatenation. This presumably provides isBC1Format()
// and decompressDXTnBlock() used by the kernels below — confirm in the
// .def file before relying on other symbols from it.
#define MVK_DECOMPRESS_CODE(...) #__VA_ARGS__
#include "MVKDXTnCodec.def"
#undef MVK_DECOMPRESS_CODE
// Remainder of the MSL source, concatenated after the spliced codec code.
"\n\
\n\
vertex VaryingsPosTex vtxCmdBlitImage(AttributesPosTex attributes [[stage_in]]) { \n\
VaryingsPosTex varyings; \n\
varyings.v_position = float4(attributes.a_position, 0.0, 1.0); \n\
varyings.v_texCoord = attributes.a_texCoord; \n\
return varyings; \n\
} \n\
\n\
vertex VaryingsPosTexLayer vtxCmdBlitImageLayered(AttributesPosTex attributes [[stage_in]], \n\
uint instanceID [[instance_id]], \n\
constant float &zIncr [[buffer(0)]]) { \n\
VaryingsPosTexLayer varyings; \n\
varyings.v_position = float4(attributes.a_position, 0.0, 1.0); \n\
varyings.v_texCoord = float3(attributes.a_texCoord.xy, attributes.a_texCoord.z + (instanceID + 0.5) * zIncr);\n\
varyings.v_layer = instanceID; \n\
return varyings; \n\
} \n\
\n\
typedef struct { \n\
uint32_t srcOffset; \n\
uint32_t dstOffset; \n\
uint32_t size; \n\
} CopyInfo; \n\
\n\
kernel void cmdCopyBufferBytes(device uint8_t* src [[ buffer(0) ]], \n\
device uint8_t* dst [[ buffer(1) ]], \n\
constant CopyInfo& info [[ buffer(2) ]]) { \n\
for (size_t i = 0; i < info.size; i++) { \n\
dst[i + info.dstOffset] = src[i + info.srcOffset]; \n\
} \n\
} \n\
\n\
kernel void cmdFillBuffer(device uint32_t* dst [[ buffer(0) ]], \n\
constant uint32_t& fillValue [[ buffer(1) ]], \n\
uint pos [[thread_position_in_grid]]) { \n\
dst[pos] = fillValue; \n\
} \n\
\n\
kernel void cmdClearColorImage2DFloat(texture2d<float, access::write> dst [[ texture(0) ]], \n\
constant float4& clearValue [[ buffer(0) ]], \n\
uint2 pos [[thread_position_in_grid]]) { \n\
dst.write(clearValue, pos); \n\
} \n\
\n\
kernel void cmdClearColorImage2DUInt(texture2d<uint, access::write> dst [[ texture(0) ]], \n\
constant uint4& clearValue [[ buffer(0) ]], \n\
uint2 pos [[thread_position_in_grid]]) { \n\
dst.write(clearValue, pos); \n\
} \n\
\n\
kernel void cmdClearColorImage2DInt(texture2d<int, access::write> dst [[ texture(0) ]], \n\
constant int4& clearValue [[ buffer(0) ]], \n\
uint2 pos [[thread_position_in_grid]]) { \n\
dst.write(clearValue, pos); \n\
} \n\
\n\
kernel void cmdResolveColorImage2DFloat(texture2d<float, access::write> dst [[ texture(0) ]], \n\
texture2d_ms<float, access::read> src [[ texture(1) ]], \n\
uint2 pos [[thread_position_in_grid]]) { \n\
dst.write(src.read(pos, 0), pos); \n\
} \n\
\n\
kernel void cmdResolveColorImage2DUInt(texture2d<uint, access::write> dst [[ texture(0) ]], \n\
texture2d_ms<uint, access::read> src [[ texture(1) ]], \n\
uint2 pos [[thread_position_in_grid]]) { \n\
dst.write(src.read(pos, 0), pos); \n\
} \n\
\n\
kernel void cmdResolveColorImage2DInt(texture2d<int, access::write> dst [[ texture(0) ]], \n\
texture2d_ms<int, access::read> src [[ texture(1) ]], \n\
uint2 pos [[thread_position_in_grid]]) { \n\
dst.write(src.read(pos, 0), pos); \n\
} \n\
\n\
typedef struct { \n\
uint32_t srcRowStride; \n\
uint32_t srcRowStrideHigh; \n\
uint32_t srcDepthStride; \n\
uint32_t srcDepthStrideHigh; \n\
uint32_t destRowStride; \n\
uint32_t destRowStrideHigh; \n\
uint32_t destDepthStride; \n\
uint32_t destDepthStrideHigh; \n\
VkFormat format; \n\
VkOffset3D offset; \n\
VkExtent3D extent; \n\
} CmdCopyBufferToImageInfo; \n\
\n\
kernel void cmdCopyBufferToImage3DDecompressDXTn(const device uint8_t* src [[buffer(0)]], \n\
texture3d<float, access::write> dest [[texture(0)]], \n\
constant CmdCopyBufferToImageInfo& info [[buffer(2)]], \n\
uint3 pos [[thread_position_in_grid]]) { \n\
uint x = pos.x * 4, y = pos.y * 4, z = pos.z; \n\
VkDeviceSize blockByteCount = isBC1Format(info.format) ? 8 : 16; \n\
\n\
if (x >= info.extent.width || y >= info.extent.height || z >= info.extent.depth) { return; } \n\
\n\
src += z * info.srcDepthStride + y * info.srcRowStride / 4 + x * blockByteCount / 4; \n\
VkExtent2D blockExtent; \n\
blockExtent.width = min(info.extent.width - x, 4u); \n\
blockExtent.height = min(info.extent.height - y, 4u); \n\
uint pixels[16] = {0}; \n\
decompressDXTnBlock(src, pixels, blockExtent, 4 * sizeof(uint), info.format); \n\
for (uint j = 0; j < blockExtent.height; ++j) { \n\
for (uint i = 0; i < blockExtent.width; ++i) { \n\
// The pixel components are in BGRA order, but texture::write wants them \n\
// in RGBA order. We can fix that (ironically) with a BGRA swizzle. \n\
dest.write(unpack_unorm4x8_to_float(pixels[j * 4 + i]).bgra, \n\
uint3(info.offset.x + x + i, info.offset.y + y + j, info.offset.z + z)); \n\
} \n\
} \n\
} \n\
\n\
kernel void cmdCopyBufferToImage3DDecompressTempBufferDXTn(const device uint8_t* src [[buffer(0)]], \n\
device uint8_t* dest [[buffer(1)]], \n\
constant CmdCopyBufferToImageInfo& info [[buffer(2)]],\n\
uint3 pos [[thread_position_in_grid]]) { \n\
uint x = pos.x * 4, y = pos.y * 4, z = pos.z; \n\
VkDeviceSize blockByteCount = isBC1Format(info.format) ? 8 : 16; \n\
\n\
if (x >= info.extent.width || y >= info.extent.height || z >= info.extent.depth) { return; } \n\
\n\
src += z * info.srcDepthStride + y * info.srcRowStride / 4 + x * blockByteCount / 4; \n\
dest += z * info.destDepthStride + y * info.destRowStride + x * sizeof(uint); \n\
VkExtent2D blockExtent; \n\
blockExtent.width = min(info.extent.width - x, 4u); \n\
blockExtent.height = min(info.extent.height - y, 4u); \n\
uint pixels[16] = {0}; \n\
decompressDXTnBlock(src, pixels, blockExtent, 4 * sizeof(uint), info.format); \n\
device uint* destPixel = (device uint*)dest; \n\
for (uint j = 0; j < blockExtent.height; ++j) { \n\
for (uint i = 0; i < blockExtent.width; ++i) { \n\
destPixel[j * info.destRowStride / sizeof(uint) + i] = pixels[j * 4 + i]; \n\
} \n\
} \n\
} \n\
\n\
#if __METAL_VERSION__ >= 210 \n\
// This structure is missing from the MSL headers. :/ \n\
struct MTLStageInRegionIndirectArguments { \n\
uint32_t stageInOrigin[3]; \n\
uint32_t stageInSize[3]; \n\
}; \n\
#endif \n\
\n\
kernel void cmdDrawIndirectMultiviewConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\
device MTLDrawPrimitivesIndirectArguments* destBuff [[buffer(1)]],\n\
constant uint32_t& srcStride [[buffer(2)]], \n\
constant uint32_t& drawCount [[buffer(3)]], \n\
constant uint32_t& viewCount [[buffer(4)]], \n\
uint idx [[thread_position_in_grid]]) { \n\
if (idx >= drawCount) { return; } \n\
const device auto& src = *reinterpret_cast<const device MTLDrawPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\
destBuff[idx] = src; \n\
destBuff[idx].instanceCount *= viewCount; \n\
} \n\
\n\
kernel void cmdDrawIndexedIndirectMultiviewConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\
device MTLDrawIndexedPrimitivesIndirectArguments* destBuff [[buffer(1)]],\n\
constant uint32_t& srcStride [[buffer(2)]], \n\
constant uint32_t& drawCount [[buffer(3)]], \n\
constant uint32_t& viewCount [[buffer(4)]], \n\
uint idx [[thread_position_in_grid]]) { \n\
if (idx >= drawCount) { return; } \n\
const device auto& src = *reinterpret_cast<const device MTLDrawIndexedPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\
destBuff[idx] = src; \n\
destBuff[idx].instanceCount *= viewCount; \n\
} \n\
\n\
#if __METAL_VERSION__ >= 120 \n\
kernel void cmdDrawIndirectTessConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\
device char* destBuff [[buffer(1)]], \n\
device char* paramsBuff [[buffer(2)]], \n\
constant uint32_t& srcStride [[buffer(3)]], \n\
constant uint32_t& inControlPointCount [[buffer(4)]], \n\
constant uint32_t& outControlPointCount [[buffer(5)]], \n\
constant uint32_t& drawCount [[buffer(6)]], \n\
constant uint32_t& vtxThreadExecWidth [[buffer(7)]], \n\
constant uint32_t& tcWorkgroupSize [[buffer(8)]], \n\
uint idx [[thread_position_in_grid]]) { \n\
if (idx >= drawCount) { return; } \n\
const device auto& src = *reinterpret_cast<const device MTLDrawPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\
device char* dest; \n\
device auto* params = reinterpret_cast<device uint32_t*>(paramsBuff + idx * 256); \n\
#if __METAL_VERSION__ >= 210 \n\
dest = destBuff + idx * (sizeof(MTLStageInRegionIndirectArguments) + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\
device auto& destSI = *(device MTLStageInRegionIndirectArguments*)dest; \n\
dest += sizeof(MTLStageInRegionIndirectArguments); \n\
#else \n\
dest = destBuff + idx * (sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\
#endif \n\
device auto& destVtx = *(device MTLDispatchThreadgroupsIndirectArguments*)dest; \n\
device auto& destTC = *(device MTLDispatchThreadgroupsIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments));\n\
device auto& destTE = *(device MTLDrawPatchIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2);\n\
uint32_t patchCount = (src.vertexCount * src.instanceCount + inControlPointCount - 1) / inControlPointCount;\n\
params[0] = inControlPointCount; \n\
params[1] = patchCount; \n\
destVtx.threadgroupsPerGrid[0] = (src.vertexCount + vtxThreadExecWidth - 1) / vtxThreadExecWidth; \n\
destVtx.threadgroupsPerGrid[1] = src.instanceCount; \n\
destVtx.threadgroupsPerGrid[2] = 1; \n\
destTC.threadgroupsPerGrid[0] = (patchCount * outControlPointCount + tcWorkgroupSize - 1) / tcWorkgroupSize;\n\
destTC.threadgroupsPerGrid[1] = destTC.threadgroupsPerGrid[2] = 1; \n\
destTE.patchCount = patchCount; \n\
destTE.instanceCount = 1; \n\
destTE.patchStart = destTE.baseInstance = 0; \n\
#if __METAL_VERSION__ >= 210 \n\
destSI.stageInOrigin[0] = src.vertexStart; \n\
destSI.stageInOrigin[1] = src.baseInstance; \n\
destSI.stageInOrigin[2] = 0; \n\
destSI.stageInSize[0] = src.vertexCount; \n\
destSI.stageInSize[1] = src.instanceCount; \n\
destSI.stageInSize[2] = 1; \n\
#endif \n\
} \n\
\n\
kernel void cmdDrawIndexedIndirectConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\
device char* destBuff [[buffer(1)]], \n\
device char* paramsBuff [[buffer(2)]], \n\
constant uint32_t& srcStride [[buffer(3)]], \n\
constant uint32_t& inControlPointCount [[buffer(4)]], \n\
constant uint32_t& outControlPointCount [[buffer(5)]], \n\
constant uint32_t& drawCount [[buffer(6)]], \n\
constant uint32_t& vtxThreadExecWidth [[buffer(7)]], \n\
constant uint32_t& tcWorkgroupSize [[buffer(8)]], \n\
uint idx [[thread_position_in_grid]]) { \n\
if (idx >= drawCount) { return; } \n\
const device auto& src = *reinterpret_cast<const device MTLDrawIndexedPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\
device char* dest; \n\
device auto* params = reinterpret_cast<device uint32_t*>(paramsBuff + idx * 256); \n\
#if __METAL_VERSION__ >= 210 \n\
dest = destBuff + idx * (sizeof(MTLStageInRegionIndirectArguments) + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\
device auto& destSI = *(device MTLStageInRegionIndirectArguments*)dest; \n\
dest += sizeof(MTLStageInRegionIndirectArguments); \n\
#else \n\
dest = destBuff + idx * (sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\
#endif \n\
device auto& destVtx = *(device MTLDispatchThreadgroupsIndirectArguments*)dest; \n\
device auto& destTC = *(device MTLDispatchThreadgroupsIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments));\n\
device auto& destTE = *(device MTLDrawPatchIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2);\n\
uint32_t patchCount = (src.indexCount * src.instanceCount + inControlPointCount - 1) / inControlPointCount;\n\
params[0] = inControlPointCount; \n\
params[1] = patchCount; \n\
destVtx.threadgroupsPerGrid[0] = (src.indexCount + vtxThreadExecWidth - 1) / vtxThreadExecWidth; \n\
destVtx.threadgroupsPerGrid[1] = src.instanceCount; \n\
destVtx.threadgroupsPerGrid[2] = 1; \n\
destTC.threadgroupsPerGrid[0] = (patchCount * outControlPointCount + tcWorkgroupSize - 1) / tcWorkgroupSize;\n\
destTC.threadgroupsPerGrid[1] = destTC.threadgroupsPerGrid[2] = 1; \n\
destTE.patchCount = patchCount; \n\
destTE.instanceCount = 1; \n\
destTE.patchStart = destTE.baseInstance = 0; \n\
#if __METAL_VERSION__ >= 210 \n\
destSI.stageInOrigin[0] = src.baseVertex; \n\
destSI.stageInOrigin[1] = src.baseInstance; \n\
destSI.stageInOrigin[2] = 0; \n\
destSI.stageInSize[0] = src.indexCount; \n\
destSI.stageInSize[1] = src.instanceCount; \n\
destSI.stageInSize[2] = 1; \n\
#endif \n\
} \n\
\n\
kernel void cmdDrawIndexedCopyIndex16Buffer(const device uint16_t* srcBuff [[buffer(0)]], \n\
device uint16_t* destBuff [[buffer(1)]], \n\
const device MTLDrawIndexedPrimitivesIndirectArguments& params [[buffer(2)]],\n\
uint i [[thread_position_in_grid]]) { \n\
destBuff[i] = srcBuff[params.indexStart + i]; \n\
} \n\
\n\
kernel void cmdDrawIndexedCopyIndex32Buffer(const device uint32_t* srcBuff [[buffer(0)]], \n\
device uint32_t* destBuff [[buffer(1)]], \n\
const device MTLDrawIndexedPrimitivesIndirectArguments& params [[buffer(2)]],\n\
uint i [[thread_position_in_grid]]) { \n\
destBuff[i] = srcBuff[params.indexStart + i]; \n\
} \n\
\n\
#endif \n\
\n\
typedef struct { \n\
uint32_t count; \n\
uint32_t countHigh; \n\
} VisibilityBuffer; \n\
\n\
typedef enum { \n\
Initial, \n\
DeviceAvailable, \n\
Available \n\
} QueryStatus; \n\
\n\
typedef enum { \n\
VK_QUERY_RESULT_64_BIT = 0x00000001, \n\
VK_QUERY_RESULT_WAIT_BIT = 0x00000002, \n\
VK_QUERY_RESULT_WITH_AVAILABILITY_BIT = 0x00000004, \n\
VK_QUERY_RESULT_PARTIAL_BIT = 0x00000008, \n\
} VkQueryResultFlagBits; \n\
\n\
kernel void cmdCopyQueryPoolResultsToBuffer(const device VisibilityBuffer* src [[buffer(0)]], \n\
device uint8_t* dest [[buffer(1)]], \n\
constant uint& stride [[buffer(2)]], \n\
constant uint& numQueries [[buffer(3)]], \n\
constant uint& flags [[buffer(4)]], \n\
constant QueryStatus* availability [[buffer(5)]], \n\
uint query [[thread_position_in_grid]]) { \n\
if (query >= numQueries) { return; } \n\
device uint32_t* destCount = (device uint32_t*)(dest + stride * query); \n\
if (availability[query] != Initial || flags & VK_QUERY_RESULT_PARTIAL_BIT) { \n\
destCount[0] = src[query].count; \n\
if (flags & VK_QUERY_RESULT_64_BIT) { destCount[1] = src[query].countHigh; } \n\
} \n\
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { \n\
if (flags & VK_QUERY_RESULT_64_BIT) { \n\
destCount[2] = availability[query] != Initial ? 1 : 0; \n\
destCount[3] = 0; \n\
} else { \n\
destCount[1] = availability[query] != Initial ? 1 : 0; \n\
} \n\
} \n\
} \n\
\n\
kernel void accumulateOcclusionQueryResults(device VisibilityBuffer& dest [[buffer(0)]], \n\
const device VisibilityBuffer& src [[buffer(1)]]) { \n\
uint32_t oldDestCount = dest.count; \n\
dest.count += src.count; \n\
dest.countHigh += src.countHigh; \n\
if (dest.count < max(oldDestCount, src.count)) { dest.countHigh++; } \n\
} \n\
\n\
";