blob: 92757aaca03e3145bd4b8620479c7e3c5abddcc7 [file] [log] [blame]
/*
* Copyright 2022 Google LLC
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "tests/Test.h"
#include "include/core/SkBitmap.h"
#include "include/gpu/graphite/Context.h"
#include "include/gpu/graphite/Recorder.h"
#include "include/gpu/graphite/Recording.h"
#include "src/gpu/graphite/Buffer.h"
#include "src/gpu/graphite/Caps.h"
#include "src/gpu/graphite/ComputePipelineDesc.h"
#include "src/gpu/graphite/ComputeTypes.h"
#include "src/gpu/graphite/ContextPriv.h"
#include "src/gpu/graphite/RecorderPriv.h"
#include "src/gpu/graphite/ResourceProvider.h"
#include "src/gpu/graphite/UniformManager.h"
#include "src/gpu/graphite/compute/ComputeStep.h"
#include "src/gpu/graphite/compute/DispatchGroup.h"
#include "src/gpu/graphite/task/ComputeTask.h"
#include "src/gpu/graphite/task/CopyTask.h"
#include "src/gpu/graphite/task/SynchronizeToCpuTask.h"
#include "src/gpu/graphite/task/UploadTask.h"
#include "tools/graphite/GraphiteTestContext.h"
using namespace skgpu::graphite;
using namespace skiatest::graphite;
namespace {
// Maps `buffer` for CPU access and returns a pointer `offset` bytes into the mapping.
// On backends whose buffer maps are asynchronous this spins, ticking the test context,
// until the map completes.
void* map_buffer(Context* context,
                 skiatest::graphite::GraphiteTestContext* testContext,
                 Buffer* buffer,
                 size_t offset) {
    SkASSERT(buffer);
    const bool mapsAreAsync = context->priv().caps()->bufferMapsAreAsync();
    if (mapsAreAsync) {
        buffer->asyncMap();
        // Busy-wait for the asynchronous map; ticking pumps the backend's event processing.
        while (!buffer->isMapped()) {
            testContext->tick();
        }
    }
    auto* base = static_cast<std::byte*>(buffer->map());
    SkASSERT(base);
    return base + offset;
}
// Returns a buffer that will hold `buffer`'s contents in CPU-mappable form once the
// recording is submitted: either `buffer` itself (plus a synchronization task) when the
// backend allows mapping draw buffers for read-back, or a newly created transfer buffer
// filled by a copy task otherwise.
sk_sp<Buffer> sync_buffer_to_cpu(Recorder* recorder, const Buffer* buffer) {
    const bool directlyMappable =
            recorder->priv().caps()->drawBufferCanBeMappedForReadback();
    if (directlyMappable) {
        // `buffer` can be mapped directly, however it may still require a synchronization step
        // by the underlying API (e.g. a managed buffer in Metal). SynchronizeToCpuTask
        // automatically handles this for us.
        recorder->priv().add(SynchronizeToCpuTask::Make(sk_ref_sp(buffer)));
        return sk_ref_sp(buffer);
    }

    // The backend requires a transfer buffer for CPU read-back.
    sk_sp<Buffer> xfer = recorder->priv().resourceProvider()->findOrCreateBuffer(
            buffer->size(),
            BufferType::kXferGpuToCpu,
            AccessPattern::kHostVisible,
            "ComputeTest_TransferToCpu");
    SkASSERT(xfer);
    recorder->priv().add(CopyBufferToBufferTask::Make(buffer,
                                                      /*srcOffset=*/0,
                                                      xfer,
                                                      /*dstOffset=*/0,
                                                      buffer->size()));
    return xfer;
}
// Snaps a Recording from `recorder`, inserts it into `context`, and blocks until the GPU
// work completes. Returns the Recording on success, or nullptr if the snap failed.
std::unique_ptr<Recording> submit_recording(Context* context,
                                            GraphiteTestContext* testContext,
                                            Recorder* recorder) {
    auto recording = recorder->snap();
    if (recording) {
        InsertRecordingInfo info;
        info.fRecording = recording.get();
        context->insertRecording(info);
        testContext->syncedSubmit(context);
    }
    return recording;
}
// True for the context types used by these tests (Dawn and Metal).
bool is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType) {
    if (skiatest::IsDawnContextType(ctxType)) {
        return true;
    }
    return skiatest::IsMetalContextType(ctxType);
}
} // namespace
// Registers a Graphite test restricted (via is_dawn_or_metal_context_type above) to the
// Dawn and Metal backends, with CTS enforcement disabled.
#define DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS( \
name, reporter, graphite_context, test_context) \
DEF_GRAPHITE_TEST_FOR_CONTEXTS(name, \
is_dawn_or_metal_context_type, \
reporter, \
graphite_context, \
test_context, \
CtsEnforcement::kNever)
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SingleDispatchTest,
                                              reporter,
                                              context,
                                              testContext) {
    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // A single step that multiplies every element of a float array by a constant factor. The
    // input buffer is private (created and populated by this step); the output buffer is shared
    // so the test can retrieve it from the Builder for read-back.
    class TestComputeStep : public ComputeStep {
    public:
        // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
        // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
        TestComputeStep() : ComputeStep(
                /*name=*/"TestArrayMultiply",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        // TODO(b/299979165): Declare this binding as read-only.
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"inputBlock {\n"
                                 " float factor;\n"
                                 " layout(offset=16) float4 in_data[];\n"
                                 "}",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
                                                     // Builder
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        // Byte sizes: index 0 is the input (`factor` padded out to a float4, then the data
        // array); index 1 is the output array.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            if (index == 0) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * (kProblemSize + 4);
            }
            SkASSERT(index == 1);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(float) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only initialize the input buffer.
            if (resourceIndex != 0) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);

            // SkSpan's second constructor argument is an element count, not a byte count; the
            // previous code passed a byte count, overstating the span's extent by 4x.
            constexpr size_t kFloatCount = kProblemSize + 4;
            SkASSERT(bufferSize == kFloatCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), kFloatCount);
            inData[0] = kFactor;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The output buffer should have been placed in the right output slot.
    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    if (!submit_recording(context, testContext, recorder.get())) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_DispatchGroupTest,
                                              reporter,
                                              context,
                                              testContext) {
    // TODO(b/315834710): This fails on Dawn D3D11
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor1 = 4.f;
    constexpr float kFactor2 = 3.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Define two steps that perform two multiplication passes over the same input.
    class TestComputeStep1 : public ComputeStep {
    public:
        // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
        // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
        TestComputeStep1() : ComputeStep(
                /*name=*/"TestArrayMultiplyFirstPass",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        // TODO(b/299979165): Declare this binding as read-only.
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*sksl=*/"inputBlock {\n"
                                 " float factor;\n"
                                 " layout(offset=16) float4 in_data[];\n"
                                 "}",
                    },
                    // Output buffers:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,  // GPU-only, read by second step
                        /*slot=*/0,
                        /*sksl=*/"outputBlock1 { float4 forward_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/1,
                        /*sksl=*/"outputBlock2 { float2 extra_data; }",
                    }
                }) {}
        ~TestComputeStep1() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor. Invocation 0
        // additionally records (factor, 2 * factor) in the small `extra_data` output.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint idx = sk_GlobalInvocationID.x;
                    forward_data[idx] = in_data[idx] * factor;
                    if (idx == 0) {
                        extra_data.x = factor;
                        extra_data.y = 2 * factor;
                    }
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            if (index == 0) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * (kProblemSize + 4);
            }
            if (index == 1) {
                SkASSERT(r.fFlow == DataFlow::kShared);
                SkASSERT(r.fSlot == 0);
                return sizeof(float) * kProblemSize;
            }
            SkASSERT(index == 2);
            SkASSERT(r.fSlot == 1);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return 2 * sizeof(float);
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only the private input buffer needs CPU-side initialization.
            if (resourceIndex != 0) {
                return;
            }
            // SkSpan's second constructor argument is an element count, not a byte count; the
            // previous code passed a byte count, overstating the span's extent by 4x.
            constexpr size_t kFloatCount = kProblemSize + 4;
            SkASSERT(bufferSize == kFloatCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), kFloatCount);
            inData[0] = kFactor1;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step1;

    // The second pass reads the first pass's `forward_data` (shared slot 0) and multiplies it by
    // a second factor supplied through a private buffer.
    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"TestArrayMultiplySecondPass",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,  // GPU-only
                        /*slot=*/0,  // this is the output from the first step
                        /*sksl=*/"inputBlock { float4 in_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"factorBlock { float factor; }"
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/2,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index != 0);  // index 0 is created by step 1
            if (index == 1) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * 4;  // single float padded out to a float4
            }
            SkASSERT(index == 2);
            SkASSERT(r.fSlot == 2);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(float) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only the private factor buffer needs CPU-side initialization.
            if (resourceIndex != 1) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);
            *static_cast<float*>(buffer) = kFactor2;
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step1);
    builder.appendStep(&step2);

    // Slots 0, 1, and 2 should all contain shared buffers. Slot 1 contains the extra output buffer
    // from step 1 while slot 2 contains the result of the second multiplication pass from step 2.
    // Slot 0 is not mappable.
    REPORTER_ASSERT(reporter,
                    std::holds_alternative<BufferView>(builder.outputTable().fSharedSlots[0]),
                    "shared resource at slot 0 is missing");
    BindBufferInfo outputInfo = builder.getSharedBufferResource(2);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 2");
        return;
    }

    // Extra output buffer from step 1 (corresponding to 'outputBlock2')
    BindBufferInfo extraOutputInfo = builder.getSharedBufferResource(1);
    if (!extraOutputInfo) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffers get synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
    auto extraOutputBuffer = sync_buffer_to_cpu(recorder.get(), extraOutputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    if (!submit_recording(context, testContext, recorder.get())) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer from step 2
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor1 * kFactor2;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }

    // Verify the contents of the extra output buffer from step 1
    float* extraOutData = static_cast<float*>(
            map_buffer(context, testContext, extraOutputBuffer.get(), extraOutputInfo.fOffset));
    SkASSERT(extraOutputBuffer->isMapped() && extraOutData != nullptr);
    REPORTER_ASSERT(reporter,
                    kFactor1 == extraOutData[0],
                    "expected '%f', found '%f'",
                    kFactor1,
                    extraOutData[0]);
    // The kernel wrote 2 * factor where factor == kFactor1 (the failure message previously
    // printed 2 * kFactor2 by mistake).
    REPORTER_ASSERT(reporter,
                    2 * kFactor1 == extraOutData[1],
                    "expected '%f', found '%f'",
                    2 * kFactor1,
                    extraOutData[1]);
}
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_UniformBufferTest,
                                              reporter,
                                              context,
                                              testContext) {
    // TODO(b/315834710): This fails on Dawn D3D11
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Same multiplication kernel as Compute_SingleDispatchTest, but the factor is supplied
    // through a uniform buffer (populated via prepareUniformBuffer) rather than packed into
    // the storage input.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestArrayMultiply",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Uniform buffer:
                    {
                        /*type=*/ResourceType::kUniformBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"uniformBlock { float factor; }"
                    },
                    // Input buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"inputBlock { float4 in_data[]; }",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
                                                     // Builder
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            if (index == 0) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float);
            }
            if (index == 1) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * kProblemSize;
            }
            SkASSERT(index == 2);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(float) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only initialize the input storage buffer.
            if (resourceIndex != 1) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);

            // SkSpan's second constructor argument is an element count, not a byte count; the
            // previous code passed a byte count, overstating the span's extent by 4x.
            constexpr size_t kFloatCount = kProblemSize;
            SkASSERT(bufferSize == kFloatCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), kFloatCount);
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i] = i + 1;
            }
        }

        void prepareUniformBuffer(int resourceIndex,
                                  const ResourceDesc&,
                                  UniformManager* mgr) const override {
            SkASSERT(resourceIndex == 0);
            // Validate the uniform layout in debug builds before writing.
            SkDEBUGCODE(
                const Uniform uniforms[] = {{"factor", SkSLType::kFloat}};
                mgr->setExpectedUniforms(uniforms);
            )
            mgr->write(kFactor);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The output buffer should have been placed in the right output slot.
    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    if (!submit_recording(context, testContext, recorder.get())) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ExternallyAssignedBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // This step declares only its private input; the shared output buffer at slot 0 is
    // allocated by the test and handed to the Builder directly (see assignSharedBuffer below),
    // so the step defines no size/dispatch callbacks for it.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"ExternallyAssignedBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"inputBlock {\n"
                                 " float factor;\n"
                                 " layout(offset = 16) float4 in_data[];\n"
                                 "}\n",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
                                                     // Builder
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int resourceIndex, const ResourceDesc& r) const override {
            // Only called for the private input; the shared output is externally assigned.
            SkASSERT(resourceIndex == 0);
            SkASSERT(r.fFlow == DataFlow::kPrivate);
            return sizeof(float) * (kProblemSize + 4);
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            SkASSERT(r.fFlow == DataFlow::kPrivate);

            // SkSpan's second constructor argument is an element count, not a byte count; the
            // previous code passed a byte count, overstating the span's extent by 4x.
            constexpr size_t kFloatCount = kProblemSize + 4;
            SkASSERT(bufferSize == kFloatCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), kFloatCount);
            inData[0] = kFactor;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }
    } step;

    // We allocate a buffer and directly assign it to the DispatchGroup::Builder. The ComputeStep
    // will not participate in the creation of this buffer.
    auto [_, outputInfo] =
            recorder->priv().drawBufferManager()->getStoragePointer(sizeof(float) * kProblemSize);
    REPORTER_ASSERT(reporter, outputInfo, "Failed to allocate output buffer");

    DispatchGroup::Builder builder(recorder.get());
    builder.assignSharedBuffer({outputInfo, sizeof(float) * kProblemSize}, 0);

    // Initialize the step with a pre-determined global size
    if (!builder.appendStep(&step, {WorkgroupSize(1, 1, 1)})) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    if (!submit_recording(context, testContext, recorder.get())) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}
// Tests the storage texture binding for a compute dispatch that writes the same color to every
// pixel of a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTexture,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the
    // same size.
    constexpr uint32_t kDim = 16;

    // A step with a single write-only storage texture binding; every invocation of the lone
    // 16x16 workgroup writes opaque green to the texel addressed by its local invocation ID.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestStorageTexture",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
                }
            )";
        }

        // One texel per invocation in the single workgroup.
        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The storage texture is in the shared flow, so the Builder owns its proxy at slot 0.
    sk_sp<TextureProxy> texture = builder.getSharedTextureResource(0);
    if (!texture) {
        ERRORF(reporter, "Shared resource at slot 0 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    if (!submit_recording(context, testContext, recorder.get())) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Read back the storage texture and verify that every pixel is opaque green.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, texture.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
// Tests the readonly texture binding for a compute dispatch that random-access reads from a
// CPU-populated texture and copies it to a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureReadAndWrite,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the
    // same size.
    constexpr uint32_t kDim = 16;

    // A step that copies a read-only sampled texture (slot 0, provided externally) into a
    // write-only storage texture (slot 1, created by the step), texel for texel.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestStorageTextureReadAndWrite",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    half4 color = textureRead(src, sk_LocalInvocationID.xy);
                    textureWrite(dst, sk_LocalInvocationID.xy, color);
                }
            )";
        }

        // Only the destination texture (index 1) is created by the step; the source is
        // assigned externally via assignSharedTexture().
        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    // Create and populate an input texture with a per-pixel gradient pattern.
    SkBitmap srcBitmap;
    SkImageInfo srcInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    srcBitmap.allocPixels(srcInfo);
    SkPixmap srcPixels;
    bool srcPeekPixelsSuccess = srcBitmap.peekPixels(&srcPixels);
    REPORTER_ASSERT(reporter, srcPeekPixelsSuccess);
    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            *srcPixels.writable_addr32(x, y) =
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0);
        }
    }

    auto texInfo = context->priv().caps()->getDefaultSampledTextureInfo(kRGBA_8888_SkColorType,
                                                                        skgpu::Mipmapped::kNo,
                                                                        skgpu::Protected::kNo,
                                                                        skgpu::Renderable::kNo);
    sk_sp<TextureProxy> srcProxy = TextureProxy::Make(context->priv().caps(),
                                                      recorder->priv().resourceProvider(),
                                                      {kDim, kDim},
                                                      texInfo,
                                                      "ComputeTestSrcProxy",
                                                      skgpu::Budgeted::kNo);
    // Upload the CPU pixels into the source proxy before the compute task runs.
    MipLevel mipLevel;
    mipLevel.fPixels = srcPixels.addr();
    mipLevel.fRowBytes = srcPixels.rowBytes();
    UploadInstance upload = UploadInstance::Make(recorder.get(),
                                                 srcProxy,
                                                 srcPixels.info().colorInfo(),
                                                 srcPixels.info().colorInfo(),
                                                 {mipLevel},
                                                 SkIRect::MakeWH(kDim, kDim),
                                                 std::make_unique<ImageUploadContext>());
    if (!upload.isValid()) {
        ERRORF(reporter, "Could not create UploadInstance");
        return;
    }
    recorder->priv().add(UploadTask::Make(std::move(upload)));

    DispatchGroup::Builder builder(recorder.get());

    // Assign the input texture to slot 0. This corresponds to the ComputeStep's "src" texture
    // binding.
    builder.assignSharedTexture(std::move(srcProxy), 0);

    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    if (!submit_recording(context, testContext, recorder.get())) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Read back the destination texture and verify it matches the uploaded source pattern.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected = SkColor4f::FromBytes_RGBA(
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ReadOnlyStorageBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the
    // same size.
    constexpr uint32_t kDim = 16;

    // A step that unpacks per-pixel RGBA values from a read-only storage buffer of packed
    // 32-bit words and writes them into a storage texture.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestReadOnlyStorageBuffer",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"src { uint in_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint ix = sk_LocalInvocationID.y * 16 + sk_LocalInvocationID.x;
                    uint value = in_data[ix];
                    half4 splat = half4(
                        half(value & 0xFF),
                        half((value >> 8) & 0xFF),
                        half((value >> 16) & 0xFF),
                        half((value >> 24) & 0xFF)
                    );
                    textureWrite(dst, sk_LocalInvocationID.xy, splat / 255.0);
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return kDim * kDim * sizeof(uint32_t);
        }

        // Fill the input buffer with one packed RGBA word per pixel: R and G ramp with x and y,
        // B is 0, A is 255.
        void prepareStorageBuffer(int index,
                                  const ResourceDesc&,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(index == 0);
            SkASSERT(bufferSize == kDim * kDim * sizeof(uint32_t));

            uint32_t* inputs = reinterpret_cast<uint32_t*>(buffer);
            for (uint32_t y = 0; y < kDim; ++y) {
                for (uint32_t x = 0; x < kDim; ++x) {
                    // Use an unsigned literal for the alpha shift: `255 << 24` does not fit in
                    // a signed int, so the signed form relies on implementation-defined
                    // behavior prior to C++20.
                    uint32_t value =
                            ((x * 256 / kDim) & 0xFF) |
                            ((y * 256 / kDim) & 0xFF) << 8 |
                            255u << 24;
                    *(inputs++) = value;
                }
            }
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    if (!submit_recording(context, testContext, recorder.get())) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Read back the texture and verify that each pixel matches the packed input word.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected =
                    SkColor4f::FromColor(SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            // Compare component-wise rather than with operator== on SkColor4f.
            bool pass = true;
            for (int i = 0; i < 4; i++) {
                pass &= color[i] == expected[i];
            }
            REPORTER_ASSERT(reporter, pass,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
// Tests that a texture written by one compute step can be sampled by a subsequent step.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureMultipleComputeSteps,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 16;

    // Writes to a texture in slot 0.
    class TestComputeStep1 : public ComputeStep {
    public:
        TestComputeStep1() : ComputeStep(
                /*name=*/"TestStorageTexturesFirstPass",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep1() override = default;

        // Fills every texel of `dst` with opaque green.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step1;

    // Reads from the texture in slot 0 and writes it to another texture in slot 1.
    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"TestStorageTexturesSecondPass",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        // Copies each texel of `src` (written by the first step) into `dst`.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    half4 color = textureRead(src, sk_LocalInvocationID.xy);
                    textureWrite(dst, sk_LocalInvocationID.xy, color);
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            // Only the output texture (resource index 1, slot 1) is instantiated for this step;
            // slot 0 is shared with, and created by, the first step.
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;

    DispatchGroup::Builder builder(recorder.get());
    // Check both appends explicitly (matching the other tests in this file); a silent failure
    // here would otherwise surface as a confusing readback mismatch below.
    if (!builder.appendStep(&step1) || !builder.appendStep(&step2)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read back the second texture; every pixel should be the green written by step 1 and copied
    // by step 2.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
// Tests that a texture can be sampled by a compute step using a sampler.
// TODO(armansito): Once the previous TODO is done, add additional tests that exercise mixed use of
// texture, buffer, and sampler bindings.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SampledTexture,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // The first ComputeStep initializes a 16x16 texture with a checkerboard pattern of alternating
    // red and black pixels. The second ComputeStep downsamples this texture into a 4x4 using
    // bilinear filtering at pixel borders, intentionally averaging the values of each 4x4 tile in
    // the source texture, and writes the result to the destination texture.
    constexpr uint32_t kSrcDim = 16;
    constexpr uint32_t kDstDim = 4;

    class TestComputeStep1 : public ComputeStep {
    public:
        TestComputeStep1() : ComputeStep(
                /*name=*/"Test_SampledTexture_Init",
                /*localDispatchSize=*/{kSrcDim, kSrcDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep1() override = default;

        // Writes a red/black checkerboard: the red channel is 1 when x+y is odd, 0 when even.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint2 c = sk_LocalInvocationID.xy;
                    uint checkerBoardColor = (c.x + (c.y % 2)) % 2;
                    textureWrite(dst, c, half4(checkerBoardColor, 0, 0, 1));
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return {{kSrcDim, kSrcDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step1;

    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"Test_SampledTexture_Sample",
                /*localDispatchSize=*/{kDstDim, kDstDim, 1},
                /*resources=*/{
                    // Declare the storage texture before the sampled texture. This tests that
                    // binding index assignment works consistently across all backends when a
                    // sampler-less texture and a texture+sampler pair are intermixed and sampler
                    // bindings aren't necessarily contiguous when the ranges are distinct.
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    },
                    {
                        /*type=*/ResourceType::kSampledTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    // Normalize the 4x4 invocation indices and sample the source texture using
                    // that.
                    uint2 dstCoord = sk_LocalInvocationID.xy;
                    const float2 dstSizeInv = float2(0.25, 0.25);
                    float2 unormCoord = float2(dstCoord) * dstSizeInv;

                    // Use explicit LOD, as quad derivatives are not available to a compute shader.
                    half4 color = sampleLod(src, unormCoord, 0);
                    textureWrite(dst, dstCoord, color);
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            // NOTE(review): slot 0 (`src`, resource index 1) is created by step1, so the size
            // returned here should only matter for index 0 (`dst`) — confirm against
            // DispatchGroup::Builder before tightening the assert.
            SkASSERT(index == 0 || index == 1);
            return {{kDstDim, kDstDim}, kRGBA_8888_SkColorType};
        }

        SamplerDesc calculateSamplerParameters(int index, const ResourceDesc&) const override {
            SkASSERT(index == 1);
            // Use the repeat tile mode to sample an infinite checkerboard.
            constexpr SkTileMode kTileModes[2] = {SkTileMode::kRepeat, SkTileMode::kRepeat};
            return {SkFilterMode::kLinear, kTileModes};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;

    DispatchGroup::Builder builder(recorder.get());
    // Check both appends explicitly (matching the other tests in this file); a silent failure
    // here would otherwise surface as a confusing readback mismatch below.
    if (!builder.appendStep(&step1) || !builder.appendStep(&step2)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read back the downsampled texture. Averaging a 4x4 checkerboard tile should yield a red
    // channel of ~0.5 everywhere; allow a small tolerance for filtering precision.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDstDim, kDstDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDstDim; ++x) {
        for (uint32_t y = 0; y < kDstDim; ++y) {
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, color.fR > 0.49 && color.fR < 0.51,
                            "At position {%u, %u}, "
                            "expected red channel in range [0.49, 0.51], "
                            "found {%.3f}",
                            x, y, color.fR);
        }
    }
}
// TODO(b/260622403): The shader tested here is identical to
// `resources/sksl/compute/AtomicsOperations.compute`. It would be nice to be able to exercise SkSL
// features like this as part of SkSLTest.cpp instead of as a graphite test.
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Every one of the kWorkgroupCount * kWorkgroupSize invocations increments the shared counter
    // exactly once, so the expected final count checked below is their product.
    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 256;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperations",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // A single mapped storage buffer holding the global atomic counter.
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo { atomicUint globalCounter; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that increments a global (device memory) counter across multiple workgroups.
        // Each workgroup maintains its own independent tally in a workgroup-shared counter which
        // is then added to the global count.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }

        // The only buffer resource is the single 32-bit global counter at slot 0.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero the global counter before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
// TODO(b/260622403): The shader tested here is identical to
// `resources/sksl/compute/AtomicsOperationsOverArrayAndStruct.compute`. It would be nice to be able
// to exercise SkSL features like this as part of SkSLTest.cpp instead of as a graphite test.
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsOverArrayAndStructTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Each invocation increments exactly one of the two counters (split by invocation index), so
    // each counter is expected to end at kWorkgroupCount * kWorkgroupSize / 2.
    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 256;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsOverArrayAndStruct",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // A single mapped storage buffer holding two global atomic counters.
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo {\n"
                                 "    atomicUint globalCountsFirstHalf;\n"
                                 "    atomicUint globalCountsSecondHalf;\n"
                                 "}\n"
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Construct a kernel that increments a two global (device memory) counters across multiple
        // workgroups. Each workgroup maintains its own independent tallies in workgroup-shared
        // counters which are then added to the global counts.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                const uint WORKGROUP_SIZE = 256;

                workgroup atomicUint localCounts[2];

                void main() {
                    // Initialize the local counts.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounts[0], 0);
                        atomicStore(localCounts[1], 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // Each thread increments one of the local counters based on its invocation
                    // index.
                    uint idx = sk_LocalInvocationID.x < (WORKGROUP_SIZE / 2) ? 0 : 1;
                    atomicAdd(localCounts[idx], 1);

                    // Synchronize the threads again to ensure they have all executed the increments
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCountsFirstHalf, atomicLoad(localCounts[0]));
                        atomicAdd(globalCountsSecondHalf, atomicLoad(localCounts[1]));
                    }
                }
            )";
        }

        // The shared buffer holds the two 32-bit global counters declared in the SkSL block.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return 2 * sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero both counters before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            uint32_t* data = static_cast<uint32_t*>(buffer);
            data[0] = 0;
            data[1] = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize / 2;

    const uint32_t* ssboData = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset));
    const uint32_t firstHalfCount = ssboData[0];
    const uint32_t secondHalfCount = ssboData[1];
    REPORTER_ASSERT(reporter,
                    firstHalfCount == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    firstHalfCount);
    REPORTER_ASSERT(reporter,
                    secondHalfCount == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    secondHalfCount);
}
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearedBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    constexpr uint32_t kProblemSize = 512;
    // Each invocation copies a single uint4, so one workgroup of kProblemSize / 4 threads covers
    // the whole problem.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Declares an unmapped input buffer with the kClear policy (zero-initialized by the backend)
    // and copies its contents verbatim into a mapped output buffer that the test reads back.
    class CopyClearedBufferStep : public ComputeStep {
    public:
        CopyClearedBufferStep() : ComputeStep(
                /*name=*/"TestClearedBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Zero-initialized input buffer.
                    {
                        // TODO(b/299979165): Declare this binding as read-only.
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kClear,
                        /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
                    },
                    // Output buffer, shared so the Builder can hand it back to the test, and
                    // mapped so its contents can be read on the CPU.
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~CopyClearedBufferStep() override = default;

        // Straight element-wise copy from the cleared input to the output.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
                }
            )";
        }

        // Both buffers hold kProblemSize 32-bit words.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            return sizeof(uint32_t) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Should receive this call only for the mapped buffer.
            SkASSERT(resourceIndex == 1);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } copyStep;

    DispatchGroup::Builder groupBuilder(recorder.get());
    if (!groupBuilder.appendStep(&copyStep)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The output buffer should have been placed in the right output slot.
    BindBufferInfo outputSlot = groupBuilder.getSharedBufferResource(0);
    if (!outputSlot) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
        return;
    }

    // Record the compute task.
    ComputeTask::DispatchGroupList dispatchGroups;
    dispatchGroups.push_back(groupBuilder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(dispatchGroups)));

    // Make sure the output buffer contents become visible to the CPU after the GPU submission.
    auto readbackBuffer = sync_buffer_to_cpu(recorder.get(), outputSlot.fBuffer);

    // Snap, submit, and block until the GPU work finishes.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Every word of the output must be zero, proving the input was cleared before the copy.
    uint32_t* words = static_cast<uint32_t*>(
            map_buffer(context, testContext, readbackBuffer.get(), outputSlot.fOffset));
    SkASSERT(readbackBuffer->isMapped() && words != nullptr);
    for (uint32_t i = 0; i < kProblemSize; ++i) {
        const uint32_t found = words[i];
        REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
    }
}
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrdering,
                                              reporter,
                                              context,
                                              testContext) {
    // Initiate two independent DispatchGroups operating on the same buffer. The first group
    // writes garbage to the buffer and the second group copies the contents to an output buffer.
    // This test validates that the reads, writes, and clear occur in the expected order.
    constexpr uint32_t kWorkgroupSize = 64;

    // Initialize buffer with non-zero data.
    class FillWithGarbage : public ComputeStep {
    public:
        FillWithGarbage() : ComputeStep(
                /*name=*/"FillWithGarbage",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~FillWithGarbage() override = default;

        // Writes the sentinel 0xFE into every lane of every element.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
                }
            )";
        }
    } garbageStep;

    // Second stage just copies the data to a destination buffer. This is only to verify that this
    // stage, issued in a separate DispatchGroup, observes the clear.
    class CopyBuffer : public ComputeStep {
    public:
        CopyBuffer() : ComputeStep(
                /*name=*/"CopyBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~CopyBuffer() override = default;

        // Element-wise copy from slot 0 to slot 1.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
                }
            )";
        }
    } copyStep;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();
    DispatchGroup::Builder builder(recorder.get());

    constexpr size_t kElementCount = 4 * kWorkgroupSize;
    constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
    auto input = recorder->priv().drawBufferManager()->getStorage(kBufferSize);
    auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);

    ComputeTask::DispatchGroupList groups;

    // First group: fill `input` with garbage.
    builder.assignSharedBuffer({input, kBufferSize}, 0);
    builder.appendStep(&garbageStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    // Second group: re-bind `input` with ClearBuffer::kYes — the clear must be ordered after the
    // garbage write and before the copy — then copy it into `output`.
    builder.reset();
    builder.assignSharedBuffer({input, kBufferSize}, 0, ClearBuffer::kYes);
    builder.assignSharedBuffer({output, kBufferSize}, 1);
    builder.appendStep(&copyStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer: all zeros means the clear was observed by the
    // copy, i.e. it happened after the garbage fill.
    uint32_t* outData = static_cast<uint32_t*>(
            map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kElementCount; ++i) {
        const uint32_t found = outData[i];
        REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
    }
}
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrderingScratchBuffers,
                                              reporter,
                                              context,
                                              testContext) {
    // This test is the same as the ClearOrdering test but the two stages write to a recycled
    // ScratchBuffer. This is primarily to test ScratchBuffer reuse.
    constexpr uint32_t kWorkgroupSize = 64;

    // Initialize buffer with non-zero data.
    class FillWithGarbage : public ComputeStep {
    public:
        FillWithGarbage() : ComputeStep(
                /*name=*/"FillWithGarbage",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~FillWithGarbage() override = default;

        // Writes the sentinel 0xFE into every lane of every element.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
                }
            )";
        }
    } garbageStep;

    // Second stage just copies the data to a destination buffer. This is only to verify that this
    // stage (issued in a separate DispatchGroup) sees the changes.
    class CopyBuffer : public ComputeStep {
    public:
        CopyBuffer() : ComputeStep(
                /*name=*/"CopyBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~CopyBuffer() override = default;

        // Element-wise copy from slot 0 to slot 1.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
                }
            )";
        }
    } copyStep;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();
    DispatchGroup::Builder builder(recorder.get());

    constexpr size_t kElementCount = 4 * kWorkgroupSize;
    constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
    auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);

    ComputeTask::DispatchGroupList groups;

    // First group: suballocate a scratch buffer and fill it with garbage.
    {
        auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
        auto input = scratch.suballocate(kBufferSize);
        builder.assignSharedBuffer({input, kBufferSize}, 0);

        // `scratch` returns to the scratch buffer pool when it goes out of scope
    }
    builder.appendStep(&garbageStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    // Second group: re-acquire scratch storage — expected to recycle the same underlying buffer —
    // and request a clear; the copy should then observe zeros, not the garbage.
    builder.reset();
    {
        auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
        auto input = scratch.suballocate(kBufferSize);
        builder.assignSharedBuffer({input, kBufferSize}, 0, ClearBuffer::kYes);
    }
    builder.assignSharedBuffer({output, kBufferSize}, 1);
    builder.appendStep(&copyStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer: all zeros means the clear of the recycled scratch
    // buffer was ordered after the garbage fill and before the copy.
    uint32_t* outData = static_cast<uint32_t*>(
            map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kElementCount; ++i) {
        const uint32_t found = outData[i];
        REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
    }
}
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_IndirectDispatch,
reporter,
context,
testContext) {
// This fails on Dawn D3D11, b/315834710
if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
return;
}
std::unique_ptr<Recorder> recorder = context->makeRecorder();
constexpr uint32_t kWorkgroupCount = 32;
constexpr uint32_t kWorkgroupSize = 64;
// `IndirectStep` populates a buffer with the global workgroup count for `CountStep`.
// `CountStep` is recorded using `DispatchGroup::appendStepIndirect()` and its workgroups get
// dispatched according to the values computed by `IndirectStep` on the GPU.
class IndirectStep : public ComputeStep {
public:
IndirectStep()
: ComputeStep(
/*name=*/"TestIndirectDispatch_IndirectStep",
/*localDispatchSize=*/{kWorkgroupSize, 1, 1},
/*resources=*/
{{
/*type=*/ResourceType::kIndirectBuffer,
/*flow=*/DataFlow::kShared,
/*policy=*/ResourcePolicy::kClear,
/*slot=*/0,
// TODO(armansito): Ideally the SSBO would have a single member of
// type `IndirectDispatchArgs` struct type. SkSL modules don't
// support struct declarations so this is currently not possible.
/*sksl=*/"ssbo { uint indirect[]; }",
}}) {}
~IndirectStep() override = default;
// Kernel that specifies a workgroup size of `kWorkgroupCount` to be used by the indirect
// dispatch.
std::string computeSkSL() const override {
return R"(
// This needs to match `kWorkgroupCount` declared above.
const uint kWorkgroupCount = 32;
void main() {
if (sk_LocalInvocationID.x == 0) {
indirect[0] = kWorkgroupCount;
indirect[1] = 1;
indirect[2] = 1;
}
}
)";
}
size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
SkASSERT(index == 0);
SkASSERT(r.fSlot == 0);
SkASSERT(r.fFlow == DataFlow::kShared);
return kIndirectDispatchArgumentSize;
}
WorkgroupSize calculateGlobalDispatchSize() const override {
return WorkgroupSize(1, 1, 1);
}
} indirectStep;
// `CountStep` atomically tallies how many invocations actually ran. Because it is dispatched
// indirectly, the final tally verifies the arguments that `IndirectStep` wrote on the GPU.
class CountStep : public ComputeStep {
public:
    CountStep()
            : ComputeStep(
                      /*name=*/"TestIndirectDispatch_CountStep",
                      /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                      /*resources=*/
                      {{
                              /*type=*/ResourceType::kStorageBuffer,
                              /*flow=*/DataFlow::kShared,
                              // Mapped so the test can read the final count back on the CPU.
                              /*policy=*/ResourcePolicy::kMapped,
                              /*slot=*/1,
                              /*sksl=*/"ssbo { atomicUint globalCounter; }",
                      }}) {}
    ~CountStep() override = default;

    // Each workgroup accumulates its invocation count into a workgroup-local atomic, then a
    // single invocation folds that per-workgroup tally into the shared global counter.
    std::string computeSkSL() const override {
        return R"(
            workgroup atomicUint localCounter;
            void main() {
                // Initialize the local counter.
                if (sk_LocalInvocationID.x == 0) {
                    atomicStore(localCounter, 0);
                }
                // Synchronize the threads in the workgroup so they all see the initial value.
                workgroupBarrier();
                // All threads increment the counter.
                atomicAdd(localCounter, 1);
                // Synchronize the threads again to ensure they have all executed the increment
                // and the following load reads the same value across all threads in the
                // workgroup.
                workgroupBarrier();
                // Add the workgroup-only tally to the global counter.
                if (sk_LocalInvocationID.x == 0) {
                    atomicAdd(globalCounter, atomicLoad(localCounter));
                }
            }
        )";
    }

    // The shared output buffer (slot 1) holds only the single `globalCounter` value.
    size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
        SkASSERT(index == 0);
        SkASSERT(r.fSlot == 1);
        SkASSERT(r.fFlow == DataFlow::kShared);
        return sizeof(uint32_t);
    }

    // Zero-initialize the global counter before the dispatch runs.
    void prepareStorageBuffer(int resourceIndex,
                              const ResourceDesc& r,
                              void* buffer,
                              size_t bufferSize) const override {
        SkASSERT(resourceIndex == 0);
        *static_cast<uint32_t*>(buffer) = 0;
    }
} countStep;
// Record both steps into one dispatch group: a direct dispatch of `indirectStep` followed by
// an indirect dispatch of `countStep` driven by the buffer in shared slot 0.
DispatchGroup::Builder builder(recorder.get());
builder.appendStep(&indirectStep);

// Slot 0 holds the indirect argument buffer that `IndirectStep` fills on the GPU.
BindBufferInfo indirectBufferInfo = builder.getSharedBufferResource(0);
if (!indirectBufferInfo) {
    ERRORF(reporter, "Shared resource at slot 0 is missing");
    return;
}
builder.appendStepIndirect(&countStep, {indirectBufferInfo, kIndirectDispatchArgumentSize});

// Slot 1 holds the global counter that is read back and verified below.
BindBufferInfo info = builder.getSharedBufferResource(1);
if (!info) {
    ERRORF(reporter, "Shared resource at slot 1 is missing");
    return;
}

// Record the compute pass task.
ComputeTask::DispatchGroupList groups;
groups.push_back(builder.finalize());
recorder->priv().add(ComputeTask::Make(std::move(groups)));

// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

// Submit the work and wait for it to complete.
std::unique_ptr<Recording> recording = recorder->snap();
if (!recording) {
    ERRORF(reporter, "Failed to make recording");
    return;
}
InsertRecordingInfo insertInfo;
insertInfo.fRecording = recording.get();
context->insertRecording(insertInfo);
testContext->syncedSubmit(context);

// Verify the contents of the output buffer: every invocation of every indirectly dispatched
// workgroup must have been tallied exactly once.
constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
const uint32_t result = static_cast<const uint32_t*>(
        map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
REPORTER_ASSERT(reporter,
                result == kExpectedCount,
                "expected '%u', found '%u'",
                kExpectedCount,
                result);
}
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal,
                                    reporter,
                                    context,
                                    testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 1024;

    // A ComputeStep that bypasses SkSL and supplies its kernel directly as MSL source.
    class MslAtomicCountStep : public ComputeStep {
    public:
        MslAtomicCountStep()
                : ComputeStep(/*name=*/"TestAtomicOperationsMetal",
                              /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                              /*resources=*/
                              {{
                                      /*type=*/ResourceType::kStorageBuffer,
                                      /*flow=*/DataFlow::kShared,
                                      /*policy=*/ResourcePolicy::kMapped,
                                      /*slot=*/0,
                              }},
                              /*workgroupBuffers=*/{},
                              /*baseFlags=*/Flags::kSupportsNativeShader) {}
        ~MslAtomicCountStep() override = default;

        // Hand-written MSL: every threadgroup tallies its invocations into a threadgroup-local
        // atomic, then one invocation folds the tally into the global counter.
        NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
            SkASSERT(format == NativeShaderFormat::kMSL);
            static constexpr std::string_view kSource = R"(
                #include <metal_stdlib>
                using namespace metal;
                kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
                                        device atomic_uint& globalCounter [[buffer(0)]]) {
                    threadgroup atomic_uint localCounter;
                    // Initialize the local counter.
                    if (localId.x == 0u) {
                        atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
                    }
                    // Synchronize the threads in the workgroup so they all see the initial value.
                    threadgroup_barrier(mem_flags::mem_threadgroup);
                    // All threads increment the counter.
                    atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    threadgroup_barrier(mem_flags::mem_threadgroup);
                    // Add the workgroup-only tally to the global counter.
                    if (localId.x == 0u) {
                        uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
                        atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
                    }
                }
            )";
            return {kSource, "atomicCount"};
        }

        // Slot 0 is a single uint32_t: the global invocation tally.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Start the tally at zero before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } atomicStep;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&atomicStep);

    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Schedule the compute work.
    ComputeTask::DispatchGroupList dispatchGroups;
    dispatchGroups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(dispatchGroups)));

    // Make the counter visible to the CPU once the GPU submission has finished.
    auto readbackBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }
    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Every invocation in every workgroup should have been counted exactly once.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const void* mapped =
            map_buffer(context, testContext, readbackBuffer.get(), outputInfo.fOffset);
    const uint32_t result = static_cast<const uint32_t*>(mapped)[0];
    REPORTER_ASSERT(reporter, result == kExpectedCount,
                    "expected '%u', found '%u'", kExpectedCount, result);
}
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal,
                                    reporter,
                                    context,
                                    testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 1024;

    // Same scenario as Compute_NativeShaderSourceMetal, except that the threadgroup counter is
    // an entry-point parameter rather than a kernel-local variable. This forces the workgroup
    // binding to be encoded explicitly through the `workgroupBuffers` descriptor.
    class WorkgroupArgStep : public ComputeStep {
    public:
        WorkgroupArgStep()
                : ComputeStep(/*name=*/"TestAtomicOperationsMetal",
                              /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                              /*resources=*/
                              {{
                                      /*type=*/ResourceType::kStorageBuffer,
                                      /*flow=*/DataFlow::kShared,
                                      /*policy=*/ResourcePolicy::kMapped,
                                      /*slot=*/0,
                              }},
                              /*workgroupBuffers=*/
                              {{
                                      /*size=*/sizeof(uint32_t),
                                      /*index=*/0u,
                              }},
                              /*baseFlags=*/Flags::kSupportsNativeShader) {}
        ~WorkgroupArgStep() override = default;

        // MSL kernel that receives the threadgroup atomic via the [[threadgroup(0)]] binding.
        NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
            SkASSERT(format == NativeShaderFormat::kMSL);
            static constexpr std::string_view kSource = R"(
                #include <metal_stdlib>
                using namespace metal;
                kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
                                        device atomic_uint& globalCounter [[buffer(0)]],
                                        threadgroup atomic_uint& localCounter [[threadgroup(0)]]) {
                    // Initialize the local counter.
                    if (localId.x == 0u) {
                        atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
                    }
                    // Synchronize the threads in the workgroup so they all see the initial value.
                    threadgroup_barrier(mem_flags::mem_threadgroup);
                    // All threads increment the counter.
                    atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    threadgroup_barrier(mem_flags::mem_threadgroup);
                    // Add the workgroup-only tally to the global counter.
                    if (localId.x == 0u) {
                        uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
                        atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
                    }
                }
            )";
            return {kSource, "atomicCount"};
        }

        // Slot 0 is a single uint32_t: the global invocation tally.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Start the tally at zero before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } workgroupStep;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&workgroupStep);

    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Schedule the compute work.
    ComputeTask::DispatchGroupList dispatchGroups;
    dispatchGroups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(dispatchGroups)));

    // Make the counter visible to the CPU once the GPU submission has finished.
    auto readbackBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }
    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Every invocation in every workgroup should have been counted exactly once.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const void* mapped =
            map_buffer(context, testContext, readbackBuffer.get(), outputInfo.fOffset);
    const uint32_t result = static_cast<const uint32_t*>(mapped)[0];
    REPORTER_ASSERT(reporter, result == kExpectedCount,
                    "expected '%u', found '%u'", kExpectedCount, result);
}
DEF_GRAPHITE_TEST_FOR_DAWN_CONTEXT(Compute_NativeShaderSourceWGSL, reporter, context, testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 256;  // The WebGPU default workgroup size limit is 256

    // A ComputeStep that bypasses SkSL and supplies its kernel directly as WGSL source.
    class WgslAtomicCountStep : public ComputeStep {
    public:
        WgslAtomicCountStep()
                : ComputeStep(/*name=*/"TestAtomicOperationsWGSL",
                              /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                              /*resources=*/
                              {{
                                      /*type=*/ResourceType::kStorageBuffer,
                                      /*flow=*/DataFlow::kShared,
                                      /*policy=*/ResourcePolicy::kMapped,
                                      /*slot=*/0,
                              }},
                              /*workgroupBuffers=*/{},
                              /*baseFlags=*/Flags::kSupportsNativeShader) {}
        ~WgslAtomicCountStep() override = default;

        // Hand-written WGSL: every workgroup tallies its invocations into a workgroup-scope
        // atomic, then one invocation folds the tally into the global counter.
        NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
            SkASSERT(format == NativeShaderFormat::kWGSL);
            static constexpr std::string_view kSource = R"(
                @group(0) @binding(0) var<storage, read_write> globalCounter: atomic<u32>;
                var<workgroup> localCounter: atomic<u32>;
                @compute @workgroup_size(256)
                fn atomicCount(@builtin(local_invocation_id) localId: vec3u) {
                    // Initialize the local counter.
                    if localId.x == 0u {
                        atomicStore(&localCounter, 0u);
                    }
                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();
                    // All threads increment the counter.
                    atomicAdd(&localCounter, 1u);
                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();
                    // Add the workgroup-only tally to the global counter.
                    if localId.x == 0u {
                        let tally = atomicLoad(&localCounter);
                        atomicAdd(&globalCounter, tally);
                    }
                }
            )";
            return {kSource, "atomicCount"};
        }

        // Slot 0 is a single uint32_t: the global invocation tally.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Start the tally at zero before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } atomicStep;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&atomicStep);

    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Schedule the compute work.
    ComputeTask::DispatchGroupList dispatchGroups;
    dispatchGroups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(dispatchGroups)));

    // Make the counter visible to the CPU once the GPU submission has finished.
    auto readbackBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }
    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Every invocation in every workgroup should have been counted exactly once.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const void* mapped =
            map_buffer(context, testContext, readbackBuffer.get(), outputInfo.fOffset);
    const uint32_t result = static_cast<const uint32_t*>(mapped)[0];
    REPORTER_ASSERT(reporter, result == kExpectedCount,
                    "expected '%u', found '%u'", kExpectedCount, result);
}