blob: 63d3d2188e07a6cc230a4d6305a2795be40a76ff [file] [log] [blame]
* Copyright 2022 Google LLC
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
#include "tests/Test.h"
#include "include/gpu/graphite/Context.h"
#include "include/gpu/graphite/Recorder.h"
#include "include/gpu/graphite/Recording.h"
#include "src/gpu/graphite/Buffer.h"
#include "src/gpu/graphite/Caps.h"
#include "src/gpu/graphite/ComputePipelineDesc.h"
#include "src/gpu/graphite/ComputeTask.h"
#include "src/gpu/graphite/ComputeTypes.h"
#include "src/gpu/graphite/DrawParams.h"
#include "src/gpu/graphite/RecorderPriv.h"
#include "src/gpu/graphite/ResourceProvider.h"
#include "src/gpu/graphite/SynchronizeToCpuTask.h"
#include "src/gpu/graphite/compute/ComputeStep.h"
#include "src/gpu/graphite/compute/DispatchGroup.h"
using namespace skgpu::graphite;
namespace {
// Identity transform used as the transform argument of the placeholder DrawParams below. It is
// a namespace-scope object, presumably so that DrawParams can refer to it after the helper
// returns -- TODO confirm against the DrawParams definition.
static const Transform kTestTransform = Transform::Identity();
// Builds a placeholder DrawParams for the compute tests: kTestTransform, two default-constructed
// arguments, a DrawOrder constructed from `{}`, and a null final argument. Every test passes the
// result to DispatchGroup::Builder::appendStep().
static DrawParams fake_draw_params_for_testing() {
return DrawParams(kTestTransform, {}, {}, DrawOrder({}), nullptr);
// Maps the buffer referenced by `info` and returns a CPU pointer to the start of the bound
// range, i.e. the mapped base address advanced by `info.fOffset` bytes.
//
// Returns nullptr when `info` has no buffer or when mapping fails. Callers in this file check
// the result against nullptr, so a failed mapping must not be turned into a non-null pointer by
// the offset arithmetic.
void* map_bind_buffer(const BindBufferInfo& info) {
    if (!info.fBuffer) {
        return nullptr;
    }

    // Take a reference so the buffer stays alive for the duration of the map call.
    auto buffer = sk_ref_sp(info.fBuffer);
    uint8_t* ptr = static_cast<uint8_t*>(buffer->map());
    if (!ptr) {
        return nullptr;
    }

    return ptr + info.fOffset;
}
} // namespace
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
// Test: records a single ComputeStep that multiplies kProblemSize floats by kFactor, then maps
// the output buffer on the CPU and checks every element.
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_SingleDispatchTest, reporter, context) {
// Number of floats processed; also the X extent of the step's local dispatch size.
constexpr uint32_t kProblemSize = 512;
// Multiplier that prepareBuffer() stores in the first element of the input buffer.
constexpr float kFactor = 4.f;
// Recorder that backs the DispatchGroup::Builder and is later snapped into a Recording.
std::unique_ptr<Recorder> recorder = context->makeRecorder();
// ComputeStep under test. It declares two buffer resources in binding order: resource 0 is the
// input buffer and resource 1 is the output buffer (see calculateResourceSize() for the sizes
// and the flow/slot each one is expected to have).
class TestComputeStep : public ComputeStep {
// The local dispatch size is {kProblemSize, 1, 1}, so the X extent matches the element count.
TestComputeStep() : ComputeStep(
/*localDispatchSize=*/{kProblemSize, 1, 1},
// Input buffer:
// Output buffer:
/*flow=*/DataFlow::kShared, // shared to allow us to access it from the
// Builder
/*policy=*/ResourcePolicy::kMapped, // mappable for read-back
}) {}
~TestComputeStep() override = default;
// A kernel that multiplies a large array of floats by a supplied factor.
// Binding 0 (`inputBlock`, read-only) holds `factor` followed by `in_data`; binding 1
// (`outputBlock`) holds `out_data`. Each invocation processes the element selected by
// sk_GlobalInvocationID.x.
std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
return R"(
layout(set=0, binding=0) readonly buffer inputBlock
float factor;
float in_data[];
layout(set=0, binding=1) buffer outputBlock
float out_data[];
void main() {
out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
// Reports the size in bytes of the buffer backing resource `index`:
//   - index 0: the private input buffer, one float for the factor plus kProblemSize floats;
//   - index 1: the shared output buffer at slot 0, kProblemSize floats.
// The asserts check that the ResourceDesc passed in has the expected flow and slot.
size_t calculateResourceSize(const DrawParams&,
int index,
const ResourceDesc& r) const override {
if (index == 0) {
SkASSERT(r.fFlow == DataFlow::kPrivate);
return sizeof(float) * (kProblemSize + 1);
SkASSERT(index == 1);
SkASSERT(r.fSlot == 0);
SkASSERT(r.fFlow == DataFlow::kShared);
return sizeof(float) * kProblemSize;
// Fills the CPU-visible input buffer before the dispatch runs.
//
// Only resource 0 (the private input buffer) is written; calls for any other resource return
// without touching `buffer`. The layout matches `inputBlock` in the kernel: element 0 holds the
// factor and elements 1..kProblemSize hold the values 1..kProblemSize.
//
// `buffer` points at the allocation for the resource and `bufferSize` is its size in bytes.
void prepareBuffer(const DrawParams&,
                   int ssboIndex,
                   int resourceIndex,
                   const ResourceDesc& r,
                   void* buffer,
                   size_t bufferSize) const override {
    // Only initialize the input buffer.
    if (resourceIndex != 0) {
        return;
    }
    SkASSERT(r.fFlow == DataFlow::kPrivate);

    // One float for the factor followed by kProblemSize data elements.
    constexpr size_t kElementCount = kProblemSize + 1;
    SkASSERT(bufferSize == sizeof(float) * kElementCount);

    // The span length is an element count, not a byte count; passing the byte size would
    // overstate the span by a factor of sizeof(float) and defeat its bounds checks.
    SkSpan<float> inData(static_cast<float*>(buffer), kElementCount);
    inData[0] = kFactor;
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        inData[i + 1] = i + 1;
    }
}
// The single instance of the step that is handed to the dispatch group builder.
} step;
// Append the step to a dispatch group. A false return from appendStep() is reported as a test
// error.
DispatchGroup::Builder builder(recorder.get());
if (!builder.appendStep(&step, fake_draw_params_for_testing(), 0)) {
ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
// The output buffer should have been placed in the right output slot.
BindBufferInfo outputInfo = builder.outputTable().fSharedSlots[0];
if (!outputInfo) {
ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
// Record the compute task
ComputeTask::DispatchGroupList groups;
// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
// Submit the work and wait for it to complete.
std::unique_ptr<Recording> recording = recorder->snap();
if (!recording) {
ERRORF(reporter, "Failed to make recording");
InsertRecordingInfo insertInfo;
insertInfo.fRecording = recording.get();
// Verify the contents of the output buffer.
float* outData = static_cast<float*>(map_bind_buffer(outputInfo));
SkASSERT(outputInfo.fBuffer->isMapped() && outData != nullptr);
// Element i of the input was initialized to (i + 1), so the expected output is (i + 1) * kFactor.
// (i + 1) is at most 512 and kFactor is 4, so every expected product is exactly representable
// as a float and is compared with ==.
for (unsigned int i = 0; i < kProblemSize; ++i) {
const float expected = (i + 1) * kFactor;
const float found = outData[i];
REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
// Test: chains two ComputeSteps in one DispatchGroup. The first step multiplies the input by
// kFactor1, the second multiplies the first step's output by kFactor2, and the final output is
// checked on the CPU.
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_DispatchGroupTest, reporter, context) {
// Number of floats processed; also the X extent of each step's local dispatch size.
constexpr uint32_t kProblemSize = 512;
// Factor written into the first step's input buffer.
constexpr float kFactor1 = 4.f;
// Factor written into the second step's private factor buffer.
constexpr float kFactor2 = 3.f;
std::unique_ptr<Recorder> recorder = context->makeRecorder();
// Define two steps that perform two multiplication passes over the same input.
// First step of the chain. It declares three buffer resources in binding order: the input
// buffer (resource 0), the forwarded output read by the second step (resource 1), and a small
// extra output buffer (resource 2). calculateResourceSize() documents their sizes and slots.
class TestComputeStep1 : public ComputeStep {
TestComputeStep1() : ComputeStep(
/*localDispatchSize=*/{kProblemSize, 1, 1},
// Input buffer:
/*policy=*/ResourcePolicy::kMapped, // mappable for read-back
// Output buffers:
/*policy=*/ResourcePolicy::kNone, // GPU-only, read by second step
/*policy=*/ResourcePolicy::kMapped, // mappable for read-back
}) {}
~TestComputeStep1() override = default;
// A kernel that multiplies a large array of floats by a supplied factor.
// The product is written to `forward_data` (binding 1). Every invocation additionally stores
// `factor` and `2 * factor` into the two-element `extra_data` array (binding 2).
std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
return R"(
layout(set=0, binding=0) readonly buffer inputBlock
float factor;
float in_data[];
layout(set=0, binding=1) buffer outputBlock1
float forward_data[];
layout(set=0, binding=2) buffer outputBlock2
float extra_data[2];
void main() {
forward_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
extra_data[0] = factor;
extra_data[1] = 2 * factor;
// Reports the size in bytes of the buffer backing resource `index`:
//   - index 0: the private input buffer, one float for the factor plus kProblemSize floats;
//   - index 1: the shared buffer at slot 0 that carries the products, kProblemSize floats;
//   - index 2: the shared buffer at slot 1 that carries the two extra floats.
// The asserts check that the ResourceDesc passed in has the expected flow and slot.
size_t calculateResourceSize(const DrawParams&,
int index,
const ResourceDesc& r) const override {
if (index == 0) {
SkASSERT(r.fFlow == DataFlow::kPrivate);
return sizeof(float) * (kProblemSize + 1);
if (index == 1) {
SkASSERT(r.fFlow == DataFlow::kShared);
SkASSERT(r.fSlot == 0);
return sizeof(float) * kProblemSize;
SkASSERT(index == 2);
SkASSERT(r.fSlot == 1);
SkASSERT(r.fFlow == DataFlow::kShared);
return 2 * sizeof(float);
// Fills the CPU-visible input buffer of the first step before the dispatch runs.
//
// Only resource 0 (the input buffer) is written; calls for any other resource return without
// touching `buffer`. The layout matches `inputBlock` in the kernel: element 0 holds kFactor1 and
// elements 1..kProblemSize hold the values 1..kProblemSize.
//
// `buffer` points at the allocation for the resource and `bufferSize` is its size in bytes.
void prepareBuffer(const DrawParams&,
                   int ssboIndex,
                   int resourceIndex,
                   const ResourceDesc& r,
                   void* buffer,
                   size_t bufferSize) const override {
    // Only initialize the input buffer.
    if (resourceIndex != 0) {
        return;
    }

    // One float for the factor followed by kProblemSize data elements.
    constexpr size_t kElementCount = kProblemSize + 1;
    SkASSERT(bufferSize == sizeof(float) * kElementCount);

    // The span length is an element count, not a byte count; passing the byte size would
    // overstate the span by a factor of sizeof(float) and defeat its bounds checks.
    SkSpan<float> inData(static_cast<float*>(buffer), kElementCount);
    inData[0] = kFactor1;
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        inData[i + 1] = i + 1;
    }
}
// Instance of the first step that is appended to the dispatch group.
} step1;
// Second step of the chain. It declares three buffer resources in binding order: the data
// produced by the first step (resource 0), a private factor buffer (resource 1), and the final
// output buffer (resource 2). calculateResourceSize() documents their sizes and slots.
class TestComputeStep2 : public ComputeStep {
TestComputeStep2() : ComputeStep(
/*localDispatchSize=*/{kProblemSize, 1, 1},
// Input buffer:
/*policy=*/ResourcePolicy::kNone, // GPU-only
/*slot=*/0, // this is the output from the first step
// Output buffer:
/*policy=*/ResourcePolicy::kMapped, // mappable for read-back
}) {}
~TestComputeStep2() override = default;
// A kernel that multiplies a large array of floats by a supplied factor.
// Unlike the first step, the factor lives in its own read-only block (binding 1) rather than at
// the front of the input data: binding 0 holds `in_data` and binding 2 holds `out_data`.
std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
return R"(
layout(set=0, binding=0) readonly buffer inputBlock
float in_data[];
layout(set=0, binding=1) readonly buffer factorBlock
float factor;
layout(set=0, binding=2) buffer outputBlock
float out_data[];
void main() {
out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
// Reports the size in bytes of the buffer backing resource `index`:
//   - index 0: the data buffer read by the kernel, kProblemSize floats;
//   - index 1: the private factor buffer, a single float;
//   - index 2: the shared output buffer at slot 2, kProblemSize floats.
// The asserts check that the ResourceDesc passed in has the expected flow and slot.
size_t calculateResourceSize(const DrawParams&,
int index,
const ResourceDesc& r) const override {
if (index == 0) {
return sizeof(float) * kProblemSize;
if (index == 1) {
SkASSERT(r.fFlow == DataFlow::kPrivate);
return sizeof(float);
SkASSERT(index == 2);
SkASSERT(r.fSlot == 2);
SkASSERT(r.fFlow == DataFlow::kShared);
return sizeof(float) * kProblemSize;
// Writes kFactor2 into the private factor buffer (the buffer is treated as a single float).
// NOTE(review): the body of the `resourceIndex != 1` guard is not visible in this view; it is
// presumably an early return so that only resource 1 is written -- confirm.
void prepareBuffer(const DrawParams&,
int ssboIndex,
int resourceIndex,
const ResourceDesc& r,
void* buffer,
size_t bufferSize) const override {
if (resourceIndex != 1) {
SkASSERT(r.fFlow == DataFlow::kPrivate);
*static_cast<float*>(buffer) = kFactor2;
// Instance of the second step that is appended to the dispatch group.
} step2;
// Append both steps to the same dispatch group, in execution order.
// NOTE(review): unlike Compute_SingleDispatchTest, the appendStep() results are not checked here.
DispatchGroup::Builder builder(recorder.get());
builder.appendStep(&step1, fake_draw_params_for_testing(), 0);
builder.appendStep(&step2, fake_draw_params_for_testing(), 0);
// Slots 0, 1, and 2 should all contain shared buffers. Slot 1 contains the extra output buffer
// from step 1 while slot 2 contains the result of the second multiplication pass from step 2.
// Slot 0 is not mappable.
"shared resource at slot 0 is missing");
// Final output of the chain (resource 2 of the second step).
BindBufferInfo outputInfo = builder.outputTable().fSharedSlots[2];
if (!outputInfo) {
ERRORF(reporter, "shared resource at slot 2 is missing");
// Extra output buffer from step 1 (corresponding to 'outputBlock2')
BindBufferInfo extraOutputInfo = builder.outputTable().fSharedSlots[1];
if (!extraOutputInfo) {
ERRORF(reporter, "shared resource at slot 1 is missing");
// Record the compute task
ComputeTask::DispatchGroupList groups;
// Ensure the output buffers get synchronized to the CPU once the GPU submission has finished.
// Submit the work and wait for it to complete.
std::unique_ptr<Recording> recording = recorder->snap();
if (!recording) {
ERRORF(reporter, "Failed to make recording");
InsertRecordingInfo insertInfo;
insertInfo.fRecording = recording.get();
// Verify the contents of the output buffer from step 2
float* outData = static_cast<float*>(map_bind_buffer(outputInfo));
SkASSERT(outputInfo.fBuffer->isMapped() && outData != nullptr);
// Element i of the input was initialized to (i + 1) and is scaled by both factors. The largest
// expected value is 512 * 4 * 3, so every product is exactly representable as a float and is
// compared with ==.
for (unsigned int i = 0; i < kProblemSize; ++i) {
const float expected = (i + 1) * kFactor1 * kFactor2;
const float found = outData[i];
REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
// Verify the contents of the extra output buffer from step 1. The step-1 kernel stores `factor`
// and `2 * factor` into it, and the factor supplied to step 1 is kFactor1, so both the checked
// condition and the value reported on failure are expressed in terms of kFactor1.
float* extraOutData = static_cast<float*>(map_bind_buffer(extraOutputInfo));
SkASSERT(extraOutputInfo.fBuffer->isMapped() && extraOutData != nullptr);
REPORTER_ASSERT(reporter,
                kFactor1 == extraOutData[0],
                "expected '%f', found '%f'",
                kFactor1,
                extraOutData[0]);
REPORTER_ASSERT(reporter,
                2 * kFactor1 == extraOutData[1],
                "expected '%f', found '%f'",
                2 * kFactor1,
                extraOutData[1]);
// TODO(b/260622403): The shader tested here is identical to
// `resources/sksl/compute/AtomicsOperations.compute`. It would be nice to be able to exercise SkSL
// features like this as part of SkSLTest.cpp instead of as a graphite test.
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
// Test: dispatches kWorkgroupCount workgroups of kWorkgroupSize invocations that each add one to
// a counter via atomics, then checks the global total on the CPU.
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_AtomicOperationsTest, reporter, context) {
std::unique_ptr<Recorder> recorder = context->makeRecorder();
// Number of workgroups returned by calculateGlobalDispatchSize().
constexpr uint32_t kWorkgroupCount = 32;
// X extent of the step's local dispatch size.
constexpr uint32_t kWorkgroupSize = 1024;
// ComputeStep under test. It uses a single buffer resource (index 0) that holds the global
// counter; see calculateResourceSize() for the size, slot and flow it is expected to have.
class TestComputeStep : public ComputeStep {
TestComputeStep() : ComputeStep(
/*localDispatchSize=*/{kWorkgroupSize, 1, 1},
}) {}
~TestComputeStep() override = default;
// A kernel that increments a global (device memory) counter across multiple workgroups.
// Each workgroup maintains its own independent tally in a workgroup-shared counter which
// is then added to the global count.
//
// This exercises atomic store/load/add and coherent reads and writes over memory in storage
// and workgroup address spaces.
//
// In each workgroup, the invocation with local ID 0 both resets the local counter and publishes
// the workgroup's tally to `globalCounter`.
std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
return R"(
layout(metal, binding = 0) buffer ssbo {
atomicUint globalCounter;
workgroup atomicUint localCounter;
void main() {
// Initialize the local counter.
if (sk_LocalInvocationID.x == 0) {
atomicStore(localCounter, 0);
// Synchronize the threads in the workgroup so they all see the initial value.
// All threads increment the counter.
atomicAdd(localCounter, 1);
// Synchronize the threads again to ensure they have all executed the increment
// and the following load reads the same value across all threads in the
// workgroup.
// Add the workgroup-only tally to the global counter.
if (sk_LocalInvocationID.x == 0) {
atomicAdd(globalCounter, atomicLoad(localCounter));
// Reports the size in bytes of the only resource (index 0): one uint32_t for the global
// counter. The asserts check that it is the shared buffer at slot 0.
size_t calculateResourceSize(const DrawParams&,
int index,
const ResourceDesc& r) const override {
SkASSERT(index == 0);
SkASSERT(r.fSlot == 0);
SkASSERT(r.fFlow == DataFlow::kShared);
return sizeof(uint32_t);
// Returns the global dispatch size: kWorkgroupCount in X, 1 in Y and Z. The test's expected
// total (kWorkgroupCount * kWorkgroupSize) relies on this value.
WorkgroupSize calculateGlobalDispatchSize(const DrawParams&) const override {
return WorkgroupSize(kWorkgroupCount, 1, 1);
// Zeroes the global counter before the dispatch runs. Only resource 0 is expected here; the
// buffer is treated as a single uint32_t.
void prepareBuffer(const DrawParams&,
int ssboIndex,
int resourceIndex,
const ResourceDesc& r,
void* buffer,
size_t bufferSize) const override {
SkASSERT(resourceIndex == 0);
*static_cast<uint32_t*>(buffer) = 0;
// The single instance of the step that is handed to the dispatch group builder.
} step;
// Append the step to a dispatch group and look up the shared counter buffer at slot 0.
// NOTE(review): the appendStep() result is not checked here.
DispatchGroup::Builder builder(recorder.get());
builder.appendStep(&step, fake_draw_params_for_testing(), 0);
BindBufferInfo info = builder.outputTable().fSharedSlots[0];
if (!info) {
ERRORF(reporter, "shared resource at slot 0 is missing");
// Record the compute pass task.
ComputeTask::DispatchGroupList groups;
// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
// Submit the work and wait for it to complete.
std::unique_ptr<Recording> recording = recorder->snap();
if (!recording) {
ERRORF(reporter, "Failed to make recording");
InsertRecordingInfo insertInfo;
insertInfo.fRecording = recording.get();
// Verify the contents of the output buffer.
constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
const uint32_t result = static_cast<const uint32_t*>(map_bind_buffer(info))[0];
result == kExpectedCount,
"expected '%d', found '%d'",
// TODO(b/260622403): The shader tested here is identical to
// `resources/sksl/compute/AtomicsOperationsOverArrayAndStruct.compute`. It would be nice to be able
// to exercise SkSL features like this as part of SkSLTest.cpp instead of as a graphite test.
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
// Test: same counting scheme as Compute_AtomicOperationsTest, but with two counters -- the local
// tallies live in a workgroup array and the global tallies in a struct.
// NOTE(review): the opening of this test's macro invocation (test name and reporter argument) is
// not visible in this view; only its final argument is.
context) {
std::unique_ptr<Recorder> recorder = context->makeRecorder();
// Number of workgroups returned by calculateGlobalDispatchSize().
constexpr uint32_t kWorkgroupCount = 32;
// X extent of the step's local dispatch size; the kernel's WORKGROUP_SIZE constant is also 1024.
constexpr uint32_t kWorkgroupSize = 1024;
// ComputeStep under test. It uses a single buffer resource (index 0) that holds both global
// counters; see calculateResourceSize() for the size, slot and flow it is expected to have.
class TestComputeStep : public ComputeStep {
TestComputeStep() : ComputeStep(
/*localDispatchSize=*/{kWorkgroupSize, 1, 1},
}) {}
~TestComputeStep() override = default;
// Construct a kernel that increments two global (device memory) counters across multiple
// workgroups. Each workgroup maintains its own independent tallies in workgroup-shared
// counters which are then added to the global counts.
//
// This exercises atomic store/load/add and coherent reads and writes over memory in storage
// and workgroup address spaces.
//
// Invocations whose local ID is below WORKGROUP_SIZE / 2 increment localCounts[0]; the rest
// increment localCounts[1].
std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
return R"(
const uint WORKGROUP_SIZE = 1024;
struct GlobalCounts {
atomicUint firstHalfCount;
atomicUint secondHalfCount;
layout(metal, binding = 0) buffer ssbo {
GlobalCounts globalCounts;
workgroup atomicUint localCounts[2];
void main() {
// Initialize the local counts.
if (sk_LocalInvocationID.x == 0) {
atomicStore(localCounts[0], 0);
atomicStore(localCounts[1], 0);
// Synchronize the threads in the workgroup so they all see the initial value.
// Each thread increments one of the local counters based on its invocation
// index.
uint idx = sk_LocalInvocationID.x < (WORKGROUP_SIZE / 2) ? 0 : 1;
atomicAdd(localCounts[idx], 1);
// Synchronize the threads again to ensure they have all executed the increments
// and the following load reads the same value across all threads in the
// workgroup.
// Add the workgroup-only tally to the global counter.
if (sk_LocalInvocationID.x == 0) {
atomicAdd(globalCounts.firstHalfCount, atomicLoad(localCounts[0]));
atomicAdd(globalCounts.secondHalfCount, atomicLoad(localCounts[1]));
// Reports the size in bytes of the only resource (index 0): two uint32_t values, one per global
// counter. The asserts check that it is the shared buffer at slot 0.
size_t calculateResourceSize(const DrawParams&,
int index,
const ResourceDesc& r) const override {
SkASSERT(index == 0);
SkASSERT(r.fSlot == 0);
SkASSERT(r.fFlow == DataFlow::kShared);
return 2 * sizeof(uint32_t);
// Returns the global dispatch size: kWorkgroupCount in X, 1 in Y and Z. The test's expected
// per-counter total (kWorkgroupCount * kWorkgroupSize / 2) relies on this value.
WorkgroupSize calculateGlobalDispatchSize(const DrawParams&) const override {
return WorkgroupSize(kWorkgroupCount, 1, 1);
// Zeroes both global counters before the dispatch runs. Only resource 0 is expected here; the
// buffer is treated as an array of two uint32_t values.
void prepareBuffer(const DrawParams&,
int ssboIndex,
int resourceIndex,
const ResourceDesc& r,
void* buffer,
size_t bufferSize) const override {
SkASSERT(resourceIndex == 0);
uint32_t* data = static_cast<uint32_t*>(buffer);
data[0] = 0;
data[1] = 0;
// The single instance of the step that is handed to the dispatch group builder.
} step;
// Append the step to a dispatch group and look up the shared counter buffer at slot 0.
// NOTE(review): the appendStep() result is not checked here.
DispatchGroup::Builder builder(recorder.get());
builder.appendStep(&step, fake_draw_params_for_testing(), 0);
BindBufferInfo info = builder.outputTable().fSharedSlots[0];
if (!info) {
ERRORF(reporter, "shared resource at slot 0 is missing");
// Record the compute pass task.
ComputeTask::DispatchGroupList groups;
// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
// Submit the work and wait for it to complete.
std::unique_ptr<Recording> recording = recorder->snap();
if (!recording) {
ERRORF(reporter, "Failed to make recording");
InsertRecordingInfo insertInfo;
insertInfo.fRecording = recording.get();
// Verify the contents of the output buffer.
constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize / 2;
const uint32_t* ssboData = static_cast<const uint32_t*>(map_bind_buffer(info));
const uint32_t firstHalfCount = ssboData[0];
const uint32_t secondHalfCount = ssboData[1];
firstHalfCount == kExpectedCount,
"expected '%d', found '%d'",
secondHalfCount == kExpectedCount,
"expected '%d', found '%d'",