[graphite][compute] Buffer clear at DispatchGroup granularity
When storage buffers get requested from DrawBufferManager with the
`ClearBuffer::kYes` argument, DrawBufferManager adds a buffer clear
record to a list. The list gets transferred to a Recording as a single
`ClearBuffersTask` that precedes all compute and render tasks.
This doesn't work if a buffer gets reused across two DispatchGroups
under the same Recording, such that the first DispatchGroup writes to it
and the second group requires it to be cleared to 0.
This CL addresses the issue by having each DispatchGroup maintain its
own separate buffer clear list. When the DispatchGroups get snapped into
a ComputeTask, the clears become individual child tasks that must be
interleaved with compute passes according to their dependencies.
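Illustrative sketch of the resulting usage (condensed from the new
ComputeTest below; the step names are placeholders, and Builder::reset()
is only exposed for tests in this CL):

    DispatchGroup::Builder builder(recorder.get());
    ComputeTask::DispatchGroupList groups;

    // First group writes to `input`.
    builder.assignSharedBuffer({input, kSize}, /*slot=*/0);
    builder.appendStep(&producerStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    // Second group needs `input` zeroed first. The clear is recorded in
    // this group's own clear list and snapped into a child
    // ClearBuffersTask that runs between the two compute passes.
    builder.reset();
    builder.assignSharedBuffer({input, kSize}, /*slot=*/0, ClearBuffer::kYes);
    builder.appendStep(&consumerStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    recorder->priv().add(ComputeTask::Make(std::move(groups)));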
Bug: b/301139790
Change-Id: If8f3be6a076a799a827b482f9bf70c3370415f35
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/828477
Commit-Queue: Arman Uguray <armansito@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
Reviewed-by: Jim Van Verth <jvanverth@google.com>
Reviewed-by: James Godfrey-Kittle <jamesgk@google.com>
diff --git a/src/gpu/graphite/CommandBuffer.cpp b/src/gpu/graphite/CommandBuffer.cpp
index 7bdca1c..ea8e51a 100644
--- a/src/gpu/graphite/CommandBuffer.cpp
+++ b/src/gpu/graphite/CommandBuffer.cpp
@@ -107,7 +107,7 @@
return true;
}
-bool CommandBuffer::addComputePass(const DispatchGroupList& dispatchGroups) {
+bool CommandBuffer::addComputePass(DispatchGroupSpan dispatchGroups) {
TRACE_EVENT0("skia.gpu", TRACE_FUNC);
if (!this->onAddComputePass(dispatchGroups)) {
diff --git a/src/gpu/graphite/CommandBuffer.h b/src/gpu/graphite/CommandBuffer.h
index c36022e..b6e73be 100644
--- a/src/gpu/graphite/CommandBuffer.h
+++ b/src/gpu/graphite/CommandBuffer.h
@@ -38,7 +38,7 @@
class CommandBuffer {
public:
using DrawPassList = skia_private::TArray<std::unique_ptr<DrawPass>>;
- using DispatchGroupList = skia_private::TArray<std::unique_ptr<DispatchGroup>>;
+ using DispatchGroupSpan = SkSpan<const std::unique_ptr<DispatchGroup>>;
virtual ~CommandBuffer();
@@ -80,7 +80,7 @@
SkRect viewport,
const DrawPassList& drawPasses);
- bool addComputePass(const DispatchGroupList& dispatchGroups);
+ bool addComputePass(DispatchGroupSpan dispatchGroups);
//---------------------------------------------------------------
// Can only be used outside renderpasses
@@ -131,7 +131,7 @@
SkRect viewport,
const DrawPassList& drawPasses) = 0;
- virtual bool onAddComputePass(const DispatchGroupList& dispatchGroups) = 0;
+ virtual bool onAddComputePass(DispatchGroupSpan dispatchGroups) = 0;
virtual bool onCopyBufferToBuffer(const Buffer* srcBuffer,
size_t srcOffset,
diff --git a/src/gpu/graphite/compute/DispatchGroup.cpp b/src/gpu/graphite/compute/DispatchGroup.cpp
index 63dccdf..a7a28f2 100644
--- a/src/gpu/graphite/compute/DispatchGroup.cpp
+++ b/src/gpu/graphite/compute/DispatchGroup.cpp
@@ -18,6 +18,7 @@
#include "src/gpu/graphite/ResourceProvider.h"
#include "src/gpu/graphite/Texture.h"
#include "src/gpu/graphite/UniformManager.h"
+#include "src/gpu/graphite/task/ClearBuffersTask.h"
namespace skgpu::graphite {
@@ -72,6 +73,13 @@
}
}
+sk_sp<Task> DispatchGroup::snapChildTask() {
+ if (fClearList.empty()) {
+ return nullptr;
+ }
+ return ClearBuffersTask::Make(std::move(fClearList));
+}
+
const Texture* DispatchGroup::getTexture(size_t index) const {
SkASSERT(index < SkToSizeT(fTextures.size()));
SkASSERT(fTextures[index]);
@@ -233,12 +241,15 @@
return true;
}
-void Builder::assignSharedBuffer(BufferView buffer, unsigned int slot) {
+void Builder::assignSharedBuffer(BufferView buffer, unsigned int slot, ClearBuffer cleared) {
SkASSERT(fObj);
SkASSERT(buffer.fInfo);
SkASSERT(buffer.fSize);
fOutputTable.fSharedSlots[slot] = buffer;
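+    // Record the clear in this DispatchGroup's own list; snapChildTask() will turn it into a
+    // ClearBuffersTask that executes before the group's dispatches.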
+ if (cleared == ClearBuffer::kYes) {
+ fObj->fClearList.push_back({buffer.fInfo.fBuffer, buffer.fInfo.fOffset, buffer.fSize});
+ }
}
void Builder::assignSharedTexture(sk_sp<TextureProxy> texture, unsigned int slot) {
@@ -255,6 +266,13 @@
return obj;
}
+#if defined(GRAPHITE_TEST_UTILS)
+void Builder::reset() {
+ fOutputTable.reset();
+ fObj.reset(new DispatchGroup);
+}
+#endif
+
BindBufferInfo Builder::getSharedBufferResource(unsigned int slot) const {
SkASSERT(fObj);
@@ -281,6 +299,7 @@
const ComputeStep::ResourceDesc& resource,
int resourceIdx) {
SkASSERT(step);
+ SkASSERT(fObj);
using Type = ComputeStep::ResourceType;
using ResourcePolicy = ComputeStep::ResourcePolicy;
diff --git a/src/gpu/graphite/compute/DispatchGroup.h b/src/gpu/graphite/compute/DispatchGroup.h
index 152e440..05c2cc1 100644
--- a/src/gpu/graphite/compute/DispatchGroup.h
+++ b/src/gpu/graphite/compute/DispatchGroup.h
@@ -83,6 +83,10 @@
bool prepareResources(ResourceProvider*);
void addResourceRefs(CommandBuffer*) const;
+    // Returns a single task that must execute before this DispatchGroup, or nullptr if the
+    // group has no task dependencies.
+ sk_sp<Task> snapChildTask();
+
private:
friend class DispatchGroupBuilder;
@@ -94,6 +98,9 @@
skia_private::TArray<Dispatch> fDispatchList;
+ // The list of all buffers that must be cleared before the dispatches.
+ skia_private::TArray<ClearBufferInfo> fClearList;
+
// Pipelines are referenced by index by each Dispatch in `fDispatchList`. They are stored as a
// pipeline description until instantiated in `prepareResources()`.
skia_private::TArray<ComputePipelineDesc> fPipelineDescs;
@@ -148,7 +155,12 @@
// If the slot is already assigned a buffer, it will be overwritten. Calling this method does
// not have any effect on previously appended ComputeSteps that were already bound that
// resource.
- void assignSharedBuffer(BufferView buffer, unsigned int slot);
+ //
+ // If `cleared` is kYes, the contents of the given view will be cleared to 0 before the current
+ // DispatchGroup gets submitted.
+ void assignSharedBuffer(BufferView buffer,
+ unsigned int slot,
+ ClearBuffer cleared = ClearBuffer::kNo);
// Directly assign a texture to a shared slot. ComputeSteps that are appended after this call
// will use this resource if they reference the given `slot` index. Builder will not allocate
@@ -160,10 +172,17 @@
// resource.
void assignSharedTexture(sk_sp<TextureProxy> texture, unsigned int slot);
- // Finalize and return the constructed DispatchGroup. The Builder can be used to construct a new
- // DispatchGroup after this method returns.
+ // Finalize and return the constructed DispatchGroup.
+ //
+ // The Builder can be used to construct a new DispatchGroup by calling "reset()" after this
+ // method returns.
std::unique_ptr<DispatchGroup> finalize();
+#if defined(GRAPHITE_TEST_UTILS)
+ // Clear old state and start a new DispatchGroup.
+ void reset();
+#endif
+
// Returns the buffer resource assigned to the shared slot with the given index, if any.
BindBufferInfo getSharedBufferResource(unsigned int slot) const;
diff --git a/src/gpu/graphite/dawn/DawnCommandBuffer.cpp b/src/gpu/graphite/dawn/DawnCommandBuffer.cpp
index dc68179..e27c5c4 100644
--- a/src/gpu/graphite/dawn/DawnCommandBuffer.cpp
+++ b/src/gpu/graphite/dawn/DawnCommandBuffer.cpp
@@ -115,7 +115,7 @@
return true;
}
-bool DawnCommandBuffer::onAddComputePass(const DispatchGroupList& groups) {
+bool DawnCommandBuffer::onAddComputePass(DispatchGroupSpan groups) {
this->beginComputePass();
for (const auto& group : groups) {
group->addResourceRefs(this);
diff --git a/src/gpu/graphite/dawn/DawnCommandBuffer.h b/src/gpu/graphite/dawn/DawnCommandBuffer.h
index 24b4809..1a861b2 100644
--- a/src/gpu/graphite/dawn/DawnCommandBuffer.h
+++ b/src/gpu/graphite/dawn/DawnCommandBuffer.h
@@ -48,7 +48,7 @@
const Texture* depthStencilTexture,
SkRect viewport,
const DrawPassList&) override;
- bool onAddComputePass(const DispatchGroupList&) override;
+ bool onAddComputePass(DispatchGroupSpan) override;
// Methods for populating a Dawn RenderPassEncoder:
bool beginRenderPass(const RenderPassDesc&,
diff --git a/src/gpu/graphite/mtl/MtlCommandBuffer.h b/src/gpu/graphite/mtl/MtlCommandBuffer.h
index 0de3c55..5e0d3d9 100644
--- a/src/gpu/graphite/mtl/MtlCommandBuffer.h
+++ b/src/gpu/graphite/mtl/MtlCommandBuffer.h
@@ -76,7 +76,7 @@
const Texture* depthStencilTexture,
SkRect viewport,
const DrawPassList&) override;
- bool onAddComputePass(const DispatchGroupList&) override;
+ bool onAddComputePass(DispatchGroupSpan) override;
// Methods for populating a MTLRenderCommandEncoder:
bool beginRenderPass(const RenderPassDesc&,
diff --git a/src/gpu/graphite/mtl/MtlCommandBuffer.mm b/src/gpu/graphite/mtl/MtlCommandBuffer.mm
index bd27dfd..9ba07ee 100644
--- a/src/gpu/graphite/mtl/MtlCommandBuffer.mm
+++ b/src/gpu/graphite/mtl/MtlCommandBuffer.mm
@@ -165,7 +165,7 @@
return true;
}
-bool MtlCommandBuffer::onAddComputePass(const DispatchGroupList& groups) {
+bool MtlCommandBuffer::onAddComputePass(DispatchGroupSpan groups) {
this->beginComputePass();
for (const auto& group : groups) {
group->addResourceRefs(this);
diff --git a/src/gpu/graphite/task/ComputeTask.cpp b/src/gpu/graphite/task/ComputeTask.cpp
index cb18dac..65fba0e 100644
--- a/src/gpu/graphite/task/ComputeTask.cpp
+++ b/src/gpu/graphite/task/ComputeTask.cpp
@@ -19,11 +19,20 @@
}
ComputeTask::ComputeTask(DispatchGroupList dispatchGroups)
- : fDispatchGroups(std::move(dispatchGroups)) {}
+ : fDispatchGroups(std::move(dispatchGroups)), fChildTasks(fDispatchGroups.size()) {
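+    // Snap each group's prerequisite task (e.g. buffer clears) up front; entries are allowed
+    // to be nullptr when a group has no dependencies.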
+ for (auto& group : fDispatchGroups) {
+ fChildTasks.push_back(group->snapChildTask());
+ }
+}
ComputeTask::~ComputeTask() = default;
-bool ComputeTask::prepareResources(ResourceProvider* provider, const RuntimeEffectDictionary*) {
+bool ComputeTask::prepareResources(ResourceProvider* provider, const RuntimeEffectDictionary* rtd) {
+ for (const auto& child : fChildTasks) {
+ if (child) {
+ child->prepareResources(provider, rtd);
+ }
+ }
for (const auto& group : fDispatchGroups) {
if (!group->prepareResources(provider)) {
return false;
@@ -32,8 +41,34 @@
return true;
}
-bool ComputeTask::addCommands(Context*, CommandBuffer* commandBuffer, ReplayTargetData) {
- return commandBuffer->addComputePass(fDispatchGroups);
+bool ComputeTask::addCommands(Context* ctx, CommandBuffer* commandBuffer, ReplayTargetData rtd) {
+ if (fDispatchGroups.empty()) {
+ return true;
+ }
+ SkASSERT(fDispatchGroups.size() == fChildTasks.size());
+ const std::unique_ptr<DispatchGroup>* currentSpanPtr = &fDispatchGroups[0];
+ size_t currentSpanSize = 0u;
+ for (int i = 0; i < fDispatchGroups.size(); ++i) {
+        // If the next DispatchGroup has a prerequisite child task, then encode the accumulated
+        // span as a compute pass now. CommandBuffer encodes each compute pass with a separate
+        // encoder, so the child task can use a non-compute encoder if needed.
+ Task* child = fChildTasks[i].get();
+ if (child) {
+ if (currentSpanSize > 0u) {
+ if (!commandBuffer->addComputePass({currentSpanPtr, currentSpanSize})) {
+ return false;
+ }
+ currentSpanPtr = &fDispatchGroups[i];
+ currentSpanSize = 0u;
+ }
+ if (!child->addCommands(ctx, commandBuffer, rtd)) {
+ return false;
+ }
+ }
+ currentSpanSize++;
+ }
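+    // Encode any groups accumulated after the last child task as the final compute pass.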
+ return currentSpanSize == 0u ||
+ commandBuffer->addComputePass({currentSpanPtr, currentSpanSize});
}
} // namespace skgpu::graphite
diff --git a/src/gpu/graphite/task/ComputeTask.h b/src/gpu/graphite/task/ComputeTask.h
index 4483426..e241f53 100644
--- a/src/gpu/graphite/task/ComputeTask.h
+++ b/src/gpu/graphite/task/ComputeTask.h
@@ -37,6 +37,11 @@
explicit ComputeTask(DispatchGroupList dispatchGroups);
DispatchGroupList fDispatchGroups;
+
+    // Every element of this list is a task that must execute before the DispatchGroup stored at
+    // the same array index. A child task is allowed to be nullptr to represent a no-op (i.e. the
+    // corresponding DispatchGroup doesn't have any pre-tasks).
+ skia_private::TArray<sk_sp<Task>> fChildTasks;
};
} // namespace skgpu::graphite
diff --git a/src/gpu/graphite/vk/VulkanCommandBuffer.cpp b/src/gpu/graphite/vk/VulkanCommandBuffer.cpp
index 56188e9..1fc7098 100644
--- a/src/gpu/graphite/vk/VulkanCommandBuffer.cpp
+++ b/src/gpu/graphite/vk/VulkanCommandBuffer.cpp
@@ -1296,7 +1296,7 @@
/*stride=*/0));
}
-bool VulkanCommandBuffer::onAddComputePass(const DispatchGroupList&) { return false; }
+bool VulkanCommandBuffer::onAddComputePass(DispatchGroupSpan) { return false; }
bool VulkanCommandBuffer::onCopyBufferToBuffer(const Buffer* srcBuffer,
size_t srcOffset,
diff --git a/src/gpu/graphite/vk/VulkanCommandBuffer.h b/src/gpu/graphite/vk/VulkanCommandBuffer.h
index 941d071..66b3666 100644
--- a/src/gpu/graphite/vk/VulkanCommandBuffer.h
+++ b/src/gpu/graphite/vk/VulkanCommandBuffer.h
@@ -119,7 +119,7 @@
// TODO: The virtuals in this class have not yet been implemented as we still haven't
// implemented the objects they use.
- bool onAddComputePass(const DispatchGroupList&) override;
+ bool onAddComputePass(DispatchGroupSpan) override;
bool onCopyBufferToBuffer(const Buffer* srcBuffer,
size_t srcOffset,
diff --git a/tests/graphite/ComputeTest.cpp b/tests/graphite/ComputeTest.cpp
index 0c06477..a1e94856 100644
--- a/tests/graphite/ComputeTest.cpp
+++ b/tests/graphite/ComputeTest.cpp
@@ -29,6 +29,7 @@
#include "tools/graphite/GraphiteTestContext.h"
using namespace skgpu::graphite;
+using namespace skiatest::graphite;
namespace {
@@ -71,6 +72,22 @@
return xferBuffer;
}
+std::unique_ptr<Recording> submit_recording(Context* context,
+ GraphiteTestContext* testContext,
+ Recorder* recorder) {
+ std::unique_ptr<Recording> recording = recorder->snap();
+ if (!recording) {
+ return nullptr;
+ }
+
+ InsertRecordingInfo insertInfo;
+ insertInfo.fRecording = recording.get();
+ context->insertRecording(insertInfo);
+ testContext->syncedSubmit(context);
+
+ return recording;
+}
+
bool is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType) {
return skiatest::IsDawnContextType(ctxType) || skiatest::IsMetalContextType(ctxType);
}
@@ -1842,6 +1859,238 @@
}
}
+DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrdering,
+ reporter,
+ context,
+ testContext) {
+    // Set up two independent DispatchGroups operating on the same buffer. The first group
+    // writes garbage to the buffer; the second group requests a clear and then copies the
+    // contents to an output buffer. This test validates that the writes, the clear, and the
+    // reads occur in the expected order.
+ constexpr uint32_t kWorkgroupSize = 64;
+
+ // Initialize buffer with non-zero data.
+ class FillWithGarbage : public ComputeStep {
+ public:
+ FillWithGarbage() : ComputeStep(
+ /*name=*/"FillWithGarbage",
+ /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
+ /*resources=*/{
+ {
+ /*type=*/ResourceType::kStorageBuffer,
+ /*flow=*/DataFlow::kShared,
+ /*policy=*/ResourcePolicy::kNone,
+ /*slot=*/0,
+ /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
+ }
+ }) {}
+ ~FillWithGarbage() override = default;
+
+ std::string computeSkSL() const override {
+ return R"(
+ void main() {
+ out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
+ }
+ )";
+ }
+ } garbageStep;
+
+ // Second stage just copies the data to a destination buffer. This is only to verify that this
+ // stage, issued in a separate DispatchGroup, observes the clear.
+ class CopyBuffer : public ComputeStep {
+ public:
+ CopyBuffer() : ComputeStep(
+ /*name=*/"CopyBuffer",
+ /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
+ /*resources=*/{
+ {
+ /*type=*/ResourceType::kStorageBuffer,
+ /*flow=*/DataFlow::kShared,
+ /*policy=*/ResourcePolicy::kNone,
+ /*slot=*/0,
+ /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
+ },
+ {
+ /*type=*/ResourceType::kStorageBuffer,
+ /*flow=*/DataFlow::kShared,
+ /*policy=*/ResourcePolicy::kNone,
+ /*slot=*/1,
+ /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
+ }
+ }) {}
+ ~CopyBuffer() override = default;
+
+ std::string computeSkSL() const override {
+ return R"(
+ void main() {
+ out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
+ }
+ )";
+ }
+ } copyStep;
+
+ std::unique_ptr<Recorder> recorder = context->makeRecorder();
+ DispatchGroup::Builder builder(recorder.get());
+
+ constexpr size_t kElementCount = 4 * kWorkgroupSize;
+ constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
+ auto input = recorder->priv().drawBufferManager()->getStorage(kBufferSize);
+ auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);
+
+ ComputeTask::DispatchGroupList groups;
+
+ // First group.
+ builder.assignSharedBuffer({input, kBufferSize}, 0);
+ builder.appendStep(&garbageStep, {{1, 1, 1}});
+ groups.push_back(builder.finalize());
+
+ // Second group.
+ builder.reset();
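+    // Requesting ClearBuffer::kYes here records the clear in the second group's own list, so
+    // it executes after the first group's dispatches and before this group's.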
+ builder.assignSharedBuffer({input, kBufferSize}, 0, ClearBuffer::kYes);
+ builder.assignSharedBuffer({output, kBufferSize}, 1);
+ builder.appendStep(©Step, {{1, 1, 1}});
+ groups.push_back(builder.finalize());
+
+ recorder->priv().add(ComputeTask::Make(std::move(groups)));
+ // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
+ auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);
+
+ // Submit the work and wait for it to complete.
+ std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
+ if (!recording) {
+ ERRORF(reporter, "Failed to make recording");
+ return;
+ }
+
+ // Verify the contents of the output buffer.
+ uint32_t* outData = static_cast<uint32_t*>(
+ map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
+ SkASSERT(outputBuffer->isMapped() && outData != nullptr);
+ for (unsigned int i = 0; i < kElementCount; ++i) {
+ const uint32_t found = outData[i];
+ REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
+ }
+}
+
+DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrderingScratchBuffers,
+ reporter,
+ context,
+ testContext) {
+    // This test is the same as the ClearOrdering test, except the two stages share a recycled
+    // ScratchBuffer instead of a regular storage buffer. This is primarily to test ScratchBuffer
+    // reuse.
+ constexpr uint32_t kWorkgroupSize = 64;
+
+ // Initialize buffer with non-zero data.
+ class FillWithGarbage : public ComputeStep {
+ public:
+ FillWithGarbage() : ComputeStep(
+ /*name=*/"FillWithGarbage",
+ /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
+ /*resources=*/{
+ {
+ /*type=*/ResourceType::kStorageBuffer,
+ /*flow=*/DataFlow::kShared,
+ /*policy=*/ResourcePolicy::kNone,
+ /*slot=*/0,
+ /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
+ }
+ }) {}
+ ~FillWithGarbage() override = default;
+
+ std::string computeSkSL() const override {
+ return R"(
+ void main() {
+ out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
+ }
+ )";
+ }
+ } garbageStep;
+
+    // Second stage just copies the data to a destination buffer. This is only to verify that this
+    // stage (issued in a separate DispatchGroup) observes the clear.
+ class CopyBuffer : public ComputeStep {
+ public:
+ CopyBuffer() : ComputeStep(
+ /*name=*/"CopyBuffer",
+ /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
+ /*resources=*/{
+ {
+ /*type=*/ResourceType::kStorageBuffer,
+ /*flow=*/DataFlow::kShared,
+ /*policy=*/ResourcePolicy::kNone,
+ /*slot=*/0,
+ /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
+ },
+ {
+ /*type=*/ResourceType::kStorageBuffer,
+ /*flow=*/DataFlow::kShared,
+ /*policy=*/ResourcePolicy::kNone,
+ /*slot=*/1,
+ /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
+ }
+ }) {}
+ ~CopyBuffer() override = default;
+
+ std::string computeSkSL() const override {
+ return R"(
+ void main() {
+ out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
+ }
+ )";
+ }
+ } copyStep;
+
+ std::unique_ptr<Recorder> recorder = context->makeRecorder();
+ DispatchGroup::Builder builder(recorder.get());
+
+ constexpr size_t kElementCount = 4 * kWorkgroupSize;
+ constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
+ auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);
+
+ ComputeTask::DispatchGroupList groups;
+
+ // First group.
+ {
+ auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
+ auto input = scratch.suballocate(kBufferSize);
+ builder.assignSharedBuffer({input, kBufferSize}, 0);
+
+ // `scratch` returns to the scratch buffer pool when it goes out of scope
+ }
+ builder.appendStep(&garbageStep, {{1, 1, 1}});
+ groups.push_back(builder.finalize());
+
+ // Second group.
+ builder.reset();
+ {
+ auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
+ auto input = scratch.suballocate(kBufferSize);
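+        // The recycled scratch allocation may alias the first group's buffer, so the
+        // ClearBuffer::kYes request below is what guarantees zeroed contents.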
+ builder.assignSharedBuffer({input, kBufferSize}, 0, ClearBuffer::kYes);
+ }
+ builder.assignSharedBuffer({output, kBufferSize}, 1);
+ builder.appendStep(©Step, {{1, 1, 1}});
+ groups.push_back(builder.finalize());
+
+ recorder->priv().add(ComputeTask::Make(std::move(groups)));
+ // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
+ auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);
+
+ // Submit the work and wait for it to complete.
+ std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
+ if (!recording) {
+ ERRORF(reporter, "Failed to make recording");
+ return;
+ }
+
+ // Verify the contents of the output buffer.
+ uint32_t* outData = static_cast<uint32_t*>(
+ map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
+ SkASSERT(outputBuffer->isMapped() && outData != nullptr);
+ for (unsigned int i = 0; i < kElementCount; ++i) {
+ const uint32_t found = outData[i];
+ REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
+ }
+}
+
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_IndirectDispatch,
reporter,
context,