Dawn: staging manager rewrite.

Using many small staging buffers is quite costly for performance, for
example when there are many small texture uploads (such as MotionMark's
"canvas bouncing clipped rects" test).

Instead, the new manager uses buffers of 32K minimum, and returns
suballocations of those.

For now, allocation simply iterates through the list of staging
buffers and returns the first one with enough remaining space.
This is O(N) in the number of staging buffers, which should be
ok since there are fewer of them than before. (If it becomes a
hot path, we can optimize it later.)

This also subsumes the UBO-specific staging buffer, since that one
did much the same thing but did not reuse staging buffers.

Change-Id: I4fddee8bb0fc602c49fe552acc327a640bf6917b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/279921
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Stephen White <senorblanco@chromium.org>
diff --git a/gn/gpu.gni b/gn/gpu.gni
index 261be7a..8edaf9b 100644
--- a/gn/gpu.gni
+++ b/gn/gpu.gni
@@ -188,6 +188,8 @@
   "$_src/gpu/GrSimpleMesh.h",
   "$_src/gpu/GrSoftwarePathRenderer.cpp",
   "$_src/gpu/GrSoftwarePathRenderer.h",
+  "$_src/gpu/GrStagingBuffer.cpp",
+  "$_src/gpu/GrStagingBuffer.h",
   "$_src/gpu/GrStencilAttachment.cpp",
   "$_src/gpu/GrStencilAttachment.h",
   "$_src/gpu/GrStencilClip.h",
@@ -770,8 +772,8 @@
   "$_src/gpu/dawn/GrDawnRenderTarget.h",
   "$_src/gpu/dawn/GrDawnRingBuffer.cpp",
   "$_src/gpu/dawn/GrDawnRingBuffer.h",
-  "$_src/gpu/dawn/GrDawnStagingManager.cpp",
-  "$_src/gpu/dawn/GrDawnStagingManager.h",
+  "$_src/gpu/dawn/GrDawnStagingBuffer.cpp",
+  "$_src/gpu/dawn/GrDawnStagingBuffer.h",
   "$_src/gpu/dawn/GrDawnStencilAttachment.cpp",
   "$_src/gpu/dawn/GrDawnStencilAttachment.h",
   "$_src/gpu/dawn/GrDawnTexture.cpp",
diff --git a/src/gpu/GrGpu.cpp b/src/gpu/GrGpu.cpp
index 6b8e47e..43adfba 100644
--- a/src/gpu/GrGpu.cpp
+++ b/src/gpu/GrGpu.cpp
@@ -26,6 +26,7 @@
 #include "src/gpu/GrResourceCache.h"
 #include "src/gpu/GrResourceProvider.h"
 #include "src/gpu/GrSemaphore.h"
+#include "src/gpu/GrStagingBuffer.h"
 #include "src/gpu/GrStencilAttachment.h"
 #include "src/gpu/GrStencilSettings.h"
 #include "src/gpu/GrSurfacePriv.h"
@@ -34,11 +35,15 @@
 #include "src/gpu/GrTracing.h"
 #include "src/utils/SkJSONWriter.h"
 
+static const size_t kMinStagingBufferSize = 32 * 1024;
+
 ////////////////////////////////////////////////////////////////////////////////
 
 GrGpu::GrGpu(GrContext* context) : fResetBits(kAll_GrBackendState), fContext(context) {}
 
-GrGpu::~GrGpu() {}
+GrGpu::~GrGpu() {
+    SkASSERT(fBusyStagingBuffers.isEmpty());
+}
 
 void GrGpu::disconnect(DisconnectType) {}
 
@@ -611,12 +616,44 @@
     return fSamplePatternDictionary.findOrAssignSamplePatternKey(sampleLocations);
 }
 
+#ifdef SK_DEBUG
+bool GrGpu::inStagingBuffers(GrStagingBuffer* b) const {
+    for (const auto& i : fStagingBuffers) {
+        if (b == i.get()) {
+            return true;
+        }
+    }
+    return false;
+}
+
+void GrGpu::validateStagingBuffers() const {
+    for (const auto& i : fStagingBuffers) {
+        GrStagingBuffer* buffer = i.get();
+        SkASSERT(fAvailableStagingBuffers.isInList(buffer) ||
+                 fActiveStagingBuffers.isInList(buffer) ||
+                 fBusyStagingBuffers.isInList(buffer));
+    }
+    for (auto b : fAvailableStagingBuffers) {
+        SkASSERT(this->inStagingBuffers(b));
+    }
+    for (auto b : fActiveStagingBuffers) {
+        SkASSERT(this->inStagingBuffers(b));
+    }
+    for (auto b : fBusyStagingBuffers) {
+        SkASSERT(this->inStagingBuffers(b));
+    }
+}
+#endif
+
 GrSemaphoresSubmitted GrGpu::finishFlush(GrSurfaceProxy* proxies[],
                                          int n,
                                          SkSurface::BackendSurfaceAccess access,
                                          const GrFlushInfo& info,
                                          const GrPrepareForExternalIORequests& externalRequests) {
     TRACE_EVENT0("skia.gpu", TRACE_FUNC);
+#ifdef SK_DEBUG
+    this->validateStagingBuffers();
+#endif
     this->stats()->incNumFinishFlushes();
     GrResourceProvider* resourceProvider = fContext->priv().resourceProvider();
 
@@ -650,6 +687,8 @@
         }
     }
 
+    this->unmapStagingBuffers();
+
     // We always want to try flushing, so do that before checking if we failed semaphore creation.
     if (!this->onFinishFlush(proxies, n, access, info, externalRequests) ||
         failedSemaphoreCreation) {
@@ -666,6 +705,12 @@
         return GrSemaphoresSubmitted::kNo;
     }
 
+    // Move all active staging buffers to the busy list.
+    while (GrStagingBuffer* buffer = fActiveStagingBuffers.head()) {
+        fActiveStagingBuffers.remove(buffer);
+        fBusyStagingBuffers.addToTail(buffer);
+    }
+
     for (int i = 0; i < info.fNumSemaphores; ++i) {
         if (!info.fSignalSemaphores[i].isInitialized()) {
             SkASSERT(semaphoreInfos[i].fSemaphore);
@@ -891,3 +936,54 @@
     return this->onCreateCompressedBackendTexture(dimensions, format, mipMapped,
                                                   isProtected, data);
 }
+
+GrStagingBuffer* GrGpu::findStagingBuffer(size_t size) {
+#ifdef SK_DEBUG
+    this->validateStagingBuffers();
+#endif
+    for (auto b : fActiveStagingBuffers) {
+        if (b->remaining() >= size) {
+            return b;
+        }
+    }
+    for (auto b : fAvailableStagingBuffers) {
+        if (b->remaining() >= size) {
+            fAvailableStagingBuffers.remove(b);
+            fActiveStagingBuffers.addToTail(b);
+            return b;
+        }
+    }
+    size = SkNextPow2(size);
+    size = std::max(size, kMinStagingBufferSize);
+    std::unique_ptr<GrStagingBuffer> b = this->createStagingBuffer(size);
+    GrStagingBuffer* stagingBuffer = b.get();
+    fStagingBuffers.push_back(std::move(b));
+    fActiveStagingBuffers.addToTail(stagingBuffer);
+    return stagingBuffer;
+}
+
+GrStagingBuffer::Slice GrGpu::allocateStagingBufferSlice(size_t size) {
+#ifdef SK_DEBUG
+    this->validateStagingBuffers();
+#endif
+    GrStagingBuffer* stagingBuffer = this->findStagingBuffer(size);
+    return stagingBuffer->allocate(size);
+}
+
+void GrGpu::unmapStagingBuffers() {
+#ifdef SK_DEBUG
+    this->validateStagingBuffers();
+#endif
+    // Unmap all active buffers.
+    for (auto buffer : fActiveStagingBuffers) {
+        buffer->unmap();
+    }
+}
+
+void GrGpu::markStagingBufferAvailable(GrStagingBuffer* buffer) {
+#ifdef SK_DEBUG
+    this->validateStagingBuffers();
+#endif
+    fBusyStagingBuffers.remove(buffer);
+    fAvailableStagingBuffers.addToTail(buffer);
+}
diff --git a/src/gpu/GrGpu.h b/src/gpu/GrGpu.h
index 3c4cc78..4684805 100644
--- a/src/gpu/GrGpu.h
+++ b/src/gpu/GrGpu.h
@@ -12,9 +12,11 @@
 #include "include/core/SkSurface.h"
 #include "include/gpu/GrTypes.h"
 #include "include/private/SkTArray.h"
+#include "src/core/SkTInternalLList.h"
 #include "src/gpu/GrCaps.h"
 #include "src/gpu/GrOpsRenderPass.h"
 #include "src/gpu/GrSamplePatternDictionary.h"
+#include "src/gpu/GrStagingBuffer.h"
 #include "src/gpu/GrSwizzle.h"
 #include "src/gpu/GrTextureProducer.h"
 #include "src/gpu/GrXferProcessor.h"
@@ -678,6 +680,12 @@
     // Called before certain draws in order to guarantee coherent results from dst reads.
     virtual void xferBarrier(GrRenderTarget*, GrXferBarrierType) = 0;
 
+    GrStagingBuffer* findStagingBuffer(size_t size);
+    GrStagingBuffer::Slice allocateStagingBufferSlice(size_t size);
+    virtual std::unique_ptr<GrStagingBuffer> createStagingBuffer(size_t size) { return nullptr; }
+    void unmapStagingBuffers();
+    void markStagingBufferAvailable(GrStagingBuffer* buffer);
+
 protected:
     static bool MipMapsAreCorrect(SkISize dimensions, GrMipMapped, const BackendTextureData*);
     static bool CompressedDataIsCorrect(SkISize dimensions, SkImage::CompressionType,
@@ -687,10 +695,18 @@
     void didWriteToSurface(GrSurface* surface, GrSurfaceOrigin origin, const SkIRect* bounds,
                            uint32_t mipLevels = 1) const;
 
+    typedef SkTInternalLList<GrStagingBuffer> StagingBufferList;
+
     Stats                            fStats;
     std::unique_ptr<GrPathRendering> fPathRendering;
     // Subclass must initialize this in its constructor.
     sk_sp<const GrCaps>              fCaps;
+    std::vector<std::unique_ptr<GrStagingBuffer>> fStagingBuffers;
+
+    StagingBufferList                fAvailableStagingBuffers;
+    StagingBufferList                fActiveStagingBuffers;
+    StagingBufferList                fBusyStagingBuffers;
+
 
 private:
     virtual GrBackendTexture onCreateBackendTexture(SkISize dimensions,
@@ -809,6 +825,10 @@
         this->onResetContext(fResetBits);
         fResetBits = 0;
     }
+#ifdef SK_DEBUG
+    bool inStagingBuffers(GrStagingBuffer* b) const;
+    void validateStagingBuffers() const;
+#endif
 
     uint32_t fResetBits;
     // The context owns us, not vice-versa, so this ptr is not ref'ed by Gpu.
diff --git a/src/gpu/GrStagingBuffer.cpp b/src/gpu/GrStagingBuffer.cpp
new file mode 100644
index 0000000..19c64d5
--- /dev/null
+++ b/src/gpu/GrStagingBuffer.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "src/gpu/GrGpu.h"
+#include "src/gpu/GrStagingBuffer.h"
+
+#include "src/core/SkMathPriv.h"
+
+void GrStagingBuffer::markAvailable(void* data) {
+    fData = data;
+    fOffset = 0;
+    fGpu->markStagingBufferAvailable(this);
+}
+
+void GrStagingBuffer::unmap() {
+    this->onUnmap();
+}
+
+GrStagingBuffer::Slice GrStagingBuffer::allocate(size_t size) {
+    size_t offset = fOffset;
+    fOffset += size;
+    char* data = static_cast<char*>(fData) + offset;
+    return Slice(this, offset, data);
+}
diff --git a/src/gpu/GrStagingBuffer.h b/src/gpu/GrStagingBuffer.h
new file mode 100644
index 0000000..a50b1a4
--- /dev/null
+++ b/src/gpu/GrStagingBuffer.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrStagingBuffer_DEFINED
+#define GrStagingBuffer_DEFINED
+
+#include "src/core/SkTInternalLList.h"
+
+class GrGpu;
+
+class GrStagingBuffer {
+public:
+    GrStagingBuffer(GrGpu* gpu, size_t size, void* data) : fGpu(gpu), fSize(size), fData(data) {}
+    virtual ~GrStagingBuffer() {
+        fGpu = nullptr;
+    }
+    void markAvailable(void* data);
+    struct Slice {
+        Slice(GrStagingBuffer* buffer, int offset, void* data)
+          : fBuffer(buffer), fOffset(offset), fData(data) {}
+        GrStagingBuffer*   fBuffer;
+        int                fOffset;
+        void*              fData;
+    };
+    size_t remaining() const { return fSize - fOffset; }
+    GrGpu* getGpu() const { return fGpu; }
+    void unmap();
+    Slice allocate(size_t size);
+private:
+    virtual void onUnmap() = 0;
+
+    GrGpu*                 fGpu;
+    size_t                 fSize;
+    size_t                 fOffset = 0;
+    void*                  fData;
+
+    SK_DECLARE_INTERNAL_LLIST_INTERFACE(GrStagingBuffer);
+};
+
+#endif
diff --git a/src/gpu/dawn/GrDawnBuffer.cpp b/src/gpu/dawn/GrDawnBuffer.cpp
index 6af4133..eb0a747 100644
--- a/src/gpu/dawn/GrDawnBuffer.cpp
+++ b/src/gpu/dawn/GrDawnBuffer.cpp
@@ -6,6 +6,7 @@
  */
 
 #include "src/gpu/dawn/GrDawnBuffer.h"
+#include "src/gpu/dawn/GrDawnStagingBuffer.h"
 
 #include "src/gpu/dawn/GrDawnGpu.h"
 
@@ -29,8 +30,7 @@
 
 GrDawnBuffer::GrDawnBuffer(GrDawnGpu* gpu, size_t sizeInBytes, GrGpuBufferType type,
                            GrAccessPattern pattern)
-    : INHERITED(gpu, sizeInBytes, type, pattern)
-    , fStagingBuffer(nullptr) {
+    : INHERITED(gpu, sizeInBytes, type, pattern) {
     wgpu::BufferDescriptor bufferDesc;
     bufferDesc.size = sizeInBytes;
     bufferDesc.usage = GrGpuBufferTypeToDawnUsageBit(type) | wgpu::BufferUsage::CopyDst;
@@ -45,18 +45,19 @@
     if (this->wasDestroyed()) {
         return;
     }
-    fStagingBuffer = getDawnGpu()->getStagingBuffer(this->size());
-    fMapPtr = fStagingBuffer->fData;
+    GrStagingBuffer::Slice slice = getGpu()->allocateStagingBufferSlice(this->size());
+    fStagingBuffer = static_cast<GrDawnStagingBuffer*>(slice.fBuffer)->buffer();
+    fStagingOffset = slice.fOffset;
+    fMapPtr = slice.fData;
 }
 
 void GrDawnBuffer::onUnmap() {
     if (this->wasDestroyed()) {
         return;
     }
-    fStagingBuffer->fBuffer.Unmap();
     fMapPtr = nullptr;
     getDawnGpu()->getCopyEncoder()
-        .CopyBufferToBuffer(fStagingBuffer->fBuffer, 0, fBuffer, 0, this->size());
+        .CopyBufferToBuffer(fStagingBuffer, fStagingOffset, fBuffer, 0, this->size());
 }
 
 bool GrDawnBuffer::onUpdateData(const void* src, size_t srcSizeInBytes) {
@@ -64,7 +65,7 @@
         return false;
     }
     this->onMap();
-    memcpy(fStagingBuffer->fData, src, srcSizeInBytes);
+    memcpy(fMapPtr, src, srcSizeInBytes);
     this->onUnmap();
     return true;
 }
diff --git a/src/gpu/dawn/GrDawnBuffer.h b/src/gpu/dawn/GrDawnBuffer.h
index 52a03a8..999dabb 100644
--- a/src/gpu/dawn/GrDawnBuffer.h
+++ b/src/gpu/dawn/GrDawnBuffer.h
@@ -12,7 +12,6 @@
 #include "dawn/webgpu_cpp.h"
 
 class GrDawnGpu;
-struct GrDawnStagingBuffer;
 
 class GrDawnBuffer : public GrGpuBuffer {
 public:
@@ -28,7 +27,8 @@
 
 private:
     wgpu::Buffer fBuffer;
-    GrDawnStagingBuffer* fStagingBuffer;
+    wgpu::Buffer fStagingBuffer;
+    size_t       fStagingOffset;
     typedef GrGpuBuffer INHERITED;
 };
 
diff --git a/src/gpu/dawn/GrDawnGpu.cpp b/src/gpu/dawn/GrDawnGpu.cpp
index b3c05f0..e2686cb 100644
--- a/src/gpu/dawn/GrDawnGpu.cpp
+++ b/src/gpu/dawn/GrDawnGpu.cpp
@@ -22,6 +22,7 @@
 #include "src/gpu/dawn/GrDawnOpsRenderPass.h"
 #include "src/gpu/dawn/GrDawnProgramBuilder.h"
 #include "src/gpu/dawn/GrDawnRenderTarget.h"
+#include "src/gpu/dawn/GrDawnStagingBuffer.h"
 #include "src/gpu/dawn/GrDawnStencilAttachment.h"
 #include "src/gpu/dawn/GrDawnTexture.h"
 #include "src/gpu/dawn/GrDawnUtil.h"
@@ -34,7 +35,7 @@
 #include <unistd.h>
 #endif // !defined(SK_BUILD_FOR_WIN)
 
-const int kMaxRenderPipelineEntries = 1024;
+static const int kMaxRenderPipelineEntries = 1024;
 
 static wgpu::FilterMode to_dawn_filter_mode(GrSamplerState::Filter filter) {
     switch (filter) {
@@ -83,17 +84,28 @@
         , fQueue(device.CreateQueue())
         , fCompiler(new SkSL::Compiler())
         , fUniformRingBuffer(this, wgpu::BufferUsage::Uniform)
-        , fRenderPipelineCache(kMaxRenderPipelineEntries)
-        , fStagingManager(fDevice) {
+        , fRenderPipelineCache(kMaxRenderPipelineEntries) {
     fCaps.reset(new GrDawnCaps(options));
 }
 
 GrDawnGpu::~GrDawnGpu() {
+    while (!fBusyStagingBuffers.isEmpty()) {
+        fDevice.Tick();
+    }
 }
 
 
 void GrDawnGpu::disconnect(DisconnectType type) {
-    SkASSERT(!"unimplemented");
+    if (DisconnectType::kAbandon == type) {
+        fBusyStagingBuffers.reset();
+        fAvailableStagingBuffers.reset();
+        fActiveStagingBuffers.reset();
+    } else {
+        while (!fBusyStagingBuffers.isEmpty()) {
+            fDevice.Tick();
+        }
+    }
+    fStagingBuffers.clear();
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -312,24 +324,21 @@
         size_t origRowBytes = bpp * w;
         size_t rowBytes = GrDawnRoundRowBytes(origRowBytes);
         size_t size = rowBytes * h;
-        GrDawnStagingBuffer* stagingBuffer = this->getStagingBuffer(size);
+        GrStagingBuffer::Slice stagingBuffer = this->allocateStagingBufferSlice(size);
         if (rowBytes == origRowBytes) {
-            memcpy(stagingBuffer->fData, pixels, size);
+            memcpy(stagingBuffer.fData, pixels, size);
         } else {
             const char* src = static_cast<const char*>(pixels);
-            char* dst = static_cast<char*>(stagingBuffer->fData);
+            char* dst = static_cast<char*>(stagingBuffer.fData);
             for (int row = 0; row < h; row++) {
                 memcpy(dst, src, origRowBytes);
                 dst += rowBytes;
                 src += origRowBytes;
             }
         }
-        wgpu::Buffer buffer = stagingBuffer->fBuffer;
-        buffer.Unmap();
-        stagingBuffer->fData = nullptr;
         wgpu::BufferCopyView srcBuffer;
-        srcBuffer.buffer = buffer;
-        srcBuffer.offset = 0;
+        srcBuffer.buffer = static_cast<GrDawnStagingBuffer*>(stagingBuffer.fBuffer)->buffer();
+        srcBuffer.offset = stagingBuffer.fOffset;
         srcBuffer.rowPitch = rowBytes;
         srcBuffer.imageHeight = h;
         wgpu::TextureCopyView dstTexture;
@@ -424,13 +433,12 @@
 #endif
 
 void GrDawnGpu::flush() {
-    fUniformRingBuffer.flush();
     this->flushCopyEncoder();
     if (!fCommandBuffers.empty()) {
         fQueue.Submit(fCommandBuffers.size(), &fCommandBuffers.front());
         fCommandBuffers.clear();
     }
-    fStagingManager.mapBusyList();
+    this->mapStagingBuffers();
     fDevice.Tick();
 }
 
@@ -634,8 +642,13 @@
     return fUniformRingBuffer.allocate(size);
 }
 
-GrDawnStagingBuffer* GrDawnGpu::getStagingBuffer(size_t size) {
-    return fStagingManager.findOrCreateStagingBuffer(size);
+std::unique_ptr<GrStagingBuffer> GrDawnGpu::createStagingBuffer(size_t size) {
+    wgpu::BufferDescriptor desc;
+    desc.usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc;
+    desc.size = size;
+    wgpu::CreateBufferMappedResult result = fDevice.CreateBufferMapped(&desc);
+    auto stagingBuffer = new GrDawnStagingBuffer(this, result.buffer, desc.size, result.data);
+    return std::unique_ptr<GrStagingBuffer>(stagingBuffer);
 }
 
 void GrDawnGpu::appendCommandBuffer(wgpu::CommandBuffer commandBuffer) {
@@ -657,3 +670,10 @@
         fCopyEncoder = nullptr;
     }
 }
+
+void GrDawnGpu::mapStagingBuffers() {
+    // Map all active buffers, so we get a callback when they're done.
+    for (auto buffer : fActiveStagingBuffers) {
+        static_cast<GrDawnStagingBuffer*>(buffer)->mapAsync();
+    }
+}
diff --git a/src/gpu/dawn/GrDawnGpu.h b/src/gpu/dawn/GrDawnGpu.h
index 8afd08a..a8d41db 100644
--- a/src/gpu/dawn/GrDawnGpu.h
+++ b/src/gpu/dawn/GrDawnGpu.h
@@ -14,11 +14,11 @@
 #include "src/core/SkLRUCache.h"
 #include "src/gpu/GrProgramDesc.h"
 #include "src/gpu/dawn/GrDawnRingBuffer.h"
-#include "src/gpu/dawn/GrDawnStagingManager.h"
 
 #include <unordered_map>
 
 class GrDawnOpsRenderPass;
+class GrDawnStagingBuffer;
 class GrPipeline;
 struct GrDawnProgram;
 
@@ -65,6 +65,7 @@
     void testingOnly_flushGpuAndSync() override;
 #endif
     void flush();
+    std::unique_ptr<GrStagingBuffer> createStagingBuffer(size_t size) override;
 
     GrStencilAttachment* createStencilAttachmentForRenderTarget(const GrRenderTarget*,
                                                                 int width,
@@ -103,8 +104,6 @@
     wgpu::Sampler getOrCreateSampler(GrSamplerState samplerState);
 
     GrDawnRingBuffer::Slice allocateUniformRingBufferSlice(int size);
-    GrDawnStagingBuffer* getStagingBuffer(size_t size);
-    GrDawnStagingManager* getStagingManager() { return &fStagingManager; }
     wgpu::CommandEncoder getCopyEncoder();
     void flushCopyEncoder();
     void appendCommandBuffer(wgpu::CommandBuffer commandBuffer);
@@ -176,6 +175,8 @@
     bool onFinishFlush(GrSurfaceProxy*[], int n, SkSurface::BackendSurfaceAccess access,
                        const GrFlushInfo& info, const GrPrepareForExternalIORequests&) override;
 
+    void mapStagingBuffers();
+
     wgpu::Device                                    fDevice;
     wgpu::Queue                                     fQueue;
     std::unique_ptr<SkSL::Compiler>                 fCompiler;
@@ -198,7 +199,6 @@
 
     SkLRUCache<GrProgramDesc, sk_sp<GrDawnProgram>, ProgramDescHash>    fRenderPipelineCache;
     std::unordered_map<GrSamplerState, wgpu::Sampler, SamplerHash> fSamplers;
-    GrDawnStagingManager fStagingManager;
 
     typedef GrGpu INHERITED;
 };
diff --git a/src/gpu/dawn/GrDawnRingBuffer.cpp b/src/gpu/dawn/GrDawnRingBuffer.cpp
index 4c48828..2e7cb83 100644
--- a/src/gpu/dawn/GrDawnRingBuffer.cpp
+++ b/src/gpu/dawn/GrDawnRingBuffer.cpp
@@ -8,6 +8,7 @@
 #include "src/gpu/dawn/GrDawnRingBuffer.h"
 
 #include "src/gpu/dawn/GrDawnGpu.h"
+#include "src/gpu/dawn/GrDawnStagingBuffer.h"
 #include "src/gpu/dawn/GrDawnUtil.h"
 
 namespace {
@@ -23,7 +24,6 @@
 
 GrDawnRingBuffer::Slice GrDawnRingBuffer::allocate(int size) {
     if (!fBuffer || fOffset + size > kDefaultSize) {
-        flush();
         wgpu::BufferDescriptor desc;
         desc.usage = fUsage | wgpu::BufferUsage::CopyDst;
         desc.size = kDefaultSize;
@@ -31,25 +31,12 @@
         fOffset = 0;
     }
 
-    if (!fStagingBuffer) {
-        wgpu::BufferDescriptor desc;
-        desc.usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc;
-        desc.size = kDefaultSize;
-        wgpu::CreateBufferMappedResult result = fGpu->device().CreateBufferMapped(&desc);
-        fStagingBuffer = result.buffer;
-        fData = result.data;
-    }
-    int offset = fOffset;
+    GrStagingBuffer::Slice staging = fGpu->allocateStagingBufferSlice(size);
+    size_t offset = fOffset;
     fOffset += size;
     fOffset = GrDawnRoundRowBytes(fOffset);
-    fGpu->getCopyEncoder().CopyBufferToBuffer(fStagingBuffer, offset, fBuffer, offset, size);
-    return Slice(fBuffer, offset, static_cast<uint8_t*>(fData) + offset);
-}
-
-void GrDawnRingBuffer::flush() {
-    if (fStagingBuffer) {
-        fStagingBuffer.Unmap();
-        fStagingBuffer = nullptr;   // FIXME: reuse staging buffer
-        fData = nullptr;
-    }
+    wgpu::Buffer srcBuffer = static_cast<GrDawnStagingBuffer*>(staging.fBuffer)->buffer();
+    fGpu->getCopyEncoder().CopyBufferToBuffer(srcBuffer, staging.fOffset,
+                                              fBuffer, offset, size);
+    return Slice(fBuffer, offset, staging.fData);
 }
diff --git a/src/gpu/dawn/GrDawnRingBuffer.h b/src/gpu/dawn/GrDawnRingBuffer.h
index bbc5a0a..6ac47dd 100644
--- a/src/gpu/dawn/GrDawnRingBuffer.h
+++ b/src/gpu/dawn/GrDawnRingBuffer.h
@@ -37,14 +37,11 @@
         void*        fData;
     };
     Slice allocate(int size);
-    void flush();
 
 private:
     GrDawnGpu*            fGpu;
     wgpu::BufferUsage     fUsage;
     wgpu::Buffer          fBuffer;
-    wgpu::Buffer          fStagingBuffer;
-    void*                 fData;
     int                   fOffset = 0;
 };
 
diff --git a/src/gpu/dawn/GrDawnStagingBuffer.cpp b/src/gpu/dawn/GrDawnStagingBuffer.cpp
new file mode 100644
index 0000000..25b0844
--- /dev/null
+++ b/src/gpu/dawn/GrDawnStagingBuffer.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "src/gpu/dawn/GrDawnGpu.h"
+#include "src/gpu/dawn/GrDawnStagingBuffer.h"
+
+#include "src/core/SkMathPriv.h"
+
+static void callback(WGPUBufferMapAsyncStatus status, void* data, uint64_t dataLength,
+                     void* userData) {
+    GrDawnStagingBuffer* buffer = static_cast<GrDawnStagingBuffer*>(userData);
+    buffer->markAvailable(data);
+}
+
+GrDawnGpu* GrDawnStagingBuffer::getDawnGpu() const {
+    return static_cast<GrDawnGpu*>(this->getGpu());
+}
+
+void GrDawnStagingBuffer::mapAsync() {
+    fBuffer.MapWriteAsync(callback, this);
+}
+
+void GrDawnStagingBuffer::onUnmap() {
+    fBuffer.Unmap();
+}
diff --git a/src/gpu/dawn/GrDawnStagingBuffer.h b/src/gpu/dawn/GrDawnStagingBuffer.h
new file mode 100644
index 0000000..82f53b1
--- /dev/null
+++ b/src/gpu/dawn/GrDawnStagingBuffer.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrDawnStagingBuffer_DEFINED
+#define GrDawnStagingBuffer_DEFINED
+
+#include "dawn/webgpu_cpp.h"
+#include "src/gpu/GrStagingBuffer.h"
+
+class GrDawnStagingBuffer : public GrStagingBuffer {
+public:
+    GrDawnStagingBuffer(GrGpu* gpu, wgpu::Buffer buffer, size_t size, void* data)
+        : INHERITED(gpu, size, data), fBuffer(buffer) {}
+    ~GrDawnStagingBuffer() override {}
+    void           mapAsync();
+    wgpu::Buffer   buffer() const { return fBuffer; }
+    GrDawnGpu*     getDawnGpu() const;
+
+private:
+    void           onUnmap() override;
+
+    wgpu::Buffer   fBuffer;
+    typedef GrStagingBuffer INHERITED;
+};
+
+#endif
diff --git a/src/gpu/dawn/GrDawnStagingManager.cpp b/src/gpu/dawn/GrDawnStagingManager.cpp
deleted file mode 100644
index e655722..0000000
--- a/src/gpu/dawn/GrDawnStagingManager.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2019 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "src/gpu/dawn/GrDawnStagingManager.h"
-
-#include "src/core/SkMathPriv.h"
-
-GrDawnStagingManager::GrDawnStagingManager(wgpu::Device device) : fDevice(device) {
-}
-
-GrDawnStagingManager::~GrDawnStagingManager() {
-    // Clean up any pending callbacks before destroying the StagingBuffers.
-    while (fWaitingCount > 0) {
-        fDevice.Tick();
-    }
-}
-
-GrDawnStagingBuffer* GrDawnStagingManager::findOrCreateStagingBuffer(size_t size) {
-    size_t sizePow2 = GrNextPow2(size);
-    GrDawnStagingBuffer* stagingBuffer;
-    auto i = fReadyPool.find(sizePow2);
-    if (i != fReadyPool.end()) {
-        stagingBuffer = i->second;
-        fReadyPool.erase(i);
-    } else {
-        wgpu::BufferDescriptor desc;
-        desc.usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc;
-        desc.size = sizePow2;
-        wgpu::CreateBufferMappedResult result = fDevice.CreateBufferMapped(&desc);
-        std::unique_ptr<GrDawnStagingBuffer> b(new GrDawnStagingBuffer(
-            this, result.buffer, sizePow2, result.data));
-        stagingBuffer = b.get();
-        fBuffers.push_back(std::move(b));
-    }
-    fBusyList.push_back(stagingBuffer);
-    return stagingBuffer;
-}
-
-static void callback(WGPUBufferMapAsyncStatus status, void* data, uint64_t dataLength,
-                     void* userData) {
-    GrDawnStagingBuffer* buffer = static_cast<GrDawnStagingBuffer*>(userData);
-    buffer->fData = data;
-    if (buffer->fManager) {
-        buffer->fManager->addToReadyPool(buffer);
-    }
-}
-
-void GrDawnStagingManager::mapBusyList() {
-    // Map all buffers on the busy list for writing. When they're no longer in flight on the GPU,
-    // their callback will be called and they'll be moved to the ready pool.
-    for (GrDawnStagingBuffer* buffer : fBusyList) {
-        buffer->fBuffer.MapWriteAsync(callback, buffer);
-        fWaitingCount++;
-    }
-    fBusyList.clear();
-}
-
-void GrDawnStagingManager::addToReadyPool(GrDawnStagingBuffer* buffer) {
-    fWaitingCount--;
-    fReadyPool.insert(std::pair<size_t, GrDawnStagingBuffer*>(buffer->fSize, buffer));
-}
diff --git a/src/gpu/dawn/GrDawnStagingManager.h b/src/gpu/dawn/GrDawnStagingManager.h
deleted file mode 100644
index 1bcc339..0000000
--- a/src/gpu/dawn/GrDawnStagingManager.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright 2019 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef GrDawnStagingManager_DEFINED
-#define GrDawnStagingManager_DEFINED
-
-#include <map>
-#include <memory>
-#include <vector>
-
-#include "dawn/webgpu_cpp.h"
-
-struct GrDawnStagingBuffer;
-
-class GrDawnStagingManager {
-public:
-    GrDawnStagingManager(wgpu::Device device);
-   ~GrDawnStagingManager();
-    GrDawnStagingBuffer* findOrCreateStagingBuffer(size_t size);
-
-    void addToReadyPool(GrDawnStagingBuffer* buffer);
-    void mapBusyList();
-
-private:
-    wgpu::Device                                       fDevice;
-    std::vector<std::unique_ptr<GrDawnStagingBuffer>>  fBuffers;
-    std::multimap<size_t, GrDawnStagingBuffer*>        fReadyPool;
-    std::vector<GrDawnStagingBuffer*>                  fBusyList;
-    int                                                fWaitingCount = 0;
-};
-
-struct GrDawnStagingBuffer {
-    GrDawnStagingBuffer(GrDawnStagingManager* manager, wgpu::Buffer buffer, size_t size,
-                       void* data)
-        : fManager(manager), fBuffer(buffer), fSize(size), fData(data) {}
-    ~GrDawnStagingBuffer() {
-        fManager = nullptr;
-    }
-    GrDawnStagingManager*  fManager;
-    wgpu::Buffer           fBuffer;
-    size_t                 fSize;
-    void*                  fData;
-};
-
-#endif
diff --git a/src/gpu/dawn/GrDawnTexture.cpp b/src/gpu/dawn/GrDawnTexture.cpp
index 81f898e..5acc963 100644
--- a/src/gpu/dawn/GrDawnTexture.cpp
+++ b/src/gpu/dawn/GrDawnTexture.cpp
@@ -9,6 +9,7 @@
 
 #include "src/core/SkConvertPixels.h"
 #include "src/gpu/dawn/GrDawnGpu.h"
+#include "src/gpu/dawn/GrDawnStagingBuffer.h"
 #include "src/gpu/dawn/GrDawnTextureRenderTarget.h"
 #include "src/gpu/dawn/GrDawnUtil.h"
 
@@ -151,15 +152,12 @@
         size_t trimRowBytes = width * SkColorTypeBytesPerPixel(colorType);
         size_t dstRowBytes = GrDawnRoundRowBytes(trimRowBytes);
         size_t size = dstRowBytes * height;
-        GrDawnStagingBuffer* stagingBuffer = getDawnGpu()->getStagingBuffer(size);
-        SkRectMemcpy(stagingBuffer->fData, dstRowBytes, src, srcRowBytes, trimRowBytes, height);
-        wgpu::Buffer buffer = stagingBuffer->fBuffer;
-        buffer.Unmap();
-        stagingBuffer->fData = nullptr;
+        GrStagingBuffer::Slice slice = getDawnGpu()->allocateStagingBufferSlice(size);
+        SkRectMemcpy(slice.fData, dstRowBytes, src, srcRowBytes, trimRowBytes, height);
 
         wgpu::BufferCopyView srcBuffer;
-        srcBuffer.buffer = buffer;
-        srcBuffer.offset = 0;
+        srcBuffer.buffer = static_cast<GrDawnStagingBuffer*>(slice.fBuffer)->buffer();
+        srcBuffer.offset = slice.fOffset;
         srcBuffer.rowPitch = dstRowBytes;
         srcBuffer.imageHeight = height;