/*
* Copyright 2023 Google LLC
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "src/gpu/graphite/compute/VelloRenderer.h"
#include "include/core/SkPath.h"
#include "include/core/SkTypes.h"
#include "include/gpu/graphite/Recorder.h"
#include "src/core/SkGeometry.h"
#include "src/core/SkPathPriv.h"
#include "src/core/SkTraceEvent.h"
#include "src/gpu/graphite/BufferManager.h"
#include "src/gpu/graphite/Caps.h"
#include "src/gpu/graphite/DrawParams.h"
#include "src/gpu/graphite/Log.h"
#include "src/gpu/graphite/PipelineData.h"
#include "src/gpu/graphite/RecorderPriv.h"
#include "src/gpu/graphite/TextureProxy.h"
#include "src/gpu/graphite/TextureUtils.h"
#include "src/gpu/graphite/UniformManager.h"
#include "src/gpu/graphite/compute/DispatchGroup.h"
#include <algorithm>
namespace skgpu::graphite {
namespace {
BufferView new_scratch_slice(ScratchBuffer& scratch) {
size_t size = scratch.size(); // Use the whole buffer.
BindBufferInfo info = scratch.suballocate(size);
return {info, info ? size : 0};
}
BufferView new_indirect_slice(DrawBufferManager* mgr, size_t size) {
BindBufferInfo info = mgr->getIndirectStorage(size, ClearBuffer::kYes);
return {info, info ? size : 0};
}
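// Wraps a raw pointer and byte count as a rust::Slice so buffer contents can be handed across
// the cxx FFI boundary to the vello_cpp encoding routines.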
::rust::Slice<uint8_t> to_slice(void* ptr, size_t size) {
return {static_cast<uint8_t*>(ptr), size};
}
vello_cpp::Affine to_vello_affine(const SkMatrix& m) {
    // Vello currently doesn't support perspective transforms; the encoding only accepts a 2x3
    // affine transform matrix.
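    // SkMatrix stores {scaleX, skewX, transX, skewY, scaleY, transY, ...} in row-major order,
    // while vello packs the same six coefficients column-major, hence the reordering below.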
return {{m.get(0), m.get(3), m.get(1), m.get(4), m.get(2), m.get(5)}};
}
vello_cpp::Point to_vello_point(const SkPoint& p) { return {p.x(), p.y()}; }
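// vello_cpp::Color carries 8-bit channels, so the float color is quantized through SkColor first.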
vello_cpp::Color to_vello_color(const SkColor4f& color) {
SkColor c = color.toSkColor();
return {
static_cast<uint8_t>(SkColorGetR(c)),
static_cast<uint8_t>(SkColorGetG(c)),
static_cast<uint8_t>(SkColorGetB(c)),
static_cast<uint8_t>(SkColorGetA(c)),
};
}
WorkgroupSize to_wg_size(const vello_cpp::WorkgroupSize& src) {
return WorkgroupSize(src.x, src.y, src.z);
}
vello_cpp::Fill to_fill_type(SkPathFillType fillType) {
// Vello does not provide an encoding for inverse fill types. When Skia uses vello to render
// a coverage mask for an inverse fill, it encodes a regular fill and inverts the coverage value
// after sampling the mask.
switch (fillType) {
case SkPathFillType::kWinding:
case SkPathFillType::kInverseWinding:
return vello_cpp::Fill::NonZero;
case SkPathFillType::kEvenOdd:
case SkPathFillType::kInverseEvenOdd:
return vello_cpp::Fill::EvenOdd;
}
return vello_cpp::Fill::NonZero;
}
vello_cpp::CapStyle to_cap_style(SkPaint::Cap cap) {
switch (cap) {
case SkPaint::Cap::kButt_Cap:
return vello_cpp::CapStyle::Butt;
case SkPaint::Cap::kRound_Cap:
return vello_cpp::CapStyle::Round;
case SkPaint::Cap::kSquare_Cap:
return vello_cpp::CapStyle::Square;
}
SkUNREACHABLE;
}
vello_cpp::JoinStyle to_join_style(SkPaint::Join join) {
switch (join) {
case SkPaint::Join::kMiter_Join:
return vello_cpp::JoinStyle::Miter;
case SkPaint::Join::kBevel_Join:
return vello_cpp::JoinStyle::Bevel;
case SkPaint::Join::kRound_Join:
return vello_cpp::JoinStyle::Round;
}
SkUNREACHABLE;
}
vello_cpp::Stroke to_stroke(const SkStrokeRec& style) {
return vello_cpp::Stroke{
/*width=*/style.getWidth(),
/*miter_limit=*/style.getMiter(),
            /*cap=*/to_cap_style(style.getCap()),
            /*join=*/to_join_style(style.getJoin()),
};
}
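// Adapts an SkPath to the vello_cpp::PathIterator interface. Conic verbs, which the vello
// encoding does not accept, are approximated with quadratic Béziers as they are encountered.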
class PathIter : public vello_cpp::PathIterator {
public:
PathIter(const SkPath& path, const Transform& t)
: fIterate(path), fIter(fIterate.begin()), fTransform(t) {}
bool next_element(vello_cpp::PathElement* outElem) override {
if (fConicQuadIdx < fConicConverter.countQuads()) {
SkASSERT(fConicQuads != nullptr);
outElem->verb = vello_cpp::PathVerb::QuadTo;
int pointIdx = fConicQuadIdx * 2;
outElem->points[0] = to_vello_point(fConicQuads[pointIdx]);
outElem->points[1] = to_vello_point(fConicQuads[pointIdx + 1]);
outElem->points[2] = to_vello_point(fConicQuads[pointIdx + 2]);
fConicQuadIdx++;
return true;
}
if (fIter == fIterate.end()) {
return false;
}
SkASSERT(outElem);
auto [verb, points, weights] = *fIter;
fIter++;
switch (verb) {
case SkPathVerb::kMove:
outElem->verb = vello_cpp::PathVerb::MoveTo;
outElem->points[0] = to_vello_point(points[0]);
break;
case SkPathVerb::kLine:
outElem->verb = vello_cpp::PathVerb::LineTo;
outElem->points[0] = to_vello_point(points[0]);
outElem->points[1] = to_vello_point(points[1]);
break;
case SkPathVerb::kConic:
// The vello encoding API doesn't handle conic sections. Approximate it with
// quadratic Béziers.
                // No other conic->quad conversion should be in progress.
                SkASSERT(fConicQuadIdx >= fConicConverter.countQuads());
fConicQuads = fConicConverter.computeQuads(
points, *weights, 0.25 / fTransform.maxScaleFactor());
outElem->verb = vello_cpp::PathVerb::QuadTo;
outElem->points[0] = to_vello_point(fConicQuads[0]);
outElem->points[1] = to_vello_point(fConicQuads[1]);
outElem->points[2] = to_vello_point(fConicQuads[2]);
// The next call to `next_element` will yield the next quad in the list (at index 1)
// if `fConicConverter` contains more than 1 quad.
fConicQuadIdx = 1;
break;
case SkPathVerb::kQuad:
outElem->verb = vello_cpp::PathVerb::QuadTo;
outElem->points[0] = to_vello_point(points[0]);
outElem->points[1] = to_vello_point(points[1]);
outElem->points[2] = to_vello_point(points[2]);
break;
case SkPathVerb::kCubic:
outElem->verb = vello_cpp::PathVerb::CurveTo;
outElem->points[0] = to_vello_point(points[0]);
outElem->points[1] = to_vello_point(points[1]);
outElem->points[2] = to_vello_point(points[2]);
outElem->points[3] = to_vello_point(points[3]);
break;
case SkPathVerb::kClose:
outElem->verb = vello_cpp::PathVerb::Close;
break;
}
return true;
}
private:
SkPathPriv::Iterate fIterate;
SkPathPriv::RangeIter fIter;
// Variables used to track conic to quadratic spline conversion. `fTransform` is used to
// determine the subpixel error tolerance in device coordinate space.
const Transform& fTransform;
SkAutoConicToQuads fConicConverter;
const SkPoint* fConicQuads = nullptr;
int fConicQuadIdx = 0;
};
} // namespace
VelloScene::VelloScene() : fEncoding(vello_cpp::new_encoding()) {}
void VelloScene::reset() {
fEncoding->reset();
}
void VelloScene::solidFill(const SkPath& shape,
const SkColor4f& fillColor,
const SkPathFillType fillType,
const Transform& t) {
PathIter iter(shape, t);
fEncoding->fill(to_fill_type(fillType),
to_vello_affine(t),
{vello_cpp::BrushKind::Solid, {to_vello_color(fillColor)}},
iter);
}
void VelloScene::solidStroke(const SkPath& shape,
const SkColor4f& fillColor,
const SkStrokeRec& style,
const Transform& t) {
// TODO: Obtain dashing pattern here and let Vello handle dashing on the CPU while
// encoding the path?
PathIter iter(shape, t);
vello_cpp::Brush brush{vello_cpp::BrushKind::Solid, {to_vello_color(fillColor)}};
fEncoding->stroke(to_stroke(style), to_vello_affine(t), brush, iter);
}
void VelloScene::pushClipLayer(const SkPath& shape, const Transform& t) {
PathIter iter(shape, t);
fEncoding->begin_clip(to_vello_affine(t), iter);
SkDEBUGCODE(fLayers++;)
}
void VelloScene::popClipLayer() {
SkASSERT(fLayers > 0);
fEncoding->end_clip();
SkDEBUGCODE(fLayers--;)
}
VelloRenderer::VelloRenderer(const Caps* caps)
// We currently use the fine stage to rasterize a coverage mask. For full compositing, we
// should instantiate a second variant of fine with a different color format.
//
        // Currently fine has only one color-format variant: on Metal it can operate on any
        // floating point format (so we set it to R8), but on Dawn it must use RGBA8unorm.
: fFineArea(ComputeShaderCoverageMaskTargetFormat(caps))
, fFineMsaa16(ComputeShaderCoverageMaskTargetFormat(caps)) {
fGradientImage = TextureProxy::Make(caps,
{1, 1},
kRGBA_8888_SkColorType,
skgpu::Mipmapped::kNo,
skgpu::Protected::kNo,
skgpu::Renderable::kNo,
skgpu::Budgeted::kYes);
fImageAtlas = TextureProxy::Make(caps,
{1, 1},
kRGBA_8888_SkColorType,
skgpu::Mipmapped::kNo,
skgpu::Protected::kNo,
skgpu::Renderable::kNo,
skgpu::Budgeted::kYes);
}
VelloRenderer::~VelloRenderer() = default;
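// Rough usage sketch (names here are illustrative, not actual call sites): encode paths into a
// VelloScene, then record the compute work against a coverage-mask target:
//
//   VelloScene scene;
//   scene.solidFill(path, SkColors::kWhite, SkPathFillType::kWinding, localToDevice);
//   auto group = renderer->renderScene(params, scene, std::move(maskTexture), recorder);
//
// The returned DispatchGroup is then scheduled as compute work on the Recording.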
std::unique_ptr<DispatchGroup> VelloRenderer::renderScene(const RenderParams& params,
const VelloScene& scene,
sk_sp<TextureProxy> target,
Recorder* recorder) const {
TRACE_EVENT0("skia.gpu", TRACE_FUNC);
SkASSERT(target);
if (scene.fEncoding->is_empty()) {
return nullptr;
}
if (params.fWidth == 0 || params.fHeight == 0) {
return nullptr;
}
// TODO: validate that the pixel format is kRGBA_8888_SkColorType.
// Clamp the draw region to the target texture dimensions.
const SkISize dims = target->dimensions();
if (dims.isEmpty() || dims.fWidth < 0 || dims.fHeight < 0) {
SKGPU_LOG_W("VelloRenderer: cannot render to an empty target");
return nullptr;
}
SkASSERT(scene.fLayers == 0); // Begin/end clips must be matched.
auto config = scene.fEncoding->prepare_render(
std::min(params.fWidth, static_cast<uint32_t>(dims.fWidth)),
std::min(params.fHeight, static_cast<uint32_t>(dims.fHeight)),
to_vello_color(params.fBaseColor));
auto dispatchInfo = config->workgroup_counts();
auto bufferSizes = config->buffer_sizes();
DispatchGroup::Builder builder(recorder);
// In total there are 25 resources that are used across the full pipeline stages. The sizes of
// these resources depend on the encoded scene. We allocate all of them and assign them
// directly to the builder here instead of delegating the logic to the ComputeSteps.
DrawBufferManager* bufMgr = recorder->priv().drawBufferManager();
size_t uboSize = config->config_uniform_buffer_size();
auto [uboPtr, configBuf] = bufMgr->getUniformPointer(uboSize);
if (!config->write_config_uniform_buffer(to_slice(uboPtr, uboSize))) {
return nullptr;
}
size_t sceneSize = config->scene_buffer_size();
auto [scenePtr, sceneBuf] = bufMgr->getStoragePointer(sceneSize);
if (!config->write_scene_buffer(to_slice(scenePtr, sceneSize))) {
return nullptr;
}
// TODO(b/285189802): The default sizes for the bump buffers (~97MB) exceed Graphite's resource
    // budget if multiple passes are necessary per frame (250MB, see ResourceCache.h). We apply a
// crude size reduction here which seems to be enough for a 4k x 4k atlas render for the GMs
// that we have tested. The numbers below are able to render GM_longpathdash with CPU-side
// stroke expansion.
//
// We need to come up with a better approach to accurately predict the sizes for these buffers
// based on the scene encoding and our resource budget. It should be possible to build a
// conservative estimate using the total number of path verbs, some heuristic based on the verb
// and the path's transform, and the total number of tiles.
//
// The following numbers amount to ~48MB
const size_t lines_size = bufferSizes.lines;
const size_t bin_data_size = bufferSizes.bin_data;
const size_t tiles_size = bufferSizes.tiles;
const size_t segments_size = bufferSizes.segments;
const size_t seg_counts_size = bufferSizes.seg_counts;
const size_t ptcl_size = bufferSizes.ptcl;
// See the comments in VelloComputeSteps.h for an explanation of the logic here.
builder.assignSharedBuffer({configBuf, uboSize}, kVelloSlot_ConfigUniform);
builder.assignSharedBuffer({sceneBuf, sceneSize}, kVelloSlot_Scene);
// Buffers get cleared ahead of the entire DispatchGroup. Allocate the bump buffer early to
// avoid a potentially recycled (and prematurely cleared) scratch buffer.
ScratchBuffer bump = bufMgr->getScratchStorage(bufferSizes.bump_alloc);
builder.assignSharedBuffer(new_scratch_slice(bump), kVelloSlot_BumpAlloc, ClearBuffer::kYes);
// path_reduce
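    // The path_reduce/path_scan pair computes a prefix sum ("tag monoids") over the encoded
    // path tag stream, which later stages use to index into the scene's per-path data.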
ScratchBuffer tagmonoids = bufMgr->getScratchStorage(bufferSizes.path_monoids);
{
// This can be immediately returned after input processing.
ScratchBuffer pathtagReduceOutput = bufMgr->getScratchStorage(bufferSizes.path_reduced);
builder.assignSharedBuffer(new_scratch_slice(pathtagReduceOutput),
kVelloSlot_PathtagReduceOutput);
builder.assignSharedBuffer(new_scratch_slice(tagmonoids), kVelloSlot_TagMonoid);
builder.appendStep(&fPathtagReduce, to_wg_size(dispatchInfo.path_reduce));
// If the input is too large to be fully processed by a single workgroup then a second
// reduce step and two scan steps are necessary. Otherwise one reduce+scan pair is
// sufficient.
//
// In either case, the result is `tagmonoids`.
if (dispatchInfo.use_large_path_scan) {
ScratchBuffer reduced2 = bufMgr->getScratchStorage(bufferSizes.path_reduced2);
ScratchBuffer reducedScan = bufMgr->getScratchStorage(bufferSizes.path_reduced_scan);
builder.assignSharedBuffer(new_scratch_slice(reduced2),
kVelloSlot_LargePathtagReduceSecondPassOutput);
builder.assignSharedBuffer(new_scratch_slice(reducedScan),
kVelloSlot_LargePathtagScanFirstPassOutput);
builder.appendStep(&fPathtagReduce2, to_wg_size(dispatchInfo.path_reduce2));
builder.appendStep(&fPathtagScan1, to_wg_size(dispatchInfo.path_scan1));
builder.appendStep(&fPathtagScanLarge, to_wg_size(dispatchInfo.path_scan));
} else {
builder.appendStep(&fPathtagScanSmall, to_wg_size(dispatchInfo.path_scan));
}
}
// bbox_clear
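    // Resets the per-path bounding boxes that flatten accumulates into.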
ScratchBuffer pathBboxes = bufMgr->getScratchStorage(bufferSizes.path_bboxes);
builder.assignSharedBuffer(new_scratch_slice(pathBboxes), kVelloSlot_PathBBoxes);
builder.appendStep(&fBboxClear, to_wg_size(dispatchInfo.bbox_clear));
// flatten
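    // Lowers the encoded curves into line segments written to the `lines` buffer.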
ScratchBuffer lines = bufMgr->getScratchStorage(lines_size);
builder.assignSharedBuffer(new_scratch_slice(lines), kVelloSlot_Lines);
builder.appendStep(&fFlatten, to_wg_size(dispatchInfo.flatten));
tagmonoids.returnToPool();
// draw_reduce
ScratchBuffer drawReduced = bufMgr->getScratchStorage(bufferSizes.draw_reduced);
builder.assignSharedBuffer(new_scratch_slice(drawReduced), kVelloSlot_DrawReduceOutput);
builder.appendStep(&fDrawReduce, to_wg_size(dispatchInfo.draw_reduce));
// draw_leaf
ScratchBuffer drawMonoids = bufMgr->getScratchStorage(bufferSizes.draw_monoids);
ScratchBuffer binData = bufMgr->getScratchStorage(bin_data_size);
// A clip input buffer must still get bound even if the encoding doesn't contain any clips
ScratchBuffer clipInput = bufMgr->getScratchStorage(bufferSizes.clip_inps);
builder.assignSharedBuffer(new_scratch_slice(drawMonoids), kVelloSlot_DrawMonoid);
builder.assignSharedBuffer(new_scratch_slice(binData), kVelloSlot_InfoBinData);
builder.assignSharedBuffer(new_scratch_slice(clipInput), kVelloSlot_ClipInput);
builder.appendStep(&fDrawLeaf, to_wg_size(dispatchInfo.draw_leaf));
drawReduced.returnToPool();
// clip_reduce, clip_leaf
// The clip bbox buffer is always an input to the binning stage, even when the encoding doesn't
// contain any clips
ScratchBuffer clipBboxes = bufMgr->getScratchStorage(bufferSizes.clip_bboxes);
builder.assignSharedBuffer(new_scratch_slice(clipBboxes), kVelloSlot_ClipBBoxes);
WorkgroupSize clipReduceWgCount = to_wg_size(dispatchInfo.clip_reduce);
WorkgroupSize clipLeafWgCount = to_wg_size(dispatchInfo.clip_leaf);
bool doClipReduce = clipReduceWgCount.scalarSize() > 0u;
bool doClipLeaf = clipLeafWgCount.scalarSize() > 0u;
if (doClipReduce || doClipLeaf) {
ScratchBuffer clipBic = bufMgr->getScratchStorage(bufferSizes.clip_bics);
ScratchBuffer clipEls = bufMgr->getScratchStorage(bufferSizes.clip_els);
builder.assignSharedBuffer(new_scratch_slice(clipBic), kVelloSlot_ClipBicyclic);
builder.assignSharedBuffer(new_scratch_slice(clipEls), kVelloSlot_ClipElement);
if (doClipReduce) {
builder.appendStep(&fClipReduce, clipReduceWgCount);
}
if (doClipLeaf) {
builder.appendStep(&fClipLeaf, clipLeafWgCount);
}
}
clipInput.returnToPool();
// binning
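    // Buckets draw object bounding boxes into coarse screen-space bins.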
ScratchBuffer drawBboxes = bufMgr->getScratchStorage(bufferSizes.draw_bboxes);
ScratchBuffer binHeaders = bufMgr->getScratchStorage(bufferSizes.bin_headers);
builder.assignSharedBuffer(new_scratch_slice(drawBboxes), kVelloSlot_DrawBBoxes);
builder.assignSharedBuffer(new_scratch_slice(binHeaders), kVelloSlot_BinHeader);
builder.appendStep(&fBinning, to_wg_size(dispatchInfo.binning));
pathBboxes.returnToPool();
clipBboxes.returnToPool();
// tile_alloc
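    // Allocates each path's range of tiles in the `tiles` buffer.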
ScratchBuffer paths = bufMgr->getScratchStorage(bufferSizes.paths);
ScratchBuffer tiles = bufMgr->getScratchStorage(tiles_size);
builder.assignSharedBuffer(new_scratch_slice(paths), kVelloSlot_Path);
builder.assignSharedBuffer(new_scratch_slice(tiles), kVelloSlot_Tile);
builder.appendStep(&fTileAlloc, to_wg_size(dispatchInfo.tile_alloc));
drawBboxes.returnToPool();
// path_count_setup
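    // Writes the indirect dispatch arguments for the line-based stages below, since the number
    // of flattened lines is only known on the GPU.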
auto indirectCountBuffer = new_indirect_slice(bufMgr, bufferSizes.indirect_count);
builder.assignSharedBuffer(indirectCountBuffer, kVelloSlot_IndirectCount);
builder.appendStep(&fPathCountSetup, to_wg_size(dispatchInfo.path_count_setup));
// Rasterization stage scratch buffers.
ScratchBuffer seg_counts = bufMgr->getScratchStorage(seg_counts_size);
ScratchBuffer segments = bufMgr->getScratchStorage(segments_size);
ScratchBuffer ptcl = bufMgr->getScratchStorage(ptcl_size);
// path_count
builder.assignSharedBuffer(new_scratch_slice(seg_counts), kVelloSlot_SegmentCounts);
builder.appendStepIndirect(&fPathCount, indirectCountBuffer);
// backdrop
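    // Resolves each tile's initial winding number (its backdrop) ahead of coarse rasterization.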
builder.appendStep(&fBackdrop, to_wg_size(dispatchInfo.backdrop));
// coarse
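    // Emits the per-tile command list (PTCL) that the fine stage consumes.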
builder.assignSharedBuffer(new_scratch_slice(ptcl), kVelloSlot_PTCL);
builder.appendStep(&fCoarse, to_wg_size(dispatchInfo.coarse));
// path_tiling_setup
builder.appendStep(&fPathTilingSetup, to_wg_size(dispatchInfo.path_tiling_setup));
// path_tiling
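    // Writes out the per-tile line segments referenced by the PTCL.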
builder.assignSharedBuffer(new_scratch_slice(segments), kVelloSlot_Segments);
builder.appendStepIndirect(&fPathTiling, indirectCountBuffer);
// fine
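    // Per-pixel rasterization of the PTCL. As noted in the constructor, this currently produces
    // a coverage mask rather than a fully composited color output.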
builder.assignSharedTexture(fImageAtlas, kVelloSlot_ImageAtlas);
builder.assignSharedTexture(fGradientImage, kVelloSlot_GradientImage);
builder.assignSharedTexture(std::move(target), kVelloSlot_OutputImage);
const ComputeStep* fineVariant = params.fAaConfig == VelloAaConfig::kMSAA16
? static_cast<const ComputeStep*>(&fFineMsaa16)
: static_cast<const ComputeStep*>(&fFineArea);
builder.appendStep(fineVariant, to_wg_size(dispatchInfo.fine));
return builder.finalize();
}
} // namespace skgpu::graphite