| /* |
| * Copyright 2023 Google LLC |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #ifndef skgpu_graphite_compute_VelloComputeSteps_DEFINED |
| #define skgpu_graphite_compute_VelloComputeSteps_DEFINED |
| |
| #include "include/core/SkSpan.h" |
| #include "include/private/base/SkTArray.h" |
| #include "src/gpu/graphite/ComputeTypes.h" |
| #include "src/gpu/graphite/compute/ComputeStep.h" |
| #include "third_party/vello/cpp/vello.h" |
| |
| #include <string_view> |
| |
| namespace skgpu::graphite { |
| |
| // This file defines ComputeSteps for all Vello compute stages and their permutations. The |
| // declaration of each ComputeStep subclass mirrors the name of the pipeline stage as defined in the |
| // shader metadata. |
| // |
| // The compute stages all operate over a shared set of buffer and image resources. The |
| // `kVelloSlot_*` constant definitions below each uniquely identify a shared resource that must be |
| // instantiated when assembling the ComputeSteps into a DispatchGroup. |
| // |
| // === Monoids and Prefix Sums === |
| // |
| // Vello's GPU algorithms make repeated use of parallel prefix sums techniques. These occur |
// frequently in path rasterization (e.g. winding number accumulation across a scanline can be
| // thought of as per-pixel prefix sums) but Vello also uses them to calculate buffer offsets for |
| // associated entries across its variable length encoding streams. |
| // |
| // For instance, given a scene that contains Bézier paths, each path gets encoded as a transform, |
| // a sequence of path tags (verbs), and zero or more 2-D points associated with each |
| // tag. N paths will often map to N transforms, N + M tags, and N + M + L points (where N > 0, M > |
| // 0, L >= 0). These entries are stored in separate parallel transform, path tag, and path data |
| // streams. The correspondence between entries of these independent streams is implicit. To keep |
// CPU encoding of these streams fast, the offsets into each buffer for a given "path object" are
| // computed dynamically and in parallel on the GPU. Since the offsets for each object build |
| // additively on offsets that appear before it in the stream, parallel computation of |
| // offsets can be treated as a dynamic programming problem that maps well to parallel prefix sums |
| // where each object is a "monoid" (https://en.wikipedia.org/wiki/Monoid) that supports algebraic |
| // addition/subtraction over data encoded in the path tags themselves. |
| // |
| // Once computed, a monoid contains the offsets into the input (and sometimes output) buffers for a |
| // given object. The parallel prefix sums operation is defined as a monoidal reduce + pre-scan pair. |
| // (Prefix Sums and Their Applications, Blelloch, G., https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf) |
| // |
| // While these concepts are an implementation detail they are core to the Vello algorithm and are |
| // reflected in the pipeline names and data slot definitions. |
| // |
| // === Full Pipeline === |
| // |
| // The full Vello pipeline stages are as follows and should be dispatched in the following order: |
| // |
| // I. Build the path monoid stream: |
| // If the input fits within the workgroup size: |
| // pathtag_reduce, pathtag_scan_small |
| // else |
| // pathtag_reduce, pathtag_reduce2, pathtag_scan1, pathtag_scan_large |
| // |
// II. Compute path bounding boxes, flatten path elements into a line soup:
//     bbox_clear, flatten
| // |
| // III. Process the draw object stream to build the draw monoids and inputs to the clip stage: |
| // draw_reduce, draw_leaf |
| // |
| // IV. Compute the bounding boxes for the clip stack from the input stream, if the scene contains |
| // clips: |
| // clip_reduce, clip_leaf |
| // |
// V. Allocate tile and segment buffers for the individual bins and prepare for coarse rasterization
//    binning, tile_alloc, path_count_setup, path_count
| // |
| // VI. Coarse rasterization |
| // backdrop_dyn, coarse |
| // |
| // VII. Fine rasterization |
| // fine |
| // |
| // TODO: Document the coverage mask pipeline once it has been re-implemented. |
| |
// ***
// Shared buffers that are accessed by various stages.
//
// The render configuration uniform buffer.
constexpr int kVelloSlot_ConfigUniform = 0;

// The scene encoding buffer.
constexpr int kVelloSlot_Scene = 1;

// ***
// Buffers used during the element processing stage. This stage converts the stream of variable
// length path tags, transforms, and brushes into a "path monoid" stream containing buffer offsets
// for the subsequent stages that associate the input streams with individual draw elements. This
// stage performs a parallel prefix sum (reduce + scan) which can be performed in two dispatches if
// the entire input can be processed by a single workgroup per dispatch. Otherwise, the algorithm
// requires two additional dispatches to continue the traversal (this is due to a lack of
// primitives to synchronize execution across workgroups in MSL and WGSL).
//
// Single pass variant pipelines: pathtag_reduce, pathtag_scan_small
// Multi-pass variant pipelines: pathtag_reduce, pathtag_reduce2, pathtag_scan1, pathtag_scan_large
constexpr int kVelloSlot_TagMonoid = 2;

// Single pass variant slots:
constexpr int kVelloSlot_PathtagReduceOutput = 3;

// Multi pass variant slots (the first-pass reduce output reuses the single-pass output slot):
constexpr int kVelloSlot_LargePathtagReduceFirstPassOutput = kVelloSlot_PathtagReduceOutput;
constexpr int kVelloSlot_LargePathtagReduceSecondPassOutput = 4;
constexpr int kVelloSlot_LargePathtagScanFirstPassOutput = 5;

// ***
// The second part of element processing flattens path elements (moveTo, lineTo, quadTo, etc) into
// an unordered line soup buffer and computes their bounding boxes. This stage is where strokes get
// expanded to fills and stroke styles get applied. The output is an unordered "line soup" buffer
// and the tight device-space bounding box of each path.
//
// Pipelines: bbox_clear, flatten
constexpr int kVelloSlot_PathBBoxes = 6;
constexpr int kVelloSlot_Lines = 7;

// ***
// The next part prepares the draw object stream (entries in the per-tile command list, aka PTCL)
// and additional metadata for the subsequent clipping and binning stages.
//
// Pipelines: draw_reduce, draw_leaf
constexpr int kVelloSlot_DrawReduceOutput = 8;
constexpr int kVelloSlot_DrawMonoid = 9;
constexpr int kVelloSlot_InfoBinData = 10;
constexpr int kVelloSlot_ClipInput = 11;

// ***
// Clipping. The outputs of this stage are the finalized draw monoid and the clip bounding-boxes.
// Clipping involves evaluating the stack monoid; refer to the following references for the meaning
// of these buffers: https://arxiv.org/pdf/2205.11659.pdf,
// https://en.wikipedia.org/wiki/Bicyclic_semigroup
//
// Pipelines: clip_reduce, clip_leaf
constexpr int kVelloSlot_ClipBicyclic = 12;
constexpr int kVelloSlot_ClipElement = 13;
constexpr int kVelloSlot_ClipBBoxes = 14;

// ***
// Buffers containing bump allocated data, the inputs and outputs to the binning, coarse raster,
// and per-tile segment assembly stages.
//
// Pipelines: binning, tile_alloc, path_count, backdrop, coarse, path_tiling
constexpr int kVelloSlot_DrawBBoxes = 15;
constexpr int kVelloSlot_BumpAlloc = 16;
constexpr int kVelloSlot_BinHeader = 17;

constexpr int kVelloSlot_Path = 18;
constexpr int kVelloSlot_Tile = 19;
constexpr int kVelloSlot_SegmentCounts = 20;
constexpr int kVelloSlot_Segments = 21;
constexpr int kVelloSlot_PTCL = 22;

// ***
// Texture resources used by the fine rasterization stage. The gradient image needs to get
// populated on the CPU with pre-computed gradient ramps. The image atlas is intended to hold
// pre-uploaded images that are composited into the scene.
//
// The output image contains the final render.
constexpr int kVelloSlot_OutputImage = 23;
constexpr int kVelloSlot_GradientImage = 24;
constexpr int kVelloSlot_ImageAtlas = 25;

// ***
// The indirect count buffer is used to issue an indirect dispatch of the path count and path
// tiling stages.
constexpr int kVelloSlot_IndirectCount = 26;

// ***
// The sample mask lookup table used in MSAA modes of the fine rasterization stage.
constexpr int kVelloSlot_MaskLUT = 27;
| |
// Returns the name of the given Vello shader stage, mirroring the pipeline stage names in the
// shader metadata.
std::string_view VelloStageName(vello_cpp::ShaderStage);

// Returns the workgroup dimensions to use when dispatching the given stage.
WorkgroupSize VelloStageLocalSize(vello_cpp::ShaderStage);

// Returns descriptors for the workgroup-shared buffers that the given stage requires (may be
// empty).
skia_private::TArray<ComputeStep::WorkgroupBufferDesc> VelloWorkgroupBuffers(
        vello_cpp::ShaderStage);

// Returns the stage's shader source in the requested native (backend-specific) format.
ComputeStep::NativeShaderSource VelloNativeShaderSource(vello_cpp::ShaderStage,
                                                        ComputeStep::NativeShaderFormat);
| |
| template <vello_cpp::ShaderStage S> |
| class VelloStep : public ComputeStep { |
| public: |
| ~VelloStep() override = default; |
| |
| NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override { |
| return VelloNativeShaderSource(S, format); |
| } |
| |
| protected: |
| explicit VelloStep(SkSpan<const ResourceDesc> resources) |
| : ComputeStep(VelloStageName(S), |
| VelloStageLocalSize(S), |
| resources, |
| AsSpan<ComputeStep::WorkgroupBufferDesc>(VelloWorkgroupBuffers(S)), |
| Flags::kSupportsNativeShader) {} |
| |
| private: |
| // Helper that creates a SkSpan from a universal reference to a container. Generally, creating a |
| // SkSpan from an rvalue reference is not safe since the pointer stored in the SkSpan will |
| // dangle beyond the constructor expression. In our usage in the constructor above, |
| // the lifetime of the temporary TArray should match that of the SkSpan, both of which should |
| // live through the constructor call expression. |
| // |
| // From https://en.cppreference.com/w/cpp/language/reference_initialization#Lifetime_of_a_temporary: |
| // |
| // a temporary bound to a reference parameter in a function call exists until the end of the |
| // full expression containing that function call |
| // |
| template <typename T, typename C> |
| static SkSpan<const T> AsSpan(C&& container) { |
| return SkSpan(std::data(container), std::size(container)); |
| } |
| }; |
| |
// Declares a final ComputeStep subclass named Vello<stage>Step for the identically named
// vello_cpp::ShaderStage enumerator. Each generated class only declares its constructor;
// the definition lives out-of-line (presumably in the corresponding .cpp, where each step's
// resource list is specified — TODO confirm).
#define VELLO_COMPUTE_STEP(stage)                                                      \
    class Vello##stage##Step final : public VelloStep<vello_cpp::ShaderStage::stage> { \
    public:                                                                            \
        Vello##stage##Step();                                                          \
    };

VELLO_COMPUTE_STEP(BackdropDyn);
VELLO_COMPUTE_STEP(BboxClear);
VELLO_COMPUTE_STEP(Binning);
VELLO_COMPUTE_STEP(ClipLeaf);
VELLO_COMPUTE_STEP(ClipReduce);
VELLO_COMPUTE_STEP(Coarse);
VELLO_COMPUTE_STEP(Flatten);
VELLO_COMPUTE_STEP(DrawLeaf);
VELLO_COMPUTE_STEP(DrawReduce);
VELLO_COMPUTE_STEP(PathCount);
VELLO_COMPUTE_STEP(PathCountSetup);
VELLO_COMPUTE_STEP(PathTiling);
VELLO_COMPUTE_STEP(PathTilingSetup);
VELLO_COMPUTE_STEP(PathtagReduce);
VELLO_COMPUTE_STEP(PathtagReduce2);
VELLO_COMPUTE_STEP(PathtagScan1);
VELLO_COMPUTE_STEP(PathtagScanLarge);
VELLO_COMPUTE_STEP(PathtagScanSmall);
VELLO_COMPUTE_STEP(TileAlloc);

#undef VELLO_COMPUTE_STEP
| |
| template <vello_cpp::ShaderStage S, SkColorType T> class VelloFineStepBase : public VelloStep<S> { |
| public: |
| // We need to return a texture format for the bound textures. |
| std::tuple<SkISize, SkColorType> calculateTextureParameters( |
| int index, const ComputeStep::ResourceDesc&) const override { |
| SkASSERT(index == 4); |
| // TODO: The texture dimensions are unknown here so this method returns 0 for the texture |
| // size. In this case this field is unused since VelloRenderer assigns texture resources |
| // directly to the DispatchGroupBuilder. The format must still be queried to describe the |
| // ComputeStep's binding layout. This method could be improved to enable conditional |
| // querying of optional/dynamic parameters. |
| return {{}, T}; |
| } |
| |
| protected: |
| explicit VelloFineStepBase(SkSpan<const ComputeStep::ResourceDesc> resources) |
| : VelloStep<S>(resources) {} |
| }; |
| |
| template <vello_cpp::ShaderStage S, SkColorType T> |
| class VelloFineMsaa16StepBase : public VelloFineStepBase<S, T> { |
| public: |
| size_t calculateBufferSize(int resourceIndex, const ComputeStep::ResourceDesc&) const override { |
| SkASSERT(resourceIndex == 5); |
| return 64 * 64 * 2; // 64x64 LUT of 16-bit masks. |
| } |
| |
| void prepareStorageBuffer(int resourceIndex, |
| const ComputeStep::ResourceDesc&, |
| void* buffer, |
| size_t bufferSize) const override { |
| SkASSERT(resourceIndex == 5); |
| SkASSERT(fMaskLut.size() == bufferSize); |
| memcpy(buffer, fMaskLut.data(), fMaskLut.size()); |
| } |
| |
| protected: |
| explicit VelloFineMsaa16StepBase(SkSpan<const ComputeStep::ResourceDesc> resources) |
| : VelloFineStepBase<S, T>(resources), fMaskLut(vello_cpp::build_mask_lut_16()) {} |
| |
| private: |
| ::rust::Vec<uint8_t> fMaskLut; |
| }; |
| |
// Fine rasterization using analytic-area antialiasing, rendering to an RGBA8888 output image.
class VelloFineAreaStep final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineArea, kRGBA_8888_SkColorType> {
public:
    VelloFineAreaStep();
};
| |
// Fine rasterization using analytic-area antialiasing, rendering to an alpha-only (R8) output
// image.
class VelloFineAreaAlpha8Step final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineAreaR8, kAlpha_8_SkColorType> {
public:
    VelloFineAreaAlpha8Step();
};
| |
// Fine rasterization using 16-sample MSAA (via the sample mask LUT), rendering to an RGBA8888
// output image.
class VelloFineMsaa16Step final : public VelloFineMsaa16StepBase<vello_cpp::ShaderStage::FineMsaa16,
                                                                 kRGBA_8888_SkColorType> {
public:
    VelloFineMsaa16Step();
};
| |
// Fine rasterization using 16-sample MSAA (via the sample mask LUT), rendering to an alpha-only
// (R8) output image.
class VelloFineMsaa16Alpha8Step final
        : public VelloFineMsaa16StepBase<vello_cpp::ShaderStage::FineMsaa16R8,
                                         kAlpha_8_SkColorType> {
public:
    VelloFineMsaa16Alpha8Step();
};
| |
| } // namespace skgpu::graphite |
| |
| #endif // skgpu_graphite_compute_VelloComputeSteps_DEFINED |