/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkRasterPipeline_DEFINED
#define SkRasterPipeline_DEFINED
#include "include/core/SkColor.h"
#include "include/core/SkTypes.h"
#include "include/private/SkMacros.h"
#include "src/core/SkArenaAlloc.h"
#include <cstddef>
#include <cstdint>
#include <functional>
class SkMatrix;
enum SkColorType : int;
struct SkImageInfo;
struct skcms_TransferFunction;
#if __has_cpp_attribute(clang::musttail) && !defined(__EMSCRIPTEN__) && !defined(SK_CPU_ARM32)
#define SK_HAS_MUSTTAIL 1
#else
#define SK_HAS_MUSTTAIL 0
#endif
/**
* SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
*
* It's particularly designed for situations where the potential pipeline is extremely
* combinatorial: {N dst formats} x {M source formats} x {K mask formats} x {C transfer modes} ...
* No one wants to write specialized routines for all those combinations, and if we did, we'd
* end up bloating our code size dramatically. SkRasterPipeline stages can be chained together
* at runtime, so we can scale this problem linearly rather than combinatorially.
*
* Each stage is represented by a function conforming to a common interface and by an
* arbitrary context pointer. The stage function arguments and calling convention are
* designed to maximize the amount of data we can pass along the pipeline cheaply, and
* vary depending on CPU feature detection.
*/
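// A minimal usage sketch (hypothetical buffer and size names; real call sites
// live in Skia's blitters and SkOpts):
//
//     SkSTArenaAlloc<256> alloc;
//     SkRasterPipeline p(&alloc);
//     SkRasterPipeline_MemoryCtx dst = { dstPixels, /*stride=*/rowWidthInPixels };
//     p.append_constant_color(&alloc, SkColors::kRed);
//     p.append_store(kRGBA_8888_SkColorType, &dst);
//     p.run(/*x=*/0, /*y=*/0, /*w=*/width, /*h=*/height);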
// There are two macros here: the first defines stages that have both lowp and highp
// implementations; the second defines stages that exist only in the highp pipeline.
#define SK_RASTER_PIPELINE_STAGES_LOWP(M) \
M(move_src_dst) M(move_dst_src) M(swap_src_dst) \
M(clamp_01) M(clamp_gamut) \
M(premul) M(premul_dst) \
M(force_opaque) M(force_opaque_dst) \
M(set_rgb) M(swap_rb) M(swap_rb_dst) \
M(black_color) M(white_color) \
M(uniform_color) M(uniform_color_dst) \
M(seed_shader) \
M(load_a8) M(load_a8_dst) M(store_a8) M(gather_a8) \
M(load_565) M(load_565_dst) M(store_565) M(gather_565) \
M(load_4444) M(load_4444_dst) M(store_4444) M(gather_4444) \
M(load_8888) M(load_8888_dst) M(store_8888) M(gather_8888) \
M(load_rg88) M(load_rg88_dst) M(store_rg88) M(gather_rg88) \
M(store_r8) \
M(alpha_to_gray) M(alpha_to_gray_dst) \
M(alpha_to_red) M(alpha_to_red_dst) \
M(bt709_luminance_or_luma_to_alpha) M(bt709_luminance_or_luma_to_rgb) \
M(bilerp_clamp_8888) \
M(load_src) M(store_src) M(store_src_a) M(load_dst) M(store_dst) \
M(scale_u8) M(scale_565) M(scale_1_float) M(scale_native) \
M( lerp_u8) M( lerp_565) M( lerp_1_float) M(lerp_native) \
M(dstatop) M(dstin) M(dstout) M(dstover) \
M(srcatop) M(srcin) M(srcout) M(srcover) \
M(clear) M(modulate) M(multiply) M(plus_) M(screen) M(xor_) \
M(darken) M(difference) \
M(exclusion) M(hardlight) M(lighten) M(overlay) \
M(srcover_rgba_8888) \
M(matrix_translate) M(matrix_scale_translate) \
M(matrix_2x3) \
M(matrix_perspective) \
M(decal_x) M(decal_y) M(decal_x_and_y) \
M(check_decal_mask) \
M(clamp_x_1) M(mirror_x_1) M(repeat_x_1) \
M(evenly_spaced_gradient) \
M(gradient) \
M(evenly_spaced_2_stop_gradient) \
M(xy_to_unit_angle) \
M(xy_to_radius) \
M(emboss) \
M(swizzle)
#define SK_RASTER_PIPELINE_STAGES_HIGHP_ONLY(M) \
M(callback) \
M(stack_checkpoint) M(stack_rewind) \
M(unbounded_set_rgb) M(unbounded_uniform_color) \
M(unpremul) M(unpremul_polar) M(dither) \
M(load_16161616) M(load_16161616_dst) M(store_16161616) M(gather_16161616) \
M(load_a16) M(load_a16_dst) M(store_a16) M(gather_a16) \
M(load_rg1616) M(load_rg1616_dst) M(store_rg1616) M(gather_rg1616) \
M(load_f16) M(load_f16_dst) M(store_f16) M(gather_f16) \
M(load_af16) M(load_af16_dst) M(store_af16) M(gather_af16) \
M(load_rgf16) M(load_rgf16_dst) M(store_rgf16) M(gather_rgf16) \
M(load_f32) M(load_f32_dst) M(store_f32) M(gather_f32) \
M(load_rgf32) M(store_rgf32) \
M(load_1010102) M(load_1010102_dst) M(store_1010102) M(gather_1010102) \
M(store_u16_be) \
M(byte_tables) \
M(colorburn) M(colordodge) M(softlight) \
M(hue) M(saturation) M(color) M(luminosity) \
M(matrix_3x3) M(matrix_3x4) M(matrix_4x5) M(matrix_4x3) \
M(parametric) M(gamma_) M(PQish) M(HLGish) M(HLGinvish) \
M(rgb_to_hsl) M(hsl_to_rgb) \
M(css_lab_to_xyz) M(css_oklab_to_linear_srgb) \
M(css_hcl_to_lab) \
M(css_hsl_to_srgb) M(css_hwb_to_srgb) \
M(gauss_a_to_rgba) \
M(mirror_x) M(repeat_x) \
M(mirror_y) M(repeat_y) \
M(negate_x) \
M(bicubic_clamp_8888) \
M(bilinear_nx) M(bilinear_px) M(bilinear_ny) M(bilinear_py) \
M(bicubic_setup) \
M(bicubic_n3x) M(bicubic_n1x) M(bicubic_p1x) M(bicubic_p3x) \
M(bicubic_n3y) M(bicubic_n1y) M(bicubic_p1y) M(bicubic_p3y) \
M(save_xy) M(accumulate) \
M(xy_to_2pt_conical_strip) \
M(xy_to_2pt_conical_focal_on_circle) \
M(xy_to_2pt_conical_well_behaved) \
M(xy_to_2pt_conical_smaller) \
M(xy_to_2pt_conical_greater) \
M(alter_2pt_conical_compensate_focal) \
M(alter_2pt_conical_unswap) \
M(mask_2pt_conical_nan) \
M(mask_2pt_conical_degenerates) M(apply_vector_mask) \
/* Dedicated SkSL stages begin here: */ \
M(init_lane_masks) M(store_src_rg) M(immediate_f) \
M(load_unmasked) M(store_unmasked) M(store_masked) \
M(load_condition_mask) M(store_condition_mask) M(merge_condition_mask) \
M(load_loop_mask) M(store_loop_mask) M(mask_off_loop_mask) \
M(reenable_loop_mask) M(merge_loop_mask) \
M(load_return_mask) M(store_return_mask) M(mask_off_return_mask) \
M(branch_if_any_active_lanes) M(branch_if_no_active_lanes) M(jump) \
M(bitwise_and_n_ints) \
M(bitwise_and_int) M(bitwise_and_2_ints) M(bitwise_and_3_ints) M(bitwise_and_4_ints) \
M(bitwise_or_n_ints) \
M(bitwise_or_int) M(bitwise_or_2_ints) M(bitwise_or_3_ints) M(bitwise_or_4_ints) \
M(bitwise_xor_n_ints) \
M(bitwise_xor_int) M(bitwise_xor_2_ints) M(bitwise_xor_3_ints) M(bitwise_xor_4_ints) \
M(bitwise_not_int) M(bitwise_not_2_ints) M(bitwise_not_3_ints) M(bitwise_not_4_ints) \
M(cast_to_float_from_int) M(cast_to_float_from_2_ints) \
M(cast_to_float_from_3_ints) M(cast_to_float_from_4_ints) \
M(cast_to_float_from_uint) M(cast_to_float_from_2_uints) \
M(cast_to_float_from_3_uints) M(cast_to_float_from_4_uints) \
M(cast_to_int_from_float) M(cast_to_int_from_2_floats) \
M(cast_to_int_from_3_floats) M(cast_to_int_from_4_floats) \
M(cast_to_uint_from_float) M(cast_to_uint_from_2_floats) \
M(cast_to_uint_from_3_floats) M(cast_to_uint_from_4_floats) \
M(abs_float) M(abs_2_floats) M(abs_3_floats) M(abs_4_floats) \
M(abs_int) M(abs_2_ints) M(abs_3_ints) M(abs_4_ints) \
M(floor_float) M(floor_2_floats) M(floor_3_floats) M(floor_4_floats) \
M(ceil_float) M(ceil_2_floats) M(ceil_3_floats) M(ceil_4_floats) \
M(copy_constant) M(copy_2_constants) M(copy_3_constants) M(copy_4_constants) \
M(copy_slot_masked) M(copy_2_slots_masked) M(copy_3_slots_masked) M(copy_4_slots_masked) \
M(copy_slot_unmasked) M(copy_2_slots_unmasked) \
M(copy_3_slots_unmasked) M(copy_4_slots_unmasked) \
M(zero_slot_unmasked) M(zero_2_slots_unmasked) \
M(zero_3_slots_unmasked) M(zero_4_slots_unmasked) \
M(swizzle_1) M(swizzle_2) M(swizzle_3) M(swizzle_4) M(transpose) \
M(add_n_floats) M(add_float) M(add_2_floats) M(add_3_floats) M(add_4_floats) \
M(add_n_ints) M(add_int) M(add_2_ints) M(add_3_ints) M(add_4_ints) \
M(sub_n_floats) M(sub_float) M(sub_2_floats) M(sub_3_floats) M(sub_4_floats) \
M(sub_n_ints) M(sub_int) M(sub_2_ints) M(sub_3_ints) M(sub_4_ints) \
M(mul_n_floats) M(mul_float) M(mul_2_floats) M(mul_3_floats) M(mul_4_floats) \
M(mul_n_ints) M(mul_int) M(mul_2_ints) M(mul_3_ints) M(mul_4_ints) \
M(div_n_floats) M(div_float) M(div_2_floats) M(div_3_floats) M(div_4_floats) \
M(div_n_ints) M(div_int) M(div_2_ints) M(div_3_ints) M(div_4_ints) \
M(div_n_uints) M(div_uint) M(div_2_uints) M(div_3_uints) M(div_4_uints) \
M(max_n_floats) M(max_float) M(max_2_floats) M(max_3_floats) M(max_4_floats) \
M(max_n_ints) M(max_int) M(max_2_ints) M(max_3_ints) M(max_4_ints) \
M(max_n_uints) M(max_uint) M(max_2_uints) M(max_3_uints) M(max_4_uints) \
M(min_n_floats) M(min_float) M(min_2_floats) M(min_3_floats) M(min_4_floats) \
M(min_n_ints) M(min_int) M(min_2_ints) M(min_3_ints) M(min_4_ints) \
M(min_n_uints) M(min_uint) M(min_2_uints) M(min_3_uints) M(min_4_uints) \
M(mix_n_floats) M(mix_float) M(mix_2_floats) M(mix_3_floats) M(mix_4_floats) \
M(cmplt_n_floats) M(cmplt_float) M(cmplt_2_floats) M(cmplt_3_floats) M(cmplt_4_floats) \
M(cmplt_n_ints) M(cmplt_int) M(cmplt_2_ints) M(cmplt_3_ints) M(cmplt_4_ints) \
M(cmplt_n_uints) M(cmplt_uint) M(cmplt_2_uints) M(cmplt_3_uints) M(cmplt_4_uints) \
M(cmple_n_floats) M(cmple_float) M(cmple_2_floats) M(cmple_3_floats) M(cmple_4_floats) \
M(cmple_n_ints) M(cmple_int) M(cmple_2_ints) M(cmple_3_ints) M(cmple_4_ints) \
M(cmple_n_uints) M(cmple_uint) M(cmple_2_uints) M(cmple_3_uints) M(cmple_4_uints) \
M(cmpeq_n_floats) M(cmpeq_float) M(cmpeq_2_floats) M(cmpeq_3_floats) M(cmpeq_4_floats) \
M(cmpeq_n_ints) M(cmpeq_int) M(cmpeq_2_ints) M(cmpeq_3_ints) M(cmpeq_4_ints) \
M(cmpne_n_floats) M(cmpne_float) M(cmpne_2_floats) M(cmpne_3_floats) M(cmpne_4_floats) \
M(cmpne_n_ints) M(cmpne_int) M(cmpne_2_ints) M(cmpne_3_ints) M(cmpne_4_ints)
// The combined list of all stages:
#define SK_RASTER_PIPELINE_STAGES_ALL(M) \
SK_RASTER_PIPELINE_STAGES_LOWP(M) \
SK_RASTER_PIPELINE_STAGES_HIGHP_ONLY(M)
// The largest number of pixels we handle at a time. We have a separate value for the largest number
// of pixels we handle in the highp pipeline. Many of the context structs in this file are only used
// by stages that have no lowp implementation. They can therefore use the (smaller) highp value to
// save memory in the arena.
inline static constexpr int SkRasterPipeline_kMaxStride = 16;
inline static constexpr int SkRasterPipeline_kMaxStride_highp = 8;
// Raster pipeline programs are stored as a contiguous array of SkRasterPipelineStages.
SK_BEGIN_REQUIRE_DENSE
struct SkRasterPipelineStage {
// A function pointer from `stages_lowp` or `stages_highp`. The exact function pointer type
// varies depending on architecture (specifically, see `Stage` in SkRasterPipeline_opts.h).
void (*fn)();
// Data used by the stage function. Most context structures are declared at the top of
// SkRasterPipeline.h, and have names ending in Ctx (e.g. "SkRasterPipeline_SamplerCtx").
void* ctx;
};
SK_END_REQUIRE_DENSE
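// For example, a three-stage program occupies three consecutive entries
// (schematically, with hypothetical stage function pointers and contexts):
//
//     SkRasterPipelineStage program[] = {
//         { load_8888_fn,  &loadCtx  },
//         { srcover_fn,    nullptr   },
//         { store_8888_fn, &storeCtx },
//     };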
// Structs representing the arguments to some common stages.
struct SkRasterPipeline_MemoryCtx {
void* pixels;
int stride;
};
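// Load/store stages address a MemoryCtx as (T*)pixels + y*stride + x, where T is
// the stage's pixel type; stride is measured in pixels, not bytes.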
struct SkRasterPipeline_GatherCtx {
const void* pixels;
int stride;
float width;
float height;
float weights[16]; // for bicubic and bicubic_clamp_8888
// Controls whether pixel i-1 or i is selected when the floating-point sample position is exactly i.
bool roundDownAtInteger = false;
};
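// Gather stages clamp the incoming sample position to the image bounds and then
// fetch pixels[y*stride + x]; width and height are kept as floats so the clamp
// can happen before truncation. (Summary; details live in SkRasterPipeline_opts.h.)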
// State shared by save_xy, accumulate, and bilinear_* / bicubic_*.
struct SkRasterPipeline_SamplerCtx {
float x[SkRasterPipeline_kMaxStride_highp];
float y[SkRasterPipeline_kMaxStride_highp];
float fx[SkRasterPipeline_kMaxStride_highp];
float fy[SkRasterPipeline_kMaxStride_highp];
float scalex[SkRasterPipeline_kMaxStride_highp];
float scaley[SkRasterPipeline_kMaxStride_highp];
// for bicubic_[np][13][xy]
float weights[16];
float wx[4][SkRasterPipeline_kMaxStride_highp];
float wy[4][SkRasterPipeline_kMaxStride_highp];
};
struct SkRasterPipeline_TileCtx {
float scale;
float invScale; // cache of 1/scale
// When in the reflection portion of mirror tiling we need to snap in the opposite
// direction at integer sample points than we do in the forward portion. This controls
// which way we bias in the reflection. It should be 1 if
// SkRasterPipeline_GatherCtx::roundDownAtInteger is true and otherwise -1.
int mirrorBiasDir = -1;
};
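// For reference, repeat tiling evaluates roughly x' = x - floor(x * invScale) * scale,
// which is why 1/scale is cached here instead of being recomputed per pixel.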
struct SkRasterPipeline_DecalTileCtx {
uint32_t mask[SkRasterPipeline_kMaxStride];
float limit_x;
float limit_y;
// These control which edge of the interval is included (i.e. closed interval at 0 or at limit).
// They should be set to limit_x and limit_y if SkRasterPipeline_GatherCtx::roundDownAtInteger
// is true and otherwise zero.
float inclusiveEdge_x = 0;
float inclusiveEdge_y = 0;
};
struct SkRasterPipeline_CallbackCtx {
void (*fn)(SkRasterPipeline_CallbackCtx* self,
int active_pixels /*<= SkRasterPipeline_kMaxStride_highp*/);
// When called, fn() will have our active pixels available in rgba.
// When fn() returns, the pipeline will read back those active pixels from read_from.
float rgba[4*SkRasterPipeline_kMaxStride_highp];
float* read_from = rgba;
};
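// A sketch of a callback that forces red fully on, assuming the interleaved
// RGBA float layout used by the pipeline's f32 stores (force_red is hypothetical):
//
//     static void force_red(SkRasterPipeline_CallbackCtx* self, int active_pixels) {
//         for (int i = 0; i < active_pixels; i++) {
//             self->rgba[4*i + 0] = 1.0f;
//         }
//     }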
// State shared by stack_checkpoint and stack_rewind.
struct SkRasterPipeline_RewindCtx {
float r[SkRasterPipeline_kMaxStride_highp];
float g[SkRasterPipeline_kMaxStride_highp];
float b[SkRasterPipeline_kMaxStride_highp];
float a[SkRasterPipeline_kMaxStride_highp];
float dr[SkRasterPipeline_kMaxStride_highp];
float dg[SkRasterPipeline_kMaxStride_highp];
float db[SkRasterPipeline_kMaxStride_highp];
float da[SkRasterPipeline_kMaxStride_highp];
SkRasterPipelineStage* stage;
};
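// stack_rewind saves the current color registers and the stage to resume into this
// struct, then unwinds to the matching stack_checkpoint, which restores them and
// continues from `stage` on a fresh native stack. (Summary; the exact protocol
// lives in SkRasterPipeline_opts.h.)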
struct SkRasterPipeline_GradientCtx {
size_t stopCount;
float* fs[4];
float* bs[4];
float* ts;
};
struct SkRasterPipeline_EvenlySpaced2StopGradientCtx {
float f[4];
float b[4];
};
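// Each channel of the two-stop gradient is evaluated as c = f*t + b, with
// f = end - start and b = start, so no per-pixel stop search is needed.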
struct SkRasterPipeline_2PtConicalCtx {
uint32_t fMask[SkRasterPipeline_kMaxStride_highp];
float fP0, fP1;
};
struct SkRasterPipeline_UniformColorCtx {
float r,g,b,a;
uint16_t rgba[4]; // [0,255] in a 16-bit lane.
};
struct SkRasterPipeline_EmbossCtx {
SkRasterPipeline_MemoryCtx mul, add;
};
struct SkRasterPipeline_TablesCtx {
const uint8_t *r, *g, *b, *a;
};
struct SkRasterPipeline_BinaryOpCtx {
float *dst;
const float *src;
};
struct SkRasterPipeline_TernaryOpCtx {
float *dst;
const float *src0;
const float *src1;
};
struct SkRasterPipeline_SwizzleCtx {
float *ptr;
uint16_t offsets[4]; // values must be byte offsets (4 * highp-stride * component-index)
};
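// e.g. with an 8-lane highp stride, component 2 sits at byte offset 4*8*2 = 64.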
struct SkRasterPipeline_TransposeCtx {
float *ptr;
int count;
uint16_t offsets[16]; // values must be byte offsets (4 * highp-stride * component-index)
};
class SkRasterPipeline {
public:
explicit SkRasterPipeline(SkArenaAlloc*);
SkRasterPipeline(const SkRasterPipeline&) = delete;
SkRasterPipeline(SkRasterPipeline&&) = default;
SkRasterPipeline& operator=(const SkRasterPipeline&) = delete;
SkRasterPipeline& operator=(SkRasterPipeline&&) = default;
void reset();
enum Stage {
#define M(stage) stage,
SK_RASTER_PIPELINE_STAGES_ALL(M)
#undef M
};
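// Count the stages in each list by expanding every M(st) to "+1":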
#define M(st) +1
static constexpr int kNumLowpStages = SK_RASTER_PIPELINE_STAGES_LOWP(M);
static constexpr int kNumHighpStages = SK_RASTER_PIPELINE_STAGES_ALL(M);
#undef M
void append(Stage, void* = nullptr);
void append(Stage stage, const void* ctx) { this->append(stage, const_cast<void*>(ctx)); }
void append(Stage, uintptr_t ctx);
// Append all stages to this pipeline.
void extend(const SkRasterPipeline&);
// Runs the pipeline in 2d from (x,y) inclusive to (x+w,y+h) exclusive.
void run(size_t x, size_t y, size_t w, size_t h) const;
// Allocates a thunk in the arena that amortizes run()'s setup cost across repeated calls.
std::function<void(size_t, size_t, size_t, size_t)> compile() const;
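// A sketch of reuse (each call skips the per-run program build that run() does):
//
//     auto blit = p.compile();
//     blit(0, 0, w, 1);
//     blit(0, 1, w, 1);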
// Callers can inspect the stage list for debugging purposes.
struct StageList {
StageList* prev;
Stage stage;
void* ctx;
};
static const char* GetStageName(Stage stage);
const StageList* getStageList() const { return fStages; }
int getNumStages() const { return fNumStages; }
// Prints the entire StageList using SkDebugf.
void dump() const;
// Appends a stage for the specified matrix.
// Tries to optimize the stage by analyzing the type of matrix.
void append_matrix(SkArenaAlloc*, const SkMatrix&);
// Appends a stage for a constant uniform color.
// Tries to optimize the stage based on the color.
void append_constant_color(SkArenaAlloc*, const float rgba[4]);
void append_constant_color(SkArenaAlloc* alloc, const SkColor4f& color) {
this->append_constant_color(alloc, color.vec());
}
// Like append_constant_color() but only affecting r,g,b, ignoring the alpha channel.
void append_set_rgb(SkArenaAlloc*, const float rgb[3]);
void append_set_rgb(SkArenaAlloc* alloc, const SkColor4f& color) {
this->append_set_rgb(alloc, color.vec());
}
void append_load (SkColorType, const SkRasterPipeline_MemoryCtx*);
void append_load_dst(SkColorType, const SkRasterPipeline_MemoryCtx*);
void append_store (SkColorType, const SkRasterPipeline_MemoryCtx*);
void append_clamp_if_normalized(const SkImageInfo&);
void append_transfer_function(const skcms_TransferFunction&);
void append_stack_rewind();
bool empty() const { return fStages == nullptr; }
private:
bool build_lowp_pipeline(SkRasterPipelineStage* ip) const;
void build_highp_pipeline(SkRasterPipelineStage* ip) const;
using StartPipelineFn = void(*)(size_t,size_t,size_t,size_t, SkRasterPipelineStage* program);
StartPipelineFn build_pipeline(SkRasterPipelineStage*) const;
void unchecked_append(Stage, void*);
int stages_needed() const;
SkArenaAlloc* fAlloc;
SkRasterPipeline_RewindCtx* fRewindCtx;
StageList* fStages;
int fNumStages;
};
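// A convenience subclass that owns its own stack-based arena, e.g.:
//
//     SkRasterPipeline_<256> p;   // carries 256 bytes of arena storage inline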
template <size_t bytes>
class SkRasterPipeline_ : public SkRasterPipeline {
public:
SkRasterPipeline_()
: SkRasterPipeline(&fBuiltinAlloc) {}
private:
SkSTArenaAlloc<bytes> fBuiltinAlloc;
};
#endif//SkRasterPipeline_DEFINED