basic pass at gradients

This is looking pretty good.

Some egregious diffs, some invisible,
and a handful in-between I've investigated.

The egregious diffs look like bad shader caching.  I need to fix that,
but this change merely exposes that existing brokenness; it doesn't make it any worse.
To repro, run: lumafilter, srcmode.

These in-between diffs all fall within the existing variance:
   crbug_938592               ok
   analytic_gradients         ok
   gradients_dup_color_stops  ok
Probably different geometry math, FMAs, reordering, etc.

I will follow up with radial etc.
Shallow gradients also demonstrate we're missing dither.

Change-Id: I5a046ee42accdf1faed50b3b65efb4a6787b5e04
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/265821
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Mike Reed <reed@google.com>
diff --git a/src/shaders/SkImageShader.cpp b/src/shaders/SkImageShader.cpp
index f4a13eb..a88c349 100755
--- a/src/shaders/SkImageShader.cpp
+++ b/src/shaders/SkImageShader.cpp
@@ -664,30 +664,10 @@
     inv     = state->invMatrix();
     quality = state->quality();
     tweak_quality_and_inv_matrix(&quality, &inv);
+    inv.normalizePerspective();
 
     // Apply matrix to convert dst coords to sample center coords.
-    inv.normalizePerspective();
-    if (inv.isIdentity()) {
-        // That was easy.
-    } else if (inv.isTranslate()) {
-        x = p->add(x, p->uniformF(uniforms->pushF(inv[2])));
-        y = p->add(y, p->uniformF(uniforms->pushF(inv[5])));
-    } else if (inv.isScaleTranslate()) {
-        x = p->mad(x, p->uniformF(uniforms->pushF(inv[0])), p->uniformF(uniforms->pushF(inv[2])));
-        y = p->mad(y, p->uniformF(uniforms->pushF(inv[4])), p->uniformF(uniforms->pushF(inv[5])));
-    } else {  // Affine or perspective.
-        auto dot = [&,x,y](int row) {
-            return p->mad(x, p->uniformF(uniforms->pushF(inv[3*row+0])),
-                   p->mad(y, p->uniformF(uniforms->pushF(inv[3*row+1])),
-                             p->uniformF(uniforms->pushF(inv[3*row+2]))));
-        };
-        x = dot(0);
-        y = dot(1);
-        if (inv.hasPerspective()) {
-            x = p->div(x, dot(2));
-            y = p->div(y, dot(2));
-        }
-    }
+    SkShaderBase::ApplyMatrix(p, inv, &x,&y,uniforms);
 
     // Bail out if sample() can't yet handle our image's color type.
     switch (pm.colorType()) {
diff --git a/src/shaders/SkShader.cpp b/src/shaders/SkShader.cpp
index 8634f41..5c77cc8 100644
--- a/src/shaders/SkShader.cpp
+++ b/src/shaders/SkShader.cpp
@@ -234,6 +234,31 @@
     return false;
 }
 
+void SkShaderBase::ApplyMatrix(skvm::Builder* p, const SkMatrix& m,  // Emit ops mapping (*x,*y) through m, in place.
+                               skvm::F32* x, skvm::F32* y, skvm::Uniforms* uniforms) {
+    if (m.isIdentity()) {
+        // Identity: x and y pass through unchanged; no ops or uniforms are emitted.
+    } else if (m.isTranslate()) {
+        *x = p->add(*x, p->uniformF(uniforms->pushF(m[2])));  // m[2], m[5]: the x and y offsets
+        *y = p->add(*y, p->uniformF(uniforms->pushF(m[5])));
+    } else if (m.isScaleTranslate()) {
+        *x = p->mad(*x, p->uniformF(uniforms->pushF(m[0])), p->uniformF(uniforms->pushF(m[2])));
+        *y = p->mad(*y, p->uniformF(uniforms->pushF(m[4])), p->uniformF(uniforms->pushF(m[5])));
+    } else {  // Affine or perspective.
+        auto dot = [&,X=*x,Y=*y](int row) {  // X,Y are copies, so writes to *x,*y below don't feed back in
+            return p->mad(X, p->uniformF(uniforms->pushF(m[3*row+0])),
+                   p->mad(Y, p->uniformF(uniforms->pushF(m[3*row+1])),
+                             p->uniformF(uniforms->pushF(m[3*row+2]))));
+        };
+        *x = dot(0);
+        *y = dot(1);
+        if (m.hasPerspective()) {
+            *x = p->div(*x, dot(2));  // perspective divide by row 2 of m applied to (X,Y,1)
+            *y = p->div(*y, dot(2));  // each dot(2) call pushes its own copy of the row-2 uniforms
+        }
+    }
+}
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 sk_sp<SkFlattenable> SkEmptyShader::CreateProc(SkReadBuffer&) {
diff --git a/src/shaders/SkShaderBase.h b/src/shaders/SkShaderBase.h
index 4da4f89..bb80b9e 100644
--- a/src/shaders/SkShaderBase.h
+++ b/src/shaders/SkShaderBase.h
@@ -238,6 +238,9 @@
 
     virtual SkStageUpdater* onAppendUpdatableStages(const SkStageRec&) const { return nullptr; }
 
+protected:
+    static void ApplyMatrix(skvm::Builder*, const SkMatrix&, skvm::F32* x, skvm::F32* y, skvm::Uniforms*);
+
 private:
     // This is essentially const, but not officially so it can be modified in constructors.
     SkMatrix fLocalMatrix;
diff --git a/src/shaders/gradients/SkGradientShader.cpp b/src/shaders/gradients/SkGradientShader.cpp
index 641cbce..a8f76c4 100644
--- a/src/shaders/gradients/SkGradientShader.cpp
+++ b/src/shaders/gradients/SkGradientShader.cpp
@@ -9,9 +9,11 @@
 #include "include/core/SkMallocPixelRef.h"
 #include "include/private/SkFloatBits.h"
 #include "include/private/SkHalf.h"
+#include "include/private/SkVx.h"
 #include "src/core/SkColorSpacePriv.h"
 #include "src/core/SkConvertPixels.h"
 #include "src/core/SkReadBuffer.h"
+#include "src/core/SkVM.h"
 #include "src/core/SkWriteBuffer.h"
 #include "src/shaders/gradients/Sk4fLinearGradient.h"
 #include "src/shaders/gradients/SkGradientShaderPriv.h"
@@ -219,6 +221,7 @@
     (ctx->fs[1])[stop] = Fs.fG;
     (ctx->fs[2])[stop] = Fs.fB;
     (ctx->fs[3])[stop] = Fs.fA;
+
     (ctx->bs[0])[stop] = Bs.fR;
     (ctx->bs[1])[stop] = Bs.fG;
     (ctx->bs[2])[stop] = Bs.fB;
@@ -414,6 +417,180 @@
     return true;
 }
 
+bool SkGradientShaderBase::onProgram(skvm::Builder* p,  // Emits this gradient's skvm ops; false if no inverse or transformT() declines.
+                                     const SkMatrix& ctm, const SkMatrix* localM,
+                                     SkFilterQuality quality, SkColorSpace* dstCS,
+                                     skvm::Uniforms* uniforms, SkArenaAlloc* alloc,
+                                     skvm::F32 x, skvm::F32 y,
+                                     skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32* a) const {
+    SkMatrix inv;
+    if (!this->computeTotalInverse(ctm, localM, &inv)) {
+        return false;
+    }
+    inv.postConcat(fPtsToUnit);
+    inv.normalizePerspective();
+
+    // Having tacked on fPtsToUnit at the end means we'll be left with t in x.
+    SkShaderBase::ApplyMatrix(p, inv, &x,&y,uniforms);
+    skvm::F32 t = x;
+    if (!this->transformT(p, &t)) {  // Hook into subclasses for linear, radial, etc.
+        return false;
+    }
+
+    // Most tiling happens here, with kDecal doing its work at the end.
+    // Perhaps unexpectedly, all clamping is handled by our search, so
+    // we don't explicitly clamp t to [0,1].  That clamp would break
+    // hard stops right at 0 or 1 boundaries in kClamp mode.
+    // (kRepeat and kMirror always produce values in [0,1].)
+    switch(fTileMode) {
+        case SkTileMode::kDecal:  break;
+        case SkTileMode::kClamp:  break;
+        case SkTileMode::kRepeat: t = p->sub(t, p->floor(t)); break;
+        case SkTileMode::kMirror: {
+            // t = | (t-1) - 2*(floor( (t-1)*0.5 )) - 1 |
+            //       {-A-}      {--------B-------}
+            skvm::F32 A = p->sub(t, p->splat(1.0f)),
+                      B = p->floor( p->mul(A, p->splat(0.5f)));
+            t = p->abs(p->sub(p->sub(A, p->add(B,B)),
+                              p->splat(1.0f)));
+        } break;
+    }
+
+    // Transform our colors as we want them interpolated, in dst color space, possibly premul.
+    SkImageInfo common = SkImageInfo::Make(fColorCount,1, kRGBA_F32_SkColorType
+                                                        , kUnpremul_SkAlphaType),
+                src  = common.makeColorSpace(fColorSpace),
+                dst  = common.makeColorSpace(sk_ref_sp(dstCS));
+    if (fGradFlags & SkGradientShader::kInterpolateColorsInPremul_Flag) {
+        dst = dst.makeAlphaType(kPremul_SkAlphaType);
+    }
+
+    std::vector<float> rgba(4*fColorCount);  // TODO: SkSTArray?
+    SkConvertPixels(dst,   rgba.data(), dst.minRowBytes(),
+                    src, fOrigColors4f, src.minRowBytes());
+
+    // Transform our colors into a scale factor f and bias b such that for
+    // any t between stops i and i+1, the color we want is mad(t, f[i], b[i]).
+    using F4 = skvx::Vec<4,float>;
+    struct FB { F4 f,b; };
+
+    if (fColorCount == 2) {
+        // 2-stop gradients have colors at 0 and 1, and so must be evenly spaced.
+        SkASSERT(fOrigPos == nullptr);
+
+        // With 2 stops, we upload the single FB as uniforms and interpolate directly with t.
+        F4 lo = F4::Load(rgba.data() + 0),
+           hi = F4::Load(rgba.data() + 4);
+        F4 F = hi - lo,
+           B = lo;
+
+        auto T = p->clamp(t, p->splat(0.0f), p->splat(1.0f));  // no stop search on this path, so clamp explicitly
+        *r = p->mad(T, p->uniformF(uniforms->pushF(F[0])), p->uniformF(uniforms->pushF(B[0])));
+        *g = p->mad(T, p->uniformF(uniforms->pushF(F[1])), p->uniformF(uniforms->pushF(B[1])));
+        *b = p->mad(T, p->uniformF(uniforms->pushF(F[2])), p->uniformF(uniforms->pushF(B[2])));
+        *a = p->mad(T, p->uniformF(uniforms->pushF(F[3])), p->uniformF(uniforms->pushF(B[3])));
+    } else {
+        // To handle clamps in search we add a conceptual stop at t=-inf, so we
+        // may need up to fColorCount+1 FBs and fColorCount t stops between them:
+        //
+        //   FBs:         [color 0]  [color 0->1]  [color 1->2]  [color 2->3]  ...
+        //   stops:  (-inf)        t0            t1            t2  ...
+        //
+        // Both these arrays could end up shorter if any hard stops share the same t.
+        FB* fb = alloc->makeArrayDefault<FB>(fColorCount+1);
+        std::vector<float> stops;  // TODO: SkSTArray?
+        stops.reserve(fColorCount);
+
+        // Here's our conceptual stop at t=-inf covering all t<=0, clamping to our first color.
+        float  t_lo = this->getPos(0);
+        F4 color_lo = F4::Load(rgba.data());
+        fb[0] = { 0.0f, color_lo };  // f=0, b=first color: mad(t,f,b) is constant
+        // N.B. No stops[] entry for this implicit -inf.
+
+        // Now the non-edge cases, calculating scale and bias between adjacent normal stops.
+        for (int i = 1; i < fColorCount; i++) {
+            float  t_hi = this->getPos(i);
+            F4 color_hi = F4::Load(rgba.data() + 4*i);
+
+            // If t_lo == t_hi, we're on a hard stop, and transition immediately to the next color.
+            SkASSERT(t_lo <= t_hi);
+            if (t_lo < t_hi) {
+                F4 f = (color_hi - color_lo) / (t_hi - t_lo),
+                   b = color_lo - f*t_lo;
+                stops.push_back(t_lo);
+                fb[stops.size()] = {f,b};  // fb[k] is the segment that begins at stops[k-1]
+            }
+
+            t_lo = t_hi;
+            color_lo = color_hi;
+        }
+        // Anything >= our final t clamps to our final color.
+        stops.push_back(t_lo);
+        fb[stops.size()] = { 0.0f, color_lo };  // f=0, b=final color: constant again
+
+        // We'll gather FBs from that array we just created.
+        skvm::Builder::Uniform fbs = uniforms->pushPtr(fb);
+
+        // Find the two stops we need to interpolate.
+        skvm::I32 ix;
+        if (fOrigPos == nullptr) {
+            // Evenly spaced stops... we can calculate ix directly.
+            // Of note: we need to clamp t and skip over that conceptual -inf stop we made up.
+            ix = p->trunc(p->mad(p->clamp(t, p->splat(0.0f), p->splat(1.0f)),
+                                 p->uniformF(uniforms->pushF(stops.size() - 1.0f)),
+                                 p->splat(1.0f)));
+        } else {
+            // Starting ix at 0 bakes in our conceptual first stop at -inf.
+            // TODO: good place to experiment with a loop in skvm.... stops.size() can be huge.
+            ix = p->splat(0);
+            for (float stop : stops) {
+                // ix += (t >= stop) ? +1 : 0 ~~>
+                // ix -= (t >= stop) ? -1 : 0
+                ix = p->sub(ix, p->gte(t, p->uniformF(uniforms->pushF(stop))));
+            }
+            // TODO: we could skip any of the dummy stops GradientShaderBase's ctor added
+            // to ensure the full [0,1] span is covered.  This linear search doesn't need
+            // them for correctness, and it'd be up to two fewer stops to check.
+            // N.B. we do still need those stops for the fOrigPos == nullptr direct math path.
+        }
+
+        // A scale factor and bias for each lane: 8 floats per FB, hence the ix<<3 below.
+        // TODO: simpler, faster, tidier to push 8 uniform pointers, one for each struct lane?
+        ix = p->shl(ix, 3);            skvm::F32 Fr = p->bit_cast(p->gather32(fbs, ix));
+        ix = p->add(ix, p->splat(1));  skvm::F32 Fg = p->bit_cast(p->gather32(fbs, ix));
+        ix = p->add(ix, p->splat(1));  skvm::F32 Fb = p->bit_cast(p->gather32(fbs, ix));
+        ix = p->add(ix, p->splat(1));  skvm::F32 Fa = p->bit_cast(p->gather32(fbs, ix));
+
+        ix = p->add(ix, p->splat(1));  skvm::F32 Br = p->bit_cast(p->gather32(fbs, ix));
+        ix = p->add(ix, p->splat(1));  skvm::F32 Bg = p->bit_cast(p->gather32(fbs, ix));
+        ix = p->add(ix, p->splat(1));  skvm::F32 Bb = p->bit_cast(p->gather32(fbs, ix));
+        ix = p->add(ix, p->splat(1));  skvm::F32 Ba = p->bit_cast(p->gather32(fbs, ix));
+
+        // This is what we've been building towards!
+        *r = p->mad(t, Fr, Br);
+        *g = p->mad(t, Fg, Bg);
+        *b = p->mad(t, Fb, Bb);
+        *a = p->mad(t, Fa, Ba);
+    }
+
+    // If we interpolated unpremul, premul now to match our output convention.
+    if (0 == (fGradFlags & SkGradientShader::kInterpolateColorsInPremul_Flag)
+            && !fColorsAreOpaque) {
+        p->premul(r,g,b,*a);
+    }
+
+    // Mask away any pixels that we tried to sample outside the bounds in kDecal.
+    if (fTileMode == SkTileMode::kDecal) {
+        skvm::I32 in_bounds = p->eq(t, p->clamp(t, p->splat(0.0f), p->splat(1.0f)));  // t equals its clamp only inside [0,1]
+        *r = p->bit_cast(p->bit_and(in_bounds, p->bit_cast(*r)));
+        *g = p->bit_cast(p->bit_and(in_bounds, p->bit_cast(*g)));
+        *b = p->bit_cast(p->bit_and(in_bounds, p->bit_cast(*b)));
+        *a = p->bit_cast(p->bit_and(in_bounds, p->bit_cast(*a)));
+    }
+
+    return true;
+}
+
 
 bool SkGradientShaderBase::isOpaque() const {
     return fColorsAreOpaque && (this->getTileMode() != SkTileMode::kDecal);
diff --git a/src/shaders/gradients/SkGradientShaderPriv.h b/src/shaders/gradients/SkGradientShaderPriv.h
index b362e90..66a3f1a 100644
--- a/src/shaders/gradients/SkGradientShaderPriv.h
+++ b/src/shaders/gradients/SkGradientShaderPriv.h
@@ -78,9 +78,18 @@
 
     bool onAppendStages(const SkStageRec&) const override;
 
+    bool onProgram(skvm::Builder* p,
+                   const SkMatrix& ctm, const SkMatrix* localM,
+                   SkFilterQuality quality, SkColorSpace* dstCS,
+                   skvm::Uniforms* uniforms, SkArenaAlloc* alloc,
+                   skvm::F32 x, skvm::F32 y,
+                   skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32* a) const override;
+
     virtual void appendGradientStages(SkArenaAlloc* alloc, SkRasterPipeline* tPipeline,
                                       SkRasterPipeline* postPipeline) const = 0;
 
+    virtual bool transformT(skvm::Builder*, skvm::F32* t) const { return false; }  // default false makes onProgram() return false
+
     template <typename T, typename... Args>
     static Context* CheckedMakeContext(SkArenaAlloc* alloc, Args&&... args) {
         auto* ctx = alloc->make<T>(std::forward<Args>(args)...);
diff --git a/src/shaders/gradients/SkLinearGradient.cpp b/src/shaders/gradients/SkLinearGradient.cpp
index 98f15cf..ed45497 100644
--- a/src/shaders/gradients/SkLinearGradient.cpp
+++ b/src/shaders/gradients/SkLinearGradient.cpp
@@ -75,6 +75,8 @@
     // No extra stage needed for linear gradients.
 }
 
+bool SkLinearGradient::transformT(skvm::Builder*, skvm::F32*) const { return true; }  // leaves t untouched; emits no ops
+
 SkShader::GradientType SkLinearGradient::asAGradient(GradientInfo* info) const {
     if (info) {
         commonAsAGradient(info);
diff --git a/src/shaders/gradients/SkLinearGradient.h b/src/shaders/gradients/SkLinearGradient.h
index fff6493..08c937a 100644
--- a/src/shaders/gradients/SkLinearGradient.h
+++ b/src/shaders/gradients/SkLinearGradient.h
@@ -29,6 +29,8 @@
     void appendGradientStages(SkArenaAlloc* alloc, SkRasterPipeline* tPipeline,
                               SkRasterPipeline* postPipeline) const final;
 
+    bool transformT(skvm::Builder*, skvm::F32* t) const final;
+
 
 private:
     SK_FLATTENABLE_HOOKS(SkLinearGradient)