src/gpu/graphite/sparse_strips/Flatten.cpp - skia - Git at Google

 /*
  * Copyright 2026 Google LLC
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
 #include "src/gpu/graphite/sparse_strips/Flatten.h"

 #include "src/base/SkVx.h"
 #include "src/gpu/graphite/sparse_strips/Polyline.h"

 namespace skgpu::graphite {

 namespace {

 // An intermediate helper struct used to pass values between anonymous namespace functions and
 // Flatten class functions. Used for flattenning both quads and cubics.
 struct FlattenParams {
     // The starting value of the parabolic integral for each quadratic segment. Maps the start of
     // the curve into uniform arc-length space.
     double fA0;
     // The delta of the parabolic integral (a2 - a0) for each quadratic. Used to linearly
     // interpolate the integral 'a' across the segment.
     double fDa;
     // Inverse integral at fA0. Used to anchor the mapped parametric value back to 0.
     double fU0;
     // Normalization factor (1.0 / (u2 - u0)) to scale the final 't' into the [0, 1] range.
     double fUScale;
     // Total integrated curvature/error metric for the segment. Used to compute the required number
     // of subdivisions.
     double fCurvatureIntegral;
 };

 SK_ALWAYS_INLINE double approx_parabola_integral(double x) {
     constexpr double kD = 0.67;
     constexpr double kDPow4 = kD * kD * kD * kD;
     return x / ( 1.0 - kD + std::sqrt(std::sqrt(kDPow4 + 0.25 * x * x)));
 }

 SK_ALWAYS_INLINE double approx_parabola_inv_integral(double x) {
     constexpr double kB = 0.39;
     return x * (1.0 - kB + std::sqrt(kB * kB + 0.25 * x * x));
 }

 /*
  * Determing the number of quads from a cubic:
  *
  * 1. Error Budgeting (kCubicAccuracy):
  *    Flattening cubics happens in two steps: Cubic -> Quads, then Quads -> Lines. If the first step
  *    uses the full visual tolerance, the final lines will look jagged. kCubicAccuracy allocates
  *    only 10% (kCubicErrTolerance) of the total allowed pixel error to the Cubic -> Quads steps,
  *    saving 90% (kQuadToLineTolFromCubic) for the final lines.
  *
  * 2. Wang's Formula and the Vector Error (errV):
  *    Wang's formula gives the minimum number of evenly spaced (in parametric space) line segments
  *    that a bezier curve of degree "n" must be chopped into to guarantee all lines stay within a
  *    distance of "1/precision" pixels from the true curve:
  *
  *    maxLength = max([length(p[i+2] - 2p[i+1] + p[i]) for (0 <= i <= n-2)])
  *    numParametricSegments = sqrt(maxLength * precision * n*(n - 1)/8)
  *
  *    (Goldman, Ron. (2003). 5.6.3 Wang's Formula. "Pyramid Algorithms: A Dynamic Programming
  *    Approach to Curves and Surfaces for Geometric Modeling". Morgan Kaufmann Publishers.)
  *
  *    Since we are flattening a cubic to a *quadratic*, we shift the math up one degree and evaluate
  *    the third derivative instead: (3*P2 - P3) - (3*P1 - P0). If this evaluates to zero, the second
  *    derivative is constant, meaning the curve is a perfect parabola. We use this expression 'errV'
  *    as a measure of how much the cubic "bulges" away from an ideal quadratic shape.
  *
  * 3. Error Quantization and Lookup Table
  *    From Wang's formula, the maximum geometric error of the un-chopped cubic is
  *    ||errV|| / (12*sqrt(3)). By Taylor's theorem, if we chop the curve into n segments, the
  *    approximation error shrinks cubically (Base Error / n^3). Thus, to satisfy our desired
  *    tolerance, we must find the minimum n where:
  *
  *    ((||errV|| / (12*sqrt(3)) / n^3) <= kCubicErrTolerance.
  *
  *    To avoid calling sqrt, we square the equation: (12*sqrt(3))^2 = 432. Correspondingly,
  *    (n^3)^2 = n^6, so errDiv scales at n^6. To avoid calculating 6th roots, and because we know we
  *    can only have an integer number of subdivisions, we precompute a lookup table (kPowSixLut) of
  *    6th powers to find the required number of subdivisions.
  *
  * 4. Subdivision limit:
  *    As a guard against pathological inputs, and to avoid dynamic memory allocation, we bound the
  *    maximum number of quads that can be produced by a cubic. If errDiv exceeds the table, the
  *    function returns kMaxQuadsFromCubic.
  */
 SK_ALWAYS_INLINE uint32_t estimate_num_quads_from_cubic(const SkPoint pts[4]) {
     // (12 * sqrt(3))^2 * kCubicErrTolerance^2, inverted to avoid division
     constexpr double kWangErrThreshold =
             1.0 / (432.0 * Flatten::kCubicErrTolerance * Flatten::kCubicErrTolerance);

     // A cubic is a perfect quadratic if (3*P1 - P0) == (3*P2 - P3)
     SkPoint leftQuadCtrlX2 = pts[1] * 3.0f - pts[0];
     SkPoint rightQuadCtrlX2 = pts[2] * 3.0f - pts[3];
     SkPoint errV = rightQuadCtrlX2 - leftQuadCtrlX2;

     // Squared error distance
     double errSq = SkPoint::DotProduct(errV, errV);

     // The relative error mapped into n^6
     double errDiv = errSq * kWangErrThreshold;

     constexpr double kPowSixLut[Flatten::kMaxQuadsFromCubic] = {
             1.0,
             64.0,
             729.0,
             4096.0,
             15625.0,
             46656.0,
             117649.0,
             262144.0,
             531441.0,
             1000000.0,
             1771561.0,
             2985984.0,
             4826809.0,
             7529536.0,
             11390625.0,
             16777216.0,
     };

     for (uint32_t i = 0; i < Flatten::kMaxQuadsFromCubic; ++i) {
         if (errDiv <= kPowSixLut[i]) {
             return i + 1;
         }
     }
     return Flatten::kMaxQuadsFromCubic;
 }

 SK_ALWAYS_INLINE FlattenParams estimate_lines_from_quad(const SkPoint pts[3],
                                                         double kSqrtErrTolerance) {
     SkPoint d01 = pts[1] - pts[0];
     SkPoint d12 = pts[2] - pts[1];
     SkPoint dd = d01 - d12;

     double cross = SkPoint::CrossProduct(pts[2] - pts[0], dd);
     // If the quad's points are sufficiently collinear, we simplify it to a single line spanning
     // the endpoints to prevent division by zero.
     //
     // For perfectly collinear quads, this produces a correct image; in a filled path, the
     // collapsed control point adds no area. While this shortcut is technically incorrect for
     // "needle-thin" nearly collinear quads, it is extremely unlikely such a path would enclose
     // an MSAA subsample point, rendering extra handling unnecessary.
     if (std::abs(cross) < Flatten::kEpsilonD) {
         return {0.0, 0.0, 0.0, 1.0, 0.0};
     }

     double x0 = SkPoint::DotProduct(d01, dd) / cross;
     double x2 = SkPoint::DotProduct(d12, dd) / cross;
     double scale = std::abs(cross / (dd.length() * (x2 - x0)));

     double a0 = approx_parabola_integral(x0);
     double a2 = approx_parabola_integral(x2);
     double da = a2 - a0;
     double val = 0.0;

     if (std::isfinite(scale)) {
         double absDa = std::abs(da);
         double sqrtScale = std::sqrt(scale);

         if (std::signbit(x0) == std::signbit(x2)) {
             val = absDa * sqrtScale;
         } else {
             double xMin = kSqrtErrTolerance / sqrtScale;
             val = kSqrtErrTolerance * absDa / approx_parabola_integral(xMin);
         }
     }

     double u0 = approx_parabola_inv_integral(a0);
     double u2 = approx_parabola_inv_integral(a2);
     double uScale = 1.0 / (u2 - u0);

     return {a0, da, u0, uScale, val};
 }

 SK_ALWAYS_INLINE double determine_quad_subdiv_t(const FlattenParams& params, double x) {
     double a = params.fA0 + params.fDa * x;
     double u = approx_parabola_inv_integral(a);
     return (u - params.fU0) * params.fUScale;
 }

 SK_ALWAYS_INLINE SkPoint eval_quad_scalar(const SkPoint pts[3], float t) {
     float mt = 1.0f - t;
     return pts[0] * (mt * mt) + pts[1] * (2.0f * mt * t) + pts[2] * (t * t);
 }

 SK_ALWAYS_INLINE SkPoint eval_cubic_scalar(const SkPoint pts[4], float t) {
     float mt = 1.0f - t;
     float mt2 = mt * mt;
     float t2 = t * t;
     return pts[0] * (mt2 * mt) + pts[1] * (3.0f * mt2 * t) + pts[2] * (3.0f * mt * t2) +
            pts[3] * (t2 * t);
 }

 template <int N>
 SK_ALWAYS_INLINE bool is_completely_culled(const SkPoint pts[N], float width, float height) {
     bool top = true;
     bool right = true;
     bool bottom = true;
     for (int i = 0; i < N; ++i) {
         top    &= (pts[i].fY < 0.0f);
         right  &= (pts[i].fX > width);
         bottom &= (pts[i].fY > height);
     }
     return top || right || bottom;
 }

 template <int N>
 SK_ALWAYS_INLINE bool is_completely_left(const SkPoint pts[N]) {
     bool left = true;
     for (int i = 0; i < N; ++i) {
         left &= (pts[i].fX < 0.0f);
     }
     return left;
 }

 SK_ALWAYS_INLINE bool is_within_dist_sq(SkPoint p, SkPoint a, SkPoint b, float kErrTolerance) {
     SkPoint ab = b - a;
     SkPoint ap = p - a;
     float lenSq = SkPoint::DotProduct(ab, ab);
     float tNum = SkPoint::DotProduct(ap, ab);

     if (lenSq == 0.0f || tNum <= 0.0f) {
         return SkPoint::DotProduct(ap, ap) <= kErrTolerance;
     }
     if (tNum >= lenSq) {
         SkPoint bp = p - b;
         return SkPoint::DotProduct(bp, bp) <= kErrTolerance;
     }

     // Lagrange's identity: apSq*lenSq - tNum*tNum == cross^2
     float cross = SkPoint::CrossProduct(ap, ab);
     return (cross * cross) <= (kErrTolerance * lenSq);
 }

 // SIMD constants for convenience
 static const skvx::float4 kIotaLo(0.0f, 0.0f, 2.0f, 2.0f);
 static const skvx::float4 kIotaHi(1.0f, 1.0f, 3.0f, 3.0f);
 static const skvx::float8 kIota = skvx::join(kIotaLo, kIotaHi);
 static const skvx::float8 kIota2 =
         skvx::join(skvx::float4(0.0f, 0.0f, 1.0f, 1.0f), skvx::float4(2.0f, 2.0f, 3.0f, 3.0f));

 SK_ALWAYS_INLINE skvx::float8 splat_pt_simd(SkPoint pt) {
     skvx::float4 xyxy(pt.fX, pt.fY, pt.fX, pt.fY);
     return skvx::join(xyxy, xyxy);
 }

 SK_ALWAYS_INLINE skvx::float4 approx_parabola_integral_simd(skvx::float4 x) {
     constexpr float kD = 0.67f;
     constexpr float kDPow4 = kD * kD * kD * kD;
     skvx::float4 x2 = x * x;
     skvx::float4 temp =
             skvx::sqrt(skvx::sqrt(skvx::fma(x2, skvx::float4(0.25f), skvx::float4(kDPow4))));
     skvx::float4 denom = temp + (1.0f - kD);

     return x / denom;
 }

 SK_ALWAYS_INLINE skvx::float8 approx_parabola_inv_integral_simd(skvx::float8 x) {
     constexpr float kB = 0.39f;
     constexpr float kOneMinusB = 1.0f - kB;
     constexpr float kB2 = kB * kB;

     skvx::float8 x2 = x * x;
     skvx::float8 temp = skvx::sqrt(skvx::fma(x2, skvx::float8(0.25f), skvx::float8(kB2)));
     skvx::float8 factor = temp + kOneMinusB;

     return x * factor;
 }

 SK_ALWAYS_INLINE skvx::int4 is_finite_simd(skvx::float4 x) {
     // x is guaranteed positive area/length, no abs() needed to clear sign bit
     return sk_bit_cast<skvx::int4>(x) < skvx::int4(0x7f800000);
 }

 template <bool kIsIdentity,
           typename ProcessQuadFn,
           typename ProcessConicFn,
           typename ProcessCubicFn>
 SK_ALWAYS_INLINE void processPathsImpl(const SkPath& path,
                                        const SkMatrix& ctm,
                                        Polyline* polyline,
                                        ProcessQuadFn&& processQuad,
                                        ProcessConicFn&& processConic,
                                        ProcessCubicFn&& processCubic) {
     bool closed = true;
     SkPoint startPt = {0, 0};
     SkPoint lastPt = {0, 0};

     SkPath::Iter iter(path, false);
     SkPoint localPts[4];
     SkPoint mappedPts[4];
     SkPath::Verb verb;

     while ((verb = iter.next(localPts)) != SkPath::kDone_Verb) {
         switch (verb) {
             case SkPath::kMove_Verb: {
                 const SkPoint* pts = localPts;
                 if constexpr (!kIsIdentity) {
                     ctm.mapPoints(SkSpan(mappedPts, 1), SkSpan(localPts, 1));
                     pts = mappedPts;
                 }

                 if (!closed && lastPt != startPt) {
                     polyline->appendPoint(startPt);
                 }
                 polyline->appendSentinel();

                 closed = false;
                 lastPt = pts[0];
                 startPt = pts[0];
                 polyline->appendPoint(pts[0]);
                 break;
             }
             case SkPath::kLine_Verb: {
                 SkASSERT(!closed);
                 const SkPoint* pts = localPts;
                 if constexpr (!kIsIdentity) {
                     ctm.mapPoints(SkSpan(mappedPts, 2), SkSpan(localPts, 2));
                     pts = mappedPts;
                 }

                 lastPt = pts[1];
                 polyline->appendPoint(pts[1]);
                 break;
             }
             case SkPath::kQuad_Verb: {
                 SkASSERT(!closed);
                 const SkPoint* pts = localPts;
                 if constexpr (!kIsIdentity) {
                     ctm.mapPoints(SkSpan(mappedPts, 3), SkSpan(localPts, 3));
                     pts = mappedPts;
                 }

                 processQuad(pts);
                 lastPt = pts[2];
                 break;
             }
             case SkPath::kConic_Verb: {
                 SkASSERT(!closed);
                 const SkPoint* pts = localPts;
                 if constexpr (!kIsIdentity) {
                     ctm.mapPoints(SkSpan(mappedPts, 3), SkSpan(localPts, 3));
                     pts = mappedPts;
                 }

                 processConic(pts, iter.conicWeight());
                 lastPt = pts[2];
                 break;
             }
             case SkPath::kCubic_Verb: {
                 SkASSERT(!closed);
                 const SkPoint* pts = localPts;
                 if constexpr (!kIsIdentity) {
                     ctm.mapPoints(SkSpan(mappedPts, 4), SkSpan(localPts, 4));
                     pts = mappedPts;
                 }

                 processCubic(pts);
                 lastPt = pts[3];
                 break;
             }
             case SkPath::kClose_Verb: {
                 closed = true;
                 if (lastPt != startPt) {
                     polyline->appendPoint(startPt);
                 }
                 break;
             }
             default:
                 break;
         }
     }

     if (!closed && lastPt != startPt) {
         polyline->appendPoint(startPt);
     }

     polyline->appendSentinel();
 }

 }  // anonymous namespace

 SK_ALWAYS_INLINE void Flatten::flattenQuadScalar(const SkPoint pts[3], Polyline* polyline) {
     FlattenParams params = estimate_lines_from_quad(pts, kSqrtQuadTolerance);
     // Calculate the minimum number of linear subdivisions needed to stay within the visual
     // tolerance. The maximum geometric error of a straight line approximating a parabola is
     // (a * W^2) / 4. Taking the square root makes this easier: sqrt(Error) = (sqrt(a) * W) / 2
     // since fCurvatureIntegral = sqrt(a) * W_total. So the required number of segments 'n' to keep
     // the error per segment under `kSqrtQuadTolerance` is:
     // n = fCurvatureIntegral / (kSqrtQuadTolerance * 2).
     uint32_t numSegments = std::max(1u, static_cast<uint32_t>(
             std::ceil(0.5 / kSqrtQuadTolerance * params.fCurvatureIntegral)));
     double step = 1.0 / numSegments;

     for (uint32_t i = 1; i < numSegments; ++i) {
         double u = i * step;
         double t = determine_quad_subdiv_t(params, u);
         polyline->appendPoint(eval_quad_scalar(pts, static_cast<float>(t)));
     }
     polyline->appendPoint(pts[2]);
 }

 SK_ALWAYS_INLINE uint32_t Flatten::flattenCubicScalar(const SkPoint pts[4]) {
     uint32_t numQuads = estimate_num_quads_from_cubic(pts);
     fContext.fNumQuads = numQuads;
     static constexpr float kHalfOverN[kMaxQuadsFromCubic + 1] = {
         0.0f,
         0.5f/1.0f, 0.5f/2.0f, 0.5f/3.0f, 0.5f/4.0f,
         0.5f/5.0f, 0.5f/6.0f, 0.5f/7.0f, 0.5f/8.0f,
         0.5f/9.0f, 0.5f/10.f, 0.5f/11.f, 0.5f/12.f,
         0.5f/13.f, 0.5f/14.f, 0.5f/15.f, 0.5f/16.f
     };
     float dt = kHalfOverN[numQuads];

     float curvatureIntegralSum = 0.0f;

     for (uint32_t i = 0; i < numQuads; ++i) {
         float t0 = (2 * i) * dt;
         float t1 = (2 * i + 1) * dt;
         float t2 = (2 * i + 2) * dt;

         SkPoint p0 = eval_cubic_scalar(pts, t0);
         SkPoint pOneHalf = eval_cubic_scalar(pts, t1);
         SkPoint p2 = eval_cubic_scalar(pts, t2);
         SkPoint p1 = pOneHalf * 2.0f - (p0 + p2) * 0.5f;

         fContext.fEvenPts[i] = p0;
         fContext.fOddPts[i] = p1;
         fContext.fEvenPts[i + 1] = p2;

         SkPoint quad[3] = {p0, p1, p2};
         FlattenParams params = estimate_lines_from_quad(quad, kSqrtQuadFromCubicTol);

         fContext.fA0[i] = params.fA0;
         fContext.fDa[i] = params.fDa;
         fContext.fU0[i] = params.fU0;
         fContext.fUScale[i] = params.fUScale;

         fContext.fCurvatureIntegral[i] =
                 std::max(static_cast<float>(params.fCurvatureIntegral), kEpsilonF);

         curvatureIntegralSum += params.fCurvatureIntegral;
     }

     uint32_t numSegments = std::max<uint32_t>(
         1u, static_cast<uint32_t>(std::ceil(0.5 * curvatureIntegralSum / kSqrtQuadFromCubicTol)));
     uint32_t targetLen = numSegments + 4;
     fContext.fFlattenedCubics.resize(targetLen);

     float step = curvatureIntegralSum / numSegments;
     float stepRecip = 1.0f / step;
     float cumulativeCurvature = 0.0f;
     uint32_t lastN = 0;
     float x0Base = 0.0f;

     // Flatten each quad produced by the cubic.
     for (uint32_t i = 0; i < numQuads; ++i) {
         cumulativeCurvature += fContext.fCurvatureIntegral[i];
         float thisN = cumulativeCurvature * stepRecip;
         float thisNNext = 1.0f + std::floor(thisN);
         uint32_t dn = static_cast<uint32_t>(thisNNext) - lastN; // thisNNext always > lastN

         if (dn > 0) {
             float dx = step / fContext.fCurvatureIntegral[i];
             float x0 = x0Base * dx;

             SkPoint p0 = fContext.fEvenPts[i];
             SkPoint p1 = fContext.fOddPts[i];
             SkPoint p2 = fContext.fEvenPts[i + 1];

             float a0 = fContext.fA0[i];
             float da = fContext.fDa[i];
             float u0 = fContext.fU0[i];
             float uScale = fContext.fUScale[i];

             for (uint32_t j = 0; j < dn; ++j) {
                 float x = x0 + j * dx;
                 float a = a0 + da * x;
                 float u = approx_parabola_inv_integral(a);
                 float t = (u - u0) * uScale;

                 float mt = 1.0f - t;
                 SkPoint p = p0 * (mt * mt) + p1 * (2.0f * t * mt) + p2 * (t * t);
                 fContext.fFlattenedCubics[lastN + j] = p;
             }
         }
         x0Base = thisNNext - thisN;
         lastN = static_cast<uint32_t>(thisNNext);
     }

     fContext.fFlattenedCubics[numSegments] = fContext.fEvenPts[numQuads];
     return numSegments + 1;
 }

 SK_ALWAYS_INLINE void Flatten::flattenQuadSimd(const SkPoint pts[3], Polyline* polyline) {
     FlattenParams params = estimate_lines_from_quad(pts, kSqrtQuadTolerance);
     uint32_t numSegments = std::max(1u, static_cast<uint32_t>(
             std::ceil(0.5 / kSqrtQuadTolerance * params.fCurvatureIntegral)));

     float step = 1.0f / numSegments;

     skvx::float8 p0 = splat_pt_simd(pts[0]);
     skvx::float8 p1 = splat_pt_simd(pts[1]);
     skvx::float8 p2 = splat_pt_simd(pts[2]);

     skvx::float8 A = skvx::fma(p1, skvx::float8(-2.0f), p0) + p2;
     skvx::float8 B = (p1 - p0) * 2.0f;
     skvx::float8 C = p0;

     skvx::float8 vDa = skvx::float8(static_cast<float>(params.fDa));
     skvx::float8 vA0 = skvx::float8(static_cast<float>(params.fA0));
     skvx::float8 vU0 = skvx::float8(static_cast<float>(params.fU0));
     skvx::float8 vUScale = skvx::float8(static_cast<float>(params.fUScale));

     float out[8];
     for (uint32_t i = 1; i < numSegments; i += 4) {
         skvx::float8 x = (kIota2 + skvx::float8(static_cast<float>(i))) * step;

         skvx::float8 a = skvx::fma(vDa, x, vA0);
         skvx::float8 u = approx_parabola_inv_integral_simd(a);
         skvx::float8 t = (u - vU0) * vUScale;

         skvx::float8 p = skvx::fma(skvx::fma(A, t, B), t, C);
         p.store(out);

         // The first point is always valid if we entered the loop
         polyline->appendPoint({out[0], out[1]});
         // Conditionally append the tail results
         if (i + 1 < numSegments) polyline->appendPoint({out[2], out[3]});
         if (i + 2 < numSegments) polyline->appendPoint({out[4], out[5]});
         if (i + 3 < numSegments) polyline->appendPoint({out[6], out[7]});
     }

     // Always append the exact endpoint
     polyline->appendPoint(pts[2]);
 }

 SK_ALWAYS_INLINE void Flatten::evalCubicsSimd(const SkPoint pts[4], uint32_t numQuads) {
     fContext.fNumQuads = numQuads;
     float dt = 0.5f / numQuads;

     skvx::float8 p0 = splat_pt_simd(pts[0]);
     skvx::float8 p1 = splat_pt_simd(pts[1]);
     skvx::float8 p2 = splat_pt_simd(pts[2]);
     skvx::float8 p3 = splat_pt_simd(pts[3]);

     skvx::float8 A = skvx::fma(p1 - p2, skvx::float8(3.0f), p3 - p0);
     skvx::float8 B = skvx::fma(p1, skvx::float8(-2.0f), p0 + p2) * 3.0f;
     skvx::float8 C = (p1 - p0) * 3.0f;
     skvx::float8 D = p0;

     skvx::float8 step = kIota * dt;
     skvx::float8 t = step;
     skvx::float8 tInc(4.0f * dt);

     float* evenPts = reinterpret_cast<float*>(fContext.fEvenPts.data());
     float* oddPts = reinterpret_cast<float*>(fContext.fOddPts.data());

     uint32_t loopCount = (numQuads + 1) / 2;
     for (uint32_t i = 0; i < loopCount; ++i) {
         skvx::float8 evaluated = skvx::fma(skvx::fma(skvx::fma(A, t, B), t, C), t, D);
         evaluated.lo.store(evenPts + i * 4);
         evaluated.hi.store(oddPts + i * 4);
         t = t + tInc;
     }

     p3.store(evenPts + numQuads * 2);
 }

 SK_ALWAYS_INLINE void Flatten::estimateLinesFromQuadSimd() {
     uint32_t numQuads = fContext.fNumQuads;
     const float* evenPts = reinterpret_cast<const float*>(fContext.fEvenPts.data());
     float* oddPts = reinterpret_cast<float*>(fContext.fOddPts.data());

     uint32_t chunks = (numQuads + 3) / 4;
     for (uint32_t i = 0; i < chunks; ++i) {
         skvx::float8 p0 = skvx::float8::Load(evenPts + i * 8);
         skvx::float8 pOneHalf = skvx::float8::Load(oddPts + i * 8);
         skvx::float8 p2 = skvx::float8::Load(evenPts + i * 8 + 2);

         skvx::float8 p1 = skvx::fma(pOneHalf, skvx::float8(2.0f), (p0 + p2) * -0.5f);
         p1.store(oddPts + i * 8);

         skvx::float8 d01 = p1 - p0;
         skvx::float8 d12 = p2 - p1;

         skvx::float4 d01x = skvx::shuffle<0, 2, 4, 6>(d01);
         skvx::float4 d01y = skvx::shuffle<1, 3, 5, 7>(d01);
         skvx::float4 d12x = skvx::shuffle<0, 2, 4, 6>(d12);
         skvx::float4 d12y = skvx::shuffle<1, 3, 5, 7>(d12);

         skvx::float4 ddx = d01x - d12x;
         skvx::float4 ddy = d01y - d12y;

         skvx::float4 cross = (d01x + d12x) * ddy - (d01y + d12y) * ddx;
         skvx::float4 absCross = skvx::abs(cross);
         skvx::int4 collinearMask = skvx::cast<int32_t>(absCross < kEpsilonF);
         skvx::float4 invCross = 1.0f / cross;

         skvx::float4 x0 = skvx::fma(d01x, ddx, d01y * ddy) * invCross;
         skvx::float4 x2 = skvx::fma(d12x, ddx, d12y * ddy) * invCross;
         skvx::float4 ddSq = skvx::fma(ddx, ddx, ddy * ddy);
         skvx::float4 ddHypot = skvx::sqrt(ddSq);
         skvx::float4 scale = (cross * cross) / (ddHypot * ddSq);

         skvx::float4 a0 = approx_parabola_integral_simd(x0);
         skvx::float4 a2 = approx_parabola_integral_simd(x2);
         skvx::float4 da = a2 - a0;
         skvx::float4 absDa = skvx::abs(da);
         skvx::float4 sqrtScale = skvx::sqrt(scale);

         skvx::int4 signX0 = sk_bit_cast<skvx::int4>(x0) & 0x80000000;
         skvx::int4 signX2 = sk_bit_cast<skvx::int4>(x2) & 0x80000000;
         skvx::int4 mask = (signX0 == signX2);

         skvx::float4 nonCusp = absDa * sqrtScale;
         skvx::float4 xMin = static_cast<float>(kSqrtQuadFromCubicTol) / sqrtScale;
         skvx::float4 approxInt = approx_parabola_integral_simd(xMin);
         skvx::float4 cusp = (static_cast<float>(kSqrtQuadFromCubicTol) * absDa) / approxInt;

         skvx::float4 valRaw = skvx::if_then_else(mask, nonCusp, cusp);
         valRaw = skvx::if_then_else(collinearMask, skvx::float4(0.0f), valRaw);
         skvx::int4 finiteMask = is_finite_simd(valRaw);
         skvx::float4 val = skvx::if_then_else(finiteMask, valRaw, skvx::float4(0.0f));

         skvx::float8 u0U2 = approx_parabola_inv_integral_simd(skvx::join(a0, a2));
         skvx::float4 u0 = u0U2.lo;
         skvx::float4 u2 = u0U2.hi;
         skvx::float4 uScale = 1.0f / (u2 - u0);
         u0 = skvx::if_then_else(collinearMask, skvx::float4(0.0f), u0);
         uScale = skvx::if_then_else(collinearMask, skvx::float4(1.0f), uScale);

         a0.store(fContext.fA0.data() + i * 4);
         da.store(fContext.fDa.data() + i * 4);
         u0.store(fContext.fU0.data() + i * 4);
         uScale.store(fContext.fUScale.data() + i * 4);
         val.store(fContext.fCurvatureIntegral.data() + i * 4);
     }
 }

 SK_ALWAYS_INLINE void Flatten::outputLinesFromQuadSimd(
         uint32_t quadIdx, float x0, float dx, uint32_t numSegments, uint32_t startIdx) {
     skvx::float8 p0 = splat_pt_simd(fContext.fEvenPts[quadIdx]);
     skvx::float8 p1 = splat_pt_simd(fContext.fOddPts[quadIdx]);
     skvx::float8 p2 = splat_pt_simd(fContext.fEvenPts[quadIdx + 1]);

     skvx::float8 x = skvx::fma(kIota2, skvx::float8(dx), skvx::float8(x0));
     skvx::float8 da(fContext.fDa[quadIdx]);
     skvx::float8 a = skvx::fma(x, da, skvx::float8(fContext.fA0[quadIdx]));
     skvx::float8 aInc(4.0f * dx * fContext.fDa[quadIdx]);
     skvx::float8 uScale(fContext.fUScale[quadIdx]);
     skvx::float8 u0(fContext.fU0[quadIdx]);

     skvx::float8 A = skvx::fma(p1, skvx::float8(-2.0f), p0) + p2;
     skvx::float8 B = (p1 - p0) * 2.0f;
     skvx::float8 C = p0;

     uint32_t chunks = (numSegments + 3) / 4;
     for (uint32_t j = 0; j < chunks; ++j) {
         skvx::float8 u = approx_parabola_inv_integral_simd(a);
         skvx::float8 t = (u - u0) * uScale;
         skvx::float8 p = skvx::fma(skvx::fma(A, t, B), t, C);

         p.store(fContext.fFlattenedCubics.data() + startIdx + j * 4);
         a = a + aInc;
     }
 }

 uint32_t Flatten::flattenCubicSimd(const SkPoint pts[4]) {
     uint32_t numQuads = estimate_num_quads_from_cubic(pts);
     this->evalCubicsSimd(pts, numQuads);
     this->estimateLinesFromQuadSimd();

     float curvatureIntegralSum = 0.0f;
     for (uint32_t i = 0; i < numQuads; ++i) {
         float val = std::max(fContext.fCurvatureIntegral[i], static_cast<float>(kEpsilonF));
         fContext.fCurvatureIntegral[i] = val;
         curvatureIntegralSum += val;
     }

     uint32_t numSegments = std::max<uint32_t>(
             1,
             static_cast<uint32_t>(std::ceil(0.5f * curvatureIntegralSum / kSqrtQuadFromCubicTol)));
     uint32_t targetLen = numSegments + 4;
     fContext.fFlattenedCubics.resize(targetLen);

     float step = curvatureIntegralSum / numSegments;
     float stepRecip = 1.0f / step;
     float cumulativeCurvature = 0.0f;
     uint32_t lastN = 0;
     float x0Base = 0.0f;

     for (uint32_t i = 0; i < numQuads; ++i) {
         float val = fContext.fCurvatureIntegral[i];
         cumulativeCurvature += val;
         float thisN = cumulativeCurvature * stepRecip;
         float thisNNext = 1.0f + std::floor(thisN);
         uint32_t dn = static_cast<uint32_t>(thisNNext) - lastN;

         if (dn > 0) {
             float dx = step / val;
             float x0 = x0Base * dx;
             this->outputLinesFromQuadSimd(i, x0, dx, dn, lastN);
         }
         x0Base = thisNNext - thisN;
         lastN = static_cast<uint32_t>(thisNNext);
     }

     fContext.fFlattenedCubics[numSegments] = fContext.fEvenPts[numQuads];
     return numSegments + 1;
 }

 void Flatten::processPathsSimd(
         const SkPath& path, const SkMatrix& ctm, float width, float height, Polyline* polyline) {
     fContext.fFlattenedCubics.clear();

     auto processQuad = [this, width, height, polyline](const SkPoint pts[3]) {
         skvx::float4 X(pts[0].fX, pts[1].fX, pts[2].fX, pts[2].fX); // Duplicate last pt
         skvx::float4 Y(pts[0].fY, pts[1].fY, pts[2].fY, pts[2].fY);
         if (skvx::all(X > width) || skvx::all(Y < 0.0f) || skvx::all(Y > height)) {
             return;
         }
         if (skvx::all(X < 0.0f) ||
             is_within_dist_sq(pts[1], pts[0], pts[2], kQuadSubdivThreshold)) {
             polyline->appendPoint(pts[2]);
         } else {
             // Note: testing shows that the scalar function is *slightly* faster here, probably
             // because most quads don't produce enough segments to make simd worth it.
             this->flattenQuadScalar(pts, polyline);
         }
     };

     // TODO (thomsmit): this could probably simd-fied a little more.
     auto processConic = [this, width, height, polyline](const SkPoint pts[3], float weight) {
         skvx::float4 X(pts[0].fX, pts[1].fX, pts[2].fX, pts[2].fX); // Duplicate last pt
         skvx::float4 Y(pts[0].fY, pts[1].fY, pts[2].fY, pts[2].fY);
         if (skvx::all(X > width) || skvx::all(Y < 0.0f) || skvx::all(Y > height)) {
             return;
         }
         if (skvx::all(X < 0.0f) ||
             is_within_dist_sq(pts[1], pts[0], pts[2], kQuadSubdivThreshold)) {
             polyline->appendPoint(pts[2]);
         } else {
             const SkPoint* quadPts = fConicToQuad.computeQuads(pts, weight, kQuadErrTolerance);
             int quadCount = fConicToQuad.countQuads();
             for (int i = 0; i < quadCount; ++i) {
                 this->flattenQuadSimd(&quadPts[i * 2], polyline);
             }
         }
     };

     auto processCubic = [this, width, height, polyline](const SkPoint pts[4]) {
         skvx::float4 X(pts[0].fX, pts[1].fX, pts[2].fX, pts[3].fX);
         skvx::float4 Y(pts[0].fY, pts[1].fY, pts[2].fY, pts[3].fY);
         if (skvx::all(X > width) || skvx::all(Y < 0.0f) || skvx::all(Y > height)) {
             return;
         }
         if (skvx::all(X < 0.0f) ||
             (is_within_dist_sq(pts[1], pts[0], pts[3], kCubicSubdivThreshold) &&
              is_within_dist_sq(pts[2], pts[0], pts[3], kCubicSubdivThreshold))) {
             polyline->appendPoint(pts[3]);
         } else {
             uint32_t numSegments = this->flattenCubicSimd(pts);
             polyline->appendPoints(SkSpan(fContext.fFlattenedCubics.data() + 1, numSegments - 1));
         }
     };

     if (ctm.isIdentity()) {
         processPathsImpl<true>(path, ctm, polyline, processQuad, processConic, processCubic);
     } else {
         processPathsImpl<false>(path, ctm, polyline, processQuad, processConic, processCubic);
     }
 }

 void Flatten::processPathsScalar(
         const SkPath& path, const SkMatrix& ctm, float width, float height, Polyline* polyline) {
     fContext.fFlattenedCubics.clear();

     auto processQuad = [this, width, height, polyline](const SkPoint pts[3]) {
         if (is_completely_culled<3>(pts, width, height)) {
             // If the quad is completely top, right, or bottom of the viewport, cull.
             return;
         }
         if (is_completely_left<3>(pts) ||
             is_within_dist_sq(pts[1], pts[0], pts[2], kQuadSubdivThreshold)) {
             // If the quad is visually a line or completely left of the viewport, simplify.
             polyline->appendPoint(pts[2]);
         } else {
             this->flattenQuadScalar(pts, polyline);
         }
     };

     auto processConic = [this, width, height, polyline](const SkPoint pts[3], float weight) {
         if (is_completely_culled<3>(pts, width, height)) {
             // If the conic is completely top, right, or bottom of the viewport, cull.
             return;
         }
         if (is_completely_left<3>(pts) ||
             is_within_dist_sq(pts[1], pts[0], pts[2], kQuadSubdivThreshold)) {
             // If the conic is visually a line or completely left of the viewport, simplify.
             // Note: A low weight can produce a visually flat conic even if the control point is far
             // away, causing a false negative. This is acceptable as we fall back to subdivision.
             polyline->appendPoint(pts[2]);
         } else {
             const SkPoint* quadPts = fConicToQuad.computeQuads(pts, weight, kQuadErrTolerance);
             int quadCount = fConicToQuad.countQuads();
             for (int i = 0; i < quadCount; ++i) {
                 this->flattenQuadScalar(&quadPts[i * 2], polyline);
             }
         }
     };

     auto processCubic = [this, width, height, polyline](const SkPoint pts[4]) {
         if (is_completely_culled<4>(pts, width, height)) {
             // If the cubic is completely top, right, or bottom of the viewport, cull.
             return;
         }
         if (is_completely_left<4>(pts) ||
             (is_within_dist_sq(pts[1], pts[0], pts[3], kCubicSubdivThreshold) &&
              is_within_dist_sq(pts[2], pts[0], pts[3], kCubicSubdivThreshold))) {
             // If the cubic is visually a line or completely left of the viewport, simplify.
             polyline->appendPoint(pts[3]);
         } else {
             uint32_t numSegments = this->flattenCubicScalar(pts);
             polyline->appendPoints(SkSpan(fContext.fFlattenedCubics.data() + 1, numSegments - 1));
         }
     };

     if (ctm.isIdentity()) {
         processPathsImpl</*kIsIdentity=*/true>(
                 path, ctm, polyline, processQuad, processConic, processCubic);
     } else {
         processPathsImpl</*kIsIdentity=*/false>(
                 path, ctm, polyline, processQuad, processConic, processCubic);
     }
 }

 }  // namespace skgpu::graphite