sparse_strips/vello_common/src/flatten_simd.rs - external/github.com/linebender/vello - Git at Google

 // Copyright 2025 the Vello Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT

 //! This is a temporary module that contains a SIMD version of flattening of cubic curves, as
 //! well as some code that was copied from kurbo, which is needed to reimplement the
 //! full `flatten` method.

 use crate::flatten::TOL_2;
 #[cfg(not(feature = "std"))]
 use crate::kurbo::common::FloatFuncs as _;
 use crate::kurbo::{CubicBez, ParamCurve, PathEl, Point, QuadBez};
 use alloc::vec::Vec;
 use bytemuck::{Pod, Zeroable};
 use fearless_simd::*;

 /// The element of a path made of lines.
 ///
 /// Each subpath must start with a `MoveTo`. Closing of subpaths is not supported, and subpaths are
 /// not closed implicitly when a new subpath (with `MoveTo`) is started. It is expected that closed
 /// subpaths are watertight in the sense that the last `LineTo` matches exactly with the first
 /// `MoveTo`.
 ///
 /// This intentionally allows for non-watertight subpaths, as, e.g., lines that are fully outside
 /// of the viewport do not need to be drawn.
 ///
 /// See [`PathEl`] for a more general-purpose path element type.
 pub(crate) enum LinePathEl {
     MoveTo(Point),
     LineTo(Point),
 }

 // Unlike kurbo, which takes a closure with a callback for outputting the lines, we use a trait
 // instead. The reason is that this way the callback can be inlined, which is not possible with
 // a closure and turned out to have a noticeable overhead.
 pub(crate) trait Callback {
     fn callback(&mut self, el: LinePathEl);
 }

 /// See the docs for the kurbo implementation of flattening:
 /// <https://docs.rs/kurbo/latest/kurbo/fn.flatten.html>
 ///
 /// This version works using a similar approach but using f32x4/f32x8 SIMD instead.
 #[inline(always)]
 pub(crate) fn flatten<S: Simd>(
     simd: S,
     path: impl IntoIterator<Item = PathEl>,
     tolerance: f64,
     callback: &mut impl Callback,
     flatten_ctx: &mut FlattenCtx,
 ) {
     flatten_ctx.flattened_cubics.clear();

     let sqrt_tol = tolerance.sqrt();
     let mut closed = true;
     let mut start_pt = Point::ZERO;
     let mut last_pt = Point::ZERO;

     for el in path {
         match el {
             PathEl::MoveTo(p) => {
                 if !closed && last_pt != start_pt {
                     callback.callback(LinePathEl::LineTo(start_pt));
                 }
                 closed = false;
                 last_pt = p;
                 start_pt = p;
                 callback.callback(LinePathEl::MoveTo(p));
             }
             PathEl::LineTo(p) => {
                 debug_assert!(!closed, "Expected a `MoveTo` before a `LineTo`");
                 last_pt = p;
                 callback.callback(LinePathEl::LineTo(p));
             }
             PathEl::QuadTo(p1, p2) => {
                 debug_assert!(!closed, "Expected a `MoveTo` before a `QuadTo`");
                 let p0 = last_pt;
                 // An upper bound on the shortest distance of any point on the quadratic Bezier
                 // curve to the line segment [p0, p2] is 1/2 of the maximum of the
                 // endpoint-to-control-point distances.
                 //
                 // The derivation is similar to that for the cubic Bezier (see below). In
                 // short:
                 //
                 // q}(t) = B0(t) p0 + B1(t) p1 + B2(t) p2
                 // dist(q(t), [p0, p1]) <= B1(t) dist(p1, [p0, p1])
                 //                       = 2 (1-t)t dist(p1, [p0, p1]).
                 //
                 // The maximum occurs at t=1/2, hence
                 // max(dist(q(t), [p0, p1] <= 1/2 dist(p1, [p0, p1])).
                 //
                 // A cheap upper bound for dist(p1, [p0, p1]) is max(dist(p1, p0), dist(p1, p2)).
                 //
                 // The following takes the square to elide the square root of the Euclidean
                 // distance.
                 if f64::max((p1 - p0).hypot2(), (p1 - p2).hypot2()) <= 4. * TOL_2 {
                     callback.callback(LinePathEl::LineTo(p2));
                 } else {
                     let q = QuadBez::new(p0, p1, p2);
                     let params = q.estimate_subdiv(sqrt_tol);
                     let n = ((0.5 * params.val / sqrt_tol).ceil() as usize).max(1);
                     let step = 1.0 / (n as f64);
                     for i in 1..n {
                         let u = (i as f64) * step;
                         let t = q.determine_subdiv_t(&params, u);
                         let p = q.eval(t);
                         callback.callback(LinePathEl::LineTo(p));
                     }
                     callback.callback(LinePathEl::LineTo(p2));
                 }
                 last_pt = p2;
             }
             PathEl::CurveTo(p1, p2, p3) => {
                 debug_assert!(!closed, "Expected a `MoveTo` before a `CurveTo`");
                 let p0 = last_pt;
                 // An upper bound on the shortest distance of any point on the cubic Bezier
                 // curve to the line segment [p0, p3] is 3/4 of the maximum of the
                 // endpoint-to-control-point distances.
                 //
                 // With Bernstein weights Bi(t), we have
                 // c(t) = B0(t) p0 + B1(t) p1 + B2(t) p2 + B3(t) p3
                 // with t from 0 to 1 (inclusive).
                 //
                 // Through convexivity of the Euclidean distance function and the line segment,
                 // we have
                 // dist(c(t), [p0, p3]) <= B1(t) dist(p1, [p0, p3]) + B2(t) dist(p2, [p0, p3])
                 //                      <= B1(t) ||p1-p0|| + B2(t) ||p2-p3||
                 //                      <= (B1(t) + B2(t)) max(||p1-p0||, ||p2-p3|||)
                 //                       = 3 ((1-t)t^2 + (1-t)^2t) max(||p1-p0||, ||p2-p3||).
                 //
                 // The inner polynomial has its maximum of 1/4 at t=1/2, hence
                 // max(dist(c(t), [p0, p3])) <= 3/4 max(||p1-p0||, ||p2-p3||).
                 //
                 // The following takes the square to elide the square root of the Euclidean
                 // distance.
                 if f64::max((p0 - p1).hypot2(), (p3 - p2).hypot2()) <= 16. / 9. * TOL_2 {
                     callback.callback(LinePathEl::LineTo(p3));
                 } else {
                     let c = CubicBez::new(p0, p1, p2, p3);
                     let max = flatten_cubic_simd(simd, c, flatten_ctx, tolerance as f32);

                     for p in &flatten_ctx.flattened_cubics[1..max] {
                         callback.callback(LinePathEl::LineTo(Point::new(p.x as f64, p.y as f64)));
                     }
                 }
                 last_pt = p3;
             }
             PathEl::ClosePath => {
                 closed = true;
                 if last_pt != start_pt {
                     callback.callback(LinePathEl::LineTo(start_pt));
                 }
             }
         }
     }

     if !closed && last_pt != start_pt {
         callback.callback(LinePathEl::LineTo(start_pt));
     }
 }

 // The below methods are copied from kurbo and needed to implement flattening of normal quad curves.

 /// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
 ///
 /// This is used for flattening curves.
 fn approx_parabola_integral(x: f64) -> f64 {
     const D: f64 = 0.67;
     x / (1.0 - D + (D.powi(4) + 0.25 * x * x).sqrt().sqrt())
 }

 /// An approximation to the inverse parabola integral.
 fn approx_parabola_inv_integral(x: f64) -> f64 {
     const B: f64 = 0.39;
     x * (1.0 - B + (B * B + 0.25 * x * x).sqrt())
 }

 impl FlattenParamsExt for QuadBez {
     #[inline(always)]
     fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams {
         // Determine transformation to $y = x^2$ parabola.
         let d01 = self.p1 - self.p0;
         let d12 = self.p2 - self.p1;
         let dd = d01 - d12;
         let cross = (self.p2 - self.p0).cross(dd);
         let x0 = d01.dot(dd) * cross.recip();
         let x2 = d12.dot(dd) * cross.recip();
         let scale = (cross / (dd.hypot() * (x2 - x0))).abs();

         // Compute number of subdivisions needed.
         let a0 = approx_parabola_integral(x0);
         let a2 = approx_parabola_integral(x2);
         let val = if scale.is_finite() {
             let da = (a2 - a0).abs();
             let sqrt_scale = scale.sqrt();
             if x0.signum() == x2.signum() {
                 da * sqrt_scale
             } else {
                 // Handle cusp case (segment contains curvature maximum)
                 let xmin = sqrt_tol / sqrt_scale;
                 sqrt_tol * da / approx_parabola_integral(xmin)
             }
         } else {
             0.0
         };
         let u0 = approx_parabola_inv_integral(a0);
         let u2 = approx_parabola_inv_integral(a2);
         let uscale = (u2 - u0).recip();
         FlattenParams {
             a0,
             a2,
             u0,
             uscale,
             val,
         }
     }

     #[inline(always)]
     fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64 {
         let a = params.a0 + (params.a2 - params.a0) * x;
         let u = approx_parabola_inv_integral(a);
         (u - params.u0) * params.uscale
     }
 }

 trait FlattenParamsExt {
     fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams;
     fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64;
 }

 // Everything below is a SIMD implementation of flattening of cubic curves.
 // It's a combination of https://gist.github.com/raphlinus/5f4e9feb85fd79bafc72da744571ec0e
 // and https://gist.github.com/raphlinus/44e114fef2fd33b889383a60ced0129b.

 // TODO(laurenz): Perhaps we should get rid of this in the future and work directly with f32,
 // as it's the only reason we have to pull in proc_macros via the `derive` feature of bytemuck.
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 #[repr(C)]
 struct Point32 {
     x: f32,
     y: f32,
 }

 struct FlattenParams {
     a0: f64,
     a2: f64,
     u0: f64,
     uscale: f64,
     /// The number of `subdivisions * 2 * sqrt_tol`.
     val: f64,
 }

 /// This limit was chosen based on the pre-existing GitHub gist.
 /// This limit should not be hit in normal operation, but _might_ be hit for very large
 /// transforms.
 const MAX_QUADS: usize = 16;

 /// The context needed for flattening curves.
 #[derive(Default, Debug)]
 pub struct FlattenCtx {
     // The +4 is to encourage alignment; might be better to be explicit
     even_pts: [Point32; MAX_QUADS + 4],
     odd_pts: [Point32; MAX_QUADS],
     a0: [f32; MAX_QUADS],
     da: [f32; MAX_QUADS],
     u0: [f32; MAX_QUADS],
     uscale: [f32; MAX_QUADS],
     val: [f32; MAX_QUADS],
     n_quads: usize,
     /// Reusable buffer for flattened cubic points.
     flattened_cubics: Vec<Point32>,
 }

 #[inline(always)]
 fn is_finite_simd<S: Simd>(x: f32x4<S>) -> mask32x4<S> {
     let simd = x.simd;

     let x_abs = x.abs();
     let reinterpreted = u32x4::from_bytes(x_abs.to_bytes());
     simd.simd_lt_u32x4(reinterpreted, u32x4::splat(simd, 0x7f80_0000))
 }

 #[inline(always)]
 fn approx_parabola_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
     let simd = x.simd;

     const D: f32 = 0.67;
     const D_POWI_4: f32 = 0.201_511_2;

     let temp1 = f32x8::splat(simd, 0.25).madd(x * x, f32x8::splat(simd, D_POWI_4));
     let temp2 = temp1.sqrt();
     let temp3 = temp2.sqrt();
     let temp4 = f32x8::splat(simd, 1.0) - f32x8::splat(simd, D);
     let temp5 = temp4 + temp3;
     x / temp5
 }

 #[inline(always)]
 fn approx_parabola_integral_simd_x4<S: Simd>(x: f32x4<S>) -> f32x4<S> {
     let simd = x.simd;

     const D: f32 = 0.67;
     const D_POWI_4: f32 = 0.201_511_2;

     let temp1 = f32x4::splat(simd, 0.25).madd(x * x, f32x4::splat(simd, D_POWI_4));
     let temp2 = temp1.sqrt();
     let temp3 = temp2.sqrt();
     let temp4 = f32x4::splat(simd, 1.0) - f32x4::splat(simd, D);
     let temp5 = temp4 + temp3;
     x / temp5
 }

 #[inline(always)]
 fn approx_parabola_inv_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
     let simd = x.simd;

     const B: f32 = 0.39;
     const ONE_MINUS_B: f32 = 1.0 - B;

     let temp1 = f32x8::splat(simd, B * B);
     let temp2 = f32x8::splat(simd, 0.25).madd(x * x, temp1);
     let temp3 = temp2.sqrt();
     let temp4 = f32x8::splat(simd, ONE_MINUS_B) + temp3;

     x * temp4
 }

 #[inline(always)]
 fn pt_splat_simd<S: Simd>(simd: S, pt: Point32) -> f32x8<S> {
     let p_f64: f64 = bytemuck::cast(pt);
     simd.reinterpret_f32_f64x4(f64x4::splat(simd, p_f64))
 }

 #[inline(always)]
 fn eval_simd<S: Simd>(
     p0: f32x8<S>,
     p1: f32x8<S>,
     p2: f32x8<S>,
     p3: f32x8<S>,
     t: f32x8<S>,
 ) -> f32x8<S> {
     let mt = 1.0 - t;
     let im0 = p0 * mt * mt * mt;
     let im1 = p1 * mt * mt * 3.0;
     let im2 = p2 * mt * 3.0;
     let im3 = p3.madd(t, im2) * t;

     (im1 + im3).madd(t, im0)
 }

 #[inline(always)]
 fn eval_cubics_simd<S: Simd>(simd: S, c: &CubicBez, n: usize, result: &mut FlattenCtx) {
     result.n_quads = n;
     let dt = 0.5 / n as f32;

     // TODO(laurenz): Perhaps we can SIMDify this better
     let p0p1 = f32x4::from_slice(
         simd,
         &[c.p0.x as f32, c.p0.y as f32, c.p1.x as f32, c.p1.y as f32],
     );
     let p2p3 = f32x4::from_slice(
         simd,
         &[c.p2.x as f32, c.p2.y as f32, c.p3.x as f32, c.p3.y as f32],
     );

     let split_single = |input: f32x4<S>| {
         let t1 = simd.reinterpret_f64_f32x4(input);
         let p0 = simd.zip_low_f64x2(t1, t1);
         let p1 = simd.zip_high_f64x2(t1, t1);

         let p0 = simd.reinterpret_f32_f64x2(p0);
         let p1 = simd.reinterpret_f32_f64x2(p1);

         (f32x8::block_splat(p0), f32x8::block_splat(p1))
     };

     let (p0_128, p1_128) = split_single(p0p1);
     let (p2_128, p3_128) = split_single(p2p3);

     let iota = f32x8::from_slice(simd, &[0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 3.0, 3.0]);
     let step = iota * dt;
     let mut t = step;
     let t_inc = f32x8::splat(simd, 4.0 * dt);

     let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.even_pts);
     let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.odd_pts);

     for i in 0..n.div_ceil(2) {
         let evaluated = eval_simd(p0_128, p1_128, p2_128, p3_128, t);
         let (low, high) = simd.split_f32x8(evaluated);

         even_pts[i * 4..][..4].copy_from_slice(low.as_slice());
         odd_pts[i * 4..][..4].copy_from_slice(high.as_slice());

         t += t_inc;
     }

     even_pts[n * 2..][..8].copy_from_slice(p3_128.as_slice());
 }

 #[inline(always)]
 fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {
     let n = ctx.n_quads;

     let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.even_pts);
     let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.odd_pts);

     for i in 0..n.div_ceil(4) {
         let p0 = f32x8::from_slice(simd, &even_pts[i * 8..][..8]);
         let p_onehalf = f32x8::from_slice(simd, &odd_pts[i * 8..][..8]);
         let p2 = f32x8::from_slice(simd, &even_pts[(i * 8 + 2)..][..8]);
         let x = p0 * -0.5;
         let x1 = p_onehalf.madd(2.0, x);
         let p1 = p2.madd(-0.5, x1);

         odd_pts[(i * 8)..][..8].copy_from_slice(p1.as_slice());

         let d01 = p1 - p0;
         let d12 = p2 - p1;
         let d01x = simd.unzip_low_f32x8(d01, d01);
         let d01y = simd.unzip_high_f32x8(d01, d01);
         let d12x = simd.unzip_low_f32x8(d12, d12);
         let d12y = simd.unzip_high_f32x8(d12, d12);
         let ddx = d01x - d12x;
         let ddy = d01y - d12y;
         let d02x = d01x + d12x;
         let d02y = d01y + d12y;
         // (d02x * ddy) - (d02y * ddx)
         let cross = ddx.madd(-d02y, d02x * ddy);

         let x0_x2_a = {
             let (d01x_low, _) = simd.split_f32x8(d01x);
             let (d12x_low, _) = simd.split_f32x8(d12x);

             simd.combine_f32x4(d12x_low, d01x_low) * ddx
         };
         let temp1 = {
             let (d12y_low, _) = simd.split_f32x8(d12y);
             let (d01y_low, _) = simd.split_f32x8(d01y);

             simd.combine_f32x4(d12y_low, d01y_low)
         };
         let x0_x2_num = temp1.madd(ddy, x0_x2_a);
         let x0_x2 = x0_x2_num / cross;
         let (ddx_low, _) = simd.split_f32x8(ddx);
         let (ddy_low, _) = simd.split_f32x8(ddy);
         let dd_hypot = ddy_low.madd(ddy_low, ddx_low * ddx_low).sqrt();
         let (x0, x2) = simd.split_f32x8(x0_x2);
         let scale_denom = dd_hypot * (x2 - x0);
         let (temp2, _) = simd.split_f32x8(cross);
         let scale = (temp2 / scale_denom).abs();
         let a0_a2 = approx_parabola_integral_simd(x0_x2);
         let (a0, a2) = simd.split_f32x8(a0_a2);
         let da = a2 - a0;
         let da_abs = da.abs();
         let sqrt_scale = scale.sqrt();
         let temp3 = simd.or_i32x4(x0.bitcast(), x2.bitcast());
         let mask = simd.simd_ge_i32x4(temp3, i32x4::splat(simd, 0));
         let noncusp = da_abs * sqrt_scale;
         // TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost
         let xmin = sqrt_tol / sqrt_scale;
         let approxint = approx_parabola_integral_simd_x4(xmin);
         let cusp = (sqrt_tol * da_abs) / approxint;
         let val_raw = simd.select_f32x4(mask, noncusp, cusp);
         let finite_mask = is_finite_simd(val_raw);
         let val = simd.select_f32x4(finite_mask, val_raw, f32x4::splat(simd, 0.0));
         let u0_u2 = approx_parabola_inv_integral_simd(a0_a2);
         let (u0, u2) = simd.split_f32x8(u0_u2);
         let uscale_a = u2 - u0;
         let uscale = 1.0 / uscale_a;

         ctx.a0[i * 4..][..4].copy_from_slice(a0.as_slice());
         ctx.da[i * 4..][..4].copy_from_slice(da.as_slice());
         ctx.u0[i * 4..][..4].copy_from_slice(u0.as_slice());
         ctx.uscale[i * 4..][..4].copy_from_slice(uscale.as_slice());
         ctx.val[i * 4..][..4].copy_from_slice(val.as_slice());
     }
 }

 #[inline(always)]
 fn output_lines_simd<S: Simd>(
     simd: S,
     ctx: &mut FlattenCtx,
     i: usize,
     x0: f32,
     dx: f32,
     n: usize,
     start_idx: usize,
 ) {
     let p0 = pt_splat_simd(simd, ctx.even_pts[i]);
     let p1 = pt_splat_simd(simd, ctx.odd_pts[i]);
     let p2 = pt_splat_simd(simd, ctx.even_pts[i + 1]);

     const IOTA2: [f32; 8] = [0., 0., 1., 1., 2., 2., 3., 3.];
     let iota2 = f32x8::from_slice(simd, IOTA2.as_ref());
     let x = iota2.madd(dx, f32x8::splat(simd, x0));
     let da = f32x8::splat(simd, ctx.da[i]);
     let mut a = da.madd(x, f32x8::splat(simd, ctx.a0[i]));
     let a_inc = 4.0 * dx * da;
     let uscale = f32x8::splat(simd, ctx.uscale[i]);

     let out: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.flattened_cubics[start_idx..]);

     for j in 0..n.div_ceil(4) {
         let u = approx_parabola_inv_integral_simd(a);
         let t = u.madd(uscale, -ctx.u0[i] * uscale);
         let mt = 1.0 - t;
         let z = p0 * mt * mt;
         let z1 = p1.madd(2.0 * t * mt, z);
         let p = p2.madd(t * t, z1);

         out[j * 8..][..8].copy_from_slice(p.as_slice());

         a += a_inc;
     }
 }

 #[inline(always)]
 fn flatten_cubic_simd<S: Simd>(simd: S, c: CubicBez, ctx: &mut FlattenCtx, accuracy: f32) -> usize {
     let n_quads = estimate_num_quads(c, accuracy);
     eval_cubics_simd(simd, &c, n_quads, ctx);
     let tol = accuracy * (1.0 - TO_QUAD_TOL);
     let sqrt_tol = tol.sqrt();
     estimate_subdiv_simd(simd, sqrt_tol, ctx);
     let sum: f32 = ctx.val[..n_quads].iter().sum();
     let n = ((0.5 * sum / sqrt_tol).ceil() as usize).max(1);
     ctx.flattened_cubics.resize(n + 4, Point32::default());

     let step = sum / (n as f32);
     let step_recip = 1.0 / step;
     let mut val_sum = 0.0;
     let mut last_n = 0;
     let mut x0base = 0.0;

     for i in 0..n_quads {
         let val = ctx.val[i];
         val_sum += val;
         let this_n = val_sum * step_recip;
         let this_n_next = 1.0 + this_n.floor();
         let dn = this_n_next as usize - last_n;
         if dn > 0 {
             let dx = step / val;
             let x0 = x0base * dx;
             output_lines_simd(simd, ctx, i, x0, dx, dn, last_n);
         }
         x0base = this_n_next - this_n;
         last_n = this_n_next as usize;
     }

     ctx.flattened_cubics[n] = ctx.even_pts[n_quads];

     n + 1
 }

 #[inline(always)]
 fn estimate_num_quads(c: CubicBez, accuracy: f32) -> usize {
     let q_accuracy = (accuracy * TO_QUAD_TOL) as f64;
     let max_hypot2 = 432.0 * q_accuracy * q_accuracy;
     let p1x2 = c.p1.to_vec2() * 3.0 - c.p0.to_vec2();
     let p2x2 = c.p2.to_vec2() * 3.0 - c.p3.to_vec2();
     let err = (p2x2 - p1x2).hypot2();
     let err_div = err / max_hypot2;

     estimate(err_div)
 }

 const TO_QUAD_TOL: f32 = 0.1;

 #[inline(always)]
 fn estimate(err_div: f64) -> usize {
     // The original version of this method was:
     // let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
     // n_quads.min(MAX_QUADS)
     //
     // Note how we always round up and clamp to the range [1, max_quads]. Since we don't
     // care about the actual fractional value resulting from the powf call we can simply
     // compute this using a precomputed lookup table evaluating 1^6, 2^6, 3^6, etc. and simply
     // comparing if the value is less than or equal to each threshold.

     const LUT: [f64; MAX_QUADS] = [
         1.0, 64.0, 729.0, 4096.0, 15625.0, 46656.0, 117649.0, 262144.0, 531441.0, 1000000.0,
         1771561.0, 2985984.0, 4826809.0, 7529536.0, 11390625.0, 16777216.0,
     ];

     #[expect(clippy::needless_range_loop, reason = "better clarity")]
     for i in 0..MAX_QUADS {
         if err_div <= LUT[i] {
             return i + 1;
         }
     }

     MAX_QUADS
 }

 #[cfg(test)]
 mod tests {
     use crate::flatten_simd::{MAX_QUADS, estimate};

     fn old_estimate(err_div: f64) -> usize {
         let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
         n_quads.min(MAX_QUADS)
     }

     // Test is disabled by default since it takes 10-20 seconds to run, even in release mode.
     #[test]
     #[ignore]
     fn accuracy() {
         for i in 0..u32::MAX {
             let num = f32::from_bits(i);

             if num.is_finite() {
                 assert_eq!(old_estimate(num as f64), estimate(num as f64), "{num}");
             }
         }
     }
 }
	// Copyright 2025 the Vello Authors
	// SPDX-License-Identifier: Apache-2.0 OR MIT

	//! This is a temporary module that contains a SIMD version of flattening of cubic curves, as
	//! well as some code that was copied from kurbo, which is needed to reimplement the
	//! full `flatten` method.

	use crate::flatten::TOL_2;
	#[cfg(not(feature = "std"))]
	use crate::kurbo::common::FloatFuncs as _;
	use crate::kurbo::{CubicBez, ParamCurve, PathEl, Point, QuadBez};
	use alloc::vec::Vec;
	use bytemuck::{Pod, Zeroable};
	use fearless_simd::*;

	/// The element of a path made of lines.
	///
	/// Each subpath must start with a `MoveTo`. Closing of subpaths is not supported, and subpaths are
	/// not closed implicitly when a new subpath (with `MoveTo`) is started. It is expected that closed
	/// subpaths are watertight in the sense that the last `LineTo` matches exactly with the first
	/// `MoveTo`.
	///
	/// This intentionally allows for non-watertight subpaths, as, e.g., lines that are fully outside
	/// of the viewport do not need to be drawn.
	///
	/// See [`PathEl`] for a more general-purpose path element type.
	pub(crate) enum LinePathEl {
	MoveTo(Point),
	LineTo(Point),
	}

	// Unlike kurbo, which takes a closure with a callback for outputting the lines, we use a trait
	// instead. The reason is that this way the callback can be inlined, which is not possible with
	// a closure and turned out to have a noticeable overhead.
	pub(crate) trait Callback {
	fn callback(&mut self, el: LinePathEl);
	}

	/// See the docs for the kurbo implementation of flattening:
	/// <https://docs.rs/kurbo/latest/kurbo/fn.flatten.html>
	///
	/// This version works using a similar approach but using f32x4/f32x8 SIMD instead.
	#[inline(always)]
	pub(crate) fn flatten<S: Simd>(
	simd: S,
	path: impl IntoIterator<Item = PathEl>,
	tolerance: f64,
	callback: &mut impl Callback,
	flatten_ctx: &mut FlattenCtx,
	) {
	flatten_ctx.flattened_cubics.clear();

	let sqrt_tol = tolerance.sqrt();
	let mut closed = true;
	let mut start_pt = Point::ZERO;
	let mut last_pt = Point::ZERO;

	for el in path {
	match el {
	PathEl::MoveTo(p) => {
	if !closed && last_pt != start_pt {
	callback.callback(LinePathEl::LineTo(start_pt));
	}
	closed = false;
	last_pt = p;
	start_pt = p;
	callback.callback(LinePathEl::MoveTo(p));
	}
	PathEl::LineTo(p) => {
	debug_assert!(!closed, "Expected a `MoveTo` before a `LineTo`");
	last_pt = p;
	callback.callback(LinePathEl::LineTo(p));
	}
	PathEl::QuadTo(p1, p2) => {
	debug_assert!(!closed, "Expected a `MoveTo` before a `QuadTo`");
	let p0 = last_pt;
	// An upper bound on the shortest distance of any point on the quadratic Bezier
	// curve to the line segment [p0, p2] is 1/2 of the maximum of the
	// endpoint-to-control-point distances.
	//
	// The derivation is similar to that for the cubic Bezier (see below). In
	// short:
	//
	// q}(t) = B0(t) p0 + B1(t) p1 + B2(t) p2
	// dist(q(t), [p0, p1]) <= B1(t) dist(p1, [p0, p1])
	// = 2 (1-t)t dist(p1, [p0, p1]).
	//
	// The maximum occurs at t=1/2, hence
	// max(dist(q(t), [p0, p1] <= 1/2 dist(p1, [p0, p1])).
	//
	// A cheap upper bound for dist(p1, [p0, p1]) is max(dist(p1, p0), dist(p1, p2)).
	//
	// The following takes the square to elide the square root of the Euclidean
	// distance.
	if f64::max((p1 - p0).hypot2(), (p1 - p2).hypot2()) <= 4. * TOL_2 {
	callback.callback(LinePathEl::LineTo(p2));
	} else {
	let q = QuadBez::new(p0, p1, p2);
	let params = q.estimate_subdiv(sqrt_tol);
	let n = ((0.5 * params.val / sqrt_tol).ceil() as usize).max(1);
	let step = 1.0 / (n as f64);
	for i in 1..n {
	let u = (i as f64) * step;
	let t = q.determine_subdiv_t(&params, u);
	let p = q.eval(t);
	callback.callback(LinePathEl::LineTo(p));
	}
	callback.callback(LinePathEl::LineTo(p2));
	}
	last_pt = p2;
	}
	PathEl::CurveTo(p1, p2, p3) => {
	debug_assert!(!closed, "Expected a `MoveTo` before a `CurveTo`");
	let p0 = last_pt;
	// An upper bound on the shortest distance of any point on the cubic Bezier
	// curve to the line segment [p0, p3] is 3/4 of the maximum of the
	// endpoint-to-control-point distances.
	//
	// With Bernstein weights Bi(t), we have
	// c(t) = B0(t) p0 + B1(t) p1 + B2(t) p2 + B3(t) p3
	// with t from 0 to 1 (inclusive).
	//
	// Through convexivity of the Euclidean distance function and the line segment,
	// we have
	// dist(c(t), [p0, p3]) <= B1(t) dist(p1, [p0, p3]) + B2(t) dist(p2, [p0, p3])
	// <= B1(t) \|\|p1-p0\|\| + B2(t) \|\|p2-p3\|\|
	// <= (B1(t) + B2(t)) max(\|\|p1-p0\|\|, \|\|p2-p3\|\|\|)
	// = 3 ((1-t)t^2 + (1-t)^2t) max(\|\|p1-p0\|\|, \|\|p2-p3\|\|).
	//
	// The inner polynomial has its maximum of 1/4 at t=1/2, hence
	// max(dist(c(t), [p0, p3])) <= 3/4 max(\|\|p1-p0\|\|, \|\|p2-p3\|\|).
	//
	// The following takes the square to elide the square root of the Euclidean
	// distance.
	if f64::max((p0 - p1).hypot2(), (p3 - p2).hypot2()) <= 16. / 9. * TOL_2 {
	callback.callback(LinePathEl::LineTo(p3));
	} else {
	let c = CubicBez::new(p0, p1, p2, p3);
	let max = flatten_cubic_simd(simd, c, flatten_ctx, tolerance as f32);

	for p in &flatten_ctx.flattened_cubics[1..max] {
	callback.callback(LinePathEl::LineTo(Point::new(p.x as f64, p.y as f64)));
	}
	}
	last_pt = p3;
	}
	PathEl::ClosePath => {
	closed = true;
	if last_pt != start_pt {
	callback.callback(LinePathEl::LineTo(start_pt));
	}
	}
	}
	}

	if !closed && last_pt != start_pt {
	callback.callback(LinePathEl::LineTo(start_pt));
	}
	}

	// The below methods are copied from kurbo and needed to implement flattening of normal quad curves.

	/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
	///
	/// This is used for flattening curves.
	fn approx_parabola_integral(x: f64) -> f64 {
	const D: f64 = 0.67;
	x / (1.0 - D + (D.powi(4) + 0.25 * x * x).sqrt().sqrt())
	}

	/// An approximation to the inverse parabola integral.
	fn approx_parabola_inv_integral(x: f64) -> f64 {
	const B: f64 = 0.39;
	x * (1.0 - B + (B * B + 0.25 * x * x).sqrt())
	}

	impl FlattenParamsExt for QuadBez {
	#[inline(always)]
	fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams {
	// Determine transformation to $y = x^2$ parabola.
	let d01 = self.p1 - self.p0;
	let d12 = self.p2 - self.p1;
	let dd = d01 - d12;
	let cross = (self.p2 - self.p0).cross(dd);
	let x0 = d01.dot(dd) * cross.recip();
	let x2 = d12.dot(dd) * cross.recip();
	let scale = (cross / (dd.hypot() * (x2 - x0))).abs();

	// Compute number of subdivisions needed.
	let a0 = approx_parabola_integral(x0);
	let a2 = approx_parabola_integral(x2);
	let val = if scale.is_finite() {
	let da = (a2 - a0).abs();
	let sqrt_scale = scale.sqrt();
	if x0.signum() == x2.signum() {
	da * sqrt_scale
	} else {
	// Handle cusp case (segment contains curvature maximum)
	let xmin = sqrt_tol / sqrt_scale;
	sqrt_tol * da / approx_parabola_integral(xmin)
	}
	} else {
	0.0
	};
	let u0 = approx_parabola_inv_integral(a0);
	let u2 = approx_parabola_inv_integral(a2);
	let uscale = (u2 - u0).recip();
	FlattenParams {
	a0,
	a2,
	u0,
	uscale,
	val,
	}
	}

	#[inline(always)]
	fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64 {
	let a = params.a0 + (params.a2 - params.a0) * x;
	let u = approx_parabola_inv_integral(a);
	(u - params.u0) * params.uscale
	}
	}

	trait FlattenParamsExt {
	fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams;
	fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64;
	}

	// Everything below is a SIMD implementation of flattening of cubic curves.
	// It's a combination of https://gist.github.com/raphlinus/5f4e9feb85fd79bafc72da744571ec0e
	// and https://gist.github.com/raphlinus/44e114fef2fd33b889383a60ced0129b.

	// TODO(laurenz): Perhaps we should get rid of this in the future and work directly with f32,
	// as it's the only reason we have to pull in proc_macros via the `derive` feature of bytemuck.
	#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
	#[repr(C)]
	struct Point32 {
	x: f32,
	y: f32,
	}

	struct FlattenParams {
	a0: f64,
	a2: f64,
	u0: f64,
	uscale: f64,
	/// The number of `subdivisions * 2 * sqrt_tol`.
	val: f64,
	}

	/// This limit was chosen based on the pre-existing GitHub gist.
	/// This limit should not be hit in normal operation, but _might_ be hit for very large
	/// transforms.
	const MAX_QUADS: usize = 16;

	/// The context needed for flattening curves.
	#[derive(Default, Debug)]
	pub struct FlattenCtx {
	// The +4 is to encourage alignment; might be better to be explicit
	even_pts: [Point32; MAX_QUADS + 4],
	odd_pts: [Point32; MAX_QUADS],
	a0: [f32; MAX_QUADS],
	da: [f32; MAX_QUADS],
	u0: [f32; MAX_QUADS],
	uscale: [f32; MAX_QUADS],
	val: [f32; MAX_QUADS],
	n_quads: usize,
	/// Reusable buffer for flattened cubic points.
	flattened_cubics: Vec<Point32>,
	}

	#[inline(always)]
	fn is_finite_simd<S: Simd>(x: f32x4<S>) -> mask32x4<S> {
	let simd = x.simd;

	let x_abs = x.abs();
	let reinterpreted = u32x4::from_bytes(x_abs.to_bytes());
	simd.simd_lt_u32x4(reinterpreted, u32x4::splat(simd, 0x7f80_0000))
	}

	#[inline(always)]
	fn approx_parabola_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
	let simd = x.simd;

	const D: f32 = 0.67;
	const D_POWI_4: f32 = 0.201_511_2;

	let temp1 = f32x8::splat(simd, 0.25).madd(x * x, f32x8::splat(simd, D_POWI_4));
	let temp2 = temp1.sqrt();
	let temp3 = temp2.sqrt();
	let temp4 = f32x8::splat(simd, 1.0) - f32x8::splat(simd, D);
	let temp5 = temp4 + temp3;
	x / temp5
	}

	#[inline(always)]
	fn approx_parabola_integral_simd_x4<S: Simd>(x: f32x4<S>) -> f32x4<S> {
	let simd = x.simd;

	const D: f32 = 0.67;
	const D_POWI_4: f32 = 0.201_511_2;

	let temp1 = f32x4::splat(simd, 0.25).madd(x * x, f32x4::splat(simd, D_POWI_4));
	let temp2 = temp1.sqrt();
	let temp3 = temp2.sqrt();
	let temp4 = f32x4::splat(simd, 1.0) - f32x4::splat(simd, D);
	let temp5 = temp4 + temp3;
	x / temp5
	}

	#[inline(always)]
	fn approx_parabola_inv_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
	let simd = x.simd;

	const B: f32 = 0.39;
	const ONE_MINUS_B: f32 = 1.0 - B;

	let temp1 = f32x8::splat(simd, B * B);
	let temp2 = f32x8::splat(simd, 0.25).madd(x * x, temp1);
	let temp3 = temp2.sqrt();
	let temp4 = f32x8::splat(simd, ONE_MINUS_B) + temp3;

	x * temp4
	}

	#[inline(always)]
	fn pt_splat_simd<S: Simd>(simd: S, pt: Point32) -> f32x8<S> {
	let p_f64: f64 = bytemuck::cast(pt);
	simd.reinterpret_f32_f64x4(f64x4::splat(simd, p_f64))
	}

	#[inline(always)]
	fn eval_simd<S: Simd>(
	p0: f32x8<S>,
	p1: f32x8<S>,
	p2: f32x8<S>,
	p3: f32x8<S>,
	t: f32x8<S>,
	) -> f32x8<S> {
	let mt = 1.0 - t;
	let im0 = p0 * mt * mt * mt;
	let im1 = p1 * mt * mt * 3.0;
	let im2 = p2 * mt * 3.0;
	let im3 = p3.madd(t, im2) * t;

	(im1 + im3).madd(t, im0)
	}

	#[inline(always)]
	fn eval_cubics_simd<S: Simd>(simd: S, c: &CubicBez, n: usize, result: &mut FlattenCtx) {
	result.n_quads = n;
	let dt = 0.5 / n as f32;

	// TODO(laurenz): Perhaps we can SIMDify this better
	let p0p1 = f32x4::from_slice(
	simd,
	&[c.p0.x as f32, c.p0.y as f32, c.p1.x as f32, c.p1.y as f32],
	);
	let p2p3 = f32x4::from_slice(
	simd,
	&[c.p2.x as f32, c.p2.y as f32, c.p3.x as f32, c.p3.y as f32],
	);

	let split_single = \|input: f32x4<S>\| {
	let t1 = simd.reinterpret_f64_f32x4(input);
	let p0 = simd.zip_low_f64x2(t1, t1);
	let p1 = simd.zip_high_f64x2(t1, t1);

	let p0 = simd.reinterpret_f32_f64x2(p0);
	let p1 = simd.reinterpret_f32_f64x2(p1);

	(f32x8::block_splat(p0), f32x8::block_splat(p1))
	};

	let (p0_128, p1_128) = split_single(p0p1);
	let (p2_128, p3_128) = split_single(p2p3);

	let iota = f32x8::from_slice(simd, &[0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 3.0, 3.0]);
	let step = iota * dt;
	let mut t = step;
	let t_inc = f32x8::splat(simd, 4.0 * dt);

	let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.even_pts);
	let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.odd_pts);

	for i in 0..n.div_ceil(2) {
	let evaluated = eval_simd(p0_128, p1_128, p2_128, p3_128, t);
	let (low, high) = simd.split_f32x8(evaluated);

	even_pts[i * 4..][..4].copy_from_slice(low.as_slice());
	odd_pts[i * 4..][..4].copy_from_slice(high.as_slice());

	t += t_inc;
	}

	even_pts[n * 2..][..8].copy_from_slice(p3_128.as_slice());
	}

	#[inline(always)]
	fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {
	let n = ctx.n_quads;

	let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.even_pts);
	let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.odd_pts);

	for i in 0..n.div_ceil(4) {
	let p0 = f32x8::from_slice(simd, &even_pts[i * 8..][..8]);
	let p_onehalf = f32x8::from_slice(simd, &odd_pts[i * 8..][..8]);
	let p2 = f32x8::from_slice(simd, &even_pts[(i * 8 + 2)..][..8]);
	let x = p0 * -0.5;
	let x1 = p_onehalf.madd(2.0, x);
	let p1 = p2.madd(-0.5, x1);

	odd_pts[(i * 8)..][..8].copy_from_slice(p1.as_slice());

	let d01 = p1 - p0;
	let d12 = p2 - p1;
	let d01x = simd.unzip_low_f32x8(d01, d01);
	let d01y = simd.unzip_high_f32x8(d01, d01);
	let d12x = simd.unzip_low_f32x8(d12, d12);
	let d12y = simd.unzip_high_f32x8(d12, d12);
	let ddx = d01x - d12x;
	let ddy = d01y - d12y;
	let d02x = d01x + d12x;
	let d02y = d01y + d12y;
	// (d02x * ddy) - (d02y * ddx)
	let cross = ddx.madd(-d02y, d02x * ddy);

	let x0_x2_a = {
	let (d01x_low, _) = simd.split_f32x8(d01x);
	let (d12x_low, _) = simd.split_f32x8(d12x);

	simd.combine_f32x4(d12x_low, d01x_low) * ddx
	};
	let temp1 = {
	let (d12y_low, _) = simd.split_f32x8(d12y);
	let (d01y_low, _) = simd.split_f32x8(d01y);

	simd.combine_f32x4(d12y_low, d01y_low)
	};
	let x0_x2_num = temp1.madd(ddy, x0_x2_a);
	let x0_x2 = x0_x2_num / cross;
	let (ddx_low, _) = simd.split_f32x8(ddx);
	let (ddy_low, _) = simd.split_f32x8(ddy);
	let dd_hypot = ddy_low.madd(ddy_low, ddx_low * ddx_low).sqrt();
	let (x0, x2) = simd.split_f32x8(x0_x2);
	let scale_denom = dd_hypot * (x2 - x0);
	let (temp2, _) = simd.split_f32x8(cross);
	let scale = (temp2 / scale_denom).abs();
	let a0_a2 = approx_parabola_integral_simd(x0_x2);
	let (a0, a2) = simd.split_f32x8(a0_a2);
	let da = a2 - a0;
	let da_abs = da.abs();
	let sqrt_scale = scale.sqrt();
	let temp3 = simd.or_i32x4(x0.bitcast(), x2.bitcast());
	let mask = simd.simd_ge_i32x4(temp3, i32x4::splat(simd, 0));
	let noncusp = da_abs * sqrt_scale;
	// TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost
	let xmin = sqrt_tol / sqrt_scale;
	let approxint = approx_parabola_integral_simd_x4(xmin);
	let cusp = (sqrt_tol * da_abs) / approxint;
	let val_raw = simd.select_f32x4(mask, noncusp, cusp);
	let finite_mask = is_finite_simd(val_raw);
	let val = simd.select_f32x4(finite_mask, val_raw, f32x4::splat(simd, 0.0));
	let u0_u2 = approx_parabola_inv_integral_simd(a0_a2);
	let (u0, u2) = simd.split_f32x8(u0_u2);
	let uscale_a = u2 - u0;
	let uscale = 1.0 / uscale_a;

	ctx.a0[i * 4..][..4].copy_from_slice(a0.as_slice());
	ctx.da[i * 4..][..4].copy_from_slice(da.as_slice());
	ctx.u0[i * 4..][..4].copy_from_slice(u0.as_slice());
	ctx.uscale[i * 4..][..4].copy_from_slice(uscale.as_slice());
	ctx.val[i * 4..][..4].copy_from_slice(val.as_slice());
	}
	}

	#[inline(always)]
	fn output_lines_simd<S: Simd>(
	simd: S,
	ctx: &mut FlattenCtx,
	i: usize,
	x0: f32,
	dx: f32,
	n: usize,
	start_idx: usize,
	) {
	let p0 = pt_splat_simd(simd, ctx.even_pts[i]);
	let p1 = pt_splat_simd(simd, ctx.odd_pts[i]);
	let p2 = pt_splat_simd(simd, ctx.even_pts[i + 1]);

	const IOTA2: [f32; 8] = [0., 0., 1., 1., 2., 2., 3., 3.];
	let iota2 = f32x8::from_slice(simd, IOTA2.as_ref());
	let x = iota2.madd(dx, f32x8::splat(simd, x0));
	let da = f32x8::splat(simd, ctx.da[i]);
	let mut a = da.madd(x, f32x8::splat(simd, ctx.a0[i]));
	let a_inc = 4.0 * dx * da;
	let uscale = f32x8::splat(simd, ctx.uscale[i]);

	let out: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.flattened_cubics[start_idx..]);

	for j in 0..n.div_ceil(4) {
	let u = approx_parabola_inv_integral_simd(a);
	let t = u.madd(uscale, -ctx.u0[i] * uscale);
	let mt = 1.0 - t;
	let z = p0 * mt * mt;
	let z1 = p1.madd(2.0 * t * mt, z);
	let p = p2.madd(t * t, z1);

	out[j * 8..][..8].copy_from_slice(p.as_slice());

	a += a_inc;
	}
	}

	#[inline(always)]
	fn flatten_cubic_simd<S: Simd>(simd: S, c: CubicBez, ctx: &mut FlattenCtx, accuracy: f32) -> usize {
	let n_quads = estimate_num_quads(c, accuracy);
	eval_cubics_simd(simd, &c, n_quads, ctx);
	let tol = accuracy * (1.0 - TO_QUAD_TOL);
	let sqrt_tol = tol.sqrt();
	estimate_subdiv_simd(simd, sqrt_tol, ctx);
	let sum: f32 = ctx.val[..n_quads].iter().sum();
	let n = ((0.5 * sum / sqrt_tol).ceil() as usize).max(1);
	ctx.flattened_cubics.resize(n + 4, Point32::default());

	let step = sum / (n as f32);
	let step_recip = 1.0 / step;
	let mut val_sum = 0.0;
	let mut last_n = 0;
	let mut x0base = 0.0;

	for i in 0..n_quads {
	let val = ctx.val[i];
	val_sum += val;
	let this_n = val_sum * step_recip;
	let this_n_next = 1.0 + this_n.floor();
	let dn = this_n_next as usize - last_n;
	if dn > 0 {
	let dx = step / val;
	let x0 = x0base * dx;
	output_lines_simd(simd, ctx, i, x0, dx, dn, last_n);
	}
	x0base = this_n_next - this_n;
	last_n = this_n_next as usize;
	}

	ctx.flattened_cubics[n] = ctx.even_pts[n_quads];

	n + 1
	}

	#[inline(always)]
	fn estimate_num_quads(c: CubicBez, accuracy: f32) -> usize {
	let q_accuracy = (accuracy * TO_QUAD_TOL) as f64;
	let max_hypot2 = 432.0 * q_accuracy * q_accuracy;
	let p1x2 = c.p1.to_vec2() * 3.0 - c.p0.to_vec2();
	let p2x2 = c.p2.to_vec2() * 3.0 - c.p3.to_vec2();
	let err = (p2x2 - p1x2).hypot2();
	let err_div = err / max_hypot2;

	estimate(err_div)
	}

	const TO_QUAD_TOL: f32 = 0.1;

	#[inline(always)]
	fn estimate(err_div: f64) -> usize {
	// The original version of this method was:
	// let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
	// n_quads.min(MAX_QUADS)
	//
	// Note how we always round up and clamp to the range [1, max_quads]. Since we don't
	// care about the actual fractional value resulting from the powf call we can simply
	// compute this using a precomputed lookup table evaluating 1^6, 2^6, 3^6, etc. and simply
	// comparing if the value is less than or equal to each threshold.

	const LUT: [f64; MAX_QUADS] = [
	1.0, 64.0, 729.0, 4096.0, 15625.0, 46656.0, 117649.0, 262144.0, 531441.0, 1000000.0,
	1771561.0, 2985984.0, 4826809.0, 7529536.0, 11390625.0, 16777216.0,
	];

	#[expect(clippy::needless_range_loop, reason = "better clarity")]
	for i in 0..MAX_QUADS {
	if err_div <= LUT[i] {
	return i + 1;
	}
	}

	MAX_QUADS
	}

	#[cfg(test)]
	mod tests {
	use crate::flatten_simd::{MAX_QUADS, estimate};

	fn old_estimate(err_div: f64) -> usize {
	let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
	n_quads.min(MAX_QUADS)
	}

	// Test is disabled by default since it takes 10-20 seconds to run, even in release mode.
	#[test]
	#[ignore]
	fn accuracy() {
	for i in 0..u32::MAX {
	let num = f32::from_bits(i);

	if num.is_finite() {
	assert_eq!(old_estimate(num as f64), estimate(num as f64), "{num}");
	}
	}
	}
	}