sparse_strips/vello_cpu/src/fine/mod.rs - external/github.com/linebender/vello - Git at Google

 // Copyright 2025 the Vello Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT

 mod common;
 mod highp;
 mod lowp;

 use crate::peniko::{BlendMode, Compose, ImageQuality, Mix};
 use crate::region::Region;
 use alloc::vec;
 use alloc::vec::Vec;
 use core::fmt::Debug;
 use core::iter;
 use vello_common::coarse::{Cmd, WideTile};
 use vello_common::encode::{
     EncodedBlurredRoundedRectangle, EncodedGradient, EncodedImage, EncodedKind, EncodedPaint,
 };
 use vello_common::paint::{ImageSource, Paint, PremulColor};
 use vello_common::tile::Tile;

 /// Number of components stored per pixel (presumably one each for R, G, B and A).
 pub(crate) const COLOR_COMPONENTS: usize = 4;
 /// Number of components in one pixel column of a tile, i.e. `Tile::HEIGHT` pixels with
 /// `COLOR_COMPONENTS` components each.
 pub(crate) const TILE_HEIGHT_COMPONENTS: usize = Tile::HEIGHT as usize * COLOR_COMPONENTS;
 /// Number of components in a scratch buffer: `WideTile::WIDTH` columns of `Tile::HEIGHT`
 /// pixels with `COLOR_COMPONENTS` components each.
 pub const SCRATCH_BUF_SIZE: usize =
     WideTile::WIDTH as usize * Tile::HEIGHT as usize * COLOR_COMPONENTS;

 use crate::fine::common::gradient::linear::SimdLinearKind;
 use crate::fine::common::gradient::radial::SimdRadialKind;
 use crate::fine::common::gradient::sweep::SimdSweepKind;
 use crate::fine::common::gradient::{GradientPainter, calculate_t_vals};
 use crate::fine::common::image::{FilteredImagePainter, NNImagePainter, PlainNNImagePainter};
 use crate::fine::common::rounded_blurred_rect::BlurredRoundedRectFiller;
 use crate::util::{BlendModeExt, EncodedImageExt};
 pub use highp::F32Kernel;
 pub use lowp::U8Kernel;
 use vello_common::fearless_simd::{
     Simd, SimdBase, SimdFloat, SimdInto, f32x4, f32x8, f32x16, u8x16, u8x32, u32x4, u32x8,
 };
 use vello_common::pixmap::Pixmap;
 use vello_common::simd::Splat4thExt;

 /// A fixed-size scratch buffer holding `SCRATCH_BUF_SIZE` components of type `F`.
 pub type ScratchBuf<F> = [F; SCRATCH_BUF_SIZE];

 /// A scalar component type that the fine rasterizer can operate on.
 ///
 /// This module implements it for `f32` and `u8`.
 pub trait Numeric: Copy + Default + Clone + Debug + PartialEq + Send + Sync + 'static {
     /// The value representing zero for this type.
     const ZERO: Self;
     /// The value representing full intensity for this type (`1.0` for `f32`, `255` for `u8`).
     const ONE: Self;
 }

 /// Floating-point components: zero is `0.0` and full intensity is `1.0`.
 impl Numeric for f32 {
     const ZERO: f32 = 0.0_f32;
     const ONE: f32 = 1.0_f32;
 }

 /// Byte components: zero is `0` and full intensity is `255` (`u8::MAX`).
 impl Numeric for u8 {
     const ZERO: u8 = u8::MIN;
     const ONE: u8 = u8::MAX;
 }

 /// A 16-lane SIMD vector that can be created from either an `f32x16` or a `u8x16`.
 ///
 /// This module implements it for `f32x16` and `u8x16`, so each conversion is either the
 /// identity or a change of representation between floats and bytes.
 pub trait NumericVec<S: Simd>: Copy + Clone + Send + Sync {
     /// Create the vector from 16 `f32` lanes.
     fn from_f32(simd: S, val: f32x16<S>) -> Self;
     /// Create the vector from 16 `u8` lanes.
     fn from_u8(simd: S, val: u8x16<S>) -> Self;
 }

 /// Conversions into the floating-point vector representation.
 impl<S: Simd> NumericVec<S> for f32x16<S> {
     /// Identity conversion: the input already has the target representation.
     #[inline(always)]
     fn from_f32(_simd: S, val: Self) -> Self {
         val
     }

     /// Widen every byte to `f32` and scale it by `1 / 255`, so that `255` maps to `1.0`.
     #[inline(always)]
     fn from_u8(simd: S, val: u8x16<S>) -> Self {
         let normalization = Self::splat(simd, 1.0 / 255.0);

         u8_to_f32(val) * normalization
     }
 }

 /// Conversions into the byte vector representation.
 impl<S: Simd> NumericVec<S> for u8x16<S> {
     /// Scale the floats by `255`, add a rounding bias of `0.5` and convert to bytes.
     #[inline(always)]
     fn from_f32(simd: S, val: f32x16<S>) -> Self {
         let scale = f32x16::splat(simd, 255.0);
         let rounding_bias = f32x16::splat(simd, 0.5);

         // NOTE(review): presumably `a.madd(b, c)` evaluates to `a + b * c` here, giving
         // `0.5 + 255 * val` — confirm against the fearless_simd version in use.
         f32_to_u8(rounding_bias.madd(scale, val))
     }

     /// Identity conversion: the input already has the target representation.
     #[inline(always)]
     fn from_u8(_simd: S, val: Self) -> Self {
         val
     }
 }

 /// Convert 16 `f32` lanes to 16 `u8` lanes.
 ///
 /// The lanes are first converted to `u32` with a SIMD conversion and then narrowed to `u8`
 /// one by one with an `as` cast. No scaling is applied here; callers have to bring the
 /// values into the byte range beforehand.
 #[inline(always)]
 pub(crate) fn f32_to_u8<S: Simd>(val: f32x16<S>) -> u8x16<S> {
     let simd = val.simd;
     // Note that converting to u32 first using SIMD and then u8
     // is much faster than converting directly from f32 to u8.
     let widened = simd.cvt_u32_f32x16(val);

     // TODO: Maybe we can also do this using SIMD?
     let narrowed: [u8; 16] = core::array::from_fn(|lane| widened[lane] as u8);

     narrowed.simd_into(simd)
 }

 /// Convert 16 `u8` lanes to 16 `f32` lanes holding the same integer values.
 ///
 /// No normalization is applied: a byte of `255` becomes `255.0`.
 #[inline(always)]
 pub(crate) fn u8_to_f32<S: Simd>(val: u8x16<S>) -> f32x16<S> {
     // TODO: SIMDify
     let widened: [f32; 16] = core::array::from_fn(|lane| val.val[lane] as f32);

     widened.simd_into(val.simd)
 }

 /// A SIMD vector of `N` components that blending and compositing operate on.
 pub trait CompositeType<N: Numeric, S: Simd>: Copy + Clone + Send + Sync {
     /// The number of components (not pixels) held by one vector.
     const LENGTH: usize;

     /// Load a vector from `slice`.
     ///
     /// NOTE(review): presumably `slice` must hold exactly `LENGTH` components; callers in
     /// this file always pass chunks of that size — confirm against `SimdBase::from_slice`.
     fn from_slice(simd: S, slice: &[N]) -> Self;
     /// Create a vector in which the four components of `color` are repeated for every pixel.
     fn from_color(simd: S, color: [N; 4]) -> Self;
 }

 /// The `f32` composite type holds 16 components, i.e. four pixels.
 impl<S: Simd> CompositeType<f32, S> for f32x16<S> {
     const LENGTH: usize = 16;

     /// Load the vector from `slice` through the generic SIMD loader.
     #[inline(always)]
     fn from_slice(simd: S, slice: &[f32]) -> Self {
         // The qualified path is required to pick the `SimdBase` loader instead of
         // recursing into this very method.
         let loaded = <Self as SimdBase<_, _>>::from_slice(simd, slice);

         loaded
     }

     /// Repeat the four components of `color` across all four pixels of the vector.
     #[inline(always)]
     fn from_color(simd: S, color: [f32; 4]) -> Self {
         let pixel = f32x4::from_slice(simd, color.as_slice());

         Self::block_splat(pixel)
     }
 }

 /// The `u8` composite type holds 32 components, i.e. eight pixels.
 impl<S: Simd> CompositeType<u8, S> for u8x32<S> {
     const LENGTH: usize = 32;

     /// Load the vector from `slice` through the generic SIMD loader.
     #[inline(always)]
     fn from_slice(simd: S, slice: &[u8]) -> Self {
         // The qualified path is required to pick the `SimdBase` loader instead of
         // recursing into this very method.
         let loaded = <Self as SimdBase<_, _>>::from_slice(simd, slice);

         loaded
     }

     /// Repeat the four bytes of `color` across all eight pixels of the vector.
     #[inline(always)]
     fn from_color(simd: S, color: [u8; 4]) -> Self {
         // Pack the pixel into one `u32` (native byte order keeps the in-memory layout),
         // splat that, and view the result as bytes again.
         let packed = u32::from_ne_bytes(color);
         let pixels = u32x8::block_splat(u32x4::splat(simd, packed));

         pixels.reinterpret_u8()
     }
 }

 /// A kernel for performing fine rasterization.
 ///
 /// A kernel fixes the numeric representation used for rendering and provides the primitive
 /// operations (packing, solid fills, masking, compositing and blending) that `Fine` builds
 /// on. The methods that construct painters come with default implementations.
 pub trait FineKernel<S: Simd>: Send + Sync + 'static {
     /// The basic underlying numerical type of the kernel.
     type Numeric: Numeric;
     /// The type that is used for blending and compositing.
     type Composite: CompositeType<Self::Numeric, S>;
     /// The base SIMD vector type for converting between u8 and f32.
     type NumericVec: NumericVec<S>;

     /// Extract the color from a premultiplied color.
     fn extract_color(color: PremulColor) -> [Self::Numeric; 4];
     /// Pack the blend buf into the given region.
     fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]);
     /// Repeatedly copy the solid color into the target buffer.
     fn copy_solid(simd: S, target: &mut [Self::Numeric], color: [Self::Numeric; 4]);
     /// Return the painter used for painting gradients.
     ///
     /// `t_vals` holds the precomputed gradient parameter for each pixel.
     fn gradient_painter<'a>(
         simd: S,
         gradient: &'a EncodedGradient,
         t_vals: &'a [f32],
     ) -> impl Painter + 'a {
         GradientPainter::new(simd, gradient, false, t_vals)
     }
     /// Return the painter used for painting gradients, with support for masking undefined locations.
     fn gradient_painter_with_undefined<'a>(
         simd: S,
         gradient: &'a EncodedGradient,
         t_vals: &'a [f32],
     ) -> impl Painter + 'a {
         GradientPainter::new(simd, gradient, true, t_vals)
     }
     /// Return the painter used for painting plain nearest-neighbor images.
     ///
     /// Plain nearest-neighbor images are images with the quality 'Low' and no skewing component in their
     /// transform.
     fn plain_nn_image_painter<'a>(
         simd: S,
         image: &'a EncodedImage,
         pixmap: &'a Pixmap,
         start_x: u16,
         start_y: u16,
     ) -> impl Painter + 'a {
         PlainNNImagePainter::new(simd, image, pixmap, start_x, start_y)
     }
     /// Return the painter used for painting nearest-neighbor images.
     ///
     /// Same as `plain_nn_image_painter`, but must also support skewing transforms.
     fn nn_image_painter<'a>(
         simd: S,
         image: &'a EncodedImage,
         pixmap: &'a Pixmap,
         start_x: u16,
         start_y: u16,
     ) -> impl Painter + 'a {
         NNImagePainter::new(simd, image, pixmap, start_x, start_y)
     }
     /// Return the painter used for painting image with `Medium` quality.
     fn medium_quality_image_painter<'a>(
         simd: S,
         image: &'a EncodedImage,
         pixmap: &'a Pixmap,
         start_x: u16,
         start_y: u16,
     ) -> impl Painter + 'a {
         FilteredImagePainter::new(simd, image, pixmap, start_x, start_y)
     }
     /// Return the painter used for painting image with `High` quality.
     ///
     /// The default implementation constructs the same `FilteredImagePainter` as the
     /// `Medium` quality variant.
     fn high_quality_image_painter<'a>(
         simd: S,
         image: &'a EncodedImage,
         pixmap: &'a Pixmap,
         start_x: u16,
         start_y: u16,
     ) -> impl Painter + 'a {
         FilteredImagePainter::new(simd, image, pixmap, start_x, start_y)
     }
     /// Return the painter used for painting blurred rounded rectangles.
     fn blurred_rounded_rectangle_painter<'a>(
         simd: S,
         rect: &'a EncodedBlurredRoundedRectangle,
         start_x: u16,
         start_y: u16,
     ) -> impl Painter + 'a {
         BlurredRoundedRectFiller::new(simd, rect, start_x, start_y)
     }
     /// Apply the mask to the destination buffer.
     ///
     /// `src` yields one vector of mask values per step.
     fn apply_mask(simd: S, dest: &mut [Self::Numeric], src: impl Iterator<Item = Self::NumericVec>);
     /// Apply the painter to the destination buffer.
     fn apply_painter<'a>(simd: S, dest: &mut [Self::Numeric], painter: impl Painter + 'a);
     /// Do basic alpha compositing with a solid color.
     ///
     /// `alphas` optionally supplies additional alpha values for the composited region.
     fn alpha_composite_solid(
         simd: S,
         target: &mut [Self::Numeric],
         src: [Self::Numeric; 4],
         alphas: Option<&[u8]>,
     );
     /// Do basic alpha compositing with the given buffer.
     ///
     /// `alphas` optionally supplies additional alpha values for the composited region.
     fn alpha_composite_buffer(
         simd: S,
         dest: &mut [Self::Numeric],
         src: &[Self::Numeric],
         alphas: Option<&[u8]>,
     );
     /// Blend the source into the destination with the given blend mode.
     ///
     /// `alphas` optionally supplies additional alpha values for the blended region.
     fn blend(
         simd: S,
         dest: &mut [Self::Numeric],
         src: impl Iterator<Item = Self::Composite>,
         blend_mode: BlendMode,
         alphas: Option<&[u8]>,
     );
 }

 /// An object for performing fine rasterization.
 ///
 /// It holds the scratch buffers that the commands of one wide tile are rendered into.
 #[derive(Debug)]
 pub struct Fine<S: Simd, T: FineKernel<S>> {
     /// The coordinates of the currently covered wide tile, in units of wide tiles.
     pub(crate) wide_coords: (u16, u16),
     /// The stack of blend buffers. Commands render into the last (topmost) entry.
     pub(crate) blend_buf: Vec<ScratchBuf<T::Numeric>>,
     /// An intermediate buffer used by shaders to store their contents.
     pub(crate) paint_buf: ScratchBuf<T::Numeric>,
     /// An intermediate buffer used by gradients to store the t values.
     pub(crate) f32_buf: Vec<f32>,
     /// The SIMD token that is passed on to every kernel operation.
     pub(crate) simd: S,
 }

 impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
     pub fn new(simd: S) -> Self {
         Self {
             simd,
             wide_coords: (0, 0),
             blend_buf: vec![[T::Numeric::ZERO; SCRATCH_BUF_SIZE]],
             f32_buf: vec![0.0; SCRATCH_BUF_SIZE / 4],
             paint_buf: [T::Numeric::ZERO; SCRATCH_BUF_SIZE],
         }
     }

     /// Set the coordinates of the wide tile that subsequent commands render into.
     pub fn set_coords(&mut self, x: u16, y: u16) {
         let (tile_x, tile_y) = &mut self.wide_coords;

         *tile_x = x;
         *tile_y = y;
     }

     /// Overwrite the topmost blend buffer with the given color.
     ///
     /// # Panics
     ///
     /// Panics if the blend buffer stack is empty.
     pub fn clear(&mut self, premul_color: PremulColor) {
         let fill_color = T::extract_color(premul_color);
         let top = self.blend_buf.last_mut().unwrap();

         T::copy_solid(self.simd, top, fill_color);
     }

     /// Pack the contents of the topmost blend buffer into `region`.
     ///
     /// # Panics
     ///
     /// Panics if the blend buffer stack is empty.
     pub fn pack(&self, region: &mut Region<'_>) {
         T::pack(self.simd, region, self.blend_buf.last().unwrap());
     }

     pub(crate) fn run_cmd(&mut self, cmd: &Cmd, alphas: &[u8], paints: &[EncodedPaint]) {
         match cmd {
             Cmd::Fill(f) => {
                 self.fill(
                     usize::from(f.x),
                     usize::from(f.width),
                     &f.paint,
                     f.blend_mode
                         .unwrap_or(BlendMode::new(Mix::Normal, Compose::SrcOver)),
                     paints,
                     None,
                 );
             }
             Cmd::AlphaFill(s) => {
                 self.fill(
                     usize::from(s.x),
                     usize::from(s.width),
                     &s.paint,
                     s.blend_mode
                         .unwrap_or(BlendMode::new(Mix::Normal, Compose::SrcOver)),
                     paints,
                     Some(&alphas[s.alpha_idx..]),
                 );
             }
             Cmd::PushBuf => {
                 self.blend_buf.push([T::Numeric::ZERO; SCRATCH_BUF_SIZE]);
             }
             Cmd::PopBuf => {
                 self.blend_buf.pop();
             }
             Cmd::ClipFill(cf) => {
                 self.clip(cf.x as usize, cf.width as usize, None);
             }
             Cmd::ClipStrip(cs) => {
                 self.clip(
                     cs.x as usize,
                     cs.width as usize,
                     Some(&alphas[cs.alpha_idx..]),
                 );
             }
             Cmd::Blend(b) => self.blend(*b),
             Cmd::Mask(m) => {
                 let start_x = self.wide_coords.0 * WideTile::WIDTH;
                 let start_y = self.wide_coords.1 * Tile::HEIGHT;

                 let blend_buf = self.blend_buf.last_mut().unwrap();

                 let width = (blend_buf.len() / (Tile::HEIGHT as usize * COLOR_COMPONENTS)) as u16;
                 let y = start_y as u32 + u32x4::from_slice(self.simd, &[0, 1, 2, 3]);

                 let iter = (start_x..(start_x + width)).map(|x| {
                     let x_in_range = x < m.width();

                     macro_rules! sample {
                         ($idx:expr) => {
                             if x_in_range && (y[$idx] as u16) < m.height() {
                                 m.sample(x, y[$idx] as u16)
                             } else {
                                 0
                             }
                         };
                     }

                     let s1 = sample!(0);
                     let s2 = sample!(1);
                     let s3 = sample!(2);
                     let s4 = sample!(3);

                     let samples = u8x16::from_slice(
                         self.simd,
                         &[
                             s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3, s4, s4, s4, s4,
                         ],
                     );
                     T::NumericVec::from_u8(self.simd, samples)
                 });

                 T::apply_mask(self.simd, blend_buf, iter);
             }
             Cmd::Opacity(o) => {
                 if *o != 1.0 {
                     let blend_buf = self.blend_buf.last_mut().unwrap();

                     T::apply_mask(
                         self.simd,
                         blend_buf,
                         iter::repeat(T::NumericVec::from_f32(
                             self.simd,
                             f32x16::splat(self.simd, *o),
                         )),
                     );
                 }
             }
         }
     }

     /// Fill at a given x and with a width using the given paint.
     ///
     /// * `x` / `width`: the pixel columns to fill, relative to the left edge of the current
     ///   wide tile.
     /// * `fill`: the paint, either a solid color or an index into `encoded_paints`.
     /// * `blend_mode`: how the paint is combined with the topmost blend buffer.
     /// * `encoded_paints`: the paints that an indexed `fill` refers to.
     /// * `alphas`: optional alpha values that are passed on to compositing and blending.
     ///
     /// # Panics
     ///
     /// Panics if the blend buffer stack is empty, if the column range does not fit into the
     /// scratch buffers, if the paint index is out of bounds, or if an image paint does not
     /// use a pixmap as its source.
     // For short strip segments, benchmarks showed that not inlining leads to significantly
     // worse performance.
     // NOTE(review): there is no `#[inline]` attribute on this method — confirm that inlining
     // happens as intended.
     pub fn fill(
         &mut self,
         x: usize,
         width: usize,
         fill: &Paint,
         blend_mode: BlendMode,
         encoded_paints: &[EncodedPaint],
         alphas: Option<&[u8]>,
     ) {
         // The part of the topmost blend buffer that covers the requested columns.
         let blend_buf = &mut self.blend_buf.last_mut().unwrap()[x * TILE_HEIGHT_COMPONENTS..]
             [..TILE_HEIGHT_COMPONENTS * width];
         let default_blend = blend_mode.is_default();

         match fill {
             Paint::Solid(color) => {
                 let color = T::extract_color(*color);

                 // If color is completely opaque, we can just directly override
                 // the blend buffer.
                 if color[3] == T::Numeric::ONE && default_blend && alphas.is_none() {
                     T::copy_solid(self.simd, blend_buf, color);

                     return;
                 }

                 if default_blend {
                     T::alpha_composite_solid(self.simd, blend_buf, color, alphas);
                 } else {
                     T::blend(
                         self.simd,
                         blend_buf,
                         iter::repeat(T::Composite::from_color(self.simd, color)),
                         blend_mode,
                         alphas,
                     );
                 }
             }
             Paint::Indexed(paint) => {
                 // Scratch area that the painter renders into before it is composited.
                 let color_buf = &mut self.paint_buf[x * TILE_HEIGHT_COMPONENTS..]
                     [..TILE_HEIGHT_COMPONENTS * width];

                 let encoded_paint = &encoded_paints[paint.index()];

                 // Pixel position of the top-left corner of the filled area.
                 let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16;
                 let start_y = self.wide_coords.1 * Tile::HEIGHT;

                 // We need to have this as a macro because closures cannot take generic arguments, and
                 // we would have to repeatedly provide all arguments if we made it a function.
                 macro_rules! fill_complex_paint {
                     ($has_opacities:expr, $filler:expr) => {
                         if $has_opacities || alphas.is_some() {
                             // Render into the scratch buffer first, then combine it with the
                             // blend buffer.
                             T::apply_painter(self.simd, color_buf, $filler);

                             if default_blend {
                                 T::alpha_composite_buffer(self.simd, blend_buf, color_buf, alphas);
                             } else {
                                 T::blend(
                                     self.simd,
                                     blend_buf,
                                     color_buf
                                         .chunks_exact(T::Composite::LENGTH)
                                         .map(|s| T::Composite::from_slice(self.simd, s)),
                                     blend_mode,
                                     alphas,
                                 );
                             }
                         } else {
                             // Similarly to solid colors we can just override the previous values
                             // if all colors in the gradient are fully opaque.
                             // NOTE(review): this branch does not look at `blend_mode` — confirm
                             // that overriding is intended for non-default blend modes as well.
                             T::apply_painter(self.simd, blend_buf, $filler);
                         }
                     };
                 }

                 match encoded_paint {
                     EncodedPaint::BlurredRoundedRect(b) => {
                         // Always treated as having opacities, so it is never painted
                         // directly into the blend buffer.
                         fill_complex_paint!(
                             true,
                             T::blurred_rounded_rectangle_painter(self.simd, b, start_x, start_y)
                         );
                     }
                     EncodedPaint::Gradient(g) => {
                         // Note that we are calculating the t values first, store them in a separate
                         // buffer and then pass that buffer to the iterator instead of calculating
                         // the t values on the fly in the iterator. The latter would be faster, but
                         // it would probably increase code size a lot, because the functions for
                         // position calculation need to be inlined for good performance.
                         let f32_buf = &mut self.f32_buf[..width * Tile::HEIGHT as usize];

                         match &g.kind {
                             EncodedKind::Linear(l) => {
                                 calculate_t_vals(
                                     self.simd,
                                     SimdLinearKind::new(self.simd, *l),
                                     f32_buf,
                                     g,
                                     start_x,
                                     start_y,
                                 );

                                 fill_complex_paint!(
                                     g.has_opacities,
                                     T::gradient_painter(self.simd, g, f32_buf)
                                 );
                             }
                             EncodedKind::Sweep(s) => {
                                 calculate_t_vals(
                                     self.simd,
                                     SimdSweepKind::new(self.simd, s),
                                     f32_buf,
                                     g,
                                     start_x,
                                     start_y,
                                 );

                                 fill_complex_paint!(
                                     g.has_opacities,
                                     T::gradient_painter(self.simd, g, f32_buf)
                                 );
                             }
                             EncodedKind::Radial(r) => {
                                 calculate_t_vals(
                                     self.simd,
                                     SimdRadialKind::new(self.simd, r),
                                     f32_buf,
                                     g,
                                     start_x,
                                     start_y,
                                 );

                                 // Only pay for masking undefined locations if the gradient
                                 // reports that it has any.
                                 if r.has_undefined() {
                                     fill_complex_paint!(
                                         g.has_opacities,
                                         T::gradient_painter_with_undefined(self.simd, g, f32_buf)
                                     );
                                 } else {
                                     fill_complex_paint!(
                                         g.has_opacities,
                                         T::gradient_painter(self.simd, g, f32_buf)
                                     );
                                 }
                             }
                         }
                     }
                     EncodedPaint::Image(i) => {
                         let ImageSource::Pixmap(pixmap) = &i.source else {
                             panic!("vello_cpu doesn't support the opaque image source.");
                         };

                         // Select the painter by (has skew, uses nearest-neighbor sampling).
                         match (i.has_skew(), i.nearest_neighbor()) {
                             (_, false) => {
                                 // Every quality other than `Medium` is handled by the
                                 // high-quality painter here.
                                 if i.quality == ImageQuality::Medium {
                                     fill_complex_paint!(
                                         i.has_opacities,
                                         T::medium_quality_image_painter(
                                             self.simd, i, pixmap, start_x, start_y
                                         )
                                     );
                                 } else {
                                     fill_complex_paint!(
                                         i.has_opacities,
                                         T::high_quality_image_painter(
                                             self.simd, i, pixmap, start_x, start_y
                                         )
                                     );
                                 }
                             }
                             (false, true) => {
                                 fill_complex_paint!(
                                     i.has_opacities,
                                     T::plain_nn_image_painter(
                                         self.simd, i, pixmap, start_x, start_y
                                     )
                                 );
                             }
                             (true, true) => {
                                 fill_complex_paint!(
                                     i.has_opacities,
                                     T::nn_image_painter(self.simd, i, pixmap, start_x, start_y)
                                 );
                             }
                         }
                     }
                 }
             }
         }
     }

     /// Blend the topmost blend buffer into the buffer below it.
     ///
     /// The default blend mode takes the plain alpha compositing path; every other mode goes
     /// through the kernel's general blending.
     ///
     /// # Panics
     ///
     /// Panics if the blend buffer stack holds fewer than two buffers.
     fn blend(&mut self, blend_mode: BlendMode) {
         let simd = self.simd;
         let (top, lower) = self.blend_buf.split_last_mut().unwrap();
         let dest = lower.last_mut().unwrap();

         if !blend_mode.is_default() {
             let src = top
                 .chunks_exact(T::Composite::LENGTH)
                 .map(|chunk| T::Composite::from_slice(simd, chunk));

             T::blend(simd, dest, src, blend_mode, None);

             return;
         }

         T::alpha_composite_buffer(simd, dest, top, None);
     }

     /// Composite a column range of the topmost blend buffer onto the same range of the
     /// buffer below it.
     ///
     /// `x` and `width` are given in pixel columns; `alphas` is passed on to the compositing.
     ///
     /// # Panics
     ///
     /// Panics if the blend buffer stack holds fewer than two buffers, or if the column range
     /// does not fit into the buffers.
     fn clip(&mut self, x: usize, width: usize, alphas: Option<&[u8]>) {
         let (top, lower) = self.blend_buf.split_last_mut().unwrap();
         let below = lower.last_mut().unwrap();

         let src = &top[x * TILE_HEIGHT_COMPONENTS..][..TILE_HEIGHT_COMPONENTS * width];
         let dest = &mut below[x * TILE_HEIGHT_COMPONENTS..][..TILE_HEIGHT_COMPONENTS * width];

         T::alpha_composite_buffer(self.simd, dest, src, alphas);
     }
 }

 /// A trait for shaders that can render their contents into a u8/f32 buffer. Note that while
 /// the trait has a method for both, f32 and u8, some shaders might only support 1 of them, so
 /// care is needed when using them.
 pub trait Painter {
     /// Render the shader's output into `buf` as `u8` components.
     fn paint_u8(&mut self, buf: &mut [u8]);
     /// Render the shader's output into `buf` as `f32` components.
     fn paint_f32(&mut self, buf: &mut [f32]);
 }

 /// Calculate the x/y position using the x/y advances for each pixel, assuming a tile height of 4.
 pub trait PosExt<S: Simd> {
     /// Create a vector of positions from the start position `pos`, where `x_advance` is the
     /// change per pixel column and `y_advance` the change per pixel row.
     fn splat_pos(simd: S, pos: f32, x_advance: f32, y_advance: f32) -> Self;
 }

 /// Positions for a single pixel column (four rows).
 impl<S: Simd> PosExt<S> for f32x4<S> {
     /// Offset `pos` by `0..=3` times `y_advance`; the x advance is irrelevant for a single
     /// column.
     #[inline(always)]
     fn splat_pos(simd: S, pos: f32, _x_advance: f32, y_advance: f32) -> Self {
         let row_indices: [f32; Tile::HEIGHT as usize] = [0.0, 1.0, 2.0, 3.0];
         let rows: Self = row_indices.simd_into(simd);

         let base = Self::splat(simd, pos);
         let row_step = Self::splat(simd, y_advance);

         // NOTE(review): presumably `a.madd(b, c)` evaluates to `a + b * c` here, giving
         // `pos + row * y_advance` — confirm against the fearless_simd version in use.
         base.madd(rows, row_step)
     }
 }

 /// Positions for two adjacent pixel columns (four rows each).
 impl<S: Simd> PosExt<S> for f32x8<S> {
     /// Combine the positions of the column at `pos` with those of the column at
     /// `pos + x_advance`.
     #[inline(always)]
     fn splat_pos(simd: S, pos: f32, x_advance: f32, y_advance: f32) -> Self {
         let first_column = f32x4::splat_pos(simd, pos, x_advance, y_advance);
         let second_column = f32x4::splat_pos(simd, pos + x_advance, x_advance, y_advance);

         simd.combine_f32x4(first_column, second_column)
     }
 }

 /// The results of an f32 shader, where each channel is stored separately.
 ///
 /// Every field holds one channel for eight values.
 pub(crate) struct ShaderResultF32<S: Simd> {
     /// The red channel.
     pub(crate) r: f32x8<S>,
     /// The green channel.
     pub(crate) g: f32x8<S>,
     /// The blue channel.
     pub(crate) b: f32x8<S>,
     /// The alpha channel.
     pub(crate) a: f32x8<S>,
 }

 impl<S: Simd> ShaderResultF32<S> {
     /// Convert the result into two f32x16 elements, interleaved as RGBA.
     #[inline(always)]
     pub(crate) fn get(&self) -> (f32x16<S>, f32x16<S>) {
         let (r_1, r_2) = self.r.simd.split_f32x8(self.r);
         let (g_1, g_2) = self.g.simd.split_f32x8(self.g);
         let (b_1, b_2) = self.b.simd.split_f32x8(self.b);
         let (a_1, a_2) = self.a.simd.split_f32x8(self.a);

         let first = self.r.simd.combine_f32x8(
             self.r.simd.combine_f32x4(r_1, g_1),
             self.r.simd.combine_f32x4(b_1, a_1),
         );

         let second = self.r.simd.combine_f32x8(
             self.r.simd.combine_f32x4(r_2, g_2),
             self.r.simd.combine_f32x4(b_2, a_2),
         );

         (first, second)
     }
 }

 /// Macros that implement `Painter` for iterators yielding 16-lane vectors.
 mod macros {
     /// The default `Painter` implementation for an iterator
     /// that returns its results as f32x16.
     ///
     /// The buffer is filled in chunks of 16 components; a trailing partial chunk is left
     /// untouched. Painting panics if the iterator ends before the buffer is filled.
     macro_rules! f32x16_painter {
         ($($type_path:tt)+) => {
             impl<S: Simd> crate::fine::Painter for $($type_path)+ {
                 fn paint_u8(&mut self, buf: &mut [u8]) {
                     use vello_common::fearless_simd::*;
                     use crate::fine::NumericVec;

                     buf.chunks_exact_mut(16).for_each(|out| {
                         let pixels = self.next().unwrap();
                         // Convert the float result to bytes before storing it.
                         let bytes = u8x16::<S>::from_f32(pixels.simd, pixels);
                         out.copy_from_slice(&bytes.val);
                     });
                 }

                 fn paint_f32(&mut self, buf: &mut [f32]) {
                     buf.chunks_exact_mut(16).for_each(|out| {
                         let pixels = self.next().unwrap();
                         out.copy_from_slice(&pixels.val);
                     });
                 }
             }
         };
     }

     /// The default `Painter` implementation for an iterator
     /// that returns its results as u8x16.
     ///
     /// The buffer is filled in chunks of 16 components; a trailing partial chunk is left
     /// untouched. Painting panics if the iterator ends before the buffer is filled.
     macro_rules! u8x16_painter {
         ($($type_path:tt)+) => {
             impl<S: Simd> crate::fine::Painter for $($type_path)+ {
                 fn paint_u8(&mut self, buf: &mut [u8]) {
                     buf.chunks_exact_mut(16).for_each(|out| {
                         let pixels = self.next().unwrap();
                         out.copy_from_slice(&pixels.val);
                     });
                 }

                 fn paint_f32(&mut self, buf: &mut [f32]) {
                     use vello_common::fearless_simd::*;
                     use crate::fine::NumericVec;

                     buf.chunks_exact_mut(16).for_each(|out| {
                         let pixels = self.next().unwrap();
                         // Convert the byte result to floats before storing it.
                         let floats = f32x16::<S>::from_u8(pixels.simd, pixels);
                         out.copy_from_slice(&floats.val);
                     });
                 }
             }
         };
     }

     pub(crate) use f32x16_painter;
     pub(crate) use u8x16_painter;
 }
	// Copyright 2025 the Vello Authors
	// SPDX-License-Identifier: Apache-2.0 OR MIT

	mod common;
	mod highp;
	mod lowp;

	use crate::peniko::{BlendMode, Compose, ImageQuality, Mix};
	use crate::region::Region;
	use alloc::vec;
	use alloc::vec::Vec;
	use core::fmt::Debug;
	use core::iter;
	use vello_common::coarse::{Cmd, WideTile};
	use vello_common::encode::{
	EncodedBlurredRoundedRectangle, EncodedGradient, EncodedImage, EncodedKind, EncodedPaint,
	};
	use vello_common::paint::{ImageSource, Paint, PremulColor};
	use vello_common::tile::Tile;

	/// Number of components stored per pixel (presumably one each for R, G, B and A).
	pub(crate) const COLOR_COMPONENTS: usize = 4;
	/// Number of components in one pixel column of a tile, i.e. `Tile::HEIGHT` pixels with
	/// `COLOR_COMPONENTS` components each.
	pub(crate) const TILE_HEIGHT_COMPONENTS: usize = Tile::HEIGHT as usize * COLOR_COMPONENTS;
	/// Number of components in a scratch buffer: `WideTile::WIDTH` columns of `Tile::HEIGHT`
	/// pixels with `COLOR_COMPONENTS` components each.
	pub const SCRATCH_BUF_SIZE: usize =
	WideTile::WIDTH as usize * Tile::HEIGHT as usize * COLOR_COMPONENTS;

	use crate::fine::common::gradient::linear::SimdLinearKind;
	use crate::fine::common::gradient::radial::SimdRadialKind;
	use crate::fine::common::gradient::sweep::SimdSweepKind;
	use crate::fine::common::gradient::{GradientPainter, calculate_t_vals};
	use crate::fine::common::image::{FilteredImagePainter, NNImagePainter, PlainNNImagePainter};
	use crate::fine::common::rounded_blurred_rect::BlurredRoundedRectFiller;
	use crate::util::{BlendModeExt, EncodedImageExt};
	pub use highp::F32Kernel;
	pub use lowp::U8Kernel;
	use vello_common::fearless_simd::{
	Simd, SimdBase, SimdFloat, SimdInto, f32x4, f32x8, f32x16, u8x16, u8x32, u32x4, u32x8,
	};
	use vello_common::pixmap::Pixmap;
	use vello_common::simd::Splat4thExt;

	/// A fixed-size scratch buffer holding `SCRATCH_BUF_SIZE` components of type `F`.
	pub type ScratchBuf<F> = [F; SCRATCH_BUF_SIZE];

	/// A scalar component type that the fine rasterizer can operate on.
	///
	/// This module implements it for `f32` and `u8`.
	pub trait Numeric: Copy + Default + Clone + Debug + PartialEq + Send + Sync + 'static {
	/// The value representing zero for this type.
	const ZERO: Self;
	/// The value representing full intensity for this type (`1.0` for `f32`, `255` for `u8`).
	const ONE: Self;
	}

	/// Floating-point components: zero is `0.0` and full intensity is `1.0`.
	impl Numeric for f32 {
	    const ZERO: f32 = 0.0_f32;
	    const ONE: f32 = 1.0_f32;
	}

	/// Byte components: zero is `0` and full intensity is `255` (`u8::MAX`).
	impl Numeric for u8 {
	    const ZERO: u8 = u8::MIN;
	    const ONE: u8 = u8::MAX;
	}

	/// A 16-lane SIMD vector that can be created from either an `f32x16` or a `u8x16`.
	///
	/// This module implements it for `f32x16` and `u8x16`, so each conversion is either the
	/// identity or a change of representation between floats and bytes.
	pub trait NumericVec<S: Simd>: Copy + Clone + Send + Sync {
	/// Create the vector from 16 `f32` lanes.
	fn from_f32(simd: S, val: f32x16<S>) -> Self;
	/// Create the vector from 16 `u8` lanes.
	fn from_u8(simd: S, val: u8x16<S>) -> Self;
	}

	/// Conversions into the floating-point vector representation.
	impl<S: Simd> NumericVec<S> for f32x16<S> {
	    /// Identity conversion: the input already has the target representation.
	    #[inline(always)]
	    fn from_f32(_simd: S, val: Self) -> Self {
	        val
	    }

	    /// Widen every byte to `f32` and scale it by `1 / 255`, so that `255` maps to `1.0`.
	    #[inline(always)]
	    fn from_u8(simd: S, val: u8x16<S>) -> Self {
	        let normalization = Self::splat(simd, 1.0 / 255.0);

	        u8_to_f32(val) * normalization
	    }
	}

	/// Conversions into the byte vector representation.
	impl<S: Simd> NumericVec<S> for u8x16<S> {
	    /// Scale the floats by `255`, add a rounding bias of `0.5` and convert to bytes.
	    #[inline(always)]
	    fn from_f32(simd: S, val: f32x16<S>) -> Self {
	        let scale = f32x16::splat(simd, 255.0);
	        let rounding_bias = f32x16::splat(simd, 0.5);

	        // NOTE(review): presumably `a.madd(b, c)` evaluates to `a + b * c` here, giving
	        // `0.5 + 255 * val` — confirm against the fearless_simd version in use.
	        f32_to_u8(rounding_bias.madd(scale, val))
	    }

	    /// Identity conversion: the input already has the target representation.
	    #[inline(always)]
	    fn from_u8(_simd: S, val: Self) -> Self {
	        val
	    }
	}

	/// Convert 16 `f32` lanes to 16 `u8` lanes.
	///
	/// The lanes are first converted to `u32` with a SIMD conversion and then narrowed to `u8`
	/// one by one with an `as` cast. No scaling is applied here; callers have to bring the
	/// values into the byte range beforehand.
	#[inline(always)]
	pub(crate) fn f32_to_u8<S: Simd>(val: f32x16<S>) -> u8x16<S> {
	    let simd = val.simd;
	    // Note that converting to u32 first using SIMD and then u8
	    // is much faster than converting directly from f32 to u8.
	    let widened = simd.cvt_u32_f32x16(val);

	    // TODO: Maybe we can also do this using SIMD?
	    let narrowed: [u8; 16] = core::array::from_fn(|lane| widened[lane] as u8);

	    narrowed.simd_into(simd)
	}

	/// Convert sixteen `u8` lanes to `f32` lanes without rescaling them.
	#[inline(always)]
	pub(crate) fn u8_to_f32<S: Simd>(val: u8x16<S>) -> f32x16<S> {
	    // TODO: SIMDify
	    let widened: [f32; 16] = val.val.map(f32::from);

	    widened.simd_into(val.simd)
	}

	/// A SIMD vector type used as the unit of blending and compositing for the numeric type `N`.
	pub trait CompositeType<N: Numeric, S: Simd>: Copy + Clone + Send + Sync {
	/// The number of `N` elements in one vector; buffers are split into chunks of this
	/// length before being passed to `from_slice`.
	const LENGTH: usize;

	/// Load a vector from `slice`.
	fn from_slice(simd: S, slice: &[N]) -> Self;
	/// Create a vector from a single four-component color.
	fn from_color(simd: S, color: [N; 4]) -> Self;
	}

	/// `f32` compositing works on vectors of 16 lanes.
	impl<S: Simd> CompositeType<f32, S> for f32x16<S> {
	const LENGTH: usize = 16;

	// The qualified path selects `SimdBase::from_slice` instead of this trait's method
	// of the same name.
	#[inline(always)]
	fn from_slice(simd: S, slice: &[f32]) -> Self {
	<Self as SimdBase<_, _>>::from_slice(simd, slice)
	}

	// Loads the color into an `f32x4` and widens it to 16 lanes via `block_splat`.
	#[inline(always)]
	fn from_color(simd: S, color: [f32; 4]) -> Self {
	Self::block_splat(f32x4::from_slice(simd, &color[..]))
	}
	}

	/// `u8` compositing works on vectors of 32 lanes.
	impl<S: Simd> CompositeType<u8, S> for u8x32<S> {
	const LENGTH: usize = 32;

	// The qualified path selects `SimdBase::from_slice` instead of this trait's method
	// of the same name.
	#[inline(always)]
	fn from_slice(simd: S, slice: &[u8]) -> Self {
	<Self as SimdBase<_, _>>::from_slice(simd, slice)
	}

	// Packs the four bytes into one `u32` (native byte order, so the in-memory byte
	// sequence is unchanged), splats it across a `u32x4`, widens that to a `u32x8` via
	// `block_splat` and reinterprets the result as 32 `u8` lanes.
	#[inline(always)]
	fn from_color(simd: S, color: [u8; 4]) -> Self {
	u32x8::block_splat(u32x4::splat(simd, u32::from_ne_bytes(color))).reinterpret_u8()
	}
	}

	/// A kernel for performing fine rasterization.
	///
	/// The painter constructors have default implementations that forward to the shared
	/// painters in `fine::common`; the remaining methods must be provided by the kernel.
	///
	/// In all painter constructors, `start_x` and `start_y` are the coordinates that
	/// `Fine::fill` computes for the first painted pixel: the wide tile origin plus the
	/// x offset of the fill.
	pub trait FineKernel<S: Simd>: Send + Sync + 'static {
	/// The basic underlying numerical type of the kernel.
	type Numeric: Numeric;
	/// The type that is used for blending and compositing.
	type Composite: CompositeType<Self::Numeric, S>;
	/// The base SIMD vector type for converting between u8 and f32.
	type NumericVec: NumericVec<S>;

	/// Extract the color from a premultiplied color.
	fn extract_color(color: PremulColor) -> [Self::Numeric; 4];
	/// Pack the blend buf into the given region.
	fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]);
	/// Repeatedly copy the solid color into the target buffer.
	fn copy_solid(simd: S, target: &mut [Self::Numeric], color: [Self::Numeric; 4]);
	/// Return the painter used for painting gradients.
	///
	/// `t_vals` holds the precomputed gradient parameter for every pixel to paint.
	fn gradient_painter<'a>(
	simd: S,
	gradient: &'a EncodedGradient,
	t_vals: &'a [f32],
	) -> impl Painter + 'a {
	GradientPainter::new(simd, gradient, false, t_vals)
	}
	/// Return the painter used for painting gradients, with support for masking undefined locations.
	fn gradient_painter_with_undefined<'a>(
	simd: S,
	gradient: &'a EncodedGradient,
	t_vals: &'a [f32],
	) -> impl Painter + 'a {
	GradientPainter::new(simd, gradient, true, t_vals)
	}
	/// Return the painter used for painting plain nearest-neighbor images.
	///
	/// Plain nearest-neighbor images are images with the quality 'Low' and no skewing component in their
	/// transform.
	fn plain_nn_image_painter<'a>(
	simd: S,
	image: &'a EncodedImage,
	pixmap: &'a Pixmap,
	start_x: u16,
	start_y: u16,
	) -> impl Painter + 'a {
	PlainNNImagePainter::new(simd, image, pixmap, start_x, start_y)
	}
	/// Return the painter used for painting nearest-neighbor images with a skewing transform.
	///
	/// Same as `plain_nn_image_painter`, but must also support skewing transforms.
	fn nn_image_painter<'a>(
	simd: S,
	image: &'a EncodedImage,
	pixmap: &'a Pixmap,
	start_x: u16,
	start_y: u16,
	) -> impl Painter + 'a {
	NNImagePainter::new(simd, image, pixmap, start_x, start_y)
	}
	/// Return the painter used for painting image with `Medium` quality.
	fn medium_quality_image_painter<'a>(
	simd: S,
	image: &'a EncodedImage,
	pixmap: &'a Pixmap,
	start_x: u16,
	start_y: u16,
	) -> impl Painter + 'a {
	FilteredImagePainter::new(simd, image, pixmap, start_x, start_y)
	}
	/// Return the painter used for painting image with `High` quality.
	///
	/// The default implementation constructs the same `FilteredImagePainter` as
	/// `medium_quality_image_painter`.
	fn high_quality_image_painter<'a>(
	simd: S,
	image: &'a EncodedImage,
	pixmap: &'a Pixmap,
	start_x: u16,
	start_y: u16,
	) -> impl Painter + 'a {
	FilteredImagePainter::new(simd, image, pixmap, start_x, start_y)
	}
	/// Return the painter used for painting blurred rounded rectangles.
	fn blurred_rounded_rectangle_painter<'a>(
	simd: S,
	rect: &'a EncodedBlurredRoundedRectangle,
	start_x: u16,
	start_y: u16,
	) -> impl Painter + 'a {
	BlurredRoundedRectFiller::new(simd, rect, start_x, start_y)
	}
	/// Apply the mask to the destination buffer.
	///
	/// `src` yields the mask values as vectors of 16 lanes.
	fn apply_mask(simd: S, dest: &mut [Self::Numeric], src: impl Iterator<Item = Self::NumericVec>);
	/// Apply the painter to the destination buffer.
	fn apply_painter<'a>(simd: S, dest: &mut [Self::Numeric], painter: impl Painter + 'a);
	/// Do basic alpha compositing with a solid color.
	///
	/// `alphas`, if present, holds additional per-pixel coverage taken from the strip
	/// alpha buffer.
	fn alpha_composite_solid(
	simd: S,
	target: &mut [Self::Numeric],
	src: [Self::Numeric; 4],
	alphas: Option<&[u8]>,
	);
	/// Do basic alpha compositing with the given buffer.
	fn alpha_composite_buffer(
	simd: S,
	dest: &mut [Self::Numeric],
	src: &[Self::Numeric],
	alphas: Option<&[u8]>,
	);
	/// Blend the source into the destination with the given blend mode.
	fn blend(
	simd: S,
	dest: &mut [Self::Numeric],
	src: impl Iterator<Item = Self::Composite>,
	blend_mode: BlendMode,
	alphas: Option<&[u8]>,
	);
	}

	/// An object for performing fine rasterization of one wide tile at a time.
	#[derive(Debug)]
	pub struct Fine<S: Simd, T: FineKernel<S>> {
	/// The coordinates of the currently covered wide tile.
	///
	/// The x coordinate is multiplied by `WideTile::WIDTH` and the y coordinate by
	/// `Tile::HEIGHT` to obtain pixel coordinates.
	pub(crate) wide_coords: (u16, u16),
	/// The stack of blend buffers.
	///
	/// The last entry is the buffer that commands currently draw into.
	pub(crate) blend_buf: Vec<ScratchBuf<T::Numeric>>,
	/// An intermediate buffer used by shaders to store their contents.
	pub(crate) paint_buf: ScratchBuf<T::Numeric>,
	/// An intermediate buffer used by gradients to store the t values.
	pub(crate) f32_buf: Vec<f32>,
	/// The SIMD token that is passed on to every kernel call.
	pub(crate) simd: S,
	}

	impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
	pub fn new(simd: S) -> Self {
	Self {
	simd,
	wide_coords: (0, 0),
	blend_buf: vec![[T::Numeric::ZERO; SCRATCH_BUF_SIZE]],
	f32_buf: vec![0.0; SCRATCH_BUF_SIZE / 4],
	paint_buf: [T::Numeric::ZERO; SCRATCH_BUF_SIZE],
	}
	}

	/// Set the coordinates of the wide tile that subsequent commands operate on.
	pub fn set_coords(&mut self, x: u16, y: u16) {
	self.wide_coords = (x, y);
	}

	/// Overwrite the top-most blend buffer with `premul_color`.
	///
	/// # Panics
	///
	/// Panics if the blend buffer stack is empty.
	pub fn clear(&mut self, premul_color: PremulColor) {
	// Convert the color into the kernel's numeric representation first.
	let converted_color = T::extract_color(premul_color);
	let blend_buf = self.blend_buf.last_mut().unwrap();

	T::copy_solid(self.simd, blend_buf, converted_color);
	}

	/// Pack the contents of the top-most blend buffer into `region`.
	///
	/// # Panics
	///
	/// Panics if the blend buffer stack is empty.
	pub fn pack(&self, region: &mut Region<'_>) {
	let blend_buf = self.blend_buf.last().unwrap();

	T::pack(self.simd, region, blend_buf);
	}

	pub(crate) fn run_cmd(&mut self, cmd: &Cmd, alphas: &[u8], paints: &[EncodedPaint]) {
	match cmd {
	Cmd::Fill(f) => {
	self.fill(
	usize::from(f.x),
	usize::from(f.width),
	&f.paint,
	f.blend_mode
	.unwrap_or(BlendMode::new(Mix::Normal, Compose::SrcOver)),
	paints,
	None,
	);
	}
	Cmd::AlphaFill(s) => {
	self.fill(
	usize::from(s.x),
	usize::from(s.width),
	&s.paint,
	s.blend_mode
	.unwrap_or(BlendMode::new(Mix::Normal, Compose::SrcOver)),
	paints,
	Some(&alphas[s.alpha_idx..]),
	);
	}
	Cmd::PushBuf => {
	self.blend_buf.push([T::Numeric::ZERO; SCRATCH_BUF_SIZE]);
	}
	Cmd::PopBuf => {
	self.blend_buf.pop();
	}
	Cmd::ClipFill(cf) => {
	self.clip(cf.x as usize, cf.width as usize, None);
	}
	Cmd::ClipStrip(cs) => {
	self.clip(
	cs.x as usize,
	cs.width as usize,
	Some(&alphas[cs.alpha_idx..]),
	);
	}
	Cmd::Blend(b) => self.blend(*b),
	Cmd::Mask(m) => {
	let start_x = self.wide_coords.0 * WideTile::WIDTH;
	let start_y = self.wide_coords.1 * Tile::HEIGHT;

	let blend_buf = self.blend_buf.last_mut().unwrap();

	let width = (blend_buf.len() / (Tile::HEIGHT as usize * COLOR_COMPONENTS)) as u16;
	let y = start_y as u32 + u32x4::from_slice(self.simd, &[0, 1, 2, 3]);

	let iter = (start_x..(start_x + width)).map(\|x\| {
	let x_in_range = x < m.width();

	macro_rules! sample {
	($idx:expr) => {
	if x_in_range && (y[$idx] as u16) < m.height() {
	m.sample(x, y[$idx] as u16)
	} else {
	0
	}
	};
	}

	let s1 = sample!(0);
	let s2 = sample!(1);
	let s3 = sample!(2);
	let s4 = sample!(3);

	let samples = u8x16::from_slice(
	self.simd,
	&[
	s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3, s4, s4, s4, s4,
	],
	);
	T::NumericVec::from_u8(self.simd, samples)
	});

	T::apply_mask(self.simd, blend_buf, iter);
	}
	Cmd::Opacity(o) => {
	if *o != 1.0 {
	let blend_buf = self.blend_buf.last_mut().unwrap();

	T::apply_mask(
	self.simd,
	blend_buf,
	iter::repeat(T::NumericVec::from_f32(
	self.simd,
	f32x16::splat(self.simd, *o),
	)),
	);
	}
	}
	}
	}

	/// Fill at a given x and with a width using the given paint.
	// For short strip segments, benchmarks showed that not inlining leads to significantly
	// worse performance.
	pub fn fill(
	&mut self,
	x: usize,
	width: usize,
	fill: &Paint,
	blend_mode: BlendMode,
	encoded_paints: &[EncodedPaint],
	alphas: Option<&[u8]>,
	) {
	let blend_buf = &mut self.blend_buf.last_mut().unwrap()[x * TILE_HEIGHT_COMPONENTS..]
	[..TILE_HEIGHT_COMPONENTS * width];
	let default_blend = blend_mode.is_default();

	match fill {
	Paint::Solid(color) => {
	let color = T::extract_color(*color);

	// If color is completely opaque, we can just directly override
	// the blend buffer.
	if color[3] == T::Numeric::ONE && default_blend && alphas.is_none() {
	T::copy_solid(self.simd, blend_buf, color);

	return;
	}

	if default_blend {
	T::alpha_composite_solid(self.simd, blend_buf, color, alphas);
	} else {
	T::blend(
	self.simd,
	blend_buf,
	iter::repeat(T::Composite::from_color(self.simd, color)),
	blend_mode,
	alphas,
	);
	}
	}
	Paint::Indexed(paint) => {
	let color_buf = &mut self.paint_buf[x * TILE_HEIGHT_COMPONENTS..]
	[..TILE_HEIGHT_COMPONENTS * width];

	let encoded_paint = &encoded_paints[paint.index()];

	let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16;
	let start_y = self.wide_coords.1 * Tile::HEIGHT;

	// We need to have this as a macro because closures cannot take generic arguments, and
	// we would have to repeatedly provide all arguments if we made it a function.
	macro_rules! fill_complex_paint {
	($has_opacities:expr, $filler:expr) => {
	if $has_opacities \|\| alphas.is_some() {
	T::apply_painter(self.simd, color_buf, $filler);

	if default_blend {
	T::alpha_composite_buffer(self.simd, blend_buf, color_buf, alphas);
	} else {
	T::blend(
	self.simd,
	blend_buf,
	color_buf
	.chunks_exact(T::Composite::LENGTH)
	.map(\|s\| T::Composite::from_slice(self.simd, s)),
	blend_mode,
	alphas,
	);
	}
	} else {
	// Similarly to solid colors we can just override the previous values
	// if all colors in the gradient are fully opaque.
	T::apply_painter(self.simd, blend_buf, $filler);
	}
	};
	}

	match encoded_paint {
	EncodedPaint::BlurredRoundedRect(b) => {
	fill_complex_paint!(
	true,
	T::blurred_rounded_rectangle_painter(self.simd, b, start_x, start_y)
	);
	}
	EncodedPaint::Gradient(g) => {
	// Note that we are calculating the t values first, store them in a separate
	// buffer and then pass that buffer to the iterator instead of calculating
	// the t values on the fly in the iterator. The latter would be faster, but
	// it would probably increase code size a lot, because the functions for
	// position calculation need to be inlined for good performance.
	let f32_buf = &mut self.f32_buf[..width * Tile::HEIGHT as usize];

	match &g.kind {
	EncodedKind::Linear(l) => {
	calculate_t_vals(
	self.simd,
	SimdLinearKind::new(self.simd, *l),
	f32_buf,
	g,
	start_x,
	start_y,
	);

	fill_complex_paint!(
	g.has_opacities,
	T::gradient_painter(self.simd, g, f32_buf)
	);
	}
	EncodedKind::Sweep(s) => {
	calculate_t_vals(
	self.simd,
	SimdSweepKind::new(self.simd, s),
	f32_buf,
	g,
	start_x,
	start_y,
	);

	fill_complex_paint!(
	g.has_opacities,
	T::gradient_painter(self.simd, g, f32_buf)
	);
	}
	EncodedKind::Radial(r) => {
	calculate_t_vals(
	self.simd,
	SimdRadialKind::new(self.simd, r),
	f32_buf,
	g,
	start_x,
	start_y,
	);

	if r.has_undefined() {
	fill_complex_paint!(
	g.has_opacities,
	T::gradient_painter_with_undefined(self.simd, g, f32_buf)
	);
	} else {
	fill_complex_paint!(
	g.has_opacities,
	T::gradient_painter(self.simd, g, f32_buf)
	);
	}
	}
	}
	}
	EncodedPaint::Image(i) => {
	let ImageSource::Pixmap(pixmap) = &i.source else {
	panic!("vello_cpu doesn't support the opaque image source.");
	};

	match (i.has_skew(), i.nearest_neighbor()) {
	(_, false) => {
	if i.quality == ImageQuality::Medium {
	fill_complex_paint!(
	i.has_opacities,
	T::medium_quality_image_painter(
	self.simd, i, pixmap, start_x, start_y
	)
	);
	} else {
	fill_complex_paint!(
	i.has_opacities,
	T::high_quality_image_painter(
	self.simd, i, pixmap, start_x, start_y
	)
	);
	}
	}
	(false, true) => {
	fill_complex_paint!(
	i.has_opacities,
	T::plain_nn_image_painter(
	self.simd, i, pixmap, start_x, start_y
	)
	);
	}
	(true, true) => {
	fill_complex_paint!(
	i.has_opacities,
	T::nn_image_painter(self.simd, i, pixmap, start_x, start_y)
	);
	}
	}
	}
	}
	}
	}
	}

	/// Blend the top-most blend buffer into the buffer below it using `blend_mode`.
	///
	/// Neither buffer is removed from the stack.
	///
	/// # Panics
	///
	/// Panics if the blend buffer stack holds fewer than two buffers.
	fn blend(&mut self, blend_mode: BlendMode) {
	    let (source_buffer, rest) = self.blend_buf.split_last_mut().unwrap();
	    let target_buffer = rest.last_mut().unwrap();

	    if blend_mode.is_default() {
	        // The default blend mode is handled by plain alpha compositing.
	        T::alpha_composite_buffer(self.simd, target_buffer, source_buffer, None);
	    } else {
	        T::blend(
	            self.simd,
	            target_buffer,
	            source_buffer
	                .chunks_exact(T::Composite::LENGTH)
	                .map(|s| T::Composite::from_slice(self.simd, s)),
	            blend_mode,
	            None,
	        );
	    }
	}

	/// Composite the pixel columns `x..x + width` of the top-most blend buffer onto the
	/// same columns of the buffer below it, optionally weighted by `alphas`.
	///
	/// # Panics
	///
	/// Panics if the blend buffer stack holds fewer than two buffers or if the column
	/// range exceeds the wide tile.
	fn clip(&mut self, x: usize, width: usize, alphas: Option<&[u8]>) {
	    let (top, below) = self.blend_buf.split_last_mut().unwrap();
	    let parent = below.last_mut().unwrap();

	    let offset = x * TILE_HEIGHT_COMPONENTS;
	    let len = TILE_HEIGHT_COMPONENTS * width;

	    let src = &top[offset..][..len];
	    let dst = &mut parent[offset..][..len];

	    T::alpha_composite_buffer(self.simd, dst, src, alphas);
	}
	}

	/// A trait for shaders that can render their contents into a u8/f32 buffer. Note that while
	/// the trait has a method for both, f32 and u8, some shaders might only support 1 of them, so
	/// care is needed when using them.
	pub trait Painter {
	/// Render the contents into a buffer of `u8` components.
	fn paint_u8(&mut self, buf: &mut [u8]);
	/// Render the contents into a buffer of `f32` components.
	fn paint_f32(&mut self, buf: &mut [f32]);
	}

	/// Calculate the x/y position using the x/y advances for each pixel, assuming a tile height of 4.
	pub trait PosExt<S: Simd> {
	/// Build a vector of positions for consecutive pixels, starting at `pos`.
	///
	/// `y_advance` is the step between the four pixels of one column and `x_advance`
	/// the step from one column to the next.
	fn splat_pos(simd: S, pos: f32, x_advance: f32, y_advance: f32) -> Self;
	}

	/// Positions for the four pixels of a single column; the x advance is not needed.
	impl<S: Simd> PosExt<S> for f32x4<S> {
	    #[inline(always)]
	    fn splat_pos(simd: S, pos: f32, _: f32, y_advance: f32) -> Self {
	        // Index of every pixel within its column.
	        const ROW_INDICES: [f32; Tile::HEIGHT as usize] = [0.0, 1.0, 2.0, 3.0];

	        let base = Self::splat(simd, pos);
	        let rows: Self = ROW_INDICES.simd_into(simd);
	        let step = Self::splat(simd, y_advance);

	        // NOTE(review): presumably evaluates `pos + row * y_advance`. Confirm against
	        // the operand order of `madd` in the `fearless_simd` version in use.
	        base.madd(rows, step)
	    }
	}

	/// Positions for two adjacent columns: the second one starts at `pos + x_advance`.
	impl<S: Simd> PosExt<S> for f32x8<S> {
	    #[inline(always)]
	    fn splat_pos(simd: S, pos: f32, x_advance: f32, y_advance: f32) -> Self {
	        let first_column = f32x4::splat_pos(simd, pos, x_advance, y_advance);
	        let second_column = f32x4::splat_pos(simd, pos + x_advance, x_advance, y_advance);

	        simd.combine_f32x4(first_column, second_column)
	    }
	}

	/// The results of an f32 shader, where each channel is stored separately.
	///
	/// Every field holds the values of one channel for eight pixels.
	pub(crate) struct ShaderResultF32<S: Simd> {
	pub(crate) r: f32x8<S>,
	pub(crate) g: f32x8<S>,
	pub(crate) b: f32x8<S>,
	pub(crate) a: f32x8<S>,
	}

	impl<S: Simd> ShaderResultF32<S> {
	    /// Convert the result into two f32x16 elements.
	    ///
	    /// The first element is assembled from the first halves of `r`, `g`, `b` and `a`
	    /// (in that order), the second element from their second halves.
	    #[inline(always)]
	    pub(crate) fn get(&self) -> (f32x16<S>, f32x16<S>) {
	        let simd = self.r.simd;

	        let (r_front, r_back) = self.r.simd.split_f32x8(self.r);
	        let (g_front, g_back) = self.g.simd.split_f32x8(self.g);
	        let (b_front, b_back) = self.b.simd.split_f32x8(self.b);
	        let (a_front, a_back) = self.a.simd.split_f32x8(self.a);

	        let front_rg = simd.combine_f32x4(r_front, g_front);
	        let front_ba = simd.combine_f32x4(b_front, a_front);
	        let back_rg = simd.combine_f32x4(r_back, g_back);
	        let back_ba = simd.combine_f32x4(b_back, a_back);

	        (
	            simd.combine_f32x8(front_rg, front_ba),
	            simd.combine_f32x8(back_rg, back_ba),
	        )
	    }
	}

	// Macros that implement `Painter` for iterator types yielding 16-lane SIMD vectors.
	//
	// Both macros fill the buffer in chunks of 16 elements, pulling one item from the
	// iterator per chunk. They panic if the iterator runs out of items before the buffer
	// is full; elements after the last complete chunk are left untouched.
	mod macros {
	/// The default `Painter` implementation for an iterator
	/// that returns its results as f32x16.
	macro_rules! f32x16_painter {
	($($type_path:tt)+) => {
	impl<S: Simd> crate::fine::Painter for $($type_path)+ {
	fn paint_u8(&mut self, buf: &mut [u8]) {
	use vello_common::fearless_simd::*;
	use crate::fine::NumericVec;

	for chunk in buf.chunks_exact_mut(16) {
	let next = self.next().unwrap();
	// Convert the f32 result to the u8 representation of the buffer.
	let converted = u8x16::<S>::from_f32(next.simd, next);
	chunk.copy_from_slice(&converted.val);
	}
	}

	fn paint_f32(&mut self, buf: &mut [f32]) {


	// The items already have the buffer's representation; copy them as they are.
	for chunk in buf.chunks_exact_mut(16) {
	let next = self.next().unwrap();
	chunk.copy_from_slice(&next.val);
	}
	}
	}
	};
	}

	/// The default `Painter` implementation for an iterator
	/// that returns its results as u8x16.
	macro_rules! u8x16_painter {
	($($type_path:tt)+) => {
	impl<S: Simd> crate::fine::Painter for $($type_path)+ {
	fn paint_u8(&mut self, buf: &mut [u8]) {
	// The items already have the buffer's representation; copy them as they are.
	for chunk in buf.chunks_exact_mut(16) {
	let next = self.next().unwrap();
	chunk.copy_from_slice(&next.val);
	}
	}

	fn paint_f32(&mut self, buf: &mut [f32]) {
	use vello_common::fearless_simd::*;
	use crate::fine::NumericVec;

	for chunk in buf.chunks_exact_mut(16) {
	let next = self.next().unwrap();
	// Convert the u8 result to the f32 representation of the buffer.
	let converted = f32x16::<S>::from_u8(next.simd, next);
	chunk.copy_from_slice(&converted.val);
	}
	}
	}
	};
	}

	// Re-export the macros so that they can be imported by path from other modules.
	pub(crate) use f32x16_painter;
	pub(crate) use u8x16_painter;
	}