Lift image extend calculation out of the hot loop for the simple case

diff --git a/sparse_strips/vello_cpu/src/fine/image.rs b/sparse_strips/vello_cpu/src/fine/image.rs
index e9e2840..77e54a7 100644
--- a/sparse_strips/vello_cpu/src/fine/image.rs
+++ b/sparse_strips/vello_cpu/src/fine/image.rs

@@ -4,7 +4,8 @@
 use crate::fine::{COLOR_COMPONENTS, FineType, Painter, TILE_HEIGHT_COMPONENTS};
 use vello_common::encode::EncodedImage;
 use vello_common::kurbo::{Point, Vec2};
-use vello_common::peniko::{Extend, ImageQuality};
+use vello_common::peniko;
+use vello_common::peniko::ImageQuality;
 use vello_common::tile::Tile;
 
 #[cfg(not(feature = "std"))]
@@ -70,37 +71,11 @@
                     self.cur_pos += self.image.x_advance;
                 });
         } else {
-            // Fast path. Each step in the x/y direction only updates x/y component of the
-            // current position, since we have no skewing.
-            // Most importantly, the y position is the same across each column, allowing us
-            // to precompute it (as well as it's extend).
-            let mut x_pos = self.cur_pos.x;
-            let x_advance = self.image.x_advance.x;
-            let y_advance = self.image.y_advance.y;
-
-            let mut y_positions = [0.0; Tile::HEIGHT as usize];
-
-            for (idx, pos) in y_positions.iter_mut().enumerate() {
-                *pos = extend(
-                    (self.cur_pos.y + y_advance * idx as f64) as f32,
-                    self.image.extends.1,
-                    self.height,
-                    self.height_inv,
-                );
+            match self.image.extends.0 {
+                peniko::Extend::Pad => self.run_simple::<F, Pad>(target),
+                peniko::Extend::Repeat => self.run_simple::<F, Repeat>(target),
+                peniko::Extend::Reflect => self.run_simple::<F, Reflect>(target),
             }
-
-            target
-                .chunks_exact_mut(TILE_HEIGHT_COMPONENTS)
-                .for_each(|column| {
-                    let extended_x_pos = extend(
-                        x_pos as f32,
-                        self.image.extends.0,
-                        self.width,
-                        self.width_inv,
-                    );
-                    self.run_simple_column(column, extended_x_pos, &y_positions);
-                    x_pos += x_advance;
-                });
         }
     }
 
@@ -109,29 +84,48 @@
         clippy::trivially_copy_pass_by_ref,
         reason = "Tile::HEIGHT is expected to increase later."
     )]
-    fn run_simple_column<F: FineType>(
-        &mut self,
-        col: &mut [F],
-        x_pos: f32,
-        y_positions: &[f32; Tile::HEIGHT as usize],
-    ) {
-        for (pixel, y_pos) in col
-            .chunks_exact_mut(COLOR_COMPONENTS)
-            .zip(y_positions.iter())
-        {
-            let sample = match self.image.quality {
-                ImageQuality::Low => F::from_rgba8(
-                    &self
-                        .image
-                        .pixmap
-                        .sample(x_pos as u16, *y_pos as u16)
-                        .to_u8_array()[..],
-                ),
-                ImageQuality::Medium | ImageQuality::High => unimplemented!(),
-            };
+    fn run_simple<F: FineType, E: Extend>(&mut self, target: &mut [F]) {
+        // Fast path. Each step in the x/y direction only updates x/y component of the
+        // current position, since we have no skewing.
+        // Most importantly, the y position is the same across each column, allowing us
+        // to precompute it (as well as its extend).
+        let mut x_pos = self.cur_pos.x;
+        let x_advance = self.image.x_advance.x;
+        let y_advance = self.image.y_advance.y;
 
-            pixel.copy_from_slice(&sample);
+        let mut y_positions = [0.0; Tile::HEIGHT as usize];
+
+        for (idx, pos) in y_positions.iter_mut().enumerate() {
+            *pos = extend(
+                (self.cur_pos.y + y_advance * idx as f64) as f32,
+                self.image.extends.1,
+                self.height,
+                self.height_inv,
+            );
         }
+
+        target
+            .chunks_exact_mut(TILE_HEIGHT_COMPONENTS)
+            .for_each(|column| {
+                let extended_x_pos = E::extend(x_pos as f32, self.width, self.width_inv);
+
+                for (pixel, y_pos) in column
+                    .chunks_exact_mut(COLOR_COMPONENTS)
+                    .zip(y_positions.iter())
+                {
+                    let sample = F::from_rgba8(
+                        &self
+                            .image
+                            .pixmap
+                            .sample(extended_x_pos as u16, *y_pos as u16)
+                            .to_u8_array()[..],
+                    );
+
+                    pixel.copy_from_slice(&sample);
+                }
+
+                x_pos += x_advance;
+            });
     }
 
     fn run_complex_column<F: FineType>(&mut self, col: &mut [F]) {
@@ -264,33 +258,60 @@
 }
 
 #[inline(always)]
-fn extend(val: f32, extend: Extend, max: f32, inv_max: f32) -> f32 {
-    // We cannot chose f32::EPSILON here because for example 30.0 - f32::EPSILON is still 30.0.
-    // This bias should be large enough for all numbers that we support (i.e. <= u16::MAX).
-    const BIAS: f32 = 0.01;
-
+fn extend(val: f32, extend: peniko::Extend, max: f32, inv_max: f32) -> f32 {
     match extend {
+        peniko::Extend::Pad => Pad::extend(val, max, inv_max),
+        peniko::Extend::Repeat => Repeat::extend(val, max, inv_max),
+        peniko::Extend::Reflect => Reflect::extend(val, max, inv_max),
+    }
+}
+
+trait Extend {
+    fn extend(val: f32, max: f32, inv_max: f32) -> f32;
+}
+
+struct Pad;
+impl Extend for Pad {
+    #[inline(always)]
+    fn extend(val: f32, max: f32, _: f32) -> f32 {
+        // We cannot chose f32::EPSILON here because for example 30.0 - f32::EPSILON is still 30.0.
+        // This bias should be large enough for all numbers that we support (i.e. <= u16::MAX).
+        const BIAS: f32 = 0.01;
+
         // Note that max should be exclusive, so subtract a small bias to enforce that.
         // Otherwise, we might sample out-of-bounds pixels.
         // Also note that we intentionally don't use `clamp` here, because it's slower than
         // doing `min` + `max`.
-        Extend::Pad => val.min(max - BIAS).max(0.0),
-        Extend::Repeat => val - floor(val * inv_max) * max,
+        val.min(max - BIAS).max(0.0)
+    }
+}
+
+struct Repeat;
+impl Extend for Repeat {
+    #[inline(always)]
+    fn extend(val: f32, max: f32, inv_max: f32) -> f32 {
+        val - floor(val * inv_max) * max
+    }
+}
+
+struct Reflect;
+impl Extend for Reflect {
+    #[inline(always)]
+    fn extend(val: f32, max: f32, inv_max: f32) -> f32 {
         // <https://github.com/google/skia/blob/220738774f7a0ce4a6c7bd17519a336e5e5dea5b/src/opts/SkRasterPipeline_opts.h#L3274-L3290>
-        Extend::Reflect => {
-            let u = val - floor(val * inv_max * 0.5) * 2.0 * max;
-            let s = floor(u * inv_max);
-            let m = u - 2.0 * s * (u - max);
 
-            let bias_in_ulps = s.trunc();
+        let u = val - floor(val * inv_max * 0.5) * 2.0 * max;
+        let s = floor(u * inv_max);
+        let m = u - 2.0 * s * (u - max);
 
-            let m_bits = m.to_bits();
-            // This would yield NaN if `m` is 0 and `bias_in_ulps` > 0, but since
-            // our `max` is always an integer number, u and s must also be an integer number
-            // and thus `m_bits` must be 0.
-            let biased_bits = m_bits.wrapping_sub(bias_in_ulps as u32);
-            f32::from_bits(biased_bits)
-        }
+        let bias_in_ulps = s.trunc();
+
+        let m_bits = m.to_bits();
+        // This would yield NaN if `m` is 0 and `bias_in_ulps` > 0, but since
+        // our `max` is always an integer number, u and s must also be an integer number
+        // and thus `m_bits` must be 0.
+        let biased_bits = m_bits.wrapping_sub(bias_in_ulps as u32);
+        f32::from_bits(biased_bits)
     }
 }