Partly SIMDify the `f32_to_u8` method (#1102)
This reduces runtime by about 7% on the bilinear image rendering benchmark
on NEON:
```
Running benches/main.rs (/Users/lstampfl/Programming/GitHub/vello/target/release/deps/main-011ad84cd531a979)
fine/image/quality/medium_u8_neon
time: [3.4155 µs 3.4190 µs 3.4231 µs]
change: [-7.3623% -7.1971% -7.0174%] (p = 0.00 < 0.05)
Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
2 (2.00%) high mild
5 (5.00%) high severe
```
On WASM, it drops the frame time from 40ms to 30ms, which is a huge
difference! I'm not sure whether the u32 to u8 conversion can easily be
SIMDified as well (as far as I can tell, `tiny-skia` also only uses SIMD
for the f32 to u32 conversion), but I'm open to suggestions.

diff --git a/sparse_strips/vello_cpu/src/fine/mod.rs b/sparse_strips/vello_cpu/src/fine/mod.rs
index 07f849b..9feb999 100644
--- a/sparse_strips/vello_cpu/src/fine/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/mod.rs
@@ -91,24 +91,29 @@
#[inline(always)]
pub(crate) fn f32_to_u8<S: Simd>(val: f32x16<S>) -> u8x16<S> {
- // TODO: SIMDify
+ let simd = val.simd;
+ // Note that converting to u32 first using SIMD and then u8
+ // is much faster than converting directly from f32 to u8.
+ let converted = simd.cvt_u32_f32x16(val);
+
+ // TODO: Maybe we can also do this using SIMD?
[
- val.val[0] as u8,
- val.val[1] as u8,
- val.val[2] as u8,
- val.val[3] as u8,
- val.val[4] as u8,
- val.val[5] as u8,
- val.val[6] as u8,
- val.val[7] as u8,
- val.val[8] as u8,
- val.val[9] as u8,
- val.val[10] as u8,
- val.val[11] as u8,
- val.val[12] as u8,
- val.val[13] as u8,
- val.val[14] as u8,
- val.val[15] as u8,
+ converted[0] as u8,
+ converted[1] as u8,
+ converted[2] as u8,
+ converted[3] as u8,
+ converted[4] as u8,
+ converted[5] as u8,
+ converted[6] as u8,
+ converted[7] as u8,
+ converted[8] as u8,
+ converted[9] as u8,
+ converted[10] as u8,
+ converted[11] as u8,
+ converted[12] as u8,
+ converted[13] as u8,
+ converted[14] as u8,
+ converted[15] as u8,
]
.simd_into(val.simd)
}