| // Copyright 2025 the Vello Authors |
| // SPDX-License-Identifier: Apache-2.0 OR MIT |
| |
| use crate::peniko::{BlendMode, Compose, ImageQuality, Mix}; |
| use vello_common::encode::EncodedImage; |
| use vello_common::fearless_simd::{Simd, SimdBase, f32x4, u8x32, u16x16, u16x32}; |
| use vello_common::math::FloatExt; |
| |
| #[allow( |
| dead_code, |
| reason = "this is not used because the division by 255 is now done with SIMD, but\ |
| we still keep it around to document its properties." |
| )] |
| pub(crate) mod scalar { |
| /// Perform an approximate division by 255. |
| /// |
| /// There are three reasons for having this method. |
| /// 1) Divisions are slower than shifting + adding, and the compiler does not seem to replace |
| /// divisions by 255 with an equivalent (this was verified by benchmarking; doing / 255 was |
| /// significantly slower). |
| /// 2) Integer divisions are usually not available in SIMD, so this provides a good baseline |
| /// implementation. |
| /// 3) There are two options for performing the division: One is to perform the division |
| /// in a way that completely preserves the rounding semantics of a integer division by |
| /// 255. This could be achieved using the implementation `(val + 1 + (val >> 8)) >> 8`. |
| /// The second approach (used here) has slightly different rounding behavior to a |
| /// normal division by 255, but is much faster (see <https://github.com/linebender/vello/issues/904>) |
| /// and therefore preferable for the high-performance pipeline. |
| /// |
| /// Four properties worth mentioning: |
| /// - This actually calculates the ceiling of `val / 256`. |
| /// - Within the allowed range for `val`, rounding errors do not appear for values divisible by 255, i.e. any call `div_255(val * 255)` will always yield `val`. |
| /// - If there is a discrepancy, this division will always yield a value 1 higher than the original. |
| /// - This holds for values of `val` up to and including `65279`. You should not call this function with higher values. |
| #[inline(always)] |
| pub(crate) const fn div_255(val: u16) -> u16 { |
| debug_assert!( |
| val < 65280, |
| "the properties of `div_255` do not hold for values of `65280` or greater" |
| ); |
| (val + 255) >> 8 |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use crate::util::scalar::div_255; |
| |
| #[test] |
| fn div_255_properties() { |
| for i in 0_u16..256 * 255 { |
| let expected = i / 255; |
| let actual = div_255(i); |
| |
| assert!( |
| expected <= actual, |
| "In case of a discrepancy, the division should yield a value higher than the original." |
| ); |
| |
| let diff = expected.abs_diff(actual); |
| assert!(diff <= 1, "Rounding error shouldn't be higher than 1."); |
| |
| if i % 255 == 0 { |
| assert_eq!(diff, 0, "Division should be accurate for multiples of 255."); |
| } |
| } |
| } |
| } |
| } |
| |
| pub(crate) trait NormalizedMulExt { |
| fn normalized_mul(self, other: Self) -> Self; |
| } |
| |
| impl<S: Simd> NormalizedMulExt for u8x32<S> { |
| #[inline(always)] |
| fn normalized_mul(self, other: Self) -> Self { |
| let divided = (self.simd.widen_u8x32(self) * other.simd.widen_u8x32(other)).div_255(); |
| self.simd.narrow_u16x32(divided) |
| } |
| } |
| |
| pub(crate) trait Div255Ext { |
| fn div_255(self) -> Self; |
| } |
| |
| impl<S: Simd> Div255Ext for u16x32<S> { |
| #[inline(always)] |
| fn div_255(self) -> Self { |
| let p1 = Self::splat(self.simd, 255); |
| let p2 = self + p1; |
| p2.shr(8) |
| } |
| } |
| |
| impl<S: Simd> Div255Ext for u16x16<S> { |
| #[inline(always)] |
| fn div_255(self) -> Self { |
| let p1 = Self::splat(self.simd, 255); |
| let p2 = self + p1; |
| p2.shr(8) |
| } |
| } |
| |
| #[inline(always)] |
| pub(crate) fn normalized_mul<S: Simd>(a: u8x32<S>, b: u8x32<S>) -> u16x32<S> { |
| (S::widen_u8x32(a.simd, a) * S::widen_u8x32(b.simd, b)).div_255() |
| } |
| |
| pub(crate) trait BlendModeExt { |
| fn is_default(&self) -> bool; |
| } |
| |
| impl BlendModeExt for BlendMode { |
| // peniko uses `Clip` instead of `Normal` as the default, hence this override. |
| fn is_default(&self) -> bool { |
| matches!(self.mix, Mix::Normal | Mix::Clip) && self.compose == Compose::SrcOver |
| } |
| } |
| |
| pub(crate) trait EncodedImageExt { |
| fn has_skew(&self) -> bool; |
| fn nearest_neighbor(&self) -> bool; |
| } |
| |
| impl EncodedImageExt for EncodedImage { |
| fn has_skew(&self) -> bool { |
| !(self.x_advance.y as f32).is_nearly_zero() || !(self.y_advance.x as f32).is_nearly_zero() |
| } |
| |
| fn nearest_neighbor(&self) -> bool { |
| self.quality == ImageQuality::Low |
| } |
| } |
| |
| pub(crate) trait Premultiply { |
| fn premultiply(self, alphas: Self) -> Self; |
| fn unpremultiply(self, alphas: Self) -> Self; |
| } |
| |
| impl<S: Simd> Premultiply for f32x4<S> { |
| #[inline(always)] |
| fn premultiply(self, alphas: Self) -> Self { |
| self * alphas |
| } |
| |
| #[inline(always)] |
| fn unpremultiply(self, alphas: Self) -> Self { |
| let zero = Self::splat(alphas.simd, 0.0); |
| let divided = self / alphas; |
| |
| self.simd |
| .select_f32x4(self.simd.simd_eq_f32x4(alphas, zero), zero, divided) |
| } |
| } |