Add std/png decoder.filter_3_distance_4_arm_neon
On a Raspberry Pi 4 (32-bit armv7l) with -march=native and -mfpu=neon
("native" means "armv8-a+crc+simd"):
wuffs_png_decode_filt_3_dist_4/clang9 639MB/s ± 0% 906MB/s ± 0% +41.71% (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_4/gcc8 656MB/s ± 0% 714MB/s ± 0% +8.89% (p=0.008 n=5+5)
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 589aaa3..57de40a 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -561,6 +561,9 @@
"arm_neon_64.vget_lane_u32(b: u32[..= 1]) u32",
"arm_neon_64.vget_lane_u64(b: u32[..= 0]) u64",
"arm_neon_64.vget_lane_u8(b: u32[..= 7]) u8",
+ "arm_neon_64.vhadd_u16(b: arm_neon_64) arm_neon_64",
+ "arm_neon_64.vhadd_u32(b: arm_neon_64) arm_neon_64",
+ "arm_neon_64.vhadd_u8(b: arm_neon_64) arm_neon_64",
// ---- arm_neon_128
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 04a07a7..a07dae4 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -31207,6 +31207,14 @@
wuffs_base__slice_u8 a_curr);
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_arm_neon(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev);  // Forward declaration only; the body is added further down in this same file.
+#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
static wuffs_base__empty_struct
wuffs_png__decoder__filter_1(
wuffs_png__decoder* self,
@@ -31546,6 +31554,97 @@
}
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+// -------- func png.decoder.filter_3_distance_4_arm_neon
+// ARM NEON path for PNG filter 3 (Average), 4 bytes per step; rewrites a_curr in place. Trailing bytes past the last whole 4-byte step are not touched.
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_arm_neon(
+ wuffs_png__decoder* self,  // Not referenced in this body.
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev) {
+ wuffs_base__slice_u8 v_c = {0};
+ wuffs_base__slice_u8 v_p = {0};
+ uint8x8_t v_fa = {0};
+ uint8x8_t v_fb = {0};
+ uint8x8_t v_fx = {0};
+ // v_fa: the 4 bytes written by the previous step; v_fb: the 4 bytes of a_prev at the current offset; v_fx: the current 4 bytes.
+ v_fa = vreinterpret_u8_u32(vdup_n_u32(0));
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(0));
+ if (((uint64_t)(a_prev.len)) == 0) {  // Empty a_prev: v_fb is never reloaded, so it stays all-zero.
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c.ptr = i_slice_c.ptr;
+ v_c.len = 4;
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);  // End of the part consumed 8 bytes (two 4-byte steps) per iteration.
+ while (v_c.ptr < i_end0_c) {
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));  // Load 4 bytes of a_curr.
+ v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));  // Per byte: add the halving sum of v_fa and v_fb.
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));  // Write lane 0 (4 bytes) back over the input.
+ v_fa = v_fx;  // This step's output feeds the next step.
+ v_c.ptr += 4;
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));  // Second (unrolled) 4-byte step, same as above.
+ v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fa = v_fx;
+ v_c.ptr += 4;
+ }
+ v_c.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);  // End of the remaining whole 4-byte steps.
+ while (v_c.ptr < i_end1_c) {
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fa = v_fx;
+ v_c.ptr += 4;
+ }
+ v_c.len = 0;
+ }
+ } else {  // Non-empty a_prev: same steps, but v_fb is reloaded from a_prev each time.
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c.ptr = i_slice_c.ptr;
+ wuffs_base__slice_u8 i_slice_p = a_prev;
+ v_p.ptr = i_slice_p.ptr;
+ i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));  // Walk only as far as the shorter of the two slices.
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);  // End of the part consumed 8 bytes per iteration.
+ while (v_c.ptr < i_end0_c) {
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // Load the 4 bytes of a_prev at the same offset.
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fa = v_fx;
+ v_c.ptr += 4;
+ v_p.ptr += 4;  // Both cursors advance together.
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fa = v_fx;
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);  // End of the remaining whole 4-byte steps.
+ while (v_c.ptr < i_end1_c) {
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fa = v_fx;
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ v_c.len = 0;
+ v_p.len = 0;
+ }
+ }
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
// -------- func png.decoder.filter_1
static wuffs_base__empty_struct
@@ -33292,6 +33391,9 @@
#endif
&wuffs_png__decoder__filter_1_distance_4_fallback);
self->private_impl.choosy_filter_3 = (
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+ wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_png__decoder__filter_3_distance_4_arm_neon :
+#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_3_distance_4_sse42 :
#endif
diff --git a/std/png/decode_filter_arm_neon.wuffs b/std/png/decode_filter_arm_neon.wuffs
index a5a1c0c..157c209 100644
--- a/std/png/decode_filter_arm_neon.wuffs
+++ b/std/png/decode_filter_arm_neon.wuffs
@@ -44,3 +44,38 @@
fa = fx
}
}
+
+// --------
+
+// Filter 3: Average. Working 4 bytes at a time, each byte of curr is replaced
+// by itself plus the halving sum of the previous output (fa) and prev (fb).
+pri func decoder.filter_3_distance_4_arm_neon!(curr: slice base.u8, prev: slice base.u8),
+ choose cpu_arch >= arm_neon,
+{
+ var c : slice base.u8
+ var p : slice base.u8
+ var fa : base.arm_neon_64
+ var fb : base.arm_neon_64
+ var fx : base.arm_neon_64
+ // fa: the 4 bytes written by the previous step. fb: the 4 bytes of prev at the same offset. Both start as zero.
+ fa = fa.create_vdup_n_u32(a: 0)
+ fb = fb.create_vdup_n_u32(a: 0)
+
+ if args.prev.length() == 0 {  // Empty prev: fb is never reloaded, so it stays zero.
+ iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
+ fx = fx.create_vdup_n_u32(a: c.peek_u32le())  // Load 4 bytes of curr.
+ fx = fx.vadd_u8(b: fa.vhadd_u8(b: fb))  // Per byte: add the halving sum of fa and fb.
+ c.poke_u32le!(a: fx.vget_lane_u32(b: 0))  // Write the 4 result bytes back in place.
+ fa = fx  // This step's output feeds the next step.
+ }
+
+ } else {
+ iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 2) {
+ fb = fb.create_vdup_n_u32(a: p.peek_u32le())  // Load the 4 bytes of prev at the same offset.
+ fx = fx.create_vdup_n_u32(a: c.peek_u32le())
+ fx = fx.vadd_u8(b: fa.vhadd_u8(b: fb))
+ c.poke_u32le!(a: fx.vget_lane_u32(b: 0))
+ fa = fx
+ }
+ }
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index 7157fa2..956d901 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -384,6 +384,7 @@
filter_1_distance_4_sse42,
filter_1_distance_4_fallback]
choose filter_3 = [
+ filter_3_distance_4_arm_neon,
filter_3_distance_4_sse42,
filter_3_distance_4_fallback]
choose filter_4 = [