Add std/png decoder.filter_4_distance_?_arm_neon
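Benchmarks on a Raspberry Pi 4 (32-bit armv7l) with -march=native and
-mfpu=neon (here, "native" means "armv8-a+crc+simd"). The columns are the
benchmark name, the old speed, the new speed and the relative change: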
wuffs_png_decode_filt_4_dist_3/clang9 180MB/s ± 0% 217MB/s ± 0% +20.57% (p=0.008 n=5+5)
wuffs_png_decode_filt_4_dist_4/clang9 161MB/s ± 0% 287MB/s ± 0% +78.75% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9 99.1MB/s ± 0% 99.4MB/s ± 0% +0.27% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 123MB/s ± 0% 165MB/s ± 0% +33.61% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 112MB/s ± 0% 146MB/s ± 0% +29.64% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9 93.1MB/s ± 0% 93.2MB/s ± 0% +0.15% (p=0.040 n=5+5)
wuffs_png_decode_filt_4_dist_3/gcc8 80.5MB/s ± 0% 211.7MB/s ± 0% +163.12% (p=0.008 n=5+5)
wuffs_png_decode_filt_4_dist_4/gcc8 77.3MB/s ± 0% 280.4MB/s ± 0% +262.98% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc8 98.1MB/s ± 0% 100.8MB/s ± 0% +2.78% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc8 114MB/s ± 0% 162MB/s ± 0% +41.95% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc8 103MB/s ± 0% 140MB/s ± 0% +36.07% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc8 93.2MB/s ± 0% 94.1MB/s ± 0% +0.96% (p=0.008 n=5+5)
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 3bb90ea..beec84d 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -529,19 +529,37 @@
methodStr := method.Str(g.tm)
vreinterpretU8Uxx, vreinterpretUxxU8, vreinterpretClose := "", "", ""
if armNeon {
+ vreinterpretClose = ")"
switch {
+ case methodStr == "vmovn_u16":
+ vreinterpretU8Uxx = "("
+ vreinterpretUxxU8 = "vreinterpretq_u16_u8("
+ case methodStr == "vmovn_u32":
+ vreinterpretU8Uxx = "vreinterpret_u8_u16("
+ vreinterpretUxxU8 = "vreinterpretq_u32_u8("
+ case methodStr == "vmovn_u64":
+ vreinterpretU8Uxx = "vreinterpret_u8_u32("
+ vreinterpretUxxU8 = "vreinterpretq_u64_u8("
+ case strings.HasSuffix(methodStr, "q_u16"):
+ vreinterpretU8Uxx = "vreinterpretq_u8_u16("
+ vreinterpretUxxU8 = "vreinterpretq_u16_u8("
+ case strings.HasSuffix(methodStr, "q_u32"):
+ vreinterpretU8Uxx = "vreinterpretq_u8_u32("
+ vreinterpretUxxU8 = "vreinterpretq_u32_u8("
+ case strings.HasSuffix(methodStr, "q_u64"):
+ vreinterpretU8Uxx = "vreinterpretq_u8_u64("
+ vreinterpretUxxU8 = "vreinterpretq_u64_u8("
case strings.HasSuffix(methodStr, "_u16"):
vreinterpretU8Uxx = "vreinterpret_u8_u16("
vreinterpretUxxU8 = "vreinterpret_u16_u8("
- vreinterpretClose = ")"
case strings.HasSuffix(methodStr, "_u32"):
vreinterpretU8Uxx = "vreinterpret_u8_u32("
vreinterpretUxxU8 = "vreinterpret_u32_u8("
- vreinterpretClose = ")"
case strings.HasSuffix(methodStr, "_u64"):
vreinterpretU8Uxx = "vreinterpret_u8_u64("
vreinterpretUxxU8 = "vreinterpret_u64_u8("
- vreinterpretClose = ")"
+ default:
+ vreinterpretClose = ""
}
}
@@ -587,8 +605,24 @@
b.writes(vreinterpretClose)
} else {
+ postArgsAfter := ""
if armCRC32U32 {
b.writeb('_')
+ } else if armNeon {
+ // TODO: generate this table automatically?
+ postArgsAfter = ")"
+ switch methodStr {
+ case "vabdl_u16", "vaddl_u16",
+ "vabdq_u32", "vcleq_u32":
+ b.writes("vreinterpretq_u8_u32(")
+ case "vabdl_u32", "vaddl_u32":
+ b.writes("vreinterpretq_u8_u64(")
+ case "vabdl_u8", "vaddl_u8",
+ "vabdq_u16", "vcleq_u16":
+ b.writes("vreinterpretq_u8_u16(")
+ default:
+ postArgsAfter = ""
+ }
}
b.printf("%s(", methodStr)
@@ -622,6 +656,7 @@
}
b.writes(after)
}
+ b.writes(postArgsAfter)
}
b.writes(")")
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 57de40a..ef8018c 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -553,10 +553,17 @@
"arm_neon_64.create_vdup_n_u64(a: u64) arm_neon_64",
"arm_neon_64.create_vdup_n_u8(a: u8) arm_neon_64",
+ "arm_neon_64.vabdl_u16(b: arm_neon_64) arm_neon_128",
+ "arm_neon_64.vabdl_u32(b: arm_neon_64) arm_neon_128",
+ "arm_neon_64.vabdl_u8(b: arm_neon_64) arm_neon_128",
"arm_neon_64.vadd_u16(b: arm_neon_64) arm_neon_64",
"arm_neon_64.vadd_u32(b: arm_neon_64) arm_neon_64",
"arm_neon_64.vadd_u64(b: arm_neon_64) arm_neon_64",
"arm_neon_64.vadd_u8(b: arm_neon_64) arm_neon_64",
+ "arm_neon_64.vaddl_u16(b: arm_neon_64) arm_neon_128",
+ "arm_neon_64.vaddl_u32(b: arm_neon_64) arm_neon_128",
+ "arm_neon_64.vaddl_u8(b: arm_neon_64) arm_neon_128",
+ "arm_neon_64.vbsl_u8(b: arm_neon_64, c: arm_neon_64) arm_neon_64",
"arm_neon_64.vget_lane_u16(b: u32[..= 3]) u16",
"arm_neon_64.vget_lane_u32(b: u32[..= 1]) u32",
"arm_neon_64.vget_lane_u64(b: u32[..= 0]) u64",
@@ -567,6 +574,17 @@
// ---- arm_neon_128
+ "arm_neon_128.vabdq_u16(b: arm_neon_128) arm_neon_128",
+ "arm_neon_128.vabdq_u32(b: arm_neon_128) arm_neon_128",
+ "arm_neon_128.vabdq_u8(b: arm_neon_128) arm_neon_128",
+ "arm_neon_128.vandq_u8(b: arm_neon_128) arm_neon_128",
+ "arm_neon_128.vcleq_u16(b: arm_neon_128) arm_neon_128",
+ "arm_neon_128.vcleq_u32(b: arm_neon_128) arm_neon_128",
+ "arm_neon_128.vcleq_u8(b: arm_neon_128) arm_neon_128",
+ "arm_neon_128.vmovn_u16() arm_neon_64",
+ "arm_neon_128.vmovn_u32() arm_neon_64",
+ "arm_neon_128.vmovn_u64() arm_neon_64",
+
// ---- x86_m128i
"x86_m128i.load_u32!(a: u32)",
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index a07dae4..05afd04 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -31215,6 +31215,22 @@
wuffs_base__slice_u8 a_prev);
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_3_arm_neon(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev);
+#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_4_arm_neon(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev);
+#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
static wuffs_base__empty_struct
wuffs_png__decoder__filter_1(
wuffs_png__decoder* self,
@@ -31645,6 +31661,226 @@
}
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+// -------- func png.decoder.filter_4_distance_3_arm_neon
+
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct  // always wuffs_base__make_empty_struct()
+wuffs_png__decoder__filter_4_distance_3_arm_neon(
+ wuffs_png__decoder* self,  // not referenced in this body
+ wuffs_base__slice_u8 a_curr,  // rewritten in place, 3 bytes per step (via v_c)
+ wuffs_base__slice_u8 a_prev) {  // only read from (via v_p)
+ wuffs_base__slice_u8 v_c = {0};  // moving window into a_curr
+ wuffs_base__slice_u8 v_p = {0};  // moving window into a_prev
+ uint8x8_t v_fa = {0};  // "a": previous step's output (zero before the first step)
+ uint8x8_t v_fb = {0};  // "b": bytes loaded from a_prev for this step
+ uint8x8_t v_fc = {0};  // "c": previous step's "b" (zero before the first step)
+ uint8x8_t v_fx = {0};  // bytes loaded from a_curr, then the value stored back
+ uint8x16_t v_fafb = {0};  // a + b, widened to 16-bit lanes
+ uint8x16_t v_fcfc = {0};  // c + c, widened to 16-bit lanes
+ uint8x16_t v_pa = {0};  // |b - c|
+ uint8x16_t v_pb = {0};  // |a - c|
+ uint8x16_t v_pc = {0};  // |(a + b) - (c + c)|
+ uint8x16_t v_cmpab = {0};  // lane mask: pa <= pb
+ uint8x16_t v_cmpac = {0};  // lane mask: pa <= pc
+ uint8x8_t v_picka = {0};  // narrowed mask: select a
+ uint8x8_t v_pickb = {0};  // narrowed mask: select b (else c) when a is not selected
+
+ v_fa = vreinterpret_u8_u32(vdup_n_u32(0));
+ v_fc = vreinterpret_u8_u32(vdup_n_u32(0));
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c.ptr = i_slice_c.ptr;
+ wuffs_base__slice_u8 i_slice_p = a_prev;
+ v_p.ptr = i_slice_p.ptr;
+ i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));  // walk only the shorter of the two lengths
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 7, 6);  // NOTE(review): 7 looks like offset 3 + a 4-byte read, 6 like two advances of 3 - confirm against the helper
+ while (v_c.ptr < i_end0_c) {  // loop 1: two steps per pass, 4-byte loads
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // 4-byte load although the step is 3 bytes
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));  // fafb = a + b
+ v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));  // fcfc = c + c
+ v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));  // pa = |b - c|
+ v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));  // pb = |a - c|
+ v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+ v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+ v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+ v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));  // (pa <= pb) && (pa <= pc)
+ v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+ v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));  // add the selected a, b or c to the loaded bytes
+ wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));  // u24 store of lane 0
+ v_fc = v_fb;  // this step's b is the next step's c
+ v_fa = v_fx;  // this step's output is the next step's a
+ v_c.ptr += 3;
+ v_p.ptr += 3;
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // second unrolled copy of the same step
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+ v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+ v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+ v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+ v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+ v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+ v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+ v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+ v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+ v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+ wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fc = v_fb;
+ v_fa = v_fx;
+ v_c.ptr += 3;
+ v_p.ptr += 3;
+ }
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 4, 3);  // NOTE(review): presumably 4 bytes needed, 3 advanced per step - confirm against the helper
+ while (v_c.ptr < i_end1_c) {  // loop 2: one step per pass, still 4-byte loads
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+ v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+ v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+ v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+ v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+ v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+ v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+ v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+ v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+ v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+ wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fc = v_fb;
+ v_fa = v_fx;
+ v_c.ptr += 3;
+ v_p.ptr += 3;
+ }
+ v_c.len = 3;
+ v_p.len = 3;
+ uint8_t* i_end2_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);  // remaining length rounded down to a multiple of 3
+ while (v_c.ptr < i_end2_c) {  // loop 3: same step using u24 loads instead of u32 loads
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u24le__no_bounds_check(v_p.ptr)));
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr)));
+ v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+ v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+ v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+ v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+ v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+ v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+ v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+ v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+ v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+ v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+ wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));  // v_fa and v_fc are not carried forward in this loop
+ v_c.ptr += 3;
+ v_p.ptr += 3;
+ }
+ v_c.len = 0;
+ v_p.len = 0;
+ }
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
+// -------- func png.decoder.filter_4_distance_4_arm_neon
+
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct  // always wuffs_base__make_empty_struct()
+wuffs_png__decoder__filter_4_distance_4_arm_neon(
+ wuffs_png__decoder* self,  // not referenced in this body
+ wuffs_base__slice_u8 a_curr,  // rewritten in place, 4 bytes per step (via v_c)
+ wuffs_base__slice_u8 a_prev) {  // only read from (via v_p)
+ wuffs_base__slice_u8 v_c = {0};  // moving window into a_curr
+ wuffs_base__slice_u8 v_p = {0};  // moving window into a_prev
+ uint8x8_t v_fa = {0};  // "a": previous step's output (zero before the first step)
+ uint8x8_t v_fb = {0};  // "b": bytes loaded from a_prev for this step
+ uint8x8_t v_fc = {0};  // "c": previous step's "b" (zero before the first step)
+ uint8x8_t v_fx = {0};  // bytes loaded from a_curr, then the value stored back
+ uint8x16_t v_fafb = {0};  // a + b, widened to 16-bit lanes
+ uint8x16_t v_fcfc = {0};  // c + c, widened to 16-bit lanes
+ uint8x16_t v_pa = {0};  // |b - c|
+ uint8x16_t v_pb = {0};  // |a - c|
+ uint8x16_t v_pc = {0};  // |(a + b) - (c + c)|
+ uint8x16_t v_cmpab = {0};  // lane mask: pa <= pb
+ uint8x16_t v_cmpac = {0};  // lane mask: pa <= pc
+ uint8x8_t v_picka = {0};  // narrowed mask: select a
+ uint8x8_t v_pickb = {0};  // narrowed mask: select b (else c) when a is not selected
+
+ v_fa = vreinterpret_u8_u32(vdup_n_u32(0));
+ v_fc = vreinterpret_u8_u32(vdup_n_u32(0));
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c.ptr = i_slice_c.ptr;
+ wuffs_base__slice_u8 i_slice_p = a_prev;
+ v_p.ptr = i_slice_p.ptr;
+ i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));  // walk only the shorter of the two lengths
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);  // length rounded down to a multiple of 8 (two 4-byte steps)
+ while (v_c.ptr < i_end0_c) {  // loop 1: two steps per pass
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));  // fafb = a + b
+ v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));  // fcfc = c + c
+ v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));  // pa = |b - c|
+ v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));  // pb = |a - c|
+ v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+ v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+ v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+ v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));  // (pa <= pb) && (pa <= pc)
+ v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+ v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));  // add the selected a, b or c to the loaded bytes
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));  // u32 store of lane 0
+ v_fc = v_fb;  // this step's b is the next step's c
+ v_fa = v_fx;  // this step's output is the next step's a
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // second unrolled copy of the same step
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+ v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+ v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+ v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+ v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+ v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+ v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+ v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+ v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+ v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fc = v_fb;
+ v_fa = v_fx;
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);  // remaining length rounded down to a multiple of 4
+ while (v_c.ptr < i_end1_c) {  // loop 2: one step per pass, for what loop 1 left over
+ v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+ v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+ v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+ v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+ v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+ v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+ v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+ v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+ v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+ v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+ v_fc = v_fb;
+ v_fa = v_fx;
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ v_c.len = 0;
+ v_p.len = 0;
+ }
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
// -------- func png.decoder.filter_1
static wuffs_base__empty_struct
@@ -33377,6 +33613,9 @@
self->private_impl.choosy_filter_3 = (
&wuffs_png__decoder__filter_3_distance_3_fallback);
self->private_impl.choosy_filter_4 = (
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+ wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_png__decoder__filter_4_distance_3_arm_neon :
+#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_4_distance_3_sse42 :
#endif
@@ -33399,6 +33638,9 @@
#endif
&wuffs_png__decoder__filter_3_distance_4_fallback);
self->private_impl.choosy_filter_4 = (
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+ wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_png__decoder__filter_4_distance_4_arm_neon :
+#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_4_distance_4_sse42 :
#endif
diff --git a/std/png/decode_filter_arm_neon.wuffs b/std/png/decode_filter_arm_neon.wuffs
index 157c209..339fdc2 100644
--- a/std/png/decode_filter_arm_neon.wuffs
+++ b/std/png/decode_filter_arm_neon.wuffs
@@ -79,3 +79,131 @@
}
}
}
+
+// --------
+
+// Filter 4: Paeth.
+
+pri func decoder.filter_4_distance_3_arm_neon!(curr: slice base.u8, prev: slice base.u8),
+ choose cpu_arch >= arm_neon,
+{
+ // Reverses the Paeth filter on curr in place, 3 bytes per step, reading
+ // prev. See the comments in filter_4_distance_4_arm_neon for how this
+ // works. That function's single loop is copied twice here, once with
+ // "length: 4" and once with "length: 3", as loading 4 bytes at a time is
+ // generally faster than loading 3.
+ // Differences between that function and this one are marked with a §.
+
+ var c : slice base.u8 // moving window into args.curr
+ var p : slice base.u8 // moving window into args.prev
+ var fa : base.arm_neon_64 // "a": the previous step's output
+ var fb : base.arm_neon_64 // "b": the bytes loaded from p for this step
+ var fc : base.arm_neon_64 // "c": the previous step's "b"
+ var fx : base.arm_neon_64 // the bytes loaded from c, then the value stored back
+ var fafb : base.arm_neon_128 // fa + fb
+ var fcfc : base.arm_neon_128 // fc + fc
+ var pa : base.arm_neon_128 // abs(fb - fc)
+ var pb : base.arm_neon_128 // abs(fa - fc)
+ var pc : base.arm_neon_128 // abs(fafb - fcfc)
+ var cmpab : base.arm_neon_128 // pa <= pb
+ var cmpac : base.arm_neon_128 // pa <= pc
+ var picka : base.arm_neon_64 // (pa <= pb) && (pa <= pc)
+ var pickb : base.arm_neon_64 // pb <= pc
+
+ fa = fa.create_vdup_n_u32(a: 0) // "a" and "c" are zero for the first step
+ fc = fc.create_vdup_n_u32(a: 0)
+
+ // § The advance is 3, not 4.
+ iterate (c = args.curr, p = args.prev)(length: 4, advance: 3, unroll: 2) {
+ fb = fb.create_vdup_n_u32(a: p.peek_u32le()) // 4 bytes are loaded, 3 are stored below
+ fx = fx.create_vdup_n_u32(a: c.peek_u32le())
+ fafb = fa.vaddl_u8(b: fb) // fafb = (fa + fb)
+ fcfc = fc.vaddl_u8(b: fc) // fcfc = (fc + fc)
+ pa = fb.vabdl_u8(b: fc) // pa = abs(fb - fc)
+ pb = fa.vabdl_u8(b: fc) // pb = abs(fa - fc)
+ pc = fafb.vabdq_u16(b: fcfc) // pc = abs(fafb - fcfc)
+ cmpab = pa.vcleq_u16(b: pb) // cmpab = (pa <= pb)
+ cmpac = pa.vcleq_u16(b: pc) // cmpac = (pa <= pc)
+ picka = cmpab.vandq_u8(b: cmpac).vmovn_u16() // picka = ((pa <= pb) && (pa <= pc))
+ pickb = pb.vcleq_u16(b: pc).vmovn_u16() // pickb = (pb <= pc)
+ fx = fx.vadd_u8(
+ b: picka.vbsl_u8(b: fa,
+ c: pickb.vbsl_u8(b: fb, c: fc)))
+ // § poke_u24le replaces poke_u32le.
+ c.poke_u24le!(a: fx.vget_lane_u32(b: 0))
+ fc = fb // this step's "b" is the next step's "c"
+ fa = fx // this step's output is the next step's "a"
+ } else (length: 3, advance: 3, unroll: 1) {
+ // § peek_u24le_as_u32 replaces peek_u32le.
+ fb = fb.create_vdup_n_u32(a: p.peek_u24le_as_u32())
+ // § peek_u24le_as_u32 replaces peek_u32le.
+ fx = fx.create_vdup_n_u32(a: c.peek_u24le_as_u32())
+ fafb = fa.vaddl_u8(b: fb) // fafb = (fa + fb)
+ fcfc = fc.vaddl_u8(b: fc) // fcfc = (fc + fc)
+ pa = fb.vabdl_u8(b: fc) // pa = abs(fb - fc)
+ pb = fa.vabdl_u8(b: fc) // pb = abs(fa - fc)
+ pc = fafb.vabdq_u16(b: fcfc) // pc = abs(fafb - fcfc)
+ cmpab = pa.vcleq_u16(b: pb) // cmpab = (pa <= pb)
+ cmpac = pa.vcleq_u16(b: pc) // cmpac = (pa <= pc)
+ picka = cmpab.vandq_u8(b: cmpac).vmovn_u16() // picka = ((pa <= pb) && (pa <= pc))
+ pickb = pb.vcleq_u16(b: pc).vmovn_u16() // pickb = (pb <= pc)
+ fx = fx.vadd_u8(
+ b: picka.vbsl_u8(b: fa,
+ c: pickb.vbsl_u8(b: fb, c: fc)))
+ // § poke_u24le replaces poke_u32le.
+ c.poke_u24le!(a: fx.vget_lane_u32(b: 0))
+ // § These assignments are unnecessary; this is the last iteration.
+ // fc = fb
+ // fa = fx
+ }
+}
+
+pri func decoder.filter_4_distance_4_arm_neon!(curr: slice base.u8, prev: slice base.u8),
+ choose cpu_arch >= arm_neon,
+{
+ var c : slice base.u8 // moving window into args.curr
+ var p : slice base.u8 // moving window into args.prev
+ var fa : base.arm_neon_64 // "a": the previous step's output
+ var fb : base.arm_neon_64 // "b": the bytes loaded from p for this step
+ var fc : base.arm_neon_64 // "c": the previous step's "b"
+ var fx : base.arm_neon_64 // the bytes loaded from c, then the value stored back
+ var fafb : base.arm_neon_128 // fa + fb
+ var fcfc : base.arm_neon_128 // fc + fc
+ var pa : base.arm_neon_128 // abs(fb - fc)
+ var pb : base.arm_neon_128 // abs(fa - fc)
+ var pc : base.arm_neon_128 // abs(fafb - fcfc)
+ var cmpab : base.arm_neon_128 // pa <= pb
+ var cmpac : base.arm_neon_128 // pa <= pc
+ var picka : base.arm_neon_64 // (pa <= pb) && (pa <= pc)
+ var pickb : base.arm_neon_64 // pb <= pc
+
+ fa = fa.create_vdup_n_u32(a: 0) // "a" and "c" are zero for the first step
+ fc = fc.create_vdup_n_u32(a: 0)
+
+ iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 2) {
+ fb = fb.create_vdup_n_u32(a: p.peek_u32le()) // fb = 4 bytes from prev
+ fx = fx.create_vdup_n_u32(a: c.peek_u32le()) // fx = 4 bytes from curr
+
+ fafb = fa.vaddl_u8(b: fb) // fafb = (fa + fb)
+ fcfc = fc.vaddl_u8(b: fc) // fcfc = (fc + fc)
+
+ pa = fb.vabdl_u8(b: fc) // pa = abs(fa + fb - fc - fa) = abs(fb - fc)
+ pb = fa.vabdl_u8(b: fc) // pb = abs(fa + fb - fc - fb) = abs(fa - fc)
+ pc = fafb.vabdq_u16(b: fcfc) // pc = abs(fa + fb - fc - fc) = abs(fafb - fcfc)
+
+ cmpab = pa.vcleq_u16(b: pb) // cmpab = (pa <= pb)
+ cmpac = pa.vcleq_u16(b: pc) // cmpac = (pa <= pc)
+
+ picka = cmpab.vandq_u8(b: cmpac).vmovn_u16() // picka = ((pa <= pb) && (pa <= pc))
+ pickb = pb.vcleq_u16(b: pc).vmovn_u16() // pickb = (pb <= pc)
+
+ // Add the predictor (fa if picka, else fb if pickb, else fc) to the residual.
+ fx = fx.vadd_u8(
+ b: picka.vbsl_u8(b: fa,
+ c: pickb.vbsl_u8(b: fb, c: fc)))
+
+ c.poke_u32le!(a: fx.vget_lane_u32(b: 0)) // write the result back over curr
+ fc = fb // this step's "b" is the next step's "c"
+ fa = fx // this step's output is the next step's "a"
+ }
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index 956d901..cc6b8a4 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -376,6 +376,7 @@
choose filter_1 = [filter_1_distance_3_fallback]
choose filter_3 = [filter_3_distance_3_fallback]
choose filter_4 = [
+ filter_4_distance_3_arm_neon,
filter_4_distance_3_sse42,
filter_4_distance_3_fallback]
} else if this.filter_distance == 4 {
@@ -388,6 +389,7 @@
filter_3_distance_4_sse42,
filter_3_distance_4_fallback]
choose filter_4 = [
+ filter_4_distance_4_arm_neon,
filter_4_distance_4_sse42,
filter_4_distance_4_fallback]
}