Add std/png decoder.filter_4_distance_?_arm_neon

On a Raspberry Pi 4 (32-bit armv7l) with -march=native and -mfpu=neon
("native" means "armv8-a+crc+simd"):

wuffs_png_decode_filt_4_dist_3/clang9                      180MB/s ± 0%    217MB/s ± 0%   +20.57%  (p=0.008 n=5+5)
wuffs_png_decode_filt_4_dist_4/clang9                      161MB/s ± 0%    287MB/s ± 0%   +78.75%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9                   99.1MB/s ± 0%   99.4MB/s ± 0%    +0.27%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9   123MB/s ± 0%    165MB/s ± 0%   +33.61%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9   112MB/s ± 0%    146MB/s ± 0%   +29.64%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9                 93.1MB/s ± 0%   93.2MB/s ± 0%    +0.15%  (p=0.040 n=5+5)

wuffs_png_decode_filt_4_dist_3/gcc8                       80.5MB/s ± 0%  211.7MB/s ± 0%  +163.12%  (p=0.008 n=5+5)
wuffs_png_decode_filt_4_dist_4/gcc8                       77.3MB/s ± 0%  280.4MB/s ± 0%  +262.98%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc8                     98.1MB/s ± 0%  100.8MB/s ± 0%    +2.78%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc8     114MB/s ± 0%    162MB/s ± 0%   +41.95%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc8     103MB/s ± 0%    140MB/s ± 0%   +36.07%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc8                   93.2MB/s ± 0%   94.1MB/s ± 0%    +0.96%  (p=0.008 n=5+5)
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 3bb90ea..beec84d 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -529,19 +529,37 @@
 	methodStr := method.Str(g.tm)
 	vreinterpretU8Uxx, vreinterpretUxxU8, vreinterpretClose := "", "", ""
 	if armNeon {
+		vreinterpretClose = ")"
 		switch {
+		case methodStr == "vmovn_u16":
+			vreinterpretU8Uxx = "("
+			vreinterpretUxxU8 = "vreinterpretq_u16_u8("
+		case methodStr == "vmovn_u32":
+			vreinterpretU8Uxx = "vreinterpret_u8_u16("
+			vreinterpretUxxU8 = "vreinterpretq_u32_u8("
+		case methodStr == "vmovn_u64":
+			vreinterpretU8Uxx = "vreinterpret_u8_u32("
+			vreinterpretUxxU8 = "vreinterpretq_u64_u8("
+		case strings.HasSuffix(methodStr, "q_u16"):
+			vreinterpretU8Uxx = "vreinterpretq_u8_u16("
+			vreinterpretUxxU8 = "vreinterpretq_u16_u8("
+		case strings.HasSuffix(methodStr, "q_u32"):
+			vreinterpretU8Uxx = "vreinterpretq_u8_u32("
+			vreinterpretUxxU8 = "vreinterpretq_u32_u8("
+		case strings.HasSuffix(methodStr, "q_u64"):
+			vreinterpretU8Uxx = "vreinterpretq_u8_u64("
+			vreinterpretUxxU8 = "vreinterpretq_u64_u8("
 		case strings.HasSuffix(methodStr, "_u16"):
 			vreinterpretU8Uxx = "vreinterpret_u8_u16("
 			vreinterpretUxxU8 = "vreinterpret_u16_u8("
-			vreinterpretClose = ")"
 		case strings.HasSuffix(methodStr, "_u32"):
 			vreinterpretU8Uxx = "vreinterpret_u8_u32("
 			vreinterpretUxxU8 = "vreinterpret_u32_u8("
-			vreinterpretClose = ")"
 		case strings.HasSuffix(methodStr, "_u64"):
 			vreinterpretU8Uxx = "vreinterpret_u8_u64("
 			vreinterpretUxxU8 = "vreinterpret_u64_u8("
-			vreinterpretClose = ")"
+		default:
+			vreinterpretClose = ""
 		}
 	}
 
@@ -587,8 +605,24 @@
 		b.writes(vreinterpretClose)
 
 	} else {
+		postArgsAfter := ""
 		if armCRC32U32 {
 			b.writeb('_')
+		} else if armNeon {
+			// TODO: generate this table automatically?
+			postArgsAfter = ")"
+			switch methodStr {
+			case "vabdl_u16", "vaddl_u16",
+				"vabdq_u32", "vcleq_u32":
+				b.writes("vreinterpretq_u8_u32(")
+			case "vabdl_u32", "vaddl_u32":
+				b.writes("vreinterpretq_u8_u64(")
+			case "vabdl_u8", "vaddl_u8",
+				"vabdq_u16", "vcleq_u16":
+				b.writes("vreinterpretq_u8_u16(")
+			default:
+				postArgsAfter = ""
+			}
 		}
 		b.printf("%s(", methodStr)
 
@@ -622,6 +656,7 @@
 			}
 			b.writes(after)
 		}
+		b.writes(postArgsAfter)
 	}
 
 	b.writes(")")
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 57de40a..ef8018c 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -553,10 +553,17 @@
 	"arm_neon_64.create_vdup_n_u64(a: u64) arm_neon_64",
 	"arm_neon_64.create_vdup_n_u8(a: u8) arm_neon_64",
 
+	"arm_neon_64.vabdl_u16(b: arm_neon_64) arm_neon_128",
+	"arm_neon_64.vabdl_u32(b: arm_neon_64) arm_neon_128",
+	"arm_neon_64.vabdl_u8(b: arm_neon_64) arm_neon_128",
 	"arm_neon_64.vadd_u16(b: arm_neon_64) arm_neon_64",
 	"arm_neon_64.vadd_u32(b: arm_neon_64) arm_neon_64",
 	"arm_neon_64.vadd_u64(b: arm_neon_64) arm_neon_64",
 	"arm_neon_64.vadd_u8(b: arm_neon_64) arm_neon_64",
+	"arm_neon_64.vaddl_u16(b: arm_neon_64) arm_neon_128",
+	"arm_neon_64.vaddl_u32(b: arm_neon_64) arm_neon_128",
+	"arm_neon_64.vaddl_u8(b: arm_neon_64) arm_neon_128",
+	"arm_neon_64.vbsl_u8(b: arm_neon_64, c: arm_neon_64) arm_neon_64",
 	"arm_neon_64.vget_lane_u16(b: u32[..= 3]) u16",
 	"arm_neon_64.vget_lane_u32(b: u32[..= 1]) u32",
 	"arm_neon_64.vget_lane_u64(b: u32[..= 0]) u64",
@@ -567,6 +574,17 @@
 
 	// ---- arm_neon_128
 
+	"arm_neon_128.vabdq_u16(b: arm_neon_128) arm_neon_128",
+	"arm_neon_128.vabdq_u32(b: arm_neon_128) arm_neon_128",
+	"arm_neon_128.vabdq_u8(b: arm_neon_128) arm_neon_128",
+	"arm_neon_128.vandq_u8(b: arm_neon_128) arm_neon_128",
+	"arm_neon_128.vcleq_u16(b: arm_neon_128) arm_neon_128",
+	"arm_neon_128.vcleq_u32(b: arm_neon_128) arm_neon_128",
+	"arm_neon_128.vcleq_u8(b: arm_neon_128) arm_neon_128",
+	"arm_neon_128.vmovn_u16() arm_neon_64",
+	"arm_neon_128.vmovn_u32() arm_neon_64",
+	"arm_neon_128.vmovn_u64() arm_neon_64",
+
 	// ---- x86_m128i
 
 	"x86_m128i.load_u32!(a: u32)",
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index a07dae4..05afd04 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -31215,6 +31215,22 @@
     wuffs_base__slice_u8 a_prev);
 #endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
 
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_3_arm_neon(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev);
+#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_4_arm_neon(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev);
+#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
 static wuffs_base__empty_struct
 wuffs_png__decoder__filter_1(
     wuffs_png__decoder* self,
@@ -31645,6 +31661,226 @@
 }
 #endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
 
+// -------- func png.decoder.filter_4_distance_3_arm_neon
+
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_3_arm_neon(
+    wuffs_png__decoder* self,  // Not referenced in this body.
+    wuffs_base__slice_u8 a_curr,  // Rewritten in place, 3 bytes per step.
+    wuffs_base__slice_u8 a_prev) {  // Only read.
+  wuffs_base__slice_u8 v_c = {0};  // Cursor into a_curr.
+  wuffs_base__slice_u8 v_p = {0};  // Cursor into a_prev.
+  uint8x8_t v_fa = {0};  // Previous step's v_fx (zero before the first step).
+  uint8x8_t v_fb = {0};  // Bytes loaded from a_prev for this step.
+  uint8x8_t v_fc = {0};  // Previous step's v_fb (zero before the first step).
+  uint8x8_t v_fx = {0};  // Bytes loaded from a_curr for this step, then the result.
+  uint8x16_t v_fafb = {0};
+  uint8x16_t v_fcfc = {0};
+  uint8x16_t v_pa = {0};
+  uint8x16_t v_pb = {0};
+  uint8x16_t v_pc = {0};
+  uint8x16_t v_cmpab = {0};
+  uint8x16_t v_cmpac = {0};
+  uint8x8_t v_picka = {0};
+  uint8x8_t v_pickb = {0};
+
+  v_fa = vreinterpret_u8_u32(vdup_n_u32(0));
+  v_fc = vreinterpret_u8_u32(vdup_n_u32(0));
+  {
+    wuffs_base__slice_u8 i_slice_c = a_curr;
+    v_c.ptr = i_slice_c.ptr;
+    wuffs_base__slice_u8 i_slice_p = a_prev;
+    v_p.ptr = i_slice_p.ptr;
+    i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));  // Walk no further than the smaller of the two lengths.
+    v_c.len = 4;
+    v_p.len = 4;
+    uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 7, 6);  // NOTE(review): presumably 7 = 4-byte load + one 3-byte advance, 6 = two 3-byte advances; confirm against the helper.
+    while (v_c.ptr < i_end0_c) {  // Loop 1: two steps per pass, each loading 4 bytes and advancing 3.
+      v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // 4 bytes from a_prev.
+      v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));  // 4 bytes from a_curr.
+      v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));  // fa + fb, widened to 16-bit lanes.
+      v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));  // fc + fc, widened to 16-bit lanes.
+      v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));  // pa = |fb - fc|
+      v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));  // pb = |fa - fc|
+      v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));  // pc = |(fa + fb) - (fc + fc)|
+      v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));  // pa <= pb
+      v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));  // pa <= pc
+      v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));  // (pa <= pb) & (pa <= pc), narrowed to 8-bit lanes.
+      v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));  // pb <= pc, narrowed to 8-bit lanes.
+      v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));  // fx += picka ? fa : (pickb ? fb : fc), selected per bit.
+      wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));  // Store only the low 3 bytes of lane 0.
+      v_fc = v_fb;  // Carry fb forward as the next step's fc.
+      v_fa = v_fx;  // Carry the result forward as the next step's fa.
+      v_c.ptr += 3;
+      v_p.ptr += 3;
+      v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // Second unrolled copy of the same step.
+      v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+      v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+      v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+      v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+      v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+      v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+      v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+      v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+      v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+      v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+      wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+      v_fc = v_fb;
+      v_fa = v_fx;
+      v_c.ptr += 3;
+      v_p.ptr += 3;
+    }
+    v_c.len = 4;
+    v_p.len = 4;
+    uint8_t* i_end1_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 4, 3);  // Same helper with (4, 3): one step per pass.
+    while (v_c.ptr < i_end1_c) {  // Loop 2: the same step as loop 1, not unrolled.
+      v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+      v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+      v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+      v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+      v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+      v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+      v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+      v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+      v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+      v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+      wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+      v_fc = v_fb;
+      v_fa = v_fx;
+      v_c.ptr += 3;
+      v_p.ptr += 3;
+    }
+    v_c.len = 3;
+    v_p.len = 3;
+    uint8_t* i_end2_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);  // Whole 3-byte groups still remaining.
+    while (v_c.ptr < i_end2_c) {  // Loop 3: tail, using 3-byte loads instead of 4-byte loads.
+      v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u24le__no_bounds_check(v_p.ptr)));
+      v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr)));
+      v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+      v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+      v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+      v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+      v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+      v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+      v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+      v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+      v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+      v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+      wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+      v_c.ptr += 3;  // Unlike loops 1 and 2, v_fa and v_fc are not updated here.
+      v_p.ptr += 3;
+    }
+    v_c.len = 0;
+    v_p.len = 0;
+  }
+  return wuffs_base__make_empty_struct();
+}
+#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
+// -------- func png.decoder.filter_4_distance_4_arm_neon
+
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_4_arm_neon(
+    wuffs_png__decoder* self,  // Not referenced in this body.
+    wuffs_base__slice_u8 a_curr,  // Rewritten in place, 4 bytes per step.
+    wuffs_base__slice_u8 a_prev) {  // Only read.
+  wuffs_base__slice_u8 v_c = {0};  // Cursor into a_curr.
+  wuffs_base__slice_u8 v_p = {0};  // Cursor into a_prev.
+  uint8x8_t v_fa = {0};  // Previous step's v_fx (zero before the first step).
+  uint8x8_t v_fb = {0};  // Bytes loaded from a_prev for this step.
+  uint8x8_t v_fc = {0};  // Previous step's v_fb (zero before the first step).
+  uint8x8_t v_fx = {0};  // Bytes loaded from a_curr for this step, then the result.
+  uint8x16_t v_fafb = {0};
+  uint8x16_t v_fcfc = {0};
+  uint8x16_t v_pa = {0};
+  uint8x16_t v_pb = {0};
+  uint8x16_t v_pc = {0};
+  uint8x16_t v_cmpab = {0};
+  uint8x16_t v_cmpac = {0};
+  uint8x8_t v_picka = {0};
+  uint8x8_t v_pickb = {0};
+
+  v_fa = vreinterpret_u8_u32(vdup_n_u32(0));
+  v_fc = vreinterpret_u8_u32(vdup_n_u32(0));
+  {
+    wuffs_base__slice_u8 i_slice_c = a_curr;
+    v_c.ptr = i_slice_c.ptr;
+    wuffs_base__slice_u8 i_slice_p = a_prev;
+    v_p.ptr = i_slice_p.ptr;
+    i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));  // Walk no further than the smaller of the two lengths.
+    v_c.len = 4;
+    v_p.len = 4;
+    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);  // Whole 8-byte spans: two 4-byte steps per pass.
+    while (v_c.ptr < i_end0_c) {  // Loop 1: two steps per pass.
+      v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // 4 bytes from a_prev.
+      v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));  // 4 bytes from a_curr.
+      v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));  // fa + fb, widened to 16-bit lanes.
+      v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));  // fc + fc, widened to 16-bit lanes.
+      v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));  // pa = |fb - fc|
+      v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));  // pb = |fa - fc|
+      v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));  // pc = |(fa + fb) - (fc + fc)|
+      v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));  // pa <= pb
+      v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));  // pa <= pc
+      v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));  // (pa <= pb) & (pa <= pc), narrowed to 8-bit lanes.
+      v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));  // pb <= pc, narrowed to 8-bit lanes.
+      v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));  // fx += picka ? fa : (pickb ? fb : fc), selected per bit.
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));  // Store lane 0 (4 bytes) back into a_curr.
+      v_fc = v_fb;  // Carry fb forward as the next step's fc.
+      v_fa = v_fx;  // Carry the result forward as the next step's fa.
+      v_c.ptr += 4;
+      v_p.ptr += 4;
+      v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // Second unrolled copy of the same step.
+      v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+      v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+      v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+      v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+      v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+      v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+      v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+      v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+      v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+      v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+      v_fc = v_fb;
+      v_fa = v_fx;
+      v_c.ptr += 4;
+      v_p.ptr += 4;
+    }
+    v_c.len = 4;
+    v_p.len = 4;
+    uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);  // Whole 4-byte groups still remaining.
+    while (v_c.ptr < i_end1_c) {  // Loop 2: the same step as loop 1, not unrolled.
+      v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_fafb = vreinterpretq_u8_u16(vaddl_u8(v_fa, v_fb));
+      v_fcfc = vreinterpretq_u8_u16(vaddl_u8(v_fc, v_fc));
+      v_pa = vreinterpretq_u8_u16(vabdl_u8(v_fb, v_fc));
+      v_pb = vreinterpretq_u8_u16(vabdl_u8(v_fa, v_fc));
+      v_pc = vreinterpretq_u8_u16(vabdq_u16(vreinterpretq_u16_u8(v_fafb), vreinterpretq_u16_u8(v_fcfc)));
+      v_cmpab = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pb)));
+      v_cmpac = vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pa), vreinterpretq_u16_u8(v_pc)));
+      v_picka = vmovn_u16(vreinterpretq_u16_u8(vandq_u8(v_cmpab, v_cmpac)));
+      v_pickb = vmovn_u16(vreinterpretq_u16_u8(vreinterpretq_u8_u16(vcleq_u16(vreinterpretq_u16_u8(v_pb), vreinterpretq_u16_u8(v_pc)))));
+      v_fx = vadd_u8(v_fx, vbsl_u8(v_picka, v_fa, vbsl_u8(v_pickb, v_fb, v_fc)));
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+      v_fc = v_fb;
+      v_fa = v_fx;
+      v_c.ptr += 4;
+      v_p.ptr += 4;
+    }
+    v_c.len = 0;
+    v_p.len = 0;
+  }
+  return wuffs_base__make_empty_struct();
+}
+#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
 // -------- func png.decoder.filter_1
 
 static wuffs_base__empty_struct
@@ -33377,6 +33613,9 @@
     self->private_impl.choosy_filter_3 = (
         &wuffs_png__decoder__filter_3_distance_3_fallback);
     self->private_impl.choosy_filter_4 = (
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+        wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_png__decoder__filter_4_distance_3_arm_neon :
+#endif
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
         wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_4_distance_3_sse42 :
 #endif
@@ -33399,6 +33638,9 @@
 #endif
         &wuffs_png__decoder__filter_3_distance_4_fallback);
     self->private_impl.choosy_filter_4 = (
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+        wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_png__decoder__filter_4_distance_4_arm_neon :
+#endif
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
         wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_4_distance_4_sse42 :
 #endif
diff --git a/std/png/decode_filter_arm_neon.wuffs b/std/png/decode_filter_arm_neon.wuffs
index 157c209..339fdc2 100644
--- a/std/png/decode_filter_arm_neon.wuffs
+++ b/std/png/decode_filter_arm_neon.wuffs
@@ -79,3 +79,131 @@
 		}
 	}
 }
+
+// --------
+
+// Filter 4: Paeth.
+
+pri func decoder.filter_4_distance_3_arm_neon!(curr: slice base.u8, prev: slice base.u8),
+	choose cpu_arch >= arm_neon,
+{
+	// See the comments in filter_4_distance_4_arm_neon for an explanation of
+	// how this works. That function's single loop is copied twice here, once
+	// with "length: 4" and once with "length: 3". It's generally faster to
+	// load 4 bytes at a time instead of 3.
+	//
+	// Differences between that function and this one are marked with a §.
+	// Each step rewrites 3 bytes of curr in place; prev is only read.
+	var c     : slice base.u8  // Window into args.curr.
+	var p     : slice base.u8  // Window into args.prev.
+	var fa    : base.arm_neon_64  // Previous step's fx (zero before the first step).
+	var fb    : base.arm_neon_64  // Bytes loaded from p for this step.
+	var fc    : base.arm_neon_64  // Previous step's fb (zero before the first step).
+	var fx    : base.arm_neon_64  // Bytes loaded from c for this step, then the result.
+	var fafb  : base.arm_neon_128
+	var fcfc  : base.arm_neon_128
+	var pa    : base.arm_neon_128
+	var pb    : base.arm_neon_128
+	var pc    : base.arm_neon_128
+	var cmpab : base.arm_neon_128
+	var cmpac : base.arm_neon_128
+	var picka : base.arm_neon_64
+	var pickb : base.arm_neon_64
+
+	fa = fa.create_vdup_n_u32(a: 0)
+	fc = fc.create_vdup_n_u32(a: 0)
+
+	// § The advance is 3, not 4.
+	iterate (c = args.curr, p = args.prev)(length: 4, advance: 3, unroll: 2) {
+		fb = fb.create_vdup_n_u32(a: p.peek_u32le())  // Reads 4 bytes although the step advances only 3.
+		fx = fx.create_vdup_n_u32(a: c.peek_u32le())
+		fafb = fa.vaddl_u8(b: fb)  // fafb = (fa + fb)
+		fcfc = fc.vaddl_u8(b: fc)  // fcfc = (fc + fc)
+		pa = fb.vabdl_u8(b: fc)  // pa = abs(fb - fc)
+		pb = fa.vabdl_u8(b: fc)  // pb = abs(fa - fc)
+		pc = fafb.vabdq_u16(b: fcfc)  // pc = abs((fa + fb) - (fc + fc))
+		cmpab = pa.vcleq_u16(b: pb)  // cmpab = (pa <= pb)
+		cmpac = pa.vcleq_u16(b: pc)  // cmpac = (pa <= pc)
+		picka = cmpab.vandq_u8(b: cmpac).vmovn_u16()  // picka = ((pa <= pb) && (pa <= pc))
+		pickb = pb.vcleq_u16(b: pc).vmovn_u16()  // pickb = (pb <= pc)
+		fx = fx.vadd_u8(
+			b: picka.vbsl_u8(b: fa,
+			c: pickb.vbsl_u8(b: fb, c: fc)))
+		// § poke_u24le replaces poke_u32le.
+		c.poke_u24le!(a: fx.vget_lane_u32(b: 0))
+		fc = fb  // Carry fb forward as the next step's fc.
+		fa = fx  // Carry the result forward as the next step's fa.
+	} else (length: 3, advance: 3, unroll: 1) {
+		// § peek_u24le_as_u32 replaces peek_u32le.
+		fb = fb.create_vdup_n_u32(a: p.peek_u24le_as_u32())
+		// § peek_u24le_as_u32 replaces peek_u32le.
+		fx = fx.create_vdup_n_u32(a: c.peek_u24le_as_u32())
+		fafb = fa.vaddl_u8(b: fb)
+		fcfc = fc.vaddl_u8(b: fc)
+		pa = fb.vabdl_u8(b: fc)
+		pb = fa.vabdl_u8(b: fc)
+		pc = fafb.vabdq_u16(b: fcfc)
+		cmpab = pa.vcleq_u16(b: pb)
+		cmpac = pa.vcleq_u16(b: pc)
+		picka = cmpab.vandq_u8(b: cmpac).vmovn_u16()
+		pickb = pb.vcleq_u16(b: pc).vmovn_u16()
+		fx = fx.vadd_u8(
+			b: picka.vbsl_u8(b: fa,
+			c: pickb.vbsl_u8(b: fb, c: fc)))
+		// § poke_u24le replaces poke_u32le.
+		c.poke_u24le!(a: fx.vget_lane_u32(b: 0))
+		// § These assignments are unnecessary; this is the last iteration.
+		// fc = fb
+		// fa = fx
+	}
+}
+
+pri func decoder.filter_4_distance_4_arm_neon!(curr: slice base.u8, prev: slice base.u8),
+	choose cpu_arch >= arm_neon,
+{
+	var c     : slice base.u8  // Window into args.curr.
+	var p     : slice base.u8  // Window into args.prev.
+	var fa    : base.arm_neon_64  // Previous step's fx (zero before the first step).
+	var fb    : base.arm_neon_64  // Bytes loaded from p for this step.
+	var fc    : base.arm_neon_64  // Previous step's fb (zero before the first step).
+	var fx    : base.arm_neon_64  // Bytes loaded from c for this step, then the result.
+	var fafb  : base.arm_neon_128
+	var fcfc  : base.arm_neon_128
+	var pa    : base.arm_neon_128
+	var pb    : base.arm_neon_128
+	var pc    : base.arm_neon_128
+	var cmpab : base.arm_neon_128
+	var cmpac : base.arm_neon_128
+	var picka : base.arm_neon_64
+	var pickb : base.arm_neon_64
+	// Filter 4 (Paeth), 4 bytes per step: curr is rewritten in place, prev is only read.
+	fa = fa.create_vdup_n_u32(a: 0)
+	fc = fc.create_vdup_n_u32(a: 0)
+	// Each step loads fb from prev and fx from curr, adds one of fa, fb or fc to fx, then stores fx.
+	iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 2) {
+		fb = fb.create_vdup_n_u32(a: p.peek_u32le())
+		fx = fx.create_vdup_n_u32(a: c.peek_u32le())
+
+		fafb = fa.vaddl_u8(b: fb)  // fafb = (fa + fb)
+		fcfc = fc.vaddl_u8(b: fc)  // fcfc = (fc + fc)
+
+		pa = fb.vabdl_u8(b: fc)  //      pa = abs(fa + fb - fc - fa) = abs(fb - fc)
+		pb = fa.vabdl_u8(b: fc)  //      pb = abs(fa + fb - fc - fb) = abs(fa - fc)
+		pc = fafb.vabdq_u16(b: fcfc)  // pc = abs(fa + fb - fc - fc), computed on the widened sums
+
+		cmpab = pa.vcleq_u16(b: pb)  // cmpab = (pa <= pb)
+		cmpac = pa.vcleq_u16(b: pc)  // cmpac = (pa <= pc)
+
+		picka = cmpab.vandq_u8(b: cmpac).vmovn_u16()  // picka = ((pa <= pb) && (pa <= pc))
+		pickb = pb.vcleq_u16(b: pc).vmovn_u16()  //      pickb = (pb <= pc)
+
+		// Add the predictor to the residual: picka ? fa : (pickb ? fb : fc).
+		fx = fx.vadd_u8(
+			b: picka.vbsl_u8(b: fa,
+			c: pickb.vbsl_u8(b: fb, c: fc)))
+
+		c.poke_u32le!(a: fx.vget_lane_u32(b: 0))  // Store lane 0 (4 bytes) back into curr.
+		fc = fb  // Carry fb forward as the next step's fc.
+		fa = fx  // Carry the result forward as the next step's fa.
+	}
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index 956d901..cc6b8a4 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -376,6 +376,7 @@
 		choose filter_1 = [filter_1_distance_3_fallback]
 		choose filter_3 = [filter_3_distance_3_fallback]
 		choose filter_4 = [
+			filter_4_distance_3_arm_neon,
 			filter_4_distance_3_sse42,
 			filter_4_distance_3_fallback]
 	} else if this.filter_distance == 4 {
@@ -388,6 +389,7 @@
 			filter_3_distance_4_sse42,
 			filter_3_distance_4_fallback]
 		choose filter_4 = [
+			filter_4_distance_4_arm_neon,
 			filter_4_distance_4_sse42,
 			filter_4_distance_4_fallback]
 	}