Add std/png decoder.filter_3_distance_4_arm_neon

On a Raspberry Pi 4 (32-bit armv7l) with -march=native and -mfpu=neon
("native" means "armv8-a+crc+simd"), throughput before and after this commit:

wuffs_png_decode_filt_3_dist_4/clang9                      639MB/s ± 0%   906MB/s ± 0%  +41.71%  (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_4/gcc8                        656MB/s ± 0%   714MB/s ± 0%   +8.89%  (p=0.008 n=5+5)
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 589aaa3..57de40a 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -561,6 +561,9 @@
 	"arm_neon_64.vget_lane_u32(b: u32[..= 1]) u32",
 	"arm_neon_64.vget_lane_u64(b: u32[..= 0]) u64",
 	"arm_neon_64.vget_lane_u8(b: u32[..= 7]) u8",
+	"arm_neon_64.vhadd_u16(b: arm_neon_64) arm_neon_64",
+	"arm_neon_64.vhadd_u32(b: arm_neon_64) arm_neon_64",
+	"arm_neon_64.vhadd_u8(b: arm_neon_64) arm_neon_64",
 
 	// ---- arm_neon_128
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 04a07a7..a07dae4 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -31207,6 +31207,14 @@
     wuffs_base__slice_u8 a_curr);
 #endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
 
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct  // forward declaration; the body appears later in this file under the same guard
+wuffs_png__decoder__filter_3_distance_4_arm_neon(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,  // bytes the definition rewrites in place
+    wuffs_base__slice_u8 a_prev);  // bytes the definition only reads; may be empty
+#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
 static wuffs_base__empty_struct
 wuffs_png__decoder__filter_1(
     wuffs_png__decoder* self,
@@ -31546,6 +31554,97 @@
 }
 #endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
 
+// -------- func png.decoder.filter_3_distance_4_arm_neon
+// In place, per 4-byte group of a_curr: curr = curr + vhadd_u8(previous output group, matching a_prev group).
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_arm_neon(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev) {
+  wuffs_base__slice_u8 v_c = {0};  // cursor over a_curr
+  wuffs_base__slice_u8 v_p = {0};  // cursor over a_prev
+  uint8x8_t v_fa = {0};  // the 4 bytes most recently written back to a_curr
+  uint8x8_t v_fb = {0};  // the 4 bytes of a_prev at the same offset as v_c
+  uint8x8_t v_fx = {0};  // the 4 bytes of a_curr being processed
+
+  v_fa = vreinterpret_u8_u32(vdup_n_u32(0));  // the first group sees an all-zero v_fa
+  v_fb = vreinterpret_u8_u32(vdup_n_u32(0));
+  if (((uint64_t)(a_prev.len)) == 0) {  // empty a_prev: v_fb stays all-zero throughout this branch
+    {
+      wuffs_base__slice_u8 i_slice_c = a_curr;
+      v_c.ptr = i_slice_c.ptr;
+      v_c.len = 4;
+      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);  // end of the whole 8-byte groups
+      while (v_c.ptr < i_end0_c) {  // 2x unrolled: two 4-byte steps per iteration
+        v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));  // 4 input bytes, duplicated into both 32-bit lanes
+        v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));  // per byte: add the halving sum of v_fa and v_fb
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));  // write lane 0 back over the input
+        v_fa = v_fx;  // this output is the next step's v_fa
+        v_c.ptr += 4;
+        v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+        v_fa = v_fx;
+        v_c.ptr += 4;
+      }
+      v_c.len = 4;
+      uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);  // end of the remaining whole 4-byte groups
+      while (v_c.ptr < i_end1_c) {  // fewer than 8 bytes remain, so at most one pass; a 1..3 byte tail is left untouched
+        v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+        v_fa = v_fx;
+        v_c.ptr += 4;
+      }
+      v_c.len = 0;
+    }
+  } else {  // non-empty a_prev: v_fb is reloaded from a_prev at every step
+    {
+      wuffs_base__slice_u8 i_slice_c = a_curr;
+      v_c.ptr = i_slice_c.ptr;
+      wuffs_base__slice_u8 i_slice_p = a_prev;
+      v_p.ptr = i_slice_p.ptr;
+      i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));  // only the common prefix of the two slices is processed
+      v_c.len = 4;
+      v_p.len = 4;
+      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);  // end of the whole 8-byte groups
+      while (v_c.ptr < i_end0_c) {  // 2x unrolled: two 4-byte steps per iteration
+        v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));  // 4 bytes of a_prev at the current offset
+        v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));  // 4 input bytes of a_curr
+        v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));  // per byte: add the halving sum of v_fa and v_fb
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));  // write lane 0 back over the input
+        v_fa = v_fx;  // this output is the next step's v_fa
+        v_c.ptr += 4;
+        v_p.ptr += 4;  // both cursors advance in lockstep
+        v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+        v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+        v_fa = v_fx;
+        v_c.ptr += 4;
+        v_p.ptr += 4;
+      }
+      v_c.len = 4;
+      v_p.len = 4;
+      uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);  // end of the remaining whole 4-byte groups
+      while (v_c.ptr < i_end1_c) {  // fewer than 8 bytes remain, so at most one pass; a 1..3 byte tail is left untouched
+        v_fb = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+        v_fx = vreinterpret_u8_u32(vdup_n_u32(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_fx = vadd_u8(v_fx, vhadd_u8(v_fa, v_fb));
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, vget_lane_u32(vreinterpret_u32_u8(v_fx), 0));
+        v_fa = v_fx;
+        v_c.ptr += 4;
+        v_p.ptr += 4;
+      }
+      v_c.len = 0;
+      v_p.len = 0;
+    }
+  }
+  return wuffs_base__make_empty_struct();
+}
+#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+
 // -------- func png.decoder.filter_1
 
 static wuffs_base__empty_struct
@@ -33292,6 +33391,9 @@
 #endif
         &wuffs_png__decoder__filter_1_distance_4_fallback);
     self->private_impl.choosy_filter_3 = (
+#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+        wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_png__decoder__filter_3_distance_4_arm_neon :
+#endif
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
         wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_3_distance_4_sse42 :
 #endif
diff --git a/std/png/decode_filter_arm_neon.wuffs b/std/png/decode_filter_arm_neon.wuffs
index a5a1c0c..157c209 100644
--- a/std/png/decode_filter_arm_neon.wuffs
+++ b/std/png/decode_filter_arm_neon.wuffs
@@ -44,3 +44,38 @@
 		fa = fx
 	}
 }
+
+// --------
+
+// Filter 3: Average.
+// In place, per 4-byte group of curr: curr = curr + vhadd_u8(previous output group, matching prev group).
+pri func decoder.filter_3_distance_4_arm_neon!(curr: slice base.u8, prev: slice base.u8),
+	choose cpu_arch >= arm_neon,
+{
+	var c  : slice base.u8  // 4-byte window into args.curr
+	var p  : slice base.u8  // 4-byte window into args.prev
+	var fa : base.arm_neon_64  // the 4 bytes most recently written back to curr
+	var fb : base.arm_neon_64  // the 4 bytes of prev at the same offset as c
+	var fx : base.arm_neon_64  // the 4 bytes of curr being processed
+
+	fa = fa.create_vdup_n_u32(a: 0)  // the first group sees an all-zero fa
+	fb = fb.create_vdup_n_u32(a: 0)
+
+	if args.prev.length() == 0 {  // empty prev: fb stays all-zero throughout this branch
+		iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
+			fx = fx.create_vdup_n_u32(a: c.peek_u32le())
+			fx = fx.vadd_u8(b: fa.vhadd_u8(b: fb))  // per byte: add the halving sum of fa and fb
+			c.poke_u32le!(a: fx.vget_lane_u32(b: 0))  // write lane 0 back over the input
+			fa = fx  // this output is the next step's fa
+		}
+
+	} else {  // non-empty prev: fb is reloaded from prev at every step
+		iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 2) {
+			fb = fb.create_vdup_n_u32(a: p.peek_u32le())
+			fx = fx.create_vdup_n_u32(a: c.peek_u32le())
+			fx = fx.vadd_u8(b: fa.vhadd_u8(b: fb))  // per byte: add the halving sum of fa and fb
+			c.poke_u32le!(a: fx.vget_lane_u32(b: 0))  // write lane 0 back over the input
+			fa = fx  // this output is the next step's fa
+		}
+	}
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index 7157fa2..956d901 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -384,6 +384,7 @@
 			filter_1_distance_4_sse42,
 			filter_1_distance_4_fallback]
 		choose filter_3 = [
+			filter_3_distance_4_arm_neon,
 			filter_3_distance_4_sse42,
 			filter_3_distance_4_fallback]
 		choose filter_4 = [