Have std/png filter_1_distance_? use more SIMD

This commit will soon be followed by a rollback, but it is committed
anyway so that we can refer to these numbers in the git log. Columns
are: old speed, new speed, delta. Both functions bench slower than before.

wuffs_png_decode_filt_1_dist_3/clang9  1.82GB/s ± 0%  1.03GB/s ± 0%  -43.49%  (p=0.008 n=5+5)
wuffs_png_decode_filt_1_dist_4/clang9  6.02GB/s ± 0%  5.23GB/s ± 0%  -13.05%  (p=0.008 n=5+5)

wuffs_png_decode_filt_1_dist_3/gcc10   1.82GB/s ± 1%  1.03GB/s ± 0%  -43.36%  (p=0.008 n=5+5)
wuffs_png_decode_filt_1_dist_4/gcc10   5.42GB/s ± 0%  5.13GB/s ± 1%   -5.39%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index a167986..48b08bf 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30290,6 +30290,13 @@
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
 static wuffs_base__empty_struct
+wuffs_png__decoder__filter_1_distance_3_sse42(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr);
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
 wuffs_png__decoder__filter_1_distance_4_sse42(
     wuffs_png__decoder* self,
     wuffs_base__slice_u8 a_curr);
@@ -31142,6 +31149,55 @@
   return wuffs_base__make_empty_struct();
 }
 
+// -------- func png.decoder.filter_1_distance_3_sse42
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_1_distance_3_sse42(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr) {  // rewrites a_curr in place: filter 1 ("Sub"), 3 bytes per pixel
+  wuffs_base__slice_u8 v_c = {0};
+  __m128i v_x128 = {0};
+  __m128i v_i128 = {0};
+  __m128i v_j128 = {0};
+  __m128i v_k128 = {0};
+  __m128i v_a128 = {0};  // carry: previous output pixel in bytes 0..2, zero elsewhere
+
+  {
+    wuffs_base__slice_u8 i_slice_c = a_curr;
+    v_c.ptr = i_slice_c.ptr;
+    v_c.len = 16;  // each main-loop step loads and stores 16 bytes but advances only 12
+    uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 16, 12);
+    while (v_c.ptr < i_end0_c) {  // main loop: four 3-byte pixels per step
+      v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr));  // unaligned 16-byte load
+      v_i128 = _mm_add_epi8(v_x128, v_a128);  // add the carried pixel to pixel 0
+      v_j128 = _mm_slli_si128(v_i128, (int32_t)(3));  // shift up by 3 bytes = one pixel
+      v_j128 = _mm_add_epi8(v_j128, v_i128);  // each pixel += its left neighbor
+      v_k128 = _mm_slli_si128(v_j128, (int32_t)(6));  // shift up by 6 bytes = two pixels
+      v_k128 = _mm_add_epi8(v_k128, v_j128);  // completes the running sum over the 4 pixels
+      v_k128 = _mm_blend_epi16(v_k128, v_x128, (int32_t)(192));  // 192 = 0xC0: bytes 12..15 come from the unmodified input
+      _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128);  // unaligned 16-byte store; bytes 12..15 are written back unchanged
+      v_a128 = _mm_srli_si128(_mm_slli_si128(v_k128, (int32_t)(4)), (int32_t)(13));  // bytes 9..11 (last output pixel) -> bytes 0..2, rest zero
+      v_c.ptr += 12;
+    }
+    v_c.len = 3;  // tail loop: one 3-byte pixel per step
+    uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+    while (v_c.ptr < i_end1_c) {
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_add_epi8(v_x128, v_a128);  // add the previous output pixel
+      v_a128 = v_x128;  // this pixel becomes the carry for the next step
+      wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 3;
+    }
+    v_c.len = 0;
+  }
+  return wuffs_base__make_empty_struct();
+}
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
 // -------- func png.decoder.filter_1_distance_4_sse42
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
@@ -31154,24 +31210,26 @@
     wuffs_base__slice_u8 a_curr) {
   wuffs_base__slice_u8 v_c = {0};
   __m128i v_x128 = {0};
+  __m128i v_i128 = {0};
+  __m128i v_j128 = {0};
+  __m128i v_k128 = {0};
   __m128i v_a128 = {0};
 
   {
     wuffs_base__slice_u8 i_slice_c = a_curr;
     v_c.ptr = i_slice_c.ptr;
-    v_c.len = 4;
-    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
+    v_c.len = 16;
+    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 16) * 16);
     while (v_c.ptr < i_end0_c) {
-      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
-      v_x128 = _mm_add_epi8(v_x128, v_a128);
-      v_a128 = v_x128;
-      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
-      v_c.ptr += 4;
-      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
-      v_x128 = _mm_add_epi8(v_x128, v_a128);
-      v_a128 = v_x128;
-      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
-      v_c.ptr += 4;
+      v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr));
+      v_i128 = _mm_add_epi8(v_x128, v_a128);
+      v_j128 = _mm_slli_si128(v_i128, (int32_t)(4));
+      v_j128 = _mm_add_epi8(v_j128, v_i128);
+      v_k128 = _mm_slli_si128(v_j128, (int32_t)(8));
+      v_k128 = _mm_add_epi8(v_k128, v_j128);
+      _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128);
+      v_a128 = _mm_srli_si128(v_k128, (int32_t)(12));
+      v_c.ptr += 16;
     }
     v_c.len = 4;
     uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
@@ -32134,6 +32192,9 @@
     wuffs_png__decoder* self) {
   if (self->private_impl.f_filter_distance == 3) {
     self->private_impl.choosy_filter_1 = (
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+        wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_1_distance_3_sse42 :
+#endif
         &wuffs_png__decoder__filter_1_distance_3_fallback);
     self->private_impl.choosy_filter_3 = (
         &wuffs_png__decoder__filter_3_distance_3_fallback);
diff --git a/std/png/decode_filter_sse42.wuffs b/std/png/decode_filter_sse42.wuffs
index e96a304..d63b562 100644
--- a/std/png/decode_filter_sse42.wuffs
+++ b/std/png/decode_filter_sse42.wuffs
@@ -16,36 +16,95 @@
 
 // Filter 1: Sub.
 
-// This (filter = 1, distance = 3) implementation doesn't actually bench faster
-// than the non-SIMD one.
-//
-// pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8),
-//     choose cpu_arch >= x86_sse42,
-// {
-//     var c    : slice base.u8
-//     var x128 : base.x86_m128i
-//     var a128 : base.x86_m128i
-//
-//     iterate (c = args.curr)(length: 4, advance: 3, unroll: 2) {
-//         x128.load_u32!(a: c.peek_u32le())
-//         x128 = x128._mm_add_epi8(b: a128)
-//         a128 = x128
-//         c.poke_u24le!(a: x128.truncate_u32())
-//     } else (length: 3, advance: 3, unroll: 1) {
-//         x128.load_u32!(a: c.peek_u24le_as_u32())
-//         x128 = x128._mm_add_epi8(b: a128)
-//         c.poke_u24le!(a: x128.truncate_u32())
-//     }
-// }
+pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8),
+	choose cpu_arch >= x86_sse42,
+{
+	var c    : slice base.u8
+	var x128 : base.x86_m128i
+	var i128 : base.x86_m128i
+	var j128 : base.x86_m128i
+	var k128 : base.x86_m128i
+	var a128 : base.x86_m128i
+
+	iterate (c = args.curr)(length: 16, advance: 12, unroll: 1) {
+		// For distance_3, we only use the first 12 bytes (96 bits) of the
+		// 128-bit registers. The final 4 bytes (32 bits) are usually junk:
+		//  - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ; junkC junkD junkE junkF]
+		// For x128, the final bytes are the upcoming pixels:
+		//  - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ;    R4    G4    B4    R5]
+		// For a128, the final bytes are zeroes.
+
+		// x128 = [x0, x1, x2, x3; upcoming]
+		x128.load_slice128!(a: c)
+
+		// i128 = [a+x0, x1, x2, x3; junk]
+		i128 = x128._mm_add_epi8(b: a128)
+
+		// j128 = [0, a+x0, x1, x2; junk]
+		j128 = i128._mm_slli_si128(imm8: 3)
+
+		// j128 = [a+x0, a+x0+x1, x1+x2, x2+x3; junk]
+		j128 = j128._mm_add_epi8(b: i128)
+
+		// k128 = [0, 0, a+x0, a+x0+x1; junk]
+		k128 = j128._mm_slli_si128(imm8: 6)
+
+		// k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; junk]
+		k128 = k128._mm_add_epi8(b: j128)
+
+		// k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; upcoming]
+		k128 = k128._mm_blend_epi16(b: x128, imm8: 0xC0)  // 0xC0: bytes 12..15 come from x128
+
+		// Store.
+		k128.store_slice128!(a: c)  // Bytes 12..15 are written back unchanged.
+
+		// a128 = [a+x0+x1+x2+x3, 0, 0, 0; zeroes]
+		a128 = k128._mm_slli_si128(imm8: 4)._mm_srli_si128(imm8: 13)  // Bytes 9..11 -> bytes 0..2.
+
+	} else (length: 3, advance: 3, unroll: 1) {  // Tail: one 3-byte pixel at a time.
+		x128.load_u32!(a: c.peek_u24le_as_u32())
+		x128 = x128._mm_add_epi8(b: a128)
+		a128 = x128  // This pixel becomes the carry for the next iteration.
+		c.poke_u24le!(a: x128.truncate_u32())
+	}
+}
 
 pri func decoder.filter_1_distance_4_sse42!(curr: slice base.u8),
 	choose cpu_arch >= x86_sse42,
 {
 	var c    : slice base.u8
 	var x128 : base.x86_m128i
+	var i128 : base.x86_m128i
+	var j128 : base.x86_m128i
+	var k128 : base.x86_m128i
 	var a128 : base.x86_m128i
 
-	iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
+	iterate (c = args.curr)(length: 16, advance: 16, unroll: 1) {
+		// x128 = [x0, x1, x2, x3]
+		x128.load_slice128!(a: c)
+
+		// i128 = [a+x0, x1, x2, x3]
+		i128 = x128._mm_add_epi8(b: a128)
+
+		// j128 = [0, a+x0, x1, x2]
+		j128 = i128._mm_slli_si128(imm8: 4)
+
+		// j128 = [a+x0, a+x0+x1, x1+x2, x2+x3]
+		j128 = j128._mm_add_epi8(b: i128)
+
+		// k128 = [0, 0, a+x0, a+x0+x1]
+		k128 = j128._mm_slli_si128(imm8: 8)
+
+		// k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3]
+		k128 = k128._mm_add_epi8(b: j128)
+
+		// Store.
+		k128.store_slice128!(a: c)
+
+		// a128 = [a+x0+x1+x2+x3, 0, 0, 0]
+		a128 = k128._mm_srli_si128(imm8: 12)
+
+	} else (length: 4, advance: 4, unroll: 1) {
 		x128.load_u32!(a: c.peek_u32le())
 		x128 = x128._mm_add_epi8(b: a128)
 		a128 = x128
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index d2f0cb2..e261b0c 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -310,7 +310,9 @@
 	// Filter 0 is a no-op. Filter 2, the up filter, should already vectorize
 	// easily by a good optimizing C compiler.
 	if this.filter_distance == 3 {
-		choose filter_1 = [filter_1_distance_3_fallback]
+		choose filter_1 = [
+			filter_1_distance_3_sse42,
+			filter_1_distance_3_fallback]
 		choose filter_3 = [filter_3_distance_3_fallback]
 		choose filter_4 = [
 			filter_4_distance_3_sse42,