Roll back 'png filter_1_distance_? use more SIMD'

See the previous commit for the rationale.
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 48b08bf..a167986 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30290,13 +30290,6 @@
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
 static wuffs_base__empty_struct
-wuffs_png__decoder__filter_1_distance_3_sse42(
-    wuffs_png__decoder* self,
-    wuffs_base__slice_u8 a_curr);
-#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-static wuffs_base__empty_struct
 wuffs_png__decoder__filter_1_distance_4_sse42(
     wuffs_png__decoder* self,
     wuffs_base__slice_u8 a_curr);
@@ -31149,55 +31142,6 @@
   return wuffs_base__make_empty_struct();
 }
 
-// -------- func png.decoder.filter_1_distance_3_sse42
-
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
-static wuffs_base__empty_struct
-wuffs_png__decoder__filter_1_distance_3_sse42(
-    wuffs_png__decoder* self,
-    wuffs_base__slice_u8 a_curr) {
-  wuffs_base__slice_u8 v_c = {0};
-  __m128i v_x128 = {0};
-  __m128i v_i128 = {0};
-  __m128i v_j128 = {0};
-  __m128i v_k128 = {0};
-  __m128i v_a128 = {0};
-
-  {
-    wuffs_base__slice_u8 i_slice_c = a_curr;
-    v_c.ptr = i_slice_c.ptr;
-    v_c.len = 16;
-    uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 16, 12);
-    while (v_c.ptr < i_end0_c) {
-      v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr));
-      v_i128 = _mm_add_epi8(v_x128, v_a128);
-      v_j128 = _mm_slli_si128(v_i128, (int32_t)(3));
-      v_j128 = _mm_add_epi8(v_j128, v_i128);
-      v_k128 = _mm_slli_si128(v_j128, (int32_t)(6));
-      v_k128 = _mm_add_epi8(v_k128, v_j128);
-      v_k128 = _mm_blend_epi16(v_k128, v_x128, (int32_t)(192));
-      _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128);
-      v_a128 = _mm_srli_si128(_mm_slli_si128(v_k128, (int32_t)(4)), (int32_t)(13));
-      v_c.ptr += 12;
-    }
-    v_c.len = 3;
-    uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
-    while (v_c.ptr < i_end1_c) {
-      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr)));
-      v_x128 = _mm_add_epi8(v_x128, v_a128);
-      v_a128 = v_x128;
-      wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
-      v_c.ptr += 3;
-    }
-    v_c.len = 0;
-  }
-  return wuffs_base__make_empty_struct();
-}
-#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
 // -------- func png.decoder.filter_1_distance_4_sse42
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
@@ -31210,26 +31154,24 @@
     wuffs_base__slice_u8 a_curr) {
   wuffs_base__slice_u8 v_c = {0};
   __m128i v_x128 = {0};
-  __m128i v_i128 = {0};
-  __m128i v_j128 = {0};
-  __m128i v_k128 = {0};
   __m128i v_a128 = {0};
 
   {
     wuffs_base__slice_u8 i_slice_c = a_curr;
     v_c.ptr = i_slice_c.ptr;
-    v_c.len = 16;
-    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 16) * 16);
+    v_c.len = 4;
+    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
     while (v_c.ptr < i_end0_c) {
-      v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr));
-      v_i128 = _mm_add_epi8(v_x128, v_a128);
-      v_j128 = _mm_slli_si128(v_i128, (int32_t)(4));
-      v_j128 = _mm_add_epi8(v_j128, v_i128);
-      v_k128 = _mm_slli_si128(v_j128, (int32_t)(8));
-      v_k128 = _mm_add_epi8(v_k128, v_j128);
-      _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128);
-      v_a128 = _mm_srli_si128(v_k128, (int32_t)(12));
-      v_c.ptr += 16;
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_add_epi8(v_x128, v_a128);
+      v_a128 = v_x128;
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 4;
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_add_epi8(v_x128, v_a128);
+      v_a128 = v_x128;
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 4;
     }
     v_c.len = 4;
     uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
@@ -32192,9 +32134,6 @@
     wuffs_png__decoder* self) {
   if (self->private_impl.f_filter_distance == 3) {
     self->private_impl.choosy_filter_1 = (
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-        wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_1_distance_3_sse42 :
-#endif
         &wuffs_png__decoder__filter_1_distance_3_fallback);
     self->private_impl.choosy_filter_3 = (
         &wuffs_png__decoder__filter_3_distance_3_fallback);
diff --git a/std/png/decode_filter_sse42.wuffs b/std/png/decode_filter_sse42.wuffs
index d63b562..e96a304 100644
--- a/std/png/decode_filter_sse42.wuffs
+++ b/std/png/decode_filter_sse42.wuffs
@@ -16,95 +16,36 @@
 
 // Filter 1: Sub.
 
-pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8),
-	choose cpu_arch >= x86_sse42,
-{
-	var c    : slice base.u8
-	var x128 : base.x86_m128i
-	var i128 : base.x86_m128i
-	var j128 : base.x86_m128i
-	var k128 : base.x86_m128i
-	var a128 : base.x86_m128i
-
-	iterate (c = args.curr)(length: 16, advance: 12, unroll: 1) {
-		// For distance_3, we only use the first 12 bytes (96 bits) of the
-		// 128-bit registers. The final 4 bytes (32 bits) are usually junk:
-		//  - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ; junkC junkD junkE junkF]
-		// For x128, the final bytes are the upcoming pixels:
-		//  - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ;    R4    G4    B4    R5]
-		// For a128, the final bytes are zeroes.
-
-		// x128 = [x0, x1, x2, x3; upcoming]
-		x128.load_slice128!(a: c)
-
-		// i128 = [a+x0, x1, x2, x3; junk]
-		i128 = x128._mm_add_epi8(b: a128)
-
-		// j128 = [0, a+x0, x1, x2; junk]
-		j128 = i128._mm_slli_si128(imm8: 3)
-
-		// j128 = [a+x0, a+x0+x1, x1+x2, x2+x3; junk]
-		j128 = j128._mm_add_epi8(b: i128)
-
-		// k128 = [0, 0, a+x0, a+x0+x1; junk]
-		k128 = j128._mm_slli_si128(imm8: 6)
-
-		// k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; junk]
-		k128 = k128._mm_add_epi8(b: j128)
-
-		// k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; upcoming]
-		k128 = k128._mm_blend_epi16(b: x128, imm8: 0xC0)
-
-		// Store.
-		k128.store_slice128!(a: c)
-
-		// a128 = [a+x0+x1+x2+x3, 0, 0, 0; zeroes]
-		a128 = k128._mm_slli_si128(imm8: 4)._mm_srli_si128(imm8: 13)
-
-	} else (length: 3, advance: 3, unroll: 1) {
-		x128.load_u32!(a: c.peek_u24le_as_u32())
-		x128 = x128._mm_add_epi8(b: a128)
-		a128 = x128
-		c.poke_u24le!(a: x128.truncate_u32())
-	}
-}
+// This (filter = 1, distance = 3) implementation doesn't actually bench faster
+// than the non-SIMD one.
+//
+// pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8),
+//     choose cpu_arch >= x86_sse42,
+// {
+//     var c    : slice base.u8
+//     var x128 : base.x86_m128i
+//     var a128 : base.x86_m128i
+//
+//     iterate (c = args.curr)(length: 4, advance: 3, unroll: 2) {
+//         x128.load_u32!(a: c.peek_u32le())
+//         x128 = x128._mm_add_epi8(b: a128)
+//         a128 = x128
+//         c.poke_u24le!(a: x128.truncate_u32())
+//     } else (length: 3, advance: 3, unroll: 1) {
+//         x128.load_u32!(a: c.peek_u24le_as_u32())
+//         x128 = x128._mm_add_epi8(b: a128)
+//         c.poke_u24le!(a: x128.truncate_u32())
+//     }
+// }
 
 pri func decoder.filter_1_distance_4_sse42!(curr: slice base.u8),
 	choose cpu_arch >= x86_sse42,
 {
 	var c    : slice base.u8
 	var x128 : base.x86_m128i
-	var i128 : base.x86_m128i
-	var j128 : base.x86_m128i
-	var k128 : base.x86_m128i
 	var a128 : base.x86_m128i
 
-	iterate (c = args.curr)(length: 16, advance: 16, unroll: 1) {
-		// x128 = [x0, x1, x2, x3]
-		x128.load_slice128!(a: c)
-
-		// i128 = [a+x0, x1, x2, x3]
-		i128 = x128._mm_add_epi8(b: a128)
-
-		// j128 = [0, a+x0, x1, x2]
-		j128 = i128._mm_slli_si128(imm8: 4)
-
-		// j128 = [a+x0, a+x0+x1, x1+x2, x2+x3]
-		j128 = j128._mm_add_epi8(b: i128)
-
-		// k128 = [0, 0, a+x0, a+x0+x1]
-		k128 = j128._mm_slli_si128(imm8: 8)
-
-		// k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3]
-		k128 = k128._mm_add_epi8(b: j128)
-
-		// Store.
-		k128.store_slice128!(a: c)
-
-		// a128 = [a+x0+x1+x2+x3, 0, 0, 0]
-		a128 = k128._mm_srli_si128(imm8: 12)
-
-	} else (length: 4, advance: 4, unroll: 1) {
+	iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
 		x128.load_u32!(a: c.peek_u32le())
 		x128 = x128._mm_add_epi8(b: a128)
 		a128 = x128
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index e261b0c..d2f0cb2 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -310,9 +310,7 @@
 	// Filter 0 is a no-op. Filter 2, the up filter, should already vectorize
 	// easily by a good optimizing C compiler.
 	if this.filter_distance == 3 {
-		choose filter_1 = [
-			filter_1_distance_3_sse42,
-			filter_1_distance_3_fallback]
+		choose filter_1 = [filter_1_distance_3_fallback]
 		choose filter_3 = [filter_3_distance_3_fallback]
 		choose filter_4 = [
 			filter_4_distance_3_sse42,