Add png.decoder.filter_3_distance_4_sse128

wuffs_png_decode_filt_3_dist_3/clang9   905MB/s ± 0%   913MB/s ± 0%    +0.81%  (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_4/clang9   944MB/s ± 1%  1780MB/s ± 0%   +88.51%  (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_3/gcc10   1.07GB/s ± 0%  1.08GB/s ± 0%    +0.42%  (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_4/gcc10   1.01GB/s ± 1%  2.11GB/s ± 0%  +109.79%  (p=0.008 n=5+5)
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 9965cff..b7e4947 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -455,15 +455,36 @@
 		return nil
 	}
 
-	b.printf("%s(", method.Str(g.tm))
-	if err := g.writeExpr(b, recv, depth); err != nil {
-		return err
-	}
-	for _, o := range args {
-		b.writes(", ")
-		if err := g.writeExpr(b, o.AsArg().Value(), depth); err != nil {
+	const create = "create"
+	methodStr := method.Str(g.tm)
+	if strings.HasPrefix(methodStr, create) {
+		b.printf("%s(", methodStr[len(create):])
+		for i, o := range args {
+			if i > 0 {
+				b.writes(", ")
+			}
+			after := ""
+			switch method {
+			case t.IDCreateMMSet1EPI8:
+				b.writes("(char)(")
+				after = ")"
+			}
+			if err := g.writeExpr(b, o.AsArg().Value(), depth); err != nil {
+				return err
+			}
+			b.writes(after)
+		}
+	} else {
+		b.printf("%s(", methodStr)
+		if err := g.writeExpr(b, recv, depth); err != nil {
 			return err
 		}
+		for _, o := range args {
+			b.writes(", ")
+			if err := g.writeExpr(b, o.AsArg().Value(), depth); err != nil {
+				return err
+			}
+		}
 	}
 	b.writes(")")
 	return nil
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index b7d713a..60d804b 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -536,7 +536,14 @@
 	"sse128_i.truncate_u32() u32",
 
 	// TODO: generate these methods automatically?
-	"sse128_i._mm_add_epi8!(b: sse128_i) sse128_i",
+
+	"sse128_i.create_mm_set1_epi8(a: u8) sse128_i",
+
+	"sse128_i._mm_add_epi8(b: sse128_i) sse128_i",
+	"sse128_i._mm_and_si128(b: sse128_i) sse128_i",
+	"sse128_i._mm_avg_epu8(b: sse128_i) sse128_i",
+	"sse128_i._mm_sub_epi8(b: sse128_i) sse128_i",
+	"sse128_i._mm_xor_si128(b: sse128_i) sse128_i",
 }
 
 var Interfaces = []string{
diff --git a/lang/token/list.go b/lang/token/list.go
index 24ad4a9..ea3af92 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -678,6 +678,8 @@
 	IDStoreSlice  = ID(0x388)
 	IDTruncateU32 = ID(0x389)
 	IDTruncateU64 = ID(0x38A)
+
+	IDCreateMMSet1EPI8 = ID(0x390)
 )
 
 var builtInsByID = [nBuiltInIDs]string{
@@ -1072,6 +1074,8 @@
 	IDStoreSlice:  "store_slice",
 	IDTruncateU32: "truncate_u32",
 	IDTruncateU64: "truncate_u64",
+
+	IDCreateMMSet1EPI8: "create_mm_set1_epi8",
 }
 
 var builtInsByName = map[string]ID{}
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 6430220..d6ed9d9 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30298,6 +30298,14 @@
     wuffs_base__slice_u8 a_curr);
 #endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_sse128(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev);
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
 static wuffs_base__empty_struct
 wuffs_png__decoder__choose_filter_implementations(
     wuffs_png__decoder* self);
@@ -31073,6 +31081,69 @@
 }
 #endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
 
+// -------- func png.decoder.filter_3_distance_4_sse128
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_sse128(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev) {
+  wuffs_base__slice_u8 v_c = {0};
+  wuffs_base__slice_u8 v_p = {0};
+  __m128i v_x128 = {0};
+  __m128i v_a128 = {0};
+  __m128i v_b128 = {0};
+  __m128i v_p128 = {0};
+  __m128i v_k128 = {0};
+
+  if (((uint64_t)(a_prev.len)) == 0) {
+    v_k128 = _mm_set1_epi8((char)(254));
+    {
+      wuffs_base__slice_u8 i_slice_c = a_curr;
+      v_c = i_slice_c;
+      v_c.len = 4;
+      uint8_t* i_end0_c = i_slice_c.ptr + ((i_slice_c.len / 4) * 4);
+      while (v_c.ptr < i_end0_c) {
+        v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
+        (v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr))), wuffs_base__make_empty_struct());
+        v_x128 = _mm_add_epi8(v_x128, v_p128);
+        v_a128 = v_x128;
+        (wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128)))), wuffs_base__make_empty_struct());
+        v_c.ptr += 4;
+      }
+    }
+  } else {
+    v_k128 = _mm_set1_epi8((char)(1));
+    {
+      wuffs_base__slice_u8 i_slice_c = a_curr;
+      v_c = i_slice_c;
+      wuffs_base__slice_u8 i_slice_p = a_prev;
+      v_p = i_slice_p;
+      i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
+      v_c.len = 4;
+      v_p.len = 4;
+      uint8_t* i_end0_c = i_slice_c.ptr + ((i_slice_c.len / 4) * 4);
+      while (v_c.ptr < i_end0_c) {
+        (v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr))), wuffs_base__make_empty_struct());
+        v_p128 = _mm_avg_epu8(v_a128, v_b128);
+        v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
+        (v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr))), wuffs_base__make_empty_struct());
+        v_x128 = _mm_add_epi8(v_x128, v_p128);
+        v_a128 = v_x128;
+        (wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128)))), wuffs_base__make_empty_struct());
+        v_c.ptr += 4;
+        v_p.ptr += 4;
+      }
+    }
+  }
+  return wuffs_base__make_empty_struct();
+}
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
 // -------- func png.decoder.set_quirk_enabled
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__empty_struct
@@ -31569,6 +31640,9 @@
 #endif
         &wuffs_png__decoder__filter_1_distance_4_fallback);
     self->private_impl.choosy_filter_3 = (
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+        wuffs_base__cpu_arch__have_sse128() ? &wuffs_png__decoder__filter_3_distance_4_sse128 :
+#endif
         &wuffs_png__decoder__filter_3_distance_4_fallback);
     self->private_impl.choosy_filter_4 = (
         &wuffs_png__decoder__filter_4_distance_4_fallback);
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
index 7858541..ac18984 100644
--- a/std/png/decode_filter_sse128.wuffs
+++ b/std/png/decode_filter_sse128.wuffs
@@ -47,8 +47,74 @@
 
 	iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
 		x128.load_u32!(a: c.peek_u32le())
-		x128 = x128._mm_add_epi8!(b: a128)
+		x128 = x128._mm_add_epi8(b: a128)
 		a128 = x128
 		c.poke_u32le!(a: x128.truncate_u32())
 	}
 }
+
+// --------
+
+// Filter 3: Average.
+
+// Similar to filter_1_distance_3_sse128, the SIMD implementation for (filter =
+// 3, distance = 3) doesn't actually bench faster than the non-SIMD one.
+//
+// pri func decoder.filter_3_distance_3_sse128!(curr: slice base.u8, prev: slice base.u8),
+//     choose cpu_arch >= sse128,
+// {
+//     etc
+// }
+
+pri func decoder.filter_3_distance_4_sse128!(curr: slice base.u8, prev: slice base.u8),
+	choose cpu_arch >= sse128,
+{
+	var c    : slice base.u8
+	var p    : slice base.u8
+	var x128 : base.sse128_i
+	var a128 : base.sse128_i
+	var b128 : base.sse128_i
+	var p128 : base.sse128_i
+	var k128 : base.sse128_i
+
+	if args.prev.length() == 0 {
+		k128 = k128.create_mm_set1_epi8(a: 0xFE)
+		iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+			// In this branch (empty args.prev), b128 stays zero, so the
+			// predictor, p128, is half (rounded down) of the previous pixel,
+			// a128. _mm_avg_epu8 rounds up, so k128 (0xFE in every byte) first
+			// clears each a128 byte's low bit: avg(a & 0xFE, 0) == (a >> 1).
+			p128 = a128._mm_and_si128(b: k128)._mm_avg_epu8(b: b128)
+
+			// Add the predictor to the residual and, for the next iteration,
+			// set its previous pixel, a128, to this one, x128.
+			x128.load_u32!(a: c.peek_u32le())
+			x128 = x128._mm_add_epi8(b: p128)
+			a128 = x128
+			c.poke_u32le!(a: x128.truncate_u32())
+		}
+
+	} else {
+		k128 = k128.create_mm_set1_epi8(a: 0x01)
+		iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 1) {
+			// Load the pixel from the row above.
+			b128.load_u32!(a: p.peek_u32le())
+
+			// Start p128 as _mm_avg_epu8's rounded-up average of the previous
+			// pixel, a128, and the pixel above, b128.
+			p128 = a128._mm_avg_epu8(b: b128)
+
+			// Subtract a correction term because _mm_avg_epu8 rounds up but
+			// the PNG filter rounds down. The term is the low bit of each byte
+			// of (a128 ^ b128), which is set exactly when (a + b) is odd.
+			p128 = p128._mm_sub_epi8(b: k128._mm_and_si128(b: a128._mm_xor_si128(b: b128)))
+
+			// Add the predictor to the residual and, for the next iteration,
+			// set its previous pixel, a128, to this one, x128.
+			x128.load_u32!(a: c.peek_u32le())
+			x128 = x128._mm_add_epi8(b: p128)
+			a128 = x128
+			c.poke_u32le!(a: x128.truncate_u32())
+		}
+	}
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index 45c8746..0dc152e 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -256,7 +256,9 @@
 		choose filter_1 = [
 			filter_1_distance_4_sse128,
 			filter_1_distance_4_fallback]
-		choose filter_3 = [filter_3_distance_4_fallback]
+		choose filter_3 = [
+			filter_3_distance_4_sse128,
+			filter_3_distance_4_fallback]
 		choose filter_4 = [filter_4_distance_4_fallback]
 	}
 }