Tweak builtin.SliceU8Funcs peek method names
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index d31af4c..9965cff 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -593,7 +593,11 @@
 	}
 
 	if (t.IDPeekU8 <= method) && (method <= t.IDPeekU64LE) {
-		b.printf("wuffs_base__%s__no_bounds_check(", method.Str(g.tm))
+		s := method.Str(g.tm)
+		if i := strings.Index(s, "_as_"); i >= 0 {
+			s = s[:i]
+		}
+		b.printf("wuffs_base__%s__no_bounds_check(", s)
 		if err := g.writeExpr(b, recv, depth); err != nil {
 			return err
 		}
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index e117e68..b7d713a 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -612,16 +612,16 @@
 	"GENERIC T1.peek_u8() u8",
 	"GENERIC T1.peek_u16be() u16",
 	"GENERIC T1.peek_u16le() u16",
-	"GENERIC T1.peek_u24be() u32",
-	"GENERIC T1.peek_u24le() u32",
+	"GENERIC T1.peek_u24be_as_u32() u32",
+	"GENERIC T1.peek_u24le_as_u32() u32",
 	"GENERIC T1.peek_u32be() u32",
 	"GENERIC T1.peek_u32le() u32",
-	"GENERIC T1.peek_u40be() u64",
-	"GENERIC T1.peek_u40le() u64",
-	"GENERIC T1.peek_u48be() u64",
-	"GENERIC T1.peek_u48le() u64",
-	"GENERIC T1.peek_u56be() u64",
-	"GENERIC T1.peek_u56le() u64",
+	"GENERIC T1.peek_u40be_as_u64() u64",
+	"GENERIC T1.peek_u40le_as_u64() u64",
+	"GENERIC T1.peek_u48be_as_u64() u64",
+	"GENERIC T1.peek_u48le_as_u64() u64",
+	"GENERIC T1.peek_u56be_as_u64() u64",
+	"GENERIC T1.peek_u56le_as_u64() u64",
 	"GENERIC T1.peek_u64be() u64",
 	"GENERIC T1.peek_u64le() u64",
 
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
index 746afcd..7858541 100644
--- a/std/png/decode_filter_sse128.wuffs
+++ b/std/png/decode_filter_sse128.wuffs
@@ -16,6 +16,28 @@
 
 // Filter 1: Sub.
 
+// This (filter = 1, distance = 3) implementation doesn't actually benchmark
+// faster than the non-SIMD one, so it is left commented out.
+//
+// pri func decoder.filter_1_distance_3_sse128!(curr: slice base.u8),
+//     choose cpu_arch >= sse128,
+// {
+//     var c    : slice base.u8
+//     var x128 : base.sse128_i
+//     var a128 : base.sse128_i
+//
+//     iterate (c = args.curr)(length: 4, advance: 3, unroll: 1) {
+//         x128.load_u32!(a: c.peek_u32le())
+//         x128 = x128._mm_add_epi8!(b: a128)
+//         a128 = x128
+//         c.poke_u24le!(a: x128.truncate_u32())
+//     } else (length: 3, advance: 3, unroll: 1) {
+//         x128.load_u32!(a: c.peek_u24le_as_u32())
+//         x128 = x128._mm_add_epi8!(b: a128)
+//         c.poke_u24le!(a: x128.truncate_u32())
+//     }
+// }
+
 pri func decoder.filter_1_distance_4_sse128!(curr: slice base.u8),
 	choose cpu_arch >= sse128,
 {