Add slice base.u8 peek/poke methods

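The little-endian u16, u32 and u64 poke helpers now use memcpy when built
with gcc (but not clang) on x86_64. std/png/decode_filter_sse128.wuffs now
calls the new peek_u32le and poke_u32le methods instead of hand-written
shifts and masks.

name                                  old speed       new speed       delta
wuffs_png_decode_filter_1_sub/clang9  4.59GB/s ± 0%   4.59GB/s ± 0%      ~     (p=0.151 n=5+5)
wuffs_png_decode_filter_1_sub/gcc10   1.85GB/s ± 0%   4.11GB/s ± 0%  +122.24%  (p=0.016 n=5+4)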
diff --git a/doc/changelog.md b/doc/changelog.md
index 4ffedf1..3138787 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -19,6 +19,7 @@
 - Added `example/json-to-cbor`.
 - Added `example/jsonfindptrs`.
 - Added `example/jsonptr`.
+- Added `slice base.u8 peek/poke` methods.
 - Added `std/bmp`.
 - Added `std/cbor`.
 - Added `std/gif.config_decoder`.
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index cff5fcf..a36cf70 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -718,8 +718,14 @@
 
 static inline void  //
 wuffs_base__poke_u16le__no_bounds_check(uint8_t* p, uint16_t x) {
+#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)
+  // This seems to perform better on gcc 10 (but not clang 9). Clang also
+  // defines "__GNUC__".
+  memcpy(p, &x, 2);
+#else
   p[0] = (uint8_t)(x >> 0);
   p[1] = (uint8_t)(x >> 8);
+#endif
 }
 
 static inline void  //
@@ -746,10 +752,16 @@
 
 static inline void  //
 wuffs_base__poke_u32le__no_bounds_check(uint8_t* p, uint32_t x) {
+#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)
+  // This seems to perform better on gcc 10 (but not clang 9). Clang also
+  // defines "__GNUC__".
+  memcpy(p, &x, 4);
+#else
   p[0] = (uint8_t)(x >> 0);
   p[1] = (uint8_t)(x >> 8);
   p[2] = (uint8_t)(x >> 16);
   p[3] = (uint8_t)(x >> 24);
+#endif
 }
 
 static inline void  //
@@ -826,6 +838,11 @@
 
 static inline void  //
 wuffs_base__poke_u64le__no_bounds_check(uint8_t* p, uint64_t x) {
+#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)
+  // This seems to perform better on gcc 10 (but not clang 9). Clang also
+  // defines "__GNUC__".
+  memcpy(p, &x, 8);
+#else
   p[0] = (uint8_t)(x >> 0);
   p[1] = (uint8_t)(x >> 8);
   p[2] = (uint8_t)(x >> 16);
@@ -834,6 +851,7 @@
   p[5] = (uint8_t)(x >> 40);
   p[6] = (uint8_t)(x >> 48);
   p[7] = (uint8_t)(x >> 56);
+#endif
 }
 
 // --------
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 48ebc82..d31af4c 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -591,6 +591,29 @@
 		b.writes(", ")
 		return g.writeArgs(b, args, depth)
 	}
+
+	if (t.IDPeekU8 <= method) && (method <= t.IDPeekU64LE) {
+		b.printf("wuffs_base__%s__no_bounds_check(", method.Str(g.tm))
+		if err := g.writeExpr(b, recv, depth); err != nil {
+			return err
+		}
+		b.writes(".ptr)")
+		return nil
+	}
+
+	if (t.IDPokeU8 <= method) && (method <= t.IDPokeU64LE) {
+		b.printf("(wuffs_base__%s__no_bounds_check(", method.Str(g.tm))
+		if err := g.writeExpr(b, recv, depth); err != nil {
+			return err
+		}
+		b.writes(".ptr, ")
+		if err := g.writeExpr(b, args[0].AsArg().Value(), depth); err != nil {
+			return err
+		}
+		b.writes("), wuffs_base__make_empty_struct())")
+		return nil
+	}
+
 	return errNoSuchBuiltin
 }
 
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index ce5aa5f..8358c8f 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -96,10 +96,11 @@
 	"1]) << 32) |\n         ((uint64_t)(p[2]) << 24) | ((uint64_t)(p[3]) << 16) |\n         ((uint64_t)(p[4]) << 8) | ((uint64_t)(p[5]) << 0);\n}\n\nstatic inline uint64_t  //\nwuffs_base__peek_u48le__no_bounds_check(const uint8_t* p) {\n  return ((uint64_t)(p[0]) << 0) | ((uint64_t)(p[1]) << 8) |\n         ((uint64_t)(p[2]) << 16) | ((uint64_t)(p[3]) << 24) |\n         ((uint64_t)(p[4]) << 32) | ((uint64_t)(p[5]) << 40);\n}\n\nstatic inline uint64_t  //\nwuffs_base__peek_u56be__no_bounds_check(const uint8_t* p) {\n  return ((uint64_t)(p[0]) << 48) | ((uint64_t)(p[1]) << 40) |\n         ((uint64_t)(p[2]) << 32) | ((uint64_t)(p[3]) << 24) |\n         ((uint64_t)(p[4]) << 16) | ((uint64_t)(p[5]) << 8) |\n         ((uint64_t)(p[6]) << 0);\n}\n\nstatic inline uint64_t  //\nwuffs_base__peek_u56le__no_bounds_check(const uint8_t* p) {\n  return ((uint64_t)(p[0]) << 0) | ((uint64_t)(p[1]) << 8) |\n         ((uint64_t)(p[2]) << 16) | ((uint64_t)(p[3]) << 24) |\n         ((uint64_t)(p[4]) << 32) | ((uint64_t)(p[5]) << 40) |\n         ((uint64_t)(p[" +
 	"6]) << 48);\n}\n\nstatic inline uint64_t  //\nwuffs_base__peek_u64be__no_bounds_check(const uint8_t* p) {\n  return ((uint64_t)(p[0]) << 56) | ((uint64_t)(p[1]) << 48) |\n         ((uint64_t)(p[2]) << 40) | ((uint64_t)(p[3]) << 32) |\n         ((uint64_t)(p[4]) << 24) | ((uint64_t)(p[5]) << 16) |\n         ((uint64_t)(p[6]) << 8) | ((uint64_t)(p[7]) << 0);\n}\n\nstatic inline uint64_t  //\nwuffs_base__peek_u64le__no_bounds_check(const uint8_t* p) {\n  return ((uint64_t)(p[0]) << 0) | ((uint64_t)(p[1]) << 8) |\n         ((uint64_t)(p[2]) << 16) | ((uint64_t)(p[3]) << 24) |\n         ((uint64_t)(p[4]) << 32) | ((uint64_t)(p[5]) << 40) |\n         ((uint64_t)(p[6]) << 48) | ((uint64_t)(p[7]) << 56);\n}\n\n" +
 	"" +
-	"// --------\n\n#define wuffs_base__poke_u8be__no_bounds_check \\\n  wuffs_base__poke_u8__no_bounds_check\n#define wuffs_base__poke_u8le__no_bounds_check \\\n  wuffs_base__poke_u8__no_bounds_check\n\nstatic inline void  //\nwuffs_base__poke_u8__no_bounds_check(uint8_t* p, uint8_t x) {\n  p[0] = x;\n}\n\nstatic inline void  //\nwuffs_base__poke_u16be__no_bounds_check(uint8_t* p, uint16_t x) {\n  p[0] = (uint8_t)(x >> 8);\n  p[1] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u16le__no_bounds_check(uint8_t* p, uint16_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n}\n\nstatic inline void  //\nwuffs_base__poke_u24be__no_bounds_check(uint8_t* p, uint32_t x) {\n  p[0] = (uint8_t)(x >> 16);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u24le__no_bounds_check(uint8_t* p, uint32_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n}\n\nstatic inline void  //\nwuffs_base__poke_u32be__no_bounds_check(uint8_t* p, uint32_t " +
-	"x) {\n  p[0] = (uint8_t)(x >> 24);\n  p[1] = (uint8_t)(x >> 16);\n  p[2] = (uint8_t)(x >> 8);\n  p[3] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u32le__no_bounds_check(uint8_t* p, uint32_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n}\n\nstatic inline void  //\nwuffs_base__poke_u40be__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 32);\n  p[1] = (uint8_t)(x >> 24);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 8);\n  p[4] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u40le__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 32);\n}\n\nstatic inline void  //\nwuffs_base__poke_u48be__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 40);\n  p[1] = (uint8_t)(x >> 32);\n  p[2] = (uint8_t)(x >> 24);\n  p[3] = (uint8_t)(x >> 16);\n  p[4] = (uint8_t)(x " +
-	">> 8);\n  p[5] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u48le__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 32);\n  p[5] = (uint8_t)(x >> 40);\n}\n\nstatic inline void  //\nwuffs_base__poke_u56be__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 48);\n  p[1] = (uint8_t)(x >> 40);\n  p[2] = (uint8_t)(x >> 32);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 16);\n  p[5] = (uint8_t)(x >> 8);\n  p[6] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u56le__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 32);\n  p[5] = (uint8_t)(x >> 40);\n  p[6] = (uint8_t)(x >> 48);\n}\n\nstatic inline void  //\nwuffs_base__poke_u64be__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 56);\n  p[1] = (uint8_t)" +
-	"(x >> 48);\n  p[2] = (uint8_t)(x >> 40);\n  p[3] = (uint8_t)(x >> 32);\n  p[4] = (uint8_t)(x >> 24);\n  p[5] = (uint8_t)(x >> 16);\n  p[6] = (uint8_t)(x >> 8);\n  p[7] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u64le__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 32);\n  p[5] = (uint8_t)(x >> 40);\n  p[6] = (uint8_t)(x >> 48);\n  p[7] = (uint8_t)(x >> 56);\n}\n\n" +
+	"// --------\n\n#define wuffs_base__poke_u8be__no_bounds_check \\\n  wuffs_base__poke_u8__no_bounds_check\n#define wuffs_base__poke_u8le__no_bounds_check \\\n  wuffs_base__poke_u8__no_bounds_check\n\nstatic inline void  //\nwuffs_base__poke_u8__no_bounds_check(uint8_t* p, uint8_t x) {\n  p[0] = x;\n}\n\nstatic inline void  //\nwuffs_base__poke_u16be__no_bounds_check(uint8_t* p, uint16_t x) {\n  p[0] = (uint8_t)(x >> 8);\n  p[1] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u16le__no_bounds_check(uint8_t* p, uint16_t x) {\n#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)\n  // This seems to perform better on gcc 10 (but not clang 9). Clang also\n  // defines \"__GNUC__\".\n  memcpy(p, &x, 2);\n#else\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n#endif\n}\n\nstatic inline void  //\nwuffs_base__poke_u24be__no_bounds_check(uint8_t* p, uint32_t x) {\n  p[0] = (uint8_t)(x >> 16);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u24le__no_bounds_chec" +
+	"k(uint8_t* p, uint32_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n}\n\nstatic inline void  //\nwuffs_base__poke_u32be__no_bounds_check(uint8_t* p, uint32_t x) {\n  p[0] = (uint8_t)(x >> 24);\n  p[1] = (uint8_t)(x >> 16);\n  p[2] = (uint8_t)(x >> 8);\n  p[3] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u32le__no_bounds_check(uint8_t* p, uint32_t x) {\n#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)\n  // This seems to perform better on gcc 10 (but not clang 9). Clang also\n  // defines \"__GNUC__\".\n  memcpy(p, &x, 4);\n#else\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n#endif\n}\n\nstatic inline void  //\nwuffs_base__poke_u40be__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 32);\n  p[1] = (uint8_t)(x >> 24);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 8);\n  p[4] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u40le__no_bounds_chec" +
+	"k(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 32);\n}\n\nstatic inline void  //\nwuffs_base__poke_u48be__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 40);\n  p[1] = (uint8_t)(x >> 32);\n  p[2] = (uint8_t)(x >> 24);\n  p[3] = (uint8_t)(x >> 16);\n  p[4] = (uint8_t)(x >> 8);\n  p[5] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u48le__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 32);\n  p[5] = (uint8_t)(x >> 40);\n}\n\nstatic inline void  //\nwuffs_base__poke_u56be__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 48);\n  p[1] = (uint8_t)(x >> 40);\n  p[2] = (uint8_t)(x >> 32);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 16);\n  p[5] = (uint8_t)(x >> 8);\n  p[6] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_" +
+	"base__poke_u56le__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 32);\n  p[5] = (uint8_t)(x >> 40);\n  p[6] = (uint8_t)(x >> 48);\n}\n\nstatic inline void  //\nwuffs_base__poke_u64be__no_bounds_check(uint8_t* p, uint64_t x) {\n  p[0] = (uint8_t)(x >> 56);\n  p[1] = (uint8_t)(x >> 48);\n  p[2] = (uint8_t)(x >> 40);\n  p[3] = (uint8_t)(x >> 32);\n  p[4] = (uint8_t)(x >> 24);\n  p[5] = (uint8_t)(x >> 16);\n  p[6] = (uint8_t)(x >> 8);\n  p[7] = (uint8_t)(x >> 0);\n}\n\nstatic inline void  //\nwuffs_base__poke_u64le__no_bounds_check(uint8_t* p, uint64_t x) {\n#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)\n  // This seems to perform better on gcc 10 (but not clang 9). Clang also\n  // defines \"__GNUC__\".\n  memcpy(p, &x, 8);\n#else\n  p[0] = (uint8_t)(x >> 0);\n  p[1] = (uint8_t)(x >> 8);\n  p[2] = (uint8_t)(x >> 16);\n  p[3] = (uint8_t)(x >> 24);\n  p[4] = (uint8_t)(x >> 32);\n  p[5] = (uin" +
+	"t8_t)(x >> 40);\n  p[6] = (uint8_t)(x >> 48);\n  p[7] = (uint8_t)(x >> 56);\n#endif\n}\n\n" +
 	"" +
 	"// --------\n\n// Load and Store functions are deprecated. Use Peek and Poke instead.\n\n#define wuffs_base__load_u8__no_bounds_check \\\n  wuffs_base__peek_u8__no_bounds_check\n#define wuffs_base__load_u16be__no_bounds_check \\\n  wuffs_base__peek_u16be__no_bounds_check\n#define wuffs_base__load_u16le__no_bounds_check \\\n  wuffs_base__peek_u16le__no_bounds_check\n#define wuffs_base__load_u24be__no_bounds_check \\\n  wuffs_base__peek_u24be__no_bounds_check\n#define wuffs_base__load_u24le__no_bounds_check \\\n  wuffs_base__peek_u24le__no_bounds_check\n#define wuffs_base__load_u32be__no_bounds_check \\\n  wuffs_base__peek_u32be__no_bounds_check\n#define wuffs_base__load_u32le__no_bounds_check \\\n  wuffs_base__peek_u32le__no_bounds_check\n#define wuffs_base__load_u40be__no_bounds_check \\\n  wuffs_base__peek_u40be__no_bounds_check\n#define wuffs_base__load_u40le__no_bounds_check \\\n  wuffs_base__peek_u40le__no_bounds_check\n#define wuffs_base__load_u48be__no_bounds_check \\\n  wuffs_base__peek_u48be__no_bounds_check\n#define wuffs_base__load_" +
 	"u48le__no_bounds_check \\\n  wuffs_base__peek_u48le__no_bounds_check\n#define wuffs_base__load_u56be__no_bounds_check \\\n  wuffs_base__peek_u56be__no_bounds_check\n#define wuffs_base__load_u56le__no_bounds_check \\\n  wuffs_base__peek_u56le__no_bounds_check\n#define wuffs_base__load_u64be__no_bounds_check \\\n  wuffs_base__peek_u64be__no_bounds_check\n#define wuffs_base__load_u64le__no_bounds_check \\\n  wuffs_base__peek_u64le__no_bounds_check\n\n#define wuffs_base__store_u8__no_bounds_check \\\n  wuffs_base__poke_u8__no_bounds_check\n#define wuffs_base__store_u16be__no_bounds_check \\\n  wuffs_base__poke_u16be__no_bounds_check\n#define wuffs_base__store_u16le__no_bounds_check \\\n  wuffs_base__poke_u16le__no_bounds_check\n#define wuffs_base__store_u24be__no_bounds_check \\\n  wuffs_base__poke_u24be__no_bounds_check\n#define wuffs_base__store_u24le__no_bounds_check \\\n  wuffs_base__poke_u24le__no_bounds_check\n#define wuffs_base__store_u32be__no_bounds_check \\\n  wuffs_base__poke_u32be__no_bounds_check\n#define wuffs_base__store_u32le__no_" +
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index cb2f730..19357c0 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -608,6 +608,40 @@
 	"GENERIC T1.suffix(up_to: u64) T1",
 }
 
+var SliceU8Funcs = []string{
+	"GENERIC T1.peek_u8() u8",
+	"GENERIC T1.peek_u16be() u16",
+	"GENERIC T1.peek_u16le() u16",
+	"GENERIC T1.peek_u24be() u32",
+	"GENERIC T1.peek_u24le() u32",
+	"GENERIC T1.peek_u32be() u32",
+	"GENERIC T1.peek_u32le() u32",
+	"GENERIC T1.peek_u40be() u64",
+	"GENERIC T1.peek_u40le() u64",
+	"GENERIC T1.peek_u48be() u64",
+	"GENERIC T1.peek_u48le() u64",
+	"GENERIC T1.peek_u56be() u64",
+	"GENERIC T1.peek_u56le() u64",
+	"GENERIC T1.peek_u64be() u64",
+	"GENERIC T1.peek_u64le() u64",
+
+	"GENERIC T1.poke_u8!(a: u8)",
+	"GENERIC T1.poke_u16be!(a: u16)",
+	"GENERIC T1.poke_u16le!(a: u16)",
+	"GENERIC T1.poke_u24be!(a: u32)",
+	"GENERIC T1.poke_u24le!(a: u32)",
+	"GENERIC T1.poke_u32be!(a: u32)",
+	"GENERIC T1.poke_u32le!(a: u32)",
+	"GENERIC T1.poke_u40be!(a: u64)",
+	"GENERIC T1.poke_u40le!(a: u64)",
+	"GENERIC T1.poke_u48be!(a: u64)",
+	"GENERIC T1.poke_u48le!(a: u64)",
+	"GENERIC T1.poke_u56be!(a: u64)",
+	"GENERIC T1.poke_u56le!(a: u64)",
+	"GENERIC T1.poke_u64be!(a: u64)",
+	"GENERIC T1.poke_u64le!(a: u64)",
+}
+
 var TableFuncs = []string{
 	"GENERIC T2.height() u64",
 	"GENERIC T2.stride() u64",
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index d9a27bc..ffcd7a8 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1051,6 +1051,8 @@
 	recv := lhs.LHS().AsExpr()
 	method := lhs.Ident()
 
+	advance, advanceExpr, update := (*big.Int)(nil), (*a.Expr)(nil), false
+
 	if recvTyp := recv.MType(); recvTyp == nil {
 		return bounds{}, errNotASpecialCase
 
@@ -1094,8 +1096,6 @@
 		}
 
 	} else if recvTyp.IsIOTokenType() {
-		advance, advanceExpr, update := (*big.Int)(nil), (*a.Expr)(nil), false
-
 		if method == t.IDUndoByte {
 			if err := q.canUndoByte(recv); err != nil {
 				return bounds{}, err
@@ -1146,23 +1146,31 @@
 			}
 		}
 
-		if (advance != nil) || (advanceExpr != nil) {
-			if ok, err := q.optimizeIOMethodAdvance(recv, advance, advanceExpr, update); err != nil {
-				return bounds{}, err
-			} else if !ok {
-				adv := ""
-				if advance != nil {
-					adv = advance.String()
-				} else {
-					adv = advanceExpr.Str(q.tm)
-				}
-				return bounds{}, fmt.Errorf("check: could not prove %s pre-condition: %s.length() >= %s",
-					method.Str(q.tm), recv.Str(q.tm), adv)
+	} else if recvTyp.Eq(typeExprSliceU8) {
+		if method >= t.IDPeekU8 {
+			if m := method - t.IDPeekU8; m < t.ID(len(ioMethodAdvances)) {
+				au := ioMethodAdvances[m]
+				advance, update = au.advance, au.update
 			}
-			// TODO: drop other recv-related facts?
 		}
 	}
 
+	if (advance != nil) || (advanceExpr != nil) {
+		if ok, err := q.optimizeIOMethodAdvance(recv, advance, advanceExpr, update); err != nil {
+			return bounds{}, err
+		} else if !ok {
+			adv := ""
+			if advance != nil {
+				adv = advance.String()
+			} else {
+				adv = advanceExpr.Str(q.tm)
+			}
+			return bounds{}, fmt.Errorf("check: could not prove %s pre-condition: %s.length() >= %s",
+				method.Str(q.tm), recv.Str(q.tm), adv)
+		}
+		// TODO: drop other recv-related facts?
+	}
+
 	return bounds{}, errNotASpecialCase
 }
 
@@ -1319,6 +1327,22 @@
 	t.IDPeekU64BE - t.IDPeekU8:      {eight, false},
 	t.IDPeekU64LE - t.IDPeekU8:      {eight, false},
 
+	t.IDPokeU8 - t.IDPeekU8:    {one, false},
+	t.IDPokeU16BE - t.IDPeekU8: {two, false},
+	t.IDPokeU16LE - t.IDPeekU8: {two, false},
+	t.IDPokeU24BE - t.IDPeekU8: {three, false},
+	t.IDPokeU24LE - t.IDPeekU8: {three, false},
+	t.IDPokeU32BE - t.IDPeekU8: {four, false},
+	t.IDPokeU32LE - t.IDPeekU8: {four, false},
+	t.IDPokeU40BE - t.IDPeekU8: {five, false},
+	t.IDPokeU40LE - t.IDPeekU8: {five, false},
+	t.IDPokeU48BE - t.IDPeekU8: {six, false},
+	t.IDPokeU48LE - t.IDPeekU8: {six, false},
+	t.IDPokeU56BE - t.IDPeekU8: {seven, false},
+	t.IDPokeU56LE - t.IDPeekU8: {seven, false},
+	t.IDPokeU64BE - t.IDPeekU8: {eight, false},
+	t.IDPokeU64LE - t.IDPeekU8: {eight, false},
+
 	t.IDWriteU8Fast - t.IDPeekU8:    {one, true},
 	t.IDWriteU16BEFast - t.IDPeekU8: {two, true},
 	t.IDWriteU16LEFast - t.IDPeekU8: {two, true},
diff --git a/lang/check/check.go b/lang/check/check.go
index 610f9d6..84dc12d 100644
--- a/lang/check/check.go
+++ b/lang/check/check.go
@@ -91,8 +91,9 @@
 		funcs:     map[t.QQID]*a.Func{},
 		localVars: map[t.QQID]typeMap{},
 
-		builtInSliceFuncs: map[t.QQID]*a.Func{},
-		builtInTableFuncs: map[t.QQID]*a.Func{},
+		builtInSliceFuncs:   map[t.QQID]*a.Func{},
+		builtInSliceU8Funcs: map[t.QQID]*a.Func{},
+		builtInTableFuncs:   map[t.QQID]*a.Func{},
 
 		builtInInterfaces:     map[t.QID][]t.QQID{},
 		builtInInterfaceFuncs: map[t.QQID]*a.Func{},
@@ -105,6 +106,9 @@
 	if err := c.parseBuiltInFuncs(c.builtInSliceFuncs, builtin.SliceFuncs); err != nil {
 		return nil, err
 	}
+	if err := c.parseBuiltInFuncs(c.builtInSliceU8Funcs, builtin.SliceU8Funcs); err != nil {
+		return nil, err
+	}
 	if err := c.parseBuiltInFuncs(c.builtInTableFuncs, builtin.TableFuncs); err != nil {
 		return nil, err
 	}
@@ -221,8 +225,9 @@
 	funcs     map[t.QQID]*a.Func
 	localVars map[t.QQID]typeMap
 
-	builtInSliceFuncs map[t.QQID]*a.Func
-	builtInTableFuncs map[t.QQID]*a.Func
+	builtInSliceFuncs   map[t.QQID]*a.Func
+	builtInSliceU8Funcs map[t.QQID]*a.Func
+	builtInTableFuncs   map[t.QQID]*a.Func
 
 	builtInInterfaces     map[t.QID][]t.QQID
 	builtInInterfaceFuncs map[t.QQID]*a.Func
diff --git a/lang/check/optimize.go b/lang/check/optimize.go
index c96dba0..4c4202e 100644
--- a/lang/check/optimize.go
+++ b/lang/check/optimize.go
@@ -58,7 +58,9 @@
 		// receiver.length(), even if they aren't an exact match.
 
 		op := x.Operator()
-		if op != t.IDXBinaryGreaterEq && op != t.IDXBinaryGreaterThan {
+		if (op != t.IDXBinaryGreaterEq) &&
+			(op != t.IDXBinaryGreaterThan) &&
+			(op != t.IDXBinaryEqEq) {
 			return x, nil
 		}
 
diff --git a/lang/check/resolve.go b/lang/check/resolve.go
index de615b2..c9cd8f7 100644
--- a/lang/check/resolve.go
+++ b/lang/check/resolve.go
@@ -154,6 +154,11 @@
 		if f := c.builtInSliceFuncs[qqid]; f != nil {
 			return f, nil
 		}
+		if lTyp.Eq(typeExprSliceU8) {
+			if f := c.builtInSliceU8Funcs[qqid]; f != nil {
+				return f, nil
+			}
+		}
 
 	} else if lTyp.IsTableType() {
 		qqid[0] = t.IDBase
diff --git a/lang/check/type.go b/lang/check/type.go
index caedc7b..cde48c3 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -665,20 +665,21 @@
 	if lTyp.IsSliceType() {
 		qqid[0] = t.IDBase
 		qqid[1] = t.IDDagger1
-		if q.c.builtInSliceFuncs[qqid] == nil {
-			return fmt.Errorf("check: no slice method %q", n.Ident().Str(q.tm))
+		if (q.c.builtInSliceFuncs[qqid] != nil) ||
+			((q.c.builtInSliceU8Funcs[qqid] != nil) && lTyp.Eq(typeExprSliceU8)) {
+			n.SetMType(a.NewTypeExpr(t.IDFunc, 0, n.Ident(), lTyp.AsNode(), nil, nil))
+			return nil
 		}
-		n.SetMType(a.NewTypeExpr(t.IDFunc, 0, n.Ident(), lTyp.AsNode(), nil, nil))
-		return nil
+		return fmt.Errorf("check: no slice method %q", n.Ident().Str(q.tm))
 
 	} else if lTyp.IsTableType() {
 		qqid[0] = t.IDBase
 		qqid[1] = t.IDDagger2
-		if q.c.builtInTableFuncs[qqid] == nil {
-			return fmt.Errorf("check: no table method %q", n.Ident().Str(q.tm))
+		if q.c.builtInTableFuncs[qqid] != nil {
+			n.SetMType(a.NewTypeExpr(t.IDFunc, 0, n.Ident(), lTyp.AsNode(), nil, nil))
+			return nil
 		}
-		n.SetMType(a.NewTypeExpr(t.IDFunc, 0, n.Ident(), lTyp.AsNode(), nil, nil))
-		return nil
+		return fmt.Errorf("check: no table method %q", n.Ident().Str(q.tm))
 
 	} else if lTyp.Decorator() != 0 {
 		return fmt.Errorf("check: invalid type %q for dot-expression LHS %q", lTyp.Str(q.tm), lhs.Str(q.tm))
diff --git a/lang/token/list.go b/lang/token/list.go
index c216ebf..24ad4a9 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -582,6 +582,24 @@
 
 	// --------
 
+	IDPokeU8    = ID(0x1D1)
+	IDPokeU16BE = ID(0x1D2)
+	IDPokeU16LE = ID(0x1D3)
+	IDPokeU24BE = ID(0x1D4)
+	IDPokeU24LE = ID(0x1D5)
+	IDPokeU32BE = ID(0x1D6)
+	IDPokeU32LE = ID(0x1D7)
+	IDPokeU40BE = ID(0x1D8)
+	IDPokeU40LE = ID(0x1D9)
+	IDPokeU48BE = ID(0x1DA)
+	IDPokeU48LE = ID(0x1DB)
+	IDPokeU56BE = ID(0x1DC)
+	IDPokeU56LE = ID(0x1DD)
+	IDPokeU64BE = ID(0x1DE)
+	IDPokeU64LE = ID(0x1DF)
+
+	// --------
+
 	IDWriteU8Fast    = ID(0x1E1)
 	IDWriteU16BEFast = ID(0x1E2)
 	IDWriteU16LEFast = ID(0x1E3)
@@ -969,6 +987,24 @@
 
 	// --------
 
+	IDPokeU8:    "poke_u8",
+	IDPokeU16BE: "poke_u16be",
+	IDPokeU16LE: "poke_u16le",
+	IDPokeU24BE: "poke_u24be",
+	IDPokeU24LE: "poke_u24le",
+	IDPokeU32BE: "poke_u32be",
+	IDPokeU32LE: "poke_u32le",
+	IDPokeU40BE: "poke_u40be",
+	IDPokeU40LE: "poke_u40le",
+	IDPokeU48BE: "poke_u48be",
+	IDPokeU48LE: "poke_u48le",
+	IDPokeU56BE: "poke_u56be",
+	IDPokeU56LE: "poke_u56le",
+	IDPokeU64BE: "poke_u64be",
+	IDPokeU64LE: "poke_u64le",
+
+	// --------
+
 	IDWriteU8Fast:    "write_u8_fast",
 	IDWriteU16BEFast: "write_u16be_fast",
 	IDWriteU16LEFast: "write_u16le_fast",
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index dc02664..3de64cf 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -792,8 +792,14 @@
 
 static inline void  //
 wuffs_base__poke_u16le__no_bounds_check(uint8_t* p, uint16_t x) {
+#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)
+  // This seems to perform better on gcc 10 (but not clang 9). Clang also
+  // defines "__GNUC__".
+  memcpy(p, &x, 2);
+#else
   p[0] = (uint8_t)(x >> 0);
   p[1] = (uint8_t)(x >> 8);
+#endif
 }
 
 static inline void  //
@@ -820,10 +826,16 @@
 
 static inline void  //
 wuffs_base__poke_u32le__no_bounds_check(uint8_t* p, uint32_t x) {
+#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)
+  // This seems to perform better on gcc 10 (but not clang 9). Clang also
+  // defines "__GNUC__".
+  memcpy(p, &x, 4);
+#else
   p[0] = (uint8_t)(x >> 0);
   p[1] = (uint8_t)(x >> 8);
   p[2] = (uint8_t)(x >> 16);
   p[3] = (uint8_t)(x >> 24);
+#endif
 }
 
 static inline void  //
@@ -900,6 +912,11 @@
 
 static inline void  //
 wuffs_base__poke_u64le__no_bounds_check(uint8_t* p, uint64_t x) {
+#if defined(__GNUC__) && !defined(__clang__) && defined(__x86_64__)
+  // This seems to perform better on gcc 10 (but not clang 9). Clang also
+  // defines "__GNUC__".
+  memcpy(p, &x, 8);
+#else
   p[0] = (uint8_t)(x >> 0);
   p[1] = (uint8_t)(x >> 8);
   p[2] = (uint8_t)(x >> 16);
@@ -908,6 +925,7 @@
   p[5] = (uint8_t)(x >> 40);
   p[6] = (uint8_t)(x >> 48);
   p[7] = (uint8_t)(x >> 56);
+#endif
 }
 
 // --------
@@ -31045,18 +31063,12 @@
     v_c.len = 4;
     uint8_t* i_end0_c = i_slice_c.ptr + ((i_slice_c.len / 4) * 4);
     while (v_c.ptr < i_end0_c) {
-      v_x32 = ((((uint32_t)(v_c.ptr[0])) << 0) |
-          (((uint32_t)(v_c.ptr[1])) << 8) |
-          (((uint32_t)(v_c.ptr[2])) << 16) |
-          (((uint32_t)(v_c.ptr[3])) << 24));
+      v_x32 = wuffs_base__peek_u32le__no_bounds_check(v_c.ptr);
       (v_x128 = _mm_cvtsi32_si128((int)(v_x32)), wuffs_base__make_empty_struct());
       v_x128 = _mm_add_epi8(v_x128, v_a128);
       v_a128 = v_x128;
       v_x32 = ((uint32_t)(_mm_cvtsi128_si32(v_x128)));
-      v_c.ptr[0] = ((uint8_t)((255 & (v_x32 >> 0))));
-      v_c.ptr[1] = ((uint8_t)((255 & (v_x32 >> 8))));
-      v_c.ptr[2] = ((uint8_t)((255 & (v_x32 >> 16))));
-      v_c.ptr[3] = ((uint8_t)((255 & (v_x32 >> 24))));
+      (wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, v_x32), wuffs_base__make_empty_struct());
       v_c.ptr += 4;
     }
   }
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
index 1eac907..0338f1c 100644
--- a/std/png/decode_filter_sse128.wuffs
+++ b/std/png/decode_filter_sse128.wuffs
@@ -25,19 +25,13 @@
 	var a128 : base.sse128_i
 
 	iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
-		x32 = ((c[0] as base.u32) << 0x00) |
-			((c[1] as base.u32) << 0x08) |
-			((c[2] as base.u32) << 0x10) |
-			((c[3] as base.u32) << 0x18)
+		x32 = c.peek_u32le()
 		x128.load_u32!(a: x32)
 
 		x128 = x128._mm_add_epi8!(b: a128)
 		a128 = x128
 
 		x32 = x128.truncate_u32()
-		c[0] = (0xFF & (x32 >> 0x00)) as base.u8
-		c[1] = (0xFF & (x32 >> 0x08)) as base.u8
-		c[2] = (0xFF & (x32 >> 0x10)) as base.u8
-		c[3] = (0xFF & (x32 >> 0x18)) as base.u8
+		c.poke_u32le!(a: x32)
 	}
 }