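Add cpu_arch

Add wuffs_base__cpu_arch__have_sse128(), a bool helper that reports whether
CPUID leaf 1 advertises both SSE4.2 and POPCNT (and returns false when
WUFFS_BASE__CPU_ARCH__X86_64 is not defined). Call it from the pixel swizzler
in pixconv-submodule.c instead of masking the result of
wuffs_base__cpu_arch__x86_64__capabilities() with
WUFFS_BASE__CPU_ARCH__X86_64__SSE128.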
diff --git a/doc/changelog.md b/doc/changelog.md
index 35364ab..5f910a2 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -10,6 +10,7 @@
 - Added `base` library support for UTF-8.
 - Added `base` library support for `atoi`-like string conversion.
 - Added `choose` and `choosy`.
+- Added `cpu_arch`.
 - Added `doc/logo`.
 - Added `endwhile` syntax.
 - Added `example/cbor-to-json`.
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index baf6c3a..d95f096 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -107,7 +107,22 @@
   return ret;
 #else
   return 0;
-#endif  // defined( WUFFS_BASE__CPU_ARCH__X86_64)
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+}
+
+static inline bool  //
+wuffs_base__cpu_arch__have_sse128() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  unsigned int eax1 = 0;
+  unsigned int ebx1 = 0;
+  unsigned int ecx1 = 0;
+  unsigned int edx1 = 0;
+  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {
+    const unsigned int sse128_ecx1 = bit_SSE4_2 | bit_POPCNT;
+    return (ecx1 & sse128_ecx1) == sse128_ecx1;
+  }
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  return false;
 }
 
 // ---------------- Fundamentals
diff --git a/internal/cgen/base/pixconv-submodule.c b/internal/cgen/base/pixconv-submodule.c
index 4ba2123..70910ca 100644
--- a/internal/cgen/base/pixconv-submodule.c
+++ b/internal/cgen/base/pixconv-submodule.c
@@ -2257,8 +2257,7 @@
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
     case WUFFS_BASE__PIXEL_FORMAT__RGBX:
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-      if (wuffs_base__cpu_arch__x86_64__capabilities() &
-          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+      if (wuffs_base__cpu_arch__have_sse128()) {
         return wuffs_base__pixel_swizzler__xxxx__y__sse128;
       }
 #endif
@@ -2563,8 +2562,7 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
     case WUFFS_BASE__PIXEL_FORMAT__BGRX:
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-      if (wuffs_base__cpu_arch__x86_64__capabilities() &
-          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+      if (wuffs_base__cpu_arch__have_sse128()) {
         return wuffs_base__pixel_swizzler__bgrw__rgb__sse128;
       }
 #endif
@@ -2611,8 +2609,7 @@
       switch (blend) {
         case WUFFS_BASE__PIXEL_BLEND__SRC:
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-          if (wuffs_base__cpu_arch__x86_64__capabilities() &
-              WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+          if (wuffs_base__cpu_arch__have_sse128()) {
             return wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128;
           }
 #endif
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 7a4c1ed..fb270e0 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -59,7 +59,7 @@
 	"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif  // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
 	"" +
 	"// ---------------- CPU Architecture\n\n// WUFFS_BASE__CPU_ARCH__X86_64__ETC are bits returned by\n// wuffs_base__cpu_arch__x86_64__capabilities.\n// - \"SSE128\" means all of SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and POPCNT.\n// - \"AVX256\" means all of AVX and AVX2.\n// - \"AVX512ETC\" is reserved, pending need. Note that AVX-512 consists of\n//   multiple extensions that may be implemented independently.\n#define WUFFS_BASE__CPU_ARCH__X86_64__SSE128 0x01\n#define WUFFS_BASE__CPU_ARCH__X86_64__AVX256 0x02\n\nstatic inline uint32_t  //\nwuffs_base__cpu_arch__x86_64__capabilities() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  uint32_t ret = 0;\n\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n    const unsigned int sse128_ecx1 = bit_SSE4_2 | bit_POPCNT;\n    if ((ecx1 & sse128_ecx1) == sse128_ecx1) {\n      ret |= WUFFS_BASE__CPU_ARCH__X86_64__SSE128;\n    }\n  }\n\n  unsigned int eax7 = 0;\n  unsigned int ebx7 = 0;\n  unsigned in" +
-	"t ecx7 = 0;\n  unsigned int edx7 = 0;\n  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n    const unsigned int avx256_ebx7 = bit_AVX2;\n    if ((ebx7 & avx256_ebx7) == avx256_ebx7) {\n      ret |= WUFFS_BASE__CPU_ARCH__X86_64__AVX256;\n    }\n  }\n\n  return ret;\n#else\n  return 0;\n#endif  // defined( WUFFS_BASE__CPU_ARCH__X86_64)\n}\n\n" +
+	"t ecx7 = 0;\n  unsigned int edx7 = 0;\n  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n    const unsigned int avx256_ebx7 = bit_AVX2;\n    if ((ebx7 & avx256_ebx7) == avx256_ebx7) {\n      ret |= WUFFS_BASE__CPU_ARCH__X86_64__AVX256;\n    }\n  }\n\n  return ret;\n#else\n  return 0;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_sse128() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n    const unsigned int sse128_ecx1 = bit_SSE4_2 | bit_POPCNT;\n    return (ecx1 & sse128_ecx1) == sse128_ecx1;\n  }\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\n" +
 	"" +
 	"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n//  - converting a uint32_t to a size_t will never overflow.\n//  - converting a size_t to a uint64_t will never overflow.\n#ifdef __WORDSIZE\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n#if defined(__clang__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED_FIELD __attribute__((unused))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED_FIELD\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
 	"" +
@@ -616,20 +616,20 @@
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__transparent_black_src(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    uint64_t num_pixels,\n    uint32_t dst_pixfmt_bytes_per_pixel) {\n  uint64_t n = ((uint64_t)dst_len) / dst_pixfmt_bytes_per_pixel;\n  if (n > num_pixels) {\n    n = num_pixels;\n  }\n  memset(dst_ptr, 0, ((size_t)(n * dst_pixfmt_bytes_per_pixel)));\n  return n;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__transparent_black_src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    uint64_t num_pixels,\n    uint32_t dst_pixfmt_bytes_per_pixel) {\n  uint64_t n = ((uint64_t)dst_len) / dst_pixfmt_bytes_per_pixel;\n  if (n > num_pixels) {\n    n = num_pixels;\n  }\n  return n;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,\n                                       wuffs_base__pixel_format dst_pixfmt,\n                                       wuffs_base__slice_u8 dst_palette,\n                                       wuffs_base__slice_u8 src_palette,\n                                       wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      return wuffs_base__pixel_swizzler__copy_1_1;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__pixel_swizzler__xxx__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BA" +
-	"SE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n      if (wuffs_base__cpu_arch__x86_64__capabilities() &\n          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {\n        return wuffs_base__pixel_swizzler__xxxx__y__sse128;\n      }\n#endif\n      return wuffs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NUL" +
-	"L;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_align4_bgr_565_888(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PI" +
-	"XEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel" +
-	"_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pi" +
-	"xfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__bgrw__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base" +
-	"__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_" +
-	"BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler" +
-	"__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BG" +
-	"RA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMA" +
-	"T__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgrx(wuffs_base__pixel_swizzler* p,\n                                          wuffs_base__pixel_format dst_pixfmt,\n                                          wuffs_base__slice_u8 dst_palette,\n                                          wuffs_base__slice_u8 src_palette,\n                                          wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__xxx__xxxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__pixel_swizzler__bgrw__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__copy_4_4;\n\n    case WUFFS_BASE__PIXE" +
-	"L_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgb(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__swap_rgb_bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FO" +
-	"RMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n      if (wuffs_base__cpu_arch__x86_64__capabilities() &\n          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {\n        return wuffs_base__pixel_swizzler__bgrw__rgb__sse128;\n      }\n#endif\n      return wuffs_base__pixel_swizzler__bgrw__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgba_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__" +
-	"PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n          if (wuffs_base__cpu_arch__x86_64__capabilities() &\n              WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {\n            return wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128;\n          }\n#endif\n          return wuffs_base__pixel_swizzler__swap_rgbx_bgrx;\n        case WUFFS_BA" +
-	"SE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
+	"SE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n      if (wuffs_base__cpu_arch__have_sse128()) {\n        return wuffs_base__pixel_swizzler__xxxx__y__sse128;\n      }\n#endif\n      return wuffs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE_" +
+	"_PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_align4_bgr_565_888(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_sw" +
+	"izzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL" +
+	"_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base_" +
+	"_slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__bgrw__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base_" +
+	"_pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel" +
+	"_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpr" +
+	"emul_4x16le(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BA" +
+	"SE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n" +
+	"\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgrx(wuffs_base__pixel_swizzler* p,\n                                          wuffs_base__pixel_format dst_pixfmt,\n                                          wuffs_base__slice_u8 dst_palette,\n                                          wuffs_base__slice_u8 src_palette,\n                                          wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__xxx__xxxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__pixel_swizzler__bgrw__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__copy_4_4;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NON" +
+	"PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgb(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__swap_rgb_bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX" +
+	":\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n      if (wuffs_base__cpu_arch__have_sse128()) {\n        return wuffs_base__pixel_swizzler__bgrw__rgb__sse128;\n      }\n#endif\n      return wuffs_base__pixel_swizzler__bgrw__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgba_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src;\n        case WUFFS_BASE" +
+	"__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n          if (wuffs_base__cpu_arch__have_sse128()) {\n            return wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128;\n          }\n#endif\n          return wuffs_base__pixel_swizzler__swap_rgbx_bgrx;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__B" +
+	"GRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
 	"" +
 	"// --------\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_pixfmt,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_pixfmt,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  if (!p) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  p->private_impl.func = NULL;\n  p->private_impl.transparent_black_func = NULL;\n  p->private_impl.dst_pixfmt_bytes_per_pixel = 0;\n  p->private_impl.src_pixfmt_bytes_per_pixel = 0;\n\n  wuffs_base__pixel_swizzler__func func = NULL;\n  wuffs_base__pixel_swizzler__transparent_black_func transparent_black_func =\n      NULL;\n\n  uint32_t dst_pixfmt_bits_per_pixel =\n      wuffs_base__pixel_format__bits_per_pixel(&dst_pixfmt);\n  if ((dst_pixfmt_bits_per_pixel == " +
 	"0) ||\n      ((dst_pixfmt_bits_per_pixel & 7) != 0)) {\n    return wuffs_base__make_status(\n        wuffs_base__error__unsupported_pixel_swizzler_option);\n  }\n\n  uint32_t src_pixfmt_bits_per_pixel =\n      wuffs_base__pixel_format__bits_per_pixel(&src_pixfmt);\n  if ((src_pixfmt_bits_per_pixel == 0) ||\n      ((src_pixfmt_bits_per_pixel & 7) != 0)) {\n    return wuffs_base__make_status(\n        wuffs_base__error__unsupported_pixel_swizzler_option);\n  }\n\n  // TODO: support many more formats.\n\n  switch (blend) {\n    case WUFFS_BASE__PIXEL_BLEND__SRC:\n      transparent_black_func =\n          wuffs_base__pixel_swizzler__transparent_black_src;\n      break;\n\n    case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n      transparent_black_func =\n          wuffs_base__pixel_swizzler__transparent_black_src_over;\n      break;\n  }\n\n  switch (src_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_pixfmt, dst_palette,\n                                                    src_palette" +
diff --git a/internal/cgen/func.go b/internal/cgen/func.go
index 65a2b9a..caa35bf 100644
--- a/internal/cgen/func.go
+++ b/internal/cgen/func.go
@@ -202,10 +202,18 @@
 }
 
 func (g *gen) writeFuncPrototype(b *buffer, n *a.Func) error {
+	caMacro, _, _ := cpuArchCNames(n.Asserts())
+	if caMacro != "" {
+		b.printf("#if defined(WUFFS_BASE__CPU_ARCH__%s)\n", caMacro)
+	}
 	if err := g.writeFuncSignature(b, n, wfsCDecl); err != nil {
 		return err
 	}
-	b.writes(";\n\n")
+	b.writes(";\n")
+	if caMacro != "" {
+		b.printf("#endif  // defined(WUFFS_BASE__CPU_ARCH__%s)\n", caMacro)
+	}
+	b.writes("\n")
 	if n.Choosy() {
 		if err := g.writeFuncSignature(b, n, wfsCDeclChoosy); err != nil {
 			return err
@@ -219,6 +227,15 @@
 	k := g.funks[n.QQID()]
 
 	b.printf("// -------- func %s.%s\n\n", g.pkgName, n.QQID().Str(g.tm))
+
+	caMacro, _, caAttribute := cpuArchCNames(n.Asserts())
+	if caMacro != "" {
+		b.printf("#if defined(WUFFS_BASE__CPU_ARCH__%s)\n", caMacro)
+	}
+	if caAttribute != "" {
+		b.printf("#if defined(__GNUC__)\n%s\n#endif\n", caAttribute)
+	}
+
 	if err := g.writeFuncSignature(b, n, wfsCDecl); err != nil {
 		return err
 	}
@@ -251,7 +268,11 @@
 	}
 
 	b.writex(k.bEpilogue)
-	b.writes("}\n\n")
+	b.writes("}\n")
+	if caMacro != "" {
+		b.printf("#endif  // defined(WUFFS_BASE__CPU_ARCH__%s)\n", caMacro)
+	}
+	b.writes("\n")
 	return nil
 }
 
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index 51025fa..a442a8c 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -227,19 +227,55 @@
 func (g *gen) writeStatementChoose(b *buffer, n *a.Choose, depth uint32) error {
 	recv := g.currFunk.astFunc.Receiver()
 	args := n.Args()
-	if len(args) != 1 {
-		return fmt.Errorf("TODO: multiple choice")
+	if len(args) == 0 {
+		return nil // No candidates: emit nothing.
 	}
-	id := args[0].AsExpr().Ident()
-	suffix := ""
-	if n.Name() == id {
-		suffix = "__choosy_default"
+	b.printf("self->private_impl.choosy_%s = (\n", n.Name().Str(g.tm))
+
+	conclusive := false // Set once a choice without a cpu_arch guard has been emitted.
+	for _, o := range args {
+		id := o.AsExpr().Ident()
+		suffix := ""
+		if n.Name() == id {
+			suffix = "__choosy_default"
+		}
+		caMacro, caName, _ := cpuArchCNames(g.funks[t.QQID{recv[0], recv[1], id}].astFunc.Asserts())
+		if caMacro == "" {
+			b.printf("&%s%s__%s%s", g.pkgPrefix, recv.Str(g.tm), id.Str(g.tm), suffix)
+			conclusive = true
+			break // An unguarded choice ends the ternary chain; later args are not emitted.
+		}
+		b.printf("#if defined(WUFFS_BASE__CPU_ARCH__%s)\n"+
+			"wuffs_base__cpu_arch__have_%s() ? &%s%s__%s%s :\n"+
+			"#endif\n",
+			caMacro, caName, g.pkgPrefix, recv.Str(g.tm), id.Str(g.tm), suffix)
 	}
-	b.printf("self->private_impl.choosy_%s = &%s%s__%s%s;\n",
-		n.Name().Str(g.tm), g.pkgPrefix, recv.Str(g.tm), id.Str(g.tm), suffix)
+
+	if !conclusive {
+		b.printf("self->private_impl.choosy_%s", n.Name().Str(g.tm)) // Every choice was guarded: the last arm keeps the field's current value.
+	}
+	b.writes(");\n")
 	return nil
 }
 
+// cpuArchCNames scans a func's assertions for "choose cpu_arch >= ARCH" and
+// returns the matching C names: the WUFFS_BASE__CPU_ARCH__ macro suffix, the
+// wuffs_base__cpu_arch__have_ function suffix and the attribute line emitted
+// before the C definition. All three are "" when there is no such assertion.
+func cpuArchCNames(asserts []*a.Node) (caMacro string, caName string, caAttribute string) {
+	for _, node := range asserts {
+		assertion := node.AsAssert()
+		if !assertion.IsChooseCPUArch() {
+			continue
+		}
+		// sse128 is the only architecture name handled here.
+		if assertion.Condition().RHS().AsExpr().Ident() == t.IDSSE128 {
+			return "X86_64", "sse128", "__attribute__((target(\"sse4.2\")))"
+		}
+	}
+	return "", "", ""
+}
+
 func (g *gen) writeStatementIOBind(b *buffer, n *a.IOBind, depth uint32) error {
 	if g.currFunk.ioBinds > maxIOBinds {
 		return fmt.Errorf("too many temporary variables required")
diff --git a/lang/ast/ast.go b/lang/ast/ast.go
index fe17c12..fd462f5 100644
--- a/lang/ast/ast.go
+++ b/lang/ast/ast.go
@@ -106,6 +106,7 @@
 	FlagsRetsError        = Flags(0x00004000)
 	FlagsPrivateData      = Flags(0x00008000)
 	FlagsChoosy           = Flags(0x00010000)
+	FlagsHasChooseCPUArch = Flags(0x00020000)
 )
 
 func (f Flags) AsEffect() Effect { return Effect(f) }
@@ -424,8 +425,9 @@
 	}
 }
 
-// Assert is "assert RHS via ID2(args)", "pre etc", "inv etc" or "post etc":
-//  - ID0:   <IDAssert|IDPre|IDInv|IDPost>
+// Assert is "assert RHS via ID2(args)", "choose etc", "pre etc", "inv etc" or
+// "post etc":
+//  - ID0:   <IDAssert|IDChoose|IDPre|IDInv|IDPost>
 //  - ID2:   <"-string literal> reason
 //  - RHS:   <Expr>
 //  - List0: <Arg> reason arguments
@@ -439,6 +441,26 @@
 
 func (n *Assert) DropExprCachedMBounds() error { return n.AsNode().Walk(dropExprCachedMBounds) }
 
+// IsChooseCPUArch reports whether n is a "choose cpu_arch >= ARCH" assertion
+// whose ARCH is a recognized architecture name (sse128 is the only one).
+func (n *Assert) IsChooseCPUArch() bool {
+	if n.id0 != t.IDChoose {
+		return false
+	}
+	cmp := n.Condition()
+	if cmp.Operator() != t.IDXBinaryGreaterEq {
+		return false
+	}
+	// Each operand must have a zero Operator of its own, and the left one
+	// must be the cpu_arch identifier.
+	left, right := cmp.LHS().AsExpr(), cmp.RHS().AsExpr()
+	simple := (left.Operator() == 0) && (right.Operator() == 0)
+	if !simple || (left.Ident() != t.IDCPUArch) {
+		return false
+	}
+	return right.Ident() == t.IDSSE128
+}
+
 func NewAssert(keyword t.ID, condition *Expr, reason t.ID, args []*Node) *Assert {
 	return &Assert{
 		kind:  KAssert,
@@ -894,19 +916,20 @@
 //  - List2: <Statement> body
 type Func Node
 
-func (n *Func) AsNode() *Node    { return (*Node)(n) }
-func (n *Func) Choosy() bool     { return n.flags&FlagsChoosy != 0 }
-func (n *Func) Effect() Effect   { return Effect(n.flags) }
-func (n *Func) Public() bool     { return n.flags&FlagsPublic != 0 }
-func (n *Func) Filename() string { return n.filename }
-func (n *Func) Line() uint32     { return n.line }
-func (n *Func) QQID() t.QQID     { return t.QQID{n.id1, n.id2, n.id0} }
-func (n *Func) Receiver() t.QID  { return t.QID{n.id1, n.id2} }
-func (n *Func) FuncName() t.ID   { return n.id0 }
-func (n *Func) In() *Struct      { return n.lhs.AsStruct() }
-func (n *Func) Out() *TypeExpr   { return n.rhs.AsTypeExpr() }
-func (n *Func) Asserts() []*Node { return n.list1 }
-func (n *Func) Body() []*Node    { return n.list2 }
+func (n *Func) AsNode() *Node          { return (*Node)(n) }
+func (n *Func) Choosy() bool           { return n.flags&FlagsChoosy != 0 }
+func (n *Func) Effect() Effect         { return Effect(n.flags) }
+func (n *Func) HasChooseCPUArch() bool { return n.flags&FlagsHasChooseCPUArch != 0 } // true when the func carries a "choose cpu_arch >= etc" assertion
+func (n *Func) Public() bool           { return n.flags&FlagsPublic != 0 }
+func (n *Func) Filename() string       { return n.filename }
+func (n *Func) Line() uint32           { return n.line }
+func (n *Func) QQID() t.QQID           { return t.QQID{n.id1, n.id2, n.id0} }
+func (n *Func) Receiver() t.QID        { return t.QID{n.id1, n.id2} }
+func (n *Func) FuncName() t.ID         { return n.id0 }
+func (n *Func) In() *Struct            { return n.lhs.AsStruct() }
+func (n *Func) Out() *TypeExpr         { return n.rhs.AsTypeExpr() }
+func (n *Func) Asserts() []*Node       { return n.list1 }
+func (n *Func) Body() []*Node          { return n.list2 }
 
 func (n *Func) BodyEndsWithReturn() bool {
 	if len(n.list2) == 0 {
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index 04aee64..d9a27bc 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -370,6 +370,18 @@
 	return false
 }
 
+// bcheckFuncAssert bounds-checks a function-level assertion. Only "choose
+// cpu_arch" is supported: it and both operands are given bounds{zero, one}.
+func (q *checker) bcheckFuncAssert(n *a.Assert) error {
+	if !n.IsChooseCPUArch() {
+		return fmt.Errorf("check: function assertions are not supported yet")
+	}
+	zeroOrOne := bounds{zero, one}
+	n.Condition().SetMBounds(zeroOrOne)
+	n.Condition().LHS().AsExpr().SetMBounds(zeroOrOne)
+	n.Condition().RHS().AsExpr().SetMBounds(zeroOrOne)
+	return nil
+}
+
 func (q *checker) bcheckAssert(n *a.Assert) error {
 	if err := n.DropExprCachedMBounds(); err != nil {
 		return err
diff --git a/lang/check/check.go b/lang/check/check.go
index 9a40cb7..94be556 100644
--- a/lang/check/check.go
+++ b/lang/check/check.go
@@ -624,7 +624,11 @@
 		tm: c.tm,
 	}
 	for _, o := range n.Asserts() {
-		if err := q.tcheckAssert(o.AsAssert()); err != nil {
+		setPlaceholderMBoundsMType(o)
+		if err := q.tcheckFuncAssert(o.AsAssert()); err != nil {
+			return err
+		}
+		if err := q.bcheckFuncAssert(o.AsAssert()); err != nil {
 			return err
 		}
 	}
diff --git a/lang/check/type.go b/lang/check/type.go
index ae9f276..c6e8229 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -206,6 +206,17 @@
 	return nil
 }
 
+// tcheckFuncAssert types a "choose cpu_arch" condition as bool with u32 operands; any other assertion is an error.
+func (q *checker) tcheckFuncAssert(n *a.Assert) error {
+	if !n.IsChooseCPUArch() {
+		return fmt.Errorf("check: function assertions are not supported yet")
+	}
+	n.Condition().SetMType(typeExprBool)
+	n.Condition().LHS().AsExpr().SetMType(typeExprU32)
+	n.Condition().RHS().AsExpr().SetMType(typeExprU32)
+	return nil
+}
+
 func (q *checker) tcheckAssert(n *a.Assert) error {
 	cond := n.Condition()
 	if err := q.tcheckExpr(cond, 0); err != nil {
@@ -531,6 +542,10 @@
 		return fmt.Errorf("check: %q has effect %q but %q has effect %q",
 			n.Str(q.tm), ne, f.QQID().Str(q.tm), fe)
 	}
+	if f.HasChooseCPUArch() {
+		return fmt.Errorf(`check: cannot call cpu_arch function %q directly, only via "choose"`,
+			f.QQID().Str(q.tm))
+	}
 
 	genericType1 := (*a.TypeExpr)(nil)
 	genericType2 := (*a.TypeExpr)(nil)
diff --git a/lang/parse/parse.go b/lang/parse/parse.go
index 303d878..cf163ce 100644
--- a/lang/parse/parse.go
+++ b/lang/parse/parse.go
@@ -249,9 +249,20 @@
 				if err != nil {
 					return nil, err
 				}
-				if err := p.assertsSorted(asserts); err != nil {
+				if err := p.assertsSorted(asserts, true); err != nil {
 					return nil, err
 				}
+				for _, o := range asserts {
+					o := o.AsAssert()
+					if o.Keyword() != t.IDChoose {
+						continue
+					} else if o.IsChooseCPUArch() {
+						flags |= a.FlagsHasChooseCPUArch
+					} else {
+						return nil, fmt.Errorf(`parse: invalid "choose" condition at %s:%d`,
+							p.filename, p.line())
+					}
+				}
 			}
 
 			p.allowVar = true
@@ -266,6 +277,17 @@
 				return nil, fmt.Errorf(`parse: expected (implicit) ";", got %q at %s:%d`, got, p.filename, p.line())
 			}
 			p.src = p.src[1:]
+
+			if (flags & a.FlagsHasChooseCPUArch) != 0 {
+				if (flags & a.FlagsPublic) != 0 {
+					return nil, fmt.Errorf(`parse: cpu_arch function cannot be public at %s:%d`,
+						p.filename, p.line())
+				}
+				if (flags & a.FlagsChoosy) != 0 {
+					return nil, fmt.Errorf(`parse: cpu_arch function cannot be choosy at %s:%d`,
+						p.filename, p.line())
+				}
+			}
 			p.funcEffect = 0
 			in := a.NewStruct(0, p.filename, line, t.IDArgs, nil, argFields)
 			return a.NewFunc(flags, p.filename, line, id0, id1, in, out, asserts, body).AsNode(), nil
@@ -638,17 +660,26 @@
 	return block, nil
 }
 
-func (p *parser) assertsSorted(asserts []*a.Node) error {
-	seenInv, seenPost := false, false
-	for _, a := range asserts {
-		switch a.AsAssert().Keyword() {
+func (p *parser) assertsSorted(asserts []*a.Node, allowChoose bool) error {
+	seenPre, seenInv, seenPost := false, false, false
+	for _, o := range asserts {
+		switch o.AsAssert().Keyword() {
 		case t.IDAssert:
 			return fmt.Errorf(`parse: assertion chain cannot contain "assert", `+
 				`only "pre", "inv" and "post" at %s:%d`, p.filename, p.line())
+		case t.IDChoose:
+			if !allowChoose {
+				return fmt.Errorf(`parse: invalid "choose" at %s:%d`, p.filename, p.line())
+			}
+			if seenPre || seenPost || seenInv {
+				break
+			}
+			continue
 		case t.IDPre:
 			if seenPost || seenInv {
 				break
 			}
+			seenPre = true
 			continue
 		case t.IDInv:
 			if seenPost {
@@ -660,7 +691,7 @@
 			seenPost = true
 			continue
 		}
-		return fmt.Errorf(`parse: assertion chain not in "pre", "inv", "post" order at %s:%d`,
+		return fmt.Errorf(`parse: assertion chain not in "choose", "pre", "inv", "post" order at %s:%d`,
 			p.filename, p.line())
 	}
 	return nil
@@ -668,7 +699,7 @@
 
 func (p *parser) parseAssertNode() (*a.Node, error) {
 	switch x := p.peek1(); x {
-	case t.IDAssert, t.IDPre, t.IDInv, t.IDPost:
+	case t.IDAssert, t.IDChoose, t.IDPre, t.IDInv, t.IDPost:
 		p.src = p.src[1:]
 		condition, err := p.parseExpr()
 		if err != nil {
@@ -753,7 +784,7 @@
 	p.allowVar = false
 
 	switch x {
-	case t.IDAssert, t.IDPre, t.IDPost:
+	case t.IDAssert:
 		return p.parseAssertNode()
 
 	case t.IDBreak, t.IDContinue:
@@ -1006,7 +1037,7 @@
 		if asserts, err = p.parseList(t.IDOpenDoubleCurly, (*parser).parseAssertNode); err != nil {
 			return nil, err
 		}
-		if err := p.assertsSorted(asserts); err != nil {
+		if err := p.assertsSorted(asserts, false); err != nil {
 			return nil, err
 		}
 	}
diff --git a/lang/token/list.go b/lang/token/list.go
index ae382b6..a1fe302 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -605,12 +605,13 @@
 	// -------- 0x200 block.
 
 	IDAdvance    = ID(0x200)
-	IDInitialize = ID(0x201)
-	IDLength     = ID(0x202)
-	IDReset      = ID(0x203)
-	IDSet        = ID(0x204)
-	IDUnroll     = ID(0x205)
-	IDUpdate     = ID(0x206)
+	IDCPUArch    = ID(0x201)
+	IDInitialize = ID(0x202)
+	IDLength     = ID(0x203)
+	IDReset      = ID(0x204)
+	IDSet        = ID(0x205)
+	IDUnroll     = ID(0x206)
+	IDUpdate     = ID(0x207)
 
 	// TODO: range/rect methods like intersection and contains?
 
@@ -636,6 +637,15 @@
 
 	IDLimitedSwizzleU32InterleavedFromReader = ID(0x280)
 	IDSwizzleInterleavedFromReader           = ID(0x281)
+
+	// -------- 0x300 block.
+
+	// 0x30? are reserved for NEON.
+
+	IDSSE128  = ID(0x310)
+	IDSSE128I = ID(0x311)
+
+	// 0x32? are reserved for AVX256.
 )
 
 var builtInsByID = [nBuiltInIDs]string{
@@ -969,6 +979,7 @@
 	// -------- 0x200 block.
 
 	IDAdvance:    "advance",
+	IDCPUArch:    "cpu_arch",
 	IDInitialize: "initialize",
 	IDLength:     "length",
 	IDReset:      "reset",
@@ -998,6 +1009,11 @@
 
 	IDLimitedSwizzleU32InterleavedFromReader: "limited_swizzle_u32_interleaved_from_reader",
 	IDSwizzleInterleavedFromReader:           "swizzle_interleaved_from_reader",
+
+	// -------- 0x300 block.
+
+	IDSSE128:  "sse128",
+	IDSSE128I: "sse128_i",
 }
 
 var builtInsByName = map[string]ID{}
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 77dbca4..58cd01f 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -141,7 +141,22 @@
   return ret;
 #else
   return 0;
-#endif  // defined( WUFFS_BASE__CPU_ARCH__X86_64)
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+}
+
+static inline bool  //
+wuffs_base__cpu_arch__have_sse128() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  unsigned int eax1 = 0;
+  unsigned int ebx1 = 0;
+  unsigned int ecx1 = 0;
+  unsigned int edx1 = 0;
+  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {
+    const unsigned int sse128_ecx1 = bit_SSE4_2 | bit_POPCNT;
+    return (ecx1 & sse128_ecx1) == sse128_ecx1;
+  }
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  return false;
 }
 
 // ---------------- Fundamentals
@@ -16256,8 +16271,7 @@
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
     case WUFFS_BASE__PIXEL_FORMAT__RGBX:
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-      if (wuffs_base__cpu_arch__x86_64__capabilities() &
-          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+      if (wuffs_base__cpu_arch__have_sse128()) {
         return wuffs_base__pixel_swizzler__xxxx__y__sse128;
       }
 #endif
@@ -16562,8 +16576,7 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
     case WUFFS_BASE__PIXEL_FORMAT__BGRX:
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-      if (wuffs_base__cpu_arch__x86_64__capabilities() &
-          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+      if (wuffs_base__cpu_arch__have_sse128()) {
         return wuffs_base__pixel_swizzler__bgrw__rgb__sse128;
       }
 #endif
@@ -16610,8 +16623,7 @@
       switch (blend) {
         case WUFFS_BASE__PIXEL_BLEND__SRC:
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-          if (wuffs_base__cpu_arch__x86_64__capabilities() &
-              WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+          if (wuffs_base__cpu_arch__have_sse128()) {
             return wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128;
           }
 #endif
@@ -30198,6 +30210,13 @@
     wuffs_base__slice_u8 a_curr,
     wuffs_base__slice_u8 a_prev);
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_1_distance_4_sse128(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr);
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
 static wuffs_base__empty_struct
 wuffs_png__decoder__choose_filter_implementations(
     wuffs_png__decoder* self);
@@ -30942,6 +30961,43 @@
   return wuffs_base__make_empty_struct();
 }
 
+// -------- func png.decoder.filter_1_distance_4_sse128
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_1_distance_4_sse128(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr) {
+  wuffs_base__slice_u8 v_c = {0};
+  uint8_t v_fa0 = 0;
+  uint8_t v_fa1 = 0;
+  uint8_t v_fa2 = 0;
+  uint8_t v_fa3 = 0;
+
+  {
+    wuffs_base__slice_u8 i_slice_c = a_curr;
+    v_c = i_slice_c;
+    v_c.len = 4;
+    uint8_t* i_end0_c = i_slice_c.ptr + ((i_slice_c.len / 4) * 4);
+    while (v_c.ptr < i_end0_c) {
+      v_fa0 += v_c.ptr[0];
+      v_c.ptr[0] = v_fa0;
+      v_fa1 += v_c.ptr[1];
+      v_c.ptr[1] = v_fa1;
+      v_fa2 += v_c.ptr[2];
+      v_c.ptr[2] = v_fa2;
+      v_fa3 += v_c.ptr[3];
+      v_c.ptr[3] = v_fa3;
+      v_c.ptr += 4;
+    }
+  }
+  return wuffs_base__make_empty_struct();
+}
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
 // -------- func png.decoder.set_quirk_enabled
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__empty_struct
@@ -31425,13 +31481,22 @@
 wuffs_png__decoder__choose_filter_implementations(
     wuffs_png__decoder* self) {
   if (self->private_impl.f_filter_distance == 3) {
-    self->private_impl.choosy_filter_1 = &wuffs_png__decoder__filter_1_distance_3_fallback;
-    self->private_impl.choosy_filter_3 = &wuffs_png__decoder__filter_3_distance_3_fallback;
-    self->private_impl.choosy_filter_4 = &wuffs_png__decoder__filter_4_distance_3_fallback;
+    self->private_impl.choosy_filter_1 = (
+        &wuffs_png__decoder__filter_1_distance_3_fallback);
+    self->private_impl.choosy_filter_3 = (
+        &wuffs_png__decoder__filter_3_distance_3_fallback);
+    self->private_impl.choosy_filter_4 = (
+        &wuffs_png__decoder__filter_4_distance_3_fallback);
   } else if (self->private_impl.f_filter_distance == 4) {
-    self->private_impl.choosy_filter_1 = &wuffs_png__decoder__filter_1_distance_4_fallback;
-    self->private_impl.choosy_filter_3 = &wuffs_png__decoder__filter_3_distance_4_fallback;
-    self->private_impl.choosy_filter_4 = &wuffs_png__decoder__filter_4_distance_4_fallback;
+    self->private_impl.choosy_filter_1 = (
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+        wuffs_base__cpu_arch__have_sse128() ? &wuffs_png__decoder__filter_1_distance_4_sse128 :
+#endif
+        &wuffs_png__decoder__filter_1_distance_4_fallback);
+    self->private_impl.choosy_filter_3 = (
+        &wuffs_png__decoder__filter_3_distance_4_fallback);
+    self->private_impl.choosy_filter_4 = (
+        &wuffs_png__decoder__filter_4_distance_4_fallback);
   }
   return wuffs_base__make_empty_struct();
 }
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
new file mode 100644
index 0000000..aaabb58
--- /dev/null
+++ b/std/png/decode_filter_sse128.wuffs
@@ -0,0 +1,38 @@
+// Copyright 2021 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// --------
+
+// Filter 1: Sub.
+
+pri func decoder.filter_1_distance_4_sse128!(curr: slice base.u8),
+	choose cpu_arch >= sse128,
+{
+	var c   : slice base.u8 // Current 4-byte window into args.curr.
+	var fa0 : base.u8 // fa0..fa3: running sums, one per byte position within the window.
+	var fa1 : base.u8
+	var fa2 : base.u8
+	var fa3 : base.u8
+
+	iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+		fa0 ~mod+= c[0] // Modular (wrapping) add of the stored byte onto the running sum.
+		c[0] = fa0 // Replace the byte in place with that running sum.
+		fa1 ~mod+= c[1]
+		c[1] = fa1
+		fa2 ~mod+= c[2]
+		c[2] = fa2
+		fa3 ~mod+= c[3]
+		c[3] = fa3
+	}
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index b814892..45c8746 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -253,7 +253,9 @@
 		choose filter_3 = [filter_3_distance_3_fallback]
 		choose filter_4 = [filter_4_distance_3_fallback]
 	} else if this.filter_distance == 4 {
-		choose filter_1 = [filter_1_distance_4_fallback]
+		choose filter_1 = [
+			filter_1_distance_4_sse128,
+			filter_1_distance_4_fallback]
 		choose filter_3 = [filter_3_distance_4_fallback]
 		choose filter_4 = [filter_4_distance_4_fallback]
 	}