Add SIMD

wuffs_pixel_swizzler_bgra_premul_y_src/clang9  1.94GB/s ± 1%  12.08GB/s ± 0%  +521.63%  (p=0.008 n=5+5)
wuffs_pixel_swizzler_bgra_premul_y_src/gcc10   2.92GB/s ± 0%  16.00GB/s ± 0%  +448.86%  (p=0.008 n=5+5)
diff --git a/doc/changelog.md b/doc/changelog.md
index ed191b2..35364ab 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -26,6 +26,7 @@
 - Added `std/png`.
 - Added `std/wbmp`.
 - Added `tell_me_more?` mechanism.
+- Added SIMD.
 - Added alloc functions.
 - Added colons to const syntax.
 - Added double-curly blocks.
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index d5e1b5e..baf6c3a 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -14,7 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// ---------------- Fundamentals
+// ---------------- Version
 
 // WUFFS_VERSION is the major.minor.patch version, as per https://semver.org/,
 // as a uint64_t. The major number is the high 32 bits. The minor number is the
@@ -40,17 +40,77 @@
 #define WUFFS_VERSION_BUILD_METADATA_COMMIT_DATE 0
 #define WUFFS_VERSION_STRING "0.0.0+0.00000000"
 
+// ---------------- Configuration
+
+// Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU
+// architecture, such as SSE SIMD for the x86 CPU family.
+#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)
+// No-op.
+#else
+#if defined(__GNUC__) && defined(__x86_64__)
+#include <cpuid.h>
+#include <x86intrin.h>
+#define WUFFS_BASE__CPU_ARCH__X86_64
+#endif  // defined(__GNUC__) && defined(__x86_64__)
+#endif  // defined(WUFFS_CONFIG__AVOID_CPU_ARCH)
+
+// --------
+
 // Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have
 // static storage. The motivation is discussed in the "ALLOW STATIC
 // IMPLEMENTATION" section of
 // https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt
-#ifdef WUFFS_CONFIG__STATIC_FUNCTIONS
+#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)
 #define WUFFS_BASE__MAYBE_STATIC static
 #else
 #define WUFFS_BASE__MAYBE_STATIC
-#endif
+#endif  // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)
 
-// --------
+// ---------------- CPU Architecture
+
+// WUFFS_BASE__CPU_ARCH__X86_64__ETC are bits returned by
+// wuffs_base__cpu_arch__x86_64__capabilities.
+// - "SSE128" means all of SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and POPCNT.
+// - "AVX256" means all of AVX and AVX2.
+// - "AVX512ETC" is reserved, pending need. Note that AVX-512 consists of
+//   multiple extensions that may be implemented independently.
+#define WUFFS_BASE__CPU_ARCH__X86_64__SSE128 0x01
+#define WUFFS_BASE__CPU_ARCH__X86_64__AVX256 0x02
+
+static inline uint32_t  //
+wuffs_base__cpu_arch__x86_64__capabilities() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  uint32_t ret = 0;
+
+  unsigned int eax1 = 0;
+  unsigned int ebx1 = 0;
+  unsigned int ecx1 = 0;
+  unsigned int edx1 = 0;
+  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {
+    const unsigned int sse128_ecx1 = bit_SSE4_2 | bit_POPCNT;
+    if ((ecx1 & sse128_ecx1) == sse128_ecx1) {
+      ret |= WUFFS_BASE__CPU_ARCH__X86_64__SSE128;
+    }
+  }
+
+  unsigned int eax7 = 0;
+  unsigned int ebx7 = 0;
+  unsigned int ecx7 = 0;
+  unsigned int edx7 = 0;
+  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+    const unsigned int avx256_ebx7 = bit_AVX2;
+    if ((ebx7 & avx256_ebx7) == avx256_ebx7) {
+      ret |= WUFFS_BASE__CPU_ARCH__X86_64__AVX256;
+    }
+  }
+
+  return ret;
+#else
+  return 0;
+#endif  // defined( WUFFS_BASE__CPU_ARCH__X86_64)
+}
+
+// ---------------- Fundamentals
 
 // Wuffs assumes that:
 //  - converting a uint32_t to a size_t will never overflow.
diff --git a/internal/cgen/base/pixconv-submodule.c b/internal/cgen/base/pixconv-submodule.c
index d4a252b..8f6e80c 100644
--- a/internal/cgen/base/pixconv-submodule.c
+++ b/internal/cgen/base/pixconv-submodule.c
@@ -2017,6 +2017,57 @@
   return len;
 }
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static uint64_t  //
+wuffs_base__pixel_swizzler__xxxx__y__sse128(uint8_t* dst_ptr,
+                                            size_t dst_len,
+                                            uint8_t* dst_palette_ptr,
+                                            size_t dst_palette_len,
+                                            const uint8_t* src_ptr,
+                                            size_t src_len) {
+  size_t dst_len4 = dst_len / 4;
+  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03,  //
+                                 0x02, 0x02, 0x02, 0x02,  //
+                                 0x01, 0x01, 0x01, 0x01,  //
+                                 0x00, 0x00, 0x00, 0x00);
+  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00);
+
+  while (n >= 4) {
+    __m128i x;
+    x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));
+    x = _mm_shuffle_epi8(x, shuffle);
+    x = _mm_or_si128(x, or);
+    _mm_storeu_si128((void*)d, x);
+
+    s += 4 * 1;
+    d += 4 * 4;
+    n -= 4;
+  }
+
+  while (n >= 1) {
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));
+
+    s += 1 * 1;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+#endif
+
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,
                                     size_t dst_len,
@@ -2030,8 +2081,6 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  // TODO: unroll.
-
   while (n >= 1) {
     wuffs_base__store_u32le__no_bounds_check(
         d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));
@@ -2104,6 +2153,12 @@
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
     case WUFFS_BASE__PIXEL_FORMAT__RGBX:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+      if (wuffs_base__cpu_arch__x86_64__capabilities() &
+          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+        return wuffs_base__pixel_swizzler__xxxx__y__sse128;
+      }
+#endif
       return wuffs_base__pixel_swizzler__xxxx__y;
   }
   return NULL;
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 2015d64..7caafa3 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -51,10 +51,17 @@
 	""
 
 const BaseFundamentalPublicH = "" +
-	"// ---------------- Fundamentals\n\n// WUFFS_VERSION is the major.minor.patch version, as per https://semver.org/,\n// as a uint64_t. The major number is the high 32 bits. The minor number is the\n// middle 16 bits. The patch number is the low 16 bits. The pre-release label\n// and build metadata are part of the string representation (such as\n// \"1.2.3-beta+456.20181231\") but not the uint64_t representation.\n//\n// WUFFS_VERSION_PRE_RELEASE_LABEL (such as \"\", \"beta\" or \"rc.1\") being\n// non-empty denotes a developer preview, not a release version, and has no\n// backwards or forwards compatibility guarantees.\n//\n// WUFFS_VERSION_BUILD_METADATA_XXX, if non-zero, are the number of commits and\n// the last commit date in the repository used to build this library. Within\n// each major.minor branch, the commit count should increase monotonically.\n//\n// !! Some code generation programs can override WUFFS_VERSION.\n#define WUFFS_VERSION 0\n#define WUFFS_VERSION_MAJOR 0\n#define WUFFS_VERSION_MINOR 0\n#define WUFFS_VERSION_PATCH " +
-	"0\n#define WUFFS_VERSION_PRE_RELEASE_LABEL \"work.in.progress\"\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_COUNT 0\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_DATE 0\n#define WUFFS_VERSION_STRING \"0.0.0+0.00000000\"\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#ifdef WUFFS_CONFIG__STATIC_FUNCTIONS\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif\n\n" +
+	"// ---------------- Version\n\n// WUFFS_VERSION is the major.minor.patch version, as per https://semver.org/,\n// as a uint64_t. The major number is the high 32 bits. The minor number is the\n// middle 16 bits. The patch number is the low 16 bits. The pre-release label\n// and build metadata are part of the string representation (such as\n// \"1.2.3-beta+456.20181231\") but not the uint64_t representation.\n//\n// WUFFS_VERSION_PRE_RELEASE_LABEL (such as \"\", \"beta\" or \"rc.1\") being\n// non-empty denotes a developer preview, not a release version, and has no\n// backwards or forwards compatibility guarantees.\n//\n// WUFFS_VERSION_BUILD_METADATA_XXX, if non-zero, are the number of commits and\n// the last commit date in the repository used to build this library. Within\n// each major.minor branch, the commit count should increase monotonically.\n//\n// !! Some code generation programs can override WUFFS_VERSION.\n#define WUFFS_VERSION 0\n#define WUFFS_VERSION_MAJOR 0\n#define WUFFS_VERSION_MINOR 0\n#define WUFFS_VERSION_PATCH 0\n#de" +
+	"fine WUFFS_VERSION_PRE_RELEASE_LABEL \"work.in.progress\"\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_COUNT 0\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_DATE 0\n#define WUFFS_VERSION_STRING \"0.0.0+0.00000000\"\n\n" +
 	"" +
-	"// --------\n\n// Wuffs assumes that:\n//  - converting a uint32_t to a size_t will never overflow.\n//  - converting a size_t to a uint64_t will never overflow.\n#ifdef __WORDSIZE\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n#if defined(__clang__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED_FIELD __attribute__((unused))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED_FIELD\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
+	"// ---------------- Configuration\n\n// Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU\n// architecture, such as SSE SIMD for the x86 CPU family.\n#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)\n// No-op.\n#else\n#if defined(__GNUC__) && defined(__x86_64__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(__GNUC__) && defined(__x86_64__)\n#endif  // defined(WUFFS_CONFIG__AVOID_CPU_ARCH)\n\n" +
+	"" +
+	"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif  // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
+	"" +
+	"// ---------------- CPU Architecture\n\n// WUFFS_BASE__CPU_ARCH__X86_64__ETC are bits returned by\n// wuffs_base__cpu_arch__x86_64__capabilities.\n// - \"SSE128\" means all of SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and POPCNT.\n// - \"AVX256\" means all of AVX and AVX2.\n// - \"AVX512ETC\" is reserved, pending need. Note that AVX-512 consists of\n//   multiple extensions that may be implemented independently.\n#define WUFFS_BASE__CPU_ARCH__X86_64__SSE128 0x01\n#define WUFFS_BASE__CPU_ARCH__X86_64__AVX256 0x02\n\nstatic inline uint32_t  //\nwuffs_base__cpu_arch__x86_64__capabilities() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  uint32_t ret = 0;\n\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n    const unsigned int sse128_ecx1 = bit_SSE4_2 | bit_POPCNT;\n    if ((ecx1 & sse128_ecx1) == sse128_ecx1) {\n      ret |= WUFFS_BASE__CPU_ARCH__X86_64__SSE128;\n    }\n  }\n\n  unsigned int eax7 = 0;\n  unsigned int ebx7 = 0;\n  unsigned in" +
+	"t ecx7 = 0;\n  unsigned int edx7 = 0;\n  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n    const unsigned int avx256_ebx7 = bit_AVX2;\n    if ((ebx7 & avx256_ebx7) == avx256_ebx7) {\n      ret |= WUFFS_BASE__CPU_ARCH__X86_64__AVX256;\n    }\n  }\n\n  return ret;\n#else\n  return 0;\n#endif  // defined( WUFFS_BASE__CPU_ARCH__X86_64)\n}\n\n" +
+	"" +
+	"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n//  - converting a uint32_t to a size_t will never overflow.\n//  - converting a size_t to a uint64_t will never overflow.\n#ifdef __WORDSIZE\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n#if defined(__clang__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED_FIELD __attribute__((unused))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED_FIELD\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
 	"" +
 	"// --------\n\n// Options (bitwise or'ed together) for wuffs_foo__bar__initialize functions.\n\n#define WUFFS_INITIALIZE__DEFAULT_OPTIONS ((uint32_t)0x00000000)\n\n// WUFFS_INITIALIZE__ALREADY_ZEROED means that the \"self\" receiver struct value\n// has already been set to all zeroes.\n#define WUFFS_INITIALIZE__ALREADY_ZEROED ((uint32_t)0x00000001)\n\n// WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED means that, absent\n// WUFFS_INITIALIZE__ALREADY_ZEROED, only some of the \"self\" receiver struct\n// value will be set to all zeroes. Internal buffers, which tend to be a large\n// proportion of the struct's size, will be left uninitialized. Internal means\n// that the buffer is contained by the receiver struct, as opposed to being\n// passed as a separately allocated \"work buffer\".\n//\n// For more detail, see:\n// https://github.com/google/wuffs/blob/master/doc/note/initialization.md\n#define WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED \\\n  ((uint32_t)0x00000002)\n\n" +
 	"" +
@@ -599,26 +606,27 @@
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index__src(uint8_t* dst_ptr,\n                                             size_t dst_len,\n                                             uint8_t* dst_palette_ptr,\n                                             size_t dst_palette_len,\n                                             const uint8_t* src_ptr,\n                                             size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         " +
 	"dst_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n" +
 	"  }\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u32le__no_bounds_check(d + (1 * 4), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u32le__no_bounds_check(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_" +
-	"palette_ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u32le__no_bounds_check(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,\n                                    size_t dst_len,\n                                    uint8_t* dst_palette_ptr,\n                                    size_t dst_palette_len,\n                                    const uint8_t* src_ptr,\n                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < " +
-	"src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"palette_ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u32le__no_bounds_check(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y__sse128(uint8_t* dst_ptr,\n                                            size_t dst_len,\n                                            uint8_t* dst_palette_ptr,\n                                            size_t dst_palette_len,\n                                     " +
+	"       const uint8_t* src_ptr,\n                                            size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03,  //\n                                 0x02, 0x02, 0x02, 0x02,  //\n                                 0x01, 0x01, 0x01, 0x01,  //\n                                 0x00, 0x00, 0x00, 0x00);\n  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or);\n    _mm_storeu_si128((void*)d, x);\n\n    s += 4 * 1;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le_" +
+	"_no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,\n                                    size_t dst_len,\n                                    uint8_t* dst_palette_ptr,\n                                    size_t dst_palette_len,\n                                    const uint8_t* src_ptr,\n                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__transparent_black_src(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    uint64_t num_pixels,\n    uint32_t dst_pixfmt_bytes_per_pixel) {\n  uint64_t n = ((uint64_t)dst_len) / dst_pixfmt_bytes_per_pixel;\n  if (n > num_pixels) {\n    n = num_pixels;\n  }\n  memset(dst_ptr, 0, ((size_t)(n * dst_pixfmt_bytes_per_pixel)));\n  return n;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__transparent_black_src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    uint64_t num_pixels,\n    uint32_t dst_pixfmt_bytes_per_pixel) {\n  uint64_t n = ((uint64_t)dst_len) / dst_pixfmt_bytes_per_pixel;\n  if (n > num_pixels) {\n    n = num_pixels;\n  }\n  return n;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,\n                                       wuffs_base__pixel_format dst_pixfmt,\n                                       wuffs_base__slice_u8 dst_palette,\n                                       wuffs_base__slice_u8 src_palette,\n                                       wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      return wuffs_base__pixel_swizzler__copy_1_1;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__pixel_swizzler__xxx__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BA" +
-	"SE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base" +
-	"__pixel_swizzler__squash_align4_bgr_565_888(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREM" +
-	"UL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_" +
-	"NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend ble" +
-	"nd) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__bgrw__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend b" +
-	"lend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n  " +
-	"    return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n " +
-	"   wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_bas" +
-	"e__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgrx(wuffs_base__pixel_swizzler* p,\n                                          wuffs_" +
-	"base__pixel_format dst_pixfmt,\n                                          wuffs_base__slice_u8 dst_palette,\n                                          wuffs_base__slice_u8 src_palette,\n                                          wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__xxx__xxxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__pixel_swizzler__bgrw__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__copy_4_4;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      " +
-	"break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgb(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__swap_rgb_bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__bgrw__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE_" +
-	"_PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgba_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src;\n        case WUFFS_BASE_" +
-	"_PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__swap_rgbx_bgrx;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONP" +
-	"REMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
+	"SE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n      if (wuffs_base__cpu_arch__x86_64__capabilities() &\n          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {\n        return wuffs_base__pixel_swizzler__xxxx__y__sse128;\n      }\n#endif\n      return wuffs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NUL" +
+	"L;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_align4_bgr_565_888(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PI" +
+	"XEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel" +
+	"_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(\n              dst_palette.ptr, dst_palette.len, NULL, 0, src_palette.ptr,\n              src_palette.len) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pi" +
+	"xfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__bgrw__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base" +
+	"__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_" +
+	"BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler" +
+	"__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BG" +
+	"RA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMA" +
+	"T__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgrx(wuffs_base__pixel_swizzler* p,\n                                          wuffs_base__pixel_format dst_pixfmt,\n                                          wuffs_base__slice_u8 dst_palette,\n                                          wuffs_base__slice_u8 src_palette,\n                                          wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__xxx__xxxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__pixel_swizzler__bgrw__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__copy_4_4;\n\n    case WUFFS_BASE__PIXE" +
+	"L_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgb(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__swap_rgb_bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FO" +
+	"RMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__bgrw__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgba_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src_over;\n     " +
+	" }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__swap_rgbx_bgrx;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src_over;\n      " +
+	"}\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
 	"" +
 	"// --------\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_pixfmt,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_pixfmt,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  if (!p) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  p->private_impl.func = NULL;\n  p->private_impl.transparent_black_func = NULL;\n  p->private_impl.dst_pixfmt_bytes_per_pixel = 0;\n  p->private_impl.src_pixfmt_bytes_per_pixel = 0;\n\n  wuffs_base__pixel_swizzler__func func = NULL;\n  wuffs_base__pixel_swizzler__transparent_black_func transparent_black_func =\n      NULL;\n\n  uint32_t dst_pixfmt_bits_per_pixel =\n      wuffs_base__pixel_format__bits_per_pixel(&dst_pixfmt);\n  if ((dst_pixfmt_bits_per_pixel == " +
 	"0) ||\n      ((dst_pixfmt_bits_per_pixel & 7) != 0)) {\n    return wuffs_base__make_status(\n        wuffs_base__error__unsupported_pixel_swizzler_option);\n  }\n\n  uint32_t src_pixfmt_bits_per_pixel =\n      wuffs_base__pixel_format__bits_per_pixel(&src_pixfmt);\n  if ((src_pixfmt_bits_per_pixel == 0) ||\n      ((src_pixfmt_bits_per_pixel & 7) != 0)) {\n    return wuffs_base__make_status(\n        wuffs_base__error__unsupported_pixel_swizzler_option);\n  }\n\n  // TODO: support many more formats.\n\n  switch (blend) {\n    case WUFFS_BASE__PIXEL_BLEND__SRC:\n      transparent_black_func =\n          wuffs_base__pixel_swizzler__transparent_black_src;\n      break;\n\n    case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n      transparent_black_func =\n          wuffs_base__pixel_swizzler__transparent_black_src_over;\n      break;\n  }\n\n  switch (src_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_pixfmt, dst_palette,\n                                                    src_palette" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 561509a..2a3bd4f 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -48,7 +48,7 @@
 extern "C" {
 #endif
 
-// ---------------- Fundamentals
+// ---------------- Version
 
 // WUFFS_VERSION is the major.minor.patch version, as per https://semver.org/,
 // as a uint64_t. The major number is the high 32 bits. The minor number is the
@@ -74,17 +74,77 @@
 #define WUFFS_VERSION_BUILD_METADATA_COMMIT_DATE 0
 #define WUFFS_VERSION_STRING "0.0.0+0.00000000"
 
+// ---------------- Configuration
+
+// Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU
+// architecture, such as SSE SIMD for the x86 CPU family.
+#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)
+// No-op.
+#else
+#if defined(__GNUC__) && defined(__x86_64__)
+#include <cpuid.h>
+#include <x86intrin.h>
+#define WUFFS_BASE__CPU_ARCH__X86_64
+#endif  // defined(__GNUC__) && defined(__x86_64__)
+#endif  // defined(WUFFS_CONFIG__AVOID_CPU_ARCH)
+
+// --------
+
 // Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have
 // static storage. The motivation is discussed in the "ALLOW STATIC
 // IMPLEMENTATION" section of
 // https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt
-#ifdef WUFFS_CONFIG__STATIC_FUNCTIONS
+#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)
 #define WUFFS_BASE__MAYBE_STATIC static
 #else
 #define WUFFS_BASE__MAYBE_STATIC
-#endif
+#endif  // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)
 
-// --------
+// ---------------- CPU Architecture
+
+// WUFFS_BASE__CPU_ARCH__X86_64__ETC are bits returned by
+// wuffs_base__cpu_arch__x86_64__capabilities.
+// - "SSE128" means all of SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and POPCNT.
+// - "AVX256" means all of AVX and AVX2.
+// - "AVX512ETC" is reserved, pending need. Note that AVX-512 consists of
+//   multiple extensions that may be implemented independently.
+#define WUFFS_BASE__CPU_ARCH__X86_64__SSE128 0x01
+#define WUFFS_BASE__CPU_ARCH__X86_64__AVX256 0x02
+
+static inline uint32_t  //
+wuffs_base__cpu_arch__x86_64__capabilities() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  uint32_t ret = 0;
+
+  unsigned int eax1 = 0;
+  unsigned int ebx1 = 0;
+  unsigned int ecx1 = 0;
+  unsigned int edx1 = 0;
+  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {
+    const unsigned int sse128_ecx1 = bit_SSE4_2 | bit_POPCNT;
+    if ((ecx1 & sse128_ecx1) == sse128_ecx1) {
+      ret |= WUFFS_BASE__CPU_ARCH__X86_64__SSE128;
+    }
+  }
+
+  unsigned int eax7 = 0;
+  unsigned int ebx7 = 0;
+  unsigned int ecx7 = 0;
+  unsigned int edx7 = 0;
+  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+    const unsigned int avx256_ebx7 = bit_AVX2;
+    if ((ebx7 & avx256_ebx7) == avx256_ebx7) {
+      ret |= WUFFS_BASE__CPU_ARCH__X86_64__AVX256;
+    }
+  }
+
+  return ret;
+#else
+  return 0;
+#endif  // defined( WUFFS_BASE__CPU_ARCH__X86_64)
+}
+
+// ---------------- Fundamentals
 
 // Wuffs assumes that:
 //  - converting a uint32_t to a size_t will never overflow.
@@ -15952,6 +16012,57 @@
   return len;
 }
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static uint64_t  //
+wuffs_base__pixel_swizzler__xxxx__y__sse128(uint8_t* dst_ptr,
+                                            size_t dst_len,
+                                            uint8_t* dst_palette_ptr,
+                                            size_t dst_palette_len,
+                                            const uint8_t* src_ptr,
+                                            size_t src_len) {
+  size_t dst_len4 = dst_len / 4;
+  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03,  //
+                                 0x02, 0x02, 0x02, 0x02,  //
+                                 0x01, 0x01, 0x01, 0x01,  //
+                                 0x00, 0x00, 0x00, 0x00);
+  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00);
+
+  while (n >= 4) {
+    __m128i x;
+    x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));
+    x = _mm_shuffle_epi8(x, shuffle);
+    x = _mm_or_si128(x, or);
+    _mm_storeu_si128((void*)d, x);
+
+    s += 4 * 1;
+    d += 4 * 4;
+    n -= 4;
+  }
+
+  while (n >= 1) {
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));
+
+    s += 1 * 1;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+#endif
+
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,
                                     size_t dst_len,
@@ -15965,8 +16076,6 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  // TODO: unroll.
-
   while (n >= 1) {
     wuffs_base__store_u32le__no_bounds_check(
         d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));
@@ -16039,6 +16148,12 @@
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
     case WUFFS_BASE__PIXEL_FORMAT__RGBX:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+      if (wuffs_base__cpu_arch__x86_64__capabilities() &
+          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+        return wuffs_base__pixel_swizzler__xxxx__y__sse128;
+      }
+#endif
       return wuffs_base__pixel_swizzler__xxxx__y;
   }
   return NULL;