SIMD-optimize some pixel swizzler loops

wuffs_pixel_swizzler_bgra_non_premul_rgba_nonpremul_src/clang9   1.75GB/s ± 0%  13.10GB/s ± 0%  +650.08%  (p=0.008 n=5+5)
wuffs_pixel_swizzler_bgra_premul_rgb_src/clang9                  1.85GB/s ± 0%  12.24GB/s ± 0%  +563.16%  (p=0.008 n=5+5)

wuffs_pixel_swizzler_bgra_non_premul_rgba_nonpremul_src/gcc10    7.41GB/s ± 0%  11.90GB/s ± 0%   +60.73%  (p=0.008 n=5+5)
wuffs_pixel_swizzler_bgra_premul_rgb_src/gcc10                   1.85GB/s ± 0%  10.66GB/s ± 0%  +474.51%  (p=0.016 n=5+4)
diff --git a/internal/cgen/base/pixconv-submodule.c b/internal/cgen/base/pixconv-submodule.c
index 8f6e80c..f952208 100644
--- a/internal/cgen/base/pixconv-submodule.c
+++ b/internal/cgen/base/pixconv-submodule.c
@@ -530,6 +530,54 @@
   return len;
 }
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static uint64_t  //
+wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128(uint8_t* dst_ptr,
+                                                   size_t dst_len,
+                                                   uint8_t* dst_palette_ptr,
+                                                   size_t dst_palette_len,
+                                                   const uint8_t* src_ptr,
+                                                   size_t src_len) {
+  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E,  //
+                                 0x0B, 0x08, 0x09, 0x0A,  //
+                                 0x07, 0x04, 0x05, 0x06,  //
+                                 0x03, 0x00, 0x01, 0x02);
+
+  while (n >= 4) {
+    __m128i x;
+    x = _mm_loadu_si128((const void*)s);
+    x = _mm_shuffle_epi8(x, shuffle);
+    _mm_storeu_si128((void*)d, x);
+
+    s += 4 * 4;
+    d += 4 * 4;
+    n -= 4;
+  }
+
+  while (n--) {
+    uint8_t b0 = s[0];
+    uint8_t b1 = s[1];
+    uint8_t b2 = s[2];
+    uint8_t b3 = s[3];
+    d[0] = b2;
+    d[1] = b1;
+    d[2] = b0;
+    d[3] = b3;
+    s += 4;
+    d += 4;
+  }
+  return len;
+}
+#endif
+
 static uint64_t  //
 wuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,
                                            size_t dst_len,
@@ -1694,6 +1742,63 @@
   return len;
 }
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgrw__rgb__sse128(uint8_t* dst_ptr,
+                                              size_t dst_len,
+                                              uint8_t* dst_palette_ptr,
+                                              size_t dst_palette_len,
+                                              const uint8_t* src_ptr,
+                                              size_t src_len) {
+  size_t dst_len4 = dst_len / 4;
+  size_t src_len3 = src_len / 3;
+  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B,  //
+                                 0x00, 0x06, 0x07, 0x08,  //
+                                 0x00, 0x03, 0x04, 0x05,  //
+                                 0x00, 0x00, 0x01, 0x02);
+  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00);
+
+  while (n >= 6) {
+    __m128i x;
+    x = _mm_loadu_si128((const void*)s);
+    x = _mm_shuffle_epi8(x, shuffle);
+    x = _mm_or_si128(x, or);
+    _mm_storeu_si128((void*)d, x);
+
+    s += 4 * 3;
+    d += 4 * 4;
+    n -= 4;
+  }
+
+  while (n >= 1) {
+    uint8_t b0 = s[0];
+    uint8_t b1 = s[1];
+    uint8_t b2 = s[2];
+    d[0] = b2;
+    d[1] = b1;
+    d[2] = b0;
+    d[3] = 0xFF;
+
+    s += 1 * 3;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+#endif
+
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,
                                       size_t dst_len,
@@ -1708,8 +1813,6 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  // TODO: unroll.
-
   while (n >= 1) {
     uint8_t b0 = s[0];
     uint8_t b1 = s[1];
@@ -2459,6 +2562,12 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
     case WUFFS_BASE__PIXEL_FORMAT__BGRX:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+      if (wuffs_base__cpu_arch__x86_64__capabilities() &
+          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+        return wuffs_base__pixel_swizzler__bgrw__rgb__sse128;
+      }
+#endif
       return wuffs_base__pixel_swizzler__bgrw__rgb;
 
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
@@ -2501,6 +2610,12 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
       switch (blend) {
         case WUFFS_BASE__PIXEL_BLEND__SRC:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+          if (wuffs_base__cpu_arch__x86_64__capabilities() &
+              WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+            return wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128;
+          }
+#endif
           return wuffs_base__pixel_swizzler__swap_rgbx_bgrx;
         case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
           return wuffs_base__pixel_swizzler__bgra_nonpremul__rgba_nonpremul__src_over;
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 7caafa3..386387c 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -550,8 +550,9 @@
 	" >>= 8;\n  dg >>= 8;\n  db >>= 8;\n\n  // Combine components.\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_align4_bgr_565_888(uint8_t* dst_ptr,\n                                                      size_t dst_len,\n                                                      uint8_t* dst_palette_ptr,\n                                                      size_t dst_palette_len,\n                                                      const uint8_t* src_ptr,\n                                                      size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    uint32_t alpha = argb & 0xFF000000;\n    wuffs_base__store_u32le__no_bounds_check(\n        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\nstatic " +
-	"uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgb_bgr(uint8_t* dst_ptr,\n                                         size_t dst_len,\n                                         uint8_t* dst_palette_ptr,\n                                         size_t dst_palette_len,\n                                         const uint8_t* src_ptr,\n                                         size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    s += 3;\n    d += 3;\n  }\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,\n                                           size_t dst_len,\n                                           uint8_t* dst_palette_ptr,\n                                           size_t dst_palette_len,\n                                           const uint8_" +
-	"t* src_ptr,\n                                           size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\n" +
+	"uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgb_bgr(uint8_t* dst_ptr,\n                                         size_t dst_len,\n                                         uint8_t* dst_palette_ptr,\n                                         size_t dst_palette_len,\n                                         const uint8_t* src_ptr,\n                                         size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    s += 3;\n    d += 3;\n  }\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128(uint8_t* dst_ptr,\n                                                   size_t dst_len,\n                                                   uint8_t* dst_palett" +
+	"e_ptr,\n                                                   size_t dst_palette_len,\n                                                   const uint8_t* src_ptr,\n                                                   size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E,  //\n                                 0x0B, 0x08, 0x09, 0x0A,  //\n                                 0x07, 0x04, 0x05, 0x06,  //\n                                 0x03, 0x00, 0x01, 0x02);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_loadu_si128((const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    _mm_storeu_si128((void*)d, x);\n\n    s += 4 * 4;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n#endif\n\nstati" +
+	"c uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,\n                                           size_t dst_len,\n                                           uint8_t* dst_palette_ptr,\n                                           size_t dst_palette_len,\n                                           const uint8_t* src_ptr,\n                                           size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_tight_4x8_4x16le(uint8_t* dst_ptr,\n                                                    size_t dst_len,\n                                                    uint8_t* dst_palette_ptr,\n                                                    size_t dst_palette_len,\n                                                    const uint8_t* src_ptr,\n                                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__color_u64__as__color_u32(\n                         wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));\n\n    s += 1 * 8;\n    d += 1 * 4;\n    n -= 1;\n  }\n  return len;\n}\n\n" +
 	"" +
@@ -593,8 +594,9 @@
 	"swizzler__bgra_premul__rgba_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__swap_u32_argb_abgr(\n        wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__bgr(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__load_u24le__no_bounds_check(s + (0 * 3)));\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__bgrx(uint8_t* dst_ptr,\n                                       size_t dst_len,\n                                       uint8_t* dst_palet" +
-	"te_ptr,\n                                       size_t dst_palette_len,\n                                       const uint8_t* src_ptr,\n                                       size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t ds" +
-	"t_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"te_ptr,\n                                       size_t dst_palette_len,\n                                       const uint8_t* src_ptr,\n                                       size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb__sse128(uint8_t* dst_ptr,\n                                              size_t dst_len,\n                                              uint8_t* dst_palette_ptr,\n                                              size_t dst_palet" +
+	"te_len,\n                                              const uint8_t* src_ptr,\n                                              size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B,  //\n                                 0x00, 0x06, 0x07, 0x08,  //\n                                 0x00, 0x03, 0x04, 0x05,  //\n                                 0x00, 0x00, 0x01, 0x02);\n  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00);\n\n  while (n >= 6) {\n    __m128i x;\n    x = _mm_loadu_si128((const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or);\n    _mm_storeu_si128((void*)d, x);\n\n    s += 4 * 3;\n    d += 4 * 4;\n    n -= 4" +
+	";\n  }\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(uint8_t* dst_ptr,\n                                            size_t dst_len,\n                                            uint8_t* dst_palette_ptr,\n                                            size_t dst_palette_len,\n                                            const uint8_t* src_ptr,\n                                            size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst_len / 3;\n  size_t len = (dst_len3 < src_len) ? dst_len3 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with\n  // \">=\", the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store\n  // will overwrite that with the correct byte. There is always another\n  // store, whether a 4-byte store in this loop" +
 	" or a 1-byte store in the\n  // next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((siz" +
@@ -624,9 +626,9 @@
 	"RA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMA" +
 	"T__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgrx(wuffs_base__pixel_swizzler* p,\n                                          wuffs_base__pixel_format dst_pixfmt,\n                                          wuffs_base__slice_u8 dst_palette,\n                                          wuffs_base__slice_u8 src_palette,\n                                          wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__xxx__xxxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__pixel_swizzler__bgrw__bgrx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__copy_4_4;\n\n    case WUFFS_BASE__PIXE" +
 	"L_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgb(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__swap_rgb_bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FO" +
-	"RMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__bgrw__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgba_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src_over;\n     " +
-	" }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__swap_rgbx_bgrx;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src_over;\n      " +
-	"}\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
+	"RMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n      if (wuffs_base__cpu_arch__x86_64__capabilities() &\n          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {\n        return wuffs_base__pixel_swizzler__bgrw__rgb__sse128;\n      }\n#endif\n      return wuffs_base__pixel_swizzler__bgrw__rgb;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__rgba_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__" +
+	"PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n          if (wuffs_base__cpu_arch__x86_64__capabilities() &\n              WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {\n            return wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128;\n          }\n#endif\n          return wuffs_base__pixel_swizzler__swap_rgbx_bgrx;\n        case WUFFS_BA" +
+	"SE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__rgba_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
 	"" +
 	"// --------\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_pixfmt,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_pixfmt,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  if (!p) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  p->private_impl.func = NULL;\n  p->private_impl.transparent_black_func = NULL;\n  p->private_impl.dst_pixfmt_bytes_per_pixel = 0;\n  p->private_impl.src_pixfmt_bytes_per_pixel = 0;\n\n  wuffs_base__pixel_swizzler__func func = NULL;\n  wuffs_base__pixel_swizzler__transparent_black_func transparent_black_func =\n      NULL;\n\n  uint32_t dst_pixfmt_bits_per_pixel =\n      wuffs_base__pixel_format__bits_per_pixel(&dst_pixfmt);\n  if ((dst_pixfmt_bits_per_pixel == " +
 	"0) ||\n      ((dst_pixfmt_bits_per_pixel & 7) != 0)) {\n    return wuffs_base__make_status(\n        wuffs_base__error__unsupported_pixel_swizzler_option);\n  }\n\n  uint32_t src_pixfmt_bits_per_pixel =\n      wuffs_base__pixel_format__bits_per_pixel(&src_pixfmt);\n  if ((src_pixfmt_bits_per_pixel == 0) ||\n      ((src_pixfmt_bits_per_pixel & 7) != 0)) {\n    return wuffs_base__make_status(\n        wuffs_base__error__unsupported_pixel_swizzler_option);\n  }\n\n  // TODO: support many more formats.\n\n  switch (blend) {\n    case WUFFS_BASE__PIXEL_BLEND__SRC:\n      transparent_black_func =\n          wuffs_base__pixel_swizzler__transparent_black_src;\n      break;\n\n    case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n      transparent_black_func =\n          wuffs_base__pixel_swizzler__transparent_black_src_over;\n      break;\n  }\n\n  switch (src_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_pixfmt, dst_palette,\n                                                    src_palette" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 2a3bd4f..b1aaa53 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -3035,10 +3035,14 @@
 typedef uint32_t wuffs_base__pixel_alpha_transparency;
 
 #define WUFFS_BASE__PIXEL_ALPHA_TRANSPARENCY__OPAQUE 0
-#define WUFFS_BASE__PIXEL_ALPHA_TRANSPARENCY__NON_PREMULTIPLIED_ALPHA 1
+#define WUFFS_BASE__PIXEL_ALPHA_TRANSPARENCY__NONPREMULTIPLIED_ALPHA 1
 #define WUFFS_BASE__PIXEL_ALPHA_TRANSPARENCY__PREMULTIPLIED_ALPHA 2
 #define WUFFS_BASE__PIXEL_ALPHA_TRANSPARENCY__BINARY_ALPHA 3
 
+// Deprecated: use WUFFS_BASE__PIXEL_ALPHA_TRANSPARENCY__NONPREMULTIPLIED_ALPHA
+// instead.
+#define WUFFS_BASE__PIXEL_ALPHA_TRANSPARENCY__NON_PREMULTIPLIED_ALPHA 1
+
 // --------
 
 #define WUFFS_BASE__PIXEL_FORMAT__NUM_PLANES_MAX 4
@@ -14525,6 +14529,54 @@
   return len;
 }
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static uint64_t  //
+wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128(uint8_t* dst_ptr,
+                                                   size_t dst_len,
+                                                   uint8_t* dst_palette_ptr,
+                                                   size_t dst_palette_len,
+                                                   const uint8_t* src_ptr,
+                                                   size_t src_len) {
+  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E,  //
+                                 0x0B, 0x08, 0x09, 0x0A,  //
+                                 0x07, 0x04, 0x05, 0x06,  //
+                                 0x03, 0x00, 0x01, 0x02);
+
+  while (n >= 4) {
+    __m128i x;
+    x = _mm_loadu_si128((const void*)s);
+    x = _mm_shuffle_epi8(x, shuffle);
+    _mm_storeu_si128((void*)d, x);
+
+    s += 4 * 4;
+    d += 4 * 4;
+    n -= 4;
+  }
+
+  while (n--) {
+    uint8_t b0 = s[0];
+    uint8_t b1 = s[1];
+    uint8_t b2 = s[2];
+    uint8_t b3 = s[3];
+    d[0] = b2;
+    d[1] = b1;
+    d[2] = b0;
+    d[3] = b3;
+    s += 4;
+    d += 4;
+  }
+  return len;
+}
+#endif
+
 static uint64_t  //
 wuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,
                                            size_t dst_len,
@@ -15689,6 +15741,63 @@
   return len;
 }
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgrw__rgb__sse128(uint8_t* dst_ptr,
+                                              size_t dst_len,
+                                              uint8_t* dst_palette_ptr,
+                                              size_t dst_palette_len,
+                                              const uint8_t* src_ptr,
+                                              size_t src_len) {
+  size_t dst_len4 = dst_len / 4;
+  size_t src_len3 = src_len / 3;
+  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B,  //
+                                 0x00, 0x06, 0x07, 0x08,  //
+                                 0x00, 0x03, 0x04, 0x05,  //
+                                 0x00, 0x00, 0x01, 0x02);
+  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00,  //
+                            0xFF, 0x00, 0x00, 0x00);
+
+  while (n >= 6) {
+    __m128i x;
+    x = _mm_loadu_si128((const void*)s);
+    x = _mm_shuffle_epi8(x, shuffle);
+    x = _mm_or_si128(x, or);
+    _mm_storeu_si128((void*)d, x);
+
+    s += 4 * 3;
+    d += 4 * 4;
+    n -= 4;
+  }
+
+  while (n >= 1) {
+    uint8_t b0 = s[0];
+    uint8_t b1 = s[1];
+    uint8_t b2 = s[2];
+    d[0] = b2;
+    d[1] = b1;
+    d[2] = b0;
+    d[3] = 0xFF;
+
+    s += 1 * 3;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+#endif
+
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,
                                       size_t dst_len,
@@ -15703,8 +15812,6 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  // TODO: unroll.
-
   while (n >= 1) {
     uint8_t b0 = s[0];
     uint8_t b1 = s[1];
@@ -16454,6 +16561,12 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
     case WUFFS_BASE__PIXEL_FORMAT__BGRX:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+      if (wuffs_base__cpu_arch__x86_64__capabilities() &
+          WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+        return wuffs_base__pixel_swizzler__bgrw__rgb__sse128;
+      }
+#endif
       return wuffs_base__pixel_swizzler__bgrw__rgb;
 
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
@@ -16496,6 +16609,12 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
       switch (blend) {
         case WUFFS_BASE__PIXEL_BLEND__SRC:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+          if (wuffs_base__cpu_arch__x86_64__capabilities() &
+              WUFFS_BASE__CPU_ARCH__X86_64__SSE128) {
+            return wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128;
+          }
+#endif
           return wuffs_base__pixel_swizzler__swap_rgbx_bgrx;
         case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
           return wuffs_base__pixel_swizzler__bgra_nonpremul__rgba_nonpremul__src_over;
diff --git a/test/c/std/wbmp.c b/test/c/std/wbmp.c
index 7f1fc30..eec67d7 100644
--- a/test/c/std/wbmp.c
+++ b/test/c/std/wbmp.c
@@ -105,7 +105,7 @@
 test_wuffs_pixel_swizzler_swizzle() {
   CHECK_FOCUS(__func__);
 
-  const uint32_t width = 5;
+  const uint32_t width = 22;
   const uint32_t height = 5;
   uint8_t dummy_palette_array[1024];
   wuffs_base__pixel_swizzler swizzler;