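Fix some SIMD compiler warnings

In the SSE pixel-swizzler functions: write the _mm_set_epi8 constants with
explicit signs (0xFF becomes -0x01), cast pointers to __m128i* for
_mm_loadu_si128 / _mm_storeu_si128, cast the u32 load to int for
_mm_cvtsi32_si128, rename the local `or` to `or_ff`, and label the closing
#endif lines with their matching condition.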
diff --git a/internal/cgen/base/pixconv-submodule.c b/internal/cgen/base/pixconv-submodule.c
index f952208..4ba2123 100644
--- a/internal/cgen/base/pixconv-submodule.c
+++ b/internal/cgen/base/pixconv-submodule.c
@@ -546,16 +546,16 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E,  //
-                                 0x0B, 0x08, 0x09, 0x0A,  //
-                                 0x07, 0x04, 0x05, 0x06,  //
-                                 0x03, 0x00, 0x01, 0x02);
+  __m128i shuffle = _mm_set_epi8(+0x0F, +0x0C, +0x0D, +0x0E,  //
+                                 +0x0B, +0x08, +0x09, +0x0A,  //
+                                 +0x07, +0x04, +0x05, +0x06,  //
+                                 +0x03, +0x00, +0x01, +0x02);
 
   while (n >= 4) {
     __m128i x;
-    x = _mm_loadu_si128((const void*)s);
+    x = _mm_loadu_si128((const __m128i*)(const void*)s);
     x = _mm_shuffle_epi8(x, shuffle);
-    _mm_storeu_si128((void*)d, x);
+    _mm_storeu_si128((__m128i*)(void*)d, x);
 
     s += 4 * 4;
     d += 4 * 4;
@@ -576,7 +576,7 @@
   }
   return len;
 }
-#endif
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,
@@ -1760,21 +1760,21 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B,  //
-                                 0x00, 0x06, 0x07, 0x08,  //
-                                 0x00, 0x03, 0x04, 0x05,  //
-                                 0x00, 0x00, 0x01, 0x02);
-  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00);
+  __m128i shuffle = _mm_set_epi8(+0x00, +0x09, +0x0A, +0x0B,  //
+                                 +0x00, +0x06, +0x07, +0x08,  //
+                                 +0x00, +0x03, +0x04, +0x05,  //
+                                 +0x00, +0x00, +0x01, +0x02);
+  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00);
 
   while (n >= 6) {
     __m128i x;
-    x = _mm_loadu_si128((const void*)s);
+    x = _mm_loadu_si128((const __m128i*)(const void*)s);
     x = _mm_shuffle_epi8(x, shuffle);
-    x = _mm_or_si128(x, or);
-    _mm_storeu_si128((void*)d, x);
+    x = _mm_or_si128(x, or_ff);
+    _mm_storeu_si128((__m128i*)(void*)d, x);
 
     s += 4 * 3;
     d += 4 * 4;
@@ -1797,7 +1797,7 @@
 
   return len;
 }
-#endif
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,
@@ -2137,21 +2137,21 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03,  //
-                                 0x02, 0x02, 0x02, 0x02,  //
-                                 0x01, 0x01, 0x01, 0x01,  //
-                                 0x00, 0x00, 0x00, 0x00);
-  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00);
+  __m128i shuffle = _mm_set_epi8(+0x03, +0x03, +0x03, +0x03,  //
+                                 +0x02, +0x02, +0x02, +0x02,  //
+                                 +0x01, +0x01, +0x01, +0x01,  //
+                                 +0x00, +0x00, +0x00, +0x00);
+  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00);
 
   while (n >= 4) {
     __m128i x;
-    x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));
+    x = _mm_cvtsi32_si128((int)(wuffs_base__load_u32le__no_bounds_check(s)));
     x = _mm_shuffle_epi8(x, shuffle);
-    x = _mm_or_si128(x, or);
-    _mm_storeu_si128((void*)d, x);
+    x = _mm_or_si128(x, or_ff);
+    _mm_storeu_si128((__m128i*)(void*)d, x);
 
     s += 4 * 1;
     d += 4 * 4;
@@ -2169,7 +2169,7 @@
 
   return len;
 }
-#endif
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index e0f889f..7a4c1ed 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -551,8 +551,8 @@
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_align4_bgr_565_888(uint8_t* dst_ptr,\n                                                      size_t dst_len,\n                                                      uint8_t* dst_palette_ptr,\n                                                      size_t dst_palette_len,\n                                                      const uint8_t* src_ptr,\n                                                      size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    uint32_t alpha = argb & 0xFF000000;\n    wuffs_base__store_u32le__no_bounds_check(\n        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\nstatic " +
 	"uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgb_bgr(uint8_t* dst_ptr,\n                                         size_t dst_len,\n                                         uint8_t* dst_palette_ptr,\n                                         size_t dst_palette_len,\n                                         const uint8_t* src_ptr,\n                                         size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    s += 3;\n    d += 3;\n  }\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128(uint8_t* dst_ptr,\n                                                   size_t dst_len,\n                                                   uint8_t* dst_palett" +
-	"e_ptr,\n                                                   size_t dst_palette_len,\n                                                   const uint8_t* src_ptr,\n                                                   size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E,  //\n                                 0x0B, 0x08, 0x09, 0x0A,  //\n                                 0x07, 0x04, 0x05, 0x06,  //\n                                 0x03, 0x00, 0x01, 0x02);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_loadu_si128((const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    _mm_storeu_si128((void*)d, x);\n\n    s += 4 * 4;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n#endif\n\nstati" +
-	"c uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,\n                                           size_t dst_len,\n                                           uint8_t* dst_palette_ptr,\n                                           size_t dst_palette_len,\n                                           const uint8_t* src_ptr,\n                                           size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\n" +
+	"e_ptr,\n                                                   size_t dst_palette_len,\n                                                   const uint8_t* src_ptr,\n                                                   size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x0F, +0x0C, +0x0D, +0x0E,  //\n                                 +0x0B, +0x08, +0x09, +0x0A,  //\n                                 +0x07, +0x04, +0x05, +0x06,  //\n                                 +0x03, +0x00, +0x01, +0x02);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_loadu_si128((const __m128i*)(const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    _mm_storeu_si128((__m128i*)(void*)d, x);\n\n    s += 4 * 4;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n   " +
+	" d += 4;\n  }\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,\n                                           size_t dst_len,\n                                           uint8_t* dst_palette_ptr,\n                                           size_t dst_palette_len,\n                                           const uint8_t* src_ptr,\n                                           size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_tight_4x8_4x16le(uint8_t* dst_ptr,\n                                                    size_t dst_len,\n                                                    uint8_t* dst_palette_ptr,\n                                                    size_t dst_palette_len,\n                                                    const uint8_t* src_ptr,\n                                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__color_u64__as__color_u32(\n                         wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));\n\n    s += 1 * 8;\n    d += 1 * 4;\n    n -= 1;\n  }\n  return len;\n}\n\n" +
 	"" +
@@ -595,8 +595,9 @@
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__bgr(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__load_u24le__no_bounds_check(s + (0 * 3)));\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__bgrx(uint8_t* dst_ptr,\n                                       size_t dst_len,\n                                       uint8_t* dst_palet" +
 	"te_ptr,\n                                       size_t dst_palette_len,\n                                       const uint8_t* src_ptr,\n                                       size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb__sse128(uint8_t* dst_ptr,\n                                              size_t dst_len,\n                                              uint8_t* dst_palette_ptr,\n                                              size_t dst_palet" +
-	"te_len,\n                                              const uint8_t* src_ptr,\n                                              size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B,  //\n                                 0x00, 0x06, 0x07, 0x08,  //\n                                 0x00, 0x03, 0x04, 0x05,  //\n                                 0x00, 0x00, 0x01, 0x02);\n  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00);\n\n  while (n >= 6) {\n    __m128i x;\n    x = _mm_loadu_si128((const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or);\n    _mm_storeu_si128((void*)d, x);\n\n    s += 4 * 3;\n    d += 4 * 4;\n    n -= 4" +
-	";\n  }\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"te_len,\n                                              const uint8_t* src_ptr,\n                                              size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x00, +0x09, +0x0A, +0x0B,  //\n                                 +0x00, +0x06, +0x07, +0x08,  //\n                                 +0x00, +0x03, +0x04, +0x05,  //\n                                 +0x00, +0x00, +0x01, +0x02);\n  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00);\n\n  while (n >= 6) {\n    __m128i x;\n    x = _mm_loadu_si128((const __m128i*)(const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or_ff);\n    _mm_storeu_" +
+	"si128((__m128i*)(void*)d, x);\n\n    s += 4 * 3;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n  " +
+	"  d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(uint8_t* dst_ptr,\n                                            size_t dst_len,\n                                            uint8_t* dst_palette_ptr,\n                                            size_t dst_palette_len,\n                                            const uint8_t* src_ptr,\n                                            size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst_len / 3;\n  size_t len = (dst_len3 < src_len) ? dst_len3 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with\n  // \">=\", the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store\n  // will overwrite that with the correct byte. There is always another\n  // store, whether a 4-byte store in this loop" +
 	" or a 1-byte store in the\n  // next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((siz" +
@@ -609,8 +610,8 @@
 	"dst_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n" +
 	"  }\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u32le__no_bounds_check(d + (1 * 4), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u32le__no_bounds_check(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_" +
 	"palette_ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u32le__no_bounds_check(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y__sse128(uint8_t* dst_ptr,\n                                            size_t dst_len,\n                                            uint8_t* dst_palette_ptr,\n                                            size_t dst_palette_len,\n                                     " +
-	"       const uint8_t* src_ptr,\n                                            size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03,  //\n                                 0x02, 0x02, 0x02, 0x02,  //\n                                 0x01, 0x01, 0x01, 0x01,  //\n                                 0x00, 0x00, 0x00, 0x00);\n  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00,  //\n                            0xFF, 0x00, 0x00, 0x00);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or);\n    _mm_storeu_si128((void*)d, x);\n\n    s += 4 * 1;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le_" +
-	"_no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,\n                                    size_t dst_len,\n                                    uint8_t* dst_palette_ptr,\n                                    size_t dst_palette_len,\n                                    const uint8_t* src_ptr,\n                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"       const uint8_t* src_ptr,\n                                            size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x03, +0x03, +0x03, +0x03,  //\n                                 +0x02, +0x02, +0x02, +0x02,  //\n                                 +0x01, +0x01, +0x01, +0x01,  //\n                                 +0x00, +0x00, +0x00, +0x00);\n  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_cvtsi32_si128((int)(wuffs_base__load_u32le__no_bounds_check(s)));\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or_ff);\n    _mm_storeu_si128((__m128i*)(void*)d, x);\n\n    s += 4 * 1;\n    d += 4 * 4;\n" +
+	"    n -= 4;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,\n                                    size_t dst_len,\n                                    uint8_t* dst_palette_ptr,\n                                    size_t dst_palette_len,\n                                    const uint8_t* src_ptr,\n                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__transparent_black_src(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    uint64_t num_pixels,\n    uint32_t dst_pixfmt_bytes_per_pixel) {\n  uint64_t n = ((uint64_t)dst_len) / dst_pixfmt_bytes_per_pixel;\n  if (n > num_pixels) {\n    n = num_pixels;\n  }\n  memset(dst_ptr, 0, ((size_t)(n * dst_pixfmt_bytes_per_pixel)));\n  return n;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__transparent_black_src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    uint64_t num_pixels,\n    uint32_t dst_pixfmt_bytes_per_pixel) {\n  uint64_t n = ((uint64_t)dst_len) / dst_pixfmt_bytes_per_pixel;\n  if (n > num_pixels) {\n    n = num_pixels;\n  }\n  return n;\n}\n\n" +
 	"" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b1aaa53..77dbca4 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -14545,16 +14545,16 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E,  //
-                                 0x0B, 0x08, 0x09, 0x0A,  //
-                                 0x07, 0x04, 0x05, 0x06,  //
-                                 0x03, 0x00, 0x01, 0x02);
+  __m128i shuffle = _mm_set_epi8(+0x0F, +0x0C, +0x0D, +0x0E,  //
+                                 +0x0B, +0x08, +0x09, +0x0A,  //
+                                 +0x07, +0x04, +0x05, +0x06,  //
+                                 +0x03, +0x00, +0x01, +0x02);
 
   while (n >= 4) {
     __m128i x;
-    x = _mm_loadu_si128((const void*)s);
+    x = _mm_loadu_si128((const __m128i*)(const void*)s);
     x = _mm_shuffle_epi8(x, shuffle);
-    _mm_storeu_si128((void*)d, x);
+    _mm_storeu_si128((__m128i*)(void*)d, x);
 
     s += 4 * 4;
     d += 4 * 4;
@@ -14575,7 +14575,7 @@
   }
   return len;
 }
-#endif
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,
@@ -15759,21 +15759,21 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B,  //
-                                 0x00, 0x06, 0x07, 0x08,  //
-                                 0x00, 0x03, 0x04, 0x05,  //
-                                 0x00, 0x00, 0x01, 0x02);
-  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00);
+  __m128i shuffle = _mm_set_epi8(+0x00, +0x09, +0x0A, +0x0B,  //
+                                 +0x00, +0x06, +0x07, +0x08,  //
+                                 +0x00, +0x03, +0x04, +0x05,  //
+                                 +0x00, +0x00, +0x01, +0x02);
+  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00);
 
   while (n >= 6) {
     __m128i x;
-    x = _mm_loadu_si128((const void*)s);
+    x = _mm_loadu_si128((const __m128i*)(const void*)s);
     x = _mm_shuffle_epi8(x, shuffle);
-    x = _mm_or_si128(x, or);
-    _mm_storeu_si128((void*)d, x);
+    x = _mm_or_si128(x, or_ff);
+    _mm_storeu_si128((__m128i*)(void*)d, x);
 
     s += 4 * 3;
     d += 4 * 4;
@@ -15796,7 +15796,7 @@
 
   return len;
 }
-#endif
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,
@@ -16136,21 +16136,21 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03,  //
-                                 0x02, 0x02, 0x02, 0x02,  //
-                                 0x01, 0x01, 0x01, 0x01,  //
-                                 0x00, 0x00, 0x00, 0x00);
-  __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00,  //
-                            0xFF, 0x00, 0x00, 0x00);
+  __m128i shuffle = _mm_set_epi8(+0x03, +0x03, +0x03, +0x03,  //
+                                 +0x02, +0x02, +0x02, +0x02,  //
+                                 +0x01, +0x01, +0x01, +0x01,  //
+                                 +0x00, +0x00, +0x00, +0x00);
+  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00,  //
+                               -0x01, +0x00, +0x00, +0x00);
 
   while (n >= 4) {
     __m128i x;
-    x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));
+    x = _mm_cvtsi32_si128((int)(wuffs_base__load_u32le__no_bounds_check(s)));
     x = _mm_shuffle_epi8(x, shuffle);
-    x = _mm_or_si128(x, or);
-    _mm_storeu_si128((void*)d, x);
+    x = _mm_or_si128(x, or_ff);
+    _mm_storeu_si128((__m128i*)(void*)d, x);
 
     s += 4 * 1;
     d += 4 * 4;
@@ -16168,7 +16168,7 @@
 
   return len;
 }
-#endif
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,