Add WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index 990a754..6952a77 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -44,10 +44,19 @@
 
 // Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU
 // architecture, such as SSE SIMD for the x86 CPU family.
-#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)
+#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)  // (#if-chain ref AVOID_CPU_ARCH_0)
 // No-op.
+#else  // (#if-chain ref AVOID_CPU_ARCH_0)
+
+// The "defined(__clang__)" isn't redundant. While vanilla clang defines
+// __GNUC__, clang-cl (which mimics MSVC's cl.exe) does not.
+#if defined(__GNUC__) || defined(__clang__)
+#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg) __attribute__((target(arg)))
 #else
-#if defined(__GNUC__)
+#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg)
+#endif  // defined(__GNUC__) || defined(__clang__)
+
+#if defined(__GNUC__)  // (#if-chain ref AVOID_CPU_ARCH_1)
 
 // To simplify Wuffs code, "cpu_arch >= arm_xxx" requires xxx but also
 // unaligned little-endian load/stores.
@@ -73,15 +82,24 @@
 #define WUFFS_BASE__CPU_ARCH__X86_64
 #endif  // defined(__x86_64__)
 
-#elif defined(_MSC_VER)  // defined(__GNUC__)
+#elif defined(_MSC_VER)  // (#if-chain ref AVOID_CPU_ARCH_1)
 
 #if defined(_M_X64)
+#if defined(__clang__)
+// No-op. clang-cl (which defines both __clang__ and _MSC_VER) supports
+// "__attribute__((target(arg)))".
+#elif !defined(__AVX__)
+// For MSVC's cl.exe (unlike clang or gcc), SIMD capability is a compile-time
+// property of the source file (e.g. a /arch:AVX or -mavx compiler flag), not
+// of individual functions (that can be conditionally selected at runtime).
+#error "Wuffs with MSVC+X64 needs /arch:AVX or /DWUFFS_CONFIG__AVOID_CPU_ARCH"
+#endif  // defined(__clang__); !defined(__AVX__)
 #include <intrin.h>
 #define WUFFS_BASE__CPU_ARCH__X86_64
-#endif  // defined(__x86_64__)
+#endif  // defined(_M_X64)
 
-#endif  // defined(__GNUC__); defined(_MSC_VER)
-#endif  // defined(WUFFS_CONFIG__AVOID_CPU_ARCH)
+#endif  // (#if-chain ref AVOID_CPU_ARCH_1)
+#endif  // (#if-chain ref AVOID_CPU_ARCH_0)
 
 // --------
 
@@ -123,6 +141,8 @@
   //  - bit_POPCNT = (1 << 23)
   //  - bit_SSE4_2 = (1 << 20)
   const unsigned int sse42_ecx1 = 0x00900002;
+
+  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
 #if defined(__GNUC__)
   unsigned int eax1 = 0;
   unsigned int ebx1 = 0;
diff --git a/internal/cgen/base/pixconv-submodule.c b/internal/cgen/base/pixconv-submodule.c
index 30d1792..9a3e7a8 100644
--- a/internal/cgen/base/pixconv-submodule.c
+++ b/internal/cgen/base/pixconv-submodule.c
@@ -757,9 +757,7 @@
 }
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("sse4.2")
 static uint64_t  //
 wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse42(uint8_t* dst_ptr,
                                                   size_t dst_len,
@@ -2102,9 +2100,7 @@
 }
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("sse4.2")
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgrw__rgb__sse42(uint8_t* dst_ptr,
                                              size_t dst_len,
@@ -2545,9 +2541,7 @@
 }
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("sse4.2")
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxxx__y__sse42(uint8_t* dst_ptr,
                                            size_t dst_len,
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 1143893..ee6cfaf 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -54,13 +54,14 @@
 	"// ---------------- Version\n\n// WUFFS_VERSION is the major.minor.patch version, as per https://semver.org/,\n// as a uint64_t. The major number is the high 32 bits. The minor number is the\n// middle 16 bits. The patch number is the low 16 bits. The pre-release label\n// and build metadata are part of the string representation (such as\n// \"1.2.3-beta+456.20181231\") but not the uint64_t representation.\n//\n// WUFFS_VERSION_PRE_RELEASE_LABEL (such as \"\", \"beta\" or \"rc.1\") being\n// non-empty denotes a developer preview, not a release version, and has no\n// backwards or forwards compatibility guarantees.\n//\n// WUFFS_VERSION_BUILD_METADATA_XXX, if non-zero, are the number of commits and\n// the last commit date in the repository used to build this library. Within\n// each major.minor branch, the commit count should increase monotonically.\n//\n// !! Some code generation programs can override WUFFS_VERSION.\n#define WUFFS_VERSION 0\n#define WUFFS_VERSION_MAJOR 0\n#define WUFFS_VERSION_MINOR 0\n#define WUFFS_VERSION_PATCH 0\n#de" +
 	"fine WUFFS_VERSION_PRE_RELEASE_LABEL \"work.in.progress\"\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_COUNT 0\n#define WUFFS_VERSION_BUILD_METADATA_COMMIT_DATE 0\n#define WUFFS_VERSION_STRING \"0.0.0+0.00000000\"\n\n" +
 	"" +
-	"// ---------------- Configuration\n\n// Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU\n// architecture, such as SSE SIMD for the x86 CPU family.\n#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)\n// No-op.\n#else\n#if defined(__GNUC__)\n\n// To simplify Wuffs code, \"cpu_arch >= arm_xxx\" requires xxx but also\n// unaligned little-endian load/stores.\n#if defined(__ARM_FEATURE_UNALIGNED) && defined(__BYTE_ORDER__) && \\\n    (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)\n// Not all gcc versions define __ARM_ACLE, even if they support crc32\n// intrinsics. Look for __ARM_FEATURE_CRC32 instead.\n#if defined(__ARM_FEATURE_CRC32)\n#include <arm_acle.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_CRC32\n#endif  // defined(__ARM_FEATURE_CRC32)\n#if defined(__ARM_NEON)\n#include <arm_neon.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_NEON\n#endif  // defined(__ARM_NEON)\n#endif  // defined(__ARM_FEATURE_UNALIGNED) etc\n\n// Similarly, \"cpu_arch >= x86_sse42\" requires SSE4.2 but also PCLMUL and\n// POPCNT. This is checked at runtime via cpu" +
-	"id, not at compile time.\n#if defined(__x86_64__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(__x86_64__)\n\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n\n#if defined(_M_X64)\n#include <intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(__x86_64__)\n\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_CONFIG__AVOID_CPU_ARCH)\n\n" +
+	"// ---------------- Configuration\n\n// Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU\n// architecture, such as SSE SIMD for the x86 CPU family.\n#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)  // (#if-chain ref AVOID_CPU_ARCH_0)\n// No-op.\n#else  // (#if-chain ref AVOID_CPU_ARCH_0)\n\n// The \"defined(__clang__)\" isn't redundant. While vanilla clang defines\n// __GNUC__, clang-cl (which mimics MSVC's cl.exe) does not.\n#if defined(__GNUC__) || defined(__clang__)\n#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg) __attribute__((target(arg)))\n#else\n#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg)\n#endif  // defined(__GNUC__) || defined(__clang__)\n\n#if defined(__GNUC__)  // (#if-chain ref AVOID_CPU_ARCH_1)\n\n// To simplify Wuffs code, \"cpu_arch >= arm_xxx\" requires xxx but also\n// unaligned little-endian load/stores.\n#if defined(__ARM_FEATURE_UNALIGNED) && defined(__BYTE_ORDER__) && \\\n    (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)\n// Not all gcc versions define __ARM_ACLE, even if they support crc32" +
+	"\n// intrinsics. Look for __ARM_FEATURE_CRC32 instead.\n#if defined(__ARM_FEATURE_CRC32)\n#include <arm_acle.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_CRC32\n#endif  // defined(__ARM_FEATURE_CRC32)\n#if defined(__ARM_NEON)\n#include <arm_neon.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_NEON\n#endif  // defined(__ARM_NEON)\n#endif  // defined(__ARM_FEATURE_UNALIGNED) etc\n\n// Similarly, \"cpu_arch >= x86_sse42\" requires SSE4.2 but also PCLMUL and\n// POPCNT. This is checked at runtime via cpuid, not at compile time.\n#if defined(__x86_64__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(__x86_64__)\n\n#elif defined(_MSC_VER)  // (#if-chain ref AVOID_CPU_ARCH_1)\n\n#if defined(_M_X64)\n#if defined(__clang__)\n// No-op. clang-cl (which defines both __clang__ and _MSC_VER) supports\n// \"__attribute__((target(arg)))\".\n#elif !defined(__AVX__)\n// For MSVC's cl.exe (unlike clang or gcc), SIMD capability is a compile-time\n// property of the source file (e.g. a /arch:AVX or -mavx compiler flag), not" +
+	"\n// of individual functions (that can be conditionally selected at runtime).\n#error \"Wuffs with MSVC+X64 needs /arch:AVX or /DWUFFS_CONFIG__AVOID_CPU_ARCH\"\n#endif  // defined(__clang__); !defined(__AVX__)\n#include <intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(_M_X64)\n\n#endif  // (#if-chain ref AVOID_CPU_ARCH_1)\n#endif  // (#if-chain ref AVOID_CPU_ARCH_0)\n\n" +
 	"" +
 	"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif  // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
 	"" +
-	"// ---------------- CPU Architecture\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_PCLMUL = (1 <<  1)\n  //  - bit_POPCNT = (1 << 23)\n  //  - bit_SSE4_2 = (1 << 20)\n  const unsigned int sse42_ecx1 = 0x00900002;\n#if defined(__GNUC__)\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n    return (ecx1 & sse42_ecx1) == sse42_ecx1;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x[4];\n  __cpuid(x" +
-	", 1);\n  return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\n" +
+	"// ---------------- CPU Architecture\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_PCLMUL = (1 <<  1)\n  //  - bit_POPCNT = (1 << 23)\n  //  - bit_SSE4_2 = (1 << 20)\n  const unsigned int sse42_ecx1 = 0x00900002;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n    return (ecx1 & sse42_ecx1) == sse42_" +
+	"ecx1;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x[4];\n  __cpuid(x, 1);\n  return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\n" +
 	"" +
 	"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n//  - converting a uint32_t to a size_t will never overflow.\n//  - converting a size_t to a uint64_t will never overflow.\n#ifdef __WORDSIZE\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n#if defined(__clang__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED_FIELD __attribute__((unused))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED_FIELD\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
 	"" +
@@ -582,9 +583,9 @@
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_align4_bgr_565_8888(uint8_t* dst_ptr,\n                                                       size_t dst_len,\n                                                       const uint8_t* src_ptr,\n                                                       size_t src_len,\n                                                       bool nonpremul) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint32_t argb = wuffs_base__peek_u32le__no_bounds_check(s);\n    if (nonpremul) {\n      argb =\n          wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(argb);\n    }\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    uint32_t alpha = argb & 0xFF000000;\n    wuffs_base__poke_u32le__no_bounds_check(\n        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));\n    s += 4;\n   " +
 	" d += 4;\n  }\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgb_bgr(uint8_t* dst_ptr,\n                                         size_t dst_len,\n                                         uint8_t* dst_palette_ptr,\n                                         size_t dst_palette_len,\n                                         const uint8_t* src_ptr,\n                                         size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    s += 3;\n    d += 3;\n  }\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse42(uint8_t* dst_ptr,\n                                                  size_t dst_len,\n                                                  ui" +
-	"nt8_t* dst_palette_ptr,\n                                                  size_t dst_palette_len,\n                                                  const uint8_t* src_ptr,\n                                                  size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x0F, +0x0C, +0x0D, +0x0E,  //\n                                 +0x0B, +0x08, +0x09, +0x0A,  //\n                                 +0x07, +0x04, +0x05, +0x06,  //\n                                 +0x03, +0x00, +0x01, +0x02);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_lddqu_si128((const __m128i*)(const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    _mm_storeu_si128((__m128i*)(void*)d, x);\n\n    s += 4 * 4;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n " +
-	"   s += 4;\n    d += 4;\n  }\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,\n                                           size_t dst_len,\n                                           uint8_t* dst_palette_ptr,\n                                           size_t dst_palette_len,\n                                           const uint8_t* src_ptr,\n                                           size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgb_bgr(uint8_t* dst_ptr,\n                                         size_t dst_len,\n                                         uint8_t* dst_palette_ptr,\n                                         size_t dst_palette_len,\n                                         const uint8_t* src_ptr,\n                                         size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    s += 3;\n    d += 3;\n  }\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\nWUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"sse4.2\")\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse42(uint8_t* dst_ptr,\n                                                  size_t dst_len,\n                                                  uint8_t* dst_palette" +
+	"_ptr,\n                                                  size_t dst_palette_len,\n                                                  const uint8_t* src_ptr,\n                                                  size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x0F, +0x0C, +0x0D, +0x0E,  //\n                                 +0x0B, +0x08, +0x09, +0x0A,  //\n                                 +0x07, +0x04, +0x05, +0x06,  //\n                                 +0x03, +0x00, +0x01, +0x02);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_lddqu_si128((const __m128i*)(const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    _mm_storeu_si128((__m128i*)(void*)d, x);\n\n    s += 4 * 4;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d +" +
+	"= 4;\n  }\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,\n                                           size_t dst_len,\n                                           uint8_t* dst_palette_ptr,\n                                           size_t dst_palette_len,\n                                           const uint8_t* src_ptr,\n                                           size_t src_len) {\n  size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_tight_4x8_4x16le(uint8_t* dst_ptr,\n                                                    size_t dst_len,\n                                                    uint8_t* dst_palette_ptr,\n                                                    size_t dst_palette_len,\n                                                    const uint8_t* src_ptr,\n                                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  size_t n = len;\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__color_u64__as__color_u32(\n                         wuffs_base__peek_u64le__no_bounds_check(s + (0 * 8))));\n\n    s += 1 * 8;\n    d += 1 * 4;\n    n -= 1;\n  }\n  return len;\n}\n\n" +
 	"" +
@@ -629,10 +630,10 @@
 	"e_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__peek_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__swap_u32_argb_abgr(\n        wuffs_base__peek_u32le__no_bounds_check(s + (0 * 4)));\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__bgr(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__peek_u24le__no_bounds_check(s + (0 * 3)));\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__bgrx(uint8_t* dst_ptr,\n                                       size_t dst_len,\n                                       uint8_t* dst_palett" +
-	"e_ptr,\n                                       size_t dst_palette_len,\n                                       const uint8_t* src_ptr,\n                                       size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__peek_u32le__no_bounds_check(s + (0 * 4)));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb__sse42(uint8_t* dst_ptr,\n                                             size_t dst_len,\n                                             uint8_t* dst_palette_ptr,\n                                             size_t dst_palette_len" +
-	",\n                                             const uint8_t* src_ptr,\n                                             size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x00, +0x09, +0x0A, +0x0B,  //\n                                 +0x00, +0x06, +0x07, +0x08,  //\n                                 +0x00, +0x03, +0x04, +0x05,  //\n                                 +0x00, +0x00, +0x01, +0x02);\n  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00);\n\n  while (n >= 6) {\n    __m128i x;\n    x = _mm_lddqu_si128((const __m128i*)(const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or_ff);\n    _mm_storeu_si128((_" +
-	"_m128i*)(void*)d, x);\n\n    s += 4 * 3;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] =" +
-	" b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"e_ptr,\n                                       size_t dst_palette_len,\n                                       const uint8_t* src_ptr,\n                                       size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__peek_u32le__no_bounds_check(s + (0 * 4)));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\nWUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"sse4.2\")\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb__sse42(uint8_t* dst_ptr,\n                                             size_t dst_len,\n                                             uint8_t* dst_palette_ptr,\n                                             size_t dst_palette_len,\n                " +
+	"                             const uint8_t* src_ptr,\n                                             size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x00, +0x09, +0x0A, +0x0B,  //\n                                 +0x00, +0x06, +0x07, +0x08,  //\n                                 +0x00, +0x03, +0x04, +0x05,  //\n                                 +0x00, +0x00, +0x01, +0x02);\n  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00);\n\n  while (n >= 6) {\n    __m128i x;\n    x = _mm_lddqu_si128((const __m128i*)(const void*)s);\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or_ff);\n    _mm_storeu_si128((__m128i*)(void*)d, " +
+	"x);\n\n    s += 4 * 3;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0" +
+	";\n    d[3] = 0xFF;\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(uint8_t* dst_ptr,\n                                            size_t dst_len,\n                                            uint8_t* dst_palette_ptr,\n                                            size_t dst_palette_len,\n                                            const uint8_t* src_ptr,\n                                            size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst_len / 3;\n  size_t len = (dst_len3 < src_len) ? dst_len3 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with\n  // \">=\", the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store\n  // will overwrite that with the correct byte. There is always another\n  // store, whether a 4-byte store in this loop" +
 	" or a 1-byte store in the\n  // next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 3), wuffs_base__peek_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (1 * 3), wuffs_base__peek_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (2 * 3), wuffs_base__peek_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (3 * 3), wuffs_base__peek_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__peek_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)" +
@@ -646,10 +647,10 @@
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index__src(uint8_t* dst_ptr,\n                                             size_t dst_len,\n                                             uint8_t* dst_palette_ptr,\n                                             size_t dst_palette_len,\n                                             const uint8_t* src_ptr,\n                                             size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__peek_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (1 * 4), wuffs_base__peek_u32le__no_bounds_check(\n                         ds" +
 	"t_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (2 * 4), wuffs_base__peek_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (3 * 4), wuffs_base__peek_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__peek_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n " +
 	" size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__peek_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__poke_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__peek_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__poke_u32le__no_bounds_check(d + (1 * 4), s1);\n    }\n    uint32_t s2 = wuffs_base__peek_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__poke_u32le__no_bounds_check(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__peek_u32le__no_bounds_check(dst_palette_" +
-	"ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__poke_u32le__no_bounds_check(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__peek_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__poke_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y__sse42(uint8_t* dst_ptr,\n                                           size_t dst_len,\n                                           uint8_t* dst_palette_ptr,\n                                           size_t dst_palette_len,\n                                           const ui" +
-	"nt8_t* src_ptr,\n                                           size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x03, +0x03, +0x03, +0x03,  //\n                                 +0x02, +0x02, +0x02, +0x02,  //\n                                 +0x01, +0x01, +0x01, +0x01,  //\n                                 +0x00, +0x00, +0x00, +0x00);\n  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(s)));\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or_ff);\n    _mm_storeu_si128((__m128i*)(void*)d, x);\n\n    s += 4 * 1;\n    d += 4 * 4;\n    n -= 4;\n  }\n" +
-	"\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,\n                                    size_t dst_len,\n                                    uint8_t* dst_palette_ptr,\n                                    size_t dst_palette_len,\n                                    const uint8_t* src_ptr,\n                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y_16" +
-	"be(uint8_t* dst_ptr,\n                                         size_t dst_len,\n                                         uint8_t* dst_palette_ptr,\n                                         size_t dst_palette_len,\n                                         const uint8_t* src_ptr,\n                                         size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len2 = src_len / 2;\n  size_t len = (dst_len4 < src_len2) ? dst_len4 : src_len2;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 2;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__poke_u32le__no_bounds_check(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__peek_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__poke_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\nWUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"sse4.2\")\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y__sse42(uint8_t* dst_ptr,\n                                           size_t dst_len,\n                                           uint8_t* dst_palette_ptr,\n                                           size_t dst_palette_len,\n                                           const uint8_t* src_ptr,\n  " +
+	"                                         size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  __m128i shuffle = _mm_set_epi8(+0x03, +0x03, +0x03, +0x03,  //\n                                 +0x02, +0x02, +0x02, +0x02,  //\n                                 +0x01, +0x01, +0x01, +0x01,  //\n                                 +0x00, +0x00, +0x00, +0x00);\n  __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00,  //\n                               -0x01, +0x00, +0x00, +0x00);\n\n  while (n >= 4) {\n    __m128i x;\n    x = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(s)));\n    x = _mm_shuffle_epi8(x, shuffle);\n    x = _mm_or_si128(x, or_ff);\n    _mm_storeu_si128((__m128i*)(void*)d, x);\n\n    s += 4 * 1;\n    d += 4 * 4;\n    n -= 4;\n  }\n\n  while (n >= 1) " +
+	"{\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,\n                                    size_t dst_len,\n                                    uint8_t* dst_palette_ptr,\n                                    size_t dst_palette_len,\n                                    const uint8_t* src_ptr,\n                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y_16be(uint8_t* dst_pt" +
+	"r,\n                                         size_t dst_len,\n                                         uint8_t* dst_palette_ptr,\n                                         size_t dst_palette_len,\n                                         const uint8_t* src_ptr,\n                                         size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len2 = src_len / 2;\n  size_t len = (dst_len4 < src_len2) ? dst_len4 : src_len2;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    wuffs_base__poke_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 2;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__y__y_16be(uint8_t* dst_ptr,\n                                      size_t dst_len,\n                                      uint8_t* dst_palette_ptr,\n                                      size_t dst_palette_len,\n                                      const uint8_t* src_ptr,\n                                      size_t src_len) {\n  size_t src_len2 = src_len / 2;\n  size_t len = (dst_len < src_len2) ? dst_len : src_len2;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    d[0] = s[0];\n\n    s += 1 * 2;\n    d += 1 * 1;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
diff --git a/internal/cgen/func.go b/internal/cgen/func.go
index 4526328..6a2c338 100644
--- a/internal/cgen/func.go
+++ b/internal/cgen/func.go
@@ -239,7 +239,7 @@
 		b.printf("#if defined(WUFFS_BASE__CPU_ARCH__%s)\n", caMacro)
 	}
 	if caAttribute != "" {
-		b.printf("#if defined(__GNUC__)\n%s\n#endif\n", caAttribute)
+		b.printf("%s\n", caAttribute)
 	}
 
 	if err := g.writeFuncSignature(b, n, wfsCDecl); err != nil {
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index a60a578..23ee914 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -277,7 +277,8 @@
 				caMacro, caName, caAttribute = "ARM_NEON", "arm_neon", ""
 			case t.IDX86SSE42:
 				caMacro, caName, caAttribute =
-					"X86_64", "x86_sse42", "__attribute__((target(\"pclmul,popcnt,sse4.2\")))"
+					"X86_64", "x86_sse42",
+					"WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"pclmul,popcnt,sse4.2\")"
 			}
 		}
 	}
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index d548daa..fcc5522 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -78,10 +78,19 @@
 
 // Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU
 // architecture, such as SSE SIMD for the x86 CPU family.
-#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)
+#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)  // (#if-chain ref AVOID_CPU_ARCH_0)
 // No-op.
+#else  // (#if-chain ref AVOID_CPU_ARCH_0)
+
+// The "defined(__clang__)" isn't redundant. While vanilla clang defines
+// __GNUC__, clang-cl (which mimics MSVC's cl.exe) does not.
+#if defined(__GNUC__) || defined(__clang__)
+#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg) __attribute__((target(arg)))
 #else
-#if defined(__GNUC__)
+#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg)
+#endif  // defined(__GNUC__) || defined(__clang__)
+
+#if defined(__GNUC__)  // (#if-chain ref AVOID_CPU_ARCH_1)
 
 // To simplify Wuffs code, "cpu_arch >= arm_xxx" requires xxx but also
 // unaligned little-endian load/stores.
@@ -107,15 +116,24 @@
 #define WUFFS_BASE__CPU_ARCH__X86_64
 #endif  // defined(__x86_64__)
 
-#elif defined(_MSC_VER)  // defined(__GNUC__)
+#elif defined(_MSC_VER)  // (#if-chain ref AVOID_CPU_ARCH_1)
 
 #if defined(_M_X64)
+#if defined(__clang__)
+// No-op. clang-cl (which defines both __clang__ and _MSC_VER) supports
+// "__attribute__((target(arg)))".
+#elif !defined(__AVX__)
+// For MSVC's cl.exe (unlike clang or gcc), SIMD capability is a compile-time
+// property of the source file (e.g. a /arch:AVX or -mavx compiler flag), not
+// of individual functions (that can be conditionally selected at runtime).
+#error "Wuffs with MSVC+X64 needs /arch:AVX or /DWUFFS_CONFIG__AVOID_CPU_ARCH"
+#endif  // defined(__clang__); !defined(__AVX__)
 #include <intrin.h>
 #define WUFFS_BASE__CPU_ARCH__X86_64
-#endif  // defined(__x86_64__)
+#endif  // defined(_M_X64)
 
-#endif  // defined(__GNUC__); defined(_MSC_VER)
-#endif  // defined(WUFFS_CONFIG__AVOID_CPU_ARCH)
+#endif  // (#if-chain ref AVOID_CPU_ARCH_1)
+#endif  // (#if-chain ref AVOID_CPU_ARCH_0)
 
 // --------
 
@@ -157,6 +175,8 @@
   //  - bit_POPCNT = (1 << 23)
   //  - bit_SSE4_2 = (1 << 20)
   const unsigned int sse42_ecx1 = 0x00900002;
+
+  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
 #if defined(__GNUC__)
   unsigned int eax1 = 0;
   unsigned int ebx1 = 0;
@@ -15463,9 +15483,7 @@
 }
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("sse4.2")
 static uint64_t  //
 wuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse42(uint8_t* dst_ptr,
                                                   size_t dst_len,
@@ -16808,9 +16826,7 @@
 }
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("sse4.2")
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgrw__rgb__sse42(uint8_t* dst_ptr,
                                              size_t dst_len,
@@ -17251,9 +17267,7 @@
 }
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("sse4.2")
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxxx__y__sse42(uint8_t* dst_ptr,
                                            size_t dst_len,
@@ -18700,9 +18714,7 @@
 // -------- func adler32.hasher.up_x86_sse42
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("pclmul,popcnt,sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
 static wuffs_base__empty_struct
 wuffs_adler32__hasher__up_x86_sse42(
     wuffs_adler32__hasher* self,
@@ -22881,9 +22893,7 @@
 // -------- func crc32.ieee_hasher.up_x86_sse42
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("pclmul,popcnt,sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
 static wuffs_base__empty_struct
 wuffs_crc32__ieee_hasher__up_x86_sse42(
     wuffs_crc32__ieee_hasher* self,
@@ -33533,9 +33543,7 @@
 // -------- func png.decoder.filter_1_distance_4_x86_sse42
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("pclmul,popcnt,sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
 static wuffs_base__empty_struct
 wuffs_png__decoder__filter_1_distance_4_x86_sse42(
     wuffs_png__decoder* self,
@@ -33579,9 +33587,7 @@
 // -------- func png.decoder.filter_3_distance_4_x86_sse42
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("pclmul,popcnt,sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
 static wuffs_base__empty_struct
 wuffs_png__decoder__filter_3_distance_4_x86_sse42(
     wuffs_png__decoder* self,
@@ -33684,9 +33690,7 @@
 // -------- func png.decoder.filter_4_distance_3_x86_sse42
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("pclmul,popcnt,sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
 static wuffs_base__empty_struct
 wuffs_png__decoder__filter_4_distance_3_x86_sse42(
     wuffs_png__decoder* self,
@@ -33810,9 +33814,7 @@
 // -------- func png.decoder.filter_4_distance_4_x86_sse42
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("pclmul,popcnt,sse4.2")))
-#endif
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
 static wuffs_base__empty_struct
 wuffs_png__decoder__filter_4_distance_4_x86_sse42(
     wuffs_png__decoder* self,