Include <immintrin.h>, <nmmintrin.h> and <wmmintrin.h> for MSVC+X64
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index 6952a77..4935e8e 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -94,7 +94,17 @@
 // of individual functions (that can be conditionally selected at runtime).
 #error "Wuffs with MSVC+X64 needs /arch:AVX or /DWUFFS_CONFIG__AVOID_CPU_ARCH"
 #endif  // defined(__clang__); !defined(__AVX__)
+// We need <intrin.h> for the __cpuid function.
 #include <intrin.h>
+// That's not enough for X64 SIMD, with clang-cl, if we want to use
+// "__attribute__((target(arg)))" without e.g. "/arch:AVX".
+//
+// Some web pages suggest that <immintrin.h> is all you need, as it pulls in
+// the earlier SIMD families like SSE4.2, but that doesn't seem to work in
+// practice, possibly for the same reason that just <intrin.h> doesn't work.
+#include <immintrin.h>  // AVX, AVX2, FMA, POPCNT
+#include <nmmintrin.h>  // SSE4.2
+#include <wmmintrin.h>  // AES, PCLMUL
 #define WUFFS_BASE__CPU_ARCH__X86_64
 #endif  // defined(_M_X64)
 
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index ee6cfaf..fbb663f 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -56,7 +56,7 @@
 	"" +
 	"// ---------------- Configuration\n\n// Define WUFFS_CONFIG__AVOID_CPU_ARCH to avoid any code tied to a specific CPU\n// architecture, such as SSE SIMD for the x86 CPU family.\n#if defined(WUFFS_CONFIG__AVOID_CPU_ARCH)  // (#if-chain ref AVOID_CPU_ARCH_0)\n// No-op.\n#else  // (#if-chain ref AVOID_CPU_ARCH_0)\n\n// The \"defined(__clang__)\" isn't redundant. While vanilla clang defines\n// __GNUC__, clang-cl (which mimics MSVC's cl.exe) does not.\n#if defined(__GNUC__) || defined(__clang__)\n#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg) __attribute__((target(arg)))\n#else\n#define WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(arg)\n#endif  // defined(__GNUC__) || defined(__clang__)\n\n#if defined(__GNUC__)  // (#if-chain ref AVOID_CPU_ARCH_1)\n\n// To simplify Wuffs code, \"cpu_arch >= arm_xxx\" requires xxx but also\n// unaligned little-endian load/stores.\n#if defined(__ARM_FEATURE_UNALIGNED) && defined(__BYTE_ORDER__) && \\\n    (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)\n// Not all gcc versions define __ARM_ACLE, even if they support crc32" +
 	"\n// intrinsics. Look for __ARM_FEATURE_CRC32 instead.\n#if defined(__ARM_FEATURE_CRC32)\n#include <arm_acle.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_CRC32\n#endif  // defined(__ARM_FEATURE_CRC32)\n#if defined(__ARM_NEON)\n#include <arm_neon.h>\n#define WUFFS_BASE__CPU_ARCH__ARM_NEON\n#endif  // defined(__ARM_NEON)\n#endif  // defined(__ARM_FEATURE_UNALIGNED) etc\n\n// Similarly, \"cpu_arch >= x86_sse42\" requires SSE4.2 but also PCLMUL and\n// POPCNT. This is checked at runtime via cpuid, not at compile time.\n#if defined(__x86_64__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(__x86_64__)\n\n#elif defined(_MSC_VER)  // (#if-chain ref AVOID_CPU_ARCH_1)\n\n#if defined(_M_X64)\n#if defined(__clang__)\n// No-op. clang-cl (which defines both __clang__ and _MSC_VER) supports\n// \"__attribute__((target(arg)))\".\n#elif !defined(__AVX__)\n// For MSVC's cl.exe (unlike clang or gcc), SIMD capability is a compile-time\n// property of the source file (e.g. a /arch:AVX or -mavx compiler flag), not" +
-	"\n// of individual functions (that can be conditionally selected at runtime).\n#error \"Wuffs with MSVC+X64 needs /arch:AVX or /DWUFFS_CONFIG__AVOID_CPU_ARCH\"\n#endif  // defined(__clang__); !defined(__AVX__)\n#include <intrin.h>\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(_M_X64)\n\n#endif  // (#if-chain ref AVOID_CPU_ARCH_1)\n#endif  // (#if-chain ref AVOID_CPU_ARCH_0)\n\n" +
+	"\n// of individual functions (that can be conditionally selected at runtime).\n#error \"Wuffs with MSVC+X64 needs /arch:AVX or /DWUFFS_CONFIG__AVOID_CPU_ARCH\"\n#endif  // defined(__clang__); !defined(__AVX__)\n// We need <intrin.h> for the __cpuid function.\n#include <intrin.h>\n// That's not enough for X64 SIMD, with clang-cl, if we want to use\n// \"__attribute__((target(arg)))\" without e.g. \"/arch:AVX\".\n//\n// Some web pages suggest that <immintrin.h> is all you need, as it pulls in\n// the earlier SIMD families like SSE4.2, but that doesn't seem to work in\n// practice, possibly for the same reason that just <intrin.h> doesn't work.\n#include <immintrin.h>  // AVX, AVX2, FMA, POPCNT\n#include <nmmintrin.h>  // SSE4.2\n#include <wmmintrin.h>  // AES, PCLMUL\n#define WUFFS_BASE__CPU_ARCH__X86_64\n#endif  // defined(_M_X64)\n\n#endif  // (#if-chain ref AVOID_CPU_ARCH_1)\n#endif  // (#if-chain ref AVOID_CPU_ARCH_0)\n\n" +
 	"" +
 	"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif  // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
 	"" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 8e1b49a..b6988cb 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -128,7 +128,17 @@
 // of individual functions (that can be conditionally selected at runtime).
 #error "Wuffs with MSVC+X64 needs /arch:AVX or /DWUFFS_CONFIG__AVOID_CPU_ARCH"
 #endif  // defined(__clang__); !defined(__AVX__)
+// We need <intrin.h> for the __cpuid function.
 #include <intrin.h>
+// That's not enough for X64 SIMD, with clang-cl, if we want to use
+// "__attribute__((target(arg)))" without e.g. "/arch:AVX".
+//
+// Some web pages suggest that <immintrin.h> is all you need, as it pulls in
+// the earlier SIMD families like SSE4.2, but that doesn't seem to work in
+// practice, possibly for the same reason that just <intrin.h> doesn't work.
+#include <immintrin.h>  // AVX, AVX2, FMA, POPCNT
+#include <nmmintrin.h>  // SSE4.2
+#include <wmmintrin.h>  // AES, PCLMUL
 #define WUFFS_BASE__CPU_ARCH__X86_64
 #endif  // defined(_M_X64)