runtime detection for AVX-512
This has nothing to do with my desktop supporting AVX-512.
Definitely not.
There's one very minor interesting change here to work around an odd
interaction between Clang's AVX-512 headers and integer santizer.
The instruction issued is the same... everything boils down to masks
in the end with AVX-512.
We need to manually include some headers for Clang/Win to see the types
we're talking about for a few of these setups. Clang has "helpful"
guards in immintrin.h for every header it might include, designed to
speed up compilation on Windows but in this case kind of a hindrance:
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FOO__)
#include <foointrin.h>
#endif
An alternative is to maybe temporarily define the various __FOO__,
or perhaps temporarily undefine _MSC_VER, but who knows what kind of
knock-on efffects that'd have...
Change-Id: Id188844e0eb2602cad4841e4a7e6e72f18a4fc8a
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/206618
Auto-Submit: Mike Klein <mtklein@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/skcms.cc b/skcms.cc
index e3e599d..3d97dd1 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -17,6 +17,19 @@
#include <arm_neon.h>
#elif defined(__SSE__)
#include <immintrin.h>
+
+ #if defined(__clang__)
+ // That #include <immintrin.h> is usually enough, but Clang's headers
+ // "helpfully" skip including the whole kitchen sink when _MSC_VER is
+ // defined, because lots of programs on Windows would include that and
+ // it'd be a lot slower. But we want all those headers included so we
+ // can use their features after runtime checks later.
+ #include <smmintrin.h>
+ #include <avxintrin.h>
+ #include <avx2intrin.h>
+ #include <avx512fintrin.h>
+ #include <avx512dqintrin.h>
+ #endif
#endif
// sizeof(x) will return size_t, which is 32-bit on some machines and 64-bit on others.
@@ -1864,80 +1877,127 @@
#if !defined(SKCMS_PORTABLE) && \
(( defined(__clang__) && __clang_major__ >= 5) || \
(!defined(__clang__) && defined(__GNUC__))) \
- && defined(__x86_64__) && !defined(__AVX2__)
+ && defined(__x86_64__)
- #if defined(__clang__)
- #pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
- #elif defined(__GNUC__)
- #pragma GCC push_options
- #pragma GCC target("avx2,f16c")
+ #if !defined(__AVX2__)
+ #if defined(__clang__)
+ #pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
+ #elif defined(__GNUC__)
+ #pragma GCC push_options
+ #pragma GCC target("avx2,f16c")
+ #endif
+
+ namespace hsw {
+ #define USING_AVX
+ #define USING_AVX_F16C
+ #define USING_AVX2
+ #define N 8
+ using F = Vec<N,float>;
+ using I32 = Vec<N,int32_t>;
+ using U64 = Vec<N,uint64_t>;
+ using U32 = Vec<N,uint32_t>;
+ using U16 = Vec<N,uint16_t>;
+ using U8 = Vec<N,uint8_t>;
+
+ #include "src/Transform_inl.h"
+
+ // src/Transform_inl.h will undefine USING_* for us.
+ #undef N
+ }
+
+ #if defined(__clang__)
+ #pragma clang attribute pop
+ #elif defined(__GNUC__)
+ #pragma GCC pop_options
+ #endif
+
+ #define TEST_FOR_HSW
#endif
- namespace hsw {
- #define USING_AVX
- #define USING_AVX_F16C
- #define USING_AVX2
- #define N 8
- using F = Vec<N,float>;
- using I32 = Vec<N,int32_t>;
- using U64 = Vec<N,uint64_t>;
- using U32 = Vec<N,uint32_t>;
- using U16 = Vec<N,uint16_t>;
- using U8 = Vec<N,uint8_t>;
+ #if !defined(__AVX512F__)
+ #if defined(__clang__)
+ #pragma clang attribute push(__attribute__((target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl"))), apply_to=function)
+ #elif defined(__GNUC__)
+ #pragma GCC push_options
+ #pragma GCC target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl")
+ #endif
- #include "src/Transform_inl.h"
+ namespace skx {
+ #define USING_AVX512F
+ #define N 16
+ using F = Vec<N,float>;
+ using I32 = Vec<N,int32_t>;
+ using U64 = Vec<N,uint64_t>;
+ using U32 = Vec<N,uint32_t>;
+ using U16 = Vec<N,uint16_t>;
+ using U8 = Vec<N,uint8_t>;
- // src/Transform_inl.h will undefine USING_* for us.
- #undef N
- }
+ #include "src/Transform_inl.h"
- #if defined(__clang__)
- #pragma clang attribute pop
- #elif defined(__GNUC__)
- #pragma GCC pop_options
+ // src/Transform_inl.h will undefine USING_* for us.
+ #undef N
+ }
+
+ #if defined(__clang__)
+ #pragma clang attribute pop
+ #elif defined(__GNUC__)
+ #pragma GCC pop_options
+ #endif
+
+ #define TEST_FOR_SKX
#endif
- #define TEST_FOR_HSW
+ #if defined(TEST_FOR_HSW) || defined(TEST_FOR_SKX)
+ enum class CpuType { None, HSW, SKX };
+ static CpuType cpu_type() {
+ static const CpuType type = []{
+ // See http://www.sandpile.org/x86/cpuid.htm
- static bool hsw_ok() {
- static const bool ok = []{
- // See http://www.sandpile.org/x86/cpuid.htm
+ // First, a basic cpuid(1) lets us check prerequisites for HSW, SKX.
+ uint32_t eax, ebx, ecx, edx;
+ __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+ : "0"(1), "2"(0));
+ if ((edx & (1u<<25)) && // SSE
+ (edx & (1u<<26)) && // SSE2
+ (ecx & (1u<< 0)) && // SSE3
+ (ecx & (1u<< 9)) && // SSSE3
+ (ecx & (1u<<12)) && // FMA (N.B. not used, avoided even)
+ (ecx & (1u<<19)) && // SSE4.1
+ (ecx & (1u<<20)) && // SSE4.2
+ (ecx & (1u<<26)) && // XSAVE
+ (ecx & (1u<<27)) && // OSXSAVE
+ (ecx & (1u<<28)) && // AVX
+ (ecx & (1u<<29))) { // F16C
- // First, a basic cpuid(1).
- uint32_t eax, ebx, ecx, edx;
- __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
- : "0"(1), "2"(0));
+ // Call cpuid(7) to check for AVX2 and AVX-512 bits.
+ __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+ : "0"(7), "2"(0));
+ // eax from xgetbv(0) will tell us whether XMM, YMM, and ZMM state is saved.
+ uint32_t xcr0, dont_need_edx;
+ __asm__ __volatile__("xgetbv" : "=a"(xcr0), "=d"(dont_need_edx) : "c"(0));
- // Sanity check for prerequisites.
- if ((edx & (1<<25)) != (1<<25)) { return false; } // SSE
- if ((edx & (1<<26)) != (1<<26)) { return false; } // SSE2
- if ((ecx & (1<< 0)) != (1<< 0)) { return false; } // SSE3
- if ((ecx & (1<< 9)) != (1<< 9)) { return false; } // SSSE3
- if ((ecx & (1<<19)) != (1<<19)) { return false; } // SSE4.1
- if ((ecx & (1<<20)) != (1<<20)) { return false; } // SSE4.2
-
- if ((ecx & (3<<26)) != (3<<26)) { return false; } // XSAVE + OSXSAVE
-
- {
- uint32_t eax_xgetbv, edx_xgetbv;
- __asm__ __volatile__("xgetbv" : "=a"(eax_xgetbv), "=d"(edx_xgetbv) : "c"(0));
- if ((eax_xgetbv & (3<<1)) != (3<<1)) { return false; } // XMM+YMM state saved?
- }
-
- if ((ecx & (1<<28)) != (1<<28)) { return false; } // AVX
- if ((ecx & (1<<29)) != (1<<29)) { return false; } // F16C
- if ((ecx & (1<<12)) != (1<<12)) { return false; } // FMA (TODO: not currently used)
-
- // Call cpuid(7) to check for our final AVX2 feature bit!
- __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
- : "0"(7), "2"(0));
- if ((ebx & (1<< 5)) != (1<< 5)) { return false; } // AVX2
-
- return true;
- }();
-
- return ok;
- }
+ if ((xcr0 & (1u<<1)) && // XMM register state saved?
+ (xcr0 & (1u<<2)) && // YMM register state saved?
+ (ebx & (1u<<5))) { // AVX2
+ // At this point we're at least HSW. Continue checking for SKX.
+ if ((xcr0 & (1u<< 5)) && // Opmasks state saved?
+ (xcr0 & (1u<< 6)) && // First 16 ZMM registers saved?
+ (xcr0 & (1u<< 7)) && // High 16 ZMM registers saved?
+ (ebx & (1u<<16)) && // AVX512F
+ (ebx & (1u<<17)) && // AVX512DQ
+ (ebx & (1u<<28)) && // AVX512CD
+ (ebx & (1u<<30)) && // AVX512BW
+ (ebx & (1u<<31))) { // AVX512VL
+ return CpuType::SKX;
+ }
+ return CpuType::HSW;
+ }
+ }
+ return CpuType::None;
+ }();
+ return type;
+ }
+ #endif
#endif
@@ -2260,7 +2320,18 @@
auto run = baseline::run_program;
#if defined(TEST_FOR_HSW)
- if (hsw_ok()) { run = hsw::run_program; }
+ switch (cpu_type()) {
+ case CpuType::None: break;
+ case CpuType::HSW: run = hsw::run_program; break;
+ case CpuType::SKX: run = hsw::run_program; break;
+ }
+#endif
+#if defined(TEST_FOR_SKX)
+ switch (cpu_type()) {
+ case CpuType::None: break;
+ case CpuType::HSW: break;
+ case CpuType::SKX: run = skx::run_program; break;
+ }
#endif
run(program, arguments, (const char*)src, (char*)dst, n, src_bpp,dst_bpp);
return true;
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 69efc98..86854fc 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -43,6 +43,9 @@
#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
#define USING_AVX2
#endif
+#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__)
+ #define USING_AVX512F
+#endif
// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
// This is more for organizational clarity... skcms.cc doesn't force these.
@@ -138,7 +141,7 @@
SI F F_from_Half(U16 half) {
#if defined(USING_NEON_F16C)
return vcvt_f32_f16((float16x4_t)half);
-#elif defined(__AVX512F__)
+#elif defined(USING_AVX512F)
return (F)_mm512_cvtph_ps((__m256i)half);
#elif defined(USING_AVX_F16C)
typedef int16_t __attribute__((vector_size(16))) I16;
@@ -165,7 +168,7 @@
SI U16 Half_from_F(F f) {
#if defined(USING_NEON_F16C)
return (U16)vcvt_f16_f32(f);
-#elif defined(__AVX512F__)
+#elif defined(USING_AVX512F)
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
#elif defined(USING_AVX_F16C)
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
@@ -206,8 +209,12 @@
return floorf_(x);
#elif defined(__aarch64__)
return vrndmq_f32(x);
-#elif defined(__AVX512F__)
- return _mm512_floor_ps(x);
+#elif defined(USING_AVX512F)
+ // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
+ // and integer santizer catches that this implicit cast changes the
+ // value from -1 to 65535. We'll cast manually to work around it.
+ // Read this as `return _mm512_floor_ps(x)`.
+ return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
#elif defined(USING_AVX)
return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
#elif defined(__SSE4_1__)
@@ -1238,6 +1245,9 @@
#if defined(USING_AVX2)
#undef USING_AVX2
#endif
+#if defined(USING_AVX512F)
+ #undef USING_AVX512F
+#endif
#if defined(USING_NEON)
#undef USING_NEON