Merge branch 'dev'
diff --git a/CMakeLists.txt b/CMakeLists.txt index a646210..9a397bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.10) -project(xbyak LANGUAGES CXX VERSION 7.36.2) +project(xbyak LANGUAGES CXX VERSION 7.37) file(GLOB headers xbyak/*.h)
diff --git a/doc/changelog.md b/doc/changelog.md index b08fd1b..7374a83 100644 --- a/doc/changelog.md +++ b/doc/changelog.md
@@ -1,5 +1,6 @@ # History +* 2026/Apr/27 ver 7.37 remove Xeon Phi-specific instructions/AMX_COMPLEX detection/CpuTopology old Win SDK support * 2026/Apr/17 ver 7.36.2 add fallback when "/sys/devices/cpu_{core,atom}/cpus" does not exist * 2026/Apr/16 ver 7.36.1 fix the construction of StackFrame * 2026/Apr/14 ver 7.36 util::StackFrame supports Use{RSI,RDI,RBP,RBPAsFramePointer}
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index c824d18..c3bae76 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp
@@ -961,16 +961,6 @@ puts("void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }"); } -void putV4FMA() -{ - puts("void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x9A); }"); - puts("void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); }"); - puts("void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_W0 | T_MUST_EVEX | T_N16, 0x9B); }"); - puts("void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_W0 | T_MUST_EVEX | T_N16, 0xAB); }"); - puts("void vp4dpwssd(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x52); }"); - puts("void vp4dpwssds(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x53); }"); -} - void putFP16_1() { const struct Tbl { @@ -1199,7 +1189,6 @@ putX_XM_IMM(); putMisc(); putScatter(); - putV4FMA(); putFP16(); putAVX10_2(); }
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 2a49fda..5d6c655 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp
@@ -557,7 +557,7 @@ { 2, "t1", 0x18}, { 3, "t2", 0x18}, { 0, "nta", 0x18}, - { 2, "wt1", 0x0D}, +// { 2, "wt1", 0x0D}, { 1, "w", 0x0D}, { 7, "it0", 0x18}, { 6, "it1", 0x18},
diff --git a/meson.build b/meson.build index b38c45e..2dfab10 100644 --- a/meson.build +++ b/meson.build
@@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '7.36.2', + version: '7.37', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' )
diff --git a/readme.md b/readme.md index 0d62ea6..f3d0306 100644 --- a/readme.md +++ b/readme.md
@@ -1,5 +1,5 @@ -# Xbyak 7.36.2 [![Badge Build]][Build Status] +# Xbyak 7.37 [![Badge Build]][Build Status] *A JIT assembler for x86/x64 architectures supporting advanced instruction sets up to AVX10.2*
diff --git a/readme.txt b/readme.txt index ea1888a..204ffaf 100644 --- a/readme.txt +++ b/readme.txt
@@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.36.2 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.37 ----------------------------------------------------------------------------- ◎概要 @@ -404,6 +404,7 @@ ----------------------------------------------------------------------------- ◎履歴 +2026/04/27 ver 7.37 Xeon Phi専用命令の削除/AMX_COMPLEX検出対応/CpuTopologyの古いWin SDK対応 2026/04/17 ver 7.36.2 /sys/devices/cpu_{core,atom}/cpusが存在しないときのfallbackを追加 2026/04/16 ver 7.36.1 StackFrameの構築方法を修正 2026/04/14 ver 7.36 util::StackFrameがUse{RSI,RDI,RBP,RBPAsFramePointer}対応
diff --git a/sample/test_util.cpp b/sample/test_util.cpp index d2382de..bfa6870 100644 --- a/sample/test_util.cpp +++ b/sample/test_util.cpp
@@ -56,20 +56,15 @@ { Cpu::tRTM, "rtm" }, { Cpu::tMPX, "mpx" }, { Cpu::tSHA, "sha" }, - { Cpu::tPREFETCHWT1, "prefetchwt1" }, { Cpu::tF16C, "f16c" }, { Cpu::tMOVBE, "movbe" }, { Cpu::tAVX512F, "avx512f" }, { Cpu::tAVX512DQ, "avx512dq" }, { Cpu::tAVX512IFMA, "avx512_ifma" }, - { Cpu::tAVX512PF, "avx512pf" }, - { Cpu::tAVX512ER, "avx512er" }, { Cpu::tAVX512CD, "avx512cd" }, { Cpu::tAVX512BW, "avx512bw" }, { Cpu::tAVX512VL, "avx512vl" }, { Cpu::tAVX512VBMI, "avx512_vbmi" }, - { Cpu::tAVX512_4VNNIW, "avx512_4vnniw" }, - { Cpu::tAVX512_4FMAPS, "avx512_4fmaps" }, { Cpu::tAVX512_VBMI2, "avx512_vbmi2" }, { Cpu::tGFNI, "gfni" }, @@ -119,6 +114,7 @@ { Cpu::tAMX_MOVRS, "amx_movrs" }, { Cpu::tMOVRS, "movrs" }, { Cpu::tHYBRID, "hybrid" }, + { Cpu::tAMX_COMPLEX, "amx_complex" }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/test/dataset/old.txt b/test/dataset/old.txt index 19a4995..891fc0c 100644 --- a/test/dataset/old.txt +++ b/test/dataset/old.txt
@@ -1,9 +1,3 @@ -v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); -v4fmaddss(xmm15, xmm8, ptr [rax + 64]); -v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]); -v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); -vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); -vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); vaesdec(xmm20, xmm30, ptr [rcx + 64]); vaesdec(ymm1, ymm2, ptr [rcx + 64]); vaesdec(zmm1, zmm2, ptr [rcx + 64]);
diff --git a/test/make_nm.cpp b/test/make_nm.cpp index 7e4f5b2..ffb8441 100644 --- a/test/make_nm.cpp +++ b/test/make_nm.cpp
@@ -731,7 +731,7 @@ put("prefetcht1", MEM); put("prefetcht2", MEM); put("prefetchnta", MEM); - put("prefetchwt1", MEM); +// put("prefetchwt1", MEM); put("prefetchw", MEM); // SSE2 misc
diff --git a/test/misc.cpp b/test/misc.cpp index d2456c1..8e616e9 100644 --- a/test/misc.cpp +++ b/test/misc.cpp
@@ -296,31 +296,6 @@ } #ifdef XBYAK64 -CYBOZU_TEST_AUTO(vfmaddps) -{ - struct Code : Xbyak::CodeGenerator { - Code() - { - v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); - v4fmaddss(xmm15, xmm8, ptr [rax + 64]); - v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]); - v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); - vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); - vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); - } - } c; - const uint8_t tbl[] = { - 0x62, 0xf2, 0x3f, 0x48, 0x9a, 0x4a, 0x04, - 0x62, 0x72, 0x3f, 0x08, 0x9b, 0x78, 0x04, - 0x62, 0xf2, 0x6f, 0x4d, 0xaa, 0x69, 0x08, - 0x62, 0x62, 0x6f, 0x08, 0xab, 0x7c, 0x24, 0x08, - 0x62, 0xe2, 0x77, 0xcf, 0x52, 0x78, 0x04, - 0x62, 0x72, 0x67, 0x4c, 0x53, 0x54, 0x84, 0x04, - }; - const size_t n = sizeof(tbl) / sizeof(tbl[0]); - CYBOZU_TEST_EQUAL(c.getSize(), n); - CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); -} CYBOZU_TEST_AUTO(vaes) { struct Code : Xbyak::CodeGenerator {
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index e9e139f..127fd41 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h
@@ -176,7 +176,7 @@ enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7362 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7370 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 2810b20..94288e7 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.36.2"; } +const char *getVersionString() const { return "7.37"; } void aadd(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } @@ -853,7 +853,6 @@ void prefetcht1(const Address& addr) { opMR(addr, Reg32(2), T_0F, 0x18); } void prefetcht2(const Address& addr) { opMR(addr, Reg32(3), T_0F, 0x18); } void prefetchw(const Address& addr) { opMR(addr, Reg32(1), T_0F, 0x0D); } -void prefetchwt1(const Address& addr) { opMR(addr, Reg32(2), T_0F, 0x0D); } void psadbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF6); } void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, T_0F38, T_66); } void pshufd(const Mmx& mmx, const Operand& op, uint8_t imm8) { opMMX(mmx, op, 0x70, T_0F, T_66, imm8); } @@ -2036,10 +2035,6 @@ void kxord(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W1, 0x47); } void kxorq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x47); } void kxorw(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x47); } -void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x9A); } -void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_W0 | T_MUST_EVEX | T_N16, 0x9B); } -void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); } -void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_W0 | T_MUST_EVEX | T_N16, 0xAB); } void vaddbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_W0|T_YMM|T_MUST_EVEX|T_B16, 0x58); } void vaddph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_W0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x58); } void vaddsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_W0 | T_MUST_EVEX | T_ER_X | T_N2, 0x58); } @@ -2437,8 +2432,6 @@ void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_W0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); } void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_W0 | T_B32, 0x68); } void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); } -void vp4dpwssd(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x52); } -void vp4dpwssds(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x53); } void vpabsq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_MUST_EVEX | T_EW1 | T_B64 | T_YMM, 0x1F); } void vpandd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_W0|T_YMM|T_MUST_EVEX|T_B32, 0xDB); } void vpandnd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_W0|T_YMM|T_MUST_EVEX|T_B32, 0xDF); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index a4f4c8b..a75d3f9 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h
@@ -528,16 +528,16 @@ XBYAK_DEFINE_TYPE(36, tAVX512DQ); XBYAK_DEFINE_TYPE(37, tAVX512_IFMA); XBYAK_DEFINE_TYPE(37, tAVX512IFMA);// = tAVX512_IFMA; - XBYAK_DEFINE_TYPE(38, tAVX512PF); - XBYAK_DEFINE_TYPE(39, tAVX512ER); +// XBYAK_DEFINE_TYPE(38, tAVX512PF); // Xeon Phi only +// XBYAK_DEFINE_TYPE(39, tAVX512ER); XBYAK_DEFINE_TYPE(40, tAVX512CD); XBYAK_DEFINE_TYPE(41, tAVX512BW); XBYAK_DEFINE_TYPE(42, tAVX512VL); XBYAK_DEFINE_TYPE(43, tAVX512_VBMI); XBYAK_DEFINE_TYPE(43, tAVX512VBMI); // = tAVX512_VBMI; // changed by Intel's manual - XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW); - XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS); - XBYAK_DEFINE_TYPE(46, tPREFETCHWT1); +// XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW); +// XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS); +// XBYAK_DEFINE_TYPE(46, tPREFETCHWT1); XBYAK_DEFINE_TYPE(47, tPREFETCHW); XBYAK_DEFINE_TYPE(48, tSHA); XBYAK_DEFINE_TYPE(49, tMPX); @@ -589,6 +589,7 @@ XBYAK_DEFINE_TYPE(95, tAMX_FP8); XBYAK_DEFINE_TYPE(96, tMOVRS); XBYAK_DEFINE_TYPE(97, tHYBRID); + XBYAK_DEFINE_TYPE(98, tAMX_COMPLEX); #undef XBYAK_SPLIT_ID #undef XBYAK_DEFINE_TYPE @@ -681,8 +682,6 @@ if (type_ & tAVX512F) { if (ebx & (1U << 17)) type_ |= tAVX512DQ; if (ebx & (1U << 21)) type_ |= tAVX512_IFMA; - if (ebx & (1U << 26)) type_ |= tAVX512PF; - if (ebx & (1U << 27)) type_ |= tAVX512ER; if (ebx & (1U << 28)) type_ |= tAVX512CD; if (ebx & (1U << 30)) type_ |= tAVX512BW; if (ebx & (1U << 31)) type_ |= tAVX512VL; @@ -691,8 +690,6 @@ if (ecx & (1U << 11)) type_ |= tAVX512_VNNI; if (ecx & (1U << 12)) type_ |= tAVX512_BITALG; if (ecx & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; - if (edx & (1U << 2)) type_ |= tAVX512_4VNNIW; - if (edx & (1U << 3)) type_ |= tAVX512_4FMAPS; if (edx & (1U << 8)) type_ |= tAVX512_VP2INTERSECT; if ((type_ & tAVX512BW) && (edx & (1U << 23))) type_ |= tAVX512_FP16; } @@ -715,7 +712,6 @@ if (ebx & (1U << 23)) type_ |= tCLFLUSHOPT; if (ebx & (1U << 24)) type_ |= tCLWB; if (ebx & (1U << 29)) type_ |= tSHA; - if (ecx & (1U << 0)) type_ |= tPREFETCHWT1; if (ecx & (1U << 5)) type_ |= tWAITPKG; if (ecx & (1U << 8)) type_ |= tGFNI; if (ecx & (1U << 9)) type_ |= tVAES; @@ -747,6 +743,7 @@ if (eax & (1U << 31)) type_ |= tMOVRS; if (edx & (1U << 4)) type_ |= tAVX_VNNI_INT8; if (edx & (1U << 5)) type_ |= tAVX_NE_CONVERT; + if (edx & (1U << 8)) type_ |= tAMX_COMPLEX; if (edx & (1U << 10)) type_ |= tAVX_VNNI_INT16; if (edx & (1U << 14)) type_ |= tPREFETCHITI; if (edx & (1U << 19)) type_ |= tAVX10; @@ -1298,11 +1295,57 @@ #endif } +// fall back to CPUID leaf 0x1A +inline CoreType getCoreType() +{ + uint32_t data[4] = {}; + Cpu::getCpuidEx(0x1A, 0, data); + const uint32_t coreTypeField = (data[0] >> 24) & 0xFF; + if (coreTypeField == 0x40) return Performance; // P-core + if (coreTypeField == 0x20) return Efficient; // E-core + return Standard; +} + #ifdef _WIN32 typedef std::vector<uint32_t> U32Vec; + +#if (defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x06010000) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0601) + #define XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY 1 +#else + #define XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY 0 +#endif + +#if (defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x0A000000) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0A00) + #define XBYAK_WINSDK_HAS_EFFICIENCY_CLASS 1 +#else + #define XBYAK_WINSDK_HAS_EFFICIENCY_CLASS 0 +#endif + +// GroupMasks[] / GroupCount on CACHE_RELATIONSHIP added in Win10 20H1 (SDK 10.0.19041, NTDDI_WIN10_VB) +// NOTE: _WIN32_WINNT has no sub-version granularity for Win10, so only +// NTDDI_VERSION can distinguish 20H1 (0x0A00000C) from earlier Win10 builds. +// If NTDDI_VERSION is not set, this macro will be 0 (safe/conservative fallback). +#if defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x0A00000C + #define XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS 1 +#else + #define XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS 0 +#endif + +#if XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY typedef SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ProcInfo; +inline CoreType getCoreTypeForAffinity(const GROUP_AFFINITY& affinity) +{ + GROUP_AFFINITY previousMask = {}; + if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, &previousMask)) { + return Standard; + } + CoreType type = impl::getCoreType(); + SetThreadGroupAffinity(GetCurrentThread(), &previousMask, NULL); + return type; +} + // return total logical cpus if sucessful, 0 if failed inline uint32_t getGroupAcc(U32Vec& v) { @@ -1348,10 +1391,12 @@ cpu.coreId = coreIdx++; if (!isHybrid) { cpu.coreType = Standard; - } else if (core.EfficiencyClass > 0) { - cpu.coreType = Performance; } else { - cpu.coreType = Efficient; +#if XBYAK_WINSDK_HAS_EFFICIENCY_CLASS + cpu.coreType = core.EfficiencyClass > 0 ? Performance : Efficient; +#else + cpu.coreType = getCoreTypeForAffinity(core.GroupMask[0]); +#endif } const GROUP_AFFINITY* masks = core.GroupMask; @@ -1376,13 +1421,19 @@ inline bool convertMask(CpuMask& mask, const U32Vec& groupAcc, const CACHE_RELATIONSHIP& cache) { - const GROUP_AFFINITY* masks = cache.GroupMasks; - - for (WORD i = 0; i < cache.GroupCount; i++) { - const WORD group = masks[i].Group; - const KAFFINITY m = masks[i].Mask; - const uint32_t base = groupAcc[group]; - +#if XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS + const WORD count = cache.GroupCount; +#else + const WORD count = 1; +#endif + for (WORD i = 0; i < count; i++) { +#if XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS + const GROUP_AFFINITY& cg = cache.GroupMasks[i]; +#else + const GROUP_AFFINITY& cg = cache.GroupMask; +#endif + const KAFFINITY m = cg.Mask; + const uint32_t base = groupAcc[cg.Group]; for (uint32_t b = 0; b < sizeof(KAFFINITY) * 8; b++) { if (m & (KAFFINITY(1) << b)) { if (!mask.append(base + b)) return false; @@ -1443,7 +1494,17 @@ } return true; } - +#else +inline bool initCpuTopology(CpuTopology& cpuTopo) +{ + (void)cpuTopo; + return false; +} +#endif +// unset WinSDK version macros to avoid Macro pollution +#undef XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY +#undef XBYAK_WINSDK_HAS_EFFICIENCY_CLASS +#undef XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS #elif defined(__linux__) // Linux struct WrapFILE { @@ -1473,6 +1534,15 @@ return setStr(mask, buf); } +inline CoreType setAffinityAndGetCoreType(uint32_t cpu) +{ + cpu_set_t cpuMask; + CPU_ZERO(&cpuMask); + CPU_SET(cpu, &cpuMask); + if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuMask)) return Standard; + return impl::getCoreType(); +} + inline bool initCpuTopology(CpuTopology& cpuTopo) { const uint32_t logicalCpuNum = sysconf(_SC_NPROCESSORS_ONLN); @@ -1590,36 +1660,15 @@ } } // Fallback: if either sysfs paths are unavailable, detect both core type per-CPU - // via CPUID leaf 0x1A (Hybrid Information) by pinning each logical CPU. if (!hasPCoreSysfs || !hasECoreSysfs) { - // CPUID leaf 0x1A EAX[31:24] core type identifiers - const uint32_t Cpuid_StandardCoreType = 0x40; // P-core (Performance) - const uint32_t Cpuid_AtomCoreType = 0x20; // E-core (Efficient) - cpu_set_t originalMask; CPU_ZERO(&originalMask); - if (sched_getaffinity(0, sizeof(cpu_set_t), &originalMask) != 0) goto SKIP_FALLBACK; - - for (uint32_t cpu = 0; cpu < logicalCpuNum; cpu++) { - cpu_set_t cpuMask; - CPU_ZERO(&cpuMask); - CPU_SET(cpu, &cpuMask); - if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuMask) == 0) { - // CPUID leaf 0x1A: Hybrid Information - uint32_t data[4] = {}; - Cpu::getCpuidEx(0x1A, 0, data); - const uint32_t coreTypeField = (data[0] >> 24) & 0xFF; - if (coreTypeField == Cpuid_StandardCoreType) { - cpuTopo.logicalCpus_[cpu].coreType = Performance; - } else if (coreTypeField == Cpuid_AtomCoreType) { - cpuTopo.logicalCpus_[cpu].coreType = Efficient; - } + if (sched_getaffinity(0, sizeof(cpu_set_t), &originalMask) == 0) { + for (uint32_t cpu = 0; cpu < logicalCpuNum; cpu++) { + cpuTopo.logicalCpus_[cpu].coreType = impl::setAffinityAndGetCoreType(cpu); } + sched_setaffinity(0, sizeof(cpu_set_t), &originalMask); } - - // Restore the original CPU affinity mask - sched_setaffinity(0, sizeof(cpu_set_t), &originalMask); - SKIP_FALLBACK:; } }