Merge branch 'dev'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a646210..9a397bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.10)
 
-project(xbyak LANGUAGES CXX VERSION 7.36.2)
+project(xbyak LANGUAGES CXX VERSION 7.37)
 
 file(GLOB headers xbyak/*.h)
 
diff --git a/doc/changelog.md b/doc/changelog.md
index b08fd1b..7374a83 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -1,5 +1,6 @@
 # History
 
+* 2026/Apr/27 ver 7.37 remove Xeon Phi-specific instructions, add AMX_COMPLEX detection, support old Windows SDKs in CpuTopology
 * 2026/Apr/17 ver 7.36.2 add fallback when "/sys/devices/cpu_{core,atom}/cpus" does not exist
 * 2026/Apr/16 ver 7.36.1 fix the construction of StackFrame
 * 2026/Apr/14 ver 7.36 util::StackFrame supports Use{RSI,RDI,RBP,RBPAsFramePointer}
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index c824d18..c3bae76 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -961,16 +961,6 @@
 	puts("void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }");
 }
 
-void putV4FMA()
-{
-	puts("void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x9A); }");
-	puts("void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); }");
-	puts("void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_W0 | T_MUST_EVEX | T_N16, 0x9B); }");
-	puts("void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_W0 | T_MUST_EVEX | T_N16, 0xAB); }");
-	puts("void vp4dpwssd(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x52); }");
-	puts("void vp4dpwssds(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x53); }");
-}
-
 void putFP16_1()
 {
 	const struct Tbl {
@@ -1199,7 +1189,6 @@
 	putX_XM_IMM();
 	putMisc();
 	putScatter();
-	putV4FMA();
 	putFP16();
 	putAVX10_2();
 }
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 2a49fda..5d6c655 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -557,7 +557,7 @@
 			{ 2, "t1", 0x18},
 			{ 3, "t2", 0x18},
 			{ 0, "nta", 0x18},
-			{ 2, "wt1", 0x0D},
+//			{ 2, "wt1", 0x0D},
 			{ 1, "w", 0x0D},
 			{ 7, "it0", 0x18},
 			{ 6, "it1", 0x18},
diff --git a/meson.build b/meson.build
index b38c45e..2dfab10 100644
--- a/meson.build
+++ b/meson.build
@@ -5,7 +5,7 @@
 project(
 	'xbyak',
 	'cpp',
-	version: '7.36.2',
+	version: '7.37',
 	license: 'BSD-3-Clause',
 	default_options: 'b_ndebug=if-release'
 )
diff --git a/readme.md b/readme.md
index 0d62ea6..f3d0306 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
 
-# Xbyak 7.36.2 [![Badge Build]][Build Status]
+# Xbyak 7.37 [![Badge Build]][Build Status]
 
 *A JIT assembler for x86/x64 architectures supporting advanced instruction sets up to AVX10.2*
 
diff --git a/readme.txt b/readme.txt
index ea1888a..204ffaf 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
 

-    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.36.2

+    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.37

 

 -----------------------------------------------------------------------------

 ◎概要

@@ -404,6 +404,7 @@
 -----------------------------------------------------------------------------

 ◎履歴

 

+2026/04/27 ver 7.37 Xeon Phi専用命令の削除/AMX_COMPLEX検出対応/CpuTopologyの古いWin SDK対応

 2026/04/17 ver 7.36.2 /sys/devices/cpu_{core,atom}/cpusが存在しないときのfallbackを追加

 2026/04/16 ver 7.36.1 StackFrameの構築方法を修正

 2026/04/14 ver 7.36 util::StackFrameがUse{RSI,RDI,RBP,RBPAsFramePointer}対応

diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index d2382de..bfa6870 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -56,20 +56,15 @@
 		{ Cpu::tRTM, "rtm" },
 		{ Cpu::tMPX, "mpx" },
 		{ Cpu::tSHA, "sha" },
-		{ Cpu::tPREFETCHWT1, "prefetchwt1" },
 		{ Cpu::tF16C, "f16c" },
 		{ Cpu::tMOVBE, "movbe" },
 		{ Cpu::tAVX512F, "avx512f" },
 		{ Cpu::tAVX512DQ, "avx512dq" },
 		{ Cpu::tAVX512IFMA, "avx512_ifma" },
-		{ Cpu::tAVX512PF, "avx512pf" },
-		{ Cpu::tAVX512ER, "avx512er" },
 		{ Cpu::tAVX512CD, "avx512cd" },
 		{ Cpu::tAVX512BW, "avx512bw" },
 		{ Cpu::tAVX512VL, "avx512vl" },
 		{ Cpu::tAVX512VBMI, "avx512_vbmi" },
-		{ Cpu::tAVX512_4VNNIW, "avx512_4vnniw" },
-		{ Cpu::tAVX512_4FMAPS, "avx512_4fmaps" },
 
 		{ Cpu::tAVX512_VBMI2, "avx512_vbmi2" },
 		{ Cpu::tGFNI, "gfni" },
@@ -119,6 +114,7 @@
 		{ Cpu::tAMX_MOVRS, "amx_movrs" },
 		{ Cpu::tMOVRS, "movrs" },
 		{ Cpu::tHYBRID, "hybrid" },
+		{ Cpu::tAMX_COMPLEX, "amx_complex" },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/test/dataset/old.txt b/test/dataset/old.txt
index 19a4995..891fc0c 100644
--- a/test/dataset/old.txt
+++ b/test/dataset/old.txt
@@ -1,9 +1,3 @@
-v4fmaddps(zmm1, zmm8, ptr [rdx + 64]);
-v4fmaddss(xmm15, xmm8, ptr [rax + 64]);
-v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);
-v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]);
-vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]);
-vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]);
 vaesdec(xmm20, xmm30, ptr [rcx + 64]);
 vaesdec(ymm1, ymm2, ptr [rcx + 64]);
 vaesdec(zmm1, zmm2, ptr [rcx + 64]);
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index 7e4f5b2..ffb8441 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -731,7 +731,7 @@
 		put("prefetcht1", MEM);
 		put("prefetcht2", MEM);
 		put("prefetchnta", MEM);
-		put("prefetchwt1", MEM);
+//		put("prefetchwt1", MEM);
 		put("prefetchw", MEM);
 
 		// SSE2 misc
diff --git a/test/misc.cpp b/test/misc.cpp
index d2456c1..8e616e9 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -296,31 +296,6 @@
 }
 
 #ifdef XBYAK64
-CYBOZU_TEST_AUTO(vfmaddps)
-{
-	struct Code : Xbyak::CodeGenerator {
-		Code()
-		{
-			v4fmaddps(zmm1, zmm8, ptr [rdx + 64]);
-			v4fmaddss(xmm15, xmm8, ptr [rax + 64]);
-			v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);
-			v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]);
-			vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]);
-			vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]);
-		}
-	} c;
-	const uint8_t tbl[] = {
-		0x62, 0xf2, 0x3f, 0x48, 0x9a, 0x4a, 0x04,
-		0x62, 0x72, 0x3f, 0x08, 0x9b, 0x78, 0x04,
-		0x62, 0xf2, 0x6f, 0x4d, 0xaa, 0x69, 0x08,
-		0x62, 0x62, 0x6f, 0x08, 0xab, 0x7c, 0x24, 0x08,
-		0x62, 0xe2, 0x77, 0xcf, 0x52, 0x78, 0x04,
-		0x62, 0x72, 0x67, 0x4c, 0x53, 0x54, 0x84, 0x04,
-	};
-	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
-	CYBOZU_TEST_EQUAL(c.getSize(), n);
-	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
-}
 CYBOZU_TEST_AUTO(vaes)
 {
 	struct Code : Xbyak::CodeGenerator {
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index e9e139f..127fd41 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -176,7 +176,7 @@
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x7362 /* 0xABCD = A.BC(.D) */
+	VERSION = 0x7370 /* 0xABCD = A.BC(.D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 2810b20..94288e7 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "7.36.2"; }
+const char *getVersionString() const { return "7.37"; }
 void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); }
 void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); }
 void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }
@@ -853,7 +853,6 @@
 void prefetcht1(const Address& addr) { opMR(addr, Reg32(2), T_0F, 0x18); }
 void prefetcht2(const Address& addr) { opMR(addr, Reg32(3), T_0F, 0x18); }
 void prefetchw(const Address& addr) { opMR(addr, Reg32(1), T_0F, 0x0D); }
-void prefetchwt1(const Address& addr) { opMR(addr, Reg32(2), T_0F, 0x0D); }
 void psadbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF6); }
 void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, T_0F38, T_66); }
 void pshufd(const Mmx& mmx, const Operand& op, uint8_t imm8) { opMMX(mmx, op, 0x70, T_0F, T_66, imm8); }
@@ -2036,10 +2035,6 @@
 void kxord(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W1, 0x47); }
 void kxorq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x47); }
 void kxorw(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x47); }
-void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x9A); }
-void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_W0 | T_MUST_EVEX | T_N16, 0x9B); }
-void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); }
-void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_W0 | T_MUST_EVEX | T_N16, 0xAB); }
 void vaddbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_W0|T_YMM|T_MUST_EVEX|T_B16, 0x58); }
 void vaddph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_W0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x58); }
 void vaddsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_W0 | T_MUST_EVEX | T_ER_X | T_N2, 0x58); }
@@ -2437,8 +2432,6 @@
 void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_W0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); }
 void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_W0 | T_B32, 0x68); }
 void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68); }
-void vp4dpwssd(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x52); }
-void vp4dpwssds(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_W0 | T_YMM | T_MUST_EVEX | T_N16, 0x53); }
 void vpabsq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_MUST_EVEX | T_EW1 | T_B64 | T_YMM, 0x1F); }
 void vpandd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_W0|T_YMM|T_MUST_EVEX|T_B32, 0xDB); }
 void vpandnd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_W0|T_YMM|T_MUST_EVEX|T_B32, 0xDF); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index a4f4c8b..a75d3f9 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -528,16 +528,16 @@
 	XBYAK_DEFINE_TYPE(36, tAVX512DQ);
 	XBYAK_DEFINE_TYPE(37, tAVX512_IFMA);
 	XBYAK_DEFINE_TYPE(37, tAVX512IFMA);// = tAVX512_IFMA;
-	XBYAK_DEFINE_TYPE(38, tAVX512PF);
-	XBYAK_DEFINE_TYPE(39, tAVX512ER);
+//	XBYAK_DEFINE_TYPE(38, tAVX512PF); // Xeon Phi only
+//	XBYAK_DEFINE_TYPE(39, tAVX512ER);
 	XBYAK_DEFINE_TYPE(40, tAVX512CD);
 	XBYAK_DEFINE_TYPE(41, tAVX512BW);
 	XBYAK_DEFINE_TYPE(42, tAVX512VL);
 	XBYAK_DEFINE_TYPE(43, tAVX512_VBMI);
 	XBYAK_DEFINE_TYPE(43, tAVX512VBMI); // = tAVX512_VBMI; // changed by Intel's manual
-	XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW);
-	XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS);
-	XBYAK_DEFINE_TYPE(46, tPREFETCHWT1);
+//	XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW);
+//	XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS);
+//	XBYAK_DEFINE_TYPE(46, tPREFETCHWT1);
 	XBYAK_DEFINE_TYPE(47, tPREFETCHW);
 	XBYAK_DEFINE_TYPE(48, tSHA);
 	XBYAK_DEFINE_TYPE(49, tMPX);
@@ -589,6 +589,7 @@
 	XBYAK_DEFINE_TYPE(95, tAMX_FP8);
 	XBYAK_DEFINE_TYPE(96, tMOVRS);
 	XBYAK_DEFINE_TYPE(97, tHYBRID);
+	XBYAK_DEFINE_TYPE(98, tAMX_COMPLEX);
 
 #undef XBYAK_SPLIT_ID
 #undef XBYAK_DEFINE_TYPE
@@ -681,8 +682,6 @@
 					if (type_ & tAVX512F) {
 						if (ebx & (1U << 17)) type_ |= tAVX512DQ;
 						if (ebx & (1U << 21)) type_ |= tAVX512_IFMA;
-						if (ebx & (1U << 26)) type_ |= tAVX512PF;
-						if (ebx & (1U << 27)) type_ |= tAVX512ER;
 						if (ebx & (1U << 28)) type_ |= tAVX512CD;
 						if (ebx & (1U << 30)) type_ |= tAVX512BW;
 						if (ebx & (1U << 31)) type_ |= tAVX512VL;
@@ -691,8 +690,6 @@
 						if (ecx & (1U << 11)) type_ |= tAVX512_VNNI;
 						if (ecx & (1U << 12)) type_ |= tAVX512_BITALG;
 						if (ecx & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
-						if (edx & (1U << 2)) type_ |= tAVX512_4VNNIW;
-						if (edx & (1U << 3)) type_ |= tAVX512_4FMAPS;
 						if (edx & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
 						if ((type_ & tAVX512BW) && (edx & (1U << 23))) type_ |= tAVX512_FP16;
 					}
@@ -715,7 +712,6 @@
 			if (ebx & (1U << 23)) type_ |= tCLFLUSHOPT;
 			if (ebx & (1U << 24)) type_ |= tCLWB;
 			if (ebx & (1U << 29)) type_ |= tSHA;
-			if (ecx & (1U << 0)) type_ |= tPREFETCHWT1;
 			if (ecx & (1U << 5)) type_ |= tWAITPKG;
 			if (ecx & (1U << 8)) type_ |= tGFNI;
 			if (ecx & (1U << 9)) type_ |= tVAES;
@@ -747,6 +743,7 @@
 				if (eax & (1U << 31)) type_ |= tMOVRS;
 				if (edx & (1U << 4)) type_ |= tAVX_VNNI_INT8;
 				if (edx & (1U << 5)) type_ |= tAVX_NE_CONVERT;
+				if (edx & (1U << 8)) type_ |= tAMX_COMPLEX;
 				if (edx & (1U << 10)) type_ |= tAVX_VNNI_INT16;
 				if (edx & (1U << 14)) type_ |= tPREFETCHITI;
 				if (edx & (1U << 19)) type_ |= tAVX10;
@@ -1298,11 +1295,57 @@
 #endif
 }
 
+// fallback core-type detection: classify the executing CPU from CPUID leaf 0x1A (Hybrid Information)
+inline CoreType getCoreType()
+{
+	uint32_t data[4] = {};
+	Cpu::getCpuidEx(0x1A, 0, data);
+	const uint32_t coreTypeField = (data[0] >> 24) & 0xFF; // core type field: bits [31:24] of data[0]
+	if (coreTypeField == 0x40) return Performance; // P-core
+	if (coreTypeField == 0x20) return Efficient; // E-core
+	return Standard; // any other field value
+}
+
 #ifdef _WIN32
 
 typedef std::vector<uint32_t> U32Vec;
+
+#if (defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x06010000) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0601)
+	#define XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY 1
+#else
+	#define XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY 0
+#endif
+
+#if (defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x0A000000) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0A00)
+	#define XBYAK_WINSDK_HAS_EFFICIENCY_CLASS 1
+#else
+	#define XBYAK_WINSDK_HAS_EFFICIENCY_CLASS 0
+#endif
+
+// GroupMasks[] / GroupCount on CACHE_RELATIONSHIP added in Win10 20H1 (SDK 10.0.19041, NTDDI_WIN10_VB)
+// NOTE: _WIN32_WINNT has no sub-version granularity for Win10, so only NTDDI_VERSION can
+// distinguish Win10 builds; if NTDDI_VERSION is not set, this macro is 0 (safe/conservative fallback).
+// NOTE(review): sdkddkver.h defines NTDDI_WIN10_VB as 0x0A000008 and 0x0A00000C as NTDDI_WIN10_NI; confirm the threshold below.
+#if defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x0A00000C
+	#define XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS 1
+#else
+	#define XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS 0
+#endif
+
+#if XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY
 typedef SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ProcInfo;
 
+inline CoreType getCoreTypeForAffinity(const GROUP_AFFINITY& affinity) // core type seen while the current thread is bound to `affinity`
+{
+	GROUP_AFFINITY previousMask = {};
+	if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, &previousMask)) {
+		return Standard; // could not switch affinity, so no CPUID query is made
+	}
+	CoreType type = impl::getCoreType();
+	SetThreadGroupAffinity(GetCurrentThread(), &previousMask, NULL); // put the previous affinity back; the result is not checked
+	return type;
+}
+
 // return total logical cpus if sucessful, 0 if failed
 inline uint32_t getGroupAcc(U32Vec& v)
 {
@@ -1348,10 +1391,12 @@
 			cpu.coreId = coreIdx++;
 			if (!isHybrid) {
 				cpu.coreType = Standard;
-			} else if (core.EfficiencyClass > 0) {
-				cpu.coreType = Performance;
 			} else {
-				cpu.coreType = Efficient;
+#if XBYAK_WINSDK_HAS_EFFICIENCY_CLASS
+				cpu.coreType = core.EfficiencyClass > 0 ? Performance : Efficient;
+#else
+				cpu.coreType = getCoreTypeForAffinity(core.GroupMask[0]);
+#endif
 			}
 
 			const GROUP_AFFINITY* masks = core.GroupMask;
@@ -1376,13 +1421,19 @@
 
 inline bool convertMask(CpuMask& mask, const U32Vec& groupAcc, const CACHE_RELATIONSHIP& cache)
 {
-	const GROUP_AFFINITY* masks = cache.GroupMasks;
-
-	for (WORD i = 0; i < cache.GroupCount; i++) {
-		const WORD group = masks[i].Group;
-		const KAFFINITY m = masks[i].Mask;
-		const uint32_t base = groupAcc[group];
-
+#if XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS
+	const WORD count = cache.GroupCount;
+#else
+	const WORD count = 1;
+#endif
+	for (WORD i = 0; i < count; i++) {
+#if XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS
+		const GROUP_AFFINITY& cg = cache.GroupMasks[i];
+#else
+		const GROUP_AFFINITY& cg = cache.GroupMask;
+#endif
+		const KAFFINITY m = cg.Mask;
+		const uint32_t base = groupAcc[cg.Group];
 		for (uint32_t b = 0; b < sizeof(KAFFINITY) * 8; b++) {
 			if (m & (KAFFINITY(1) << b)) {
 				if (!mask.append(base + b)) return false;
@@ -1443,7 +1494,17 @@
 	}
 	return true;
 }
-
+#else
+inline bool initCpuTopology(CpuTopology& cpuTopo) // stub compiled when XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY is 0
+{
+	(void)cpuTopo; // parameter intentionally unused
+	return false; // always reports failure
+}
+#endif
+// undefine the WinSDK feature macros so they do not leak into code that includes this header
+#undef XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY
+#undef XBYAK_WINSDK_HAS_EFFICIENCY_CLASS
+#undef XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS
 #elif defined(__linux__) // Linux
 
 struct WrapFILE {
@@ -1473,6 +1534,15 @@
 	return setStr(mask, buf);
 }
 
+inline CoreType setAffinityAndGetCoreType(uint32_t cpu) // changes the caller's affinity to `cpu` only and does not restore it
+{
+	cpu_set_t cpuMask;
+	CPU_ZERO(&cpuMask);
+	CPU_SET(cpu, &cpuMask);
+	if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuMask)) return Standard; // nonzero return: affinity change failed
+	return impl::getCoreType(); // query the core type once the affinity is restricted to `cpu`
+}
+
 inline bool initCpuTopology(CpuTopology& cpuTopo)
 {
 	const uint32_t logicalCpuNum = sysconf(_SC_NPROCESSORS_ONLN);
@@ -1590,36 +1660,15 @@
 			}
 		}
 		// Fallback: if either sysfs paths are unavailable, detect both core type per-CPU
-		// via CPUID leaf 0x1A (Hybrid Information) by pinning each logical CPU.
 		if (!hasPCoreSysfs || !hasECoreSysfs) {
-			// CPUID leaf 0x1A EAX[31:24] core type identifiers
-			const uint32_t Cpuid_StandardCoreType = 0x40; // P-core (Performance)
-			const uint32_t Cpuid_AtomCoreType = 0x20; // E-core (Efficient)
-
 			cpu_set_t originalMask;
 			CPU_ZERO(&originalMask);
-			if (sched_getaffinity(0, sizeof(cpu_set_t), &originalMask) != 0) goto SKIP_FALLBACK;
-
-			for (uint32_t cpu = 0; cpu < logicalCpuNum; cpu++) {
-				cpu_set_t cpuMask;
-				CPU_ZERO(&cpuMask);
-				CPU_SET(cpu, &cpuMask);
-				if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuMask) == 0) {
-					// CPUID leaf 0x1A: Hybrid Information
-					uint32_t data[4] = {};
-					Cpu::getCpuidEx(0x1A, 0, data);
-					const uint32_t coreTypeField = (data[0] >> 24) & 0xFF;
-					if (coreTypeField == Cpuid_StandardCoreType) {
-						cpuTopo.logicalCpus_[cpu].coreType = Performance;
-					} else if (coreTypeField == Cpuid_AtomCoreType) {
-						cpuTopo.logicalCpus_[cpu].coreType = Efficient;
-					}
+			if (sched_getaffinity(0, sizeof(cpu_set_t), &originalMask) == 0) {
+				for (uint32_t cpu = 0; cpu < logicalCpuNum; cpu++) {
+					cpuTopo.logicalCpus_[cpu].coreType = impl::setAffinityAndGetCoreType(cpu);
 				}
+				sched_setaffinity(0, sizeof(cpu_set_t), &originalMask);
 			}
-
-			// Restore the original CPU affinity mask
-			sched_setaffinity(0, sizeof(cpu_set_t), &originalMask);
-		SKIP_FALLBACK:;
 		}
 	}