Merge branch 'dev'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8045d6e..d970fc9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 2.6...3.0.2)
 
-project(xbyak LANGUAGES CXX VERSION 6.041)
+project(xbyak LANGUAGES CXX VERSION 6.05)
 
 file(GLOB headers xbyak/*.h)
 
diff --git a/doc/changelog.md b/doc/changelog.md
index c586e1d..c913d1d 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -1,5 +1,6 @@
 # History
 
+* 2022/May/12 ver 6.05 add movdiri, movdir64b, clwb, cldemote
 * 2022/Apr/22 ver 6.041 consider Android and mingw
 * 2022/Apr/05 ver 6.04 add tpause, umonitor, umwait
 * 2022/Mar/08 ver 6.03 MmapAllocator supports memfd with user-defined strings.
diff --git a/gen/Makefile b/gen/Makefile
index ae7605e..97a6846 100644
--- a/gen/Makefile
+++ b/gen/Makefile
@@ -30,7 +30,7 @@
 	sed -i -e "s/version: '[0-9.]*',/version: '$(VER)',/" $@
 
 ../readme.md: $(TARGET)
-	sed -l 2 -i -e "s/Xbyak [0-9.]*/Xbyak $(VER)/" $@
+	sed -l 2 -i -e "s/# Xbyak [0-9.]*/# Xbyak $(VER)/" $@
 
 ../readme.txt: $(TARGET)
 	sed -l 2 -i -e "s/Xbyak [0-9.]*/Xbyak $(VER)/" $@
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index e23b552..4cfcc1e 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1051,6 +1051,8 @@
 		puts("void cmpxchg(const Operand& op, const Reg& reg) { opModRM(reg, op, (op.isREG() && reg.isREG() && op.getBit() == reg.getBit()), op.isMEM(), 0x0F, 0xB0 | (reg.isBit(8) ? 0 : 1)); }");
 		puts("void movbe(const Reg& reg, const Address& addr) { opModM(addr, reg, 0x0F, 0x38, 0xF0); }");
 		puts("void movbe(const Address& addr, const Reg& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF1); }");
+		puts("void movdiri(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF9); }");
+		puts("void movdir64b(const Reg& reg, const Address& addr) { db(0x66); opModM(addr, reg.cvt32(), 0x0F, 0x38, 0xF8); }");
 		puts("void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }");
 		puts("void adox(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0xF3, isREG32_REG32orMEM, NONE, 0x38); }");
 		puts("void cmpxchg8b(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0xC7); }");
@@ -1088,6 +1090,8 @@
 		puts("void tpause(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0x66); db(0x0F); db(0xAE); setModRM(3, 6, idx); }");
 		puts("void umonitor(const Reg& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) int bit = r.getBit(); if (BIT != bit) { if ((BIT == 32 && bit == 16) || (BIT == 64 && bit == 32)) { db(0x67); } else { XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) } } db(0xF3); db(0x0F); db(0xAE); setModRM(3, 6, idx); }");
 		puts("void umwait(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0xF2); db(0x0F); db(0xAE); setModRM(3, 6, idx); }");
+		puts("void clwb(const Address& addr) { db(0x66); opMIB(addr, esi, 0x0F, 0xAE); }");
+		puts("void cldemote(const Address& addr) { opMIB(addr, eax, 0x0F, 0x1C); }");
 	}
 	{
 		const struct Tbl {
diff --git a/meson.build b/meson.build
index 065f2fa..da7ce84 100644
--- a/meson.build
+++ b/meson.build
@@ -5,7 +5,7 @@
 project(
 	'xbyak',
 	'cpp',
-	version: '6.041',
+	version: '6.05',
 	license: 'BSD-3-Clause',
 	default_options: 'b_ndebug=if-release'
 )
diff --git a/readme.md b/readme.md
index 1e64d05..ac39de3 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
 
-# Xbyak 6.041 [![Badge Build]][Build Status]
+# Xbyak 6.05 [![Badge Build]][Build Status]
 
 *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
 
diff --git a/readme.txt b/readme.txt
index 3c3c9b6..dd40097 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
 

-    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.041

+    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.05

 

 -----------------------------------------------------------------------------

 ◎概要

@@ -400,6 +400,7 @@
 -----------------------------------------------------------------------------

 ◎履歴

 

+2022/05/12 ver 6.05 movdiri, movdir64b, clwb, cldemoteを追加

 2022/04/05 ver 6.04 tpause, umonitor, umwaitを追加

 2022/03/08 ver 6.03 MmapAllocatorがmemfd用のユーザ定義文字列をサポート

 2022/01/28 ver 6.02 dispacementの32bit範囲チェックの厳密化

diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index 60cf77a..7c930f0 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -84,6 +84,10 @@
 		{ Cpu::tAVX_VNNI, "avx_vnni" },
 		{ Cpu::tAVX512_FP16, "avx512_fp16" },
 		{ Cpu::tWAITPKG, "waitpkg" },
+		{ Cpu::tCLFLUSHOPT, "clflushopt" },
+		{ Cpu::tCLDEMOTE, "cldemote" },
+		{ Cpu::tMOVDIRI, "movdiri" },
+		{ Cpu::tMOVDIR64B, "movdir64b" },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index e1cf112..5cfd441 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -637,6 +637,7 @@
 				"fstsw",
 				"fnstsw",
 				"fxrstor",
+				"clwb",
 			};
 			for (size_t i = 0; i < NUM_OF_ARRAY(memTbl); i++) {
 				put(memTbl[i], MEM);
diff --git a/test/misc.cpp b/test/misc.cpp
index f46239b..ec08e49 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -1924,3 +1924,46 @@
 	CYBOZU_TEST_EQUAL(c.getSize(), n);
 	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 }
+
+CYBOZU_TEST_AUTO(misc)
+{
+	struct Code : Xbyak::CodeGenerator {
+		Code()
+		{
+			cldemote(ptr[eax+esi*4+0x12]);
+			movdiri(ptr[edx+esi*2+4], eax);
+			movdir64b(eax, ptr[edx]);
+#ifdef XBYAK64
+			cldemote(ptr[rax+rdi*8+0x123]);
+			movdiri(ptr[rax+r12], r9);
+			movdiri(ptr[rax+r12*2+4], r9d);
+			movdir64b(r10, ptr[r8]);
+#endif
+		}
+	} c;
+	const uint8_t tbl[] = {
+#ifdef XBYAK64
+		0x67,
+#endif
+		0x0f, 0x1c, 0x44, 0xb0, 0x12, // cldemote
+#ifdef XBYAK64
+		0x67,
+#endif
+		0x0f, 0x38, 0xf9, 0x44, 0x72, 0x04, // movdiri
+
+		0x66,
+#ifdef XBYAK64
+		0x67,
+#endif
+		0x0f, 0x38, 0xf8, 0x02, // movdir64b
+#ifdef XBYAK64
+		0x0f, 0x1c, 0x84, 0xf8, 0x23, 0x01, 0x00, 0x00, // cldemote
+		0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri
+		0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri
+		0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b
+#endif
+	};
+	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
+	CYBOZU_TEST_EQUAL(c.getSize(), n);
+	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
+}
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 071d730..4f87826 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -144,7 +144,7 @@
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x6041 /* 0xABCD = A.BC(D) */
+	VERSION = 0x6050 /* 0xABCD = A.BC(D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 24b8c3f..18bac94 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "6.041"; }
+const char *getVersionString() const { return "6.05"; }
 void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
 void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
 void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@@ -57,9 +57,11 @@
 void cdq() { db(0x99); }
 void clc() { db(0xF8); }
 void cld() { db(0xFC); }
+void cldemote(const Address& addr) { opMIB(addr, eax, 0x0F, 0x1C); }
 void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); }
 void clflushopt(const Address& addr) { db(0x66); opModM(addr, Reg32(7), 0x0F, 0xAE); }
 void cli() { db(0xFA); }
+void clwb(const Address& addr) { db(0x66); opMIB(addr, esi, 0x0F, 0xAE); }
 void clzero() { db(0x0F); db(0x01); db(0xFC); }
 void cmc() { db(0xF5); }
 void cmova(const Reg& reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 7); }//-V524
@@ -501,6 +503,8 @@
 void movd(const Mmx& mmx, const Reg32& reg) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x6E); }
 void movd(const Reg32& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); }
 void movddup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x12, 0xF2, isXMM_XMMorMEM, NONE, NONE); }
+void movdir64b(const Reg& reg, const Address& addr) { db(0x66); opModM(addr, reg.cvt32(), 0x0F, 0x38, 0xF8); }
+void movdiri(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF9); }
 void movdq2q(const Mmx& mmx, const Xmm& xmm) { db(0xF2); opModR(mmx, xmm, 0x0F, 0xD6); }
 void movdqa(const Address& addr, const Xmm& xmm) { db(0x66); opModM(addr, xmm, 0x0F, 0x7F); }
 void movdqa(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, 0x66); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index 0f4548b..ab0bd57 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -87,11 +87,53 @@
    CoreLevel = 2
 } IntelCpuTopologyLevel;
 
+namespace local {
+
+class Type {
+	uint64_t L;
+	uint64_t H;
+public:
+	Type(uint64_t L = 0, uint64_t H = 0) : L(L), H(H) { }
+	Type& operator&=(const Type& rhs)
+	{
+		L &= rhs.L;
+		H &= rhs.H;
+		return *this;
+	}
+	Type& operator|=(const Type& rhs)
+	{
+		L |= rhs.L;
+		H |= rhs.H;
+		return *this;
+	}
+	Type operator&(const Type& rhs) const
+	{
+		Type t = *this;
+		t &= rhs;
+		return t;
+	}
+	Type operator|(const Type& rhs) const
+	{
+		Type t = *this;
+		t |= rhs;
+		return t;
+	}
+	// not marked explicit, to keep backward compatibility
+	operator bool() const { return (H | L) != 0; }
+	uint64_t getL() const { return L; }
+	uint64_t getH() const { return H; }
+};
+
 /**
 	CPU detection class
+	@note static inline const members are supported only by C++17 or later, so a template hack is used instead
 */
-class Cpu {
-	uint64_t type_;
+template<int dummy=0>
+class CpuT {
+public:
+	typedef local::Type Type;
+private:
+	Type type_;
 	//system topology
 	bool x2APIC_supported_;
 	static const size_t maxTopologyLevels = 2;
@@ -297,80 +339,82 @@
 		return 0;
 #endif
 	}
-	typedef uint64_t Type;
 
-	static const Type NONE = 0;
-	static const Type tMMX = 1 << 0;
-	static const Type tMMX2 = 1 << 1;
-	static const Type tCMOV = 1 << 2;
-	static const Type tSSE = 1 << 3;
-	static const Type tSSE2 = 1 << 4;
-	static const Type tSSE3 = 1 << 5;
-	static const Type tSSSE3 = 1 << 6;
-	static const Type tSSE41 = 1 << 7;
-	static const Type tSSE42 = 1 << 8;
-	static const Type tPOPCNT = 1 << 9;
-	static const Type tAESNI = 1 << 10;
-	static const Type tAVX512_FP16 = 1 << 11;
-	static const Type tOSXSAVE = 1 << 12;
-	static const Type tPCLMULQDQ = 1 << 13;
-	static const Type tAVX = 1 << 14;
-	static const Type tFMA = 1 << 15;
+	static const Type NONE;
+	static const Type tMMX;
+	static const Type tMMX2;
+	static const Type tCMOV;
+	static const Type tSSE;
+	static const Type tSSE2;
+	static const Type tSSE3;
+	static const Type tSSSE3;
+	static const Type tSSE41;
+	static const Type tSSE42;
+	static const Type tPOPCNT;
+	static const Type tAESNI;
+	static const Type tAVX512_FP16;
+	static const Type tOSXSAVE;
+	static const Type tPCLMULQDQ;
+	static const Type tAVX;
+	static const Type tFMA;
 
-	static const Type t3DN = 1 << 16;
-	static const Type tE3DN = 1 << 17;
-	static const Type tWAITPKG = 1 << 18;
-	static const Type tRDTSCP = 1 << 19;
-	static const Type tAVX2 = 1 << 20;
-	static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
-	static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
-	static const Type tLZCNT = 1 << 23;
+	static const Type t3DN;
+	static const Type tE3DN;
+	static const Type tWAITPKG;
+	static const Type tRDTSCP;
+	static const Type tAVX2;
+	static const Type tBMI1; // andn, bextr, blsi, blsmsk, blsr, tzcnt
+	static const Type tBMI2; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
+	static const Type tLZCNT;
 
-	static const Type tINTEL = 1 << 24;
-	static const Type tAMD = 1 << 25;
+	static const Type tINTEL;
+	static const Type tAMD;
 
-	static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
-	static const Type tRDRAND = 1 << 27;
-	static const Type tADX = 1 << 28; // adcx, adox
-	static const Type tRDSEED = 1 << 29; // rdseed
-	static const Type tSMAP = 1 << 30; // stac
-	static const Type tHLE = uint64_t(1) << 31; // xacquire, xrelease, xtest
-	static const Type tRTM = uint64_t(1) << 32; // xbegin, xend, xabort
-	static const Type tF16C = uint64_t(1) << 33; // vcvtph2ps, vcvtps2ph
-	static const Type tMOVBE = uint64_t(1) << 34; // mobve
-	static const Type tAVX512F = uint64_t(1) << 35;
-	static const Type tAVX512DQ = uint64_t(1) << 36;
-	static const Type tAVX512_IFMA = uint64_t(1) << 37;
-	static const Type tAVX512IFMA = tAVX512_IFMA;
-	static const Type tAVX512PF = uint64_t(1) << 38;
-	static const Type tAVX512ER = uint64_t(1) << 39;
-	static const Type tAVX512CD = uint64_t(1) << 40;
-	static const Type tAVX512BW = uint64_t(1) << 41;
-	static const Type tAVX512VL = uint64_t(1) << 42;
-	static const Type tAVX512_VBMI = uint64_t(1) << 43;
-	static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
-	static const Type tAVX512_4VNNIW = uint64_t(1) << 44;
-	static const Type tAVX512_4FMAPS = uint64_t(1) << 45;
-	static const Type tPREFETCHWT1 = uint64_t(1) << 46;
-	static const Type tPREFETCHW = uint64_t(1) << 47;
-	static const Type tSHA = uint64_t(1) << 48;
-	static const Type tMPX = uint64_t(1) << 49;
-	static const Type tAVX512_VBMI2 = uint64_t(1) << 50;
-	static const Type tGFNI = uint64_t(1) << 51;
-	static const Type tVAES = uint64_t(1) << 52;
-	static const Type tVPCLMULQDQ = uint64_t(1) << 53;
-	static const Type tAVX512_VNNI = uint64_t(1) << 54;
-	static const Type tAVX512_BITALG = uint64_t(1) << 55;
-	static const Type tAVX512_VPOPCNTDQ = uint64_t(1) << 56;
-	static const Type tAVX512_BF16 = uint64_t(1) << 57;
-	static const Type tAVX512_VP2INTERSECT = uint64_t(1) << 58;
-	static const Type tAMX_TILE = uint64_t(1) << 59;
-	static const Type tAMX_INT8 = uint64_t(1) << 60;
-	static const Type tAMX_BF16 = uint64_t(1) << 61;
-	static const Type tAVX_VNNI = uint64_t(1) << 62;
-	// 18, 63
+	static const Type tENHANCED_REP; // enhanced rep movsb/stosb
+	static const Type tRDRAND;
+	static const Type tADX; // adcx, adox
+	static const Type tRDSEED; // rdseed
+	static const Type tSMAP; // stac
+	static const Type tHLE; // xacquire, xrelease, xtest
+	static const Type tRTM; // xbegin, xend, xabort
+	static const Type tF16C; // vcvtph2ps, vcvtps2ph
+	static const Type tMOVBE; // movbe
+	static const Type tAVX512F;
+	static const Type tAVX512DQ;
+	static const Type tAVX512_IFMA;
+	static const Type tAVX512IFMA;
+	static const Type tAVX512PF;
+	static const Type tAVX512ER;
+	static const Type tAVX512CD;
+	static const Type tAVX512BW;
+	static const Type tAVX512VL;
+	static const Type tAVX512_VBMI;
+	static const Type tAVX512VBMI; // changed by Intel's manual
+	static const Type tAVX512_4VNNIW;
+	static const Type tAVX512_4FMAPS;
+	static const Type tPREFETCHWT1;
+	static const Type tPREFETCHW;
+	static const Type tSHA;
+	static const Type tMPX;
+	static const Type tAVX512_VBMI2;
+	static const Type tGFNI;
+	static const Type tVAES;
+	static const Type tVPCLMULQDQ;
+	static const Type tAVX512_VNNI;
+	static const Type tAVX512_BITALG;
+	static const Type tAVX512_VPOPCNTDQ;
+	static const Type tAVX512_BF16;
+	static const Type tAVX512_VP2INTERSECT;
+	static const Type tAMX_TILE;
+	static const Type tAMX_INT8;
+	static const Type tAMX_BF16;
+	static const Type tAVX_VNNI;
+	static const Type tCLFLUSHOPT;
+	static const Type tCLDEMOTE;
+	static const Type tMOVDIRI;
+	static const Type tMOVDIR64B;
 
-	Cpu()
+	CpuT()
 		: type_(NONE)
 		, x2APIC_supported_(false)
 		, numCores_()
@@ -484,12 +528,16 @@
 			if (EBX & (1U << 18)) type_ |= tRDSEED;
 			if (EBX & (1U << 19)) type_ |= tADX;
 			if (EBX & (1U << 20)) type_ |= tSMAP;
+			if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT;
 			if (EBX & (1U << 4)) type_ |= tHLE;
 			if (EBX & (1U << 11)) type_ |= tRTM;
 			if (EBX & (1U << 14)) type_ |= tMPX;
 			if (EBX & (1U << 29)) type_ |= tSHA;
 			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
 			if (ECX & (1U << 5)) type_ |= tWAITPKG;
+			if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
+			if (ECX & (1U << 27)) type_ |= tMOVDIRI;
+			if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
 			if (EDX & (1U << 24)) type_ |= tAMX_TILE;
 			if (EDX & (1U << 25)) type_ |= tAMX_INT8;
 			if (EDX & (1U << 22)) type_ |= tAMX_BF16;
@@ -519,6 +567,84 @@
 	}
 };
 
+template<int dummy> const Type CpuT<dummy>::NONE = 0;
+template<int dummy> const Type CpuT<dummy>::tMMX = 1 << 0;
+template<int dummy> const Type CpuT<dummy>::tMMX2 = 1 << 1;
+template<int dummy> const Type CpuT<dummy>::tCMOV = 1 << 2;
+template<int dummy> const Type CpuT<dummy>::tSSE = 1 << 3;
+template<int dummy> const Type CpuT<dummy>::tSSE2 = 1 << 4;
+template<int dummy> const Type CpuT<dummy>::tSSE3 = 1 << 5;
+template<int dummy> const Type CpuT<dummy>::tSSSE3 = 1 << 6;
+template<int dummy> const Type CpuT<dummy>::tSSE41 = 1 << 7;
+template<int dummy> const Type CpuT<dummy>::tSSE42 = 1 << 8;
+template<int dummy> const Type CpuT<dummy>::tPOPCNT = 1 << 9;
+template<int dummy> const Type CpuT<dummy>::tAESNI = 1 << 10;
+template<int dummy> const Type CpuT<dummy>::tAVX512_FP16 = 1 << 11;
+template<int dummy> const Type CpuT<dummy>::tOSXSAVE = 1 << 12;
+template<int dummy> const Type CpuT<dummy>::tPCLMULQDQ = 1 << 13;
+template<int dummy> const Type CpuT<dummy>::tAVX = 1 << 14;
+template<int dummy> const Type CpuT<dummy>::tFMA = 1 << 15;
+
+template<int dummy> const Type CpuT<dummy>::t3DN = 1 << 16;
+template<int dummy> const Type CpuT<dummy>::tE3DN = 1 << 17;
+template<int dummy> const Type CpuT<dummy>::tWAITPKG = 1 << 18;
+template<int dummy> const Type CpuT<dummy>::tRDTSCP = 1 << 19;
+template<int dummy> const Type CpuT<dummy>::tAVX2 = 1 << 20;
+template<int dummy> const Type CpuT<dummy>::tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
+template<int dummy> const Type CpuT<dummy>::tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
+template<int dummy> const Type CpuT<dummy>::tLZCNT = 1 << 23;
+
+template<int dummy> const Type CpuT<dummy>::tINTEL = 1 << 24;
+template<int dummy> const Type CpuT<dummy>::tAMD = 1 << 25;
+
+template<int dummy> const Type CpuT<dummy>::tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
+template<int dummy> const Type CpuT<dummy>::tRDRAND = 1 << 27;
+template<int dummy> const Type CpuT<dummy>::tADX = 1 << 28; // adcx, adox
+template<int dummy> const Type CpuT<dummy>::tRDSEED = 1 << 29; // rdseed
+template<int dummy> const Type CpuT<dummy>::tSMAP = 1 << 30; // stac
+template<int dummy> const Type CpuT<dummy>::tHLE = uint64_t(1) << 31; // xacquire, xrelease, xtest
+template<int dummy> const Type CpuT<dummy>::tRTM = uint64_t(1) << 32; // xbegin, xend, xabort
+template<int dummy> const Type CpuT<dummy>::tF16C = uint64_t(1) << 33; // vcvtph2ps, vcvtps2ph
+template<int dummy> const Type CpuT<dummy>::tMOVBE = uint64_t(1) << 34; // movbe
+template<int dummy> const Type CpuT<dummy>::tAVX512F = uint64_t(1) << 35;
+template<int dummy> const Type CpuT<dummy>::tAVX512DQ = uint64_t(1) << 36;
+template<int dummy> const Type CpuT<dummy>::tAVX512_IFMA = uint64_t(1) << 37;
+template<int dummy> const Type CpuT<dummy>::tAVX512IFMA = tAVX512_IFMA;
+template<int dummy> const Type CpuT<dummy>::tAVX512PF = uint64_t(1) << 38;
+template<int dummy> const Type CpuT<dummy>::tAVX512ER = uint64_t(1) << 39;
+template<int dummy> const Type CpuT<dummy>::tAVX512CD = uint64_t(1) << 40;
+template<int dummy> const Type CpuT<dummy>::tAVX512BW = uint64_t(1) << 41;
+template<int dummy> const Type CpuT<dummy>::tAVX512VL = uint64_t(1) << 42;
+template<int dummy> const Type CpuT<dummy>::tAVX512_VBMI = uint64_t(1) << 43;
+template<int dummy> const Type CpuT<dummy>::tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
+template<int dummy> const Type CpuT<dummy>::tAVX512_4VNNIW = uint64_t(1) << 44;
+template<int dummy> const Type CpuT<dummy>::tAVX512_4FMAPS = uint64_t(1) << 45;
+template<int dummy> const Type CpuT<dummy>::tPREFETCHWT1 = uint64_t(1) << 46;
+template<int dummy> const Type CpuT<dummy>::tPREFETCHW = uint64_t(1) << 47;
+template<int dummy> const Type CpuT<dummy>::tSHA = uint64_t(1) << 48;
+template<int dummy> const Type CpuT<dummy>::tMPX = uint64_t(1) << 49;
+template<int dummy> const Type CpuT<dummy>::tAVX512_VBMI2 = uint64_t(1) << 50;
+template<int dummy> const Type CpuT<dummy>::tGFNI = uint64_t(1) << 51;
+template<int dummy> const Type CpuT<dummy>::tVAES = uint64_t(1) << 52;
+template<int dummy> const Type CpuT<dummy>::tVPCLMULQDQ = uint64_t(1) << 53;
+template<int dummy> const Type CpuT<dummy>::tAVX512_VNNI = uint64_t(1) << 54;
+template<int dummy> const Type CpuT<dummy>::tAVX512_BITALG = uint64_t(1) << 55;
+template<int dummy> const Type CpuT<dummy>::tAVX512_VPOPCNTDQ = uint64_t(1) << 56;
+template<int dummy> const Type CpuT<dummy>::tAVX512_BF16 = uint64_t(1) << 57;
+template<int dummy> const Type CpuT<dummy>::tAVX512_VP2INTERSECT = uint64_t(1) << 58;
+template<int dummy> const Type CpuT<dummy>::tAMX_TILE = uint64_t(1) << 59;
+template<int dummy> const Type CpuT<dummy>::tAMX_INT8 = uint64_t(1) << 60;
+template<int dummy> const Type CpuT<dummy>::tAMX_BF16 = uint64_t(1) << 61;
+template<int dummy> const Type CpuT<dummy>::tAVX_VNNI = uint64_t(1) << 62;
+template<int dummy> const Type CpuT<dummy>::tCLFLUSHOPT = uint64_t(1) << 63;
+template<int dummy> const Type CpuT<dummy>::tCLDEMOTE = Type(0, 1 << 0);
+template<int dummy> const Type CpuT<dummy>::tMOVDIRI = Type(0, 1 << 1);
+template<int dummy> const Type CpuT<dummy>::tMOVDIR64B = Type(0, 1 << 2);
+
+} // local
+
+typedef local::CpuT<> Cpu;
+
 #ifndef XBYAK_ONLY_CLASS_CPU
 class Clock {
 public: