Merge branch 'dev'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf72cf0..da1765a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 2.6...3.0.2)
 
-project(xbyak LANGUAGES CXX VERSION 6.03)
+project(xbyak LANGUAGES CXX VERSION 6.04)
 
 file(GLOB headers xbyak/*.h)
 
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index bb83983..e23b552 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1085,6 +1085,9 @@
 		puts("void rdrand(const Reg& r) { if (r.isBit(8)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) opModR(Reg(6, Operand::REG, r.getBit()), r, 0x0F, 0xC7); }");
 		puts("void rdseed(const Reg& r) { if (r.isBit(8)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) opModR(Reg(7, Operand::REG, r.getBit()), r, 0x0F, 0xC7); }");
 		puts("void crc32(const Reg32e& reg, const Operand& op) { if (reg.isBit(32) && op.isBit(16)) db(0x66); db(0xF2); opModRM(reg, op, op.isREG(), op.isMEM(), 0x0F, 0x38, 0xF0 | (op.isBit(8) ? 0 : 1)); }");
+		puts("void tpause(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0x66); db(0x0F); db(0xAE); setModRM(3, 6, idx); }");
+		puts("void umonitor(const Reg& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) int bit = r.getBit(); if (BIT != bit) { if ((BIT == 32 && bit == 16) || (BIT == 64 && bit == 32)) { db(0x67); } else { XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) } } db(0xF3); db(0x0F); db(0xAE); setModRM(3, 6, idx); }");
+		puts("void umwait(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0xF2); db(0x0F); db(0xAE); setModRM(3, 6, idx); }");
 	}
 	{
 		const struct Tbl {
diff --git a/meson.build b/meson.build
index 94ed527..653c1ff 100644
--- a/meson.build
+++ b/meson.build
@@ -5,7 +5,7 @@
 project(
 	'xbyak',
 	'cpp',
-	version: '6.03',
+	version: '6.04',
 	license: 'BSD-3-Clause',
 	default_options: 'b_ndebug=if-release'
 )
diff --git a/readme.md b/readme.md
index 1ae1824..415bf60 100644
--- a/readme.md
+++ b/readme.md
@@ -1,6 +1,6 @@
 [![Build Status](https://github.com/herumi/xbyak/actions/workflows/main.yml/badge.svg)](https://github.com/herumi/xbyak/actions/workflows/main.yml)
 
-# Xbyak 6.03 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
+# Xbyak 6.04 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
 
 ## Abstract
 
@@ -19,6 +19,7 @@
 If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
 
 ### News
+- WAITPKG instructions (tpause, umonitor, umwait) are supported.
 - MmapAllocator supports memfd with user-defined strings. see sample/memfd.cpp
 - strictly check address offset disp32 in a signed 32-bit integer. e.g., `ptr[(void*)0xffffffff]` causes an error.
   - define `XBYAK_OLD_DISP_CHECK` if you need an old check, but the option will be remoevd.
@@ -470,6 +471,8 @@
 http://opensource.org/licenses/BSD-3-Clause
 
 ## History
+* 2022/Apr/05 ver 6.04 add tpause, umonitor, umwait
+* 2022/Mar/08 ver 6.03 MmapAllocator supports memfd with user-defined strings.
 * 2022/Jan/28 ver 6.02 strict check the range of 32-bit dispacement
 * 2021/Dec/14 ver 6.01 support T_FAR jump/call and retf
 * 2021/Sep/14 ver 6.00 fully support AVX512-FP16
diff --git a/readme.txt b/readme.txt
index 9630764..1a6db9a 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
 

-    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.03

+    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.04

 

 -----------------------------------------------------------------------------

 ◎概要

@@ -400,6 +400,8 @@
 -----------------------------------------------------------------------------

 ◎履歴

 

+2022/04/05 ver 6.04 tpause, umonitor, umwaitを追加

+2022/03/08 ver 6.03 MmapAllocatorがmemfd用のユーザ定義文字列をサポート

 2022/01/28 ver 6.02 dispacementの32bit範囲チェックの厳密化

 2021/12/14 ver 6.01 T_FAR jump/callとretfをサポート

 2021/09/14 ver 6.00 AVX512-FP16を完全サポート

diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index da7ce9f..60cf77a 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -83,6 +83,7 @@
 		{ Cpu::tAMX_BF16, "amx(bf16)" },
 		{ Cpu::tAVX_VNNI, "avx_vnni" },
 		{ Cpu::tAVX512_FP16, "avx512_fp16" },
+		{ Cpu::tWAITPKG, "waitpkg" },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index c26394a..e1cf112 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -1519,6 +1519,7 @@
 		put("pextrq", REG64|MEM, XMM, IMM);
 		put("pinsrq", XMM, REG64|MEM, IMM);
 #endif
+
 	}
 	void putSHA() const
 	{
diff --git a/test/misc.cpp b/test/misc.cpp
index 92227cd..f46239b 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -102,8 +102,8 @@
 			CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000]));
 			CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff]));
 #else
-			CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000]), Xbyak::Error);
-			CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffff]), Xbyak::Error);
+			CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error);
+			CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error);
 #endif
 #endif
 		}
@@ -1890,3 +1890,37 @@
 	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 }
 #endif
+
+CYBOZU_TEST_AUTO(waitpkg)
+{
+	struct Code : Xbyak::CodeGenerator {
+		Code()
+		{
+			tpause(eax);
+			tpause(ebx);
+#ifdef XBYAK32
+			umonitor(cx);
+			umonitor(ecx);
+#else
+			umonitor(ecx);
+			umonitor(rcx);
+#endif
+			umwait(eax);
+			umwait(ebx);
+		}
+	} c;
+	const uint8_t tbl[] = {
+		// tpause eax / ebx (0x66 prefix)
+		0x66, 0x0f, 0xae, 0xf0,
+		0x66, 0x0f, 0xae, 0xf3,
+		// umonitor (0xf3 prefix); the half-width register form (cx / ecx) comes first and carries the extra 0x67 prefix
+		0x67, 0xf3, 0x0f, 0xae, 0xf1,
+		0xf3, 0x0f, 0xae, 0xf1,
+		// umwait eax / ebx (0xf2 prefix)
+		0xf2, 0x0f, 0xae, 0xf0,
+		0xf2, 0x0f, 0xae, 0xf3,
+	};
+	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
+	CYBOZU_TEST_EQUAL(c.getSize(), n);
+	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
+}
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index c5c38dd..3e8e258 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -142,7 +142,7 @@
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x6030 /* 0xABCD = A.BC(D) */
+	VERSION = 0x6040 /* 0xABCD = A.BC(D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 9f309b6..8ded218 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "6.03"; }
+const char *getVersionString() const { return "6.04"; }
 void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
 void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
 void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@@ -813,10 +813,13 @@
 void subss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF3, isXMM_XMMorMEM); }
 void sysenter() { db(0x0F); db(0x34); }
 void sysexit() { db(0x0F); db(0x35); }
+void tpause(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0x66); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
 void tzcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xBC); }
 void ucomisd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x66, isXMM_XMMorMEM); }
 void ucomiss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x100, isXMM_XMMorMEM); }
 void ud2() { db(0x0F); db(0x0B); }
+void umonitor(const Reg& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) int bit = r.getBit(); if (BIT != bit) { if ((BIT == 32 && bit == 16) || (BIT == 64 && bit == 32)) { db(0x67); } else { XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) } } db(0xF3); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
+void umwait(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0xF2); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
 void unpckhpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM); }
 void unpckhps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x100, isXMM_XMMorMEM); }
 void unpcklpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index 300ee7c..de42bce 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -311,6 +311,7 @@
 	static const Type tSSE42 = 1 << 8;
 	static const Type tPOPCNT = 1 << 9;
 	static const Type tAESNI = 1 << 10;
+	static const Type tAVX512_FP16 = 1 << 11;
 	static const Type tOSXSAVE = 1 << 12;
 	static const Type tPCLMULQDQ = 1 << 13;
 	static const Type tAVX = 1 << 14;
@@ -318,6 +319,7 @@
 
 	static const Type t3DN = 1 << 16;
 	static const Type tE3DN = 1 << 17;
+	static const Type tWAITPKG = 1 << 18;
 	static const Type tRDTSCP = 1 << 19;
 	static const Type tAVX2 = 1 << 20;
 	static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
@@ -366,7 +368,6 @@
 	static const Type tAMX_INT8 = uint64_t(1) << 60;
 	static const Type tAMX_BF16 = uint64_t(1) << 61;
 	static const Type tAVX_VNNI = uint64_t(1) << 62;
-	static const Type tAVX512_FP16 = uint64_t(1) << 11;
 	// 18, 63
 
 	Cpu()
@@ -488,6 +489,7 @@
 			if (EBX & (1U << 14)) type_ |= tMPX;
 			if (EBX & (1U << 29)) type_ |= tSHA;
 			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
+			if (ECX & (1U << 5)) type_ |= tWAITPKG;
 			if (EDX & (1U << 24)) type_ |= tAMX_TILE;
 			if (EDX & (1U << 25)) type_ |= tAMX_INT8;
 			if (EDX & (1U << 22)) type_ |= tAMX_BF16;