Merge branch 'dev'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ba3a65..cbb81f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.5)
 
-project(xbyak LANGUAGES CXX VERSION 7.00)
+project(xbyak LANGUAGES CXX VERSION 7.01)
 
 file(GLOB headers xbyak/*.h)
 
diff --git a/doc/changelog.md b/doc/changelog.md
index a4d2418..59932c3 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -1,5 +1,7 @@
 # History
 
+* 2023/Dec/19 ver 7.01 support AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE, detection of AVX10/APX
+* 2023/Dec/01 ver 7.00 support APX
 * 2023/Aug/07 ver 6.73 add sha512/sm3/sm4/avx-vnni-int16
 * 2023/Aug/02 ver 6.72 add xbegin/xabort/xend
 * 2023/Jul/27 ver 6.71 Allocator supports huge page
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 6a73f42..892fdbc 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -2013,6 +2013,38 @@
 			printf("void cmp%sxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opRRO(r1, r2, addr, T_APX|T_66|T_0F38, 0x%02X); }\n", p->name, p->code);
 		}
 	}
+	// aes
+	{
+		const struct Tbl {
+			const char *name;
+			uint64_t type1;
+			uint64_t type2;
+			uint8_t code;
+			int idx;
+		} tbl[] = {
+			{ "aesdec128kl", T_F3|T_0F38, T_MUST_EVEX|T_F3, 0xDD, 8 },
+			{ "aesdec256kl", T_F3|T_0F38, T_MUST_EVEX|T_F3, 0xDF, 8 },
+			{ "aesdecwide128kl", T_F3|T_0F38, T_MUST_EVEX|T_F3, 0xD8, 1 },
+			{ "aesdecwide256kl", T_F3|T_0F38, T_MUST_EVEX|T_F3, 0xD8, 3 },
+			{ "aesenc128kl", T_F3|T_0F38, T_MUST_EVEX|T_F3, 0xDC, 8 },
+			{ "aesenc256kl", T_F3|T_0F38, T_MUST_EVEX|T_F3, 0xDE, 8 },
+			{ "aesencwide128kl", T_F3|T_0F38, T_MUST_EVEX|T_F3, 0xD8, 0 },
+			{ "aesencwide256kl", T_F3|T_0F38, T_MUST_EVEX|T_F3, 0xD8, 2 },
+		};
+		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+			const Tbl *p = &tbl[i];
+			std::string s1 = type2String(p->type1);
+			std::string s2 = type2String(p->type2);
+			if (p->idx == 8) {
+				printf("void %s(const Xmm& x, const Address& addr) { opAESKL(&x, addr, %s, %s, 0x%02X); }\n", p->name, s1.c_str(), s2.c_str(), p->code);
+			} else {
+				printf("void %s(const Address& addr) { opAESKL(&xmm%d, addr, %s, %s, 0x%02X); }\n", p->name, p->idx, s1.c_str(), s2.c_str(), p->code);
+			}
+		}
+	}
+	// encodekey
+	puts("void encodekey128(const Reg32& r1, const Reg32& r2) { opEncodeKey(r1, r2, 0xFA, 0xDA); }");
+	puts("void encodekey256(const Reg32& r1, const Reg32& r2) { opEncodeKey(r1, r2, 0xFB, 0xDB); }");
 }
 
 void putAMX_TILE()
diff --git a/meson.build b/meson.build
index 83565bf..a9f354e 100644
--- a/meson.build
+++ b/meson.build
@@ -5,7 +5,7 @@
 project(
 	'xbyak',
 	'cpp',
-	version: '7.00',
+	version: '7.01',
 	license: 'BSD-3-Clause',
 	default_options: 'b_ndebug=if-release'
 )
diff --git a/readme.md b/readme.md
index 1f72b76..14ab86c 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
 
-# Xbyak 7.00 [![Badge Build]][Build Status]
+# Xbyak 7.01 [![Badge Build]][Build Status]
 
 *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
 
@@ -21,8 +21,7 @@
 - header file only
 - Intel/MASM like syntax
 - fully support AVX-512
-
-- support APX
+- support APX/AVX10
 
 **Note**:
 Use `and_()`, `or_()`, ... instead of `and()`, `or()`.
@@ -34,6 +33,7 @@
 
 ### News
 
+- support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE
 - support APX except for a few instructions
 - add amx_fp16/avx_vnni_int8/avx_ne_convert/avx-ifma
 - add movdiri, movdir64b, clwb, cldemote
diff --git a/readme.txt b/readme.txt
index 248a04c..7d82356 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
 

-    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.00

+    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.01

 

 -----------------------------------------------------------------------------

 ◎概要

@@ -46,6 +46,8 @@
 -----------------------------------------------------------------------------

 ◎新機能

 

+APX/AVX10対応

+

 例外なしモード追加

 XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。

 エラーは例外の代わりに`Xbyak::GetError()`で通達されます。

diff --git a/sample/cpuid/adl.txt b/sample/cpuid/adl.txt
index eff11af..96513b2 100644
--- a/sample/cpuid/adl.txt
+++ b/sample/cpuid/adl.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote movdiri movdir64b serialize
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/arl.txt b/sample/cpuid/arl.txt
index 9c06fc0..db718fe 100644
--- a/sample/cpuid/arl.txt
+++ b/sample/cpuid/arl.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/clx.txt b/sample/cpuid/clx.txt
index 2fc1529..2926e66 100644
--- a/sample/cpuid/clx.txt
+++ b/sample/cpuid/clx.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni clflushopt
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni clflushopt clwb
diff --git a/sample/cpuid/cpx.txt b/sample/cpuid/cpx.txt
index 371a963..c315b4f 100644
--- a/sample/cpuid/cpx.txt
+++ b/sample/cpuid/cpx.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni avx512_bf16 clflushopt
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni avx512_bf16 clflushopt clwb
diff --git a/sample/cpuid/gnr.txt b/sample/cpuid/gnr.txt
index 07fb80a..3dcda77 100644
--- a/sample/cpuid/gnr.txt
+++ b/sample/cpuid/gnr.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote movdiri movdir64b uintr serialize amx_fp16 prefetchiti
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize amx_fp16 prefetchiti avx10
diff --git a/sample/cpuid/grr.txt b/sample/cpuid/grr.txt
index c742e81..0fcec69 100644
--- a/sample/cpuid/grr.txt
+++ b/sample/cpuid/grr.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma rao-int cmpccxadd
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma rao-int cmpccxadd aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/icl.txt b/sample/cpuid/icl.txt
index 605331a..3dfce9b 100644
--- a/sample/cpuid/icl.txt
+++ b/sample/cpuid/icl.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt clwb
diff --git a/sample/cpuid/icx.txt b/sample/cpuid/icx.txt
index 605331a..3dfce9b 100644
--- a/sample/cpuid/icx.txt
+++ b/sample/cpuid/icx.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq clflushopt clwb
diff --git a/sample/cpuid/lnl.txt b/sample/cpuid/lnl.txt
index a2874c1..11d3227 100644
--- a/sample/cpuid/lnl.txt
+++ b/sample/cpuid/lnl.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/mtl.txt b/sample/cpuid/mtl.txt
index eff11af..96513b2 100644
--- a/sample/cpuid/mtl.txt
+++ b/sample/cpuid/mtl.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote movdiri movdir64b serialize
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/rpl.txt b/sample/cpuid/rpl.txt
index eff11af..96513b2 100644
--- a/sample/cpuid/rpl.txt
+++ b/sample/cpuid/rpl.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote movdiri movdir64b serialize
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b serialize aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/skx.txt b/sample/cpuid/skx.txt
index d440cec..46454fc 100644
--- a/sample/cpuid/skx.txt
+++ b/sample/cpuid/skx.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl clflushopt
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl clflushopt clwb
diff --git a/sample/cpuid/spr.txt b/sample/cpuid/spr.txt
index a9881a3..b4a50e9 100644
--- a/sample/cpuid/spr.txt
+++ b/sample/cpuid/spr.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote movdiri movdir64b uintr serialize
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize
diff --git a/sample/cpuid/srf.txt b/sample/cpuid/srf.txt
index 1df6f97..1d6d690 100644
--- a/sample/cpuid/srf.txt
+++ b/sample/cpuid/srf.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/tgl.txt b/sample/cpuid/tgl.txt
index 0e7a841..11ce402 100644
--- a/sample/cpuid/tgl.txt
+++ b/sample/cpuid/tgl.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_vp2intersect clflushopt movdiri movdir64b
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_vp2intersect clflushopt clwb movdiri movdir64b aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/tnt.txt b/sample/cpuid/tnt.txt
index 8d33aa1..0a943db 100644
--- a/sample/cpuid/tnt.txt
+++ b/sample/cpuid/tnt.txt
@@ -1,2 +1,2 @@
 vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq enh_rep rdrand rdseed smap sha movbe gfni clflushopt cldemote
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq enh_rep rdrand rdseed smap sha movbe gfni clflushopt cldemote clwb
diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index 66869ee..25d776d 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -31,6 +31,7 @@
 		{ Cpu::tSSSE3, "ssse3" },
 		{ Cpu::tSSE41, "sse41" },
 		{ Cpu::tSSE42, "sse42" },
+		{ Cpu::tSSE4a, "sse4a" },
 		{ Cpu::tPOPCNT, "popcnt" },
 		{ Cpu::t3DN, "3dn" },
 		{ Cpu::tE3DN, "e3dn" },
@@ -87,6 +88,7 @@
 		{ Cpu::tWAITPKG, "waitpkg" },
 		{ Cpu::tCLFLUSHOPT, "clflushopt" },
 		{ Cpu::tCLDEMOTE, "cldemote" },
+		{ Cpu::tCLWB, "clwb" },
 		{ Cpu::tMOVDIRI, "movdiri" },
 		{ Cpu::tMOVDIR64B, "movdir64b" },
 		{ Cpu::tUINTR, "uintr" },
@@ -105,6 +107,10 @@
 		{ Cpu::tAVX_VNNI_INT16, "avx_vnni_int16" },
 		{ Cpu::tAPX_F, "apx_f" },
 		{ Cpu::tAVX10, "avx10" },
+		{ Cpu::tAESKLE, "aeskle" },
+		{ Cpu::tWIDE_KL, "wide_kl" },
+		{ Cpu::tKEYLOCKER, "keylocker" },
+		{ Cpu::tKEYLOCKER_WIDE, "keylocker_wide" },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/test/apx.cpp b/test/apx.cpp
index f03b032..b2675b8 100644
--- a/test/apx.cpp
+++ b/test/apx.cpp
@@ -1775,3 +1775,98 @@
 	CYBOZU_TEST_EQUAL(c.getSize(), n);
 	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 }
+
+CYBOZU_TEST_AUTO(aeskl) // byte-exact check of the code emitted for the aes*kl mnemonics
+{
+	struct Code : Xbyak::CodeGenerator {
+		Code() // emits every mnemonic twice: with a rax/rcx based address, then with an r30/r29 based address
+		{
+			aesdec128kl(xmm15, ptr[rax+rcx*4+0x12]);
+			aesdec128kl(xmm15, ptr[r30+r29*8+0x34]);
+
+			aesdec256kl(xmm15, ptr[rax+rcx*4+0x12]);
+			aesdec256kl(xmm15, ptr[r30+r29*8+0x34]);
+
+			aesdecwide128kl(ptr[rax+rcx*4+0x12]);
+			aesdecwide128kl(ptr[r30+r29*8+0x34]);
+
+			aesdecwide256kl(ptr[rax+rcx*4+0x12]);
+			aesdecwide256kl(ptr[r30+r29*8+0x34]);
+
+			aesenc128kl(xmm15, ptr[rax+rcx*4+0x12]);
+			aesenc128kl(xmm15, ptr[r30+r29*8+0x34]);
+
+			aesenc256kl(xmm15, ptr[rax+rcx*4+0x12]);
+			aesenc256kl(xmm15, ptr[r30+r29*8+0x34]);
+
+			aesencwide128kl(ptr[rax+rcx*4+0x12]);
+			aesencwide128kl(ptr[r30+r29*8+0x34]);
+
+			aesencwide256kl(ptr[rax+rcx*4+0x12]);
+			aesencwide256kl(ptr[r30+r29*8+0x34]);
+		}
+	} c;
+	const uint8_t tbl[] = { // expected bytes: one row per call above, in emission order
+		// aesdec128kl
+		0xf3, 0x44, 0x0f, 0x38, 0xdd, 0x7c, 0x88, 0x12,
+		0x62, 0x1c, 0x7a, 0x08, 0xdd, 0x7c, 0xee, 0x34,
+		// aesdec256kl
+		0xf3, 0x44, 0x0f, 0x38, 0xdf, 0x7c, 0x88, 0x12,
+		0x62, 0x1c, 0x7a, 0x08, 0xdf, 0x7c, 0xee, 0x34,
+		// aesdecwide128kl
+		0xf3, 0x0f, 0x38, 0xd8, 0x4c, 0x88, 0x12,
+		0x62, 0x9c, 0x7a, 0x08, 0xd8, 0x4c, 0xee, 0x34,
+		// aesdecwide256kl
+		0xf3, 0x0f, 0x38, 0xd8, 0x5c, 0x88, 0x12,
+		0x62, 0x9c, 0x7a, 0x08, 0xd8, 0x5c, 0xee, 0x34,
+		// aesenc128kl
+		0xf3, 0x44, 0x0f, 0x38, 0xdc, 0x7c, 0x88, 0x12,
+		0x62, 0x1c, 0x7a, 0x08, 0xdc, 0x7c, 0xee, 0x34,
+		// aesenc256kl
+		0xf3, 0x44, 0x0f, 0x38, 0xde, 0x7c, 0x88, 0x12,
+		0x62, 0x1c, 0x7a, 0x08, 0xde, 0x7c, 0xee, 0x34,
+		// aesencwide128kl
+		0xf3, 0x0f, 0x38, 0xd8, 0x44, 0x88, 0x12,
+		0x62, 0x9c, 0x7a, 0x08, 0xd8, 0x44, 0xee, 0x34,
+		// aesencwide256kl
+		0xf3, 0x0f, 0x38, 0xd8, 0x54, 0x88, 0x12,
+		0x62, 0x9c, 0x7a, 0x08, 0xd8, 0x54, 0xee, 0x34,
+	};
+	const size_t n = sizeof(tbl);
+	CYBOZU_TEST_EQUAL(c.getSize(), n); // total emitted size must match the table
+	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // and so must every byte
+}
+
+CYBOZU_TEST_AUTO(encodekey) // byte-exact check of the code emitted for encodekey128/encodekey256
+{
+	struct Code : Xbyak::CodeGenerator {
+		Code() // emits each mnemonic with four register pairings
+		{
+			encodekey128(eax, ebx);
+			encodekey128(eax, r8d);
+			encodekey128(r8d, ebx);
+			encodekey128(r30d, r29d);
+
+			encodekey256(eax, ebx);
+			encodekey256(eax, r8d);
+			encodekey256(r8d, ebx);
+			encodekey256(r30d, r29d);
+		}
+	} c;
+	const uint8_t tbl[] = { // expected bytes: one row per call above, in emission order
+		// encodekey128
+		0xf3, 0x0f, 0x38, 0xfa, 0xc3, // (eax, ebx)
+		0x62, 0xd4, 0x7e, 0x08, 0xda, 0xc0, // (eax, r8d)
+		0x62, 0x74, 0x7e, 0x08, 0xda, 0xc3, // (r8d, ebx)
+		0x62, 0x4c, 0x7e, 0x08, 0xda, 0xf5, // (r30d, r29d)
+		// encodekey256
+		0xf3, 0x0f, 0x38, 0xfb, 0xc3, // (eax, ebx)
+		0x62, 0xd4, 0x7e, 0x08, 0xdb, 0xc0, // (eax, r8d)
+		0x62, 0x74, 0x7e, 0x08, 0xdb, 0xc3, // (r8d, ebx)
+		0x62, 0x4c, 0x7e, 0x08, 0xdb, 0xf5, // (r30d, r29d)
+	};
+	const size_t n = sizeof(tbl);
+	CYBOZU_TEST_EQUAL(c.getSize(), n); // total emitted size must match the table
+	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // and so must every byte
+}
+
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 1ca44ac..8c633ca 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -155,7 +155,7 @@
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x7000 /* 0xABCD = A.BC(.D) */
+	VERSION = 0x7010 /* 0xABCD = A.BC(.D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
@@ -231,6 +231,7 @@
 	ERR_INVALID_ZU,
 	ERR_CANT_USE_REX2,
 	ERR_INVALID_DFV,
+	ERR_INVALID_REG_IDX,
 	ERR_INTERNAL // Put it at last.
 };
 
@@ -288,6 +289,7 @@
 		"invalid ZU",
 		"can't use rex2",
 		"invalid dfv",
+		"invalid reg index",
 		"internal error"
 	};
 	assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl));
@@ -2712,6 +2714,47 @@
 		opVex(t1, &tmm0, addr2, type, code);
 	}
 #endif
+	// (reg32e/mem, k) if rev else (k, k/mem/reg32e)
+	// size = 8, 16, 32, 64
+	void opKmov(const Opmask& k, const Operand& op, bool rev, int size)
+	{
+		int code = 0; // 0 means "no valid opcode found yet"
+		bool isReg = op.isREG(size < 64 ? 32 : 64); // a 64-bit register is accepted only for size 64, a 32-bit one otherwise
+		if (rev) {
+			code = isReg ? 0x93 : op.isMEM() ? 0x91 : 0; // reversed form: 0x93 for a register operand, 0x91 for memory
+		} else {
+			code = op.isOPMASK() || op.isMEM() ? 0x90 : isReg ? 0x92 : 0; // forward form: 0x90 for opmask/memory, 0x92 for a register
+		}
+		if (code == 0) XBYAK_THROW(ERR_BAD_COMBINATION) // operand kind not supported for this direction
+		uint64_t type = 0; // stays 0 if size is none of 8/16/32/64
+		switch (size) {
+		case 8:  type = T_W0|T_66; break;
+		case 16: type = T_W0; break;
+		case 32: type = isReg ? T_W0|T_F2 : T_W1|T_66; break;
+		case 64: type = isReg ? T_W1|T_F2 : T_W1; break;
+		}
+		const Operand *p1 = &k, *p2 = &op;
+		if (code == 0x93) { std::swap(p1, p2); } // opcode 0x93 takes the two operands in the opposite roles
+		if (opROO(Reg(), *p2, *p1, T_MAP1|type, code)) return; // opROO returned true: nothing more to emit
+		opVex(static_cast<const Reg&>(*p1), 0, *p2, T_L0|T_0F|type, code); // otherwise emit through opVex
+	}
+	void opAESKL(const Xmm *x, const Address& addr, uint64_t type1, uint64_t type2, uint8_t code) // x: register operand (must not be null); type1 goes to opRO, type2 to opROO
+	{
+		if (!x || x->getIdx() >= 16) XBYAK_THROW(ERR_INVALID_REG_IDX) // reject null before the dereferences below, and any register index >= 16
+		if (addr.hasRex2()) { // address reports hasRex2(): encode through opROO with type2
+			opROO(Reg(), addr, *x, type2, code);
+			return;
+		}
+		opRO(*x, addr, type1, code); // otherwise encode through opRO with type1
+	}
+	void opEncodeKey(const Reg32& r1, const Reg32& r2, uint8_t code1, uint8_t code2) // code1: opcode byte of the hand-written form, code2: opcode handed to opROO
+	{
+		if (r1.getIdx() < 8 && r2.getIdx() < 8) { // both register indices below 8: write the bytes directly
+			db(0xF3); db(0x0F); db(0x38); db(code1); setModRM(3, r1.getIdx(), r2.getIdx());
+			return;
+		}
+		opROO(Reg(), r2, r1, T_MUST_EVEX|T_F3, code2); // any index >= 8: let opROO encode it with T_MUST_EVEX|T_F3
+	}
 public:
 	unsigned int getVersion() const { return VERSION; }
 	using CodeArray::db;
@@ -3096,30 +3139,6 @@
 	// set default encoding to select Vex or Evex
 	void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; }
 
-	// (reg32e/mem, k) if rev else (k, k/mem/reg32e)
-	// size = 8, 16, 32, 64
-	void opKmov(const Opmask& k, const Operand& op, bool rev, int size)
-	{
-		int code = 0;
-		bool isReg = op.isREG(size < 64 ? 32 : 64);
-		if (rev) {
-			code = isReg ? 0x93 : op.isMEM() ? 0x91 : 0;
-		} else {
-			code = op.isOPMASK() || op.isMEM() ? 0x90 : isReg ? 0x92 : 0;
-		}
-		if (code == 0) XBYAK_THROW(ERR_BAD_COMBINATION)
-		uint64_t type = 0;
-		switch (size) {
-		case 8:  type = T_W0|T_66; break;
-		case 16: type = T_W0; break;
-		case 32: type = isReg ? T_W0|T_F2 : T_W1|T_66; break;
-		case 64: type = isReg ? T_W1|T_F2 : T_W1; break;
-		}
-		const Operand *p1 = &k, *p2 = &op;
-		if (code == 0x93) { std::swap(p1, p2); }
-		if (opROO(Reg(), *p2, *p1, T_MAP1|type, code)) return;
-		opVex(static_cast<const Reg&>(*p1), 0, *p2, T_L0|T_0F|type, code);
-	}
 	/*
 		use single byte nop if useMultiByteNop = false
 	*/
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 50b5bf0..196f5d4 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "7.00"; }
+const char *getVersionString() const { return "7.01"; }
 void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC); }
 void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38 | T_66, 0x0FC); }
 void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }
@@ -1926,6 +1926,16 @@
 void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opRRO(r1, r2, addr, T_APX|T_66|T_0F38, 0xEA); }
 void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opRRO(r1, r2, addr, T_APX|T_66|T_0F38, 0xE8); }
 void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opRRO(r1, r2, addr, T_APX|T_66|T_0F38, 0xE4); }
+void aesdec128kl(const Xmm& x, const Address& addr) { opAESKL(&x, addr, T_F3|T_0F38, T_F3|T_MUST_EVEX, 0xDD); }
+void aesdec256kl(const Xmm& x, const Address& addr) { opAESKL(&x, addr, T_F3|T_0F38, T_F3|T_MUST_EVEX, 0xDF); }
+void aesdecwide128kl(const Address& addr) { opAESKL(&xmm1, addr, T_F3|T_0F38, T_F3|T_MUST_EVEX, 0xD8); }
+void aesdecwide256kl(const Address& addr) { opAESKL(&xmm3, addr, T_F3|T_0F38, T_F3|T_MUST_EVEX, 0xD8); }
+void aesenc128kl(const Xmm& x, const Address& addr) { opAESKL(&x, addr, T_F3|T_0F38, T_F3|T_MUST_EVEX, 0xDC); }
+void aesenc256kl(const Xmm& x, const Address& addr) { opAESKL(&x, addr, T_F3|T_0F38, T_F3|T_MUST_EVEX, 0xDE); }
+void aesencwide128kl(const Address& addr) { opAESKL(&xmm0, addr, T_F3|T_0F38, T_F3|T_MUST_EVEX, 0xD8); }
+void aesencwide256kl(const Address& addr) { opAESKL(&xmm2, addr, T_F3|T_0F38, T_F3|T_MUST_EVEX, 0xD8); }
+void encodekey128(const Reg32& r1, const Reg32& r2) { opEncodeKey(r1, r2, 0xFA, 0xDA); }
+void encodekey256(const Reg32& r1, const Reg32& r2) { opEncodeKey(r1, r2, 0xFB, 0xDB); }
 void ldtilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_0F38|T_W0, 0x49); }
 void sttilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_66|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_66|T_0F38 | T_W0, 0x49); }
 void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2|T_0F38|T_W0, 0x4B); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index 950cc2d..afe536a 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -473,6 +473,12 @@
 	XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16);
 	XBYAK_DEFINE_TYPE(82, tAPX_F);
 	XBYAK_DEFINE_TYPE(83, tAVX10);
+	XBYAK_DEFINE_TYPE(84, tAESKLE);
+	XBYAK_DEFINE_TYPE(85, tWIDE_KL);
+	XBYAK_DEFINE_TYPE(86, tKEYLOCKER);
+	XBYAK_DEFINE_TYPE(87, tKEYLOCKER_WIDE);
+	XBYAK_DEFINE_TYPE(88, tSSE4a);
+	XBYAK_DEFINE_TYPE(89, tCLWB);
 
 #undef XBYAK_SPLIT_ID
 #undef XBYAK_DEFINE_TYPE
@@ -519,13 +525,14 @@
 		if (maxExtendedNum >= 0x80000001) {
 			getCpuid(0x80000001, data);
 
-			if (EDX & (1U << 31)) type_ |= t3DN;
-			if (EDX & (1U << 30)) type_ |= tE3DN;
-			if (EDX & (1U << 27)) type_ |= tRDTSCP;
-			if (EDX & (1U << 22)) type_ |= tMMX2;
-			if (EDX & (1U << 15)) type_ |= tCMOV;
 			if (ECX & (1U << 5)) type_ |= tLZCNT;
+			if (ECX & (1U << 6)) type_ |= tSSE4a;
 			if (ECX & (1U << 8)) type_ |= tPREFETCHW;
+			if (EDX & (1U << 15)) type_ |= tCMOV;
+			if (EDX & (1U << 22)) type_ |= tMMX2;
+			if (EDX & (1U << 27)) type_ |= tRDTSCP;
+			if (EDX & (1U << 30)) type_ |= tE3DN;
+			if (EDX & (1U << 31)) type_ |= t3DN;
 		}
 
 		if (maxExtendedNum >= 0x80000008) {
@@ -544,8 +551,8 @@
 		if (ECX & (1U << 25)) type_ |= tAESNI;
 		if (ECX & (1U << 26)) type_ |= tXSAVE;
 		if (ECX & (1U << 27)) type_ |= tOSXSAVE;
-		if (ECX & (1U << 30)) type_ |= tRDRAND;
 		if (ECX & (1U << 29)) type_ |= tF16C;
+		if (ECX & (1U << 30)) type_ |= tRDRAND;
 
 		if (EDX & (1U << 15)) type_ |= tCMOV;
 		if (EDX & (1U << 23)) type_ |= tMMX;
@@ -556,8 +563,8 @@
 			// check XFEATURE_ENABLED_MASK[2:1] = '11b'
 			uint64_t bv = getXfeature();
 			if ((bv & 6) == 6) {
-				if (ECX & (1U << 28)) type_ |= tAVX;
 				if (ECX & (1U << 12)) type_ |= tFMA;
+				if (ECX & (1U << 28)) type_ |= tAVX;
 				// do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
 #if !defined(__APPLE__)
 				if (((bv >> 5) & 7) == 7)
@@ -591,21 +598,23 @@
 			const uint32_t maxNumSubLeaves = EAX;
 			if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
 			if (EBX & (1U << 3)) type_ |= tBMI1;
+			if (EBX & (1U << 4)) type_ |= tHLE;
 			if (EBX & (1U << 8)) type_ |= tBMI2;
 			if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
+			if (EBX & (1U << 11)) type_ |= tRTM;
+			if (EBX & (1U << 14)) type_ |= tMPX;
 			if (EBX & (1U << 18)) type_ |= tRDSEED;
 			if (EBX & (1U << 19)) type_ |= tADX;
 			if (EBX & (1U << 20)) type_ |= tSMAP;
 			if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT;
-			if (EBX & (1U << 4)) type_ |= tHLE;
-			if (EBX & (1U << 11)) type_ |= tRTM;
-			if (EBX & (1U << 14)) type_ |= tMPX;
+			if (EBX & (1U << 24)) type_ |= tCLWB;
 			if (EBX & (1U << 29)) type_ |= tSHA;
 			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
 			if (ECX & (1U << 5)) type_ |= tWAITPKG;
 			if (ECX & (1U << 8)) type_ |= tGFNI;
 			if (ECX & (1U << 9)) type_ |= tVAES;
 			if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
+			if (ECX & (1U << 23)) type_ |= tKEYLOCKER;
 			if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
 			if (ECX & (1U << 27)) type_ |= tMOVDIRI;
 			if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
@@ -635,7 +644,13 @@
 				if (EDX & (1U << 21)) type_ |= tAPX_F;
 			}
 		}
-		if (has(tAVX10) && maxNum >= 24) {
+		if (maxNum >= 0x19) { // query leaf 0x19 only when maxNum reaches it
+			getCpuidEx(0x19, 0, data);
+			if (EBX & (1U << 0)) type_ |= tAESKLE;
+			if (EBX & (1U << 2)) type_ |= tWIDE_KL;
+			if ((type_ & (tKEYLOCKER|tAESKLE|tWIDE_KL)) == (tKEYLOCKER|tAESKLE|tWIDE_KL)) type_ |= tKEYLOCKER_WIDE; // needs all three bits set, not just any one of them
+		}
+		if (has(tAVX10) && maxNum >= 0x24) {
 			getCpuidEx(0x24, 0, data);
 			avx10version_ = EBX & mask(7);
 		}