Merge branch 'dev'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4c2de7..e064056 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 2.6...3.0.2)
 
-project(xbyak LANGUAGES CXX VERSION 6.68)
+project(xbyak LANGUAGES CXX VERSION 6.69)
 
 file(GLOB headers xbyak/*.h)
 
diff --git a/doc/changelog.md b/doc/changelog.md
index 8be3185..b97a3f0 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -1,5 +1,6 @@
 # History
 
+* 2023/Feb/20 ver 6.69 util::Cpu supports AMD CPUs. support UINTR
 * 2022/Dec/07 ver 6.68 support prefetchit{0,1}
 * 2022/Nov/30 ver 6.67 support CMPccXADD
 * 2022/Nov/25 ver 6.66 support RAO-INT
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 9568053..a074db3 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1867,6 +1867,10 @@
 		{ "stosq", 0x48, 0xAB },
 		{ "syscall", 0x0F, 0x05 },
 		{ "sysret", 0x0F, 0x07 },
+		{ "clui", 0xF3, 0x0F, 0x01, 0xEE },
+		{ "stui", 0xF3, 0x0F, 0x01, 0xEF },
+		{ "testui", 0xF3, 0x0F, 0x01, 0xED },
+		{ "uiret", 0xF3, 0x0F, 0x01, 0xEC },
 	};
 	putGeneric(tbl, NUM_OF_ARRAY(tbl));
 
@@ -1877,6 +1881,7 @@
 	puts("void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); }");
 	puts("void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); }");
 	puts("void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); }");
+	puts("void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); }");
 
 	puts("void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); }");
 	puts("void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); }");
diff --git a/meson.build b/meson.build
index 9daaa8f..53cabfd 100644
--- a/meson.build
+++ b/meson.build
@@ -5,7 +5,7 @@
 project(
 	'xbyak',
 	'cpp',
-	version: '6.68',
+	version: '6.69',
 	license: 'BSD-3-Clause',
 	default_options: 'b_ndebug=if-release'
 )
diff --git a/readme.md b/readme.md
index ae7c634..963d995 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
 
-# Xbyak 6.68 [![Badge Build]][Build Status]
+# Xbyak 6.69 [![Badge Build]][Build Status]
 
 *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
 
diff --git a/readme.txt b/readme.txt
index 819fc41..a61afc7 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
 

-    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.68

+    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 6.69

 

 -----------------------------------------------------------------------------

 ◎概要

@@ -402,6 +402,7 @@
 -----------------------------------------------------------------------------

 ◎履歴

 

+2023/02/20 ver 6.69 util::CpuがAMD対応 UINTR命令対応

 2022/12/07 ver 6.68 prefetchit{0,1}サポート

 2022/11/30 ver 6.67 CMPccXADDサポート

 2022/11/25 ver 6.66 RAO-INTサポート

diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index 96e9d21..4a8dbc7 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -88,6 +88,8 @@
 		{ Cpu::tCLDEMOTE, "cldemote" },
 		{ Cpu::tMOVDIRI, "movdiri" },
 		{ Cpu::tMOVDIR64B, "movdir64b" },
+		{ Cpu::tUINTR, "uintr" },
+		{ Cpu::tSERIALIZE, "serialize" },
 		{ Cpu::tCLZERO, "clzero" },
 		{ Cpu::tAMX_FP16, "amx_fp16" },
 		{ Cpu::tAVX_VNNI_INT8, "avx_vnni_int8" },
@@ -127,7 +129,6 @@
 		Core i7-3930K        6           2D
 	*/
 	cpu.putFamily();
-	if (!cpu.has(Cpu::tINTEL)) return;
 	for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) {
 		printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i));
 	}
diff --git a/test/misc.cpp b/test/misc.cpp
index 2090dca..a62d9c0 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -1949,6 +1949,12 @@
 			movdiri(ptr[rax+r12], r9);
 			movdiri(ptr[rax+r12*2+4], r9d);
 			movdir64b(r10, ptr[r8]);
+			clui();
+			senduipi(rax);
+			senduipi(r10);
+			stui();
+			testui();
+			uiret();
 #endif
 		}
 	} c;
@@ -1972,6 +1978,12 @@
 		0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri
 		0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri
 		0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b
+		0xf3, 0x0f, 0x01, 0xee, // clui
+		0xf3, 0x0f, 0xc7, 0xf0, // senduipi rax
+		0xf3, 0x41, 0x0f, 0xc7, 0xf2, // senduipi r10
+		0xf3, 0x0f, 0x01, 0xef, // stui
+		0xf3, 0x0f, 0x01, 0xed, // testui
+		0xf3, 0x0f, 0x01, 0xec, // uiret
 #endif
 	};
 	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
@@ -2157,4 +2169,5 @@
 	CYBOZU_TEST_EQUAL(c.getSize(), n);
 	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 }
+
 #endif
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 226c8d1..8ed0d3f 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -155,7 +155,7 @@
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x6680 /* 0xABCD = A.BC(.D) */
+	VERSION = 0x6690 /* 0xABCD = A.BC(.D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 7c74e54..4c4d655 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "6.68"; }
+const char *getVersionString() const { return "6.69"; }
 void aadd(const Address& addr, const Reg32e &reg) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
 void aand(const Address& addr, const Reg32e &reg) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
 void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
@@ -1651,6 +1651,10 @@
 void stosq() { db(0x48); db(0xAB); }
 void syscall() { db(0x0F); db(0x05); }
 void sysret() { db(0x0F); db(0x07); }
+void clui() { db(0xF3); db(0x0F); db(0x01); db(0xEE); }
+void stui() { db(0xF3); db(0x0F); db(0x01); db(0xEF); }
+void testui() { db(0xF3); db(0x0F); db(0x01); db(0xED); }
+void uiret() { db(0xF3); db(0x0F); db(0x01); db(0xEC); }
 void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); }
 void fxrstor64(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xAE); }
 void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); }
@@ -1658,6 +1662,7 @@
 void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); }
 void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); }
 void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); }
+void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); }
 void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); }
 void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); }
 void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_ER_X, 0x2D); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index da7b68b..c57e8ea 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -173,11 +173,9 @@
 	}
 	void setNumCores()
 	{
-		if (!has(tINTEL)) return;
+		if (!has(tINTEL) && !has(tAMD)) return;
 
 		uint32_t data[4] = {};
-
-		 /* CAUTION: These numbers are configuration as shipped by Intel. */
 		getCpuidEx(0x0, 0, data);
 		if (data[0] >= 0xB) {
 			 /*
@@ -211,7 +209,48 @@
 	}
 	void setCacheHierarchy()
 	{
-		if (!has(tINTEL)) return;
+		if (!has(tINTEL) && !has(tAMD)) return;
+
+		// https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288
+		if (has(tAMD)) {
+			// AMD path: always report 3 data cache levels (L1, L2, L3)
+			dataCacheLevels_ = 3;
+			const uint32_t leaf = 0x8000001D; // CPUID leaf used for modern AMD CPUs
+			// Sub leaf value ranges from 0 to 3
+			// Sub leaf value 0 refers to L1 Data Cache
+			// Sub leaf value 1 refers to L1 Instruction Cache
+			// Sub leaf value 2 refers to L2 Cache
+			// Sub leaf value 3 refers to L3 Cache
+			// For legacy AMD CPU, use leaf 0x80000005 for L1 cache
+			// and 0x80000006 for L2 and L3 cache (not implemented in this branch)
+			int cache_index = 0;
+			for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) { // sub-leaves 0..3 inclusive
+				// Skip sub_leaf = 1 as it refers to
+				// L1 Instruction Cache (not required)
+				if (sub_leaf == 1) {
+					continue;
+				}
+				uint32_t data[4] = {};
+				getCpuidEx(leaf, sub_leaf, data);
+				// Cache Size = Line Size * Partitions * Associativity * Cache Sets
+				dataCacheSize_[cache_index] =
+					(extractBit(data[1], 22, 31) + 1) // associativity (bits 22..31 hold associativity-1)
+					* (extractBit(data[1], 12, 21) + 1) // partitions (bits 12..21 hold partitions-1)
+					* (extractBit(data[1], 0, 11) + 1) // line size (bits 0..11; +1 implies the field holds size-1)
+					* (data[2] + 1); // cache sets (+1 implies the register holds sets-1)
+				// Calculate the number of cores sharing the current data cache
+				int smt_width = numCores_[0];
+				int logical_cores = numCores_[1];
+				int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1;
+				if (logical_cores != 0) {
+					actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
+				}
+				coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1); // NOTE(review): divides by smt_width (numCores_[0]); confirm it can never be 0 on AMD
+				++cache_index;
+			}
+			return;
+		}
+		// intel
 		const uint32_t NO_CACHE = 0;
 		const uint32_t DATA_CACHE = 1;
 //		const uint32_t INSTRUCTION_CACHE = 2;
@@ -417,6 +456,8 @@
 	XBYAK_DEFINE_TYPE(72, tRAO_INT);
 	XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
 	XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
+	XBYAK_DEFINE_TYPE(75, tSERIALIZE);
+	XBYAK_DEFINE_TYPE(76, tUINTR);
 
 #undef XBYAK_SPLIT_ID
 #undef XBYAK_DEFINE_TYPE
@@ -551,9 +592,11 @@
 			if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
 			if (ECX & (1U << 27)) type_ |= tMOVDIRI;
 			if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
+			if (EDX & (1U << 5)) type_ |= tUINTR;
+			if (EDX & (1U << 14)) type_ |= tSERIALIZE;
+			if (EDX & (1U << 22)) type_ |= tAMX_BF16;
 			if (EDX & (1U << 24)) type_ |= tAMX_TILE;
 			if (EDX & (1U << 25)) type_ |= tAMX_INT8;
-			if (EDX & (1U << 22)) type_ |= tAMX_BF16;
 			if (maxNumSubLeaves >= 1) {
 				getCpuidEx(7, 1, data);
 				if (EAX & (1U << 3)) type_ |= tRAO_INT;