Merge branch 'akharito/adl_support' of https://github.com/akharito/xbyak into akharito-akharito/adl_support
diff --git a/gen/Makefile b/gen/Makefile
index 53d1a94..70d5191 100644
--- a/gen/Makefile
+++ b/gen/Makefile
@@ -1,5 +1,5 @@
 TARGET=../xbyak/xbyak_mnemonic.h
-BIN=sortline gen_code gen_avx512
+BIN=sortline gen_code gen_avx512 gen_vnni
 CFLAGS=-I../ -O2 -DXBYAK_NO_OP_NAMES -Wall -Wextra -Wno-missing-field-initializers
 all: $(TARGET)
 sortline: sortline.cpp
@@ -8,6 +8,8 @@
 	$(CXX) $(CFLAGS) $< -o $@
 gen_avx512: gen_avx512.cpp ../xbyak/xbyak.h avx_type.hpp
 	$(CXX) $(CFLAGS) $< -o $@
+gen_vnni: gen_vnni.cpp ../xbyak/xbyak.h avx_type.hpp
+	$(CXX) $(CFLAGS) $< -o $@
 
 $(TARGET): $(BIN)
 	./gen_code | ./sortline > $@
@@ -21,6 +23,11 @@
 	./gen_avx512 64 | ./sortline >> $@
 	echo "#endif" >> $@
 	echo "#endif" >> $@
+	echo "#ifdef XBYAK_DISABLE_AVX512" >> $@
+	./gen_vnni vexOnly | ./sortline >> $@
+	echo "#else" >> $@
+	./gen_vnni | ./sortline >> $@
+	echo "#endif" >> $@
 
 clean:
 	$(RM) $(BIN) $(TARGET)
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 250c8d4..b2b88c3 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -363,12 +363,6 @@
 		{ 0x73, "vpshrdvd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
 		{ 0x73, "vpshrdvq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, false },
 
-		{ 0x50, "vpdpbusd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
-		{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
-
-		{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
-		{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
-
 		{ 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
 		{ 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
 	};
diff --git a/gen/gen_vnni.cpp b/gen/gen_vnni.cpp
new file mode 100644
index 0000000..79564aa
--- /dev/null
+++ b/gen/gen_vnni.cpp
@@ -0,0 +1,41 @@
+#define XBYAK_DONT_READ_LIST
+#include <stdio.h>
+#include <string.h>
+#include "../xbyak/xbyak.h"
+#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0]))
+
+using namespace Xbyak;
+#ifdef _MSC_VER
+	#pragma warning(disable : 4996) // scanf
+	#define snprintf _snprintf_s
+#endif
+
+#include "avx_type.hpp"
+
+void putVNNI(bool vexEncodingOnly)
+{
+	const struct Tbl {
+		uint8_t code;
+		const char *name;
+		int type;
+	} tbl[] = {
+		{ 0x50, "vpdpbusd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+		{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+
+		{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+		{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+	};
+	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+		const Tbl *p = &tbl[i];
+		std::string type = type2String(p->type);
+		printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s%s, 0x%02X, NONE%s); }\n"
+			, p->name, !vexEncodingOnly ? ", preferred_encoding_t encoding = DEFAULT" : "", type.c_str()
+			, !vexEncodingOnly ? " | T_PREF_EVEX" : "", p->code, !vexEncodingOnly ? ", encoding" : "");
+	}
+}
+
+int main(int argc, char *[])
+{
+	bool vexEncodingOnly = argc == 2;
+	putVNNI(vexEncodingOnly);
+}
diff --git a/gen/update.bat b/gen/update.bat
index 161ed87..c9e5537 100644
--- a/gen/update.bat
+++ b/gen/update.bat
@@ -15,3 +15,9 @@
 gen_avx512 64 | %SORT% >> %TARGET%
 echo #endif>> %TARGET%
 echo #endif>> %TARGET%
+cl gen_vnni.cpp %OPT%
+echo #ifdef XBYAK_DISABLE_AVX512>> %TARGET%
+gen_vnni vexOnly | %SORT% >> %TARGET%
+echo #else>> %TARGET%
+gen_vnni | %SORT% >> %TARGET%
+echo #endif>> %TARGET%
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index c388492..ce2cb9a 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -1543,6 +1543,8 @@
 	return mgr->getCode() + offset;
 }
 
+typedef enum preferred_encoding_t_ { VEX, DEFAULT } preferred_encoding_t;
+
 class CodeGenerator : public CodeArray {
 public:
 	enum LabelType {
@@ -1652,6 +1654,7 @@
 		T_M_K = 1 << 28, // mem{k}
 		T_VSIB = 1 << 29,
 		T_MEM_EVEX = 1 << 30, // use evex if mem
+		T_PREF_EVEX = 1 << 31, // generate EVEX if preferred_encoding = DEFAULT for AVX512
 		T_XXX
 	};
 	void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false)
@@ -1691,7 +1694,7 @@
 	}
 	int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32_t VL = 0, bool Hi16Vidx = false)
 	{
-		if (!(type & (T_EVEX | T_MUST_EVEX))) XBYAK_THROW_RET(ERR_EVEX_IS_INVALID, 0)
+		if (!(type & (T_EVEX | T_MUST_EVEX | T_PREF_EVEX))) XBYAK_THROW_RET(ERR_EVEX_IS_INVALID, 0)
 		int w = (type & T_EW1) ? 1 : 0;
 		uint32_t mm = (type & T_0F) ? 1 : (type & T_0F38) ? 2 : (type & T_0F3A) ? 3 : 0;
 		uint32_t pp = (type & T_66) ? 1 : (type & T_F3) ? 2 : (type & T_F2) ? 3 : 0;
@@ -2128,8 +2131,15 @@
 	{
 		db(code1); db(code2 | reg.getIdx());
 	}
-	void opVex(const Reg& r, const Operand *p1, const Operand& op2, int type, int code, int imm8 = NONE)
+	void opVex(const Reg& r, const Operand *p1, const Operand& op2, int type, int code, int imm8 = NONE, preferred_encoding_t encoding_ = DEFAULT)
 	{
+#ifdef XBYAK_DISABLE_AVX512
+		preferred_encoding_t encoding = VEX;
+#else
+		preferred_encoding_t encoding = encoding_;
+#endif
+		if ((encoding == VEX) && ((type & T_MUST_EVEX) || (r.hasEvex() || p1->hasEvex() || op2.hasEvex()))) XBYAK_THROW(ERR_BAD_COMBINATION);
+
 		if (op2.isMEM()) {
 			const Address& addr = op2.getAddress();
 			const RegExp& regExp = addr.getRegExp();
@@ -2138,7 +2148,7 @@
 			if (BIT == 64 && addr.is32bit()) db(0x67);
 			int disp8N = 0;
 			bool x = index.isExtIdx();
-			if ((type & (T_MUST_EVEX|T_MEM_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) {
+			if ((encoding == DEFAULT) && ((type & (T_MUST_EVEX | T_PREF_EVEX | T_MEM_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx())) {
 				int aaa = addr.getOpmaskIdx();
 				if (aaa && !(type & T_M_K)) XBYAK_THROW(ERR_INVALID_OPMASK_WITH_MEMORY)
 				bool b = false;
@@ -2154,7 +2164,7 @@
 			opAddr(addr, r.getIdx(), (imm8 != NONE) ? 1 : 0, disp8N, (type & T_VSIB) != 0);
 		} else {
 			const Reg& base = op2.getReg();
-			if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || base.hasEvex()) {
+			if ((encoding == DEFAULT) && ((type & (T_MUST_EVEX | T_PREF_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || base.hasEvex())) {
 				evex(r, base, p1, type, code);
 			} else {
 				vex(r, base, p1, type, code);
@@ -2175,7 +2185,7 @@
 		type |= (bit == 64) ? T_W1 : T_W0;
 		opVex(r, p1, *p2, type, code, imm8);
 	}
-	void opAVX_X_X_XM(const Xmm& x1, const Operand& op1, const Operand& op2, int type, int code0, int imm8 = NONE)
+	void opAVX_X_X_XM(const Xmm& x1, const Operand& op1, const Operand& op2, int type, int code0, int imm8 = NONE, preferred_encoding_t encoding = DEFAULT)
 	{
 		const Xmm *x2 = static_cast<const Xmm*>(&op1);
 		const Operand *op = &op2;
@@ -2185,7 +2195,7 @@
 		}
 		// (x1, x2, op)
 		if (!((x1.isXMM() && x2->isXMM()) || ((type & T_YMM) && ((x1.isYMM() && x2->isYMM()) || (x1.isZMM() && x2->isZMM()))))) XBYAK_THROW(ERR_BAD_COMBINATION)
-		opVex(x1, x2, *op, type, code0, imm8);
+		opVex(x1, x2, *op, type, code0, imm8, encoding);
 	}
 	void opAVX_K_X_XM(const Opmask& k, const Xmm& x2, const Operand& op3, int type, int code0, int imm8 = NONE)
 	{
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 85e8bed..202cbac 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1883,10 +1883,6 @@
 void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8B); }
 void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xC4); }
 void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xC4); }
-void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x50); }
-void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x51); }
-void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); }
-void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x53); }
 void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8D); }
 void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x75); }
 void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x76); }
@@ -2048,3 +2044,14 @@
 void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7C); }
 #endif
 #endif
+#ifdef XBYAK_DISABLE_AVX512
+void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, NONE); }
+void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, NONE); }
+void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, NONE); }
+void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, NONE); }
+#else
+void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, preferred_encoding_t encoding = DEFAULT) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | T_PREF_EVEX, 0x50, NONE, encoding); }
+void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, preferred_encoding_t encoding = DEFAULT) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | T_PREF_EVEX, 0x51, NONE, encoding); }
+void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, preferred_encoding_t encoding = DEFAULT) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | T_PREF_EVEX, 0x52, NONE, encoding); }
+void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, preferred_encoding_t encoding = DEFAULT) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | T_PREF_EVEX, 0x53, NONE, encoding); }
+#endif
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index afdd7b7..649bb4c 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -368,6 +368,7 @@
 	static const Type tAMX_TILE = uint64_t(1) << 59;
 	static const Type tAMX_INT8 = uint64_t(1) << 60;
 	static const Type tAMX_BF16 = uint64_t(1) << 61;
+	static const Type tAVX_VNNI = uint64_t(1) << 62;
 
 	Cpu()
 		: type_(NONE)
@@ -384,6 +385,7 @@
 		const unsigned int& EDX = data[3];
 		getCpuid(0, data);
 		const unsigned int maxNum = EAX;
+		unsigned int maxLeaf7Subleafs;
 		static const char intel[] = "ntel";
 		static const char amd[] = "cAMD";
 		if (ECX == get32bitAsBE(amd)) {
@@ -448,6 +450,7 @@
 #endif
 				{
 					getCpuidEx(7, 0, data);
+					maxLeaf7Subleafs = EAX;
 					if (EBX & (1U << 16)) type_ |= tAVX512F;
 					if (type_ & tAVX512F) {
 						if (EBX & (1U << 17)) type_ |= tAVX512DQ;
@@ -469,16 +472,18 @@
 						if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
 						if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
 					}
-					// EAX=07H, ECX=1
-					getCpuidEx(7, 1, data);
-					if (type_ & tAVX512F) {
-						if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
+					if (maxLeaf7Subleafs >= 1) {
+						getCpuidEx(7, 1, data); // EAX=07H, ECX=1
+						if (type_ & tAVX512F) {
+							if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
+						}
 					}
 				}
 			}
 		}
 		if (maxNum >= 7) {
 			getCpuidEx(7, 0, data);
+			maxLeaf7Subleafs = EAX;
 			if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
 			if (EBX & (1U << 3)) type_ |= tBMI1;
 			if (EBX & (1U << 8)) type_ |= tBMI2;
@@ -494,6 +499,10 @@
 			if (EDX & (1U << 24)) type_ |= tAMX_TILE;
 			if (EDX & (1U << 25)) type_ |= tAMX_INT8;
 			if (EDX & (1U << 22)) type_ |= tAMX_BF16;
+			if (maxLeaf7Subleafs >= 1) {
+				getCpuidEx(7, 1, data);
+				if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
+			}
 		}
 		setFamily();
 		setNumCores();