add AVX VNNI instruction support
diff --git a/gen/Makefile b/gen/Makefile
index 53d1a94..70d5191 100644
--- a/gen/Makefile
+++ b/gen/Makefile
@@ -1,5 +1,5 @@
 TARGET=../xbyak/xbyak_mnemonic.h
-BIN=sortline gen_code gen_avx512
+BIN=sortline gen_code gen_avx512 gen_vnni
 CFLAGS=-I../ -O2 -DXBYAK_NO_OP_NAMES -Wall -Wextra -Wno-missing-field-initializers
 all: $(TARGET)
 sortline: sortline.cpp
@@ -8,6 +8,8 @@
 	$(CXX) $(CFLAGS) $< -o $@
 gen_avx512: gen_avx512.cpp ../xbyak/xbyak.h avx_type.hpp
 	$(CXX) $(CFLAGS) $< -o $@
+gen_vnni: gen_vnni.cpp ../xbyak/xbyak.h avx_type.hpp # generator binary; only the first prerequisite is compiled, the two headers are listed so edits to them trigger a rebuild
+	$(CXX) $(CFLAGS) $< -o $@
 
 $(TARGET): $(BIN)
 	./gen_code | ./sortline > $@
@@ -21,6 +23,11 @@
 	./gen_avx512 64 | ./sortline >> $@
 	echo "#endif" >> $@
 	echo "#endif" >> $@
+	echo "#ifdef XBYAK_DISABLE_AVX512" >> $@
+	./gen_vnni vexOnly | ./sortline >> $@
+	echo "#else" >> $@
+	./gen_vnni | ./sortline >> $@
+	echo "#endif" >> $@
 
 clean:
 	$(RM) $(BIN) $(TARGET)
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 250c8d4..b2b88c3 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -363,12 +363,6 @@
 		{ 0x73, "vpshrdvd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
 		{ 0x73, "vpshrdvq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_B64, false },
 
-		{ 0x50, "vpdpbusd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
-		{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
-
-		{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
-		{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
-
 		{ 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
 		{ 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
 	};
diff --git a/gen/gen_vnni.cpp b/gen/gen_vnni.cpp
new file mode 100644
index 0000000..79564aa
--- /dev/null
+++ b/gen/gen_vnni.cpp
@@ -0,0 +1,41 @@
+#define XBYAK_DONT_READ_LIST
+#include <stdio.h>
+#include <string.h>
+#include "../xbyak/xbyak.h"
+#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0]))
+
+using namespace Xbyak;
+#ifdef _MSC_VER
+	#pragma warning(disable : 4996) // scanf
+	#define snprintf _snprintf_s
+#endif
+
+#include "avx_type.hpp"
+
+void putVNNI(bool vexEncodingOnly) // print one xbyak mnemonic definition per VNNI instruction to stdout
+{
+	const struct Tbl {
+		uint8_t code; // printed as the 0x%02X argument of the generated opAVX_X_X_XM call
+		const char *name; // name of the generated member function
+		int type; // T_* flags, rendered to text by type2String()
+	} tbl[] = {
+		{ 0x50, "vpdpbusd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+		{ 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+		{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+		{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32},
+	};
+	const char *encParam = vexEncodingOnly ? "" : ", preferred_encoding_t encoding = DEFAULT"; // extra parameter of the generated function
+	const char *prefFlag = vexEncodingOnly ? "" : " | T_PREF_EVEX"; // appended to the printed type flags
+	const char *encArg = vexEncodingOnly ? "" : ", encoding"; // forwarded as the last argument of the generated call
+	for (const Tbl *p = tbl; p != tbl + NUM_OF_ARRAY(tbl); p++) {
+		const std::string type = type2String(p->type);
+		printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s%s, 0x%02X, NONE%s); }\n",
+			p->name, encParam, type.c_str(), prefFlag, p->code, encArg);
+	}
+}
+
+int main(int argc, char *argv[]) // usage: gen_vnni [vexOnly]; writes the VNNI mnemonic definitions to stdout
+{
+	if (argc > 2 || (argc == 2 && strcmp(argv[1], "vexOnly") != 0)) { fprintf(stderr, "usage: gen_vnni [vexOnly]\n"); return 1; } // reject anything but the literal "vexOnly" instead of silently picking a variant
+	putVNNI(argc == 2); // after the check above, argc == 2 means argv[1] is "vexOnly"
+}
diff --git a/gen/update.bat b/gen/update.bat
index 161ed87..c9e5537 100644
--- a/gen/update.bat
+++ b/gen/update.bat
@@ -15,3 +15,9 @@
 gen_avx512 64 | %SORT% >> %TARGET%
 echo #endif>> %TARGET%
 echo #endif>> %TARGET%
+cl gen_vnni.cpp %OPT%
+echo #ifdef XBYAK_DISABLE_AVX512>> %TARGET%
+gen_vnni vexOnly | %SORT% >> %TARGET%
+echo #else>> %TARGET%
+gen_vnni | %SORT% >> %TARGET%
+echo #endif>> %TARGET%
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 85e8bed..202cbac 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1883,10 +1883,6 @@
 void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8B); }
 void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xC4); }
 void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xC4); }
-void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x50); }
-void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x51); }
-void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); }
-void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x53); }
 void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8D); }
 void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x75); }
 void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x76); }
@@ -2048,3 +2044,14 @@
 void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7C); }
 #endif
 #endif
+#ifdef XBYAK_DISABLE_AVX512
+void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, NONE); }
+void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, NONE); }
+void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, NONE); }
+void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, NONE); }
+#else
+void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, preferred_encoding_t encoding = DEFAULT) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | T_PREF_EVEX, 0x50, NONE, encoding); }
+void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, preferred_encoding_t encoding = DEFAULT) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | T_PREF_EVEX, 0x51, NONE, encoding); }
+void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, preferred_encoding_t encoding = DEFAULT) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | T_PREF_EVEX, 0x52, NONE, encoding); }
+void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, preferred_encoding_t encoding = DEFAULT) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | T_PREF_EVEX, 0x53, NONE, encoding); }
+#endif