Merge gen_amx.cpp into gen_avx512.cpp; the AMX mnemonics are now emitted by the 64-bit-only pass (`gen_avx512 64`).
diff --git a/gen/Makefile b/gen/Makefile
index f9068e4..53d1a94 100644
--- a/gen/Makefile
+++ b/gen/Makefile
@@ -1,5 +1,5 @@
 TARGET=../xbyak/xbyak_mnemonic.h
-BIN=sortline gen_code gen_avx512 gen_amx
+BIN=sortline gen_code gen_avx512
 CFLAGS=-I../ -O2 -DXBYAK_NO_OP_NAMES -Wall -Wextra -Wno-missing-field-initializers
 all: $(TARGET)
 sortline: sortline.cpp
@@ -8,8 +8,6 @@
 	$(CXX) $(CFLAGS) $< -o $@
 gen_avx512: gen_avx512.cpp ../xbyak/xbyak.h avx_type.hpp
 	$(CXX) $(CFLAGS) $< -o $@
-gen_amx: gen_amx.cpp ../xbyak/xbyak.h avx_type.hpp
-	$(CXX) $(CFLAGS) $< -o $@
 
 $(TARGET): $(BIN)
 	./gen_code | ./sortline > $@
@@ -21,7 +19,6 @@
 	./gen_avx512 | ./sortline >> $@
 	echo "#ifdef XBYAK64" >> $@
 	./gen_avx512 64 | ./sortline >> $@
-	./gen_amx | ./sortline >> $@
 	echo "#endif" >> $@
 	echo "#endif" >> $@
 
diff --git a/gen/gen_amx.cpp b/gen/gen_amx.cpp
deleted file mode 100644
index 7145e83..0000000
--- a/gen/gen_amx.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#define XBYAK_DONT_READ_LIST
-#include <stdio.h>
-#include <string.h>
-#include "../xbyak/xbyak.h"
-
-using namespace Xbyak;
-#ifdef _MSC_VER
-	#pragma warning(disable : 4996) // scanf
-	#define snprintf _snprintf_s
-#endif
-
-#include "avx_type.hpp"
-
-void putAMX_TILE()
-{
-    puts("void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0 | T_TMM, 0x49); }");
-    puts("void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0 | T_TMM, 0x49); }");
-    puts("void tileloadd(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_F2 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
-    puts("void tileloaddt1(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_66 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
-    puts("void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }");
-    puts("void tilestored(const Operand& op, const Tmm& tm) { opAMX(tm, tmm0, op, T_F3 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
-    puts("void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0 | T_TMM, 0x49); }");
-}
-void putAMX_INT8()
-{
-    puts("void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
-    puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
-    puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
-    puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2,        T_0F38 | T_W0 | T_TMM, 0x5e); }");
-}
-void putAMX_BF16()
-{
-    puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5c); }");
-}
-
-int main()
-{
-	putAMX_TILE();
-	putAMX_INT8();
-	putAMX_BF16();
-}
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index a4f67d5..4fb1cce 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -730,12 +730,40 @@
 	puts("void vp4dpwssds(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x53); }");
 }
 
+void putAMX_TILE()
+{
+	puts("void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0 | T_TMM, 0x49); }");
+	puts("void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0 | T_TMM, 0x49); }");
+	puts("void tileloadd(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_F2 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
+	puts("void tileloaddt1(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_66 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
+	puts("void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }");
+	puts("void tilestored(const Operand& op, const Tmm& tm) { opAMX(tm, tmm0, op, T_F3 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
+	puts("void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0 | T_TMM, 0x49); }");
+}
+void putAMX_INT8()
+{
+	puts("void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
+	puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
+	puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
+	puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0 | T_TMM, 0x5e); }");
+}
+void putAMX_BF16()
+{
+	puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5c); }");
+}
+
+
 int main(int argc, char *[])
 {
 	bool only64bit = argc == 2;
 	putOpmask(only64bit);
 	putBroadcast(only64bit);
-	if (only64bit) return 0;
+	if (only64bit) {
+		putAMX_TILE();
+		putAMX_INT8();
+		putAMX_BF16();
+		return 0;
+	}
 	putVcmp();
 	putX_XM();
 	putM_X();
diff --git a/gen/update.bat b/gen/update.bat
index f821d85..161ed87 100644
--- a/gen/update.bat
+++ b/gen/update.bat
@@ -13,7 +13,5 @@
 gen_avx512 | %SORT% >> %TARGET%
 echo #ifdef XBYAK64>> %TARGET%
 gen_avx512 64 | %SORT% >> %TARGET%
-cl gen_amx.cpp %OPT%
-gen_amx | %SORT% >> %TARGET%
 echo #endif>> %TARGET%
 echo #endif>> %TARGET%
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 8dd733a..b2beaac 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -2033,18 +2033,18 @@
 #ifdef XBYAK64
 void kmovq(const Opmask& k, const Reg64& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W1, 0x92); }
 void kmovq(const Reg64& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W1, 0x93); }
-void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7C); }
 void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0 | T_TMM, 0x49); }
 void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0 | T_TMM, 0x49); }
 void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5c); }
 void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0 | T_TMM, 0x5e); }
 void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5e); }
 void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0 | T_TMM, 0x5e); }
-void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2,        T_0F38 | T_W0 | T_TMM, 0x5e); }
+void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0 | T_TMM, 0x5e); }
 void tileloadd(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_F2 | T_0F38 | T_W0 | T_TMM, 0x4b); }
 void tileloaddt1(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_66 | T_0F38 | T_W0 | T_TMM, 0x4b); }
 void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }
 void tilestored(const Operand& op, const Tmm& tm) { opAMX(tm, tmm0, op, T_F3 | T_0F38 | T_W0 | T_TMM, 0x4b); }
 void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0 | T_TMM, 0x49); }
+void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7C); }
 #endif
 #endif