remove T_TMM
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 4fb1cce..b7966bc 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -732,27 +732,26 @@
 
 void putAMX_TILE()
 {
-	puts("void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0 | T_TMM, 0x49); }");
-	puts("void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0 | T_TMM, 0x49); }");
-	puts("void tileloadd(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_F2 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
-	puts("void tileloaddt1(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_66 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
+	puts("void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0, 0x49); }");
+	puts("void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }");
+	puts("void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_F2 | T_0F38 | T_W0, 0x4b); }");
+	puts("void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_66 | T_0F38 | T_W0, 0x4b); }");
 	puts("void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }");
-	puts("void tilestored(const Operand& op, const Tmm& tm) { opAMX(tm, tmm0, op, T_F3 | T_0F38 | T_W0 | T_TMM, 0x4b); }");
-	puts("void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0 | T_TMM, 0x49); }");
+	puts("void tilestored(const Address& addr, const Tmm& tm) { opAMX(tm, tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }");
+	puts("void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }");
 }
 void putAMX_INT8()
 {
-	puts("void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
-	puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
-	puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0 | T_TMM, 0x5e); }");
-	puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0 | T_TMM, 0x5e); }");
+	puts("void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }");
+	puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }");
+	puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }");
+	puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0, 0x5e); }");
 }
 void putAMX_BF16()
 {
-	puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5c); }");
+	puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }");
 }
 
-
 int main(int argc, char *[])
 {
 	bool only64bit = argc == 2;
@@ -762,8 +761,8 @@
 		putAMX_TILE();
 		putAMX_INT8();
 		putAMX_BF16();
-		return 0;
 	}
+	if (only64bit) return 0;
 	putVcmp();
 	putX_XM();
 	putM_X();
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index d57a996..6acd31f 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -1595,7 +1595,6 @@
 		T_M_K = 1 << 28, // mem{k}
 		T_VSIB = 1 << 29,
 		T_MEM_EVEX = 1 << 30, // use evex if mem
-		T_TMM = 1 << 31,
 		T_XXX
 	};
 	void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false)
@@ -2263,18 +2262,10 @@
 		}
 		throw Error(ERR_BAD_COMBINATION);
 	}
-	void opAMX(const Tmm& t1, const Operand& op1, const Operand& op2, int type, int code0, int imm8 = NONE)
+	void opAMX(const Tmm& t1, const Tmm& t2, const Operand& op, int type, int code0, int imm8 = NONE)
 	{
-		const Reg *t2 = static_cast<const Reg*>(&op1);
-		const Operand *op = &op2;
-		if (op2.isNone()) { // <i>(t1, op1) -> <i>(t1, t1, op1)
-			t2 = &t1;
-			op = &op1;
-		}
-		// <i>(t1, t2, op)
-		if (!((type & T_TMM) && (t1.isTMM() && t2->isTMM()))) throw Error(ERR_BAD_COMBINATION);
-
-		opVex(t1, t2, *op, type, code0, imm8);
+		if (!t1.isTMM() || !t2.isTMM()) throw Error(ERR_BAD_COMBINATION);
+		opVex(t1, &t2, op, type, code0, imm8);
 	}
 public:
 	unsigned int getVersion() const { return VERSION; }
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index b2beaac..17f0909 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -2033,18 +2033,18 @@
 #ifdef XBYAK64
 void kmovq(const Opmask& k, const Reg64& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W1, 0x92); }
 void kmovq(const Reg64& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W1, 0x93); }
-void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0 | T_TMM, 0x49); }
-void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0 | T_TMM, 0x49); }
-void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5c); }
-void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0 | T_TMM, 0x5e); }
-void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0 | T_TMM, 0x5e); }
-void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0 | T_TMM, 0x5e); }
-void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0 | T_TMM, 0x5e); }
-void tileloadd(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_F2 | T_0F38 | T_W0 | T_TMM, 0x4b); }
-void tileloaddt1(const Tmm& tm, const Operand& op) { opAMX(tm, tmm0, op, T_66 | T_0F38 | T_W0 | T_TMM, 0x4b); }
+void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0, 0x49); }
+void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
+void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
+void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }
+void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
+void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
+void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0, 0x5e); }
+void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
+void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_66 | T_0F38 | T_W0, 0x4b); }
 void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }
-void tilestored(const Operand& op, const Tmm& tm) { opAMX(tm, tmm0, op, T_F3 | T_0F38 | T_W0 | T_TMM, 0x4b); }
-void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0 | T_TMM, 0x49); }
+void tilestored(const Address& addr, const Tmm& tm) { opAMX(tm, tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }
+void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }
 void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7C); }
 #endif
 #endif