Throw an exception for AMX sibmem address forms that are not supported
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index b7966bc..3b4d1b0 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -732,24 +732,24 @@
 
 void putAMX_TILE()
 {
-	puts("void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0, 0x49); }");
-	puts("void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }");
-	puts("void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_F2 | T_0F38 | T_W0, 0x4b); }");
-	puts("void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_66 | T_0F38 | T_W0, 0x4b); }");
+	puts("void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }");
+	puts("void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }");
+	puts("void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }");
+	puts("void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66 | T_0F38 | T_W0, 0x4b); }");
 	puts("void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }");
-	puts("void tilestored(const Address& addr, const Tmm& tm) { opAMX(tm, tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }");
-	puts("void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }");
+	puts("void tilestored(const Address& addr, const Tmm& tm) { opVex(tm, &tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }");
+	puts("void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }");
 }
 void putAMX_INT8()
 {
-	puts("void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }");
-	puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }");
-	puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }");
-	puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0, 0x5e); }");
+	puts("void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }");
+	puts("void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }");
+	puts("void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }");
+	puts("void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }");
 }
 void putAMX_BF16()
 {
-	puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }");
+	puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }");
 }
 
 int main(int argc, char *[])
@@ -761,8 +761,8 @@
 		putAMX_TILE();
 		putAMX_INT8();
 		putAMX_BF16();
+		return 0;
 	}
-	if (only64bit) return 0;
 	putVcmp();
 	putX_XM();
 	putM_X();
diff --git a/test/misc.cpp b/test/misc.cpp
index 23a6a2b..dbebf2f 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -751,4 +751,63 @@
 	CYBOZU_TEST_EQUAL(c.getSize(), n);
 	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 }
+
+CYBOZU_TEST_AUTO(AMX)
+{
+	struct Code : Xbyak::CodeGenerator {
+		Code()
+		{
+			ldtilecfg(ptr[rax + rcx * 4 + 64]);
+			sttilecfg(ptr[rsp + rax * 8 + 128]);
+			tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]);
+			tileloaddt1(tmm4, ptr[r8 + r9 + 32]);
+			tilerelease();
+			tilestored(ptr[r10 + r11 * 2 + 32], tmm2);
+			tilezero(tmm7);
+			tdpbssd(tmm1, tmm2, tmm3);
+			tdpbsud(tmm2, tmm3, tmm4);
+			tdpbusd(tmm3, tmm4, tmm5);
+			tdpbuud(tmm4, tmm5, tmm6);
+			tdpbf16ps(tmm5, tmm6, tmm7);
+		}
+	} c;
+	// generated code by patch
+	const uint8_t tbl[] = {
+		0xc4, 0xe2, 0x78, 0x49, 0x44, 0x88, 0x40, 0xc4, 0xe2, 0x79, 0x49, 0x84, 0xc4, 0x80, 0x00, 0x00,
+		0x00, 0xc4, 0xe2, 0x7b, 0x4b, 0x5c, 0x57, 0x08, 0xc4, 0x82, 0x79, 0x4b, 0x64, 0x08, 0x20, 0xc4,
+		0xe2, 0x78, 0x49, 0xc0, 0xc4, 0x82, 0x7a, 0x4b, 0x54, 0x5a, 0x20, 0xc4, 0xe2, 0x7b, 0x49, 0xf8,
+		0xc4, 0xe2, 0x63, 0x5e, 0xca, 0xc4, 0xe2, 0x5a, 0x5e, 0xd3, 0xc4, 0xe2, 0x51, 0x5e, 0xdc, 0xc4,
+		0xe2, 0x48, 0x5e, 0xe5, 0xc4, 0xe2, 0x42, 0x5c, 0xee,
+	};
+	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
+	CYBOZU_TEST_EQUAL(c.getSize(), n);
+	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
+}
+
+CYBOZU_TEST_AUTO(tileloadd)
+{
+	struct Code : Xbyak::CodeGenerator {
+		Code()
+		{
+			tileloadd(tmm1, ptr[r8+r8]);
+			tileloadd(tmm1, ptr[rax+rcx*4]);
+			tileloadd(tmm1, ptr[r8+r9*1+0x40]);
+		}
+		void notSupported()
+		{
+			tileloadd(tmm1, ptr[r8]);
+		}
+	} c;
+	const uint8_t tbl[] = {
+		0xC4, 0x82, 0x7B, 0x4B, 0x0C, 0x00,
+		0xC4, 0xE2, 0x7B, 0x4B, 0x0C, 0x88,
+		0xC4, 0x82, 0x7B, 0x4B, 0x4C, 0x08, 0x40,
+	};
+	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
+	CYBOZU_TEST_EQUAL(c.getSize(), n);
+	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
+
+	// current version does not support this sibmem format
+	CYBOZU_TEST_EXCEPTION(c.notSupported(), std::exception);
+}
 #endif
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 6acd31f..8e31209 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -194,6 +194,7 @@
 	ERR_INVALID_RIP_IN_AUTO_GROW,
 	ERR_INVALID_MIB_ADDRESS,
 	ERR_X2APIC_IS_NOT_SUPPORTED,
+	ERR_NOT_SUPPORTED,
 	ERR_INTERNAL // Put it at last.
 };
 
@@ -255,6 +256,7 @@
 			"invalid rip in AutoGrow",
 			"invalid mib address",
 			"x2APIC is not supported",
+			"not supported",
 			"internal error"
 		};
 		assert(err_ <= ERR_INTERNAL);
@@ -682,9 +684,11 @@
 	Zmm operator|(const EvexModifierRounding& emr) const { Zmm r(*this); r.setRounding(emr.rounding); return r; }
 };
 
+#ifdef XBYAK64
 struct Tmm : public Reg {
 	explicit Tmm(int idx = 0, Kind kind = Operand::TMM, int bit = 8192) : Reg(idx, kind, bit) { }
 };
+#endif
 
 struct Opmask : public Reg {
 	explicit Opmask(int idx = 0) : Reg(idx, Operand::OPMASK, 64) {}
@@ -2262,11 +2266,14 @@
 		}
 		throw Error(ERR_BAD_COMBINATION);
 	}
-	void opAMX(const Tmm& t1, const Tmm& t2, const Operand& op, int type, int code0, int imm8 = NONE)
+#ifdef XBYAK64
+	void opAMX(const Tmm& t1, const Address& addr, int type, int code0)
 	{
-		if (!t1.isTMM() || !t2.isTMM()) throw Error(ERR_BAD_COMBINATION);
-		opVex(t1, &t2, op, type, code0, imm8);
+		// addressing without an index register, such as ptr[r8], is not supported
+		if (addr.getRegExp().getIndex().getBit() == 0) throw Error(ERR_NOT_SUPPORTED);
+		opVex(t1, &tmm0, addr, type, code0);
 	}
+#endif
 public:
 	unsigned int getVersion() const { return VERSION; }
 	using CodeArray::db;
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 17f0909..69ccdda 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -2033,18 +2033,18 @@
 #ifdef XBYAK64
 void kmovq(const Opmask& k, const Reg64& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W1, 0x92); }
 void kmovq(const Reg64& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W1, 0x93); }
-void ldtilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_0F38 | T_W0, 0x49); }
-void sttilecfg(const Address& addr) { opAMX(tmm0, tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
-void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
-void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }
-void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
-void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
-void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opAMX(x1, x3, x2, T_0F38 | T_W0, 0x5e); }
-void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
-void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, tmm0, addr, T_66 | T_0F38 | T_W0, 0x4b); }
+void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }
+void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
+void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
+void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }
+void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
+void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
+void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }
+void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
+void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66 | T_0F38 | T_W0, 0x4b); }
 void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }
-void tilestored(const Address& addr, const Tmm& tm) { opAMX(tm, tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }
-void tilezero(const Tmm& Tmm) { opAMX(Tmm, tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }
+void tilestored(const Address& addr, const Tmm& tm) { opVex(tm, &tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }
+void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }
 void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7C); }
 #endif
 #endif