refactor AMX tile instructions and add tests
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 73c4650..6a56614 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -2043,10 +2043,11 @@
 
 void putAMX_TILE()
 {
-	puts("void ldtilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_0F38|T_W0, 0x49); }");
-	puts("void sttilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_66|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_66|T_0F38 | T_W0, 0x49); }");
+	puts("void ldtilecfg(const Address& addr) { opAMX(tmm0, addr, T_0F38|T_W0, 0x49); }");
+	puts("void sttilecfg(const Address& addr) { opAMX(tmm0, addr,  T_66|T_0F38|T_W0, 0x49); }");
+	puts("void tilestored(const Address& addr, const Tmm& tm) { opAMX(tm, addr, T_F3|T_0F38|T_W0, 0x4B); }");
+
 	puts("void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }");
-	puts("void tilestored(const Address& addr, const Tmm& tm) { if (opROO(Reg(), addr, tm, T_APX|T_F3|T_0F38|T_W0, 0x4B)) return; opVex(tm, &tmm0, addr, T_F3|T_0F38|T_W0, 0x4B); }");
 	puts("void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }");
 }
 
@@ -2060,7 +2061,8 @@
 		{ "tileloadd", T_F2 | T_0F38 | T_W0, 0x4B },
 		{ "tileloaddt1", T_66 | T_0F38 | T_W0, 0x4B },
 		{ "tileloaddrs", T_F2 | T_0F38 | T_W0, 0x4A },
-		{ "tileloaddrst1", T_66 | T_0F38 | T_W0, 0x4A }
+		{ "tileloaddrst1", T_66 | T_0F38 | T_W0, 0x4A },
+//		{ "t2rpntlvwz0", T_0F38 | T_W0, 0x6E },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		const Tbl& t = tbl[i];
diff --git a/test/avx10/amx.txt b/test/avx10/amx.txt
index 74cf06d..46dea5f 100644
--- a/test/avx10/amx.txt
+++ b/test/avx10/amx.txt
@@ -1,9 +1,14 @@
 ldtilecfg(ptr[rax + rcx * 4 + 64]);
+ldtilecfg(ptr [r30+r29*4+0x12]);
 sttilecfg(ptr[rsp + rax * 8 + 128]);
+sttilecfg(ptr [r30+r29*4+0x12]);
 tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]);
+tileloadd(tmm2, ptr [r30+r29*4+0x12]);
 tileloaddt1(tmm4, ptr[r8 + r9 + 32]);
+tileloaddt1(tmm7, ptr [r30+r29*4+0x12]);
 tilerelease();
 tilestored(ptr[r10 + r11 * 2 + 32], tmm2);
+tilestored(ptr [r30+r29*4+0x12], tmm1);
 tilezero(tmm7);
 tdpbssd(tmm1, tmm2, tmm3);
 tdpbsud(tmm2, tmm3, tmm4);
@@ -14,8 +19,11 @@
 tileloadd(tmm1, ptr[r8+r8]);
 tileloadd(tmm1, ptr[rax+rcx*4]);
 tileloadd(tmm1, ptr[r8+r9*1+0x40]);
+tileloadd(tmm1, ptr[r30+r29*1+0x80]);
 tileloaddrs(tmm3, ptr[rdi + rdx * 2 + 8]);
+tileloaddrs(tmm7, ptr[r31 + rdx * 2 + 8]);
 tileloaddrst1(tmm4, ptr[r8 + r9 + 32]);
+tileloaddrst1(tmm4, ptr[r25 + r9 + 32]);
 
 tdpbf8ps(tmm1, tmm2, tmm3);
 tdpbhf8ps(tmm1, tmm2, tmm3);
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 38f2468..1317544 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1923,10 +1923,10 @@
 void rdgsbase(const Reg32e& r) { opRR(ecx, r, T_F3|T_0F|T_ALLOW_DIFF_SIZE, 0xAE); }
 void wrfsbase(const Reg32e& r) { opRR(edx, r, T_F3|T_0F|T_ALLOW_DIFF_SIZE, 0xAE); }
 void wrgsbase(const Reg32e& r) { opRR(ebx, r, T_F3|T_0F|T_ALLOW_DIFF_SIZE, 0xAE); }
-void ldtilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_0F38|T_W0, 0x49); }
-void sttilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_66|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_66|T_0F38 | T_W0, 0x49); }
+void ldtilecfg(const Address& addr) { opAMX(tmm0, addr, T_0F38|T_W0, 0x49); }
+void sttilecfg(const Address& addr) { opAMX(tmm0, addr,  T_66|T_0F38|T_W0, 0x49); }
+void tilestored(const Address& addr, const Tmm& tm) { opAMX(tm, addr, T_F3|T_0F38|T_W0, 0x4B); }
 void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }
-void tilestored(const Address& addr, const Tmm& tm) { if (opROO(Reg(), addr, tm, T_APX|T_F3|T_0F38|T_W0, 0x4B)) return; opVex(tm, &tmm0, addr, T_F3|T_0F38|T_W0, 0x4B); }
 void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }
 void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2|T_0F38|T_W0, 0x5E); }
 void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3|T_0F38|T_W0, 0x5E); }