Add AMX instructions ttcmmimfp16ps, ttcmmrlfp16ps, ttdpbf16ps, ttdpfp16ps, ttmmultf32ps, and ttransposed
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 3e93c3e..dfa9bb1 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp
@@ -2051,6 +2051,7 @@ puts("void tilezero(const Tmm& t) { opVex(t, &tmm0, tmm0, T_F2|T_0F38|T_W0, 0x49); }"); puts("void tconjtfp16(const Tmm& t1, const Tmm& t2) { opVex(t1, 0, t2, T_66|T_0F38|T_W0, 0x6B); }"); + puts("void ttransposed(const Tmm& t1, const Tmm& t2) { opVex(t1, 0, t2, T_F3|T_0F38|T_W0, 0x5F); }"); } void putAMX_TM() @@ -2101,7 +2102,11 @@ { "tcmmimfp16ps", T_66 | T_0F38 | T_W0, 0x6C }, { "tcmmrlfp16ps", T_0F38 | T_W0, 0x6C }, { "tconjtcmmimfp16ps", T_0F38 | T_W0, 0x6B }, - + { "ttcmmimfp16ps", T_F2 | T_0F38 | T_W0, 0x6B }, + { "ttcmmrlfp16ps", T_F3 | T_0F38 | T_W0, 0x6B }, + { "ttdpbf16ps", T_F3 | T_0F38 | T_W0, 0x6C }, + { "ttdpfp16ps", T_F2 | T_0F38 | T_W0, 0x6C }, + { "ttmmultf32ps", T_0F38 | T_W0, 0x48 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& t = tbl[i];
diff --git a/test/avx10/amx.txt b/test/avx10/amx.txt index 9f73573..bc50a7f 100644 --- a/test/avx10/amx.txt +++ b/test/avx10/amx.txt
@@ -77,3 +77,14 @@ tilemovrow(zmm1, tmm2, r30d); tilemovrow(zmm29, tmm2, 0x12); + +ttcmmimfp16ps(tmm1, tmm2, tmm3); +ttcmmrlfp16ps(tmm1, tmm2, tmm3); + +ttdpbf16ps(tmm1, tmm2, tmm3); +ttdpfp16ps(tmm1, tmm2, tmm3); + +ttmmultf32ps(tmm1, tmm2, tmm3); + +ttransposed(tmm1, tmm2); +
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index b4a4b92..56f3c11 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h
@@ -1937,6 +1937,11 @@ void tcmmimfp16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66|T_0F38|T_W0, 0x6C); } void tcmmrlfp16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38|T_W0, 0x6C); } void tconjtcmmimfp16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38|T_W0, 0x6B); } +void ttcmmimfp16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2|T_0F38|T_W0, 0x6B); } +void ttcmmrlfp16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3|T_0F38|T_W0, 0x6B); } +void ttdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3|T_0F38|T_W0, 0x6C); } +void ttdpfp16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2|T_0F38|T_W0, 0x6C); } +void ttmmultf32ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38|T_W0, 0x48); } void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2|T_0F38|T_W0, 0x4B); } void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66|T_0F38|T_W0, 0x4B); } void tileloaddrs(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2|T_0F38|T_W0, 0x4A); } @@ -1955,6 +1960,7 @@ void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); } void tilezero(const Tmm& t) { opVex(t, &tmm0, tmm0, T_F2|T_0F38|T_W0, 0x49); } void tconjtfp16(const Tmm& t1, const Tmm& t2) { opVex(t1, 0, t2, T_66|T_0F38|T_W0, 0x6B); } +void ttransposed(const Tmm& t1, const Tmm& t2) { opVex(t1, 0, t2, T_F3|T_0F38|T_W0, 0x5F); } #else void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); } void jcxz(const Label& label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }