Merge branch 'dev'
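
The dev branch bumps Xbyak to 7.25: the AVX10.2 BF16 mnemonics are renamed to
follow the updated specification (vaddnepbf16 -> vaddbf16, vsqrtnepbf16 ->
vsqrtbf16, vcmppbf16 -> vcmpbf16, ...), T_MAP5/T_MAP6 become plain map bits,
and new AMX instructions (tileloaddrs, tdpbf8ps, tmmultf32ps, ...) are added.

A minimal usage sketch of the renamed mnemonics (illustrative only, not part of
the diff; the generator class and register choices are made up for this note,
while the mnemonic signatures come from xbyak_mnemonic.h below; x64 build):

    // optional, added in this patch: define as 0 before including xbyak.h to
    // relax the mem/reg size-mismatch check introduced in 7.24
    // #define XBYAK_STRICT_CHECK_MEM_REG_SIZE 0
    #include <xbyak/xbyak.h>

    struct Bf16Example : Xbyak::CodeGenerator {
        Bf16Example() {
            vaddbf16(xmm1, xmm2, xmm3);                  // formerly vaddnepbf16
            vaddbf16(ymm1 | k1, ymm2, ptr_b[rax + 128]); // masking/broadcast unchanged
            vsqrtbf16(zmm5 | k3 | T_z, ptr[rax + 128]);  // formerly vsqrtnepbf16
            ret();
        }
    };
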
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a972c55..c5be079 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.10)
-project(xbyak LANGUAGES CXX VERSION 7.24.2)
+project(xbyak LANGUAGES CXX VERSION 7.25)
file(GLOB headers xbyak/*.h)
diff --git a/doc/changelog.md b/doc/changelog.md
index 7a78495..dd19d3a 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -1,5 +1,6 @@
# History
+* 2025/Jun/02 ver 7.25 rename BF16 operations according to new AVX10.2 specification
* 2025/Mar/12 ver 7.24.2 fix. vcvtneps2bf16 should support AVX-NE-CONVERT (revert 749aa31)
* 2025/Feb/26 ver 7.24.1 fix 3-op shift APX instructions with NDD format.
* 2025/Feb/17 ver 7.24 feat: add error check for invalid REX prefix with AH/BH/CH/DH registers. enhance size mismatch detection for mem-reg operations like 'add eax, byte[rax]'
diff --git a/gen/avx_type.hpp b/gen/avx_type.hpp
index 6fa11f4..38d3a10 100644
--- a/gen/avx_type.hpp
+++ b/gen/avx_type.hpp
@@ -21,22 +21,11 @@
if (type & T_66) str += "|T_66";
if (type & T_F3) str += "|T_F3";
if (type & T_F2) str += "|T_F2";
- if (type & T_0F) {
- if (type & T_FP16) {
- str += "|T_MAP5";
- } else {
- str += "|T_0F";
- }
- }
- if (type & T_0F38) {
- if (type & T_FP16) {
- str += "|T_MAP6";
- } else {
- str += "|T_0F38";
- }
- }
+ if (type & T_MAP5) str += "|T_MAP5";
+ if (type & T_0F) str += "|T_0F";
+ if (type & T_MAP6) str += "|T_MAP6";
+ if (type & T_0F38) str += "|T_0F38";
if (type & T_0F3A) str += "|T_0F3A";
- if (type & T_L0) str += "|T_L0";
if (type & T_L1) str += "|T_L1";
if (type & T_W0) str += "|T_W0";
if (type & T_W1) str += "|T_W1";
diff --git a/gen/avx_type_def.h b/gen/avx_type_def.h
index 989920b..699cabc 100644
--- a/gen/avx_type_def.h
+++ b/gen/avx_type_def.h
@@ -17,7 +17,7 @@
static const uint64_t T_0F = 1ull << 8;
static const uint64_t T_0F38 = 1ull << 9;
static const uint64_t T_0F3A = 1ull << 10;
- static const uint64_t T_L0 = 1ull << 11;
+ static const uint64_t T_MAP5 = 1ull << 11;
static const uint64_t T_L1 = 1ull << 12;
static const uint64_t T_W0 = 1ull << 13;
static const uint64_t T_W1 = 1ull << 14;
@@ -38,9 +38,7 @@
static const uint64_t T_M_K = 1ull << 28; // mem{k}
static const uint64_t T_VSIB = 1ull << 29;
static const uint64_t T_MEM_EVEX = 1ull << 30; // use evex if mem
- static const uint64_t T_FP16 = 1ull << 31; // avx512-fp16
- static const uint64_t T_MAP5 = T_FP16 | T_0F;
- static const uint64_t T_MAP6 = T_FP16 | T_0F38;
+ static const uint64_t T_MAP6 = 1ull << 31;
static const uint64_t T_NF = 1ull << 32; // T_nf
static const uint64_t T_CODE1_IF1 = 1ull << 33; // code|=1 if !r.isBit(8)
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index a8b1bb7..7fa82ef 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -197,7 +197,7 @@
{ 0x7D, "vcvtuw2ph", T_F2 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z },
{ 0x7D, "vcvtw2ph", T_F3 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z },
- { 0x51, "vsqrtnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16 },
+ { 0x51, "vsqrtbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16 },
{ 0x2F, "vcomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
{ 0x2E, "vucomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
@@ -211,10 +211,10 @@
{ 0x2E, "vucomxss", T_MUST_EVEX | T_F3 | T_0F | T_EW0 | T_SAE_X | T_N4 },
// 13.1
- { 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
- { 0x6B, "vcvtnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
- { 0x68, "vcvttnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
- { 0x6A, "vcvttnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
+ { 0x69, "vcvtbf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
+ { 0x6B, "vcvtbf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
+ { 0x68, "vcvttbf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
+ { 0x6A, "vcvttbf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
// 13.3
{ 0x6D, "vcvttpd2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z },
// 13.5
@@ -439,38 +439,38 @@
{ 0x13, "vcvtsh2ss", T_MAP6 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, false },
{ 0x1D, "vcvtss2sh", T_MAP5 | T_MUST_EVEX | T_EW0 | T_ER_X | T_N4, false },
- { 0x58, "vaddnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
- { 0x5E, "vdivnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
- { 0x5F, "vmaxpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
- { 0x5D, "vminpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
- { 0x59, "vmulnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
- { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16,false },
- { 0x5C, "vsubnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
+ { 0x58, "vaddbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
+ { 0x5E, "vdivbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
+ { 0x5F, "vmaxbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
+ { 0x5D, "vminbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
+ { 0x59, "vmulbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
+ { 0x2C, "vscalefbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16,false },
+ { 0x5C, "vsubbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false },
- { 0x98, "vfmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0xA8, "vfmadd213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0xB8, "vfmadd231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0x98, "vfmadd132bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0xA8, "vfmadd213bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0xB8, "vfmadd231bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0x9C, "vfnmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0xAC, "vfnmadd213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0xBC, "vfnmadd231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0x9C, "vfnmadd132bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0xAC, "vfnmadd213bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0xBC, "vfnmadd231bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0x9A, "vfmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0xAA, "vfmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0xBA, "vfmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0x9A, "vfmsub132bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0xAA, "vfmsub213bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0xBA, "vfmsub231bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0x9E, "vfnmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0xAE, "vfnmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0xBE, "vfnmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0x9E, "vfnmsub132bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0xAE, "vfnmsub213bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0xBE, "vfnmsub231bf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
{ 0x67, "vcvt2ps2phx", T_MUST_EVEX | T_66 | T_0F38 | T_EW0 | T_YMM | T_B32 | T_ER_Y | T_ER_Z, false },
- { 0x74, "vcvtne2ph2bf8", T_MUST_EVEX | T_F2 | T_0F38 | T_EW0 | T_YMM | T_B16 | T_N1, false },
- { 0x74, "vcvtne2ph2bf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
- { 0x18, "vcvtne2ph2hf8", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
- { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
+ { 0x74, "vcvt2ph2bf8", T_MUST_EVEX | T_F2 | T_0F38 | T_EW0 | T_YMM | T_B16 | T_N1, false },
+ { 0x74, "vcvt2ph2bf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
+ { 0x18, "vcvt2ph2hf8", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
+ { 0x1B, "vcvt2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
- { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false },
- { 0x52, "vminmaxnepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
+ { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, false },
+ { 0x52, "vminmaxbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
{ 0x52, "vminmaxpd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_YMM | T_B64 | T_SAE_Y | T_SAE_Z, true },
{ 0x52, "vminmaxph", T_MUST_EVEX | T_0F3A | T_EW0 | T_YMM | T_B16 | T_SAE_Y | T_SAE_Z, true },
{ 0x52, "vminmaxps", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM | T_B32 | T_SAE_Y | T_SAE_Z, true },
@@ -891,14 +891,14 @@
{ 0x62, "vpexpandb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_N1, false },
{ 0x62, "vpexpandw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_N2, false },
- { 0x2F, "vcomsbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_N2, false },
- { 0x42, "vgetexppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0x26, "vgetmantpbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
- { 0x4C, "vrcppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0x56, "vreducenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
- { 0x08, "vrndscalenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
- { 0x4E, "vrsqrtpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
- { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0x2F, "vcomisbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_N2, false },
+ { 0x42, "vgetexpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0x26, "vgetmantbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
+ { 0x4C, "vrcpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0x56, "vreducebf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
+ { 0x08, "vrndscalebf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
+ { 0x4E, "vrsqrtbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
+ { 0x2C, "vscalefbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@@ -1089,8 +1089,8 @@
void putAVX10_2()
{
- puts("void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }");
- puts("void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }");
+ puts("void vcmpbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }");
+ puts("void vfpclassbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }");
const struct Tbl {
uint8_t code;
@@ -1110,10 +1110,10 @@
puts("void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); }");
const Tbl tbl2[] = {
- { 0x74, "vcvtneph2bf8", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_B16 },
- { 0x74, "vcvtneph2bf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 },
- { 0x18, "vcvtneph2hf8", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 },
- { 0x1B, "vcvtneph2hf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 },
+ { 0x74, "vcvtph2bf8", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_B16 },
+ { 0x74, "vcvtph2bf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 },
+ { 0x18, "vcvtph2hf8", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 },
+ { 0x1B, "vcvtph2hf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl2); i++) {
const Tbl *p = &tbl2[i];
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 32ecf3e..ec04296 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -2064,6 +2064,19 @@
puts("void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }");
}
+void putAMX_rev54()
+{
+ puts("void tileloaddrs(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2|T_0F38|T_W0, 0x4A); }");
+ puts("void tileloaddrst1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66|T_0F38|T_W0, 0x4A); }");
+
+ puts("void tdpbf8ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_MAP5|T_W0, 0xFD); }");
+ puts("void tdpbhf8ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2|T_MAP5|T_W0, 0xFD); }");
+ puts("void tdphbf8ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3|T_MAP5|T_W0, 0xFD); }");
+ puts("void tdphf8ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66|T_MAP5|T_W0, 0xFD); }");
+
+ puts("void tmmultf32ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x48); }");
+}
+
void putFixed()
{
puts("#ifdef XBYAK64");
@@ -2071,6 +2084,7 @@
putAMX_TILE();
putAMX_INT8();
putAMX_BF16();
+ putAMX_rev54();
puts("#else");
put32();
puts("#endif");
diff --git a/meson.build b/meson.build
index 002dcd5..31a069c 100644
--- a/meson.build
+++ b/meson.build
@@ -5,7 +5,7 @@
project(
'xbyak',
'cpp',
- version: '7.24.2',
+ version: '7.25',
license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release'
)
diff --git a/readme.md b/readme.md
index 56f2d77..955edb3 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
-# Xbyak 7.24.2 [![Badge Build]][Build Status]
+# Xbyak 7.25 [![Badge Build]][Build Status]
*A JIT assembler for x86/x64 architectures supporting advanced instruction sets up to AVX10.2*
@@ -60,6 +60,14 @@
Almost C++03 or later compilers for x86/x64 such as Visual Studio, g++, clang++, Intel C++ compiler and g++ on mingw/cygwin.
+
+### References
+- [Intel 64 and IA-32 Architectures Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html)
+- [Intel Advanced Performance Extensions (Intel APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/836198/intel-advanced-performance-extensions-intel-apx-architecture-specification.html)
+- [Intel Advanced Vector Extensions 10.2 (Intel AVX10.2) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/855340/intel-advanced-vector-extensions-10-2-intel-avx10-2-architecture-specification.html)
+- [Intel Architecture Instruction Set Extensions Programming Reference](https://www.intel.com/content/www/us/en/content-details/851355/intel-architecture-instruction-set-extensions-programming-reference.html)
+- [Intel Software Development Emulator](https://www.intel.com/content/www/us/en/download/684897/intel-software-development-emulator.html)
+
## License
[BSD-3-Clause License](http://opensource.org/licenses/BSD-3-Clause)
diff --git a/readme.txt b/readme.txt
index 7575e7a..a4b9c2d 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
- Xbyak 7.24.2 -- an x86(IA-32), x64(AMD64, x86-64) JIT assembler for C++
+ Xbyak 7.25 -- an x86(IA-32), x64(AMD64, x86-64) JIT assembler for C++
-----------------------------------------------------------------------------
◎Overview
@@ -404,6 +404,7 @@
-----------------------------------------------------------------------------
◎History
+2025/06/02 ver 7.25 renamed BF16 instructions according to the new AVX10.2 specification
2025/03/12 ver 7.24.2 vcvtneps2bf16 should support AVX-NE-CONVERT (revert 749aa31)
2025/02/26 ver 7.24.1 fixed a bug in 3-operand shift instructions with the NDD format
2025/02/17 ver 7.24 made it an error to combine ah etc. with a REX prefix or to use mismatched sizes such as add eax, byte[rax]
diff --git a/sample/cpuid/arl.txt b/sample/cpuid/arl.txt
index db718fe..11d3227 100644
--- a/sample/cpuid/arl.txt
+++ b/sample/cpuid/arl.txt
@@ -1,2 +1,2 @@
vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/cpuid.sh b/sample/cpuid/cpuid.sh
index ba3bd64..62e4444 100755
--- a/sample/cpuid/cpuid.sh
+++ b/sample/cpuid/cpuid.sh
@@ -11,7 +11,7 @@
make -C ../ test_util64
-cpus=(p4p mrm pnr nhm wsm snb ivb hsw bdw slt slm glm glp tnt skl cnl icl skx clx cpx icx tgl adl mtl rpl spr emr gnr gnr256 srf arl lnl)
+cpus=(p4p mrm pnr nhm wsm snb ivb hsw bdw slt slm glm glp tnt skl cnl icl skx clx cpx icx tgl adl mtl rpl spr emr gnr gnr256 dmr srf arl lnl ptl cwf)
for cpu in ${cpus[@]} ; do
echo $cpu
diff --git a/sample/cpuid/cpx.txt b/sample/cpuid/cpx.txt
index c315b4f..707159d 100644
--- a/sample/cpuid/cpx.txt
+++ b/sample/cpuid/cpx.txt
@@ -1,2 +1,2 @@
vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni avx512_bf16 clflushopt clwb
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap f16c movbe avx512f avx512dq avx512cd avx512bw avx512vl avx512_vnni avx512_bf16 clflushopt clwb amx_fp8 amx_transpose amx_tf32 amx_avx512
diff --git a/sample/cpuid/cwf.txt b/sample/cpuid/cwf.txt
new file mode 100644
index 0000000..95171d9
--- /dev/null
+++ b/sample/cpuid/cwf.txt
@@ -0,0 +1,2 @@
+vendor intel
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd prefetchiti sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/dmr.txt b/sample/cpuid/dmr.txt
new file mode 100644
index 0000000..680ea30
--- /dev/null
+++ b/sample/cpuid/dmr.txt
@@ -0,0 +1,2 @@
+vendor intel
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize amx_fp16 avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd prefetchiti sha512 sm3 sm4 avx_vnni_int16 apx_f avx10 amx_fp8 amx_transpose amx_tf32 amx_avx512 amx_movrs
diff --git a/sample/cpuid/ptl.txt b/sample/cpuid/ptl.txt
new file mode 100644
index 0000000..11d3227
--- /dev/null
+++ b/sample/cpuid/ptl.txt
@@ -0,0 +1,2 @@
+vendor intel
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize avx_vnni_int8 avx_ne_convert avx_ifma cmpccxadd sha512 sm3 sm4 avx_vnni_int16 aeskle wide_kl keylocker keylocker_wide
diff --git a/sample/cpuid/spr.txt b/sample/cpuid/spr.txt
index c53700d..b4a50e9 100644
--- a/sample/cpuid/spr.txt
+++ b/sample/cpuid/spr.txt
@@ -1,2 +1,2 @@
vendor intel
- mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize tsxldtrk
+ mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp xsave(xgetvb) osxsave pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx512f avx512dq avx512_ifma avx512cd avx512bw avx512vl avx512_vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq avx512_bf16 amx(tile) amx(int8) amx(bf16) avx_vnni avx512_fp16 waitpkg clflushopt cldemote clwb movdiri movdir64b uintr serialize
diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index 870e060..a083dd1 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -112,6 +112,11 @@
{ Cpu::tKEYLOCKER, "keylocker" },
{ Cpu::tKEYLOCKER_WIDE, "keylocker_wide" },
{ Cpu::tTSXLDTRK, "tsxldtrk" },
+ { Cpu::tAMX_FP8, "amx_fp8" },
+ { Cpu::tAMX_TRANSPOSE, "amx_transpose" },
+ { Cpu::tAMX_TF32, "amx_tf32" },
+ { Cpu::tAMX_AVX512, "amx_avx512" },
+ { Cpu::tAMX_MOVRS, "amx_movrs" },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/test/Makefile b/test/Makefile
index 769eb8a..5a0de2b 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -67,7 +67,7 @@
sf_test: sf_test.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@ -DXBYAK64
-TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt apx.txt
+TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt apx.txt amx.txt
xed_test:
@set -e; \
for target in $(addprefix avx10/, $(TEST_FILES)); do \
diff --git a/test/avx10/amx.txt b/test/avx10/amx.txt
new file mode 100644
index 0000000..8752c7e
--- /dev/null
+++ b/test/avx10/amx.txt
@@ -0,0 +1,9 @@
+tileloaddrs(tmm3, ptr[rdi + rdx * 2 + 8]);
+tileloaddrst1(tmm4, ptr[r8 + r9 + 32]);
+
+tdpbf8ps(tmm1, tmm2, tmm3);
+tdpbhf8ps(tmm1, tmm2, tmm3);
+tdphbf8ps(tmm1, tmm2, tmm3);
+tdphf8ps(tmm1, tmm2, tmm3);
+
+tmmultf32ps(tmm1, tmm2, tmm3);
diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt
index c544e02..0f9ea8c 100644
--- a/test/avx10/bf16.txt
+++ b/test/avx10/bf16.txt
@@ -1,210 +1,210 @@
-vaddnepbf16(xm1, xm2, xm3);
-vaddnepbf16(ym1|k1, ym2, ptr[rax+128]);
-vaddnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vaddnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vaddbf16(xm1, xm2, xm3);
+vaddbf16(ym1|k1, ym2, ptr[rax+128]);
+vaddbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vaddbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vdivnepbf16(xm1, xm2, xm3);
-vdivnepbf16(ym1|k1, ym2, ptr[rax+128]);
-vdivnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vdivnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vdivbf16(xm1, xm2, xm3);
+vdivbf16(ym1|k1, ym2, ptr[rax+128]);
+vdivbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vdivbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vmaxpbf16(xm1, xm2, xm3);
-vmaxpbf16(ym1|k1, ym2, ptr[rax+128]);
-vmaxpbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vmaxpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vmaxbf16(xm1, xm2, xm3);
+vmaxbf16(ym1|k1, ym2, ptr[rax+128]);
+vmaxbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vmaxbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vminpbf16(xm1, xm2, xm3);
-vminpbf16(ym1|k1, ym2, ptr[rax+128]);
-vminpbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vminpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vminbf16(xm1, xm2, xm3);
+vminbf16(ym1|k1, ym2, ptr[rax+128]);
+vminbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vminbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vmulnepbf16(xm1, xm2, xm3);
-vmulnepbf16(ym1|k1, ym2, ptr[rax+128]);
-vmulnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vmulnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vmulbf16(xm1, xm2, xm3);
+vmulbf16(ym1|k1, ym2, ptr[rax+128]);
+vmulbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vmulbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vscalefpbf16(xm1, xm2, xm3);
-vscalefpbf16(ym1|k1, ym2, ptr[rax+128]);
-vscalefpbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vscalefpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vscalefbf16(xm1, xm2, xm3);
+vscalefbf16(ym1|k1, ym2, ptr[rax+128]);
+vscalefbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vscalefbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vsubnepbf16(xm1, xm2, xm3);
-vsubnepbf16(ym1|k1, ym2, ptr[rax+128]);
-vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vsubbf16(xm1, xm2, xm3);
+vsubbf16(ym1|k1, ym2, ptr[rax+128]);
+vsubbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vsubbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// madd
-vfmadd132nepbf16(xm1, xm2, xm3);
-vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfmadd132bf16(xm1, xm2, xm3);
+vfmadd132bf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vfmadd213nepbf16(xm1, xm2, xm3);
-vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfmadd213bf16(xm1, xm2, xm3);
+vfmadd213bf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vfmadd231nepbf16(xm1, xm2, xm3);
-vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfmadd231bf16(xm1, xm2, xm3);
+vfmadd231bf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// nmadd
-vfnmadd132nepbf16(xm1, xm2, xm3);
-vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfnmadd132bf16(xm1, xm2, xm3);
+vfnmadd132bf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vfnmadd213nepbf16(xm1, xm2, xm3);
-vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfnmadd213bf16(xm1, xm2, xm3);
+vfnmadd213bf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vfnmadd231nepbf16(xm1, xm2, xm3);
-vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfnmadd231bf16(xm1, xm2, xm3);
+vfnmadd231bf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// msub
-vfmsub132nepbf16(xm1, xm2, xm3);
-vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfmsub132bf16(xm1, xm2, xm3);
+vfmsub132bf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vfmsub213nepbf16(xm1, xm2, xm3);
-vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfmsub213bf16(xm1, xm2, xm3);
+vfmsub213bf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vfmsub231nepbf16(xm1, xm2, xm3);
-vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfmsub231bf16(xm1, xm2, xm3);
+vfmsub231bf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// nmsub
-vfnmsub132nepbf16(xm1, xm2, xm3);
-vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfnmsub132bf16(xm1, xm2, xm3);
+vfnmsub132bf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vfnmsub213nepbf16(xm1, xm2, xm3);
-vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfnmsub213bf16(xm1, xm2, xm3);
+vfnmsub213bf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vfnmsub231nepbf16(xm1, xm2, xm3);
-vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
-vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
-vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+vfnmsub231bf16(xm1, xm2, xm3);
+vfnmsub231bf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
-vcmppbf16(k1, xm5, xm4, 5);
-vcmppbf16(k2, ym5, ym4, 6);
-vcmppbf16(k3, ym15, ptr_b[rax+128], 7);
-vcmppbf16(k4, zm30, zm20, 8);
-vcmppbf16(k5, zm1, ptr[rax+128], 9);
-vcmppbf16(k6, zm10, ptr_b[rax+128], 10);
+vcmpbf16(k1, xm5, xm4, 5);
+vcmpbf16(k2, ym5, ym4, 6);
+vcmpbf16(k3, ym15, ptr_b[rax+128], 7);
+vcmpbf16(k4, zm30, zm20, 8);
+vcmpbf16(k5, zm1, ptr[rax+128], 9);
+vcmpbf16(k6, zm10, ptr_b[rax+128], 10);
-vfpclasspbf16(k1, xm4, 5);
-vfpclasspbf16(k2|k5, ym4, 6);
-vfpclasspbf16(k3|k5, zm20, 7);
-vfpclasspbf16(k3|k5, xword[rax+128], 8);
-vfpclasspbf16(k3, xword_b[rax+128], 9);
-vfpclasspbf16(k5|k5, yword[rax+128], 10);
-vfpclasspbf16(k6|k5, yword_b[rax+128], 11);
-vfpclasspbf16(k7|k5, zword[rax+128], 12);
-vfpclasspbf16(k7|k5, zword_b[rax+128], 13);
+vfpclassbf16(k1, xm4, 5);
+vfpclassbf16(k2|k5, ym4, 6);
+vfpclassbf16(k3|k5, zm20, 7);
+vfpclassbf16(k3|k5, xword[rax+128], 8);
+vfpclassbf16(k3, xword_b[rax+128], 9);
+vfpclassbf16(k5|k5, yword[rax+128], 10);
+vfpclassbf16(k6|k5, yword_b[rax+128], 11);
+vfpclassbf16(k7|k5, zword[rax+128], 12);
+vfpclassbf16(k7|k5, zword_b[rax+128], 13);
-vcomsbf16(xm2, xm3);
-vcomsbf16(xm2, ptr[rax+128]);
+vcomisbf16(xm2, xm3);
+vcomisbf16(xm2, ptr[rax+128]);
-vgetexppbf16(xm1|k3, xmm2);
-vgetexppbf16(xm1|k3, ptr[rax+128]);
-vgetexppbf16(xm1|k3, ptr_b[rax+128]);
+vgetexpbf16(xm1|k3, xmm2);
+vgetexpbf16(xm1|k3, ptr[rax+128]);
+vgetexpbf16(xm1|k3, ptr_b[rax+128]);
-vgetexppbf16(ym1|k3, ymm2);
-vgetexppbf16(ym1|k3, ptr[rax+128]);
-vgetexppbf16(ym1|k3, ptr_b[rax+128]);
+vgetexpbf16(ym1|k3, ymm2);
+vgetexpbf16(ym1|k3, ptr[rax+128]);
+vgetexpbf16(ym1|k3, ptr_b[rax+128]);
-vgetexppbf16(zm1|k3, zmm2);
-vgetexppbf16(zm1|k3, ptr[rax+128]);
-vgetexppbf16(zm1|k3, ptr_b[rax+128]);
+vgetexpbf16(zm1|k3, zmm2);
+vgetexpbf16(zm1|k3, ptr[rax+128]);
+vgetexpbf16(zm1|k3, ptr_b[rax+128]);
-vgetmantpbf16(xm1|k3, xmm2, 3);
-vgetmantpbf16(xm1|k3, ptr[rax+128], 5);
-vgetmantpbf16(xm1|k3, ptr_b[rax+128], 9);
+vgetmantbf16(xm1|k3, xmm2, 3);
+vgetmantbf16(xm1|k3, ptr[rax+128], 5);
+vgetmantbf16(xm1|k3, ptr_b[rax+128], 9);
-vgetmantpbf16(ym1|k3, ymm2, 3);
-vgetmantpbf16(ym1|k3, ptr[rax+128], 5);
-vgetmantpbf16(ym1|k3, ptr_b[rax+128], 9);
+vgetmantbf16(ym1|k3, ymm2, 3);
+vgetmantbf16(ym1|k3, ptr[rax+128], 5);
+vgetmantbf16(ym1|k3, ptr_b[rax+128], 9);
-vgetmantpbf16(zm1|k3, zmm2, 3);
-vgetmantpbf16(zm1|k3, ptr[rax+128], 5);
-vgetmantpbf16(zm1|k3, ptr_b[rax+128], 9);
+vgetmantbf16(zm1|k3, zmm2, 3);
+vgetmantbf16(zm1|k3, ptr[rax+128], 5);
+vgetmantbf16(zm1|k3, ptr_b[rax+128], 9);
-vrcppbf16(xm1|k5, xm2);
-vrcppbf16(xm1|k5, ptr[rcx+128]);
-vrcppbf16(xm1|k5, ptr_b[rcx+128]);
+vrcpbf16(xm1|k5, xm2);
+vrcpbf16(xm1|k5, ptr[rcx+128]);
+vrcpbf16(xm1|k5, ptr_b[rcx+128]);
-vrcppbf16(ym1|k5, ym2);
-vrcppbf16(ym1|k5, ptr[rcx+128]);
-vrcppbf16(ym1|k5, ptr_b[rcx+128]);
+vrcpbf16(ym1|k5, ym2);
+vrcpbf16(ym1|k5, ptr[rcx+128]);
+vrcpbf16(ym1|k5, ptr_b[rcx+128]);
-vrcppbf16(zm1|k5, zm2);
-vrcppbf16(zm1|k5, ptr[rcx+128]);
-vrcppbf16(zm1|k5, ptr_b[rcx+128]);
+vrcpbf16(zm1|k5, zm2);
+vrcpbf16(zm1|k5, ptr[rcx+128]);
+vrcpbf16(zm1|k5, ptr_b[rcx+128]);
-vreducenepbf16(xm1|k4, xm2, 1);
-vreducenepbf16(xm1|k4, ptr[rax+128], 1);
-vreducenepbf16(xm1|k4, ptr_b[rax+128], 1);
+vreducebf16(xm1|k4, xm2, 1);
+vreducebf16(xm1|k4, ptr[rax+128], 1);
+vreducebf16(xm1|k4, ptr_b[rax+128], 1);
-vreducenepbf16(ym1|k4, ym2, 1);
-vreducenepbf16(ym1|k4, ptr[rax+128], 1);
-vreducenepbf16(ym1|k4, ptr_b[rax+128], 1);
+vreducebf16(ym1|k4, ym2, 1);
+vreducebf16(ym1|k4, ptr[rax+128], 1);
+vreducebf16(ym1|k4, ptr_b[rax+128], 1);
-vreducenepbf16(zm1|k4, zm2, 1);
-vreducenepbf16(zm1|k4, ptr[rax+128], 1);
-vreducenepbf16(zm1|k4, ptr_b[rax+128], 1);
+vreducebf16(zm1|k4, zm2, 1);
+vreducebf16(zm1|k4, ptr[rax+128], 1);
+vreducebf16(zm1|k4, ptr_b[rax+128], 1);
-vrndscalenepbf16(xm1|k4, xm2, 1);
-vrndscalenepbf16(xm1|k4, ptr[rax+128], 1);
-vrndscalenepbf16(xm1|k4, ptr_b[rax+128], 1);
+vrndscalebf16(xm1|k4, xm2, 1);
+vrndscalebf16(xm1|k4, ptr[rax+128], 1);
+vrndscalebf16(xm1|k4, ptr_b[rax+128], 1);
-vrndscalenepbf16(ym1|k4, ym2, 1);
-vrndscalenepbf16(ym1|k4, ptr[rax+128], 1);
-vrndscalenepbf16(ym1|k4, ptr_b[rax+128], 1);
+vrndscalebf16(ym1|k4, ym2, 1);
+vrndscalebf16(ym1|k4, ptr[rax+128], 1);
+vrndscalebf16(ym1|k4, ptr_b[rax+128], 1);
-vrndscalenepbf16(zm1|k4, zm2, 1);
-vrndscalenepbf16(zm1|k4, ptr[rax+128], 1);
-vrndscalenepbf16(zm1|k4, ptr_b[rax+128], 1);
+vrndscalebf16(zm1|k4, zm2, 1);
+vrndscalebf16(zm1|k4, ptr[rax+128], 1);
+vrndscalebf16(zm1|k4, ptr_b[rax+128], 1);
-vrsqrtpbf16(xm1|k5, xm2);
-vrsqrtpbf16(xm1|k5, ptr[rcx+128]);
-vrsqrtpbf16(xm1|k5, ptr_b[rcx+128]);
+vrsqrtbf16(xm1|k5, xm2);
+vrsqrtbf16(xm1|k5, ptr[rcx+128]);
+vrsqrtbf16(xm1|k5, ptr_b[rcx+128]);
-vrsqrtpbf16(ym1|k5, ym2);
-vrsqrtpbf16(ym1|k5, ptr[rcx+128]);
-vrsqrtpbf16(ym1|k5, ptr_b[rcx+128]);
+vrsqrtbf16(ym1|k5, ym2);
+vrsqrtbf16(ym1|k5, ptr[rcx+128]);
+vrsqrtbf16(ym1|k5, ptr_b[rcx+128]);
-vrsqrtpbf16(zm1|k5, zm2);
-vrsqrtpbf16(zm1|k5, ptr[rcx+128]);
-vrsqrtpbf16(zm1|k5, ptr_b[rcx+128]);
+vrsqrtbf16(zm1|k5, zm2);
+vrsqrtbf16(zm1|k5, ptr[rcx+128]);
+vrsqrtbf16(zm1|k5, ptr_b[rcx+128]);
-vscalefpbf16(xm1|k5, xm5, xm2);
-vscalefpbf16(xm1|k5, xm5, ptr[rcx+128]);
-vscalefpbf16(xm1|k5, xm5, ptr_b[rcx+128]);
+vscalefbf16(xm1|k5, xm5, xm2);
+vscalefbf16(xm1|k5, xm5, ptr[rcx+128]);
+vscalefbf16(xm1|k5, xm5, ptr_b[rcx+128]);
-vscalefpbf16(ym1|k5, ym9, ym2);
-vscalefpbf16(ym1|k5, ym9, ptr[rcx+128]);
-vscalefpbf16(ym1|k5, ym9, ptr_b[rcx+128]);
+vscalefbf16(ym1|k5, ym9, ym2);
+vscalefbf16(ym1|k5, ym9, ptr[rcx+128]);
+vscalefbf16(ym1|k5, ym9, ptr_b[rcx+128]);
-vscalefpbf16(zm1|k5, zm30, zm2);
-vscalefpbf16(zm1|k5, zm30, ptr[rcx+128]);
-vscalefpbf16(zm1|k5, zm30, ptr_b[rcx+128]);
+vscalefbf16(zm1|k5, zm30, zm2);
+vscalefbf16(zm1|k5, zm30, ptr[rcx+128]);
+vscalefbf16(zm1|k5, zm30, ptr_b[rcx+128]);
-vsqrtnepbf16(xm5|k3, xmm4);
-vsqrtnepbf16(xm5|k3, ptr[rax+128]);
-vsqrtnepbf16(xm5|k3, ptr_b[rax+128]);
+vsqrtbf16(xm5|k3, xmm4);
+vsqrtbf16(xm5|k3, ptr[rax+128]);
+vsqrtbf16(xm5|k3, ptr_b[rax+128]);
-vsqrtnepbf16(ym5|k3, ymm4);
-vsqrtnepbf16(ym5|k3, ptr[rax+128]);
-vsqrtnepbf16(ym5|k3, ptr_b[rax+128]);
+vsqrtbf16(ym5|k3, ymm4);
+vsqrtbf16(ym5|k3, ptr[rax+128]);
+vsqrtbf16(ym5|k3, ptr_b[rax+128]);
-vsqrtnepbf16(zm5|k3, zmm4);
-vsqrtnepbf16(zm5|k3, ptr[rax+128]);
-vsqrtnepbf16(zm5|k3, ptr_b[rax+128]);
+vsqrtbf16(zm5|k3, zmm4);
+vsqrtbf16(zm5|k3, ptr[rax+128]);
+vsqrtbf16(zm5|k3, ptr_b[rax+128]);
diff --git a/test/avx10/convert.txt b/test/avx10/convert.txt
index 435f5e5..1708a3c 100644
--- a/test/avx10/convert.txt
+++ b/test/avx10/convert.txt
@@ -72,108 +72,108 @@
vcvthf82ph(zm1|k5|T_z, ptr[rax+128]);
//
-vcvtne2ph2bf8(xm1|k4|T_z, xm2, xm3);
-vcvtne2ph2bf8(xm1|k4, xm2, ptr[rax+128]);
-vcvtne2ph2bf8(xm1|T_z, xm2, ptr_b[rax+128]);
+vcvt2ph2bf8(xm1|k4|T_z, xm2, xm3);
+vcvt2ph2bf8(xm1|k4, xm2, ptr[rax+128]);
+vcvt2ph2bf8(xm1|T_z, xm2, ptr_b[rax+128]);
-vcvtne2ph2bf8(ym1|k4|T_z, ym2, ym3);
-vcvtne2ph2bf8(ym1|k4, ym2, ptr[rax+128]);
-vcvtne2ph2bf8(ym1|T_z, ym2, ptr_b[rax+128]);
+vcvt2ph2bf8(ym1|k4|T_z, ym2, ym3);
+vcvt2ph2bf8(ym1|k4, ym2, ptr[rax+128]);
+vcvt2ph2bf8(ym1|T_z, ym2, ptr_b[rax+128]);
-vcvtne2ph2bf8(zm1|k4|T_z, zm2, zm3);
-vcvtne2ph2bf8(zm1|k4, zm2, ptr[rax+128]);
-vcvtne2ph2bf8(zm1|T_z, zm2, ptr_b[rax+128]);
+vcvt2ph2bf8(zm1|k4|T_z, zm2, zm3);
+vcvt2ph2bf8(zm1|k4, zm2, ptr[rax+128]);
+vcvt2ph2bf8(zm1|T_z, zm2, ptr_b[rax+128]);
//
-vcvtne2ph2bf8s(xm1|k4|T_z, xm2, xm3);
-vcvtne2ph2bf8s(xm1|k4, xm2, ptr[rax+128]);
-vcvtne2ph2bf8s(xm1|T_z, xm2, ptr_b[rax+128]);
+vcvt2ph2bf8s(xm1|k4|T_z, xm2, xm3);
+vcvt2ph2bf8s(xm1|k4, xm2, ptr[rax+128]);
+vcvt2ph2bf8s(xm1|T_z, xm2, ptr_b[rax+128]);
-vcvtne2ph2bf8s(ym1|k4|T_z, ym2, ym3);
-vcvtne2ph2bf8s(ym1|k4, ym2, ptr[rax+128]);
-vcvtne2ph2bf8s(ym1|T_z, ym2, ptr_b[rax+128]);
+vcvt2ph2bf8s(ym1|k4|T_z, ym2, ym3);
+vcvt2ph2bf8s(ym1|k4, ym2, ptr[rax+128]);
+vcvt2ph2bf8s(ym1|T_z, ym2, ptr_b[rax+128]);
-vcvtne2ph2bf8s(zm1|k4|T_z, zm2, zm3);
-vcvtne2ph2bf8s(zm1|k4, zm2, ptr[rax+128]);
-vcvtne2ph2bf8s(zm1|T_z, zm2, ptr_b[rax+128]);
+vcvt2ph2bf8s(zm1|k4|T_z, zm2, zm3);
+vcvt2ph2bf8s(zm1|k4, zm2, ptr[rax+128]);
+vcvt2ph2bf8s(zm1|T_z, zm2, ptr_b[rax+128]);
//
-vcvtne2ph2hf8(xm1|k4|T_z, xm2, xm3);
-vcvtne2ph2hf8(xm1|k4, xm2, ptr[rax+128]);
-vcvtne2ph2hf8(xm1|T_z, xm2, ptr_b[rax+128]);
+vcvt2ph2hf8(xm1|k4|T_z, xm2, xm3);
+vcvt2ph2hf8(xm1|k4, xm2, ptr[rax+128]);
+vcvt2ph2hf8(xm1|T_z, xm2, ptr_b[rax+128]);
-vcvtne2ph2hf8(ym1|k4|T_z, ym2, ym3);
-vcvtne2ph2hf8(ym1|k4, ym2, ptr[rax+128]);
-vcvtne2ph2hf8(ym1|T_z, ym2, ptr_b[rax+128]);
+vcvt2ph2hf8(ym1|k4|T_z, ym2, ym3);
+vcvt2ph2hf8(ym1|k4, ym2, ptr[rax+128]);
+vcvt2ph2hf8(ym1|T_z, ym2, ptr_b[rax+128]);
-vcvtne2ph2hf8(zm1|k4|T_z, zm2, zm3);
-vcvtne2ph2hf8(zm1|k4, zm2, ptr[rax+128]);
-vcvtne2ph2hf8(zm1|T_z, zm2, ptr_b[rax+128]);
+vcvt2ph2hf8(zm1|k4|T_z, zm2, zm3);
+vcvt2ph2hf8(zm1|k4, zm2, ptr[rax+128]);
+vcvt2ph2hf8(zm1|T_z, zm2, ptr_b[rax+128]);
//
-vcvtne2ph2hf8s(xm1|k4|T_z, xm2, xm3);
-vcvtne2ph2hf8s(xm1|k4, xm2, ptr[rax+128]);
-vcvtne2ph2hf8s(xm1|T_z, xm2, ptr_b[rax+128]);
+vcvt2ph2hf8s(xm1|k4|T_z, xm2, xm3);
+vcvt2ph2hf8s(xm1|k4, xm2, ptr[rax+128]);
+vcvt2ph2hf8s(xm1|T_z, xm2, ptr_b[rax+128]);
-vcvtne2ph2hf8s(ym1|k4|T_z, ym2, ym3);
-vcvtne2ph2hf8s(ym1|k4, ym2, ptr[rax+128]);
-vcvtne2ph2hf8s(ym1|T_z, ym2, ptr_b[rax+128]);
+vcvt2ph2hf8s(ym1|k4|T_z, ym2, ym3);
+vcvt2ph2hf8s(ym1|k4, ym2, ptr[rax+128]);
+vcvt2ph2hf8s(ym1|T_z, ym2, ptr_b[rax+128]);
-vcvtne2ph2hf8s(zm1|k4|T_z, zm2, zm3);
-vcvtne2ph2hf8s(zm1|k4, zm2, ptr[rax+128]);
-vcvtne2ph2hf8s(zm1|T_z, zm2, ptr_b[rax+128]);
+vcvt2ph2hf8s(zm1|k4|T_z, zm2, zm3);
+vcvt2ph2hf8s(zm1|k4, zm2, ptr[rax+128]);
+vcvt2ph2hf8s(zm1|T_z, zm2, ptr_b[rax+128]);
-// vcvtneph2bf8
-vcvtneph2bf8(xmm1|k2|T_z, xmm2);
-vcvtneph2bf8(xmm1|k2|T_z, xword [rax+128]);
-vcvtneph2bf8(xmm1|k2|T_z, xword_b[rax+128]);
+// vcvtph2bf8
+vcvtph2bf8(xmm1|k2|T_z, xmm2);
+vcvtph2bf8(xmm1|k2|T_z, xword [rax+128]);
+vcvtph2bf8(xmm1|k2|T_z, xword_b[rax+128]);
-vcvtneph2bf8(xmm1|k2|T_z, ymm2);
-vcvtneph2bf8(xmm1|k2|T_z, yword[rax+128]);
-vcvtneph2bf8(xmm1|k2|T_z, yword_b[rax+128]);
+vcvtph2bf8(xmm1|k2|T_z, ymm2);
+vcvtph2bf8(xmm1|k2|T_z, yword[rax+128]);
+vcvtph2bf8(xmm1|k2|T_z, yword_b[rax+128]);
-vcvtneph2bf8(ymm1|k2|T_z, zmm2);
-vcvtneph2bf8(ymm1|k2|T_z, zword[rax+128]);
-vcvtneph2bf8(ymm1|k2|T_z, zword_b[rax+128]);
+vcvtph2bf8(ymm1|k2|T_z, zmm2);
+vcvtph2bf8(ymm1|k2|T_z, zword[rax+128]);
+vcvtph2bf8(ymm1|k2|T_z, zword_b[rax+128]);
-// vcvtneph2bf8s
-vcvtneph2bf8s(xmm1|k2|T_z, xmm2);
-vcvtneph2bf8s(xmm1|k2|T_z, xword [rax+128]);
-vcvtneph2bf8s(xmm1|k2|T_z, xword_b[rax+128]);
+// vcvtph2bf8s
+vcvtph2bf8s(xmm1|k2|T_z, xmm2);
+vcvtph2bf8s(xmm1|k2|T_z, xword [rax+128]);
+vcvtph2bf8s(xmm1|k2|T_z, xword_b[rax+128]);
-vcvtneph2bf8s(xmm1|k2|T_z, ymm2);
-vcvtneph2bf8s(xmm1|k2|T_z, yword[rax+128]);
-vcvtneph2bf8s(xmm1|k2|T_z, yword_b[rax+128]);
+vcvtph2bf8s(xmm1|k2|T_z, ymm2);
+vcvtph2bf8s(xmm1|k2|T_z, yword[rax+128]);
+vcvtph2bf8s(xmm1|k2|T_z, yword_b[rax+128]);
-vcvtneph2bf8s(ymm1|k2|T_z, zmm2);
-vcvtneph2bf8s(ymm1|k2|T_z, zword[rax+128]);
-vcvtneph2bf8s(ymm1|k2|T_z, zword_b[rax+128]);
+vcvtph2bf8s(ymm1|k2|T_z, zmm2);
+vcvtph2bf8s(ymm1|k2|T_z, zword[rax+128]);
+vcvtph2bf8s(ymm1|k2|T_z, zword_b[rax+128]);
-// vcvtneph2hf8
-vcvtneph2hf8(xmm1|k2|T_z, xmm2);
-vcvtneph2hf8(xmm1|k2|T_z, xword [rax+128]);
-vcvtneph2hf8(xmm1|k2|T_z, xword_b[rax+128]);
+// vcvtph2hf8
+vcvtph2hf8(xmm1|k2|T_z, xmm2);
+vcvtph2hf8(xmm1|k2|T_z, xword [rax+128]);
+vcvtph2hf8(xmm1|k2|T_z, xword_b[rax+128]);
-vcvtneph2hf8(xmm1|k2|T_z, ymm2);
-vcvtneph2hf8(xmm1|k2|T_z, yword[rax+128]);
-vcvtneph2hf8(xmm1|k2|T_z, yword_b[rax+128]);
+vcvtph2hf8(xmm1|k2|T_z, ymm2);
+vcvtph2hf8(xmm1|k2|T_z, yword[rax+128]);
+vcvtph2hf8(xmm1|k2|T_z, yword_b[rax+128]);
-vcvtneph2hf8(ymm1|k2|T_z, zmm2);
-vcvtneph2hf8(ymm1|k2|T_z, zword[rax+128]);
-vcvtneph2hf8(ymm1|k2|T_z, zword_b[rax+128]);
+vcvtph2hf8(ymm1|k2|T_z, zmm2);
+vcvtph2hf8(ymm1|k2|T_z, zword[rax+128]);
+vcvtph2hf8(ymm1|k2|T_z, zword_b[rax+128]);
-// vcvtneph2hf8s
-vcvtneph2hf8s(xmm1|k2|T_z, xmm2);
-vcvtneph2hf8s(xmm1|k2|T_z, xword [rax+128]);
-vcvtneph2hf8s(xmm1|k2|T_z, xword_b[rax+128]);
+// vcvtph2hf8s
+vcvtph2hf8s(xmm1|k2|T_z, xmm2);
+vcvtph2hf8s(xmm1|k2|T_z, xword [rax+128]);
+vcvtph2hf8s(xmm1|k2|T_z, xword_b[rax+128]);
-vcvtneph2hf8s(xmm1|k2|T_z, ymm2);
-vcvtneph2hf8s(xmm1|k2|T_z, yword[rax+128]);
-vcvtneph2hf8s(xmm1|k2|T_z, yword_b[rax+128]);
+vcvtph2hf8s(xmm1|k2|T_z, ymm2);
+vcvtph2hf8s(xmm1|k2|T_z, yword[rax+128]);
+vcvtph2hf8s(xmm1|k2|T_z, yword_b[rax+128]);
-vcvtneph2hf8s(ymm1|k2|T_z, zmm2);
-vcvtneph2hf8s(ymm1|k2|T_z, zword[rax+128]);
-vcvtneph2hf8s(ymm1|k2|T_z, zword_b[rax+128]);
+vcvtph2hf8s(ymm1|k2|T_z, zmm2);
+vcvtph2hf8s(ymm1|k2|T_z, zword[rax+128]);
+vcvtph2hf8s(ymm1|k2|T_z, zword_b[rax+128]);
// AVX-NE-CONVERT
vbcstnebf162ps(xmm15, ptr[rax+128]);
diff --git a/test/avx10/minmax.txt b/test/avx10/minmax.txt
index 8b2c662..d6c17b4 100644
--- a/test/avx10/minmax.txt
+++ b/test/avx10/minmax.txt
@@ -1,14 +1,14 @@
-vminmaxnepbf16(xm1|k3|T_z, xm2, xm3, 5);
-vminmaxnepbf16(xm1|k3|T_z, xm2, ptr[rax+128], 5);
-vminmaxnepbf16(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
+vminmaxbf16(xm1|k3|T_z, xm2, xm3, 5);
+vminmaxbf16(xm1|k3|T_z, xm2, ptr[rax+128], 5);
+vminmaxbf16(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
-vminmaxnepbf16(ym1|k3|T_z, ym2, ym3, 5);
-vminmaxnepbf16(ym1|k3|T_z, ym2, ptr[rax+128], 5);
-vminmaxnepbf16(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
+vminmaxbf16(ym1|k3|T_z, ym2, ym3, 5);
+vminmaxbf16(ym1|k3|T_z, ym2, ptr[rax+128], 5);
+vminmaxbf16(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
-vminmaxnepbf16(zm1|k3|T_z, zm2, zm3, 5);
-vminmaxnepbf16(zm1|k3|T_z, zm2, ptr[rax+128], 5);
-vminmaxnepbf16(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
+vminmaxbf16(zm1|k3|T_z, zm2, zm3, 5);
+vminmaxbf16(zm1|k3|T_z, zm2, ptr[rax+128], 5);
+vminmaxbf16(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
//
vminmaxpd(xm1|k3|T_z, xm2, xm3, 5);
vminmaxpd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
diff --git a/test/avx10/saturation.txt b/test/avx10/saturation.txt
index f3ebf3d..448e7ba 100644
--- a/test/avx10/saturation.txt
+++ b/test/avx10/saturation.txt
@@ -1,51 +1,51 @@
//
-vcvtnebf162ibs(xm1, xm2);
-vcvtnebf162ibs(xm1, ptr[rax+128]);
-vcvtnebf162ibs(xm1, ptr_b[rax+128]);
+vcvtbf162ibs(xm1, xm2);
+vcvtbf162ibs(xm1, ptr[rax+128]);
+vcvtbf162ibs(xm1, ptr_b[rax+128]);
-vcvtnebf162ibs(ym1, ym2);
-vcvtnebf162ibs(ym1, ptr[rax+128]);
-vcvtnebf162ibs(ym1, ptr_b[rax+128]);
+vcvtbf162ibs(ym1, ym2);
+vcvtbf162ibs(ym1, ptr[rax+128]);
+vcvtbf162ibs(ym1, ptr_b[rax+128]);
-vcvtnebf162ibs(zm1, zm2);
-vcvtnebf162ibs(zm1, ptr[rax+128]);
-vcvtnebf162ibs(zm1, ptr_b[rax+128]);
+vcvtbf162ibs(zm1, zm2);
+vcvtbf162ibs(zm1, ptr[rax+128]);
+vcvtbf162ibs(zm1, ptr_b[rax+128]);
//
-vcvtnebf162iubs(xm1, xm2);
-vcvtnebf162iubs(xm1, ptr[rax+128]);
-vcvtnebf162iubs(xm1, ptr_b[rax+128]);
+vcvtbf162iubs(xm1, xm2);
+vcvtbf162iubs(xm1, ptr[rax+128]);
+vcvtbf162iubs(xm1, ptr_b[rax+128]);
-vcvtnebf162iubs(ym1, ym2);
-vcvtnebf162iubs(ym1, ptr[rax+128]);
-vcvtnebf162iubs(ym1, ptr_b[rax+128]);
+vcvtbf162iubs(ym1, ym2);
+vcvtbf162iubs(ym1, ptr[rax+128]);
+vcvtbf162iubs(ym1, ptr_b[rax+128]);
-vcvtnebf162iubs(zm1, zm2);
-vcvtnebf162iubs(zm1, ptr[rax+128]);
-vcvtnebf162iubs(zm1, ptr_b[rax+128]);
+vcvtbf162iubs(zm1, zm2);
+vcvtbf162iubs(zm1, ptr[rax+128]);
+vcvtbf162iubs(zm1, ptr_b[rax+128]);
//
-vcvttnebf162ibs(xm1, xm2);
-vcvttnebf162ibs(xm1, ptr[rax+128]);
-vcvttnebf162ibs(xm1, ptr_b[rax+128]);
+vcvttbf162ibs(xm1, xm2);
+vcvttbf162ibs(xm1, ptr[rax+128]);
+vcvttbf162ibs(xm1, ptr_b[rax+128]);
-vcvttnebf162ibs(ym1, ym2);
-vcvttnebf162ibs(ym1, ptr[rax+128]);
-vcvttnebf162ibs(ym1, ptr_b[rax+128]);
+vcvttbf162ibs(ym1, ym2);
+vcvttbf162ibs(ym1, ptr[rax+128]);
+vcvttbf162ibs(ym1, ptr_b[rax+128]);
-vcvttnebf162ibs(zm1, zm2);
-vcvttnebf162ibs(zm1, ptr[rax+128]);
-vcvttnebf162ibs(zm1, ptr_b[rax+128]);
+vcvttbf162ibs(zm1, zm2);
+vcvttbf162ibs(zm1, ptr[rax+128]);
+vcvttbf162ibs(zm1, ptr_b[rax+128]);
//
-vcvttnebf162iubs(xm1, xm2);
-vcvttnebf162iubs(xm1, ptr[rax+128]);
-vcvttnebf162iubs(xm1, ptr_b[rax+128]);
+vcvttbf162iubs(xm1, xm2);
+vcvttbf162iubs(xm1, ptr[rax+128]);
+vcvttbf162iubs(xm1, ptr_b[rax+128]);
-vcvttnebf162iubs(ym1, ym2);
-vcvttnebf162iubs(ym1, ptr[rax+128]);
-vcvttnebf162iubs(ym1, ptr_b[rax+128]);
+vcvttbf162iubs(ym1, ym2);
+vcvttbf162iubs(ym1, ptr[rax+128]);
+vcvttbf162iubs(ym1, ptr_b[rax+128]);
-vcvttnebf162iubs(zm1, zm2);
-vcvttnebf162iubs(zm1, ptr[rax+128]);
-vcvttnebf162iubs(zm1, ptr_b[rax+128]);
+vcvttbf162iubs(zm1, zm2);
+vcvttbf162iubs(zm1, ptr[rax+128]);
+vcvttbf162iubs(zm1, ptr_b[rax+128]);
//
vcvttpd2qqs(xm1, xm2);
vcvttpd2qqs(xm1, ptr[rax+128]);
diff --git a/test/jmp.vcproj b/test/jmp.vcproj
deleted file mode 100644
index 17118e0..0000000
--- a/test/jmp.vcproj
+++ /dev/null
@@ -1,195 +0,0 @@
-<?xml version="1.0" encoding="shift_jis"?>
-<VisualStudioProject
- ProjectType="Visual C++"
- Version="9.00"
- Name="jmp"
- ProjectGUID="{AC0B3317-E988-44F8-954A-BCBE4B3BB2BF}"
- RootNamespace="jmp"
- Keyword="Win32Proj"
- TargetFrameworkVersion="196613"
- >
- <Platforms>
- <Platform
- Name="Win32"
- />
- </Platforms>
- <ToolFiles>
- </ToolFiles>
- <Configurations>
- <Configuration
- Name="Debug|Win32"
- OutputDirectory="$(SolutionDir)$(ConfigurationName)"
- IntermediateDirectory="$(ConfigurationName)"
- ConfigurationType="1"
- CharacterSet="1"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories="$(SolutionDir)/../"
- PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
- MinimalRebuild="true"
- BasicRuntimeChecks="3"
- RuntimeLibrary="3"
- UsePrecompiledHeader="0"
- WarningLevel="3"
- DebugInformationFormat="4"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- LinkIncremental="2"
- GenerateDebugInformation="true"
- SubSystem="1"
- TargetMachine="1"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- <Configuration
- Name="Release|Win32"
- OutputDirectory="$(SolutionDir)$(ConfigurationName)"
- IntermediateDirectory="$(ConfigurationName)"
- ConfigurationType="1"
- CharacterSet="1"
- WholeProgramOptimization="1"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- EnableIntrinsicFunctions="true"
- AdditionalIncludeDirectories="$(SolutionDir)/../"
- PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
- RuntimeLibrary="2"
- EnableFunctionLevelLinking="true"
- UsePrecompiledHeader="0"
- WarningLevel="3"
- DebugInformationFormat="3"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- LinkIncremental="1"
- GenerateDebugInformation="true"
- SubSystem="1"
- OptimizeReferences="2"
- EnableCOMDATFolding="2"
- TargetMachine="1"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- </Configurations>
- <References>
- </References>
- <Files>
- <Filter
- Name="¥½¡¼¥¹ ¥Õ¥¡¥¤¥ë"
- Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
- UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
- >
- <File
- RelativePath=".\jmp.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="¥Ø¥Ã¥À¡¼ ¥Õ¥¡¥¤¥ë"
- Filter="h;hpp;hxx;hm;inl;inc;xsd"
- UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
- >
- </Filter>
- <Filter
- Name="¥ê¥½¡¼¥¹ ¥Õ¥¡¥¤¥ë"
- Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
- UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
- >
- </Filter>
- </Files>
- <Globals>
- </Globals>
-</VisualStudioProject>
diff --git a/test/test_address.sh b/test/test_address.sh
index 2588fcd..877c2c5 100755
--- a/test/test_address.sh
+++ b/test/test_address.sh
@@ -3,12 +3,12 @@
set -e
FILTER="grep -v warning"
+CXX=${CXX:=g++}
+CFLAGS_USER=${CFLAGS}
sub()
{
-
-CFLAGS="-Wall -I../ $OPT2"
-CXX=${CXX:=g++}
+CFLAGS="$CFLAGS_USER -Wall -I../ $OPT2"
echo "compile address.cpp"
$CXX $CFLAGS address.cpp -o address
diff --git a/test/test_avx.sh b/test/test_avx.sh
index e9e4634..09abad4 100755
--- a/test/test_avx.sh
+++ b/test/test_avx.sh
@@ -4,6 +4,7 @@
FILTER="grep -v warning"
CXX=${CXX:=g++}
+CFLAGS_USER=${CFLAGS}
case $1 in
Y)
@@ -34,7 +35,7 @@
;;
esac
-CFLAGS="-Wall -g -I../ $OPT2 -DUSE_AVX"
+CFLAGS="$CFLAGS_USER -Wall -g -I../ $OPT2 -DUSE_AVX"
echo "compile make_nm.cpp"
$CXX $CFLAGS make_nm.cpp -o make_nm
diff --git a/test/test_avx512.sh b/test/test_avx512.sh
index edb8ab3..4e9c456 100755
--- a/test/test_avx512.sh
+++ b/test/test_avx512.sh
@@ -4,6 +4,7 @@
FILTER="grep -v warning"
CXX=${CXX:=g++}
+CFLAGS_USER=${CFLAGS}
case $1 in
64)
@@ -21,7 +22,7 @@
;;
esac
-CFLAGS="-Wall -I../ $OPT2 -DUSE_AVX512"
+CFLAGS="$CFLAGS_USER -Wall -I../ $OPT2 -DUSE_AVX512"
echo "compile make_512.cpp"
$CXX $CFLAGS make_512.cpp -o make_512
diff --git a/test/test_by_xed.bat b/test/test_by_xed.bat
index bf6ee5e..19467ed 100644
--- a/test/test_by_xed.bat
+++ b/test/test_by_xed.bat
@@ -1,4 +1,5 @@
@echo off
+set XED=xed
set CFLAGS=-I ../ /EHsc /nologo
copy %1% tmp.cpp
cl %CFLAGS% test_by_xed.cpp && test_by_xed.exe
diff --git a/test/test_by_xed.sh b/test/test_by_xed.sh
index 905b8a0..8bac501 100755
--- a/test/test_by_xed.sh
+++ b/test/test_by_xed.sh
@@ -3,6 +3,7 @@
set -e
XED=${XED:=xed}
CXX=${CXX:=g++}
+CFLAGS_USER=${CFLAGS}
PYTHON=${PYTHON:=python3}
echo $XED
@@ -13,7 +14,7 @@
TARGET=$1
-CFLAGS="-Wall -Wextra -I ../"
+CFLAGS="$CFLAGS_USER -Wall -Wextra -I ../"
echo "test:" $TARGET
cp $TARGET tmp.cpp
diff --git a/test/test_by_xed_all.bat b/test/test_by_xed_all.bat
index bb57cb4..c7e29a2 100644
--- a/test/test_by_xed_all.bat
+++ b/test/test_by_xed_all.bat
@@ -1,4 +1,4 @@
-set TARGETS=old.txt new-ymm.txt bf16.txt misc.txt convert.txt minmax.txt saturation.txt
+set TARGETS=old.txt new-ymm.txt bf16.txt misc.txt convert.txt minmax.txt saturation.txt amx.txt apx.txt comp.txt
for %%f in (%TARGETS%) do (
echo %%f
call test_by_xed.bat avx10\%%f
diff --git a/test/test_nm.sh b/test/test_nm.sh
index 6e78b95..64fa698 100755
--- a/test/test_nm.sh
+++ b/test/test_nm.sh
@@ -4,6 +4,7 @@
FILTER=cat
CXX=${CXX:=g++}
+CFLAGS_USER=${CFLAGS}
case $1 in
Y)
@@ -47,7 +48,7 @@
;;
esac
-CFLAGS="-Wall -g -I../ $OPT2"
+CFLAGS="$CFLAGS_USER -Wall -g -I../ -I./ $OPT2"
echo "compile make_nm.cpp with $CFLAGS"
$CXX $CFLAGS make_nm.cpp -o make_nm
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 507c012..10ef35a 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -151,11 +151,17 @@
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
+// Define this macro as 0 to disable strict checking of memory operand and register size matching.
+// This macro may be removed in future versions.
+#ifndef XBYAK_STRICT_CHECK_MEM_REG_SIZE
+ #define XBYAK_STRICT_CHECK_MEM_REG_SIZE 1
+#endif
+
namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
- VERSION = 0x7242 /* 0xABCD = A.BC(.D) */
+ VERSION = 0x7250 /* 0xABCD = A.BC(.D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@@ -1842,7 +1848,7 @@
static const uint64_t T_0F = 1ull << 8;
static const uint64_t T_0F38 = 1ull << 9;
static const uint64_t T_0F3A = 1ull << 10;
- static const uint64_t T_L0 = 1ull << 11;
+ static const uint64_t T_MAP5 = 1ull << 11;
static const uint64_t T_L1 = 1ull << 12;
static const uint64_t T_W0 = 1ull << 13;
static const uint64_t T_W1 = 1ull << 14;
@@ -1863,9 +1869,7 @@
static const uint64_t T_M_K = 1ull << 28; // mem{k}
static const uint64_t T_VSIB = 1ull << 29;
static const uint64_t T_MEM_EVEX = 1ull << 30; // use evex if mem
- static const uint64_t T_FP16 = 1ull << 31; // avx512-fp16
- static const uint64_t T_MAP5 = T_FP16 | T_0F;
- static const uint64_t T_MAP6 = T_FP16 | T_0F38;
+ static const uint64_t T_MAP6 = 1ull << 31;
static const uint64_t T_NF = 1ull << 32; // T_nf
static const uint64_t T_CODE1_IF1 = 1ull << 33; // code|=1 if !r.isBit(8)
@@ -1878,11 +1882,16 @@
// T_66 = 1, T_F3 = 2, T_F2 = 3
static inline uint32_t getPP(uint64_t type) { return (type & T_66) ? 1 : (type & T_F3) ? 2 : (type & T_F2) ? 3 : 0; }
// @@@end of avx_type_def.h
- static inline uint32_t getMap(uint64_t type) { return (type & T_0F) ? 1 : (type & T_0F38) ? 2 : (type & T_0F3A) ? 3 : 0; }
+ static inline uint32_t getMap(uint64_t type)
+ {
+ if (type & T_MAP6) return 6;
+ if (type & T_MAP5) return 5;
+ return (type & T_0F) ? 1 : (type & T_0F38) ? 2 : (type & T_0F3A) ? 3 : 0;
+ }
void vex(const Reg& reg, const Reg& base, const Operand *v, uint64_t type, int code, bool x = false)
{
int w = (type & T_W1) ? 1 : 0;
- bool is256 = (type & T_L1) ? true : (type & T_L0) ? false : reg.isYMM();
+ bool is256 = (type & T_L1) ? true : reg.isYMM();
bool r = reg.isExtIdx();
bool b = base.isExtIdx();
int idx = v ? v->getIdx() : 0;
@@ -1923,7 +1932,6 @@
if (!(type & (T_EVEX | T_MUST_EVEX))) XBYAK_THROW_RET(ERR_EVEX_IS_INVALID, 0)
int w = (type & T_EW1) ? 1 : 0;
uint32_t mmm = getMap(type);
- if (type & T_FP16) mmm |= 4;
uint32_t pp = getPP(type);
int idx = v ? v->getIdx() : 0;
uint32_t vvvv = ~idx;
@@ -2097,7 +2105,9 @@
if (code2 == NONE) code2 = code;
if (type2 && opROO(Reg(), addr, r, type2, code2)) return;
if (addr.is64bitDisp()) XBYAK_THROW(ERR_CANT_USE_64BIT_DISP)
+#if XBYAK_STRICT_CHECK_MEM_REG_SIZE == 1
if (!(type & T_ALLOW_DIFF_SIZE) && r.getBit() <= BIT && addr.getBit() > 0 && addr.getBit() != r.getBit()) XBYAK_THROW(ERR_BAD_MEM_SIZE)
+#endif
bool rex2 = rex(addr, r, type);
writeCode(type, r, code, rex2);
opAddr(addr, r.getIdx());
@@ -2813,7 +2823,7 @@
const Operand *p1 = &k, *p2 = &op;
if (code == 0x93) { std::swap(p1, p2); }
if (opROO(Reg(), *p2, *p1, T_APX|type, code)) return;
- opVex(static_cast<const Reg&>(*p1), 0, *p2, T_L0|type, code);
+ opVex(static_cast<const Reg&>(*p1), 0, *p2, type, code);
}
void opEncodeKey(const Reg32& r1, const Reg32& r2, uint8_t code1, uint8_t code2)
{
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 9aac078..468d120 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "7.24.2"; }
+const char *getVersionString() const { return "7.25"; }
void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); }
void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); }
void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }
@@ -1936,6 +1936,13 @@
void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }
void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }
void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
+void tileloaddrs(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2|T_0F38|T_W0, 0x4A); }
+void tileloaddrst1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66|T_0F38|T_W0, 0x4A); }
+void tdpbf8ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_MAP5|T_W0, 0xFD); }
+void tdpbhf8ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2|T_MAP5|T_W0, 0xFD); }
+void tdphbf8ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3|T_MAP5|T_W0, 0xFD); }
+void tdphf8ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66|T_MAP5|T_W0, 0xFD); }
+void tmmultf32ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x48); }
#else
void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
void jcxz(const Label& label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
@@ -2029,7 +2036,7 @@
void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0x9B); }
void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); }
void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0xAB); }
-void vaddnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x58); }
+void vaddbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x58); }
void vaddph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x58); }
void vaddsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x58); }
void valignd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x03, imm); }
@@ -2046,6 +2053,7 @@
void vbroadcasti32x8(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N32, 0x5B); }
void vbroadcasti64x2(const Ymm& y, const Operand& op) { opAVX_X_XM_IMM(y, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N16, 0x5A); }
void vbroadcasti64x4(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N32, 0x5B); }
+void vcmpbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }
void vcmpeq_ospd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 16); }
void vcmpeq_osps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 16); }
void vcmpeq_ossd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 16); }
@@ -2158,7 +2166,6 @@
void vcmpordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 7); }
void vcmpordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 7); }
void vcmpordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 7); }
-void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }
void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0xC2, imm); }
void vcmpph(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0xC2, imm); }
void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0xC2, imm); }
@@ -2181,36 +2188,36 @@
void vcmpunordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 3); }
void vcmpunordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 3); }
void vcmpunordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 3); }
+void vcomisbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); }
void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); }
void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); }
void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); }
-void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); }
void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F2|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); }
void vcomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); }
void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F3|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); }
+void vcvt2ph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
+void vcvt2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
+void vcvt2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
+void vcvt2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
void vcvt2ps2phx(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x67); }
+void vcvtbf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x69); }
+void vcvtbf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6B); }
void vcvtbiasph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
void vcvtbiasph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
void vcvtbiasph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
void vcvtbiasph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x5B); }
void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); }
-void vcvtne2ph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
-void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
-void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
-void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); }
-void vcvtnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x69); }
-void vcvtnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6B); }
-void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
-void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
-void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
-void vcvtneph2hf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW1|T_ER_Z|T_MUST_EVEX|T_B64, 0x5A); }
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x7B); }
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
+void vcvtph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
+void vcvtph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
void vcvtph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B16, 0x5B); }
+void vcvtph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
+void vcvtph2hf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
void vcvtph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x69); }
void vcvtph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6B); }
void vcvtph2pd(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x5A); }
@@ -2238,8 +2245,8 @@
void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (T_F3|T_MAP5|T_ER_R|T_MUST_EVEX|T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x2A); }
void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x1D); }
void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); }
-void vcvttnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); }
-void vcvttnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); }
+void vcvttbf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); }
+void vcvttbf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); }
void vcvttpd2dqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); }
void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); }
void vcvttpd2qqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); }
@@ -2284,11 +2291,11 @@
void vcvtuw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
void vcvtw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x42, imm); }
-void vdivnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5E); }
+void vdivbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5E); }
void vdivph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5E); }
void vdivsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5E); }
void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52); }
-void vdpphps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x52); }
+void vdpphps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52); }
void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); }
void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); }
void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x88); }
@@ -2307,51 +2314,51 @@
void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); }
void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); }
void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_Z|T_MUST_EVEX, 0x55, imm); }
-void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); }
+void vfmadd132bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); }
void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x98); }
void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x99); }
-void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); }
+void vfmadd213bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); }
void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA8); }
void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); }
-void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); }
+void vfmadd231bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); }
void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); }
void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); }
void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); }
void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); }
void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); }
void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); }
-void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); }
+void vfmsub132bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); }
void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9A); }
void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9B); }
-void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); }
+void vfmsub213bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); }
void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAA); }
void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAB); }
-void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); }
+void vfmsub231bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); }
void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBA); }
void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBB); }
void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x97); }
void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); }
void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); }
void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); }
-void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); }
+void vfnmadd132bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); }
void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); }
void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); }
-void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); }
+void vfnmadd213bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); }
void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); }
void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAD); }
-void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); }
+void vfnmadd231bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); }
void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBC); }
void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBD); }
-void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); }
+void vfnmsub132bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); }
void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9E); }
void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9F); }
-void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); }
+void vfnmsub213bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); }
void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAE); }
void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAF); }
-void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); }
+void vfnmsub231bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); }
void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); }
void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); }
-void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }
+void vfpclassbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }
void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); }
void vfpclassph(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B16, 0x66, imm); }
void vfpclassps(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); }
@@ -2370,14 +2377,14 @@
void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4|T_66|T_0F38|T_EW0|T_MUST_EVEX|T_M_K|T_VSIB, 0xC7, Operand::ZMM); }
void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 0); }
void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 2); }
-void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); }
+void vgetexpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); }
void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x42); }
void vgetexpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x42); }
void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x42); }
void vgetexpsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0x43); }
void vgetexpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); }
void vgetexpss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); }
-void vgetmantpbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x26, imm); }
+void vgetmantbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x26, imm); }
void vgetmantpd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x26, imm); }
void vgetmantph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x26, imm); }
void vgetmantps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x26, imm); }
@@ -2392,17 +2399,17 @@
void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x3A, imm); }
void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x38, imm); }
void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3A, imm); }
-void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); }
+void vmaxbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); }
void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); }
void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); }
-void vminmaxnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x52, imm); }
+void vminbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); }
+void vminmaxbf16(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x52, imm); }
void vminmaxpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x52, imm); }
void vminmaxph(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B16, 0x52, imm); }
void vminmaxps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52, imm); }
void vminmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
void vminmaxsh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
void vminmaxss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
-void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); }
void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); }
void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); }
void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); }
@@ -2421,7 +2428,7 @@
void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); }
-void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); }
+void vmulbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); }
void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); }
void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); }
void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }
@@ -2595,17 +2602,17 @@
void vrcp28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCA); }
void vrcp28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCB); }
void vrcp28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCB); }
-void vrcppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); }
+void vrcpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); }
void vrcpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); }
void vrcpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4D); }
-void vreducenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x56, imm); }
+void vreducebf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x56, imm); }
void vreducepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x56, imm); }
void vreduceph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x56, imm); }
void vreduceps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x56, imm); }
void vreducesd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x57, imm); }
void vreducesh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); }
void vreducess(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); }
-void vrndscalenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x08, imm); }
+void vrndscalebf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x08, imm); }
void vrndscalepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x09, imm); }
void vrndscaleph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x08, imm); }
void vrndscaleps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x08, imm); }
@@ -2620,11 +2627,11 @@
void vrsqrt28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCC); }
void vrsqrt28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCD); }
void vrsqrt28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCD); }
-void vrsqrtpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); }
+void vrsqrtbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); }
void vrsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); }
void vrsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4F); }
-void vscalefpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); }
-void vscalefpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); }
+void vscalefbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); }
+void vscalefbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); }
void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x2C); }
void vscalefph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x2C); }
void vscalefps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x2C); }
@@ -2647,10 +2654,10 @@
void vshuff64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x23, imm); }
void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x43, imm); }
void vshufi64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x43, imm); }
-void vsqrtnepbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x51); }
+void vsqrtbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x51); }
void vsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x51); }
void vsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_F3|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x51); }
-void vsubnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5C); }
+void vsubbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5C); }
void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); }
void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); }
void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); }
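A minimal usage sketch, not part of the patch, assuming a 64-bit build and an AVX10.2-capable CPU: only the C++ method names of the BF16 operations changed (vaddnepbf16 -> vaddbf16, vmulnepbf16 -> vmulbf16, vsqrtnepbf16 -> vsqrtbf16, and so on); the encodings they emit are unchanged.

    #include <xbyak/xbyak.h>

    struct Bf16Kernel : Xbyak::CodeGenerator {
        Bf16Kernel() {
            vaddbf16(zmm0, zmm1, ptr [rdi]); // formerly vaddnepbf16
            vmulbf16(zmm0, zmm0, zmm2);      // formerly vmulnepbf16
            vsqrtbf16(zmm3, zmm0);           // formerly vsqrtnepbf16
            ret();
        }
    };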
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index 6f06665..b851d66 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -548,6 +548,11 @@
XBYAK_DEFINE_TYPE(88, tSSE4a);
XBYAK_DEFINE_TYPE(89, tCLWB);
XBYAK_DEFINE_TYPE(90, tTSXLDTRK);
+ XBYAK_DEFINE_TYPE(91, tAMX_TRANSPOSE);
+ XBYAK_DEFINE_TYPE(92, tAMX_TF32);
+ XBYAK_DEFINE_TYPE(93, tAMX_AVX512);
+ XBYAK_DEFINE_TYPE(94, tAMX_MOVRS);
+ XBYAK_DEFINE_TYPE(95, tAMX_FP8);
#undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE
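A detection sketch, not part of the patch: the new AMX feature bits are queried through Xbyak::util::Cpu in the same way as the existing ones.

    #include <cstdio>
    #include <xbyak/xbyak_util.h>

    int main()
    {
        using Xbyak::util::Cpu;
        Cpu cpu;
        if (cpu.has(Cpu::tAMX_FP8))       std::puts("AMX-FP8");
        if (cpu.has(Cpu::tAMX_TRANSPOSE)) std::puts("AMX-TRANSPOSE");
        if (cpu.has(Cpu::tAMX_TF32))      std::puts("AMX-TF32");
        if (cpu.has(Cpu::tAMX_AVX512))    std::puts("AMX-AVX512");
        if (cpu.has(Cpu::tAMX_MOVRS))     std::puts("AMX-MOVRS");
        return 0;
    }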
@@ -708,6 +713,13 @@
if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
if (EDX & (1U << 19)) type_ |= tAVX10;
if (EDX & (1U << 21)) type_ |= tAPX_F;
+
+ getCpuidEx(0x1e, 1, data);
+ if (EAX & (1U << 4)) type_ |= tAMX_FP8;
+ if (EAX & (1U << 5)) type_ |= tAMX_TRANSPOSE;
+ if (EAX & (1U << 6)) type_ |= tAMX_TF32;
+ if (EAX & (1U << 7)) type_ |= tAMX_AVX512;
+ if (EAX & (1U << 8)) type_ |= tAMX_MOVRS;
}
}
if (maxNum >= 0x19) {