support AVX-512 (EVEX) vgather*/vpgather* instructions
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 06ff65c..07ee353 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -355,6 +355,35 @@
puts("#endif");
}
+void putGather() // prints one wrapper member function per table row; each wrapper forwards to opGather2
+{
+ enum { // same as xbyak.h
+ xx_yy_zz = 0, // destination/index widths: xmm/xmm, ymm/ymm, zmm/zmm
+ xx_yx_zy = 1, // destination/index widths: xmm/xmm, ymm/xmm, zmm/ymm
+ xx_xy_yz = 2 // destination/index widths: xmm/xmm, xmm/ymm, ymm/zmm
+ };
+ const struct Tbl {
+ const char *name; // mnemonic; becomes the name of the generated function
+ int type; // T_* flag set, rendered as text by type2String()
+ uint8 code; // code byte handed to opGather2, printed as 0x%02X
+ int mode; // one of the enum values above, printed as a decimal literal
+ } tbl[] = {
+ { "vpgatherdd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N4, 0x90, xx_yy_zz },
+ { "vpgatherdq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N8, 0x90, xx_yx_zy },
+ { "vpgatherqd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N4, 0x91, xx_xy_yz },
+ { "vpgatherqq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N8, 0x91, xx_yy_zz },
+ { "vgatherdps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N4, 0x92, xx_yy_zz },
+ { "vgatherdpd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N8, 0x92, xx_yx_zy },
+ { "vgatherqps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N4, 0x93, xx_xy_yz },
+ { "vgatherqpd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N8, 0x93, xx_yy_zz },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl& p = tbl[i];
+ std::string type = type2String(p.type);
+ printf("void %s(const Xmm& x, const Address& addr) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, type.c_str(), p.code, p.mode); // one line of generated source per instruction
+ }
+}
+
int main()
{
puts("#ifndef XBYAK_DISABLE_AVX512");
@@ -369,5 +398,6 @@
putBroadcast();
#endif
putCvt();
+ putGather();
puts("#endif");
}
diff --git a/test/make_512.cpp b/test/make_512.cpp
index 16bbdfc..c6999b8 100644
--- a/test/make_512.cpp
+++ b/test/make_512.cpp
@@ -18,16 +18,16 @@
const uint64 IMM8 = 1ULL << 6;
const uint64 _REG8 = 1ULL << 7;
const uint64 _REG16 = 1ULL << 8;
-const uint64 NEG8 = 1ULL << 9;
-const uint64 IMM16 = 1ULL << 10;
-const uint64 NEG16 = 1ULL << 11;
+const uint64 XMM_K = 1ULL << 9; // xmm register with an opmask, rendered as "xmm5 | k7" / "xmm5{k7}"
+const uint64 YMM_K = 1ULL << 10; // ymm register with an opmask, rendered as "ymm5 | k4" / "ymm5{k4}"
+const uint64 ZMM_K = 1ULL << 11; // zmm register with an opmask, rendered as "zmm5 | k3" / "zmm5{k3}"
const uint64 AX = 1ULL << 12;
const uint64 AL = 1ULL << 13;
const uint64 IMM_1 = 1ULL << 14;
const uint64 MEM8 = 1ULL << 15;
const uint64 MEM16 = 1ULL << 16;
const uint64 MEM32 = 1ULL << 17;
-const uint64 ONE = 1ULL << 19;
+const uint64 VM32Z = 1ULL << 19; // address with a zmm index register, rendered as "[64+zmm13*2+rcx]"
const uint64 CL = 1ULL << 20;
const uint64 MEM_ONLY_DISP = 1ULL << 21;
const uint64 NEG32 = 1ULL << 23;
@@ -337,32 +337,24 @@
return "al";
case CL:
return "cl";
- case ONE:
- return "1";
case IMM32:
return isXbyak_ ? "12345678" : "dword 12345678";
- case IMM16:
- return isXbyak_ ? "1000" : "word 1000";
case IMM8:
return isXbyak_ ? "4" : "byte 4";
- case NEG8:
- return isXbyak_ ? "-30" : "byte -30";
- case NEG16:
- return isXbyak_ ? "-1000" : "word -1000";
- case NEG32:
- return isXbyak_ ? "-100000" : "dword -100000";
case IMM_1:
return "4";
case IMM_2:
return isXbyak_ ? "0xda" : "0xda";
case VM32X_32:
- return isXbyak_ ? "ptr [ebp+4+xmm1*8]" : "[ebp+4+xmm1*8]";
+ return isXbyak_ ? "ptr [ebp+64+xmm1*8]" : "[ebp+64+xmm1*8]";
case VM32X_64:
- return isXbyak_ ? "ptr [12345+xmm13*2]" : "[12345+xmm13*2]";
+ return isXbyak_ ? "ptr [rax+64+xmm13*2]" : "[rax+64+xmm13*2]";
case VM32Y_32:
return isXbyak_ ? "ptr [ymm4]" : "[ymm4]";
case VM32Y_64:
- return isXbyak_ ? "ptr [12345+ymm13*2+r13]" : "[12345+ymm13*2+r13]";
+ return isXbyak_ ? "ptr [64+ymm13*2+r13]" : "[64+ymm13*2+r13]";
+ case VM32Z:
+ return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]";
case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}";
case M_1to4: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to4}";
case M_1to8: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to8}";
@@ -417,6 +409,12 @@
case MEM_K:
return isXbyak_ ? "ptr [eax] | k1" : "[eax]{k1}";
#endif
+ case XMM_K:
+ return isXbyak_ ? "xmm5 | k7" : "xmm5{k7}";
+ case YMM_K:
+ return isXbyak_ ? "ymm5 | k4" : "ymm5{k4}";
+ case ZMM_K:
+ return isXbyak_ ? "zmm5 | k3" : "zmm5{k3}";
}
return 0;
}
@@ -1548,10 +1546,53 @@
put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64);
#endif
}
+ void putGather() // emits gather test cases: each mnemonic is paired with the operand sizes its mode allows
+ {
+#ifdef XBYAK64
+ enum { // implicit values 0, 1, 2 - same order as the mode enum in xbyak.h
+ xx_yy_zz, // destination/index widths: xmm/xmm, ymm/ymm, zmm/zmm
+ xx_yx_zy, // destination/index widths: xmm/xmm, ymm/xmm, zmm/ymm
+ xx_xy_yz // destination/index widths: xmm/xmm, xmm/ymm, ymm/zmm
+ };
+ const struct Tbl {
+ const char *name; // mnemonic handed to put()
+ int mode; // selects which operand-size combinations are generated below
+ } tbl[] = {
+ { "vpgatherdd", xx_yy_zz },
+ { "vpgatherdq", xx_yx_zy },
+ { "vpgatherqd", xx_xy_yz },
+ { "vpgatherqq", xx_yy_zz },
+ { "vgatherdps", xx_yy_zz },
+ { "vgatherdpd", xx_yx_zy },
+ { "vgatherqps", xx_xy_yz },
+ { "vgatherqpd", xx_yy_zz },
+ };
+ for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+ const Tbl& p = tbl[i];
+ switch (p.mode) {
+ case xx_yy_zz: // index register has the same width as the destination
+ put(p.name, XMM_K, VM32X); // *_K operands are opmasked destinations; VM32X/VM32Y are declared outside this hunk
+ put(p.name, YMM_K, VM32Y);
+ put(p.name, ZMM_K, VM32Z);
+ break;
+ case xx_yx_zy: // ymm/zmm destinations take an index one size class narrower
+ put(p.name, XMM_K, VM32X);
+ put(p.name, YMM_K, VM32X);
+ put(p.name, ZMM_K, VM32Y);
+ break;
+ case xx_xy_yz: // besides xmm/xmm, the index is one size class wider than the destination
+ put(p.name, XMM_K, VM32X);
+ put(p.name, XMM_K, VM32Y);
+ put(p.name, YMM_K, VM32Z);
+ break;
+ }
+ }
+#endif // XBYAK64
+ }
void putMin()
{
#ifdef XBYAK64
-// put512_cvt();
+ putGather(); // the only test group putMin() emits after this change
#endif
}
void putAVX512()
@@ -1588,6 +1629,8 @@
put512_cvt();
separateFunc();
putMisc1();
+ separateFunc();
+ putGather();
#endif
}
};
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 2176863..156000f 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -173,6 +173,7 @@
ERR_ER_IS_INVALID,
ERR_INVALID_BROADCAST,
ERR_INVALID_OPMASK_WITH_MEMORY,
+ ERR_INVALID_ZERO,
ERR_INTERNAL
};
@@ -231,6 +232,7 @@
"er(embedded rounding) is invalid",
"invalid broadcast",
"invalid opmask with memory",
+ "invalid zero",
"internal error",
};
assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
@@ -663,7 +665,7 @@
: scale_(scale)
, disp_(0)
{
- if (!r.isREG(i32e) && !r.is(Reg::XMM|Reg::YMM)) throw Error(ERR_BAD_SIZE_OF_REGISTER);
+ if (!r.isREG(i32e) && !r.is(Reg::XMM|Reg::YMM|Reg::ZMM)) throw Error(ERR_BAD_SIZE_OF_REGISTER);
if (scale != 1 && scale != 2 && scale != 4 && scale != 8) throw Error(ERR_BAD_SCALE);
if (r.getBit() >= 128 || scale != 1) { // xmm/ymm is always index
index_ = r;
@@ -671,8 +673,7 @@
base_ = r;
}
}
- bool isVsib() const { return index_.isBit(128|256); }
- bool isYMM() const { return index_.isBit(256); }
+ bool isVsib(int bit = 128 | 256 | 512) const { return index_.isBit(bit); }
void optimize()
{
// [reg * 2] => [reg + reg]
@@ -1416,7 +1417,7 @@
T_RZ_SAE = 4,
T_SAE = 5,
};
- int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0)
+ int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, int VL = 0)
{
if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID);
int w = (type & T_EW1) ? 1 : 0;
@@ -1441,7 +1442,8 @@
}
b = true;
} else {
- int VL = Max(Max(reg.getBit(), base.getBit()), (v ? v->getBit() : 0));
+ if (v) VL = Max(VL, v->getBit());
+ VL = Max(Max(reg.getBit(), base.getBit()), VL);
LL = (VL == 512) ? 2 : (VL == 256) ? 1 : 0;
if (b) {
disp8N = (type & T_B32) ? 4 : 8;
@@ -1793,7 +1795,8 @@
if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST);
b = true;
}
- disp8N = evex(r, base, p1, type, code, x, b, aaa);
+ int VL = addr.getRegExp().isVsib() ? addr.getRegExp().getIndex().getBit() : 0;
+ disp8N = evex(r, base, p1, type, code, x, b, aaa, VL);
} else {
vex(r, base, p1, type, code, x);
}
@@ -1888,11 +1891,11 @@
}
void opGather(const Xmm& x1, const Address& addr, const Xmm& x2, int type, uint8 code, int mode)
{
- if (!addr.getRegExp().isVsib()) throw Error(ERR_BAD_VSIB_ADDRESSING);
+ if (!addr.getRegExp().isVsib(128 | 256)) throw Error(ERR_BAD_VSIB_ADDRESSING);
const int y_vx_y = 0;
const int y_vy_y = 1;
// const int x_vy_x = 2;
- const bool isAddrYMM = addr.getRegExp().isYMM();
+ const bool isAddrYMM = addr.getRegExp().getIndex().getBit() == 256;
if (!x1.isXMM() || isAddrYMM || !x2.isXMM()) {
bool isOK = false;
if (mode == y_vx_y) {
@@ -1907,6 +1910,32 @@
addr.permitVsib();
opAVX_X_X_XM(isAddrYMM ? Ymm(x1.getIdx()) : x1, isAddrYMM ? Ymm(x2.getIdx()) : x2, addr, type | T_YMM, code);
}
+ enum { // operand-size modes checked by checkGather2; each pair of letters is destination,index
+ xx_yy_zz = 0, // xmm,xmm / ymm,ymm / zmm,zmm
+ xx_yx_zy = 1, // xmm,xmm / ymm,xmm / zmm,ymm
+ xx_xy_yz = 2 // xmm,xmm / xmm,ymm / ymm,zmm
+ };
+ void checkGather2(const Xmm& x, const Address& addr, int mode) const // validates destination x against the index register of addr for the given mode; returns normally when accepted, throws otherwise
+ {
+ if (x.hasZero()) throw Error(ERR_INVALID_ZERO); // rejects a destination for which hasZero() is set (presumably the {z} zeroing modifier - confirm)
+ const RegExp& re = addr.getRegExp();
+ if (x.isXMM() && re.isVsib(128)) return; // xmm destination with xmm index is accepted in every mode
+ switch (mode) {
+ case xx_yy_zz: if ((x.isYMM() && re.isVsib(256)) || (x.isZMM() && re.isVsib(512))) return; // ymm/ymm or zmm/zmm
+ break;
+ case xx_yx_zy: if ((x.isYMM() && re.isVsib(128)) || (x.isZMM() && re.isVsib(256))) return; // ymm/xmm or zmm/ymm
+ break;
+ case xx_xy_yz: if ((x.isXMM() && re.isVsib(256)) || (x.isYMM() && re.isVsib(512))) return; // xmm/ymm or ymm/zmm
+ break;
+ }
+ throw Error(ERR_BAD_VSIB_ADDRESSING); // no accepted destination/index combination matched (also reached for an unknown mode)
+ }
+ void opGather2(const Xmm& x, const Address& addr, int type, uint8 code, int mode) // shared emitter called by the vgather*/vpgather* wrappers in xbyak_avx512.h
+ {
+ checkGather2(x, addr, mode); // throws before anything is emitted if the operand sizes do not fit the mode
+ addr.permitVsib(); // NOTE(review): presumably marks the vector-index address as allowed for later checks - confirm
+ opVex(x, 0, addr, type, code); // second argument is 0: no additional register operand is supplied
+ }
public:
unsigned int getVersion() const { return VERSION; }
using CodeArray::db;
diff --git a/xbyak/xbyak_avx512.h b/xbyak/xbyak_avx512.h
index 2e3aa71..afb8fce 100644
--- a/xbyak/xbyak_avx512.h
+++ b/xbyak/xbyak_avx512.h
@@ -173,4 +173,12 @@
void vcvttsd2usi(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_F2 | T_0F | T_MUST_EVEX | T_EW1 | T_N8 | T_SAE_X, 0x78); }
void vcvttss2usi(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_F3 | T_0F | T_MUST_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x78); }
#endif
+void vpgatherdd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_N4, 0x90, 0); } // mode 0 = xx_yy_zz
+void vpgatherdq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_N8, 0x90, 1); } // mode 1 = xx_yx_zy
+void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_N4, 0x91, 2); } // mode 2 = xx_xy_yz
+void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_N8, 0x91, 0); } // mode 0 = xx_yy_zz
+void vgatherdps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_N4, 0x92, 0); } // mode 0 = xx_yy_zz
+void vgatherdpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_N8, 0x92, 1); } // mode 1 = xx_yx_zy
+void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_N4, 0x93, 2); } // mode 2 = xx_xy_yz
+void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_N8, 0x93, 0); } // mode 0 = xx_yy_zz
#endif