Support AVX-512 (EVEX) gather instructions: vgatherdps/dpd/qps/qpd and vpgatherdd/dq/qd/qq
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 06ff65c..07ee353 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -355,6 +355,35 @@
 	puts("#endif");
 }
 
+void putGather() // prints one generated wrapper function per gather mnemonic listed in tbl
+{
+	enum { // same as xbyak.h
+		xx_yy_zz = 0,
+		xx_yx_zy = 1,
+		xx_xy_yz = 2
+	};
+	const struct Tbl {
+		const char *name; // mnemonic; becomes the name of the generated function
+		int type; // T_* flag bits, passed through type2String() before printing
+		uint8 code; // printed as 0x%02X in the generated opGather2(...) call
+		int mode; // one of the enum values above; printed as a decimal number
+	} tbl[] = {
+		{ "vpgatherdd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N4, 0x90, xx_yy_zz },
+		{ "vpgatherdq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N8, 0x90, xx_yx_zy },
+		{ "vpgatherqd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N4, 0x91, xx_xy_yz },
+		{ "vpgatherqq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N8, 0x91, xx_yy_zz },
+		{ "vgatherdps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N4, 0x92, xx_yy_zz },
+		{ "vgatherdpd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N8, 0x92, xx_yx_zy },
+		{ "vgatherqps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N4, 0x93, xx_xy_yz },
+		{ "vgatherqpd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N8, 0x93, xx_yy_zz },
+	};
+	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+		const Tbl& p = tbl[i];
+		std::string type = type2String(p.type); // presumably the flag bits spelled out as a T_A | T_B expression - confirm in type2String
+		printf("void %s(const Xmm& x, const Address& addr) { opGather2(x, addr, %s, 0x%02X, %d); }\n", p.name, type.c_str(), p.code, p.mode);
+	}
+}
+
 int main()
 {
 	puts("#ifndef XBYAK_DISABLE_AVX512");
@@ -369,5 +398,6 @@
 	putBroadcast();
 #endif
 	putCvt();
+	putGather();
 	puts("#endif");
 }
diff --git a/test/make_512.cpp b/test/make_512.cpp
index 16bbdfc..c6999b8 100644
--- a/test/make_512.cpp
+++ b/test/make_512.cpp
@@ -18,16 +18,16 @@
 const uint64 IMM8 = 1ULL << 6;
 const uint64 _REG8 = 1ULL << 7;
 const uint64 _REG16 = 1ULL << 8;
-const uint64 NEG8 = 1ULL << 9;
-const uint64 IMM16 = 1ULL << 10;
-const uint64 NEG16 = 1ULL << 11;
+const uint64 XMM_K = 1ULL << 9; // xmm operand with an opmask; rendered as xmm5 | k7 (xbyak syntax) or xmm5{k7} (non-xbyak syntax)
+const uint64 YMM_K = 1ULL << 10; // ymm operand with an opmask; rendered as ymm5 | k4 (xbyak syntax) or ymm5{k4} (non-xbyak syntax)
+const uint64 ZMM_K = 1ULL << 11; // zmm operand with an opmask; rendered as zmm5 | k3 (xbyak syntax) or zmm5{k3} (non-xbyak syntax)
 const uint64 AX = 1ULL << 12;
 const uint64 AL = 1ULL << 13;
 const uint64 IMM_1 = 1ULL << 14;
 const uint64 MEM8 = 1ULL << 15;
 const uint64 MEM16 = 1ULL << 16;
 const uint64 MEM32 = 1ULL << 17;
-const uint64 ONE = 1ULL << 19;
+const uint64 VM32Z = 1ULL << 19; // memory operand with a zmm index register; rendered as [64+zmm13*2+rcx]
 const uint64 CL = 1ULL << 20;
 const uint64 MEM_ONLY_DISP = 1ULL << 21;
 const uint64 NEG32 = 1ULL << 23;
@@ -337,32 +337,24 @@
 			return "al";
 		case CL:
 			return "cl";
-		case ONE:
-			return "1";
 		case IMM32:
 			return isXbyak_ ? "12345678" : "dword 12345678";
-		case IMM16:
-			return isXbyak_ ? "1000" : "word 1000";
 		case IMM8:
 			return isXbyak_ ? "4" : "byte 4";
-		case NEG8:
-			return isXbyak_ ? "-30" : "byte -30";
-		case NEG16:
-			return isXbyak_ ? "-1000" : "word -1000";
-		case NEG32:
-			return isXbyak_ ? "-100000" : "dword -100000";
 		case IMM_1:
 			return "4";
 		case IMM_2:
 			return isXbyak_ ? "0xda" : "0xda";
 		case VM32X_32:
-			return isXbyak_ ? "ptr [ebp+4+xmm1*8]" : "[ebp+4+xmm1*8]";
+			return isXbyak_ ? "ptr [ebp+64+xmm1*8]" : "[ebp+64+xmm1*8]";
 		case VM32X_64:
-			return isXbyak_ ? "ptr [12345+xmm13*2]" : "[12345+xmm13*2]";
+			return isXbyak_ ? "ptr [rax+64+xmm13*2]" : "[rax+64+xmm13*2]";
 		case VM32Y_32:
 			return isXbyak_ ? "ptr [ymm4]" : "[ymm4]";
 		case VM32Y_64:
-			return isXbyak_ ? "ptr [12345+ymm13*2+r13]" : "[12345+ymm13*2+r13]";
+			return isXbyak_ ? "ptr [64+ymm13*2+r13]" : "[64+ymm13*2+r13]";
+		case VM32Z:
+			return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]";
 		case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}";
 		case M_1to4: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to4}";
 		case M_1to8: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to8}";
@@ -417,6 +409,12 @@
 		case MEM_K:
 			return isXbyak_ ? "ptr [eax] | k1" : "[eax]{k1}";
 #endif
+		case XMM_K:
+			return isXbyak_ ? "xmm5 | k7" : "xmm5{k7}";
+		case YMM_K:
+			return isXbyak_ ? "ymm5 | k4" : "ymm5{k4}";
+		case ZMM_K:
+			return isXbyak_ ? "zmm5 | k3" : "zmm5{k3}";
 		}
 		return 0;
 	}
@@ -1548,10 +1546,53 @@
 		put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64);
 #endif
 	}
+	void putGather() // emits gather test cases: each mnemonic combined with the operand pairings of its mode
+	{
+#ifdef XBYAK64
+		enum { // same enumerator names as the gather mode enum in xbyak.h
+			xx_yy_zz, // pairs emitted: XMM_K+VM32X, YMM_K+VM32Y, ZMM_K+VM32Z
+			xx_yx_zy, // pairs emitted: XMM_K+VM32X, YMM_K+VM32X, ZMM_K+VM32Y
+			xx_xy_yz // pairs emitted: XMM_K+VM32X, XMM_K+VM32Y, YMM_K+VM32Z
+		};
+		const struct Tbl {
+			const char *name; // mnemonic handed to put()
+			int mode; // selects which operand pairings are emitted below
+		} tbl[] = {
+			{ "vpgatherdd", xx_yy_zz },
+			{ "vpgatherdq", xx_yx_zy },
+			{ "vpgatherqd", xx_xy_yz },
+			{ "vpgatherqq", xx_yy_zz },
+			{ "vgatherdps", xx_yy_zz },
+			{ "vgatherdpd", xx_yx_zy },
+			{ "vgatherqps", xx_xy_yz },
+			{ "vgatherqpd", xx_yy_zz },
+		};
+		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+			const Tbl& p = tbl[i];
+			switch (p.mode) { // every mode emits exactly three put() calls; a mode outside the enum emits nothing
+			case xx_yy_zz:
+				put(p.name, XMM_K, VM32X);
+				put(p.name, YMM_K, VM32Y);
+				put(p.name, ZMM_K, VM32Z);
+				break;
+			case xx_yx_zy:
+				put(p.name, XMM_K, VM32X);
+				put(p.name, YMM_K, VM32X);
+				put(p.name, ZMM_K, VM32Y);
+				break;
+			case xx_xy_yz:
+				put(p.name, XMM_K, VM32X);
+				put(p.name, XMM_K, VM32Y);
+				put(p.name, YMM_K, VM32Z);
+				break;
+			}
+		}
+#endif
+	}
 	void putMin()
 	{
 #ifdef XBYAK64
-//		put512_cvt();
+		putGather(); // only the gather cases are emitted from here
 #endif
 	}
 	void putAVX512()
@@ -1588,6 +1629,8 @@
 		put512_cvt();
 		separateFunc();
 		putMisc1();
+		separateFunc();
+		putGather();
 #endif
 	}
 };
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 2176863..156000f 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -173,6 +173,7 @@
 	ERR_ER_IS_INVALID,
 	ERR_INVALID_BROADCAST,
 	ERR_INVALID_OPMASK_WITH_MEMORY,
+	ERR_INVALID_ZERO,
 	ERR_INTERNAL
 };
 
@@ -231,6 +232,7 @@
 			"er(embedded rounding) is invalid",
 			"invalid broadcast",
 			"invalid opmask with memory",
+			"invalid zero",
 			"internal error",
 		};
 		assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
@@ -663,7 +665,7 @@
 		: scale_(scale)
 		, disp_(0)
 	{
-		if (!r.isREG(i32e) && !r.is(Reg::XMM|Reg::YMM)) throw Error(ERR_BAD_SIZE_OF_REGISTER);
+		if (!r.isREG(i32e) && !r.is(Reg::XMM|Reg::YMM|Reg::ZMM)) throw Error(ERR_BAD_SIZE_OF_REGISTER);
 		if (scale != 1 && scale != 2 && scale != 4 && scale != 8) throw Error(ERR_BAD_SCALE);
 		if (r.getBit() >= 128 || scale != 1) { // xmm/ymm is always index
 			index_ = r;
@@ -671,8 +673,7 @@
 			base_ = r;
 		}
 	}
-	bool isVsib() const { return index_.isBit(128|256); }
-	bool isYMM() const { return index_.isBit(256); }
+	bool isVsib(int bit = 128 | 256 | 512) const { return index_.isBit(bit); } // result of index_.isBit(bit); the default mask names all three vector widths (128/256/512)
 	void optimize()
 	{
 		// [reg * 2] => [reg + reg]
@@ -1416,7 +1417,7 @@
 		T_RZ_SAE = 4,
 		T_SAE = 5,
 	};
-	int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0)
+	int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, int VL = 0)
 	{
 		if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID);
 		int w = (type & T_EW1) ? 1 : 0;
@@ -1441,7 +1442,8 @@
 			}
 			b = true;
 		} else {
-			int VL = Max(Max(reg.getBit(), base.getBit()), (v ? v->getBit() : 0));
+			if (v) VL = Max(VL, v->getBit());
+			VL = Max(Max(reg.getBit(), base.getBit()), VL);
 			LL = (VL == 512) ? 2 : (VL == 256) ? 1 : 0;
 			if (b) {
 				disp8N = (type & T_B32) ? 4 : 8;
@@ -1793,7 +1795,8 @@
 					if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST);
 					b = true;
 				}
-				disp8N = evex(r, base, p1, type, code, x, b, aaa);
+				int VL = addr.getRegExp().isVsib() ? addr.getRegExp().getIndex().getBit() : 0;
+				disp8N = evex(r, base, p1, type, code, x, b, aaa, VL);
 			} else {
 				vex(r, base, p1, type, code, x);
 			}
@@ -1888,11 +1891,11 @@
 	}
 	void opGather(const Xmm& x1, const Address& addr, const Xmm& x2, int type, uint8 code, int mode)
 	{
-		if (!addr.getRegExp().isVsib()) throw Error(ERR_BAD_VSIB_ADDRESSING);
+		if (!addr.getRegExp().isVsib(128 | 256)) throw Error(ERR_BAD_VSIB_ADDRESSING);
 		const int y_vx_y = 0;
 		const int y_vy_y = 1;
 //		const int x_vy_x = 2;
-		const bool isAddrYMM = addr.getRegExp().isYMM();
+		const bool isAddrYMM = addr.getRegExp().getIndex().getBit() == 256;
 		if (!x1.isXMM() || isAddrYMM || !x2.isXMM()) {
 			bool isOK = false;
 			if (mode == y_vx_y) {
@@ -1907,6 +1910,32 @@
 		addr.permitVsib();
 		opAVX_X_X_XM(isAddrYMM ? Ymm(x1.getIdx()) : x1, isAddrYMM ? Ymm(x2.getIdx()) : x2, addr, type | T_YMM, code);
 	}
+	enum { // gather operand-width modes consumed by checkGather2 (register x vs. VSIB index width)
+		xx_yy_zz = 0, // besides xmm/128-bit index: ymm with 256-bit index, zmm with 512-bit index
+		xx_yx_zy = 1, // besides xmm/128-bit index: ymm with 128-bit index, zmm with 256-bit index
+		xx_xy_yz = 2 // besides xmm/128-bit index: xmm with 256-bit index, ymm with 512-bit index
+	};
+	void checkGather2(const Xmm& x, const Address& addr, int mode) const // throws unless x and the VSIB index of addr form a width pairing accepted for mode
+	{
+		if (x.hasZero()) throw Error(ERR_INVALID_ZERO); // presumably rejects the zeroing modifier on x - confirm against hasZero()
+		const RegExp& re = addr.getRegExp();
+		if (x.isXMM() && re.isVsib(128)) return; // xmm register with a 128-bit index is accepted in every mode
+		switch (mode) {
+		case xx_yy_zz: if ((x.isYMM() && re.isVsib(256)) || (x.isZMM() && re.isVsib(512))) return; // ymm/256-bit index or zmm/512-bit index
+			break;
+		case xx_yx_zy: if ((x.isYMM() && re.isVsib(128)) || (x.isZMM() && re.isVsib(256))) return; // ymm/128-bit index or zmm/256-bit index
+			break;
+		case xx_xy_yz: if ((x.isXMM() && re.isVsib(256)) || (x.isYMM() && re.isVsib(512))) return; // xmm/256-bit index or ymm/512-bit index
+			break;
+		}
+		throw Error(ERR_BAD_VSIB_ADDRESSING); // no accepted pairing matched, or mode is not one of the three enum values
+	}
+	void opGather2(const Xmm& x, const Address& addr, int type, uint8 code, int mode) // validates the operands with checkGather2, then encodes through opVex
+	{
+		checkGather2(x, addr, mode); // NOTE(review): nothing visible here requires x to carry an opmask (k1-k7) - confirm whether that is enforced elsewhere
+		addr.permitVsib(); // presumably marks addr as allowed to use a vector index register - confirm in Address
+		opVex(x, 0, addr, type, code); // second argument is passed as 0; presumably means no additional register operand - confirm against opVex
+	}
 public:
 	unsigned int getVersion() const { return VERSION; }
 	using CodeArray::db;
diff --git a/xbyak/xbyak_avx512.h b/xbyak/xbyak_avx512.h
index 2e3aa71..afb8fce 100644
--- a/xbyak/xbyak_avx512.h
+++ b/xbyak/xbyak_avx512.h
@@ -173,4 +173,12 @@
 void vcvttsd2usi(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_F2 | T_0F | T_MUST_EVEX | T_EW1 | T_N8 | T_SAE_X, 0x78); }
 void vcvttss2usi(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_F3 | T_0F | T_MUST_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x78); }
 #endif
+void vpgatherdd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_N4, 0x90, 0); }
+void vpgatherdq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_N8, 0x90, 1); }
+void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_N4, 0x91, 2); }
+void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_N8, 0x91, 0); }
+void vgatherdps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_N4, 0x92, 0); }
+void vgatherdpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_N8, 0x92, 1); }
+void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_N4, 0x93, 2); }
+void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_N8, 0x93, 0); }
 #endif