add EVEX encoding (with mem{k} opmask support) for vmovsd, vmovss
diff --git a/gen/avx_type.hpp b/gen/avx_type.hpp
index 745a0f8..b205c90 100644
--- a/gen/avx_type.hpp
+++ b/gen/avx_type.hpp
@@ -24,6 +24,7 @@
 	T_MUST_EVEX = 1 << 23,
 	T_B32 = 1 << 24, // m32bcst
 	T_B64 = 1 << 25, // m64bcst
+	T_M_K = 1 << 26, // mem{k}
 	T_XXX
 };
 
@@ -124,5 +125,9 @@
 		if (!str.empty()) str += " | ";
 		str += "T_B64";
 	}
+	if (type & T_M_K) {
+		if (!str.empty()) str += " | ";
+		str += "T_M_K";
+	}
 	return str;
 }
\ No newline at end of file
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 46e4ed5..6370009 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1541,10 +1541,12 @@
 		// vmovsd, vmovss
 		for (int i = 0; i < 2; i++) {
 			char c1 = i == 0 ? 'd' : 's';
-			char c2 = i == 0 ? '2' : '3';
-			printf("void vmovs%c(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, T_0F | T_F%c, 0x10); }\n", c1, c2);
-			printf("void vmovs%c(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F%c, 0x10); }\n", c1, c2);
-			printf("void vmovs%c(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F%c, 0x11); }\n", c1, c2);
+			int type = T_0F | T_EVEX;
+			type |= i == 0 ? T_F2 | T_EW1 : T_F3 | T_EW0;
+			std::string s = type2String(type);
+			printf("void vmovs%c(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, %s, 0x10); }\n", c1, s.c_str());
+			printf("void vmovs%c(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", c1, s.c_str());
+			printf("void vmovs%c(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, %s | T_M_K, 0x11); }\n", c1, s.c_str());
 		}
 	}
 	// cvt
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index a65030e..7babd15 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -97,12 +97,13 @@
 #ifdef XBYAK64
 const uint64 XMM_KZ = 1ULL << 52;
 const uint64 YMM_KZ = 1ULL << 53;
-const uint64 ZMM_KZ = 1ULL << 54; // max value
+const uint64 ZMM_KZ = 1ULL << 54;
 #else
 const uint64 XMM_KZ = 0;
 const uint64 YMM_KZ = 0;
 const uint64 ZMM_KZ = 0;
 #endif
+const uint64 MEM_K = 1ULL << 55; // max value
 
 const uint64 NOPARA = 1ULL << (bitEnd - 1);
 
@@ -388,6 +389,8 @@
 			return isXbyak_ ? "ymm2 |k3|T_z" : "ymm2{k3}{z}";
 		case ZMM_KZ:
 			return isXbyak_ ? "zmm7|k1" : "zmm7{k1}";
+		case MEM_K:
+			return isXbyak_ ? "ptr [rax] | k1" : "[rax]{k1}";
 #else
 		case XMM_SAE:
 			return isXbyak_ ? "xmm5 | T_sae" : "xmm5, {sae}";
@@ -395,6 +398,8 @@
 			return isXbyak_ ? "zmm5 | T_sae" : "zmm5, {sae}";
 		case ZMM_ER:
 			return isXbyak_ ? "zmm2 | T_rd_sae" : "zmm2, {rd-sae}";
+		case MEM_K:
+			return isXbyak_ ? "ptr [eax] | k1" : "[eax]{k1}";
 #endif
 		}
 		return 0;
@@ -2616,6 +2621,13 @@
 		put("vmovntdq", MEM, _XMM3 | _YMM3 | ZMM);
 		put("vmovntpd", MEM, _XMM3 | _YMM3 | ZMM);
 		put("vmovntps", MEM, _XMM3 | _YMM3 | ZMM);
+
+		put("vmovsd", XMM_KZ, _XMM3, _XMM3);
+		put("vmovsd", XMM_KZ, MEM);
+		put("vmovsd", MEM_K, XMM);
+		put("vmovss", XMM_KZ, _XMM3, _XMM3);
+		put("vmovss", XMM_KZ, MEM);
+		put("vmovss", MEM_K, XMM);
 		{
 			const char tbl[][16] = {
 				"vmovhpd",
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 93173ab..4e545c5 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -172,6 +172,7 @@
 	ERR_SAE_IS_INVALID,
 	ERR_ER_IS_INVALID,
 	ERR_INVALID_BROADCAST,
+	ERR_INVALID_OPMASK_WITH_MEMORY,
 	ERR_INTERNAL
 };
 
@@ -229,6 +230,7 @@
 			"sae(suppress all exceptions) is invalid",
 			"er(embedded rounding) is invalid",
 			"invalid broadcast",
+			"invalid opmask with memory",
 			"internal error",
 		};
 		assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
@@ -546,7 +548,7 @@
 template<class T>
 T operator|(const T& x, const Opmask& k)
 {
-	if (!x.is(Operand::XMM | Operand::YMM | Operand::ZMM | Operand::OPMASK)) throw Error(ERR_BAD_COMBINATION);
+	if (!x.is(Operand::XMM | Operand::YMM | Operand::ZMM | Operand::OPMASK | Operand::MEM)) throw Error(ERR_BAD_COMBINATION);
 	T r(x);
 	r.setOpmaskIdx(k.getIdx());
 	return r;
@@ -955,7 +957,7 @@
 	}
 #ifdef XBYAK64
 	explicit Address(size_t disp)
-		: Operand(0, MEM, 64), e_(disp), label_(0), mode_(M_64bitDisp), permitVsib_(false), broadcast_(false) { }
+		: Operand(0, MEM, 64), e_(disp), label_(0), mode_(M_64bitDisp), permitVsib_(false), broadcast_(false) { }
 	Address(uint32 sizeBit, bool broadcast, const RegRip& addr)
 		: Operand(0, MEM, sizeBit), e_(addr.disp_), label_(addr.label_), mode_(M_rip), permitVsib_(false), broadcast_(broadcast) { }
 #endif
@@ -1364,6 +1366,7 @@
 		T_MUST_EVEX = 1 << 23,
 		T_B32 = 1 << 24, // m32bcst
 		T_B64 = 1 << 25, // m64bcst
+		T_M_K = 1 << 26, // mem{k}
 		T_XXX
 	};
 	void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false)
@@ -1401,7 +1404,7 @@
 		T_RZ_SAE = 4,
 		T_SAE = 5,
 	};
-	void evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false)
+	void evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0)
 	{
 		if (!(type & T_EVEX)) throw Error(ERR_EVEX_IS_INVALID);
 		int w = (type & T_EW1) ? 1 : 0;
@@ -1431,7 +1434,7 @@
 		}
 		bool Vp = !(v ? v->isExtIdx2() : 0);
 		bool z = reg.hasZero();
-		int aaa = reg.getOpmaskIdx();
+		if (aaa == 0) aaa = reg.getOpmaskIdx();
 		db(0x62);
 		db((R ? 0x80 : 0) | (X ? 0x40 : 0) | (B ? 0x20 : 0) | (Rp ? 0x10 : 0) | (mm & 3));
 		db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | 4 | (pp & 3));
@@ -1756,7 +1759,9 @@
 			if (BIT == 64 && addr.is32bit()) db(0x67);
 			int disp8N = 0;
 			bool x = addr.getRegExp().getIndex().isExtIdx();
-			if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast()) {
+			if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) {
+				int aaa = addr.getOpmaskIdx();
+				if (aaa && !(type & T_M_K)) throw Error(ERR_INVALID_OPMASK_WITH_MEMORY);
 				bool b = false;
 				if (addr.isBroadcast()) {
 					if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST);
@@ -1765,7 +1770,7 @@
 				} else {
 					disp8N = 1;
 				}
-				evex(r, base, p1, type, code, x, b);
+				evex(r, base, p1, type, code, x, b, aaa);
 			} else {
 				vex(r, base, p1, type, code, x);
 			}
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 89d8715..51f04cf 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1467,12 +1467,12 @@
 void vmovntpd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_66 | T_YMM | T_EVEX | T_EW1, 0x2B); }
 void vmovntps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_YMM | T_EVEX | T_EW0, 0x2B); }
 void vmovntdqa(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F38 | T_66 | T_YMM | T_EVEX | T_EW0, 0x2A); }
-void vmovsd(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, T_0F | T_F2, 0x10); }
-void vmovsd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F2, 0x10); }
-void vmovsd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F2, 0x11); }
-void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, T_0F | T_F3, 0x10); }
-void vmovss(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F3, 0x10); }
-void vmovss(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F3, 0x11); }
+void vmovsd(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, T_0F | T_F2 | T_EW1 | T_EVEX, 0x10); }
+void vmovsd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F2 | T_EW1 | T_EVEX, 0x10); }
+void vmovsd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F2 | T_EW1 | T_EVEX | T_M_K, 0x11); }
+void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, T_0F | T_F3 | T_EW0 | T_EVEX, 0x10); }
+void vmovss(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F3 | T_EW0 | T_EVEX, 0x10); }
+void vmovss(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_F3 | T_EW0 | T_EVEX | T_M_K, 0x11); }
 void vcvtss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0, 0x2D); }
 void vcvttss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0, 0x2C); }
 void vcvtsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W0, 0x2D); }