Add AVX-512 vcmp{pd,ps,sd,ss} with an opmask destination
diff --git a/gen/avx_type.hpp b/gen/avx_type.hpp
index 1fd253f..e7b2883 100644
--- a/gen/avx_type.hpp
+++ b/gen/avx_type.hpp
@@ -14,7 +14,10 @@
 	T_EW0 = 1 << 13,
 	T_EW1 = 1 << 14,
 	T_YMM = 1 << 15,
-	T_EVEX = 1 << 16
+	T_EVEX = 1 << 16,
+	T_ER = 1 << 17,
+	T_SAE = 1 << 18,
+	T_MUST_EVEX = 1 << 19
 };
 
 const int NONE = 256; // same as Xbyak::CodeGenerator::NONE
@@ -78,5 +81,17 @@
 		if (!str.empty()) str += " | ";
 		str += "T_EVEX";
 	}
+	if (type & T_ER) {
+		if (!str.empty()) str += " | ";
+		str += "T_ER";
+	}
+	if (type & T_SAE) {
+		if (!str.empty()) str += " | ";
+		str += "T_SAE";
+	}
+	if (type & T_MUST_EVEX) {
+		if (!str.empty()) str += " | ";
+		str += "T_MUST_EVEX";
+	}
 	return str;
 }
\ No newline at end of file
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 4b19075..7f86fa9 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1664,6 +1664,27 @@
 			printf("void %s(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W%d, 0x%x, %d); }\n", p.name, p.w, p.code, p.mode);
 		}
 	}
+	// AVX-512
+	// vcmp{pd,ps,sd,ss}(k, x, op, imm): print the compare mnemonics whose first operand is an Opmask
+	{
+		const struct Tbl {
+			uint8 code; // opcode byte, printed as 0x%02X
+			const char *name; // mnemonic without the leading 'v' (the format string adds it)
+			int type; // T_* flags, turned into text by type2String()
+			bool hasIMM; // when true, the uint8 imm parameter is appended and forwarded
+		} tbl[] = {
+			{ 0xC2, "cmppd", T_0F | T_EVEX | T_MUST_EVEX | T_EW1 | T_SAE | T_YMM | T_66, true },
+			{ 0xC2, "cmpps", T_0F | T_EVEX | T_MUST_EVEX | T_EW0 | T_SAE | T_YMM, true },
+			{ 0xC2, "cmpsd", T_0F | T_EVEX | T_MUST_EVEX | T_EW1 | T_SAE | T_F2, true },
+			{ 0xC2, "cmpss", T_0F | T_EVEX | T_MUST_EVEX | T_EW0 | T_SAE | T_F3, true },
+		};
+		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+			const Tbl *p = &tbl[i];
+			std::string type = type2String(p->type);
+			printf("void v%s(const Opmask& k, const Xmm& x, const Operand& op%s) { opAVX_K_X_XM(k, x, op, %s, 0x%02X%s); }\n"
+				, p->name, p->hasIMM ? ", uint8 imm" : "", type.c_str(), p->code, p->hasIMM ? ", imm" : "");
+		}
+	}
 }
 
 int main()
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index fdfd2c9..acc4b15 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -82,11 +82,14 @@
 const uint64 _ZMM2 = 1ULL << 45;
 #ifdef XBYAK64
 const uint64 ZMM = _ZMM | _ZMM2;
-const uint64 _YMM3 = 1ULL << 46; // max value
+const uint64 _YMM3 = 1ULL << 46;
 #else
 const uint64 ZMM = _ZMM;
 const uint64 _YMM3 = 0;
 #endif
+const uint64 K2 = 1ULL << 47; // opmask register masked by another opmask, e.g. k3{k5}
+const uint64 ZMM_SAE = 1ULL << 48; // zmm register followed by {sae}
+const uint64 ZMM_ER = 1ULL << 49; // zmm register followed by {rd-sae}; max value
 
 const uint64 NOPARA = 1ULL << (bitEnd - 1);
 
@@ -350,6 +353,19 @@
 				};
 				return kTbl[idx % 7];
 			}
+		case K2:
+			return isXbyak_ ? "k3 | k5" : "k3{k5}";
+#ifdef XBYAK64
+		case ZMM_SAE:
+			return isXbyak_ ? "zmm25 | T_sae" : "zmm25, {sae}";
+		case ZMM_ER:
+			return isXbyak_ ? "zmm20 | T_rd_sae" : "zmm20, {rd-sae}";
+#else
+		case ZMM_SAE:
+			return isXbyak_ ? "zmm5 | T_sae" : "zmm5, {sae}";
+		case ZMM_ER:
+			return isXbyak_ ? "zmm2 | T_rd_sae" : "zmm2, {rd-sae}";
+#endif
 		}
 		return 0;
 	}
@@ -2480,10 +2496,31 @@
 			}
 		}
 	}
+	// Emit vcmp{pd,ps,sd,ss} operand patterns whose destination is an opmask register.
+	void putCmpK()
+	{
+		// packed compares: xmm, ymm and zmm sources
+		const char *const packedNames[] = { "vcmppd", "vcmpps" };
+		for (size_t n = 0; n < NUM_OF_ARRAY(packedNames); n++) {
+			const char *const mnemonic = packedNames[n];
+			put(mnemonic, K, XMM, XMM | MEM, IMM);
+			put(mnemonic, K, YMM, YMM | MEM, IMM);
+			put(mnemonic, K, ZMM, ZMM | MEM, IMM);
+		}
+		// scalar compares: xmm sources only
+		const char *const scalarNames[] = { "vcmpsd", "vcmpss" };
+		for (size_t n = 0; n < NUM_OF_ARRAY(scalarNames); n++) {
+			const char *const mnemonic = scalarNames[n];
+			put(mnemonic, K, XMM, XMM | MEM, IMM);
+		}
+		// masked opmask destination (K2) together with the ZMM_SAE source pattern
+		put("vcmppd", K2, ZMM, ZMM_SAE, IMM);
+	}
 	void putAVX512()
 	{
 		putOpmask();
 		putCombi();
+		putCmpK(); // vcmp{pd,ps,sd,ss} with an opmask destination
 	}
 #endif
 };
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 324aa62..513520c 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -339,7 +339,7 @@
 	static const uint8 EXT8BIT = 0x80;
 	unsigned int idx_:8; // 0..31, EXT8BIT = 1 if spl/bpl/sil/dil
 	unsigned int kind_:8;
-	unsigned int bit_:9;
+	unsigned int bit_:10;
 protected:
 	unsigned int zero_:1;
 	unsigned int mask_:3;
@@ -372,7 +372,7 @@
 	Operand(int idx, Kind kind, int bit, bool ext8bit = 0)
 		: idx_(static_cast<uint8>(idx | (ext8bit ? EXT8BIT : 0)))
 		, kind_(static_cast<uint8>(kind))
-		, bit_(static_cast<uint16>(bit))
+		, bit_(bit)
 		, zero_(0), mask_(0), rounding_(0)
 	{
 		assert((bit_ & (bit_ - 1)) == 0); // bit must be power of two
@@ -516,10 +516,6 @@
 	explicit Mmx(int idx = 0, Kind kind = Operand::MMX, int bit = 64) : Reg(idx, kind, bit) { }
 };
 
-struct Opmask : public Reg {
-	explicit Opmask(int idx = 0) : Reg(idx, Operand::OPMASK, 64) {}
-};
-
 struct EvexModifierRounding {
 	explicit EvexModifierRounding(int rounding) : rounding(rounding) {}
 	int rounding;
@@ -559,10 +555,14 @@
 	Zmm operator|(const EvexModifierRounding& emr) const { Zmm r(*this); r.setRounding(emr.rounding); return r; }
 };
 
+struct Opmask : public Reg { // opmask register operand
+	explicit Opmask(int idx = 0) : Reg(idx, Operand::OPMASK, 64) {} // kind OPMASK, bit width 64
+};
+
 template<class T>
 T operator|(const T& x, const Opmask& k)
 {
-	if (!x.is(Operand::XMM | Operand::YMM | Operand::ZMM)) throw Error(ERR_BAD_COMBINATION);
+	if (!x.is(Operand::XMM | Operand::YMM | Operand::ZMM | Operand::OPMASK)) throw Error(ERR_BAD_COMBINATION);
 	T r(x);
 	r.setOpmaskIdx(k.getIdx());
 	return r;
@@ -1369,7 +1369,8 @@
 		T_YMM = 1 << 15,
 		T_EVEX = 1 << 16,
 		T_ER = 1 << 17,
-		T_SAE = 1 << 18
+		T_SAE = 1 << 18,
+		T_MUST_EVEX = 1 << 19
 	};
 	void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false)
 	{
@@ -1388,6 +1389,7 @@
 		}
 		db(code);
 	}
+	int Max(int a, int b) const { return a > b ? a : b; } // larger of two ints; evex() uses it to find the widest operand bit size
 	void evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false)
 	{
 		if (!(type & T_EVEX)) throw Error(ERR_EVEX_IS_INVALID);
@@ -1404,17 +1406,16 @@
 		bool B = !base.isExtIdx();
 		bool Rp = !reg.isExtIdx2();
 		bool b = false;
-		int LL = 2;
-		if (reg.isZMM()) {
-			int rounding = base.getRounding();
-			if (rounding) {
-				if (rounding == inner::T_SAE && !(type & T_SAE)) throw Error(ERR_SAE_IS_INVALID);
-				if (rounding != inner::T_SAE && !(type & T_ER)) throw Error(ERR_ER_IS_INVALID);
-				LL = rounding - 1;
-				b = true;
-			}
+		int LL;
+		int rounding = base.getRounding();
+		if (rounding) {
+			if (!base.isZMM() || (rounding == inner::T_SAE && !(type & T_SAE))) throw Error(ERR_SAE_IS_INVALID);
+			if (!base.isZMM() || (rounding != inner::T_SAE && !(type & T_ER))) throw Error(ERR_ER_IS_INVALID);
+			LL = rounding - 1;
+			b = true;
 		} else {
-			LL = reg.isYMM() ? 1 : 0;
+			int bit = Max(Max(reg.getBit(), base.getBit()), (v ? v->getBit() : 0));
+			LL = (bit == 512) ? 2 : (bit == 256) ? 1 : 0;
 		}
 		bool Vp = !(v ? v->isExtIdx2() : 0);
 		bool z = reg.hasZero();
@@ -1734,7 +1735,7 @@
 			if (BIT == 64 && addr.is32bit()) db(0x67);
 			bool disp32 = false;
 			bool x = addr.getRegExp().getIndex().isExtIdx();
-			if (r.hasEvex() || (p1 && p1->hasEvex()) /*|| base.hasEvex()*/) {
+			if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex())) {
 				evex(r, base, p1, type, code, x);
 				disp32 = true;
 			} else {
@@ -1743,7 +1744,7 @@
 			opAddr(addr, r.getIdx(), (imm8 != NONE) ? 1 : 0, disp32);
 		} else {
 			const Reg& base = static_cast<const Reg&>(op2);
-			if (r.hasEvex() || (p1 && p1->hasEvex()) || base.hasEvex()) {
+			if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || base.hasEvex()) {
 				evex(r, base, p1, type, code);
 			} else {
 				vex(r, base, p1, type, code);
@@ -1780,6 +1781,11 @@
 		if (!((x1.isXMM() && x2->isXMM()) || ((type & T_YMM) && ((x1.isYMM() && x2->isYMM()) || (x1.isZMM() && x2->isZMM()))))) throw Error(ERR_BAD_COMBINATION);
 		opVex(x1, x2, *op, type, code0, imm8);
 	}
+	void opAVX_K_X_XM(const Opmask& k1, const Xmm& x2, const Operand& op3, int type, int code0, int imm8 = NONE) // encode: op k1, x2, op3[, imm8]
+	{
+		if (!op3.isMEM() && (x2.getKind() != op3.getKind())) throw Error(ERR_BAD_COMBINATION); // a non-memory op3 must have the same kind as x2
+		opVex(k1, &x2, op3, type, code0, imm8); // k1 goes in opVex's register slot, x2 is passed as the pointer operand
+	}
 	// if cvt then return pointer to Xmm(idx) (or Ymm(idx)), otherwise return op
 	void opAVX_X_X_XMcvt(const Xmm& x1, const Operand& op1, const Operand& op2, bool cvt, Operand::Kind kind, int type, int code0, int imm8 = NONE)
 	{
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 4e9ebba..123071a 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1519,3 +1519,7 @@
 void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x91, 2); }
 void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x90, 0); }
 void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x91, 1); }
+void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_SAE | T_MUST_EVEX, 0xC2, imm); }
+void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_SAE | T_MUST_EVEX, 0xC2, imm); }
+void vcmpsd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_F2 | T_EW1 | T_EVEX | T_SAE | T_MUST_EVEX, 0xC2, imm); }
+void vcmpss(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_F3 | T_EW0 | T_EVEX | T_SAE | T_MUST_EVEX, 0xC2, imm); }