add vpcmp*, v(p)blendm*
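
The vpcmp{b,ub,w,uw,d,ud,q,uq} forms compare into an opmask register and
take a predicate immediate (0=EQ, 1=LT, 2=LE, 4=NEQ, 5=NLT, 6=NLE);
vblendmp{s,d} and vpblendm{b,w,d,q} select elements from two sources by an
opmask. A minimal usage sketch (register choices are illustrative, not
taken from this patch):

	vpcmpd(k1, zmm0, ptr[rax], 1);    // k1[i] = (zmm0[i] < mem[i]), signed dwords
	vpcmpub(k5 | k3, xmm1, xmm2, 2);  // k5{k3}[i] = (xmm1[i] <= xmm2[i]), unsigned bytes
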
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 07ee353..d4dadc0 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -115,6 +115,16 @@
 		{ 0x65, "vpcmpgtw", T_66 | T_0F | T_MUST_EVEX | T_YMM, false },
 		{ 0x66, "vpcmpgtd", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, false },
 		{ 0x37, "vpcmpgtq", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, false },
+
+		{ 0x3F, "vpcmpb", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0, true },
+		{ 0x3E, "vpcmpub", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0, true },
+
+		{ 0x3F, "vpcmpw", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1, true },
+		{ 0x3E, "vpcmpuw", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1, true },
+		{ 0x1F, "vpcmpd", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, true },
+		{ 0x1E, "vpcmpud", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, true },
+		{ 0x1F, "vpcmpq", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, true },
+		{ 0x1E, "vpcmpuq", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, true },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		const Tbl *p = &tbl[i];
@@ -211,6 +221,13 @@
 
 		{ 0x8D, "vpermb", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, false },
 		{ 0x8D, "vpermw", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, false },
+
+		{ 0x65, "vblendmpd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
+		{ 0x65, "vblendmps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false },
+		{ 0x66, "vpblendmb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, false },
+		{ 0x66, "vpblendmw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1, false },
+		{ 0x64, "vpblendmd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false },
+		{ 0x64, "vpblendmq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		const Tbl *p = &tbl[i];
diff --git a/test/make_512.cpp b/test/make_512.cpp
index c6999b8..3eab5cf 100644
--- a/test/make_512.cpp
+++ b/test/make_512.cpp
@@ -28,7 +28,7 @@
 const uint64 MEM16 = 1ULL << 16;
 const uint64 MEM32 = 1ULL << 17;
 const uint64 VM32Z = 1ULL << 19;
-const uint64 CL = 1ULL << 20;
+const uint64 K_K = 1ULL << 20;
 const uint64 MEM_ONLY_DISP = 1ULL << 21;
 const uint64 NEG32 = 1ULL << 23;
 const uint64 _YMM = 1ULL << 24;
@@ -335,8 +335,8 @@
 			return "ax";
 		case AL:
 			return "al";
-		case CL:
-			return "cl";
+		case K_K:
+			return isXbyak_ ? "k5 | k3" : "k5{k3}";
 		case IMM32:
 			return isXbyak_ ? "12345678" : "dword 12345678";
 		case IMM8:
@@ -1589,10 +1589,62 @@
 		}
 #endif
 	}
+	void putBlend()
+	{
+		put("vblendmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
+		put("vblendmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
+		put("vblendmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
+
+		put("vblendmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
+		put("vblendmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
+		put("vblendmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
+
+		put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM);
+		put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM);
+		put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM);
+
+		put("vpblendmw", XMM_KZ, _XMM, _XMM | _MEM);
+		put("vpblendmw", YMM_KZ, _YMM, _YMM | _MEM);
+		put("vpblendmw", ZMM_KZ, _ZMM, _ZMM | _MEM);
+
+		put("vpblendmd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
+		put("vpblendmd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
+		put("vpblendmd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
+
+		put("vpblendmq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
+		put("vpblendmq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
+		put("vpblendmq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
+	}
+	void putVpcmp()
+	{
+		const uint64_t b0Tbl[] = { 0, 0, 0 };                 // byte/word forms: no embedded broadcast
+		const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 }; // dword forms: {1toN} per vector length
+		const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };  // qword forms: {1toN} per vector length
+		const struct Tbl {
+			const char *name;
+			uint64_t b; // xmm-form broadcast flag; selects one of the tables above
+		} tbl[] = {
+		} tbl[] = {
+			{ "vpcmpb", 0 },
+			{ "vpcmpub", 0 },
+			{ "vpcmpw", 0 },
+			{ "vpcmpuw", 0 },
+			{ "vpcmpd", M_1to4 },
+			{ "vpcmpud", M_1to4 },
+			{ "vpcmpq", M_1to2 },
+			{ "vpcmpuq", M_1to2 },
+		};
+		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+			const Tbl& p = tbl[i];
+			const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
+			put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0], IMM8);
+			put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1], IMM8);
+			put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2], IMM8);
+		}
+	}
 	void putMin()
 	{
 #ifdef XBYAK64
-		putGather();
+		putVpcmp();
 #endif
 	}
 	void putAVX512()
@@ -1631,6 +1683,10 @@
 		putMisc1();
 		separateFunc();
 		putGather();
+		separateFunc();
+		putBlend();
+		separateFunc();
+		putVpcmp();
 #endif
 	}
 };
diff --git a/xbyak/xbyak_avx512.h b/xbyak/xbyak_avx512.h
index afb8fce..5590638 100644
--- a/xbyak/xbyak_avx512.h
+++ b/xbyak/xbyak_avx512.h
@@ -76,6 +76,14 @@
 void vpcmpgtw(const Opmask& k, const Xmm& x, const Operand& op) { opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_YMM | T_MUST_EVEX, 0x65); }
 void vpcmpgtd(const Opmask& k, const Xmm& x, const Operand& op) { opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x66); }
 void vpcmpgtq(const Opmask& k, const Xmm& x, const Operand& op) { opAVX_K_X_XM(k, x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x37); }
+void vpcmpb(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3F, imm); }
+void vpcmpub(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3E, imm); }
+void vpcmpw(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3F, imm); }
+void vpcmpuw(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3E, imm); }
+void vpcmpd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x1F, imm); }
+void vpcmpud(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x1E, imm); }
+void vpcmpq(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x1F, imm); }
+void vpcmpuq(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x1E, imm); }
 void vmovdqa32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); }
 void vmovdqa64(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); }
 void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); }
@@ -120,6 +128,12 @@
 void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); }
 void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8D); }
 void vpermw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8D); }
+void vblendmpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x65); }
+void vblendmps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x65); }
+void vpblendmb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x66); }
+void vpblendmw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x66); }
+void vpblendmd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x64); }
+void vpblendmq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x64); }
 void vpsraq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x72, imm); }
 void vextractf32x4(const Operand& op, const Ymm& r, uint8 imm) { opAVX_X_X_XMcvt(r, true, cvtIdx0(r), op, op.isXMM(), Operand::YMM, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x19, imm); }
 void vextractf64x2(const Operand& op, const Ymm& r, uint8 imm) { opAVX_X_X_XMcvt(r, true, cvtIdx0(r), op, op.isXMM(), Operand::YMM, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_N16, 0x19, imm); }
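
The blendm forms pair naturally with the new compares; a minimal sketch
inside a Xbyak::CodeGenerator (class and register choices are illustrative
only, not part of this patch):

	struct Code : Xbyak::CodeGenerator {
		Code() {
			vpcmpd(k2, zmm0, ptr_b[rax], 4);        // k2[i] = (zmm0[i] != broadcast dword), {1to16}
			vpblendmd(zmm1 | k2, zmm2, zmm3);       // zmm1[i] = k2[i] ? zmm3[i] : zmm2[i]
			vblendmps(zmm1 | k2 | T_z, zmm2, zmm3); // with T_z, unselected elements are zeroed
			ret();
		}
	};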