Add EVEX (AVX-512) encoding support for vpacksswb, vpackssdw, vpackuswb, vpackusdw
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 3825e68..08f36a5 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1045,10 +1045,10 @@
 			{ 0x5A, "cvtsd2ss", T_0F | T_F2, false, true },
 			{ 0x5A, "cvtss2sd", T_0F | T_F3, false, true },
 			{ 0x21, "insertps", T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0, true, true },
-			{ 0x63, "packsswb", T_0F | T_66 | T_YMM, false, true },
-			{ 0x6B, "packssdw", T_0F | T_66 | T_YMM, false, true },
-			{ 0x67, "packuswb", T_0F | T_66 | T_YMM, false, true },
-			{ 0x2B, "packusdw", T_0F38 | T_66 | T_YMM, false, true },
+			{ 0x63, "packsswb", T_0F | T_66 | T_YMM | T_EVEX, false, true },
+			{ 0x6B, "packssdw", T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32, false, true },
+			{ 0x67, "packuswb", T_0F | T_66 | T_YMM | T_EVEX, false, true },
+			{ 0x2B, "packusdw", T_0F38 | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32, false, true },
 
 			{ 0xFC, "paddb", T_0F | T_66 | T_YMM, false, true },
 			{ 0xFD, "paddw", T_0F | T_66 | T_YMM, false, true },
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index 79162d0..9fa919c 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -2726,6 +2726,37 @@
 			put(p.name, XMM_KZ, _XMM, _XMM|p.mem);
 		}
 	}
+	void put512_X3() // Calls put() once per table row for the four vpack* mnemonics, in XMM, YMM and ZMM forms.
+	{
+#ifdef XBYAK64 // The whole body is compiled only when XBYAK64 is defined; otherwise this function does nothing.
+		const struct Tbl {
+			const char *name; // mnemonic string, forwarded as put()'s first argument
+			uint64_t x1; // pattern for the first operand (every row uses a *_KZ value; presumably opmask/zeroing-capable register - confirm against the *_KZ definitions)
+			uint64_t x2; // pattern for the second operand (plain _XMM/_YMM/_ZMM in every row)
+			uint64_t xm; // pattern for the third operand: same-width register OR'ed with a memory pattern
+		} tbl[] = {
+			{ "vpacksswb", XMM_KZ, _XMM, _XMM | _MEM }, // byte/word packs use _MEM and no M_1to* pattern
+			{ "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM },
+			{ "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+
+			{ "vpackssdw", XMM_KZ, _XMM, _XMM | M_1to4 }, // dword packs use M_1to4/8/16 for XMM/YMM/ZMM (presumably 32-bit broadcast memory - confirm); plain _MEM is not listed
+			{ "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 },
+			{ "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 },
+
+			{ "vpackusdw", XMM_KZ, _XMM, _XMM | M_1to4 }, // same operand patterns as the vpackssdw rows
+			{ "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 },
+			{ "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 },
+
+			{ "vpackuswb", XMM_KZ, _XMM, _XMM | _MEM }, // same operand patterns as the vpacksswb rows
+			{ "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM },
+			{ "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
+		};
+		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { // visit the rows in declaration order
+			const Tbl& p = tbl[i];
+			put(p.name, p.x1, p.x2, p.xm); // one put() call per row, fields passed through unchanged
+		}
+#endif
+	}
 	void put512_X3_I()
 	{
 		const struct Tbl {
@@ -2764,6 +2795,7 @@
 		put_vmov();
 		put512_X_XM();
 		put512_X_X_XM();
+		put512_X3();
 		put512_X3_I();
 	}
 #endif
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 1c8e5a5..55a221f 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -777,13 +777,13 @@
 void vcvtss2sd(const Xmm& x, const Operand& op) { vcvtss2sd(x, x, op); }
 void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_W0 | T_EW0 | T_EVEX, 0x21, imm); }
 void vinsertps(const Xmm& x, const Operand& op, uint8 imm) { vinsertps(x, x, op, imm); }
-void vpacksswb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_66 | T_YMM, 0x63); }
+void vpacksswb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_66 | T_YMM | T_EVEX, 0x63); }
 void vpacksswb(const Xmm& x, const Operand& op) { vpacksswb(x, x, op); }
-void vpackssdw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_66 | T_YMM, 0x6B); }
+void vpackssdw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_66 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x6B); }
 void vpackssdw(const Xmm& x, const Operand& op) { vpackssdw(x, x, op); }
-void vpackuswb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_66 | T_YMM, 0x67); }
+void vpackuswb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_66 | T_YMM | T_EVEX, 0x67); }
 void vpackuswb(const Xmm& x, const Operand& op) { vpackuswb(x, x, op); }
-void vpackusdw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_66 | T_YMM, 0x2B); }
+void vpackusdw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_66 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x2B); }
 void vpackusdw(const Xmm& x, const Operand& op) { vpackusdw(x, x, op); }
 void vpaddb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_66 | T_YMM, 0xFC); }
 void vpaddb(const Xmm& x, const Operand& op) { vpaddb(x, x, op); }