fix vsqrtsd, vsqrtss (add T_EVEX/T_EW/T_ER_X flags); add T_B32/T_B64 flags to vsqrtpd, vsqrtps; add vsqrtsd/vsqrtss test cases
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index 6eebb57..1db9164 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1162,8 +1162,8 @@
 			{ 0xC6, "shufpd", T_0F | T_66 | T_YMM, true, true },
 			{ 0xC6, "shufps", T_0F | T_YMM, true, true },
 
-			{ 0x51, "sqrtsd", T_0F | T_F2, false, true },
-			{ 0x51, "sqrtss", T_0F | T_F3, false, true },
+			{ 0x51, "sqrtsd", T_0F | T_F2 | T_EVEX | T_EW1 | T_ER_X, false, true },
+			{ 0x51, "sqrtss", T_0F | T_F3 | T_EVEX | T_EW0 | T_ER_X, false, true },
 
 			{ 0x15, "unpckhpd", T_0F | T_66 | T_YMM, false, true },
 			{ 0x15, "unpckhps", T_0F | T_YMM, false, true },
@@ -1241,8 +1241,8 @@
 			{ 0x53, "rcpps", T_0F | T_YMM, false },
 			{ 0x52, "rsqrtps", T_0F | T_YMM, false },
 
-			{ 0x51, "sqrtpd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_ER_Z, false },
-			{ 0x51, "sqrtps", T_0F | T_YMM | T_EVEX | T_EW0 | T_ER_Z, false },
+			{ 0x51, "sqrtpd", T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_ER_Z | T_B32 | T_B64, false },
+			{ 0x51, "sqrtps", T_0F | T_YMM | T_EVEX | T_EW0 | T_ER_Z | T_B32, false },
 
 			{ 0x2E, "ucomisd", T_0F | T_66 | T_EVEX | T_EW1 | T_SAE_X, false },
 			{ 0x2E, "ucomiss", T_0F | T_EVEX | T_EW0 | T_SAE_X, false },
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index e39a0c5..5eeb52f 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -2686,6 +2686,19 @@
 			put(p.name, MEM, ZMM);
 		}
 	}
+	void put512_X_X_XM() // test-case generator: calls put() once for every mnemonic listed in tbl
+	{
+		const struct Tbl {
+			const char *name; // mnemonic string forwarded as the first argument of put()
+		} tbl[] = {
+			{ "vsqrtsd" },
+			{ "vsqrtss" },
+		};
+		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { // visit every row of tbl in order
+			const Tbl& p = tbl[i];
+			put(p.name, XMM_KZ, XMM, XMM|MEM); // last operand may be XMM or MEM. NOTE(review): XMM_KZ presumably denotes an xmm destination with {k}{z} masking - confirm against its definition
+		}
+	}
 	void putAVX512()
 	{
 		putOpmask();
@@ -2695,6 +2708,7 @@
 		putAVX512_M_X();
 		put_vmov();
 		put512_X_MX();
+		put512_X_X_XM();
 	}
 #endif
 };
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index c77bab9..1969722 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -950,9 +950,9 @@
 void vshufpd(const Xmm& x, const Operand& op, uint8 imm) { vshufpd(x, x, op, imm); }
 void vshufps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_0F | T_YMM, 0xC6, imm); }
 void vshufps(const Xmm& x, const Operand& op, uint8 imm) { vshufps(x, x, op, imm); }
-void vsqrtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_F2, 0x51); }
+void vsqrtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X, 0x51); }
 void vsqrtsd(const Xmm& x, const Operand& op) { vsqrtsd(x, x, op); }
-void vsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_F3, 0x51); }
+void vsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X, 0x51); }
 void vsqrtss(const Xmm& x, const Operand& op) { vsqrtss(x, x, op); }
 void vunpckhpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F | T_66 | T_YMM, 0x15); }
 void vunpckhpd(const Xmm& x, const Operand& op) { vunpckhpd(x, x, op); }
@@ -1009,8 +1009,8 @@
 void vptest(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F38 | T_66, 0x17); }
 void vrcpps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_YMM, 0x53); }
 void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_YMM, 0x52); }
-void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z, 0x51); }
-void vsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z, 0x51); }
+void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B32 | T_B64, 0x51); }
+void vsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x51); }
 void vucomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_66 | T_EW1 | T_EVEX | T_SAE_X, 0x2E); }
 void vucomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2E); }
 void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX, 0x29); }