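Support sae rounding modifiers ({rn-sae}, {rd-sae}, {ru-sae}, {rz-sae}) for ZMM operands; test them with vaddpd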
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index 0c0a9b3..71c257a 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -2401,19 +2401,21 @@
 		put("kmovq", REG64, K);
 #endif
 	}
-	void put_vaddpd(const char *r1, const char *r2, const char *r3, int kIdx = 0, bool z = false)
+	void put_vaddpd(const char *r1, const char *r2, const char *r3, int kIdx = 0, bool z = false, int sae = 0) // sae: index into the two sae tables below (0 = empty suffix)
 	{
 		std::string modifier;
 		char pk[16] = "";
 		const char *pz = "";
+		const char *saeTblXbyak[] = { "", "|T_rn_sae", "|T_rd_sae", "|T_ru_sae", "|T_rz_sae" }; // Xbyak-form suffix printed right after r3
+		const char *saeTblNASM[] = { "", ",{rn-sae}", ",{rd-sae}", ",{ru-sae}", ",{rz-sae}" }; // NASM-form suffix printed right after r3
 		if (isXbyak_) {
 			if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "|k%d", kIdx);
 			if (z) pz = "|T_z";
-			printf("vaddpd(%s%s%s, %s, %s); dump();\n", r1, pk, pz, r2, r3);
+			printf("vaddpd(%s%s%s, %s, %s%s); dump();\n", r1, pk, pz, r2, r3, saeTblXbyak[sae]); // NOTE(review): sae indexes the 5-entry table unchecked; the loops in putCombi use 0..4
 		} else {
 			if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "{k%d}", kIdx);
 			if (z) pz = "{z}";
-			printf("vaddpd %s%s%s, %s, %s\n", r1, pk, pz, r2, r3);
+			printf("vaddpd %s%s%s, %s, %s%s\n", r1, pk, pz, r2, r3, saeTblNASM[sae]); // same index as the Xbyak branch, so both outputs describe the same modifier
 		}
 	}
 	void putCombi()
@@ -2445,19 +2447,24 @@
 		const size_t N = NUM_OF_ARRAY(zTbl);
 		for (size_t i = 0; i < N; i++) {
 			for (size_t j = 0; j < N; j++) {
+				separateFunc();
 				for (size_t k = 0; k < N; k++) {
 #ifdef XBYAK64
 					for (int kIdx = 0; kIdx < 8; kIdx++) {
 						for (int z = 0; z < 2; z++) {
 							put_vaddpd(xTbl[i], xTbl[j], xTbl[k], kIdx, z == 1);
 							put_vaddpd(yTbl[i], yTbl[j], yTbl[k], kIdx, z == 1);
-							put_vaddpd(zTbl[i], zTbl[j], zTbl[k], kIdx, z == 1);
+							for (int sae = 0; sae < 5; sae++) {
+								put_vaddpd(zTbl[i], zTbl[j], zTbl[k], kIdx, z == 1, sae);
+							}
 						}
 					}
 #else
 					put_vaddpd(xTbl[i], xTbl[j], xTbl[k]);
 					put_vaddpd(yTbl[i], yTbl[j], yTbl[k]);
-					put_vaddpd(zTbl[i], zTbl[j], zTbl[k]);
+					for (int sae = 0; sae < 5; sae++) { // 0 = no modifier, 1..4 = the four sae table entries of put_vaddpd
+						put_vaddpd(zTbl[i], zTbl[j], zTbl[k], 0, false, sae); // kIdx and z are spelled out so sae reaches the 6th parameter instead of being taken as kIdx
+					}
 #endif
 				}
 			}
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 967488c..f83ab4f 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -516,11 +516,23 @@
 	int rounding;
 };
 
-static const EvexModifierRounding T_sae(1);
-static const EvexModifierRounding T_rn_sae(2);
-static const EvexModifierRounding T_rd_sae(3);
-static const EvexModifierRounding T_ru_sae(4);
-static const EvexModifierRounding T_rz_sae(5);
+namespace inner {
+
+enum SAEtype { // values handed to the EvexModifierRounding constants declared right after this namespace
+	T_SAE, // implicitly 0. NOTE(review): T_sae was built with 1 before this change, and the encoder's `if (rounding)` check skips 0 - confirm this is intended
+	T_RN_SAE = 1,
+	T_RD_SAE = 2,
+	T_RU_SAE = 3,
+	T_RZ_SAE = 4,
+};
+
+} // inner
+
+static const EvexModifierRounding T_sae(inner::T_SAE); // {sae}
+static const EvexModifierRounding T_rn_sae(inner::T_RN_SAE); // {rn-sae}
+static const EvexModifierRounding T_rd_sae(inner::T_RD_SAE); // {rd-sae}
+static const EvexModifierRounding T_ru_sae(inner::T_RU_SAE); // {ru-sae}
+static const EvexModifierRounding T_rz_sae(inner::T_RZ_SAE); // {rz-sae}
 static const struct EvexModifierZero{} T_z; // {z}
 
 struct Xmm : public Mmx {
@@ -1361,8 +1373,17 @@
 		bool X = x ? false : !base.isExtIdx2();
 		bool B = !base.isExtIdx();
 		bool Rp = !reg.isExtIdx2();
-		int LL = reg.isZMM() ? 2 : reg.isYMM() ? 1 : 0;
 		bool b = false;
+		int LL = 2; // value kept for a ZMM reg when no rounding is requested
+		if (reg.isZMM()) {
+			int rounding = base.getRounding(); // 0 is treated as "no rounding" by the check below
+			if (rounding) {
+				LL = rounding - 1; // SAEtype values 1..4 become LL 0..3
+				b = true; // b is set only on this path; it stays false otherwise
+			}
+		} else {
+			LL = reg.isYMM() ? 1 : 0; // rounding is not consulted for non-ZMM regs
+		}
 		bool Vp = !(v ? v->isExtIdx2() : 0);
 		bool z = reg.hasZero();
 		int aaa = reg.getOpmaskIdx();