Reduce the code size of mov(reg64, imm) when imm fits in 32 bits (zero-extended or sign-extended)
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index 97a60a3..92a5823 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -893,23 +893,30 @@
 			const char *a;
 			const char *b;
 		} tbl[] = {
-			{ "0", "dword 0" },
-			{ "0x123", "dword 0x123" },
-			{ "0x12345678", "dword 0x12345678" },
-			{ "0x7fffffff", "dword 0x7fffffff" },
+			{ "0", "0" },
+			{ "0x123", "0x123" },
+			{ "0x12345678", "0x12345678" },
+			{ "0x7fffffff", "0x7fffffff" },
 			{ "0xffffffff", "0xffffffff" },
 			{ "0x80000000", "0x80000000" },
 			{ "2147483648U", "2147483648" },
 			{ "0x80000001", "0x80000001" },
-			{ "0xffffffffffffffff", "dword 0xffffffffffffffff" },
-			{ "-1", "dword -1" },
-			{ "0xffffffff80000000", "dword 0xffffffff80000000" },
-			{ "0xffffffff80000001", "dword 0xffffffff80000001" },
+			{ "0xffffffffffffffff", "0xffffffffffffffff" },
+			{ "-1", "-1" },
+			{ "0xffffffff80000000", "0xffffffff80000000" },
+			{ "0xffffffff80000001", "0xffffffff80000001" },
 			{ "0xffffffff12345678", "0xffffffff12345678" },
 		};
 		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 			put("mov", REG64, tbl[i].a, tbl[i].b);
 		}
+	}
+	// only nasm
+	void putMovImm64() const
+	{
+		put("mov", REG64, "0x1234567890abcdefLL", "0x1234567890abcdef");
+		put("mov", REG64, "0x12345678", "0x12345678");
+		put("mov", REG64, "0xffffffff12345678LL", "0xffffffff12345678");
 		put("mov", REG32e|REG16|REG8|RAX|EAX|AX|AL, IMM);
 	}
 	void putEtc() const
@@ -2074,7 +2081,9 @@
 	void put()
 	{
 #ifdef USE_AVX
+
 		putFMA2();
+
 #ifdef USE_YASM
 		putGprR_R_RM();
 		putGprR_RM_R();
@@ -2095,9 +2104,18 @@
 		putAVX_Y_XM();
 		putFMA();
 #endif
+
 #else // USE_AVX
+
 		putJmp();
-#ifndef USE_YASM
+
+#ifdef USE_YASM
+
+		putSSSE3();
+		putSSE4_1();
+		separateFunc();
+		putSSE4_2();
+#else
 		putSIMPLE();
 		putReg1();
 		putRorM();
@@ -2133,16 +2151,19 @@
 		putFpu();
 		putFpuFpu();
 		putCmp();
-#else // USE_YASM
-		putSSSE3();
-		putSSE4_1();
-		separateFunc();
-		putSSE4_2();
-		putMov64();
+#endif
+
 #ifdef XBYAK64
+
+#ifdef USE_YASM
 		putRip();
+#else
+		putMov64();
+		putMovImm64();
+#endif
+
 #endif // XBYAK64
-#endif // USE_YASM
+
 #endif // USE_AVX
 	}
 };
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index fdfea0a..2f1bca2 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -1541,22 +1541,27 @@
 	{
 		verifyMemHasSize(op);
 		if (op.isREG()) {
-			rex(op);
-			int code, size;
-#ifdef XBYAK64
-			if (opti && op.isBit(64) && inner::IsInInt32(imm)) {
-				db(B11000111);
-				code = B11000000;
-				size = 4;
-			} else
-#endif
-			{
-				code = B10110000 | ((op.isBit(8) ? 0 : 1) << 3);
-				size = op.getBit() / 8;
-			}
+			int bit = op.getBit();
+			int idx = op.getIdx();
+			int code = B10110000 | ((bit == 8 ? 0 : 1) << 3);
 
-			db(code | (op.getIdx() & 7));
-			db(imm, size);
+#ifdef XBYAK64
+			if (opti && bit == 64 && (imm >> 32) == 0) {
+				rex(Reg32(idx));
+				bit = 32;
+			} else {
+				rex(op);
+				if (opti && bit == 64 && inner::IsInInt32(imm)) {
+					db(B11000111);
+					code = B11000000;
+					bit = 32;
+				}
+			}
+#else
+			rex(op);
+#endif
+			db(code | (idx & 7));
+			db(imm, bit / 8);
 		} else if (op.isMEM()) {
 			opModM(static_cast<const Address&>(op), Reg(0, Operand::REG, op.getBit()), B11000110);
 			int size = op.getBit() / 8; if (size > 4) size = 4;