Merge branch 'multi-nop'
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index d298b2d..4e5fd89 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -632,7 +632,6 @@
 
 			{ "lahf", 0x9F },
 			{ "lock", 0xF0 },
-			{ "nop", 0x90 },
 
 			{ "sahf", 0x9E },
 			{ "stc", 0xF9 },
diff --git a/readme.md b/readme.md
index 1122677..e954d61 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
 
-Xbyak 5.50 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
+Xbyak 5.51 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
 =============
 
 Abstract
@@ -333,6 +333,7 @@
 
 History
 -------------
+* 2017/Aug/17 ver 5.51 add multi-byte nop; align() now uses it(thanks to inolen)
 * 2017/Aug/08 ver 5.50 add mpx(thanks to magurosan)
 * 2017/Aug/08 ver 5.45 add sha(thanks to magurosan)
 * 2017/Aug/08 ver 5.44 add prefetchw(thanks to rsdubtso)
diff --git a/readme.txt b/readme.txt
index d24414e..0c72fe7 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
 

-    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.50

+    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.51

 

 -----------------------------------------------------------------------------

 ◎概要

@@ -343,6 +343,7 @@
 -----------------------------------------------------------------------------

 ◎履歴

 

+2017/08/17 ver 5.51 multi-byte nop追加 align()はそれを使用する(thanks to inolen)

 2017/08/08 ver 5.50 mpx追加(thanks to magurosan)

 2017/08/08 ver 5.45 sha追加(thanks to magurosan)

 2017/08/08 ver 5.44 prefetchw追加(thanks to rsdubtso)

diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 5ae708d..f056a5e 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -105,7 +105,7 @@
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x5500 /* 0xABCD = A.BC(D) */
+	VERSION = 0x5510 /* 0xABCD = A.BC(D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
@@ -366,7 +366,7 @@
 		YMM = 1 << 5,
 		ZMM = 1 << 6,
 		OPMASK = 1 << 7,
-		BNDREG = 1 << 8,
+		BNDREG = 1 << 8
 	};
 	enum Code {
 #ifdef XBYAK64
@@ -2390,14 +2390,52 @@
 #ifdef XBYAK_UNDEF_JNL
 	#undef jnl
 #endif
+
+	void nop(size_t size = 1)
+	{
+		/*
+			Emit exactly `size` bytes of no-op padding (nothing when size == 0).
+			Each pass emits the longest table entry that still fits (at most 9 bytes), so few instructions are used.
+			Table: recommended multi-byte NOP sequences, Intel Architectures Software Developer's Manual Volume 2.
+			AMD and Intel seem to agree on the same sequences for up to 9 bytes: https://support.amd.com/TechDocs/55723_SOG_Fam_17h_Processors_3.00.pdf
+		*/
+		static const uint8 nopTbl[9][9] = {
+			{0x90},
+			{0x66, 0x90},
+			{0x0F, 0x1F, 0x00},
+			{0x0F, 0x1F, 0x40, 0x00},
+			{0x0F, 0x1F, 0x44, 0x00, 0x00},
+			{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
+			{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
+			{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+			{0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+		};
+		const size_t n = sizeof(nopTbl) / sizeof(nopTbl[0]);
+		while (size > 0) {
+			size_t len = (std::min)(n, size);
+			const uint8 *seq = nopTbl[len - 1];
+			for (size_t i = 0; i < len; i++) {
+				db(seq[i]);
+			}
+			size -= len;
+		}
+	}
+
 #ifndef XBYAK_DONT_READ_LIST
 #include "xbyak_mnemonic.h"
-	void align(int x = 16)
+	/*
+		pad with nops up to the next multiple of x (x must be a power of 2, else ERR_BAD_ALIGN is thrown); use single byte nop if useMultiByteNop = false
+	*/
+	void align(size_t x = 16, bool useMultiByteNop = true)
 	{
 		if (x == 1) return;
 		if (x < 1 || (x & (x - 1))) throw Error(ERR_BAD_ALIGN);
-		if (isAutoGrow() && x > (int)inner::ALIGN_PAGE_SIZE) fprintf(stderr, "warning:autoGrow mode does not support %d align\n", x);
-		while (size_t(getCurr()) % size_t(x) > 0) {
+		if (isAutoGrow() && x > inner::ALIGN_PAGE_SIZE) fprintf(stderr, "warning:autoGrow mode does not support %d align\n", (int)x);
+		if (useMultiByteNop) {
+			nop((x - size_t(getCurr()) % x) % x); // bytes missing to reach the next boundary; 0 if already aligned
+			return;
+		}
+		while (size_t(getCurr()) % x > 0) {
 			nop();
 		}
 	}
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 95014a0..72f7c78 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "5.50"; }
+const char *getVersionString() const { return "5.51"; }
 void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
 void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
 void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@@ -497,7 +497,6 @@
 void mulx(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, T_F2 | T_0F38, 0xf6, true); }
 void mwait() { db(0x0F); db(0x01); db(0xC9); }
 void neg(const Operand& op) { opR_ModM(op, 0, 3, 0xF6); }
-void nop() { db(0x90); }
 void not_(const Operand& op) { opR_ModM(op, 0, 2, 0xF6); }
 void or_(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x08, 1); }
 void or_(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x08); }