support prefetchwt1
diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index b766ca0..510bf09 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -536,15 +536,17 @@
 		const struct Tbl {
 			int ext;
 			const char *name;
+			int code;
 		} tbl[] = {
-			{ 1, "t0" },
-			{ 2, "t1" },
-			{ 3, "t2" },
-			{ 0, "nta" },
+			{ 1, "t0", 0x18},
+			{ 2, "t1", 0x18},
+			{ 3, "t2", 0x18},
+			{ 0, "nta", 0x18},
+			{ 2, "wt1", 0x0D},
 		};
 		for (int i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 			const Tbl *p = &tbl[i];
-			printf("void prefetch%s(const Address& addr) { opModM(addr, Reg32(%d), 0x0F, 0x18); }\n", p->name, p->ext);
+			printf("void prefetch%s(const Address& addr) { opModM(addr, Reg32(%d), 0x0F, 0x%02X); }\n", p->name, p->ext, p->code);
 		}
 	}
 	{
diff --git a/readme.md b/readme.md
index f99de64..990188f 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
 
-Xbyak 5.40 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
+Xbyak 5.41 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
 =============
 
 Abstract
@@ -332,6 +332,7 @@
 
 History
 -------------
+* 2017/Jan/26 ver 5.41 add prefetchwt1 and support for scale == 0(thanks to rsdubtso)
 * 2016/Dec/14 ver 5.40 add Label::getAddress() method to get the pointer specified by the label
 * 2016/Dec/09 ver 5.34 fix handling of negative offsets when encoding disp8N(thanks to rsdubtso)
 * 2016/Dec/08 ver 5.33 fix encoding of vpbroadcast{b,w,d,q}, vpinsr{b,w}, vpextr{b,w} for disp8N
diff --git a/readme.txt b/readme.txt
index b5b87aa..1fa124e 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
 

-    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.40

+    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.41

 

 -----------------------------------------------------------------------------

 ◎概要

@@ -343,6 +343,7 @@
 -----------------------------------------------------------------------------

 ◎履歴

 

+2017/01/26 ver 5.41 prefetchwt1追加とscale == 0対応(thanks to rsdubtso)

 2016/12/14 ver 5.40 Labelが示すアドレスを取得するLabel::getAddress()追加

 2016/12/07 ver 5.34 disp8N時の負のオフセット処理の修正(thanks to rsdubtso)

 2016/12/06 ver 5.33 disp8N時のvpbroadcast{b,w,d,q}, vpinsr{b,w}, vpextr{b,w}のバグ修正

diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index 86cdad9..9aca9f7 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -53,6 +53,7 @@
 		{ Cpu::tSMAP, "smap" },
 		{ Cpu::tHLE, "hle" },
 		{ Cpu::tRTM, "rtm" },
+		{ Cpu::tPREFETCHWT1, "prefetchwt1" },
 		{ Cpu::tF16C, "f16c" },
 		{ Cpu::tMOVBE, "movbe" },
 		{ Cpu::tAVX512F, "avx512f" },
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index e002c74..829b3e5 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -611,6 +611,7 @@
 		put("prefetcht1", MEM);
 		put("prefetcht2", MEM);
 		put("prefetchnta", MEM);
+		put("prefetchwt1", MEM);
 
 		// SSE2 misc
 		put("maskmovdqu", XMM, XMM);
@@ -1114,6 +1115,7 @@
 			put(p, "qword [rax], 1000000");
 			put(p, "rdx, qword [rax]");
 #endif
+			put("mov", EAX, "ptr [eax + ecx * 0]", "[eax + ecx * 0]"); // ignore scale = 0
 		}
 		{
 			const char tbl[][8] = {
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index 62707c7..37e927a 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -105,7 +105,7 @@
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x5400 /* 0xABCD = A.BC(D) */
+	VERSION = 0x5410 /* 0xABCD = A.BC(D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index a7c021e..175932f 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "5.34"; }
+const char *getVersionString() const { return "5.41"; }
 void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
 void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
 void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@@ -588,6 +588,7 @@
 void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); }
 void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); }
 void prefetcht2(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0x18); }
+void prefetchwt1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x0D); }
 void psadbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF6); }
 void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, 0x66, NONE, 0x38); }
 void pshufd(const Mmx& mmx, const Operand& op, uint8 imm8) { opMMX(mmx, op, 0x70, 0x66, imm8); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index 7ec5d01..5ba6267 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -174,6 +174,7 @@
 	static const Type tAVX512VBMI = uint64(1) << 43;
 	static const Type tAVX512_4VNNIW = uint64(1) << 44;
 	static const Type tAVX512_4FMAPS = uint64(1) << 45;
+	static const Type tPREFETCHWT1 = uint64(1) << 46;
 
 	Cpu()
 		: type_(NONE)
@@ -251,6 +252,7 @@
 			if (data[1] & (1U << 20)) type_ |= tSMAP;
 			if (data[1] & (1U << 4)) type_ |= tHLE;
 			if (data[1] & (1U << 11)) type_ |= tRTM;
+			if (data[2] & (1U << 0)) type_ |= tPREFETCHWT1;
 		}
 		setFamily();
 	}