Add AVX512_BF16 support (vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps) and AVX512_VP2INTERSECT CPU detection
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 84cd612..3f0508c 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -368,6 +368,9 @@
 
 		{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
 		{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
+
+		{ 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
+		{ 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		const Tbl *p = &tbl[i];
@@ -711,6 +714,8 @@
 	puts("void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); }");
 
 	puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }");
+	puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }");
+
 }
 
 void putV4FMA()
diff --git a/readme.md b/readme.md
index 9c5f2a6..7449d22 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
 
-# Xbyak 5.79 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
+# Xbyak 5.80 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
 
 ## Abstract
 
@@ -392,6 +392,7 @@
 http://opensource.org/licenses/BSD-3-Clause
 
 ## History
+* 2019/May/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps
 * 2019/Apr/27 ver 5.79 vcmppd/vcmpps supports ptr_b(thanks to jkopinsky)
 * 2019/Apr/15 ver 5.78 rewrite Reg::changeBit() (thanks to MerryMage)
 * 2019/Mar/06 ver 5.77 fix number of cores that share LLC cache by densamoilov
diff --git a/readme.txt b/readme.txt
index e75f90f..3eaffdb 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,5 +1,5 @@
 

-    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.79

+    C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.80

 

 -----------------------------------------------------------------------------

 ◎概要

@@ -373,6 +373,7 @@
 -----------------------------------------------------------------------------

 ◎履歴

 

+2019/05/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps

 2019/04/27 ver 5.79 vcmppd/vcmppsのptr_b対応忘れ(thanks to jkopinsky)

 2019/04/15 ver 5.78 Reg::changeBit()のリファクタリング(thanks to MerryMage)

 2019/03/06 ver 5.77 LLCキャッシュを共有数CPU数の修整(by densamoilov)

diff --git a/sample/test_util.cpp b/sample/test_util.cpp
index d75a5e0..afb6e5a 100644
--- a/sample/test_util.cpp
+++ b/sample/test_util.cpp
@@ -78,6 +78,8 @@
 		{ Cpu::tAVX512_VNNI, "avx512_vnni" },
 		{ Cpu::tAVX512_BITALG, "avx512_bitalg" },
 		{ Cpu::tAVX512_VPOPCNTDQ, "avx512_vpopcntdq" },
+		{ Cpu::tAVX512_BF16, "avx512_bf16" },
+		{ Cpu::tAVX512_VP2INTERSECT, "avx512_vp2intersect" },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
diff --git a/test/misc.cpp b/test/misc.cpp
index 3967fef..ee57c54 100644
--- a/test/misc.cpp
+++ b/test/misc.cpp
@@ -683,4 +683,42 @@
 	CYBOZU_TEST_EQUAL(c.getSize(), n);
 	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 }
+
+CYBOZU_TEST_AUTO(bf16) // encoding test: bytes generated for the three bf16 mnemonics must match a fixed table
+{
+	struct Code : Xbyak::CodeGenerator { // generator whose constructor emits every instruction under test
+		Code()
+		{
+			vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); // xmm operands, opmask k1
+			vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); // ymm operands, opmask k1 combined with T_z
+			vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); // zmm operands, opmask k1
+
+			vcvtneps2bf16(xmm0, xword [rax + 64]); // xmm destination, xword memory source, no opmask
+			vcvtneps2bf16(xmm0 | k1, yword [rax + 64]); // xmm destination, yword memory source
+			vcvtneps2bf16(ymm0 | k1, zword [rax + 64]); // ymm destination, zword memory source
+			vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); // unsized ptr with ymm destination; expected bytes equal the zword row
+
+			vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); // xmm operands, opmask k1
+			vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]); // ymm operands, opmask k1
+			vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]); // zmm operands, opmask k1
+		}
+	} c; // constructing c runs the constructor above and fills the code buffer
+	const uint8_t tbl[] = { // expected encodings: one 7-byte row per instruction, in emission order
+		0x62, 0xf2, 0x77, 0x09, 0x72, 0x40, 0x04, // vcvtne2ps2bf16 xmm0|k1, xmm1, [rax+64]
+		0x62, 0xf2, 0x7f, 0xa9, 0x72, 0x40, 0x02, // vcvtne2ps2bf16 ymm0|k1|T_z, ymm0, [rax+64]
+		0x62, 0xf2, 0x77, 0x49, 0x72, 0x40, 0x01, // vcvtne2ps2bf16 zmm0|k1, zmm1, [rax+64]
+
+		0x62, 0xf2, 0x7e, 0x08, 0x72, 0x40, 0x04, // vcvtneps2bf16 xmm0, xword [rax+64]
+		0x62, 0xf2, 0x7e, 0x29, 0x72, 0x40, 0x02, // vcvtneps2bf16 xmm0|k1, yword [rax+64]
+		0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01, // vcvtneps2bf16 ymm0|k1, zword [rax+64]
+		0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01, // vcvtneps2bf16 ymm0|k1, ptr [rax+64] (same bytes as the zword row)
+
+		0x62, 0xf2, 0x76, 0x09, 0x52, 0x40, 0x04, // vdpbf16ps xmm0|k1, xmm1, [rax+64]
+		0x62, 0xf2, 0x76, 0x29, 0x52, 0x40, 0x02, // vdpbf16ps ymm0|k1, ymm1, [rax+64]
+		0x62, 0xf2, 0x76, 0x49, 0x52, 0x40, 0x01, // vdpbf16ps zmm0|k1, zmm1, [rax+64]
+	};
+	const size_t n = sizeof(tbl) / sizeof(tbl[0]); // number of expected bytes
+	CYBOZU_TEST_EQUAL(c.getSize(), n); // generated code size must equal the table size
+	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // generated code is compared with tbl over n elements
+}
 #endif
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index c28a536..3d8ed65 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -113,7 +113,7 @@
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x5790 /* 0xABCD = A.BC(D) */
+	VERSION = 0x5800 /* 0xABCD = A.BC(D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
@@ -551,6 +551,7 @@
 		idx_ = idx;
 		kind_ = kind;
 		bit_ = bit;
+		if (bit >= 128) return; // keep mask_ and rounding_
 		mask_ = 0;
 		rounding_ = 0;
 		return;
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 2733c61..732b097 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "5.79"; }
+const char *getVersionString() const { return "5.80"; }
 void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
 void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
 void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@@ -1684,6 +1684,8 @@
 void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8A); }
 void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8A); }
 void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
+void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
+void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
 void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
 void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x79); }
 void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
@@ -1709,6 +1711,7 @@
 void vcvtusi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F2 | T_0F | T_MUST_EVEX, T_W1 | T_EW1 | T_ER_X | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); }
 void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F3 | T_0F | T_MUST_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); }
 void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x42, imm); }
+void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); }
 void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); }
 void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); }
 void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x88); }
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index c2474c5..2929bb0 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -331,6 +331,8 @@
 	static const Type tAVX512_VNNI = uint64(1) << 54;
 	static const Type tAVX512_BITALG = uint64(1) << 55;
 	static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56;
+	static const Type tAVX512_BF16 = uint64(1) << 57;
+	static const Type tAVX512_VP2INTERSECT = uint64(1) << 58;
 
 	Cpu()
 		: type_(NONE)
@@ -410,6 +412,8 @@
 						if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
 						if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
 						if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
+						if (EAX & (1U << 5)) type_ |= tAVX512_BF16; // NOTE(review): Intel documents AVX512_BF16 as CPUID.(EAX=7,ECX=1):EAX[5], while the neighbouring ECX/EDX tests are sub-leaf 0 bits - confirm EAX holds sub-leaf 1 output here
+						if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
 					}
 				}
 			}