blob: 0f9ea8c2b6cf6a2a8674c9928980e0dd39cadbf8 [file] [log] [blame] [edit]
// Three-operand BF16 arithmetic: vadd / vdiv / vmax / vmin / vmul / vscalef / vsub.
// Every mnemonic below is issued with the same four operand shapes:
//   1. xm1, xm2, xm3                    - registers only, no opmask
//   2. ym1|k1, ym2, ptr[rax+128]        - opmask k1, ptr memory source
//   3. ym1|k1, ym2, ptr_b[rax+128]      - opmask k1, ptr_b memory source
//   4. zm1|k2|T_z, zm2, ptr_b[rax+128]  - opmask k2 combined with T_z, ptr_b memory source
// NOTE(review): the syntax looks like the Xbyak C++ DSL, in which ptr_b would be an
// embedded-broadcast memory operand and T_z would select zeroing-masking - confirm.
vaddbf16(xm1, xm2, xm3);
vaddbf16(ym1|k1, ym2, ptr[rax+128]);
vaddbf16(ym1|k1, ym2, ptr_b[rax+128]);
vaddbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vdivbf16(xm1, xm2, xm3);
vdivbf16(ym1|k1, ym2, ptr[rax+128]);
vdivbf16(ym1|k1, ym2, ptr_b[rax+128]);
vdivbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vmaxbf16(xm1, xm2, xm3);
vmaxbf16(ym1|k1, ym2, ptr[rax+128]);
vmaxbf16(ym1|k1, ym2, ptr_b[rax+128]);
vmaxbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vminbf16(xm1, xm2, xm3);
vminbf16(ym1|k1, ym2, ptr[rax+128]);
vminbf16(ym1|k1, ym2, ptr_b[rax+128]);
vminbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vmulbf16(xm1, xm2, xm3);
vmulbf16(ym1|k1, ym2, ptr[rax+128]);
vmulbf16(ym1|k1, ym2, ptr_b[rax+128]);
vmulbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// vscalefbf16 is exercised a second time further down the file with a different
// opmask (k5), different second sources and an rcx-based address.
vscalefbf16(xm1, xm2, xm3);
vscalefbf16(ym1|k1, ym2, ptr[rax+128]);
vscalefbf16(ym1|k1, ym2, ptr_b[rax+128]);
vscalefbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vsubbf16(xm1, xm2, xm3);
vsubbf16(ym1|k1, ym2, ptr[rax+128]);
vsubbf16(ym1|k1, ym2, ptr_b[rax+128]);
vsubbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// madd: vfmadd132bf16 / vfmadd213bf16 / vfmadd231bf16.
// Each of the three variants is issued with four operand shapes:
// unmasked xm registers, ym|k1 with a ptr source, ym|k1 with a ptr_b source,
// and zm|k2|T_z with a ptr_b source (all memory operands are [rax+128]).
vfmadd132bf16(xm1, xm2, xm3);
vfmadd132bf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmadd213bf16(xm1, xm2, xm3);
vfmadd213bf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmadd231bf16(xm1, xm2, xm3);
vfmadd231bf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// nmadd: vfnmadd132bf16 / vfnmadd213bf16 / vfnmadd231bf16.
// Each of the three variants is issued with four operand shapes:
// unmasked xm registers, ym|k1 with a ptr source, ym|k1 with a ptr_b source,
// and zm|k2|T_z with a ptr_b source (all memory operands are [rax+128]).
vfnmadd132bf16(xm1, xm2, xm3);
vfnmadd132bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmadd213bf16(xm1, xm2, xm3);
vfnmadd213bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmadd231bf16(xm1, xm2, xm3);
vfnmadd231bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// msub: vfmsub132bf16 / vfmsub213bf16 / vfmsub231bf16.
// Each of the three variants is issued with four operand shapes:
// unmasked xm registers, ym|k1 with a ptr source, ym|k1 with a ptr_b source,
// and zm|k2|T_z with a ptr_b source (all memory operands are [rax+128]).
vfmsub132bf16(xm1, xm2, xm3);
vfmsub132bf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmsub213bf16(xm1, xm2, xm3);
vfmsub213bf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmsub231bf16(xm1, xm2, xm3);
vfmsub231bf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// nmsub: vfnmsub132bf16 / vfnmsub213bf16 / vfnmsub231bf16.
// Each of the three variants is issued with four operand shapes:
// unmasked xm registers, ym|k1 with a ptr source, ym|k1 with a ptr_b source,
// and zm|k2|T_z with a ptr_b source (all memory operands are [rax+128]).
vfnmsub132bf16(xm1, xm2, xm3);
vfnmsub132bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmsub213bf16(xm1, xm2, xm3);
vfnmsub213bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmsub231bf16(xm1, xm2, xm3);
vfnmsub231bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// vcmpbf16: the destination is an opmask register (k1..k6), followed by two
// sources and a trailing immediate (5..10, a different value on every line).
// Covers xm/ym/zm register pairs, a ptr memory source and ptr_b memory sources.
// NOTE(review): the immediate is presumably the comparison predicate - confirm.
vcmpbf16(k1, xm5, xm4, 5);
vcmpbf16(k2, ym5, ym4, 6);
vcmpbf16(k3, ym15, ptr_b[rax+128], 7);
vcmpbf16(k4, zm30, zm20, 8);
vcmpbf16(k5, zm1, ptr[rax+128], 9);
vcmpbf16(k6, zm10, ptr_b[rax+128], 10);
// vfpclassbf16: opmask destination, one source, trailing immediate (5..13).
// Most lines write the destination as kN|k5; the first line and the xword_b
// line use a bare opmask register instead.
// Memory sources are spelled with explicit sizes (xword/yword/zword and their
// _b variants) rather than ptr/ptr_b.
// NOTE(review): the explicit sizes are presumably required because no vector
// register operand is present to imply the operand width - confirm.
vfpclassbf16(k1, xm4, 5);
vfpclassbf16(k2|k5, ym4, 6);
vfpclassbf16(k3|k5, zm20, 7);
vfpclassbf16(k3|k5, xword[rax+128], 8);
vfpclassbf16(k3, xword_b[rax+128], 9);
// NOTE(review): k5|k5 names the same opmask register on both sides of '|';
// confirm this is an intentional case rather than a typo for another kN|k5.
vfpclassbf16(k5|k5, yword[rax+128], 10);
vfpclassbf16(k6|k5, yword_b[rax+128], 11);
vfpclassbf16(k7|k5, zword[rax+128], 12);
vfpclassbf16(k7|k5, zword_b[rax+128], 13);
// vcomisbf16: two-operand form with an xm first operand only; one register
// source and one ptr memory source. No opmask, ptr_b or ym/zm variants are
// exercised for this mnemonic.
vcomisbf16(xm2, xm3);
vcomisbf16(xm2, ptr[rax+128]);
// vgetexpbf16: two-operand form under opmask k3. For each destination width
// (xm1, ym1, zm1) the source is a register, then ptr[rax+128], then ptr_b[rax+128].
// NOTE(review): register sources here use the long spellings xmm2/ymm2/zmm2 while
// destinations use the short xm1/ym1/zm1; presumably these are aliases - confirm.
vgetexpbf16(xm1|k3, xmm2);
vgetexpbf16(xm1|k3, ptr[rax+128]);
vgetexpbf16(xm1|k3, ptr_b[rax+128]);
vgetexpbf16(ym1|k3, ymm2);
vgetexpbf16(ym1|k3, ptr[rax+128]);
vgetexpbf16(ym1|k3, ptr_b[rax+128]);
vgetexpbf16(zm1|k3, zmm2);
vgetexpbf16(zm1|k3, ptr[rax+128]);
vgetexpbf16(zm1|k3, ptr_b[rax+128]);
// vgetmantbf16: destination under opmask k3, one source, trailing immediate.
// For each destination width (xm1, ym1, zm1) the source is a register (imm 3),
// then ptr[rax+128] (imm 5), then ptr_b[rax+128] (imm 9).
// NOTE(review): register sources here use the long spellings xmm2/ymm2/zmm2 while
// destinations use the short xm1/ym1/zm1; presumably these are aliases - confirm.
vgetmantbf16(xm1|k3, xmm2, 3);
vgetmantbf16(xm1|k3, ptr[rax+128], 5);
vgetmantbf16(xm1|k3, ptr_b[rax+128], 9);
vgetmantbf16(ym1|k3, ymm2, 3);
vgetmantbf16(ym1|k3, ptr[rax+128], 5);
vgetmantbf16(ym1|k3, ptr_b[rax+128], 9);
vgetmantbf16(zm1|k3, zmm2, 3);
vgetmantbf16(zm1|k3, ptr[rax+128], 5);
vgetmantbf16(zm1|k3, ptr_b[rax+128], 9);
// vrcpbf16: two-operand form under opmask k5. For each destination width
// (xm1, ym1, zm1) the source is a register, then ptr[rcx+128], then ptr_b[rcx+128].
// Memory operands in this group are based on rcx rather than rax.
vrcpbf16(xm1|k5, xm2);
vrcpbf16(xm1|k5, ptr[rcx+128]);
vrcpbf16(xm1|k5, ptr_b[rcx+128]);
vrcpbf16(ym1|k5, ym2);
vrcpbf16(ym1|k5, ptr[rcx+128]);
vrcpbf16(ym1|k5, ptr_b[rcx+128]);
vrcpbf16(zm1|k5, zm2);
vrcpbf16(zm1|k5, ptr[rcx+128]);
vrcpbf16(zm1|k5, ptr_b[rcx+128]);
// vreducebf16: destination under opmask k4, one source, immediate fixed at 1.
// For each destination width (xm1, ym1, zm1) the source is a register,
// then ptr[rax+128], then ptr_b[rax+128].
vreducebf16(xm1|k4, xm2, 1);
vreducebf16(xm1|k4, ptr[rax+128], 1);
vreducebf16(xm1|k4, ptr_b[rax+128], 1);
vreducebf16(ym1|k4, ym2, 1);
vreducebf16(ym1|k4, ptr[rax+128], 1);
vreducebf16(ym1|k4, ptr_b[rax+128], 1);
vreducebf16(zm1|k4, zm2, 1);
vreducebf16(zm1|k4, ptr[rax+128], 1);
vreducebf16(zm1|k4, ptr_b[rax+128], 1);
// vrndscalebf16: destination under opmask k4, one source, immediate fixed at 1.
// For each destination width (xm1, ym1, zm1) the source is a register,
// then ptr[rax+128], then ptr_b[rax+128].
vrndscalebf16(xm1|k4, xm2, 1);
vrndscalebf16(xm1|k4, ptr[rax+128], 1);
vrndscalebf16(xm1|k4, ptr_b[rax+128], 1);
vrndscalebf16(ym1|k4, ym2, 1);
vrndscalebf16(ym1|k4, ptr[rax+128], 1);
vrndscalebf16(ym1|k4, ptr_b[rax+128], 1);
vrndscalebf16(zm1|k4, zm2, 1);
vrndscalebf16(zm1|k4, ptr[rax+128], 1);
vrndscalebf16(zm1|k4, ptr_b[rax+128], 1);
// vrsqrtbf16: two-operand form under opmask k5. For each destination width
// (xm1, ym1, zm1) the source is a register, then ptr[rcx+128], then ptr_b[rcx+128].
// Memory operands in this group are based on rcx rather than rax.
vrsqrtbf16(xm1|k5, xm2);
vrsqrtbf16(xm1|k5, ptr[rcx+128]);
vrsqrtbf16(xm1|k5, ptr_b[rcx+128]);
vrsqrtbf16(ym1|k5, ym2);
vrsqrtbf16(ym1|k5, ptr[rcx+128]);
vrsqrtbf16(ym1|k5, ptr_b[rcx+128]);
vrsqrtbf16(zm1|k5, zm2);
vrsqrtbf16(zm1|k5, ptr[rcx+128]);
vrsqrtbf16(zm1|k5, ptr_b[rcx+128]);
// vscalefbf16, second set: three-operand form under opmask k5 at all three
// widths. The middle operand varies by width (xm5, ym9, zm30) and the last
// operand is a register, then ptr[rcx+128], then ptr_b[rcx+128].
// The same mnemonic also appears earlier in the file alongside the basic
// arithmetic instructions, with k1/k2 masks and rax-based addresses.
vscalefbf16(xm1|k5, xm5, xm2);
vscalefbf16(xm1|k5, xm5, ptr[rcx+128]);
vscalefbf16(xm1|k5, xm5, ptr_b[rcx+128]);
vscalefbf16(ym1|k5, ym9, ym2);
vscalefbf16(ym1|k5, ym9, ptr[rcx+128]);
vscalefbf16(ym1|k5, ym9, ptr_b[rcx+128]);
vscalefbf16(zm1|k5, zm30, zm2);
vscalefbf16(zm1|k5, zm30, ptr[rcx+128]);
vscalefbf16(zm1|k5, zm30, ptr_b[rcx+128]);
// vsqrtbf16: two-operand form under opmask k3. For each destination width
// (xm5, ym5, zm5) the source is a register, then ptr[rax+128], then ptr_b[rax+128].
// NOTE(review): register sources here use the long spellings xmm4/ymm4/zmm4 while
// destinations use the short xm5/ym5/zm5; presumably these are aliases - confirm.
vsqrtbf16(xm5|k3, xmm4);
vsqrtbf16(xm5|k3, ptr[rax+128]);
vsqrtbf16(xm5|k3, ptr_b[rax+128]);
vsqrtbf16(ym5|k3, ymm4);
vsqrtbf16(ym5|k3, ptr[rax+128]);
vsqrtbf16(ym5|k3, ptr_b[rax+128]);
vsqrtbf16(zm5|k3, zmm4);
vsqrtbf16(zm5|k3, ptr[rax+128]);
vsqrtbf16(zm5|k3, ptr_b[rax+128]);