| // Two-source bf16 arithmetic: vadd / vdiv / vmax / vmin / vmul / vscalef / vsub. |
| // Every mnemonic in this group is issued in the same four operand forms: |
| //   1. xm registers only, no opmask |
| //   2. ym destination with opmask k1, memory source ptr[rax+128] |
| //   3. ym destination with opmask k1, memory source ptr_b[rax+128] |
| //   4. zm destination with opmask k2 and T_z, memory source ptr_b[rax+128] |
| // NOTE(review): ptr_b is presumably the embedded-broadcast memory form and |
| // T_z the zeroing-masking flag (Xbyak conventions) -- confirm. |
| vaddbf16(xm1, xm2, xm3); |
| vaddbf16(ym1|k1, ym2, ptr[rax+128]); |
| vaddbf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vaddbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vdivbf16(xm1, xm2, xm3); |
| vdivbf16(ym1|k1, ym2, ptr[rax+128]); |
| vdivbf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vdivbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vmaxbf16(xm1, xm2, xm3); |
| vmaxbf16(ym1|k1, ym2, ptr[rax+128]); |
| vmaxbf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vmaxbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vminbf16(xm1, xm2, xm3); |
| vminbf16(ym1|k1, ym2, ptr[rax+128]); |
| vminbf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vminbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vmulbf16(xm1, xm2, xm3); |
| vmulbf16(ym1|k1, ym2, ptr[rax+128]); |
| vmulbf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vmulbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| // vscalefbf16 is exercised again later in this file with opmask k5 and rcx-based addressing. |
| vscalefbf16(xm1, xm2, xm3); |
| vscalefbf16(ym1|k1, ym2, ptr[rax+128]); |
| vscalefbf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vscalefbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vsubbf16(xm1, xm2, xm3); |
| vsubbf16(ym1|k1, ym2, ptr[rax+128]); |
| vsubbf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vsubbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| // madd: vfmadd132bf16 / vfmadd213bf16 / vfmadd231bf16. |
| // Each mnemonic in the madd / nmadd / msub / nmsub groups uses four operand forms: |
| //   xm registers only; ym with k1 and ptr[rax+128]; ym with k1 and ptr_b[rax+128]; |
| //   zm with k2 and T_z and ptr_b[rax+128]. |
| vfmadd132bf16(xm1, xm2, xm3); |
| vfmadd132bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vfmadd213bf16(xm1, xm2, xm3); |
| vfmadd213bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vfmadd231bf16(xm1, xm2, xm3); |
| vfmadd231bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| // nmadd: vfnmadd132bf16 / vfnmadd213bf16 / vfnmadd231bf16 (same four operand forms). |
| vfnmadd132bf16(xm1, xm2, xm3); |
| vfnmadd132bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfnmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfnmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vfnmadd213bf16(xm1, xm2, xm3); |
| vfnmadd213bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfnmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfnmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vfnmadd231bf16(xm1, xm2, xm3); |
| vfnmadd231bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfnmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfnmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| // msub: vfmsub132bf16 / vfmsub213bf16 / vfmsub231bf16 (same four operand forms). |
| vfmsub132bf16(xm1, xm2, xm3); |
| vfmsub132bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vfmsub213bf16(xm1, xm2, xm3); |
| vfmsub213bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vfmsub231bf16(xm1, xm2, xm3); |
| vfmsub231bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| // nmsub: vfnmsub132bf16 / vfnmsub213bf16 / vfnmsub231bf16 (same four operand forms). |
| vfnmsub132bf16(xm1, xm2, xm3); |
| vfnmsub132bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfnmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfnmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vfnmsub213bf16(xm1, xm2, xm3); |
| vfnmsub213bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfnmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfnmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| vfnmsub231bf16(xm1, xm2, xm3); |
| vfnmsub231bf16(ym1|k1, ym2, ptr[rax+128]); |
| vfnmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]); |
| vfnmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); |
| |
| // vcmpbf16: destination is an opmask register (k1..k6); the last argument is an |
| // integer immediate (5 through 10 here). Forms covered: xm reg/reg, ym reg/reg, |
| // ym with ptr_b memory, zm reg/reg, zm with ptr memory, zm with ptr_b memory. |
| // Register numbers above 7 (ym15, zm30, zm20) are used in some forms. |
| vcmpbf16(k1, xm5, xm4, 5); |
| vcmpbf16(k2, ym5, ym4, 6); |
| vcmpbf16(k3, ym15, ptr_b[rax+128], 7); |
| vcmpbf16(k4, zm30, zm20, 8); |
| vcmpbf16(k5, zm1, ptr[rax+128], 9); |
| vcmpbf16(k6, zm10, ptr_b[rax+128], 10); |
| |
| // vfpclassbf16: opmask destination, one source, integer immediate (5 through 13 here). |
| // Register sources cover xm/ym/zm; memory sources use explicitly sized operands |
| // (xword/yword/zword) and their _b counterparts. Most forms attach k5 as the mask. |
| vfpclassbf16(k1, xm4, 5); |
| vfpclassbf16(k2|k5, ym4, 6); |
| vfpclassbf16(k3|k5, zm20, 7); |
| vfpclassbf16(k3|k5, xword[rax+128], 8); |
| vfpclassbf16(k3, xword_b[rax+128], 9); |
| // NOTE(review): the next line uses k5 as both destination and mask; confirm this is |
| // intentional (e.g. not meant to be k4) against the expected-encoding data. |
| vfpclassbf16(k5|k5, yword[rax+128], 10); |
| vfpclassbf16(k6|k5, yword_b[rax+128], 11); |
| vfpclassbf16(k7|k5, zword[rax+128], 12); |
| vfpclassbf16(k7|k5, zword_b[rax+128], 13); |
| |
| // vcomisbf16: two forms only -- xm register source and ptr[rax+128] memory source; |
| // no opmask and no ptr_b form is exercised here. |
| vcomisbf16(xm2, xm3); |
| vcomisbf16(xm2, ptr[rax+128]); |
| |
| // vgetexpbf16: one-source form, destination xm1/ym1/zm1 masked with k3. |
| // For each width the source is a register, ptr[rax+128], then ptr_b[rax+128]. |
| // NOTE(review): register sources here are spelled xmm2/ymm2/zmm2 while the rest of |
| // the file uses xm/ym/zm; presumably these are aliases -- confirm. |
| vgetexpbf16(xm1|k3, xmm2); |
| vgetexpbf16(xm1|k3, ptr[rax+128]); |
| vgetexpbf16(xm1|k3, ptr_b[rax+128]); |
| |
| vgetexpbf16(ym1|k3, ymm2); |
| vgetexpbf16(ym1|k3, ptr[rax+128]); |
| vgetexpbf16(ym1|k3, ptr_b[rax+128]); |
| |
| vgetexpbf16(zm1|k3, zmm2); |
| vgetexpbf16(zm1|k3, ptr[rax+128]); |
| vgetexpbf16(zm1|k3, ptr_b[rax+128]); |
| |
| // vgetmantbf16: same operand layout as vgetexpbf16 plus a trailing immediate; |
| // the immediate is 3 for the register form, 5 for ptr, and 9 for ptr_b at every width. |
| vgetmantbf16(xm1|k3, xmm2, 3); |
| vgetmantbf16(xm1|k3, ptr[rax+128], 5); |
| vgetmantbf16(xm1|k3, ptr_b[rax+128], 9); |
| |
| vgetmantbf16(ym1|k3, ymm2, 3); |
| vgetmantbf16(ym1|k3, ptr[rax+128], 5); |
| vgetmantbf16(ym1|k3, ptr_b[rax+128], 9); |
| |
| vgetmantbf16(zm1|k3, zmm2, 3); |
| vgetmantbf16(zm1|k3, ptr[rax+128], 5); |
| vgetmantbf16(zm1|k3, ptr_b[rax+128], 9); |
| |
| // vrcpbf16: one-source form, destination xm1/ym1/zm1 masked with k5. |
| // For each width the source is a register, ptr[rcx+128], then ptr_b[rcx+128]. |
| vrcpbf16(xm1|k5, xm2); |
| vrcpbf16(xm1|k5, ptr[rcx+128]); |
| vrcpbf16(xm1|k5, ptr_b[rcx+128]); |
| |
| vrcpbf16(ym1|k5, ym2); |
| vrcpbf16(ym1|k5, ptr[rcx+128]); |
| vrcpbf16(ym1|k5, ptr_b[rcx+128]); |
| |
| vrcpbf16(zm1|k5, zm2); |
| vrcpbf16(zm1|k5, ptr[rcx+128]); |
| vrcpbf16(zm1|k5, ptr_b[rcx+128]); |
| |
| // vreducebf16: one source plus immediate (always 1 here), destination masked with k4, |
| // memory forms addressed through rax+128. |
| vreducebf16(xm1|k4, xm2, 1); |
| vreducebf16(xm1|k4, ptr[rax+128], 1); |
| vreducebf16(xm1|k4, ptr_b[rax+128], 1); |
| |
| vreducebf16(ym1|k4, ym2, 1); |
| vreducebf16(ym1|k4, ptr[rax+128], 1); |
| vreducebf16(ym1|k4, ptr_b[rax+128], 1); |
| |
| vreducebf16(zm1|k4, zm2, 1); |
| vreducebf16(zm1|k4, ptr[rax+128], 1); |
| vreducebf16(zm1|k4, ptr_b[rax+128], 1); |
| |
| // vrndscalebf16: identical operand layout to vreducebf16 (k4 mask, rax+128, immediate 1). |
| vrndscalebf16(xm1|k4, xm2, 1); |
| vrndscalebf16(xm1|k4, ptr[rax+128], 1); |
| vrndscalebf16(xm1|k4, ptr_b[rax+128], 1); |
| |
| vrndscalebf16(ym1|k4, ym2, 1); |
| vrndscalebf16(ym1|k4, ptr[rax+128], 1); |
| vrndscalebf16(ym1|k4, ptr_b[rax+128], 1); |
| |
| vrndscalebf16(zm1|k4, zm2, 1); |
| vrndscalebf16(zm1|k4, ptr[rax+128], 1); |
| vrndscalebf16(zm1|k4, ptr_b[rax+128], 1); |
| |
| // vrsqrtbf16: identical operand layout to vrcpbf16 (k5 mask, rcx+128, no immediate). |
| vrsqrtbf16(xm1|k5, xm2); |
| vrsqrtbf16(xm1|k5, ptr[rcx+128]); |
| vrsqrtbf16(xm1|k5, ptr_b[rcx+128]); |
| |
| vrsqrtbf16(ym1|k5, ym2); |
| vrsqrtbf16(ym1|k5, ptr[rcx+128]); |
| vrsqrtbf16(ym1|k5, ptr_b[rcx+128]); |
| |
| vrsqrtbf16(zm1|k5, zm2); |
| vrsqrtbf16(zm1|k5, ptr[rcx+128]); |
| vrsqrtbf16(zm1|k5, ptr_b[rcx+128]); |
| |
| // vscalefbf16, second group: this mnemonic also appears earlier in the file with the |
| // k1/k2 masks and rax addressing. Here every width (xm/ym/zm) gets register, ptr and |
| // ptr_b sources with opmask k5, rcx+128 addressing, and first sources xm5 / ym9 / zm30. |
| vscalefbf16(xm1|k5, xm5, xm2); |
| vscalefbf16(xm1|k5, xm5, ptr[rcx+128]); |
| vscalefbf16(xm1|k5, xm5, ptr_b[rcx+128]); |
| |
| vscalefbf16(ym1|k5, ym9, ym2); |
| vscalefbf16(ym1|k5, ym9, ptr[rcx+128]); |
| vscalefbf16(ym1|k5, ym9, ptr_b[rcx+128]); |
| |
| vscalefbf16(zm1|k5, zm30, zm2); |
| vscalefbf16(zm1|k5, zm30, ptr[rcx+128]); |
| vscalefbf16(zm1|k5, zm30, ptr_b[rcx+128]); |
| |
| // vsqrtbf16: one-source form, destination xm5/ym5/zm5 masked with k3. |
| // For each width the source is a register, ptr[rax+128], then ptr_b[rax+128]. |
| // NOTE(review): register sources are spelled xmm4/ymm4/zmm4 rather than xm4/ym4/zm4; |
| // presumably aliases -- confirm. |
| vsqrtbf16(xm5|k3, xmm4); |
| vsqrtbf16(xm5|k3, ptr[rax+128]); |
| vsqrtbf16(xm5|k3, ptr_b[rax+128]); |
| |
| vsqrtbf16(ym5|k3, ymm4); |
| vsqrtbf16(ym5|k3, ptr[rax+128]); |
| vsqrtbf16(ym5|k3, ptr_b[rax+128]); |
| |
| vsqrtbf16(zm5|k3, zmm4); |
| vsqrtbf16(zm5|k3, ptr[rax+128]); |
| vsqrtbf16(zm5|k3, ptr_b[rax+128]); |