blob: 46dea5f75e3eb87f9397b5277a287bfd2afc6ae6 [file] [log] [blame]
// Intel AMX (Advanced Matrix Extensions) instruction forms, written as one
// emitter-style call per instruction: mnemonic(operands...), where ptr[...]
// denotes a memory operand and tmm0..tmm7 are tile registers.
// Several mnemonics appear twice: once addressed through legacy GPRs and once
// through r25/r29/r30/r31, which exist only with the APX extended register
// file (r16-r31).
// NOTE(review): the code that consumes these statements is not visible in
// this chunk; presumably each call emits one encoding in sequence, so confirm
// before reordering or reformatting any line.

// Tile configuration: load from memory (ldtilecfg) and store to memory
// (sttilecfg), each with a legacy-register and an extended-register address.
ldtilecfg(ptr[rax + rcx * 4 + 64]);
ldtilecfg(ptr [r30+r29*4+0x12]);
sttilecfg(ptr[rsp + rax * 8 + 128]);
sttilecfg(ptr [r30+r29*4+0x12]);

// Tile loads: tileloadd and its t1 variant (tileloaddt1), destination tile
// first, memory source second.
tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]);
tileloadd(tmm2, ptr [r30+r29*4+0x12]);
tileloaddt1(tmm4, ptr[r8 + r9 + 32]);
tileloaddt1(tmm7, ptr [r30+r29*4+0x12]);

// tilerelease takes no operands.
tilerelease();

// Tile store: memory destination first, source tile second.
tilestored(ptr[r10 + r11 * 2 + 32], tmm2);
tilestored(ptr [r30+r29*4+0x12], tmm1);

// Zero a single tile register.
tilezero(tmm7);

// Tile dot-product instructions, three tile operands each (all distinct):
// the four signed/unsigned byte variants (AMX-INT8), then FP16 and BF16.
tdpbssd(tmm1, tmm2, tmm3);
tdpbsud(tmm2, tmm3, tmm4);
tdpbusd(tmm3, tmm4, tmm5);
tdpbuud(tmm4, tmm5, tmm6);
tdpfp16ps(tmm5, tmm6, tmm7);
tdpbf16ps(tmm5, tmm6, tmm7);

// tileloadd addressing-form variants on the same destination tile:
// same register as base and index, scaled index without displacement,
// explicit *1 scale with displacement, and extended registers.
tileloadd(tmm1, ptr[r8+r8]);
tileloadd(tmm1, ptr[rax+rcx*4]);
tileloadd(tmm1, ptr[r8+r9*1+0x40]);
tileloadd(tmm1, ptr[r30+r29*1+0x80]);

// "rs" tile loads (AMX-MOVRS): tileloaddrs and tileloaddrst1, each with a
// legacy-register base and an extended-register base (r31, r25).
tileloaddrs(tmm3, ptr[rdi + rdx * 2 + 8]);
tileloaddrs(tmm7, ptr[r31 + rdx * 2 + 8]);
tileloaddrst1(tmm4, ptr[r8 + r9 + 32]);
tileloaddrst1(tmm4, ptr[r25 + r9 + 32]);

// FP8 dot products (AMX-FP8: the four bf8/hf8 operand pairings), followed by
// the TF32 matrix multiply (AMX-TF32). All use the same tmm1, tmm2, tmm3
// operand triple.
tdpbf8ps(tmm1, tmm2, tmm3);
tdpbhf8ps(tmm1, tmm2, tmm3);
tdphbf8ps(tmm1, tmm2, tmm3);
tdphf8ps(tmm1, tmm2, tmm3);
tmmultf32ps(tmm1, tmm2, tmm3);