| # Usage |
| |
| Inherit `Xbyak::CodeGenerator` class and make the class method. |
| ``` |
| #include <xbyak/xbyak.h> |
| |
| struct Code : Xbyak::CodeGenerator { |
| Code(int x) |
| { |
| mov(eax, x); |
| ret(); |
| } |
| }; |
| ``` |
| Or you can pass the instance of CodeGenerator without inheriting. |
| ``` |
| void genCode(Xbyak::CodeGenerator& code, int x) { |
| using namespace Xbyak::util; |
| code.mov(eax, x); |
| code.ret(); |
| } |
| ``` |
| |
| Make an instance of the class and get the function |
| pointer by calling `getCode()` and call it. |
| ``` |
| Code c(5); |
| int (*f)() = c.getCode<int (*)()>(); |
| printf("ret=%d\n", f()); // ret = 5 |
| ``` |
| |
| ## Syntax |
| Similar to MASM/NASM syntax with parentheses. |
| |
| ``` |
| NASM Xbyak |
| mov eax, ebx --> mov(eax, ebx); |
| inc ecx inc(ecx); |
| ret --> ret(); |
| ``` |
| |
| ## Addressing |
| Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory, |
| otherwise use `ptr`. |
| |
| ``` |
| (ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement] |
| [rip + 32bit disp] ; x64 only |
| |
| NASM Xbyak |
| mov eax, [ebx+ecx] --> mov(eax, ptr [ebx+ecx]); |
| mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]); |
| test byte [esp], 4 --> test(byte [esp], 4); |
| inc qword [rax] --> inc(qword [rax]); |
| ``` |
| **Note**: `qword`, ... are member variables, then don't use `dword` as unsigned int type. |
| |
| ### How to use Selector (Segment Register) |
| ``` |
| mov eax, [fs:eax] --> putSeg(fs); |
| mov(eax, ptr [eax]); |
| mov ax, cs --> mov(ax, cs); |
| ``` |
| **Note**: Segment class is not derived from `Operand`. |
| |
| ## AVX |
| |
| ``` |
| vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3 |
| vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory |
| vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3); |
| ``` |
| |
| **Note**: |
| If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility. |
| But the newer version will not support it. |
| ``` |
| vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3 |
| ``` |
| |
| ## AVX-512 |
| |
| ``` |
| vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30); |
| vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]); |
| vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]); |
| vaddpd zmm2{k5}, zmm4, zmm2 --> vaddpd(zmm2 | k5, zmm4, zmm2); |
| vaddpd zmm2{k5}{z}, zmm4, zmm2 --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2); |
| vaddpd zmm2{k5}{z}, zmm4, zmm2,{rd-sae} --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2 | T_rd_sae); |
| vaddpd(zmm2 | k5 | T_z | T_rd_sae, zmm4, zmm2); // the position of `|` is arbitrary. |
| vcmppd k4{k3}, zmm1, zmm2, {sae}, 5 --> vcmppd(k4 | k3, zmm1, zmm2 | T_sae, 5); |
| |
| vaddpd xmm1, xmm2, [rax+256] --> vaddpd(xmm1, xmm2, ptr [rax+256]); |
| vaddpd xmm1, xmm2, [rax+256]{1to2} --> vaddpd(xmm1, xmm2, ptr_b [rax+256]); |
| vaddpd ymm1, ymm2, [rax+256]{1to4} --> vaddpd(ymm1, ymm2, ptr_b [rax+256]); |
| vaddpd zmm1, zmm2, [rax+256]{1to8} --> vaddpd(zmm1, zmm2, ptr_b [rax+256]); |
| vaddps zmm1, zmm2, [rax+rcx*8+8]{1to16} --> vaddps(zmm1, zmm2, ptr_b [rax+rcx*8+8]); |
| vmovsd [rax]{k1}, xmm4 --> vmovsd(ptr [rax] | k1, xmm4); |
| |
| vcvtpd2dq xmm16, oword [eax+33] --> vcvtpd2dq(xmm16, xword [eax+33]); // use xword for m128 instead of oword |
| vcvtpd2dq(xmm16, ptr [eax+33]); // default xword |
| vcvtpd2dq xmm21, [eax+32]{1to2} --> vcvtpd2dq(xmm21, ptr_b [eax+32]); |
| vcvtpd2dq xmm0, yword [eax+33] --> vcvtpd2dq(xmm0, yword [eax+33]); // use yword for m256 |
| vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); // use yword_b to broadcast |
| |
| vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 |
| vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit |
| vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit |
| |
| vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX |
| vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above |
| vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding |
| setDefaultEncoding(VexEncoding); // default encoding is VEX |
| vpdpbusd(xm0, xm1, xm2); // VEX encoding |
| ``` |
| |
| - setDefaultEncoding(PreferredEncoding encoding); |
| - Set the default encoding to select EVEX or VEX. |
| - The default value is EvexEncoding. |
| - This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd. |
| |
| ### Remark |
| * `k1`, ..., `k7` are opmask registers. |
| - `k0` is dealt as no mask. |
| - e.g. `vmovaps(zmm0|k0, ptr[rax]);` and `vmovaps(zmm0|T_z, ptr[rax]);` are same to `vmovaps(zmm0, ptr[rax]);`. |
| * use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively. |
| * `k4 | k3` is different from `k3 | k4`. |
| * use `ptr_b` for broadcast `{1toX}`. X is automatically determined. |
| * specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary. |
| |
| ## APX |
| [Advanced Performance Extensions (APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/786223/intel-advanced-performance-extensions-intel-apx-architecture-specification.html) |
| - Support 64-bit 16 additional GPRs (general-purpose registers) r16, ..., r31 |
| - 32-bit regs are r16d, ..., r31d |
| - 16-bit regs are r16w, ..., r31w |
| - 8-bit regs are r16b, ..., r31b |
| - `add(r20, r21);` |
| - `lea(r30, ptr[r29+r31]);` |
| - Support three-operand instruction |
| - `add(r20, r21, r23);` |
| - `add(r20, ptr[rax + rcx * 8 + 0x1234], r23);` |
| - Support T_nf for NF=1 (status flags update suppression) |
| - `add(r20|T_nf, r21, r23);` // Set EVEX.NF=1 |
| - Support T_zu for NF=ZU (zero upper) for imul and setcc |
| - `imul(ax|T_zu, cx, 0x1234);` // Set ND=ZU |
| - `imul(ax|T_zu|T_nf, cx, 0x1234);` // Set ND=ZU and EVEX.NF=1 |
| - `setb(r31b|T_zu);` // same as set(r31b); movzx(r31, r31b); |
| - See [sample/zero_upper.cpp](../sample/zero_upper.cpp) |
| |
| ### ccmpSCC and ctestSCC |
| |
| - ccmpSCC(op1, op2, dfv = 0); // eflags = eflags == SCC ? cmp(op1, op2) : dfv |
| - ctestSCC(op1, op2, dfv = 0); // eflags = eflags == SCC ? test(op1, op2) : dfv |
| - SCC means source condition code such as z, a, gt. |
| - See [sample/ccmp.cpp](../sample/ccmp.cpp) |
| - Specify the union of T_of(=8), T_sf(=4), T_zf(=2), or T_cf(=1) for dfv. |
| |
| |
| ## Label |
| Two kinds of Label are supported. (String literal and Label class). |
| |
| ### String literal |
| ``` |
| L("L1"); |
| jmp("L1"); |
| |
| jmp("L2"); |
| ... |
| a few mnemonics (8-bit displacement jmp) |
| ... |
| L("L2"); |
| |
| jmp("L3", T_NEAR); |
| ... |
| a lot of mnemonics (32-bit displacement jmp) |
| ... |
| L("L3"); |
| ``` |
| |
| * Call `hasUndefinedLabel()` to verify your code has no undefined label. |
| * you can use a label for immediate value of mov like as `mov(eax, "L2")`. |
| |
| ### Support `@@`, `@f`, `@b` like MASM |
| |
| ``` |
| L("@@"); // <A> |
| jmp("@b"); // jmp to <A> |
| jmp("@f"); // jmp to <B> |
| L("@@"); // <B> |
| jmp("@b"); // jmp to <B> |
| mov(eax, "@b"); |
| jmp(eax); // jmp to <B> |
| ``` |
| |
| ### Local label |
| |
| Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabel()` |
| are treated as a local label. |
| `inLocalLabel()` and `outLocalLabel()` can be nested. |
| |
| ``` |
| void func1() |
| { |
| inLocalLabel(); |
| L(".lp"); // <A> ; local label |
| ... |
| jmp(".lp"); // jmp to <A> |
| L("aaa"); // global label <C> |
| outLocalLabel(); |
| |
| inLocalLabel(); |
| L(".lp"); // <B> ; local label |
| func1(); |
| jmp(".lp"); // jmp to <B> |
| inLocalLabel(); |
| jmp("aaa"); // jmp to <C> |
| } |
| ``` |
| |
| ### short and long jump |
| Xbyak deals with jump mnemonics of an undefined label as short jump if no type is specified. |
| So if the size between jmp and label is larger than 127 byte, then xbyak will cause an error. |
| |
| ``` |
| jmp("short-jmp"); // short jmp |
| // small code |
| L("short-jmp"); |
| |
| jmp("long-jmp"); |
| // long code |
| L("long-jmp"); // throw exception |
| ``` |
| Then specify T_NEAR for jmp. |
| ``` |
| jmp("long-jmp", T_NEAR); // long jmp |
| // long code |
| L("long-jmp"); |
| ``` |
| Or call `setDefaultJmpNEAR(true);` once, then the default type is set to T_NEAR. |
| ``` |
| jmp("long-jmp"); // long jmp |
| // long code |
| L("long-jmp"); |
| ``` |
| |
| ### Label class |
| |
| `L()` and `jxx()` support Label class. |
| |
| ``` |
| Xbyak::Label label1, label2; |
| L(label1); |
| ... |
| jmp(label1); |
| ... |
| jmp(label2); |
| ... |
| L(label2); |
| ``` |
| |
| Use `putL` for jmp table |
| ``` |
| Label labelTbl, L0, L1, L2; |
| mov(rax, labelTbl); |
| // rdx is an index of jump table |
| jmp(ptr [rax + rdx * sizeof(void*)]); |
| L(labelTbl); |
| putL(L0); |
| putL(L1); |
| putL(L2); |
| L(L0); |
| .... |
| L(L1); |
| .... |
| ``` |
| |
| `assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel. |
| |
| ``` |
| Label label2; |
| Label label1 = L(); // make label1 ; same to Label label1; L(label1); |
| ... |
| jmp(label2); // label2 is not determined here |
| ... |
| assignL(label2, label1); // label2 <- label1 |
| ``` |
| The `jmp` in the above code jumps to label1 assigned by `assignL`. |
| |
| **Note**: |
| * srcLabel must be used in `L()`. |
| * dstLabel must not be used in `L()`. |
| |
| `Label::getAddress()` returns the address specified by the label instance and 0 if not specified. |
| ``` |
| // not AutoGrow mode |
| Label label; |
| assert(label.getAddress() == 0); |
| L(label); |
| assert(label.getAddress() == getCurr()); |
| ``` |
| |
| ### Rip ; relative addressing |
| ``` |
| Label label; |
| mov(eax, ptr [rip + label]); // eax = 4 |
| ... |
| |
| L(label); |
| dd(4); |
| ``` |
| ``` |
| int x; |
| ... |
| mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB |
| ``` |
| |
| ## Far jump |
| |
| Use `word|dword|qword` instead of `ptr` to specify the address size. |
| |
| ### 32 bit mode |
| ``` |
| jmp(word[eax], T_FAR); // jmp m16:16(FF /5) |
| jmp(dword[eax], T_FAR); // jmp m16:32(FF /5) |
| ``` |
| |
| ### 64 bit mode |
| ``` |
| jmp(word[rax], T_FAR); // jmp m16:16(FF /5) |
| jmp(dword[rax], T_FAR); // jmp m16:32(FF /5) |
| jmp(qword[rax], T_FAR); // jmp m16:64(REX.W FF /5) |
| ``` |
| The same applies to `call`. |
| |
| ## Code size |
| The default max code size is 4096 bytes. |
| Specify the size in constructor of `CodeGenerator()` if necessary. |
| |
| ``` |
| class Quantize : public Xbyak::CodeGenerator { |
| public: |
| Quantize() |
| : CodeGenerator(8192) |
| { |
| } |
| ... |
| }; |
| ``` |
| |
| ## User allocated memory |
| |
| You can make jit code on prepared memory. |
| |
| Call `setProtectModeRE` yourself to change memory mode if using the prepared memory. |
| |
| ``` |
| uint8_t alignas(4096) buf[8192]; // C++11 or later |
| |
| struct Code : Xbyak::CodeGenerator { |
| Code() : Xbyak::CodeGenerator(sizeof(buf), buf) |
| { |
| mov(rax, 123); |
| ret(); |
| } |
| }; |
| |
| int main() |
| { |
| Code c; |
| c.setProtectModeRE(); // set memory to Read/Exec |
| printf("%d\n", c.getCode<int(*)()>()()); |
| } |
| ``` |
| |
| **Note**: See [../sample/test0.cpp](../sample/test0.cpp). |
| |
| ### AutoGrow |
| |
| The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`. |
| |
| Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address. |
| ``` |
| struct Code : Xbyak::CodeGenerator { |
| Code() |
| : Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow) |
| { |
| ... |
| } |
| }; |
| Code c; |
| // generate code for jit |
| c.ready(); // mode = Read/Write/Exec |
| ``` |
| |
| **Note**: |
| * Don't use the address returned by `getCurr()` before calling `ready()` because it may be invalid address. |
| |
| ### Read/Exec mode |
| Xbyak set Read/Write/Exec mode to memory to run jit code. |
| If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and |
| call `setProtectModeRE()` after generating jit code. |
| |
| ``` |
| struct Code : Xbyak::CodeGenerator { |
| Code() |
| : Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE) |
| { |
| mov(eax, 123); |
| ret(); |
| } |
| }; |
| |
| Code c; |
| c.setProtectModeRE(); |
| ... |
| |
| ``` |
| Call `readyRE()` instead of `ready()` when using `AutoGrow` mode. |
| See [protect-re.cpp](../sample/protect-re.cpp). |
| |
| ## Exception-less mode |
| If `XBYAK_NO_EXCEPTION` is defined, then gcc/clang can compile xbyak with `-fno-exceptions`. |
| In stead of throwing an exception, `Xbyak::GetError()` returns non-zero value (e.g. `ERR_BAD_ADDRESSING`) if there is something wrong. |
| The status will not be changed automatically, then you should reset it by `Xbyak::ClearError()`. |
| `CodeGenerator::reset()` calls `ClearError()`. |
| |
| ## Macro |
| |
| * **XBYAK32** is defined on 32bit. |
| * **XBYAK64** is defined on 64bit. |
| * **XBYAK64_WIN** is defined on 64bit Windows(VC). |
| * **XBYAK64_GCC** is defined on 64bit gcc, cygwin. |
| * define **XBYAK_USE_OP_NAMES** on gcc with `-fno-operator-names` if you want to use `and()`, .... |
| * define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(deprecated in the future). |
| * define **XBYAK_UNDEF_JNL** if Bessel function jnl is defined as macro. |
| * define **XBYAK_NO_EXCEPTION** for a compiler option `-fno-exceptions`. |
| * define **XBYAK_USE_MEMFD** on Linux then /proc/self/maps shows the area used by xbyak. |
| * define **XBYAK_OLD_DISP_CHECK** if the old disp check is necessary (deprecated in the future). |
| |
| ## Sample |
| |
| * [test0.cpp](../sample/test0.cpp) ; tiny sample (x86, x64) |
| * [quantize.cpp](../sample/quantize.cpp) ; JIT optimized quantization by fast division (x86 only) |
| * [calc.cpp](../sample/calc.cpp) ; assemble and estimate a given polynomial (x86, x64) |
| * [bf.cpp](../sample/bf.cpp) ; JIT brainfuck (x86, x64) |