/* HMLP: High-performance Machine Learning Primitives */
/* bli_avx512_macros.h */
1 #ifndef BLIS_AVX512_MACROS_H
2 #define BLIS_AVX512_MACROS_H
3 
4 //
5 // Assembly macros to make AVX-512 with AT&T syntax somewhat less painful
6 //
7 
8 #define COMMENT_BEGIN "#"
9 #define COMMENT_END
10 
11 #define STRINGIFY(...) #__VA_ARGS__
12 #define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
13 #define LABEL(label) STRINGIFY(label) ":\n\t"
14 
// Register-name macros. Each expands to the AT&T-syntax spelling with a
// doubled "%%" as required inside GCC extended-asm strings.

// Vector registers, selected by number: XMM(3) -> %%xmm3, etc.
#define XMM(x) %%xmm##x
#define YMM(x) %%ymm##x
#define ZMM(x) %%zmm##x
// 32-bit general-purpose registers.
#define EAX %%eax
#define EBX %%ebx
#define ECX %%ecx
#define EDX %%edx
#define EBP %%ebp
#define EDI %%edi
#define ESI %%esi
// 64-bit general-purpose registers.
#define RAX %%rax
#define RBX %%rbx
#define RCX %%rcx
#define RDX %%rdx
#define RBP %%rbp
#define RDI %%rdi
#define RSI %%rsi
// AVX-512 opmask registers: K(1) -> %%k1.
#define K(x) %%k##x
// Numbered 64-bit registers, by index or by name: R(12) -> %%r12.
#define R(x) %%r##x
#define R8 %%r8
#define R9 %%r9
#define R10 %%r10
#define R11 %%r11
#define R12 %%r12
#define R13 %%r13
#define R14 %%r14
#define R15 %%r15
// 32-bit low dwords of the numbered registers: RD(9) -> %%r9d.
#define RD(x) %%r##x##d
#define R8D %%r8d
#define R9D %%r9d
#define R10D %%r10d
#define R11D %%r11d
#define R12D %%r12d
#define R13D %%r13d
#define R14D %%r14d
#define R15D %%r15d
// Immediate operand: IMM(7) -> $7.
#define IMM(x) $##x
// Named extended-asm operand reference: VAR(a) -> %[a].
#define VAR(x) %[x]
53 
// Memory-operand constructors in AT&T syntax, selected by arity:
//   MEM(reg)                ->  (reg)
//   MEM(reg,disp)           ->  disp(reg)
//   MEM(reg,off,scale)      ->  (reg,off,scale)
//   MEM(reg,off,scale,disp) ->  disp(reg,off,scale)
#define MEM_4(reg,off,scale,disp) disp(reg,off,scale)
#define MEM_3(reg,off,scale) (reg,off,scale)
#define MEM_2(reg,disp) disp(reg)
#define MEM_1(reg) (reg)

// Broadcast variants appending the AVX-512 {1to8}/{1to16} decorator.
// These reference MEM, which is defined below; that is fine because
// function-like macros expand at their use site, not at definition.
#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%}
#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%}
#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%}
#define MEM_1TO8_1(reg) MEM(reg) %{1to8%}

#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%}
#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%}
#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%}
#define MEM_1TO16_1(reg) MEM(reg) %{1to16%}

// GET_MACRO selects the arity-specific implementation by padding the
// argument list so that NAME lands on the variant matching the count.
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__)
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__)
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__)
73 
// Opmask operand decorators: MASK_K(1) -> {%k1}; MASK_KZ additionally
// requests zero-masking with {z}.
#define MASK_K(n) %{%%k##n%}
#define MASK_KZ(n) %{%%k##n%}%{z%}
// Mask-register move (word form).
#define KMOV(to,from) ASM(kmovw from, to)
// Jump to label if any bit of kreg is set: kortestw sets ZF when the
// OR of the operands is zero, so jnz fires on a nonzero mask.
#define JKNZD(kreg,label) \
 ASM(kortestw kreg, kreg) \
 ASM(jnz label)
#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0)
#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0)
82 
// Align the next instruction to a 16- or 32-byte boundary.
#define ALIGN16 ASM(.p2align 4)
#define ALIGN32 ASM(.p2align 5)
// Read the time-stamp counter. (Fixed: was misspelled "rdstc", which
// would not assemble.)
#define RDTSC ASM(rdtsc)
// Instruction wrappers. Note the operand order: macro arguments are
// destination-first (Intel style), while the emitted AT&T text is
// source-first, so the arguments are reversed in each expansion.

// Moves and integer ALU operations.
#define MOV(_0, _1) ASM(mov _1, _0)
#define MOVD(_0, _1) ASM(movd _1, _0)
#define MOVL(_0, _1) ASM(movl _1, _0)
#define MOVQ(_0, _1) ASM(movq _1, _0)
#define VMOVD(_0, _1) ASM(vmovd _1, _0)
#define VMOVQ(_0, _1) ASM(vmovq _1, _0)
#define CMP(_0, _1) ASM(cmp _1, _0)
#define AND(_0, _1) ASM(and _1, _0)
#define ADD(_0, _1) ASM(add _1, _0)
#define SUB(_0, _1) ASM(sub _1, _0)
#define SAL(_0, _1) ASM(sal _1, _0)
#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0)
#define SAR(_0, _1) ASM(sar _1, _0)
// One-operand shift forms (shift by 1).
#define SAL1(_0) ASM(sal _0)
#define SAR1(_0) ASM(sar _0)
#define LEA(_0, _1) ASM(lea _1, _0)
#define TEST(_0, _1) ASM(test _1, _0)
#define DEC(_0) ASM(dec _0)
// Conditional and unconditional jumps.
#define JLE(_0) ASM(jle _0)
#define JL(_0) ASM(jl _0)
#define JNZ(_0) ASM(jnz _0)
#define JZ(_0) ASM(jz _0)
#define JNE(_0) ASM(jne _0)
#define JE(_0) ASM(je _0)
#define JNC(_0) ASM(jnc _0)
#define JC(_0) ASM(jc _0)
#define JMP(_0) ASM(jmp _0)
// Scalar compares.
#define VCOMISS(_0, _1) ASM(vcomiss _1, _0)
#define VCOMISD(_0, _1) ASM(vcomisd _1, _0)
// Gathers and scatters (dword/qword indexed, float/double elements).
#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0)
#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0)
#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0)
#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0)
#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0)
#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0)
#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0)
#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0)
// Multiplies, adds, logical ops, and fused multiply-adds.
#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0)
#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0)
#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0)
#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0)
#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0)
#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0)
#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0)
#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0)
#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0)
#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0)
#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0)
#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0)
#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0)
#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0)
#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0)
#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0)
// Vector loads/stores and broadcasts.
#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0)
#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0)
#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0)
#define VMOVSS(_0, _1) ASM(vmovss _1, _0)
#define VMOVSD(_0, _1) ASM(vmovsd _1, _0)
#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0)
#define VMOVUPS(_0, _1) ASM(vmovups _1, _0)
#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0)
#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0)
#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0)
#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0)
#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0)
#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0)
#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0)
// Lane insert/extract, unpack, shuffle, and permute.
#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0)
#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
#define VINSERTF128(_0, _1, _2) ASM(vinsertf128 _2, _1, _0)
#define VEXTRACTF128(_0, _1, _2) ASM(vextractf128 _2, _1, _0)
#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)
// Prefetches: PREFETCH(0, mem) -> prefetcht0 mem; the gather/scatter
// prefetch forms take the hint level as a pasted token as well.
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)
// Zero the upper YMM/ZMM state to avoid AVX/SSE transition penalties.
#define VZEROUPPER() ASM(vzeroupper)
172 
173 #endif