HMLP: High-performance Machine Learning Primitives
Main Page
Namespaces
Classes
Files
File List
bli_avx512_macros.h
1
#ifndef BLIS_AVX512_MACROS_H
2
#define BLIS_AVX512_MACROS_H
3
4
//
5
// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful
6
//
7
8
// Comment delimiters: COMMENT_BEGIN is the string "#", COMMENT_END expands
// to nothing.
// NOTE(review): "#" is presumably the assembler's line-comment character --
// confirm against the target assembler.
#define COMMENT_BEGIN "#"
#define COMMENT_END

// STRINGIFY(...) turns its argument tokens into a single string literal.
#define STRINGIFY(...) #__VA_ARGS__

// ASM(...) produces one line of assembly text: the arguments are
// macro-expanded (they pass through one extra macro level before being
// stringified) and the result is followed by "\n\t".
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"

// LABEL(label) produces "label:" followed by "\n\t".
#define LABEL(label) STRINGIFY(label) ":\n\t"
14
15
// Register and operand spellings. Every register macro expands to the
// register name prefixed with "%%".
// NOTE(review): the doubled percent sign is presumably there because the
// resulting text is used as a GCC-style extended-asm template, where "%%"
// yields a literal '%' -- confirm against the call sites.

// Vector registers selected by index: XMM(3) -> %%xmm3, etc.
#define XMM(n) %%xmm##n
#define YMM(n) %%ymm##n
#define ZMM(n) %%zmm##n

// 32-bit general-purpose registers.
#define EAX %%eax
#define EBX %%ebx
#define ECX %%ecx
#define EDX %%edx
#define EBP %%ebp
#define EDI %%edi
#define ESI %%esi

// 64-bit general-purpose registers.
#define RAX %%rax
#define RBX %%rbx
#define RCX %%rcx
#define RDX %%rdx
#define RBP %%rbp
#define RDI %%rdi
#define RSI %%rsi

// Opmask register selected by index: K(1) -> %%k1.
#define K(n) %%k##n

// Numbered 64-bit registers: R(12) -> %%r12, plus fixed-name shorthands.
#define R(n) %%r##n
#define R8 %%r8
#define R9 %%r9
#define R10 %%r10
#define R11 %%r11
#define R12 %%r12
#define R13 %%r13
#define R14 %%r14
#define R15 %%r15

// 32-bit ("d"-suffixed) forms of the numbered registers: RD(9) -> %%r9d.
#define RD(n) %%r##n##d
#define R8D %%r8d
#define R9D %%r9d
#define R10D %%r10d
#define R11D %%r11d
#define R12D %%r12d
#define R13D %%r13d
#define R14D %%r14d
#define R15D %%r15d

// Immediate operand: pastes '$' directly onto the argument. Because of the
// token paste, the argument must begin with a token that can be pasted to
// '$' (e.g. a number or identifier).
#define IMM(x) $##x

// Named operand reference: VAR(alpha) -> %[alpha].
#define VAR(x) %[x]
53
54
// Memory operands. The numeric suffix is the number of arguments taken:
//   MEM_1(base)                  -> (base)
//   MEM_2(base,disp)             -> disp(base)
//   MEM_3(base,index,scale)      -> (base,index,scale)
//   MEM_4(base,index,scale,disp) -> disp(base,index,scale)
#define MEM_4(base,index,scale,disp) disp(base,index,scale)
#define MEM_3(base,index,scale) (base,index,scale)
#define MEM_2(base,disp) disp(base)
#define MEM_1(base) (base)

// Same operands followed by " %{1to8%}".
// NOTE(review): "%{" / "%}" presumably become literal braces when the text
// is used as an extended-asm template, giving the {1to8} broadcast
// decorator -- confirm.
#define MEM_1TO8_4(base,index,scale,disp) MEM(base,index,scale,disp) %{1to8%}
#define MEM_1TO8_3(base,index,scale) MEM(base,index,scale) %{1to8%}
#define MEM_1TO8_2(base,disp) MEM(base,disp) %{1to8%}
#define MEM_1TO8_1(base) MEM(base) %{1to8%}

// Same operands followed by " %{1to16%}".
#define MEM_1TO16_4(base,index,scale,disp) MEM(base,index,scale,disp) %{1to16%}
#define MEM_1TO16_3(base,index,scale) MEM(base,index,scale) %{1to16%}
#define MEM_1TO16_2(base,disp) MEM(base,disp) %{1to16%}
#define MEM_1TO16_1(base) MEM(base) %{1to16%}

// GET_MACRO yields its fifth argument. The wrappers below pass the caller's
// arguments followed by the _4, _3, _2, _1 variants, so one to four caller
// arguments select the variant with the matching suffix.
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__)
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__)
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__)
73
74
// Opmask decorators: MASK_K(1) -> %{%%k1%}, MASK_KZ(1) -> %{%%k1%}%{z%}.
// NOTE(review): "%{" / "%}" presumably become literal braces in an
// extended-asm template, giving {%k1} and {%k1}{z} -- confirm.
#define MASK_K(n) %{%%k##n%}
#define MASK_KZ(n) %{%%k##n%}%{z%}

// The instruction macros below expand through ASM(...), which this header
// defines elsewhere. Operands are written destination-first in the macro
// call and emitted in reverse, i.e. in AT&T order.

// kmovw from, to
#define KMOV(to,from) ASM(kmovw from, to)

// Two instructions: "kortestw kreg, kreg" followed by "jnz label".
#define JKNZD(kreg,label) \
    ASM(kortestw kreg, kreg) \
    ASM(jnz label)

#define KXNORW(dst, src1, src2) ASM(kxnorw src2, src1, dst)
#define KSHIFTRW(dst, src, count) ASM(kshiftrw count, src, dst)
82
83
// Alignment directives, emitted through ASM(...): ".p2align 4" and
// ".p2align 5". The names reflect 2^4 = 16 and 2^5 = 32.
#define ALIGN16 ASM(.p2align 4)
#define ALIGN32 ASM(.p2align 5)
85
// Emits the x86 "rdtsc" (read time-stamp counter) instruction via ASM(...).
#define RDTSC ASM(rdtsc)
86
// Scalar/integer instructions and branches. Each macro expands through
// ASM(...), defined elsewhere in this header, to one line of assembly text.
// Operands are written destination-first in the macro call and emitted in
// reverse, i.e. in AT&T order: MOV(dst, src) -> "mov src, dst".

// Moves.
#define MOV(dst, src) ASM(mov src, dst)
#define MOVD(dst, src) ASM(movd src, dst)
#define MOVL(dst, src) ASM(movl src, dst)
#define MOVQ(dst, src) ASM(movq src, dst)
#define VMOVD(dst, src) ASM(vmovd src, dst)
#define VMOVQ(dst, src) ASM(vmovq src, dst)

// Arithmetic, logic and shifts. CMP(a, b) emits "cmp b, a"; TEST likewise.
#define CMP(a, b) ASM(cmp b, a)
#define AND(dst, src) ASM(and src, dst)
#define ADD(dst, src) ASM(add src, dst)
#define SUB(dst, src) ASM(sub src, dst)
#define SAL(dst, count) ASM(sal count, dst)
#define SHLX(dst, src, count) ASM(shlx count, src, dst)
#define SAR(dst, count) ASM(sar count, dst)
// Single-operand shift forms (no explicit count operand is emitted).
#define SAL1(dst) ASM(sal dst)
#define SAR1(dst) ASM(sar dst)
#define LEA(dst, addr) ASM(lea addr, dst)
#define TEST(a, b) ASM(test b, a)
#define DEC(dst) ASM(dec dst)

// Conditional and unconditional jumps to a label.
#define JLE(label) ASM(jle label)
#define JL(label) ASM(jl label)
#define JNZ(label) ASM(jnz label)
#define JZ(label) ASM(jz label)
#define JNE(label) ASM(jne label)
#define JE(label) ASM(je label)
#define JNC(label) ASM(jnc label)
#define JC(label) ASM(jc label)
#define JMP(label) ASM(jmp label)
113
// Vector compare and gather/scatter instructions. Each macro expands through
// ASM(...), defined elsewhere in this header. Operands are written
// destination-first in the macro call and emitted in reverse (AT&T order).

// Scalar ordered compares: VCOMISS(a, b) -> "vcomiss b, a".
#define VCOMISS(a, b) ASM(vcomiss b, a)
#define VCOMISD(a, b) ASM(vcomisd b, a)

// Gathers and scatters, grouped by index width (D/Q) and element type
// (PS/PD) as spelled in the mnemonic.
#define VGATHERDPS(dst, src) ASM(vgatherdps src, dst)
#define VSCATTERDPS(dst, src) ASM(vscatterdps src, dst)
#define VGATHERDPD(dst, src) ASM(vgatherdpd src, dst)
#define VSCATTERDPD(dst, src) ASM(vscatterdpd src, dst)
#define VGATHERQPS(dst, src) ASM(vgatherqps src, dst)
#define VSCATTERQPS(dst, src) ASM(vscatterqps src, dst)
#define VGATHERQPD(dst, src) ASM(vgatherqpd src, dst)
#define VSCATTERQPD(dst, src) ASM(vscatterqpd src, dst)
123
// Vector arithmetic. Each macro expands through ASM(...), defined elsewhere
// in this header. Operands are written destination-first in the macro call
// and emitted in reverse (AT&T order):
//   VMULPD(dst, src1, src2) -> "vmulpd src2, src1, dst".

// Floating-point multiplies (scalar and packed).
#define VMULSS(dst, src1, src2) ASM(vmulss src2, src1, dst)
#define VMULSD(dst, src1, src2) ASM(vmulsd src2, src1, dst)
#define VMULPS(dst, src1, src2) ASM(vmulps src2, src1, dst)
#define VMULPD(dst, src1, src2) ASM(vmulpd src2, src1, dst)

// Packed integer multiply, add and shift.
#define VPMULLD(dst, src1, src2) ASM(vpmulld src2, src1, dst)
#define VPMULLQ(dst, src1, src2) ASM(vpmullq src2, src1, dst)
#define VPADDD(dst, src1, src2) ASM(vpaddd src2, src1, dst)
#define VPSLLD(dst, src, count) ASM(vpslld count, src, dst)

// Bitwise XOR.
#define VPXORD(dst, src1, src2) ASM(vpxord src2, src1, dst)
#define VXORPD(dst, src1, src2) ASM(vxorpd src2, src1, dst)

// Fused multiply-add. The three operands are passed first-to-last and
// emitted last-to-first; the 132/213/231 digits in the mnemonic select
// which operands are multiplied and which is added.
#define VFMADD132PS(op1, op2, op3) ASM(vfmadd132ps op3, op2, op1)
#define VFMADD213PS(op1, op2, op3) ASM(vfmadd213ps op3, op2, op1)
#define VFMADD231PS(op1, op2, op3) ASM(vfmadd231ps op3, op2, op1)
#define VFMADD132PD(op1, op2, op3) ASM(vfmadd132pd op3, op2, op1)
#define VFMADD213PD(op1, op2, op3) ASM(vfmadd213pd op3, op2, op1)
#define VFMADD231PD(op1, op2, op3) ASM(vfmadd231pd op3, op2, op1)
139
// Vector moves and broadcasts. Each macro expands through ASM(...), defined
// elsewhere in this header. Operands are written destination-first in the
// macro call and emitted in reverse (AT&T order):
//   VMOVAPS(dst, src) -> "vmovaps src, dst".

// Integer vector moves.
#define VMOVDQA(dst, src) ASM(vmovdqa src, dst)
#define VMOVDQA32(dst, src) ASM(vmovdqa32 src, dst)
#define VMOVDQA64(dst, src) ASM(vmovdqa64 src, dst)

// Floating-point scalar and packed moves (A = aligned, U = unaligned
// mnemonic variants).
#define VMOVSS(dst, src) ASM(vmovss src, dst)
#define VMOVSD(dst, src) ASM(vmovsd src, dst)
#define VMOVAPS(dst, src) ASM(vmovaps src, dst)
#define VMOVUPS(dst, src) ASM(vmovups src, dst)
#define VMOVAPD(dst, src) ASM(vmovapd src, dst)
#define VMOVUPD(dst, src) ASM(vmovupd src, dst)

// Broadcasts.
#define VBROADCASTSS(dst, src) ASM(vbroadcastss src, dst)
#define VBROADCASTSD(dst, src) ASM(vbroadcastsd src, dst)
#define VPBROADCASTD(dst, src) ASM(vpbroadcastd src, dst)
#define VPBROADCASTQ(dst, src) ASM(vpbroadcastq src, dst)
#define VBROADCASTF64X4(dst, src) ASM(vbroadcastf64x4 src, dst)
153
// Lane insert/extract, unpack and shuffle instructions. Each macro expands
// through ASM(...), defined elsewhere in this header. Operands are written
// destination-first in the macro call and emitted in reverse (AT&T order),
// so a trailing immediate argument ends up first in the emitted text.

#define VINSERTF64X4(dst, src1, src2, imm) ASM(vinsertf64x4 imm, src2, src1, dst)
#define VEXTRACTF64X4(dst, src, imm) ASM(vextractf64x4 imm, src, dst)

// NOTE(review): this macro accepts only three operands, whereas the sibling
// VINSERTF64X4 accepts four (including an immediate). The vinsertf128
// instruction normally takes an immediate as well -- confirm whether a
// fourth parameter is missing here before relying on this macro.
#define VINSERTF128(op0, op1, op2) ASM(vinsertf128 op2, op1, op0)
#define VEXTRACTF128(dst, src, imm) ASM(vextractf128 imm, src, dst)

#define VUNPCKLPD(dst, src1, src2) ASM(vunpcklpd src2, src1, dst)
#define VUNPCKHPD(dst, src1, src2) ASM(vunpckhpd src2, src1, dst)
#define VSHUFF64X2(dst, src1, src2, imm) ASM(vshuff64x2 imm, src2, src1, dst)
#define VUNPCKLPS(dst, src1, src2) ASM(vunpcklps src2, src1, dst)
#define VUNPCKHPS(dst, src1, src2) ASM(vunpckhps src2, src1, dst)
#define VSHUFPS(dst, src1, src2, imm) ASM(vshufps imm, src2, src1, dst)
#define VPERM2F128(dst, src1, src2, imm) ASM(vperm2f128 imm, src2, src1, dst)
164
// Prefetch instructions and vzeroupper. Each macro expands through ASM(...),
// defined elsewhere in this header.

// PREFETCH(level, addr): the level is pasted onto "prefetcht", e.g.
// PREFETCH(0, addr) -> "prefetcht0 addr".
#define PREFETCH(level, addr) ASM(prefetcht##level addr)
#define PREFETCHW0(addr) ASM(prefetchw addr)
#define PREFETCHW1(addr) ASM(prefetchwt1 addr)

// Gather/scatter prefetches: the level is pasted into the middle of the
// mnemonic, e.g. VGATHERPFDPS(0, addr) -> "vgatherpf0dps addr".
#define VGATHERPFDPS(level, addr) ASM(vgatherpf##level##dps addr)
#define VSCATTERPFDPS(level, addr) ASM(vscatterpf##level##dps addr)
#define VGATHERPFDPD(level, addr) ASM(vgatherpf##level##dpd addr)
#define VSCATTERPFDPD(level, addr) ASM(vscatterpf##level##dpd addr)

// Takes no arguments; emits "vzeroupper".
#define VZEROUPPER() ASM(vzeroupper)
172
173
#endif
kernel
x86_64
skx
bli_avx512_macros.h
Generated by
1.8.11