/**
 *  HMLP: High-Performance Machine Learning Primitives
 *  rank_k_d8x4.hpp -- rank-k update micro-kernels (8x8 single / 8x4 double).
 */
1 #include <stdio.h>
2 
3 
4 #include <hmlp_internal.hpp>
6 #include <avx_type.h>
7 
8 // #define DEBUG_MICRO 1
9 
10 
12 BLIS_GEMM_KERNEL(bli_sgemm_asm_8x8,float);
13 BLIS_GEMM_KERNEL(bli_dgemm_asm_8x4,double);
14 
15 
17 {
  // Micro-kernel blocking parameters consumed by the macro-kernel driver.
  const static size_t mr = 8;           // rows of the C micro-tile
  const static size_t nr = 8;           // columns of the C micro-tile
  const static size_t pack_mr = 8;      // row count of a packed A micro-panel
  const static size_t pack_nr = 8;      // column count of a packed B micro-panel
  const static size_t align_size = 32;  // packed-buffer alignment in bytes (AVX ymm width)
  const static bool row_major = false;  // kernel assumes column-major C -- see ldc usage below
24 
25 
  /**
   *  Strassen-style (multi-output) rank-k update -- NOT implemented for
   *  the single-precision 8x8 kernel.  Aborts loudly if ever dispatched
   *  here; signature is generated by the STRA_OPERATOR macro in
   *  hmlp_internal.hpp.
   */
  inline STRA_OPERATOR(float) const
  {
    printf( "no implementation\n" );
    exit( 1 );
  };
32 
33  inline GEMM_OPERATOR(float) const
34  {
35  float alpha = 1.0;
37  float beta = aux->pc ? 1.0 : 0.0;
39  bli_sgemm_asm_8x8
40  (
41  k,
42  &alpha,
43  a,
44  b,
45  &beta,
46  c, rs_c, cs_c,
47  aux
48  );
49  };
  /**
   *  Fused-output micro-kernel entry point (writes both a TC-typed C and
   *  an auxiliary double buffer v) -- NOT implemented for this kernel
   *  class; aborts if reached.
   */
  template<typename TC>
  inline void operator()
  (
    dim_t k,
    float *a,
    float *b,
    TC *c,
    float *v, inc_t rs_v, inc_t cs_v,
    // NOTE(review): a trailing parameter (original line 59, likely the
    // aux struct pointer) was lost in this extraction -- restore from VCS.
  )
  {
    printf( "no implementation\n" );
    exit( 1 );
  };
65 
66 };
74 {
  // Micro-kernel blocking parameters consumed by the macro-kernel driver.
  const static size_t mr = 8;           // rows of the C micro-tile
  const static size_t nr = 4;           // columns of the C micro-tile
  const static size_t pack_mr = 8;      // row count of a packed A micro-panel
  const static size_t pack_nr = 4;      // column count of a packed B micro-panel
  const static size_t align_size = 32;  // packed-buffer alignment in bytes (AVX ymm width)
  const static bool row_major = false;  // kernel assumes column-major C -- see ldc usage below
81 
82 
  /**
   *  Strassen-style rank-k micro-kernel (double precision, 8x4, AVX).
   *
   *  Computes one 8x4 panel product of the packed buffers a (8 x k) and
   *  b (k x 4), then, for each of the `len` output panels, scales the
   *  product by alpha_list[ i ] and accumulates into c_list[ i ]
   *  (column-major with leading dimension ldc):
   *
   *      c_list[ i ] += alpha_list[ i ] * ( a * b ),   i = 0 .. len-1
   *
   *  Parameter names ( k, a, b, len, alpha_list, c_list, ldc, aux ) are
   *  inferred from the asm input operands below; the actual signature is
   *  generated by the STRA_OPERATOR macro in hmlp_internal.hpp -- confirm
   *  against the macro definition.
   *
   *  NOTE(review): the prefetch loop skips NULL entries of c_list, but
   *  the store loop (.DSTORELOOP) dereferences c_list[ i ] without a
   *  NULL check -- verify callers never pass NULL within the first `len`
   *  slots, or the store loop will fault.
   */
  inline STRA_OPERATOR(double) const
  {
    unsigned long long len64 = (unsigned long long)(len);
    unsigned long long ldc64 = (unsigned long long)(ldc);
    unsigned long long k_iter = (unsigned long long)k / 4;  // unrolled-by-4 main-loop trips
    unsigned long long k_left = (unsigned long long)k % 4;  // leftover rank-1 updates

    double *b_next = (double *)(aux->b_next);  // next packed B panel, for prefetching

    __asm__ volatile
    (
    " \n\t"
    " \n\t"
    "movq %2, %%rax \n\t" // load address of a. ( v )
    "movq %3, %%rbx \n\t" // load address of b. ( v )
    "movq %8, %%r15 \n\t" // load address of b_next. ( v )
    "addq $-4 * 64, %%r15 \n\t" // rewind b_next by 256B so later prefetch offsets line up -- intent unconfirmed ( ? )
    " \n\t"
    "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
    "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b.
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    " \n\t"
    " \n\t"
    "movq %7, %%rdi \n\t" // load ldc
    "leaq (,%%rdi,8), %%rdi \n\t" // ldc * sizeof(double)
    " \n\t"
    "movq %6, %%rcx \n\t" // load address of c_list[ 0 ]
    " \n\t"
    "movq %5, %%rsi \n\t" // i = len; ( v )
    " \n\t"
    ".DPREFETCHLOOP%=: \n\t"
    " \n\t"
    "movq 0 * 8(%%rcx), %%rdx \n\t" // load address of c_list[ i ]: rdx = c_list[ i ] ( address )
    " \n\t"
    "testq %%rdx, %%rdx \n\t" // check rdx via logical AND. ( v )
    "je .DC%=1NULL \n\t" // if rdx == 0 (NULL panel), skip its prefetches ( v )
    "leaq (%%rdx,%%rdi,2), %%r11 \n\t" // load address of c_list[ i ] + 2 * ldc;
    "prefetcht0 3 * 8(%%rdx) \n\t" // prefetch c_list[ i ] + 0 * ldc
    "prefetcht0 3 * 8(%%rdx,%%rdi) \n\t" // prefetch c_list[ i ] + 1 * ldc
    "prefetcht0 3 * 8(%%r11) \n\t" // prefetch c_list[ i ] + 2 * ldc
    "prefetcht0 3 * 8(%%r11,%%rdi) \n\t" // prefetch c_list[ i ] + 3 * ldc
    " \n\t"
    ".DC%=1NULL: \n\t" // if C1 == NULL, code to jump
    " \n\t"
    "addq $1 * 8, %%rcx \n\t" // c_list += 8 (one pointer)
    " \n\t"
    "decq %%rsi \n\t" // i -= 1;
    "jne .DPREFETCHLOOP%= \n\t" // iterate again if i != 0.
    " \n\t"
    " \n\t"
    " \n\t"
    "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" // zero the eight accumulator registers ( v )
    "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t"
    "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t"
    "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t"
    "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
    "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
    "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
    "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    "movq %0, %%rsi \n\t" // i = k_iter; ( v )
    "testq %%rsi, %%rsi \n\t" // check i via logical AND. ( v )
    "je .DCONSIDKLEFT%= \n\t" // if i == 0, jump to code that ( v )
    " \n\t" // contains the k_left loop.
    " \n\t"
    " \n\t"
    ".DLOOPKITER%=: \n\t" // MAIN LOOP (4 rank-1 updates per trip)
    " \n\t"
    "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 (unroll x nr) ( v )
    " \n\t"
    " \n\t" // iteration 0
    "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 0
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" // ymm6 ( c_tmp0 ) = ymm0 ( a03 ) * ymm2( b0 )
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" // ymm4 ( b0x3_0 )
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" // ymm7 ( c_tmp1 ) = ymm0 ( a03 ) * ymm3( b0x5 )
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" // ymm5 ( b0x3_1 )
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" // ymm15 ( c_03_0 ) += ymm6( c_tmp0 )
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" // ymm13 ( c_03_1 ) += ymm7( c_tmp1 )
    " \n\t"
    "prefetcht0 16 * 32(%%rax) \n\t" // prefetch a03 for iter 1
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 1
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 1
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    "prefetcht0 0 * 32(%%r15) \n\t" // prefetch b_next[0*4]
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t"
    " \n\t" // iteration 1
    "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 1
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "prefetcht0 18 * 32(%%rax) \n\t" // prefetch a for a later iteration -- distance unconfirmed ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 2
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 2
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t"
    " \n\t" // iteration 2
    "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 2
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "prefetcht0 20 * 32(%%rax) \n\t" // prefetch a for a later iteration -- distance unconfirmed ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 3
    "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr)
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 3
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4]
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t"
    " \n\t" // iteration 3
    "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 3
    "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr)
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "prefetcht0 14 * 32(%%rax) \n\t" // prefetch a (rax already advanced) -- distance unconfirmed ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 4
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 4
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    "decq %%rsi \n\t" // i -= 1;
    "jne .DLOOPKITER%= \n\t" // iterate again if i != 0.
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    ".DCONSIDKLEFT%=: \n\t"
    " \n\t"
    "movq %1, %%rsi \n\t" // i = k_left;
    "testq %%rsi, %%rsi \n\t" // check i via logical AND.
    "je .DPOSTACCUM%= \n\t" // if i == 0, we're done; jump to end.
    " \n\t" // else, we prepare to enter k_left loop.
    " \n\t"
    " \n\t"
    ".DLOOPKLEFT%=: \n\t" // EDGE LOOP (one rank-1 update per trip)
    " \n\t"
    "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" // preload a47
    "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr)
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "prefetcht0 14 * 32(%%rax) \n\t" // prefetch a03 ahead -- distance unconfirmed ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t"
    "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr)
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 0 * 32(%%rax), %%ymm0 \n\t"
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t"
    "decq %%rsi \n\t" // i -= 1;
    "jne .DLOOPKLEFT%= \n\t" // iterate again if i != 0.
    " \n\t"
    " \n\t"
    " \n\t"
    ".DPOSTACCUM%=: \n\t" // un-shuffle the butterfly-accumulated product
    " \n\t"
    " \n\t"
    " \n\t" // ymm15: ymm13: ymm11: ymm9:
    " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
    " \n\t" // ab11 ab10 ab13 ab12
    " \n\t" // ab22 ab23 ab20 ab21
    " \n\t" // ab33 ) ab32 ) ab31 ) ab30 )
    " \n\t"
    " \n\t" // ymm14: ymm12: ymm10: ymm8:
    " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
    " \n\t" // ab51 ab50 ab53 ab52
    " \n\t" // ab62 ab63 ab60 ab61
    " \n\t" // ab73 ) ab72 ) ab71 ) ab70 )
    " \n\t"
    "vmovapd %%ymm15, %%ymm7 \n\t"
    "vshufpd $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t"
    "vshufpd $0xa, %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "vmovapd %%ymm11, %%ymm7 \n\t"
    "vshufpd $0xa, %%ymm11, %%ymm9, %%ymm11 \n\t"
    "vshufpd $0xa, %%ymm9, %%ymm7, %%ymm9 \n\t"
    " \n\t"
    "vmovapd %%ymm14, %%ymm7 \n\t"
    "vshufpd $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t"
    "vshufpd $0xa, %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmovapd %%ymm10, %%ymm7 \n\t"
    "vshufpd $0xa, %%ymm10, %%ymm8, %%ymm10 \n\t"
    "vshufpd $0xa, %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t" // ymm15: ymm13: ymm11: ymm9:
    " \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
    " \n\t" // ab11 ab10 ab13 ab12
    " \n\t" // ab23 ab22 ab21 ab20
    " \n\t" // ab33 ) ab32 ) ab31 ) ab30 )
    " \n\t"
    " \n\t" // ymm14: ymm12: ymm10: ymm8:
    " \n\t" // ( ab41 ( ab40 ( ab43 ( ab42
    " \n\t" // ab51 ab50 ab53 ab52
    " \n\t" // ab63 ab62 ab61 ab60
    " \n\t" // ab73 ) ab72 ) ab71 ) ab70 )
    " \n\t"
    "vmovapd %%ymm15, %%ymm7 \n\t"
    "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t"
    "vperm2f128 $0x12, %%ymm7, %%ymm11, %%ymm11 \n\t"
    " \n\t"
    "vmovapd %%ymm13, %%ymm7 \n\t"
    "vperm2f128 $0x30, %%ymm13, %%ymm9, %%ymm13 \n\t"
    "vperm2f128 $0x12, %%ymm7, %%ymm9, %%ymm9 \n\t"
    " \n\t"
    "vmovapd %%ymm14, %%ymm7 \n\t"
    "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t"
    "vperm2f128 $0x12, %%ymm7, %%ymm10, %%ymm10 \n\t"
    " \n\t"
    "vmovapd %%ymm12, %%ymm7 \n\t"
    "vperm2f128 $0x30, %%ymm12, %%ymm8, %%ymm12 \n\t"
    "vperm2f128 $0x12, %%ymm7, %%ymm8, %%ymm8 \n\t"
    " \n\t"
    " \n\t" // ymm9: ymm11: ymm13: ymm15:
    " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
    " \n\t" // ab10 ab11 ab12 ab13
    " \n\t" // ab20 ab21 ab22 ab23
    " \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
    " \n\t"
    " \n\t" // ymm8: ymm10: ymm12: ymm14:
    " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
    " \n\t" // ab50 ab51 ab52 ab53
    " \n\t" // ab60 ab61 ab62 ab63
    " \n\t" // ab70 ) ab71 ) ab72 ) ab73 )
    " \n\t"
    " \n\t"
    "movq %4, %%rax \n\t" // load address of alpha_list[ 0 ]
    "movq %6, %%rcx \n\t" // load address of c_list[ 0 ]
    " \n\t"
    " \n\t"
    "movq %5, %%rsi \n\t" // i = len; ( v )
    " \n\t"
    ".DSTORELOOP%=: \n\t" // scale by alpha_list[ i ], accumulate into c_list[ i ]
    " \n\t"
    "movq 0 * 8(%%rcx), %%rdx \n\t" // rdx = c_list[ i ] ( address )
    " \n\t"
    //"movq 0 * 8(%%rax), %%rbx \n\t" // load address of alpha_list[ i ]
    //"vbroadcastsd (%%rbx), %%ymm6 \n\t" // load alpha_list[ 1 ] and duplicate
    "vbroadcastsd (%%rax), %%ymm6 \n\t" // load alpha_list[ i ] and duplicate
    " \n\t"
    " \n\t"
    //"jmp .DDONE%= \n\t"
    "vmovapd 0 * 32(%%rdx), %%ymm0 \n\t" // ymm0 = c_list[ i ]( 0:3, 0 )
    "vmulpd %%ymm6, %%ymm9, %%ymm1 \n\t" // ymm1 = alpha * ab( 0:3, 0 )
    "vaddpd %%ymm1, %%ymm0, %%ymm1 \n\t" // ymm1 = ymm0 + ymm1
    "vmovapd %%ymm1, 0 * 32(%%rdx) \n\t" // store back: c_list[ i ]( 0:3, 0 )
    "vmovapd 1 * 32(%%rdx), %%ymm3 \n\t" // ymm3 = c_list[ i ]( 4:7, 0 )
    "vmulpd %%ymm6, %%ymm8, %%ymm2 \n\t" // ymm2 = alpha * ab( 4:7, 0 )
    "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" // ymm2 = ymm3 + ymm2
    "vmovapd %%ymm2, 1 * 32(%%rdx) \n\t" // store back: c_list[ i ]( 4:7, 0 )
    "addq %%rdi, %%rdx \n\t" // advance to column 1
    " \n\t"
    "vmovapd 0 * 32(%%rdx), %%ymm0 \n\t" // ymm0 = c_list[ i ]( 0:3, 1 )
    "vmulpd %%ymm6, %%ymm11, %%ymm1 \n\t" // ymm1 = alpha * ab( 0:3, 1 )
    "vaddpd %%ymm1, %%ymm0, %%ymm1 \n\t" // ymm1 = ymm0 + ymm1
    "vmovapd %%ymm1, 0 * 32(%%rdx) \n\t" // store back: c_list[ i ]( 0:3, 1 )
    "vmovapd 1 * 32(%%rdx), %%ymm3 \n\t" // ymm3 = c_list[ i ]( 4:7, 1 )
    "vmulpd %%ymm6, %%ymm10, %%ymm2 \n\t" // ymm2 = alpha * ab( 4:7, 1 )
    "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" // ymm2 = ymm3 + ymm2
    "vmovapd %%ymm2, 1 * 32(%%rdx) \n\t" // store back: c_list[ i ]( 4:7, 1 )
    "addq %%rdi, %%rdx \n\t" // advance to column 2
    " \n\t"
    "vmovapd 0 * 32(%%rdx), %%ymm0 \n\t" // ymm0 = c_list[ i ]( 0:3, 2 )
    "vmulpd %%ymm6, %%ymm13, %%ymm1 \n\t" // ymm1 = alpha * ab( 0:3, 2 )
    "vaddpd %%ymm1, %%ymm0, %%ymm1 \n\t" // ymm1 = ymm0 + ymm1
    "vmovapd %%ymm1, 0 * 32(%%rdx) \n\t" // store back: c_list[ i ]( 0:3, 2 )
    "vmovapd 1 * 32(%%rdx), %%ymm3 \n\t" // ymm3 = c_list[ i ]( 4:7, 2 )
    "vmulpd %%ymm6, %%ymm12, %%ymm2 \n\t" // ymm2 = alpha * ab( 4:7, 2 )
    "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" // ymm2 = ymm3 + ymm2
    "vmovapd %%ymm2, 1 * 32(%%rdx) \n\t" // store back: c_list[ i ]( 4:7, 2 )
    "addq %%rdi, %%rdx \n\t" // advance to column 3
    " \n\t"
    "vmovapd 0 * 32(%%rdx), %%ymm0 \n\t" // ymm0 = c_list[ i ]( 0:3, 3 )
    "vmulpd %%ymm6, %%ymm15, %%ymm1 \n\t" // ymm1 = alpha * ab( 0:3, 3 )
    "vaddpd %%ymm1, %%ymm0, %%ymm1 \n\t" // ymm1 = ymm0 + ymm1
    "vmovapd %%ymm1, 0 * 32(%%rdx) \n\t" // store back: c_list[ i ]( 0:3, 3 )
    "vmovapd 1 * 32(%%rdx), %%ymm3 \n\t" // ymm3 = c_list[ i ]( 4:7, 3 )
    "vmulpd %%ymm6, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha * ab( 4:7, 3 )
    "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" // ymm2 = ymm3 + ymm2
    "vmovapd %%ymm2, 1 * 32(%%rdx) \n\t" // store back: c_list[ i ]( 4:7, 3 )
    " \n\t"
    " \n\t"
    "addq $1 * 8, %%rcx \n\t" // c_list += 8 (one pointer)
    "addq $1 * 8, %%rax \n\t" // alpha_list += 8 (one double)
    " \n\t"
    "decq %%rsi \n\t" // i -= 1;
    "jne .DSTORELOOP%= \n\t" // iterate again if i != 0.
    " \n\t"
    ".DDONE%=: \n\t"
    " \n\t"
    : // output operands (none)
    : // input operands
    "m" (k_iter), // 0
    "m" (k_left), // 1
    "m" (a), // 2
    "m" (b), // 3
    "m" (alpha_list), // 4
    "m" (len64), // 5
    "m" (c_list), // 6
    "m" (ldc64), // 7
    "m" (b_next) // 8
    : // register clobber list
    "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
    "r10", "r11", "r12", "r13", "r14", "r15",
    "xmm0", "xmm1", "xmm2", "xmm3",
    "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9", "xmm10", "xmm11",
    "xmm12", "xmm13", "xmm14", "xmm15",
    "memory"
    );

  };
491  inline GEMM_OPERATOR(double) const
492  {
493  double alpha = 1.0;
495  double beta = aux->pc ? 1.0 : 0.0;
497  bli_dgemm_asm_8x4
498  (
499  k,
500  &alpha,
501  a,
502  b,
503  &beta,
504  c, rs_c, cs_c,
505  aux
506  );
507  };
  /**
   *  Fused-output micro-kernel entry point (writes both a TC-typed C and
   *  an auxiliary double buffer v) -- NOT implemented for this kernel
   *  class; aborts if reached.
   */
  template<typename TC>
  inline void operator()
  (
    dim_t k,
    double *a,
    double *b,
    TC *c,
    double *v, inc_t rs_v, inc_t cs_v,
    // NOTE(review): a trailing parameter (original line 517, likely the
    // aux struct pointer) was lost in this extraction -- restore from VCS.
  )
  {
    printf( "no implementation\n" );
    exit( 1 );
  };
523 
524 };
// --- doxygen cross-reference residue (not source code) ---
// Definition: rank_k_d8x4.hpp:16
// STRA_OPERATOR(double) const        -- Definition: rank_k_d8x4.hpp:84
// STRA_OPERATOR(float) const         -- Definition: rank_k_d8x4.hpp:27
// GEMM_OPERATOR(float) const         -- Definition: rank_k_d8x4.hpp:33
// GEMM_OPERATOR(double) const        -- Definition: rank_k_d8x4.hpp:491
// Definition: rank_k_d8x4.hpp:74
// Definition: hmlp_internal.hpp:38