HMLP: High-performance Machine Learning Primitives
conv_relu_pool2x2_d8x4.hpp
#include <stdio.h>
#include <immintrin.h> // AVX

#include <hmlp_internal.hpp>
#include <avx_type.h> // self-defined vector type

// #define DEBUG_MICRO 1

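// This header provides three variants of the same fused micro-kernel for an
// 8x4 double-precision tile: a scalar reference version, an AVX-intrinsics
// version, and a hand-written inline-assembly version. Each computes a
// rank-k update C = A * B, applies ReLU, and max-pools the four columns
// (one 2x2 pooling window) into a single output column.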

// TODO: not yet implemented
struct conv_relu_pool2x2_ref_d8x4
{
  inline void operator()(
      int k,
      double *a,
      double *b,
      double *c, int ldc,
      aux_s<double, double, double, double> *aux ) const
  {
    double c_reg[ 8 * 4 ] = { 0.0 };

    for ( int p = 0; p < k; p ++ )
    {
      #pragma unroll
      for ( int j = 0; j < 4; j ++ )
      {
        #pragma unroll
        for ( int i = 0; i < 8; i ++ )
        {
          c_reg[ j * 8 + i ] += a[ p * 8 + i ] * b[ p * 4 + j ];
        }
      }
    }

    if ( aux->pc ) // accumulate partial results from earlier rank-k panels
    {
      #pragma unroll
      for ( int j = 0; j < 4; j ++ )
      {
        #pragma unroll
        for ( int i = 0; i < 8; i ++ )
        {
          c[ j * ldc + i ] += c_reg[ j * 8 + i ];
        }
      }
    }
    else
    {
      #pragma unroll
      for ( int j = 0; j < 4; j ++ )
      {
        #pragma unroll
        for ( int i = 0; i < 8; i ++ )
        {
          c[ j * ldc + i ] = c_reg[ j * 8 + i ];
        }
      }
    }

#ifdef DEBUG_MICRO
    printf( "rank_k_ref_d8x4:" );
    for ( int i = 0; i < 8; i ++ )
    {
      for ( int j = 0; j < 4; j ++ )
      {
        printf( "%E ", c[ j * ldc + i ] );
      }
      printf( "\n" );
    }
#endif
  }
};
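

// A minimal scalar sketch (not part of the original kernels; the function
// name is illustrative) of the full fused operation that the vectorized
// kernels below compute: an 8x4 rank-k update C = A * B, then ReLU, then a
// max-reduction over the four columns -- the 2x2 pooling window -- leaving
// a single 8x1 output column. The reference struct above implements only
// the rank-k part; this sketch adds the relu + pool step for clarity.
static inline void conv_relu_pool2x2_scalar_sketch(
    int k,
    const double *a,  // packed 8 x k micro-panel, stored with stride 8
    const double *b,  // packed k x 4 micro-panel, stored with stride 4
    double *out )     // the 8 pooled outputs
{
  double c[ 8 * 4 ] = { 0.0 };
  // rank-k update: c( i, j ) = sum_p a( i, p ) * b( p, j )
  for ( int p = 0; p < k; p ++ )
    for ( int j = 0; j < 4; j ++ )
      for ( int i = 0; i < 8; i ++ )
        c[ j * 8 + i ] += a[ p * 8 + i ] * b[ p * 4 + j ];
  for ( int i = 0; i < 8; i ++ )
  {
    double v = c[ i ]; // column 0
    for ( int j = 1; j < 4; j ++ ) // pool: max over the four columns
      if ( c[ j * 8 + i ] > v ) v = c[ j * 8 + i ];
    out[ i ] = ( v > 0.0 ) ? v : 0.0; // relu: max( v, 0 )
  }
}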


struct conv_relu_pool2x2_int_d8x4
{
  inline void operator()
  (
      int k,
      double *a,
      double *b,
      double *c, int ldc,
      aux_s<double, double, double, double> *aux
  ) const
  {
    int i;
    double dzero = 0.0;
    v4df_t c03_0, c03_1, c03_2, c03_3;
    v4df_t c47_0, c47_1, c47_2, c47_3;
    v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
    v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
    //v4df_t u03, u47;
    v4df_t a03, a47, A03, A47; // prefetched A
    v4df_t b0, b1, b2, b3, B0; // prefetched B
    v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;

    __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( c ) );

    // Rank-k update segment
    #include "component/rank_k_int_d8x4.hpp"
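    // The included segment leaves the 8x4 product in registers: c03_j holds
    // rows 0:3 and c47_j holds rows 4:7 of column j, for j = 0..3.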

    // Store back
    if ( aux->pc ) // accumulate partial results from earlier rank-k panels
    {
      // packed: C is stored contiguously with leading dimension 8
      if ( aux->do_packC )
      {
        tmpc03_0.v = _mm256_load_pd( (double*)( c ) );
        tmpc47_0.v = _mm256_load_pd( (double*)( c + 4 ) );

        tmpc03_1.v = _mm256_load_pd( (double*)( c + 8 ) );
        tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );

        tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
        tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );

        tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
        tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
      }
      else
      {
        tmpc03_0.v = _mm256_load_pd( (double*)( c ) );
        tmpc47_0.v = _mm256_load_pd( (double*)( c + 4 ) );

        tmpc03_1.v = _mm256_load_pd( (double*)( c + 1 * ldc ) );
        tmpc47_1.v = _mm256_load_pd( (double*)( c + 1 * ldc + 4 ) );

        tmpc03_2.v = _mm256_load_pd( (double*)( c + 2 * ldc ) );
        tmpc47_2.v = _mm256_load_pd( (double*)( c + 2 * ldc + 4 ) );

        tmpc03_3.v = _mm256_load_pd( (double*)( c + 3 * ldc ) );
        tmpc47_3.v = _mm256_load_pd( (double*)( c + 3 * ldc + 4 ) );
      }

      c03_0.v = _mm256_add_pd( tmpc03_0.v, c03_0.v );
      c47_0.v = _mm256_add_pd( tmpc47_0.v, c47_0.v );

      c03_1.v = _mm256_add_pd( tmpc03_1.v, c03_1.v );
      c47_1.v = _mm256_add_pd( tmpc47_1.v, c47_1.v );

      c03_2.v = _mm256_add_pd( tmpc03_2.v, c03_2.v );
      c47_2.v = _mm256_add_pd( tmpc47_2.v, c47_2.v );

      c03_3.v = _mm256_add_pd( tmpc03_3.v, c03_3.v );
      c47_3.v = _mm256_add_pd( tmpc47_3.v, c47_3.v );
    }

#ifdef DEBUG_MICRO
    printf( "rank_k_int_d8x4:\n" );
    printf( "%E %E %E %E\n", c03_0.d[ 0 ], c03_1.d[ 0 ], c03_2.d[ 0 ], c03_3.d[ 0 ] );
    printf( "%E %E %E %E\n", c03_0.d[ 1 ], c03_1.d[ 1 ], c03_2.d[ 1 ], c03_3.d[ 1 ] );
    printf( "%E %E %E %E\n", c03_0.d[ 2 ], c03_1.d[ 2 ], c03_2.d[ 2 ], c03_3.d[ 2 ] );
    printf( "%E %E %E %E\n", c03_0.d[ 3 ], c03_1.d[ 3 ], c03_2.d[ 3 ], c03_3.d[ 3 ] );

    printf( "%E %E %E %E\n", c47_0.d[ 0 ], c47_1.d[ 0 ], c47_2.d[ 0 ], c47_3.d[ 0 ] );
    printf( "%E %E %E %E\n", c47_0.d[ 1 ], c47_1.d[ 1 ], c47_2.d[ 1 ], c47_3.d[ 1 ] );
    printf( "%E %E %E %E\n", c47_0.d[ 2 ], c47_1.d[ 2 ], c47_2.d[ 2 ], c47_3.d[ 2 ] );
    printf( "%E %E %E %E\n", c47_0.d[ 3 ], c47_1.d[ 3 ], c47_2.d[ 3 ], c47_3.d[ 3 ] );
#endif

    // relu
    c_tmp.v = _mm256_broadcast_sd( &dzero );
    //c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v );
    //c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v );
    //c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v );
    //c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v );
    //c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v );
    //c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v );
    //c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v );
    //c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v );

    // relu + pool: reduce the four columns to one by element-wise max,
    // then clamp at zero
    c03_0.v = _mm256_max_pd( c03_1.v, c03_0.v );
    c03_0.v = _mm256_max_pd( c03_2.v, c03_0.v );
    c03_0.v = _mm256_max_pd( c03_3.v, c03_0.v );
    c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v ); // max( c, 0 )

    c47_0.v = _mm256_max_pd( c47_1.v, c47_0.v );
    c47_0.v = _mm256_max_pd( c47_2.v, c47_0.v );
    c47_0.v = _mm256_max_pd( c47_3.v, c47_0.v );
    c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v ); // max( c, 0 )

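    // The four columns are the four conv outputs covered by one 2x2 pooling
    // window, so only their maximum (a single 8x1 column) is stored back.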
    _mm256_store_pd( (double*)( c ), c03_0.v );
    _mm256_store_pd( (double*)( c + 4 ), c47_0.v );
  }
};


struct conv_relu_pool2x2_asm_d8x4
{
  inline void operator()
  (
      int k,
      double *a,
      double *b,
      double *c, int ldc,
      aux_s<double, double, double, double> *aux
  ) const
  {
    unsigned long long k_iter = k / 4; // main loop below is unrolled by 4
    unsigned long long k_left = k % 4;
    unsigned long long pc = aux->pc;
    unsigned long long ldc64 = ldc;

    // packed C is stored contiguously with leading dimension 8
    if ( aux->do_packC ) ldc64 = 8;

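    // The assembly below appears to follow the BLIS Sandy Bridge dgemm
    // micro-kernel structure: a 4x-unrolled main loop over k, a remainder
    // loop, a shuffle stage that transposes the accumulators into columns
    // of C, and finally the fused relu + max-pooling before the store.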
    __asm__ volatile
    (
    " \n\t"
    "movq %2, %%rax \n\t" // load address of a. ( v )
    "movq %3, %%rbx \n\t" // load address of b. ( v )
    "movq %5, %%r15 \n\t" // load address of b_next. ( v )
    "addq $-4 * 64, %%r15 \n\t" // ( ? )
    " \n\t"
    "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
    "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b.
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    " \n\t"
    "movq %7, %%rdi \n\t" // load ldc
    "leaq (,%%rdi,8), %%rdi \n\t" // ldc * sizeof(double)
    " \n\t"
    "movq %4, %%rcx \n\t" // load address of c
    "leaq (%%rcx,%%rdi,2), %%r11 \n\t" // r11 = c + 2 * ldc
    " \n\t"
    "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0 * ldc
    "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1 * ldc
    "prefetcht0 3 * 8(%%r11) \n\t" // prefetch c + 2 * ldc
    "prefetcht0 3 * 8(%%r11,%%rdi) \n\t" // prefetch c + 3 * ldc
    " \n\t"
    "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" // set ymm8 to 0 ( v )
    "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t"
    "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t"
    "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t"
    "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
    "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
    "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
    "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
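    // ymm8..ymm15 are the eight 4-wide accumulators that hold the 8x4 tile.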
    " \n\t"
    "movq %0, %%rsi \n\t" // i = k_iter; ( v )
    "testq %%rsi, %%rsi \n\t" // check i via logical AND. ( v )
    "je .CNN_DCONSIDKLEFT%= \n\t" // if i == 0, jump to code that ( v )
    " \n\t" // contains the k_left loop.
    " \n\t"
    ".CNN_DLOOPKITER%=: \n\t" // MAIN LOOP
    " \n\t"
    "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 (unroll x nr) ( v )
    " \n\t"
    " \n\t" // iteration 0
    "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 0
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" // ymm6 ( c_tmp0 ) = ymm0 ( a03 ) * ymm2 ( b0 )
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" // ymm4 ( b0x3_0 )
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" // ymm7 ( c_tmp1 ) = ymm0 ( a03 ) * ymm3 ( b0x5 )
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" // ymm5 ( b0x3_1 )
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" // ymm15 ( c_03_0 ) += ymm6 ( c_tmp0 )
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" // ymm13 ( c_03_1 ) += ymm7 ( c_tmp1 )
    " \n\t"
    "prefetcht0 16 * 32(%%rax) \n\t" // prefetch a for iter 8 ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 1
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 1
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    "prefetcht0 0 * 32(%%r15) \n\t" // prefetch b_next[0*4]
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t" // iteration 1
    "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 1
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "prefetcht0 18 * 32(%%rax) \n\t" // prefetch a for iter 9 ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 2
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 2
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t" // iteration 2
    "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 2
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "prefetcht0 20 * 32(%%rax) \n\t" // prefetch a for iter 10 ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 3
    "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr)
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 3
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4]
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t" // iteration 3
    "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 3
    "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr)
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "prefetcht0 14 * 32(%%rax) \n\t" // prefetch a for iter 11 ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 4
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 4
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    "decq %%rsi \n\t" // i -= 1;
    "jne .CNN_DLOOPKITER%= \n\t" // iterate again if i != 0.
    " \n\t"
    " \n\t"
    ".CNN_DCONSIDKLEFT%=: \n\t"
    " \n\t"
    "movq %1, %%rsi \n\t" // i = k_left;
    "testq %%rsi, %%rsi \n\t" // check i via logical AND.
    "je .CNN_DPOSTACCUM%= \n\t" // if i == 0, we're done; jump to end.
    " \n\t" // else, we prepare to enter k_left loop.
    " \n\t"
    ".CNN_DLOOPKLEFT%=: \n\t" // EDGE LOOP
    " \n\t"
    "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" // preload a47
    "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr)
    "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
    "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "prefetcht0 14 * 32(%%rax) \n\t" // prefetch a03 for iter 7 later ( ? )
    "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
    "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t"
    "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr)
    "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
    "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
    "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
    "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
    "vmovapd 0 * 32(%%rax), %%ymm0 \n\t"
    "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
    "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
    " \n\t"
    "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
    "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
    "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
    "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    "decq %%rsi \n\t" // i -= 1;
    "jne .CNN_DLOOPKLEFT%= \n\t" // iterate again if i != 0.
    " \n\t"
    " \n\t"
    ".CNN_DPOSTACCUM%=: \n\t"
    " \n\t"
    " \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
    " \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
    " \n\t" //   ab11    ab10    ab13    ab12
    " \n\t" //   ab22    ab23    ab20    ab21
    " \n\t" //   ab33 )  ab32 )  ab31 )  ab30 )
    " \n\t"
    " \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
    " \n\t" // ( ab40  ( ab41  ( ab42  ( ab43
    " \n\t" //   ab51    ab50    ab53    ab52
    " \n\t" //   ab62    ab63    ab60    ab61
    " \n\t" //   ab73 )  ab72 )  ab71 )  ab70 )
    " \n\t"
    "vmovapd %%ymm15, %%ymm7 \n\t"
    "vshufpd $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t"
    "vshufpd $0xa, %%ymm13, %%ymm7, %%ymm13 \n\t"
    " \n\t"
    "vmovapd %%ymm11, %%ymm7 \n\t"
    "vshufpd $0xa, %%ymm11, %%ymm9, %%ymm11 \n\t"
    "vshufpd $0xa, %%ymm9, %%ymm7, %%ymm9 \n\t"
    " \n\t"
    "vmovapd %%ymm14, %%ymm7 \n\t"
    "vshufpd $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t"
    "vshufpd $0xa, %%ymm12, %%ymm7, %%ymm12 \n\t"
    " \n\t"
    "vmovapd %%ymm10, %%ymm7 \n\t"
    "vshufpd $0xa, %%ymm10, %%ymm8, %%ymm10 \n\t"
    "vshufpd $0xa, %%ymm8, %%ymm7, %%ymm8 \n\t"
    " \n\t"
    " \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
    " \n\t" // ( ab01  ( ab00  ( ab03  ( ab02
    " \n\t" //   ab11    ab10    ab13    ab12
    " \n\t" //   ab23    ab22    ab21    ab20
    " \n\t" //   ab33 )  ab32 )  ab31 )  ab30 )
    " \n\t"
    " \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
    " \n\t" // ( ab41  ( ab40  ( ab43  ( ab42
    " \n\t" //   ab51    ab50    ab53    ab52
    " \n\t" //   ab63    ab62    ab61    ab60
    " \n\t" //   ab73 )  ab72 )  ab71 )  ab70 )
    " \n\t"
    "vmovapd %%ymm15, %%ymm7 \n\t"
    "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t"
    "vperm2f128 $0x12, %%ymm7, %%ymm11, %%ymm11 \n\t"
    " \n\t"
    "vmovapd %%ymm13, %%ymm7 \n\t"
    "vperm2f128 $0x30, %%ymm13, %%ymm9, %%ymm13 \n\t"
    "vperm2f128 $0x12, %%ymm7, %%ymm9, %%ymm9 \n\t"
    " \n\t"
    "vmovapd %%ymm14, %%ymm7 \n\t"
    "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t"
    "vperm2f128 $0x12, %%ymm7, %%ymm10, %%ymm10 \n\t"
    " \n\t"
    "vmovapd %%ymm12, %%ymm7 \n\t"
    "vperm2f128 $0x30, %%ymm12, %%ymm8, %%ymm12 \n\t"
    "vperm2f128 $0x12, %%ymm7, %%ymm8, %%ymm8 \n\t"
    " \n\t"
    " \n\t" // ymm9:   ymm11:  ymm13:  ymm15:
    " \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
    " \n\t" //   ab10    ab11    ab12    ab13
    " \n\t" //   ab20    ab21    ab22    ab23
    " \n\t" //   ab30 )  ab31 )  ab32 )  ab33 )
    " \n\t"
    " \n\t" // ymm8:   ymm10:  ymm12:  ymm14:
    " \n\t" // ( ab40  ( ab41  ( ab42  ( ab43
    " \n\t" //   ab50    ab51    ab52    ab53
    " \n\t" //   ab60    ab61    ab62    ab63
    " \n\t" //   ab70 )  ab71 )  ab72 )  ab73 )
    " \n\t"
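    // The vshufpd / vperm2f128 pairs above form a 4x4 butterfly transpose:
    // each accumulator originally held one diagonal of the tile, and after
    // the shuffles each ymm register holds one full column of C, as the
    // layout comments show.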
    " \n\t"
    "movq %6, %%rdi \n\t" // load pc
    "testq %%rdi, %%rdi \n\t" // check pc via logical AND. ( v )
    "je .CNN_DNOLOADC%= \n\t" // if pc == 0, skip loading and accumulating C.
    " \n\t"
    "movq %7, %%rdi \n\t" // load ldc
    "leaq (,%%rdi,8), %%rdi \n\t" // ldc * sizeof(double)
    " \n\t"
    "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 0:3, 0 )
    "vaddpd %%ymm9, %%ymm0, %%ymm9 \n\t" // ymm9 += ymm0
    " \n\t"
    "vmovapd 1 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 4:7, 0 )
    "vaddpd %%ymm8, %%ymm0, %%ymm8 \n\t" // ymm8 += ymm0
    " \n\t"
    "addq %%rdi, %%rcx \n\t" // c += ldc
    " \n\t"
    "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 0:3, 1 )
    "vaddpd %%ymm11, %%ymm0, %%ymm11 \n\t" // ymm11 += ymm0
    " \n\t"
    "vmovapd 1 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 4:7, 1 )
    "vaddpd %%ymm10, %%ymm0, %%ymm10 \n\t" // ymm10 += ymm0
    " \n\t"
    "addq %%rdi, %%rcx \n\t" // c += ldc
    " \n\t"
    "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 0:3, 2 )
    "vaddpd %%ymm13, %%ymm0, %%ymm13 \n\t" // ymm13 += ymm0
    " \n\t"
    "vmovapd 1 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 4:7, 2 )
    "vaddpd %%ymm12, %%ymm0, %%ymm12 \n\t" // ymm12 += ymm0
    " \n\t"
    "addq %%rdi, %%rcx \n\t" // c += ldc
    " \n\t"
    "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 0:3, 3 )
    "vaddpd %%ymm15, %%ymm0, %%ymm15 \n\t" // ymm15 += ymm0
    " \n\t"
    "vmovapd 1 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 4:7, 3 )
    "vaddpd %%ymm14, %%ymm0, %%ymm14 \n\t" // ymm14 += ymm0
    " \n\t"
    ".CNN_DNOLOADC%=: \n\t"
    " \n\t"
    " \n\t" // pool: reduce the four columns of C to one by element-wise max
    "vmaxpd %%ymm11, %%ymm9, %%ymm9 \n\t" // ymm9 = max( ymm9, ymm11 )
    "vmaxpd %%ymm10, %%ymm8, %%ymm8 \n\t" // ymm8 = max( ymm8, ymm10 )
    "vmaxpd %%ymm13, %%ymm9, %%ymm9 \n\t" // ymm9 = max( ymm9, ymm13 )
    "vmaxpd %%ymm12, %%ymm8, %%ymm8 \n\t" // ymm8 = max( ymm8, ymm12 )
    "vmaxpd %%ymm15, %%ymm9, %%ymm9 \n\t" // ymm9 = max( ymm9, ymm15 )
    "vmaxpd %%ymm14, %%ymm8, %%ymm8 \n\t" // ymm8 = max( ymm8, ymm14 )
    " \n\t"
    "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // relu: clamp the pooled column
    "vmaxpd %%ymm0, %%ymm9, %%ymm9 \n\t" // at zero, matching the
    "vmaxpd %%ymm0, %%ymm8, %%ymm8 \n\t" // intrinsics variant above
    " \n\t"
    "movq %4, %%rcx \n\t" // reload c; rcx was advanced in the load-C path
    "vmovapd %%ymm9, 0 * 32(%%rcx) \n\t" // C_c( 0:3, 0 ) = ymm9
    "vmovapd %%ymm8, 1 * 32(%%rcx) \n\t" // C_c( 4:7, 0 ) = ymm8
    " \n\t"
    ".CNN_DDONE%=: \n\t"
    " \n\t"
    : // output operands (none)
    : // input operands
      "m" (k_iter), // 0
      "m" (k_left), // 1
      "m" (a), // 2
      "m" (b), // 3
      "m" (c), // 4
      "m" (aux->b_next), // 5
      "m" (pc), // 6
      "m" (ldc64) // 7
    : // register clobber list
      "rax", "rbx", "rcx", "rsi", "rdi",
      "r11", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    );

  } // end inline void operator()
};
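
// A hedged usage sketch (driver names, panel sizes, and packing are
// hypothetical; only the aux fields used above -- pc, do_packC, b_next --
// come from this file):
//
//   conv_relu_pool2x2_asm_d8x4 microkernel;
//   aux_s<double, double, double, double> aux;
//   aux.pc       = pc;             // nonzero: accumulate into existing C
//   aux.do_packC = 0;              // C uses leading dimension ldc
//   aux.b_next   = packB + 4 * kc; // next packed B micro-panel (prefetched)
//   microkernel( kc, packA, packB, C, ldc, &aux );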