// -----------------------------------------------------------------------------
// NOTE(review): this file is a garbled extraction (original line numbers are
// fused into the text; the struct wrapper, the operator() parameter list, the
// if/else guards between the two store loops, and the closing braces are
// missing from this view).  Code below is kept byte-identical; only comments
// are added.
// -----------------------------------------------------------------------------
// Reference d8x4 micro-kernel: accumulates an 8x4 tile of A(8 x k) * B(k x 4)
// into the local buffer c_reg, then writes it to C either additively ("+=")
// or by overwrite ("=") -- presumably selected by a guard (e.g. aux->pc) that
// is not visible in this extraction; TODO confirm against the original header.
4 #include <hmlp_internal.hpp> 13 inline void operator()(
// Local 8x4 accumulator (column-major, 8 rows per column), zero-initialized.
20 double c_reg[ 8 * 4 ] = { 0.0 };
// Rank-k update: c_reg[j*8+i] += a[p*8+i] * b[p*4+j] for each of k steps.
22 for (
int p = 0; p < k; p ++ )
25 for (
int j = 0; j < 4; j ++ )
28 for (
int i = 0; i < 8; i ++ )
30 c_reg[ j * 8 + i ] += a[ p * 8 + i ] * b[ p * 4 + j ];
// Accumulate path: C += c_reg (C has leading dimension ldc).
38 for (
int j = 0; j < 4; j ++ )
41 for (
int i = 0; i < 8; i ++ )
43 c[ j * ldc + i ] += c_reg[ j * 8 + i ];
// Overwrite path: C = c_reg.
50 for (
int j = 0; j < 4; j ++ )
53 for (
int i = 0; i < 8; i ++ )
55 c[ j * ldc + i ] = c_reg[ j * 8 + i ];
// Debug dump of the resulting 8x4 C tile.
61 printf(
"rank_k_ref_d8x4:" );
62 for (
int i = 0; i < 8; i ++ )
64 for (
int j = 0; j < 4; j ++ )
66 printf(
"%E ", c[ j * ldc + i ] );
// -----------------------------------------------------------------------------
// NOTE(review): garbled extraction -- original line numbers are fused into the
// text; the parameter list, the guard selecting between the two C-load groups,
// and the closing braces are missing from this view.  Code is kept
// byte-identical; only comments are added.
// -----------------------------------------------------------------------------
// AVX-intrinsics d8x4 micro-kernel: accumulates an 8x4 rank-k product into
// eight 256-bit registers (c03_j = rows 0..3 of column j, c47_j = rows 4..7),
// then takes a per-row max over the 4 columns and 0 before storing 8 doubles
// back to c -- consistent with a fused ReLU + 2x2 max-pooling epilogue
// suggested by the file name conv_relu_pool2x2_d8x4 (TODO confirm).
77 inline void operator()
// Accumulators and temporaries; v4df_t is a 4-double AVX vector union
// (declared in avx_type.h, outside this view).
88 v4df_t c03_0, c03_1, c03_2, c03_3;
89 v4df_t c47_0, c47_1, c47_2, c47_3;
90 v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
91 v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
95 v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;
// Prefetch the start of the C tile before the rank-k loop.
97 __asm__
volatile(
"prefetcht0 0(%0) \n\t" : :
"r"( c ) );
// The included component performs the rank-k accumulation into c03_*/c47_*.
// First load group: C is packed (contiguous 8x4 tile, effective stride 8).
100 #include "component/rank_k_int_d8x4.hpp" 108 tmpc03_0.v = _mm256_load_pd( (
double*)( c ) );
109 tmpc47_0.v = _mm256_load_pd( (
double*)( c + 4 ) );
111 tmpc03_1.v = _mm256_load_pd( (
double*)( c + 8 ) );
112 tmpc47_1.v = _mm256_load_pd( (
double*)( c + 12 ) );
114 tmpc03_2.v = _mm256_load_pd( (
double*)( c + 16 ) );
115 tmpc47_2.v = _mm256_load_pd( (
double*)( c + 20 ) );
117 tmpc03_3.v = _mm256_load_pd( (
double*)( c + 24 ) );
118 tmpc47_3.v = _mm256_load_pd( (
double*)( c + 28 ) );
// Alternative load group: C is strided with leading dimension ldc.  The
// guard choosing packed vs. strided is not visible in this extraction.
122 tmpc03_0.v = _mm256_load_pd( (
double*)( c ) );
123 tmpc47_0.v = _mm256_load_pd( (
double*)( c + 4 ) );
125 tmpc03_1.v = _mm256_load_pd( (
double*)( c + 1 * ldc ) );
126 tmpc47_1.v = _mm256_load_pd( (
double*)( c + 1 * ldc + 4 ) );
128 tmpc03_2.v = _mm256_load_pd( (
double*)( c + 2 * ldc ) );
129 tmpc47_2.v = _mm256_load_pd( (
double*)( c + 2 * ldc + 4 ) );
131 tmpc03_3.v = _mm256_load_pd( (
double*)( c + 3 * ldc ) );
132 tmpc47_3.v = _mm256_load_pd( (
double*)( c + 3 * ldc + 4 ) );
// Accumulate previously stored C into the fresh rank-k results.
136 c03_0.v = _mm256_add_pd( tmpc03_0.v, c03_0.v );
137 c47_0.v = _mm256_add_pd( tmpc47_0.v, c47_0.v );
139 c03_1.v = _mm256_add_pd( tmpc03_1.v, c03_1.v );
140 c47_1.v = _mm256_add_pd( tmpc47_1.v, c47_1.v );
142 c03_2.v = _mm256_add_pd( tmpc03_2.v, c03_2.v );
143 c47_2.v = _mm256_add_pd( tmpc47_2.v, c47_2.v );
145 c03_3.v = _mm256_add_pd( tmpc03_3.v, c03_3.v );
146 c47_3.v = _mm256_add_pd( tmpc47_3.v, c47_3.v );
// Debug dump of the accumulated 8x4 tile (rows 0..7 by columns 0..3).
150 printf(
"rank_k_int_d8x4:\n" );
151 printf(
"%E %E %E %E\n", c03_0.d[ 0 ], c03_1.d[ 0 ], c03_2.d[ 0 ], c03_3.d[ 0 ] );
152 printf(
"%E %E %E %E\n", c03_0.d[ 1 ], c03_1.d[ 1 ], c03_2.d[ 1 ], c03_3.d[ 1 ] );
153 printf(
"%E %E %E %E\n", c03_0.d[ 2 ], c03_1.d[ 2 ], c03_2.d[ 2 ], c03_3.d[ 2 ] );
154 printf(
"%E %E %E %E\n", c03_0.d[ 3 ], c03_1.d[ 3 ], c03_2.d[ 3 ], c03_3.d[ 3 ] );
156 printf(
"%E %E %E %E\n", c47_0.d[ 0 ], c47_1.d[ 0 ], c47_2.d[ 0 ], c47_3.d[ 0 ] );
157 printf(
"%E %E %E %E\n", c47_0.d[ 1 ], c47_1.d[ 1 ], c47_2.d[ 1 ], c47_3.d[ 1 ] );
158 printf(
"%E %E %E %E\n", c47_0.d[ 2 ], c47_1.d[ 2 ], c47_2.d[ 2 ], c47_3.d[ 2 ] );
159 printf(
"%E %E %E %E\n", c47_0.d[ 3 ], c47_1.d[ 3 ], c47_2.d[ 3 ], c47_3.d[ 3 ] );
// Zero vector for the ReLU clamp.  NOTE(review): 'dzero' is declared outside
// this view.
164 c_tmp.v = _mm256_broadcast_sd( &dzero );
// Per-row max over columns 0..3 and over 0: fused ReLU + max reduction
// across the 4 columns (2x2 pooling), leaving rows 0..3 in c03_0 and rows
// 4..7 in c47_0.
175 c03_0.v = _mm256_max_pd( c03_1.v, c03_0.v );
176 c03_0.v = _mm256_max_pd( c03_2.v, c03_0.v );
177 c03_0.v = _mm256_max_pd( c03_3.v, c03_0.v );
178 c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v );
180 c47_0.v = _mm256_max_pd( c47_1.v, c47_0.v );
181 c47_0.v = _mm256_max_pd( c47_2.v, c47_0.v );
182 c47_0.v = _mm256_max_pd( c47_3.v, c47_0.v );
183 c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v );
// Store the 8 pooled results back to c.
185 _mm256_store_pd( (
double*)( c ), c03_0.v );
186 _mm256_store_pd( (
double*)( c + 4 ), c47_0.v );
// -----------------------------------------------------------------------------
// NOTE(review): garbled extraction -- original line numbers are fused into the
// text; the parameter list, the asm output/input operand list, and the closing
// ");" and braces are missing from this view.  Code is kept byte-identical;
// only comments are added.
// -----------------------------------------------------------------------------
// Inline-assembly AVX d8x4 micro-kernel: rank-k update of an 8x4 C tile with
// the k-loop unrolled by 4, a butterfly shuffle/permute to untangle the
// accumulators, an optional load-and-add of existing C, and a vmaxpd epilogue
// (ReLU / max over the 4 columns) before storing 8 doubles -- consistent with
// the conv_relu_pool2x2_d8x4 file name (TODO confirm against the original).
193 inline void operator()
// Main loop runs k/4 unrolled iterations; k_left handles the k % 4 remainder.
202 unsigned long long k_iter = k / 4;
203 unsigned long long k_left = k % 4;
// pc != 0 appears to select the "accumulate into existing C" path (tested
// via an asm operand below) -- operand list not visible here; verify.
204 unsigned long long pc = aux->pc;
205 unsigned long long ldc64 = ldc;
// When C is packed, its leading dimension is the micro-tile height (8).
208 if ( aux->do_packC ) ldc64 = 8;
// Extended asm body.  Operands %0..%7 appear to map to k_iter, k_left, a, b,
// c, a prefetch pointer, pc, and ldc64 respectively -- the actual operand
// list was lost in this extraction; confirm before editing register usage.
214 "movq %2, %%rax \n\t" 215 "movq %3, %%rbx \n\t" 216 "movq %5, %%r15 \n\t" 217 "addq $-4 * 64, %%r15 \n\t" 219 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" 220 "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" 221 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 224 "movq %7, %%rdi \n\t" 225 "leaq (,%%rdi,8), %%rdi \n\t" 228 "movq %4, %%rcx \n\t" 231 "prefetcht0 3 * 8(%%rcx) \n\t" 232 "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" 233 "prefetcht0 3 * 8(%%r11) \n\t" 234 "prefetcht0 3 * 8(%%r11,%%rdi) \n\t" 239 "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" 240 "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" 241 "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" 242 "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" 243 "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" 244 "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" 245 "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" 246 "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" 250 "movq %0, %%rsi \n\t" 251 "testq %%rsi, %%rsi \n\t" 252 "je .CNN_DCONSIDKLEFT%= \n\t" 256 ".CNN_DLOOPKITER%=: \n\t" 258 "addq $4 * 4 * 8, %%r15 \n\t" 261 "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" 262 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 263 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 264 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 265 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 266 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 267 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 269 "prefetcht0 16 * 32(%%rax) \n\t" 270 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 271 "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" 272 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 273 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 274 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 275 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 277 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 278 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 279 "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" 280 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 281 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 282 "prefetcht0 0 * 32(%%r15) \n\t" 284 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 285 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 286 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 287 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 291 "vmovapd 3 * 
32(%%rax), %%ymm1 \n\t" 292 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 293 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 294 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 295 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 296 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 297 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 299 "prefetcht0 18 * 32(%%rax) \n\t" 300 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 301 "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" 302 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 303 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 304 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 305 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 307 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 308 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 309 "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" 310 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 311 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 313 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 314 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 315 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 316 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 320 "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" 321 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 322 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 323 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 324 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 325 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 326 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 328 "prefetcht0 20 * 32(%%rax) \n\t" 329 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 330 "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" 331 "addq $4 * 4 * 8, %%rbx \n\t" 332 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 333 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 334 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 335 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 337 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 338 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 339 "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" 340 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 341 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 342 "prefetcht0 2 * 32(%%r15) \n\t" 344 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 345 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 346 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 347 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 
351 "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" 352 "addq $4 * 8 * 8, %%rax \n\t" 353 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 354 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 355 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 356 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 357 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 358 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 360 "prefetcht0 14 * 32(%%rax) \n\t" 361 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 362 "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" 363 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 364 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 365 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 366 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 368 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 369 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 370 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" 371 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 372 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 374 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 375 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 376 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 377 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 383 "jne .CNN_DLOOPKITER%= \n\t" 390 ".CNN_DCONSIDKLEFT%=: \n\t" 392 "movq %1, %%rsi \n\t" 393 "testq %%rsi, %%rsi \n\t" 394 "je .CNN_DPOSTACCUM%= \n\t" 398 ".CNN_DLOOPKLEFT%=: \n\t" 400 "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" 401 "addq $8 * 1 * 8, %%rax \n\t" 402 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 403 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 404 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 405 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 406 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 407 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 409 "prefetcht0 14 * 32(%%rax) \n\t" 410 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 411 "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" 412 "addq $4 * 1 * 8, %%rbx \n\t" 413 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 414 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 415 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 416 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 418 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 419 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 420 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" 421 "vaddpd %%ymm11, 
%%ymm6, %%ymm11 \n\t" 422 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 424 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 425 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 426 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 427 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 431 "jne .CNN_DLOOPKLEFT%= \n\t" 434 ".CNN_DPOSTACCUM%=: \n\t" 449 "vmovapd %%ymm15, %%ymm7 \n\t" 450 "vshufpd $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t" 451 "vshufpd $0xa, %%ymm13, %%ymm7, %%ymm13 \n\t" 453 "vmovapd %%ymm11, %%ymm7 \n\t" 454 "vshufpd $0xa, %%ymm11, %%ymm9, %%ymm11 \n\t" 455 "vshufpd $0xa, %%ymm9, %%ymm7, %%ymm9 \n\t" 457 "vmovapd %%ymm14, %%ymm7 \n\t" 458 "vshufpd $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t" 459 "vshufpd $0xa, %%ymm12, %%ymm7, %%ymm12 \n\t" 461 "vmovapd %%ymm10, %%ymm7 \n\t" 462 "vshufpd $0xa, %%ymm10, %%ymm8, %%ymm10 \n\t" 463 "vshufpd $0xa, %%ymm8, %%ymm7, %%ymm8 \n\t" 477 "vmovapd %%ymm15, %%ymm7 \n\t" 478 "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t" 479 "vperm2f128 $0x12, %%ymm7, %%ymm11, %%ymm11 \n\t" 481 "vmovapd %%ymm13, %%ymm7 \n\t" 482 "vperm2f128 $0x30, %%ymm13, %%ymm9, %%ymm13 \n\t" 483 "vperm2f128 $0x12, %%ymm7, %%ymm9, %%ymm9 \n\t" 485 "vmovapd %%ymm14, %%ymm7 \n\t" 486 "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t" 487 "vperm2f128 $0x12, %%ymm7, %%ymm10, %%ymm10 \n\t" 489 "vmovapd %%ymm12, %%ymm7 \n\t" 490 "vperm2f128 $0x30, %%ymm12, %%ymm8, %%ymm12 \n\t" 491 "vperm2f128 $0x12, %%ymm7, %%ymm8, %%ymm8 \n\t" 509 "movq %6, %%rdi \n\t" 510 "testq %%rdi, %%rdi \n\t" 511 "je .CNN_DNOLOADC%= \n\t" 514 "movq %7, %%rdi \n\t" 515 "leaq (,%%rdi,8), %%rdi \n\t" 518 "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" 519 "vaddpd %%ymm9, %%ymm0, %%ymm9 \n\t" 522 "vmovapd 1 * 32(%%rcx), %%ymm0 \n\t" 523 "vaddpd %%ymm8, %%ymm0, %%ymm8 \n\t" 527 "addq %%rdi, %%rcx \n\t" 531 "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" 532 "vaddpd %%ymm11, %%ymm0, %%ymm11 \n\t" 535 "vmovapd 1 * 32(%%rcx), %%ymm0 \n\t" 536 "vaddpd %%ymm10, %%ymm0, %%ymm10 \n\t" 539 "addq %%rdi, %%rcx \n\t" 542 "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" 543 "vaddpd %%ymm13, 
%%ymm0, %%ymm13 \n\t" 546 "vmovapd 1 * 32(%%rcx), %%ymm0 \n\t" 547 "vaddpd %%ymm12, %%ymm0, %%ymm12 \n\t" 550 "addq %%rdi, %%rcx \n\t" 553 "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" 554 "vaddpd %%ymm15, %%ymm0, %%ymm15 \n\t" 557 "vmovapd 1 * 32(%%rcx), %%ymm0 \n\t" 558 "vaddpd %%ymm14, %%ymm0, %%ymm14 \n\t" 562 ".CNN_DNOLOADC%=: \n\t" 564 "vmaxpd %%ymm11, %%ymm9, %%ymm9 \n\t" 565 "vmaxpd %%ymm10, %%ymm8, %%ymm8 \n\t" 566 "vmaxpd %%ymm13, %%ymm9, %%ymm9 \n\t" 567 "vmaxpd %%ymm12, %%ymm8, %%ymm8 \n\t" 568 "vmaxpd %%ymm15, %%ymm9, %%ymm9 \n\t" 569 "vmaxpd %%ymm14, %%ymm8, %%ymm8 \n\t" 572 "vmovapd %%ymm9, 0 * 32(%%rcx) \n\t" 573 "vmovapd %%ymm8, 1 * 32(%%rcx) \n\t" 589 "rax",
"rbx",
"rcx",
"rsi",
"rdi",
// Vector-register clobbers (ymm0-ymm15 named via their xmm aliases, as GCC
// extended asm requires).
591 "xmm0",
"xmm1",
"xmm2",
"xmm3",
592 "xmm4",
"xmm5",
"xmm6",
"xmm7",
593 "xmm8",
"xmm9",
"xmm10",
"xmm11",
594 "xmm12",
"xmm13",
"xmm14",
"xmm15",
Definition: conv_relu_pool2x2_d8x4.hpp:11
Definition: conv_relu_pool2x2_d8x4.hpp:75
Definition: conv_relu_pool2x2_d8x4.hpp:191
Definition: hmlp_internal.hpp:38
Definition: avx_type.h:13