4 #include <hmlp_internal.hpp> 12 BLIS_GEMM_KERNEL(bli_sgemm_asm_8x8,
float);
13 BLIS_GEMM_KERNEL(bli_dgemm_asm_8x4,
double);
18 const static size_t mr = 8;
19 const static size_t nr = 8;
20 const static size_t pack_mr = 8;
21 const static size_t pack_nr = 8;
22 const static size_t align_size = 32;
23 const static bool row_major =
false;
29 printf(
"no implementation\n" );
37 float beta = aux->pc ? 1.0 : 0.0;
52 inline void operator()
58 float *v, inc_t rs_v, inc_t cs_v,
62 printf(
"no implementation\n" );
75 const static size_t mr = 8;
76 const static size_t nr = 4;
77 const static size_t pack_mr = 8;
78 const static size_t pack_nr = 4;
79 const static size_t align_size = 32;
80 const static bool row_major =
false;
86 unsigned long long len64 = (
unsigned long long)(len);
87 unsigned long long ldc64 = (
unsigned long long)(ldc);
88 unsigned long long k_iter = (
unsigned long long)k / 4;
89 unsigned long long k_left = (
unsigned long long)k % 4;
91 double *b_next = (
double *)(aux->b_next);
100 "addq $-4 * 64, %%r15 \n\t" 102 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" 103 "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" 104 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 107 "movq %7, %%rdi \n\t" 108 "leaq (,%%rdi,8), %%rdi \n\t" 110 "movq %6, %%rcx \n\t" 112 "movq %5, %%rsi \n\t" 114 ".DPREFETCHLOOP%=: \n\t" 116 "movq 0 * 8(%%rcx), %%rdx \n\t" 118 "testq %%rdx, %%rdx \n\t" 120 "leaq (%%rdx,%%rdi,2), %%r11 \n\t" 121 "prefetcht0 3 * 8(%%rdx) \n\t" 122 "prefetcht0 3 * 8(%%rdx,%%rdi) \n\t" 123 "prefetcht0 3 * 8(%%r11) \n\t" 124 "prefetcht0 3 * 8(%%r11,%%rdi) \n\t" 128 "addq $1 * 8, %%rcx \n\t" 131 "jne .DPREFETCHLOOP%= \n\t" 135 "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" 136 "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" 137 "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" 138 "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" 139 "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" 140 "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" 141 "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" 142 "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" 146 "movq %0, %%rsi \n\t" 147 "testq %%rsi, %%rsi \n\t" 148 "je .DCONSIDKLEFT%= \n\t" 152 ".DLOOPKITER%=: \n\t" 154 "addq $4 * 4 * 8, %%r15 \n\t" 157 "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" 158 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 159 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 160 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 161 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 162 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 163 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 165 "prefetcht0 16 * 32(%%rax) \n\t" 166 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 167 "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" 168 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 169 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 170 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 171 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 173 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 174 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 175 "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" 176 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 177 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 178 "prefetcht0 0 * 32(%%r15) \n\t" 180 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 181 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 182 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 183 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 187 "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" 188 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 189 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 190 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 191 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 192 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 193 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 195 "prefetcht0 18 * 32(%%rax) \n\t" 196 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 197 "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" 198 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 199 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 200 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 201 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 203 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 204 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 205 "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" 206 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 207 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 209 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 210 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 211 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 212 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 216 "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" 217 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 218 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 219 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 220 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 221 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 222 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 224 "prefetcht0 20 * 32(%%rax) \n\t" 225 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 226 "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" 227 "addq $4 * 4 * 8, %%rbx \n\t" 228 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 229 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 230 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 231 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 233 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 234 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 235 "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" 236 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 237 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 238 "prefetcht0 2 * 32(%%r15) \n\t" 240 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 241 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 242 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 243 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 247 "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" 248 "addq $4 * 8 * 8, %%rax \n\t" 249 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 250 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 251 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 252 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 253 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 254 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 256 "prefetcht0 14 * 32(%%rax) \n\t" 257 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 258 "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" 259 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 260 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 261 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 262 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 264 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 265 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 266 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" 267 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 268 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 270 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 271 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 272 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 273 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 279 "jne .DLOOPKITER%= \n\t" 286 ".DCONSIDKLEFT%=: \n\t" 288 "movq %1, %%rsi \n\t" 289 "testq %%rsi, %%rsi \n\t" 290 "je .DPOSTACCUM%= \n\t" 294 ".DLOOPKLEFT%=: \n\t" 296 "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" 297 "addq $8 * 1 * 8, %%rax \n\t" 298 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" 299 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" 300 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" 301 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" 302 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" 303 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" 305 "prefetcht0 14 * 32(%%rax) \n\t" 306 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" 307 "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" 308 "addq $4 * 1 * 8, %%rbx \n\t" 309 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" 310 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" 311 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" 312 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" 314 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" 315 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" 316 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" 317 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" 318 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" 320 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" 321 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" 322 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" 323 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" 327 "jne .DLOOPKLEFT%= \n\t" 331 ".DPOSTACCUM%=: \n\t" 346 "vmovapd %%ymm15, %%ymm7 \n\t" 347 "vshufpd $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t" 348 "vshufpd $0xa, %%ymm13, %%ymm7, %%ymm13 \n\t" 350 "vmovapd %%ymm11, %%ymm7 \n\t" 351 "vshufpd $0xa, %%ymm11, %%ymm9, %%ymm11 \n\t" 352 "vshufpd $0xa, %%ymm9, %%ymm7, %%ymm9 \n\t" 354 "vmovapd %%ymm14, %%ymm7 \n\t" 355 "vshufpd $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t" 356 "vshufpd $0xa, %%ymm12, %%ymm7, %%ymm12 \n\t" 358 "vmovapd %%ymm10, %%ymm7 \n\t" 359 "vshufpd $0xa, %%ymm10, %%ymm8, %%ymm10 \n\t" 360 "vshufpd $0xa, %%ymm8, %%ymm7, %%ymm8 \n\t" 374 "vmovapd %%ymm15, %%ymm7 \n\t" 375 "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t" 376 "vperm2f128 $0x12, %%ymm7, %%ymm11, %%ymm11 \n\t" 378 "vmovapd %%ymm13, %%ymm7 \n\t" 379 "vperm2f128 $0x30, %%ymm13, %%ymm9, %%ymm13 \n\t" 380 "vperm2f128 $0x12, %%ymm7, %%ymm9, %%ymm9 \n\t" 382 "vmovapd %%ymm14, %%ymm7 \n\t" 383 "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t" 384 "vperm2f128 $0x12, %%ymm7, %%ymm10, %%ymm10 \n\t" 386 "vmovapd %%ymm12, %%ymm7 \n\t" 387 "vperm2f128 $0x30, %%ymm12, %%ymm8, %%ymm12 \n\t" 388 "vperm2f128 $0x12, %%ymm7, %%ymm8, %%ymm8 \n\t" 403 "movq %4, %%rax \n\t" 404 "movq %6, %%rcx \n\t" 407 "movq %5, %%rsi \n\t" 409 ".DSTORELOOP%=: \n\t" 411 "movq 0 * 8(%%rcx), %%rdx \n\t" 415 "vbroadcastsd (%%rax), %%ymm6 \n\t" 419 "vmovapd 0 * 32(%%rdx), %%ymm0 \n\t" 420 "vmulpd %%ymm6, %%ymm9, %%ymm1 \n\t" 421 "vaddpd %%ymm1, %%ymm0, %%ymm1 \n\t" 422 "vmovapd %%ymm1, 0 * 32(%%rdx) \n\t" 423 "vmovapd 1 * 32(%%rdx), %%ymm3 \n\t" 424 "vmulpd %%ymm6, %%ymm8, %%ymm2 \n\t" 425 "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" 426 "vmovapd %%ymm2, 1 * 32(%%rdx) \n\t" 427 "addq %%rdi, %%rdx \n\t" 429 "vmovapd 0 * 32(%%rdx), %%ymm0 \n\t" 430 "vmulpd %%ymm6, %%ymm11, %%ymm1 \n\t" 431 "vaddpd %%ymm1, %%ymm0, %%ymm1 \n\t" 432 "vmovapd %%ymm1, 0 * 32(%%rdx) \n\t" 433 "vmovapd 1 * 32(%%rdx), %%ymm3 \n\t" 434 "vmulpd %%ymm6, %%ymm10, %%ymm2 \n\t" 435 "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" 436 "vmovapd %%ymm2, 1 * 32(%%rdx) \n\t" 437 "addq %%rdi, %%rdx \n\t" 439 "vmovapd 0 * 32(%%rdx), %%ymm0 \n\t" 440 "vmulpd %%ymm6, %%ymm13, %%ymm1 \n\t" 441 "vaddpd %%ymm1, %%ymm0, %%ymm1 \n\t" 442 "vmovapd %%ymm1, 0 * 32(%%rdx) \n\t" 443 "vmovapd 1 * 32(%%rdx), %%ymm3 \n\t" 444 "vmulpd %%ymm6, %%ymm12, %%ymm2 \n\t" 445 "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" 446 "vmovapd %%ymm2, 1 * 32(%%rdx) \n\t" 447 "addq %%rdi, %%rdx \n\t" 449 "vmovapd 0 * 32(%%rdx), %%ymm0 \n\t" 450 "vmulpd %%ymm6, %%ymm15, %%ymm1 \n\t" 451 "vaddpd %%ymm1, %%ymm0, %%ymm1 \n\t" 452 "vmovapd %%ymm1, 0 * 32(%%rdx) \n\t" 453 "vmovapd 1 * 32(%%rdx), %%ymm3 \n\t" 454 "vmulpd %%ymm6, %%ymm14, %%ymm2 \n\t" 455 "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" 456 "vmovapd %%ymm2, 1 * 32(%%rdx) \n\t" 459 "addq $1 * 8, %%rcx \n\t" 460 "addq $1 * 8, %%rax \n\t" 463 "jne .DSTORELOOP%= \n\t" 479 "rax",
"rbx",
"rcx",
"rdx",
"rdi",
"rsi",
480 "r10",
"r11",
"r12",
"r13",
"r14",
"r15",
481 "xmm0",
"xmm1",
"xmm2",
"xmm3",
482 "xmm4",
"xmm5",
"xmm6",
"xmm7",
483 "xmm8",
"xmm9",
"xmm10",
"xmm11",
484 "xmm12",
"xmm13",
"xmm14",
"xmm15",
495 double beta = aux->pc ? 1.0 : 0.0;
509 template<
typename TC>
510 inline void operator()
516 double *v, inc_t rs_v, inc_t cs_v,
520 printf(
"no implementation\n" );
Definition: rank_k_d8x4.hpp:16
STRA_OPERATOR(double) const
Definition: rank_k_d8x4.hpp:84
STRA_OPERATOR(float) const
Definition: rank_k_d8x4.hpp:27
GEMM_OPERATOR(float) const
Definition: rank_k_d8x4.hpp:33
GEMM_OPERATOR(double) const
Definition: rank_k_d8x4.hpp:491
Definition: rank_k_d8x4.hpp:73
Definition: hmlp_internal.hpp:38