GCC Code Coverage Report
Directory: . Exec Total Coverage
File: kernel/x86_64/haswell/bli_gemm_asm_d8x6.cpp Lines: 0 10 0.0 %
Date: 2019-01-14 Branches: 0 0 0.0 %

Line Exec Source
1
/*
2
3
   BLIS
4
   An object-based framework for developing high-performance BLAS-like
5
   libraries.
6
7
   Copyright (C) 2014, The University of Texas at Austin
8
9
   Redistribution and use in source and binary forms, with or without
10
   modification, are permitted provided that the following conditions are
11
   met:
12
    - Redistributions of source code must retain the above copyright
13
      notice, this list of conditions and the following disclaimer.
14
    - Redistributions in binary form must reproduce the above copyright
15
      notice, this list of conditions and the following disclaimer in the
16
      documentation and/or other materials provided with the distribution.
17
    - Neither the name of The University of Texas at Austin nor the names
18
      of its contributors may be used to endorse or promote products
19
      derived from this software without specific prior written permission.
20
21
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33
*/
34
35
//#include "blis.h"
36
#include <hmlp_internal.hpp>
37
38
39
/*
 * SGEMM_INPUT_GS_BETA_NZ
 *
 * Inline-asm fragment that gathers eight floats of C into ymm0 from the
 * addresses rcx, rcx+rsi, rcx+2*rsi, rcx+r13, rcx+4*rsi, rcx+r15,
 * rcx+2*r13 and rcx+r10. bli_sgemm_asm_16x6 sets rsi = rs_c*sizeof(float),
 * r13 = 3*rsi, r15 = 5*rsi and r10 = 7*rsi before using it, so these are
 * eight consecutive rows of one general-stride column.
 *
 * Each vmovlps/vmovhps moves 64 bits (the wanted float plus its neighbour);
 * vshufps $0x88 then keeps only the even-indexed floats, and vperm2f128
 * joins the two 128-bit halves into ymm0.
 *
 * Overwrites xmm0, xmm1 and xmm2.
 */
#define SGEMM_INPUT_GS_BETA_NZ \
40
	"vmovlps    (%%rcx        ),  %%xmm0,  %%xmm0  \n\t" \
41
	"vmovhps    (%%rcx,%%rsi,1),  %%xmm0,  %%xmm0  \n\t" \
42
	"vmovlps    (%%rcx,%%rsi,2),  %%xmm1,  %%xmm1  \n\t" \
43
	"vmovhps    (%%rcx,%%r13  ),  %%xmm1,  %%xmm1  \n\t" \
44
	"vshufps    $0x88,   %%xmm1,  %%xmm0,  %%xmm0  \n\t" \
45
	"vmovlps    (%%rcx,%%rsi,4),  %%xmm2,  %%xmm2  \n\t" \
46
	"vmovhps    (%%rcx,%%r15  ),  %%xmm2,  %%xmm2  \n\t" \
47
	"vmovlps    (%%rcx,%%r13,2),  %%xmm1,  %%xmm1  \n\t" \
48
	"vmovhps    (%%rcx,%%r10  ),  %%xmm1,  %%xmm1  \n\t" \
49
	"vshufps    $0x88,   %%xmm1,  %%xmm2,  %%xmm2  \n\t" \
50
	"vperm2f128 $0x20,   %%ymm2,  %%ymm0,  %%ymm0  \n\t"
51
52
/*
 * SGEMM_OUTPUT_GS_BETA_NZ
 *
 * Inline-asm fragment that scatters the eight floats held in ymm0 to the
 * addresses rcx, rcx+rsi, rcx+2*rsi, rcx+r13, rcx+4*rsi, rcx+r15,
 * rcx+2*r13 and rcx+r10 (the same address pattern the matching INPUT
 * macro reads from).
 *
 * The upper 128 bits are first copied to xmm2; after that every store is a
 * scalar vmovss of lane 0, with vpermilps $0x39 rotating the next element
 * into lane 0 between stores.
 *
 * Despite the BETA_NZ suffix, bli_sgemm_asm_16x6 also uses this macro on
 * its beta == 0 general-stride path.
 *
 * Overwrites xmm0, xmm1 and xmm2.
 */
#define SGEMM_OUTPUT_GS_BETA_NZ \
53
	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t" \
54
	"vmovss            %%xmm0, (%%rcx        )   \n\t" \
55
	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t" \
56
	"vmovss            %%xmm1, (%%rcx,%%rsi,1)   \n\t" \
57
	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t" \
58
	"vmovss            %%xmm0, (%%rcx,%%rsi,2)   \n\t" \
59
	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t" \
60
	"vmovss            %%xmm1, (%%rcx,%%r13  )   \n\t" \
61
	"vmovss            %%xmm2, (%%rcx,%%rsi,4)   \n\t" \
62
	"vpermilps  $0x39, %%xmm2,  %%xmm1           \n\t" \
63
	"vmovss            %%xmm1, (%%rcx,%%r15  )   \n\t" \
64
	"vpermilps  $0x39, %%xmm1,  %%xmm2           \n\t" \
65
	"vmovss            %%xmm2, (%%rcx,%%r13,2)   \n\t" \
66
	"vpermilps  $0x39, %%xmm2,  %%xmm1           \n\t" \
67
	"vmovss            %%xmm1, (%%rcx,%%r10  )   \n\t"
68
69
// bli_sgemm_asm_16x6: single-precision GEMM micro-kernel implemented as one
// block of AVX/FMA inline assembly.
//
// The kernel accumulates a 16x6 tile of A*B in ymm4..ymm15 (two 8-float
// vectors per tile column: ymm4/ymm5 for column 0, ymm6/ymm7 for column 1,
// and so on), multiplies every accumulator by *alpha, and then writes the
// tile to C:
//   - beta != 0: C := beta*C + alpha*A*B
//   - beta == 0: C := alpha*A*B, without reading C
// In both cases rs_c == 1 selects the contiguous-column path (unaligned
// 256-bit loads/stores); any other rs_c uses the general-stride
// gather/scatter macros.
//
// Parameters:
//   k          number of rank-1 updates; processed as k/4 iterations of a
//              4x-unrolled loop followed by k%4 single-step iterations.
//   alpha      pointer to the scalar applied to A*B.
//   a          panel of A, advanced by 16 floats per k step. It is read
//              with vmovaps, so it must be 32-byte aligned.
//   b          panel of B, advanced by 6 floats per k step; each value is
//              broadcast across a vector.
//   beta       pointer to the scalar applied to the existing C.
//   c          address of the output tile.
//   rs_c,cs_c  row and column strides of C, in elements (the asm scales
//              both by 4 bytes).
//   aux        not referenced anywhere in this function.
//
// Note: each loop ends by pre-loading the next 16 floats of a, and the
// prologue pre-loads the first 16, so the kernel always reads 16 floats
// beyond the k*16 it actually uses.
void bli_sgemm_asm_16x6
70
(
71
  dim_t               k,
72
  float*     restrict alpha,
73
  float*     restrict a,
74
  float*     restrict b,
75
  float*     restrict beta,
76
  float*     restrict c, inc_t rs_c, inc_t cs_c,
77
  //auxinfo_t* restrict data,
78
  //cntx_t*    restrict cntx
79
  aux_s<float, float, float, float> *aux
80
)
81
{
82
	//void*   a_next = bli_auxinfo_next_a( data );
83
	//void*   b_next = bli_auxinfo_next_b( data );
84
85
	dim_t   k_iter = k / 4;
86
	dim_t   k_left = k % 4;
87
88
	__asm__ volatile
89
	(
90
	"                                            \n\t"
91
	"vzeroall                                    \n\t" // zero all xmm/ymm registers.
92
	"                                            \n\t"
93
	"                                            \n\t"
94
	"movq                %2, %%rax               \n\t" // load address of a.
95
	"movq                %3, %%rbx               \n\t" // load address of b.
96
	//"movq                %9, %%r15               \n\t" // load address of b_next.
97
	"                                            \n\t"
98
	"addq           $32 * 4, %%rax               \n\t" // bias rax by 128 bytes; the loads of a use offsets -4*32 .. 3*32.
99
	"                                            \n\t" // initialize loop by pre-loading
100
	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
101
	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
102
	"                                            \n\t"
103
	"movq                %6, %%rcx               \n\t" // load address of c
104
	"movq                %8, %%rdi               \n\t" // load cs_c
105
	"leaq        (,%%rdi,4), %%rdi               \n\t" // cs_c *= sizeof(float)
106
	"                                            \n\t"
107
	"leaq   (%%rdi,%%rdi,2), %%r13               \n\t" // r13 = 3*cs_c;
108
	"leaq   (%%rcx,%%r13,1), %%rdx               \n\t" // rdx = c + 3*cs_c;
109
	"prefetcht0   7 * 8(%%rcx)                   \n\t" // prefetch c + 0*cs_c
110
	"prefetcht0   7 * 8(%%rcx,%%rdi)             \n\t" // prefetch c + 1*cs_c
111
	"prefetcht0   7 * 8(%%rcx,%%rdi,2)           \n\t" // prefetch c + 2*cs_c
112
	"prefetcht0   7 * 8(%%rdx)                   \n\t" // prefetch c + 3*cs_c
113
	"prefetcht0   7 * 8(%%rdx,%%rdi)             \n\t" // prefetch c + 4*cs_c
114
	"prefetcht0   7 * 8(%%rdx,%%rdi,2)           \n\t" // prefetch c + 5*cs_c
115
	"                                            \n\t"
116
	"                                            \n\t"
117
	"                                            \n\t"
118
	"                                            \n\t"
119
	"movq      %0, %%rsi                         \n\t" // i = k_iter;
120
	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
121
	"je     .SCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
122
	"                                            \n\t" // contains the k_left loop.
123
	"                                            \n\t"
124
	"                                            \n\t"
125
	".SLOOPKITER:                                \n\t" // MAIN LOOP
126
	"                                            \n\t"
127
	"                                            \n\t"
128
	"                                            \n\t" // iteration 0
129
	"prefetcht0  128 * 4(%%rax)                  \n\t"
130
	"                                            \n\t"
131
	"vbroadcastss       0 *  4(%%rbx), %%ymm2    \n\t"
132
	"vbroadcastss       1 *  4(%%rbx), %%ymm3    \n\t"
133
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
134
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
135
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
136
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
137
	"                                            \n\t"
138
	"vbroadcastss       2 *  4(%%rbx), %%ymm2    \n\t"
139
	"vbroadcastss       3 *  4(%%rbx), %%ymm3    \n\t"
140
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
141
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
142
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
143
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
144
	"                                            \n\t"
145
	"vbroadcastss       4 *  4(%%rbx), %%ymm2    \n\t"
146
	"vbroadcastss       5 *  4(%%rbx), %%ymm3    \n\t"
147
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
148
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
149
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
150
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
151
	"                                            \n\t"
152
	"vmovaps           -2 * 32(%%rax), %%ymm0    \n\t"
153
	"vmovaps           -1 * 32(%%rax), %%ymm1    \n\t"
154
	"                                            \n\t"
155
	"                                            \n\t" // iteration 1
156
	"vbroadcastss       6 *  4(%%rbx), %%ymm2    \n\t"
157
	"vbroadcastss       7 *  4(%%rbx), %%ymm3    \n\t"
158
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
159
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
160
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
161
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
162
	"                                            \n\t"
163
	"vbroadcastss       8 *  4(%%rbx), %%ymm2    \n\t"
164
	"vbroadcastss       9 *  4(%%rbx), %%ymm3    \n\t"
165
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
166
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
167
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
168
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
169
	"                                            \n\t"
170
	"vbroadcastss      10 *  4(%%rbx), %%ymm2    \n\t"
171
	"vbroadcastss      11 *  4(%%rbx), %%ymm3    \n\t"
172
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
173
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
174
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
175
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
176
	"                                            \n\t"
177
	"vmovaps            0 * 32(%%rax), %%ymm0    \n\t"
178
	"vmovaps            1 * 32(%%rax), %%ymm1    \n\t"
179
	"                                            \n\t"
180
	"                                            \n\t" // iteration 2
181
	"prefetcht0  152 * 4(%%rax)                  \n\t"
182
	"                                            \n\t"
183
	"vbroadcastss      12 *  4(%%rbx), %%ymm2    \n\t"
184
	"vbroadcastss      13 *  4(%%rbx), %%ymm3    \n\t"
185
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
186
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
187
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
188
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
189
	"                                            \n\t"
190
	"vbroadcastss      14 *  4(%%rbx), %%ymm2    \n\t"
191
	"vbroadcastss      15 *  4(%%rbx), %%ymm3    \n\t"
192
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
193
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
194
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
195
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
196
	"                                            \n\t"
197
	"vbroadcastss      16 *  4(%%rbx), %%ymm2    \n\t"
198
	"vbroadcastss      17 *  4(%%rbx), %%ymm3    \n\t"
199
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
200
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
201
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
202
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
203
	"                                            \n\t"
204
	"vmovaps            2 * 32(%%rax), %%ymm0    \n\t"
205
	"vmovaps            3 * 32(%%rax), %%ymm1    \n\t"
206
	"                                            \n\t"
207
	"                                            \n\t" // iteration 3
208
	"vbroadcastss      18 *  4(%%rbx), %%ymm2    \n\t"
209
	"vbroadcastss      19 *  4(%%rbx), %%ymm3    \n\t"
210
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
211
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
212
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
213
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
214
	"                                            \n\t"
215
	"vbroadcastss      20 *  4(%%rbx), %%ymm2    \n\t"
216
	"vbroadcastss      21 *  4(%%rbx), %%ymm3    \n\t"
217
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
218
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
219
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
220
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
221
	"                                            \n\t"
222
	"vbroadcastss      22 *  4(%%rbx), %%ymm2    \n\t"
223
	"vbroadcastss      23 *  4(%%rbx), %%ymm3    \n\t"
224
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
225
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
226
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
227
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
228
	"                                            \n\t"
229
	"addq          $4 * 16 * 4, %%rax            \n\t" // a += 4*16 (unroll x mr)
230
	"addq          $4 *  6 * 4, %%rbx            \n\t" // b += 4*6  (unroll x nr)
231
	"                                            \n\t"
232
	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
233
	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
234
	"                                            \n\t"
235
	"                                            \n\t"
236
	"decq   %%rsi                                \n\t" // i -= 1;
237
	"jne    .SLOOPKITER                          \n\t" // iterate again if i != 0.
238
	"                                            \n\t"
239
	"                                            \n\t"
240
	"                                            \n\t"
241
	"                                            \n\t"
242
	"                                            \n\t"
243
	"                                            \n\t"
244
	".SCONSIDKLEFT:                              \n\t"
245
	"                                            \n\t"
246
	"movq      %1, %%rsi                         \n\t" // i = k_left;
247
	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
248
	"je     .SPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
249
	"                                            \n\t" // else, we prepare to enter k_left loop.
250
	"                                            \n\t"
251
	"                                            \n\t"
252
	".SLOOPKLEFT:                                \n\t" // EDGE LOOP
253
	"                                            \n\t"
254
	"prefetcht0  128 * 4(%%rax)                  \n\t"
255
	"                                            \n\t"
256
	"vbroadcastss       0 *  4(%%rbx), %%ymm2    \n\t"
257
	"vbroadcastss       1 *  4(%%rbx), %%ymm3    \n\t"
258
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
259
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
260
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
261
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
262
	"                                            \n\t"
263
	"vbroadcastss       2 *  4(%%rbx), %%ymm2    \n\t"
264
	"vbroadcastss       3 *  4(%%rbx), %%ymm3    \n\t"
265
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
266
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
267
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
268
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
269
	"                                            \n\t"
270
	"vbroadcastss       4 *  4(%%rbx), %%ymm2    \n\t"
271
	"vbroadcastss       5 *  4(%%rbx), %%ymm3    \n\t"
272
	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
273
	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
274
	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
275
	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
276
	"                                            \n\t"
277
	"addq          $1 * 16 * 4, %%rax            \n\t" // a += 1*16 (unroll x mr)
278
	"addq          $1 *  6 * 4, %%rbx            \n\t" // b += 1*6  (unroll x nr)
279
	"                                            \n\t"
280
	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
281
	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
282
	"                                            \n\t"
283
	"                                            \n\t"
284
	"decq   %%rsi                                \n\t" // i -= 1;
285
	"jne    .SLOOPKLEFT                          \n\t" // iterate again if i != 0.
286
	"                                            \n\t"
287
	"                                            \n\t"
288
	"                                            \n\t"
289
	".SPOSTACCUM:                                \n\t"
290
	"                                            \n\t"
291
	"                                            \n\t"
292
	"                                            \n\t"
293
	"                                            \n\t"
294
	"movq         %4, %%rax                      \n\t" // load address of alpha
295
	"movq         %5, %%rbx                      \n\t" // load address of beta
296
	"vbroadcastss    (%%rax), %%ymm0             \n\t" // load alpha and duplicate
297
	"vbroadcastss    (%%rbx), %%ymm3             \n\t" // load beta and duplicate
298
	"                                            \n\t"
299
	"vmulps           %%ymm0,  %%ymm4,  %%ymm4   \n\t" // scale by alpha
300
	"vmulps           %%ymm0,  %%ymm5,  %%ymm5   \n\t"
301
	"vmulps           %%ymm0,  %%ymm6,  %%ymm6   \n\t"
302
	"vmulps           %%ymm0,  %%ymm7,  %%ymm7   \n\t"
303
	"vmulps           %%ymm0,  %%ymm8,  %%ymm8   \n\t"
304
	"vmulps           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
305
	"vmulps           %%ymm0,  %%ymm10, %%ymm10  \n\t"
306
	"vmulps           %%ymm0,  %%ymm11, %%ymm11  \n\t"
307
	"vmulps           %%ymm0,  %%ymm12, %%ymm12  \n\t"
308
	"vmulps           %%ymm0,  %%ymm13, %%ymm13  \n\t"
309
	"vmulps           %%ymm0,  %%ymm14, %%ymm14  \n\t"
310
	"vmulps           %%ymm0,  %%ymm15, %%ymm15  \n\t"
311
	"                                            \n\t"
312
	"                                            \n\t"
313
	"                                            \n\t"
314
	"                                            \n\t"
315
	"                                            \n\t"
316
	"                                            \n\t"
317
	"movq                %7, %%rsi               \n\t" // load rs_c
318
	"leaq        (,%%rsi,4), %%rsi               \n\t" // rsi = rs_c * sizeof(float)
319
	"                                            \n\t"
320
	"leaq   (%%rcx,%%rsi,8), %%rdx               \n\t" // load address of c +  8*rs_c;
321
	"                                            \n\t"
322
	"leaq   (%%rsi,%%rsi,2), %%r13               \n\t" // r13 = 3*rs_c;
323
	"leaq   (%%rsi,%%rsi,4), %%r15               \n\t" // r15 = 5*rs_c;
324
	"leaq   (%%r13,%%rsi,4), %%r10               \n\t" // r10 = 7*rs_c;
325
	"                                            \n\t"
326
	"                                            \n\t"
327
	"                                            \n\t" // now avoid loading C if beta == 0
328
	"                                            \n\t"
329
	"vxorps    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
330
	"vucomiss  %%xmm0,  %%xmm3                   \n\t" // set ZF if beta == 0.
331
	"je      .SBETAZERO                          \n\t" // if ZF = 1, jump to beta == 0 case
332
	"                                            \n\t"
333
	"                                            \n\t"
334
	"cmpq       $4, %%rsi                        \n\t" // set ZF if (4*rs_c) == 4.
335
	"jz      .SCOLSTORED                         \n\t" // jump to column storage case
336
	"                                            \n\t"
337
	"                                            \n\t"
338
	"                                            \n\t"
339
	".SGENSTORED:                                \n\t"
340
	"                                            \n\t"
341
	"                                            \n\t"
342
	SGEMM_INPUT_GS_BETA_NZ
343
	"vfmadd213ps      %%ymm4,  %%ymm3,  %%ymm0   \n\t"
344
	SGEMM_OUTPUT_GS_BETA_NZ
345
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
346
	"                                            \n\t"
347
	"                                            \n\t"
348
	SGEMM_INPUT_GS_BETA_NZ
349
	"vfmadd213ps      %%ymm6,  %%ymm3,  %%ymm0   \n\t"
350
	SGEMM_OUTPUT_GS_BETA_NZ
351
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
352
	"                                            \n\t"
353
	"                                            \n\t"
354
	SGEMM_INPUT_GS_BETA_NZ
355
	"vfmadd213ps      %%ymm8,  %%ymm3,  %%ymm0   \n\t"
356
	SGEMM_OUTPUT_GS_BETA_NZ
357
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
358
	"                                            \n\t"
359
	"                                            \n\t"
360
	SGEMM_INPUT_GS_BETA_NZ
361
	"vfmadd213ps      %%ymm10, %%ymm3,  %%ymm0   \n\t"
362
	SGEMM_OUTPUT_GS_BETA_NZ
363
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
364
	"                                            \n\t"
365
	"                                            \n\t"
366
	SGEMM_INPUT_GS_BETA_NZ
367
	"vfmadd213ps      %%ymm12, %%ymm3,  %%ymm0   \n\t"
368
	SGEMM_OUTPUT_GS_BETA_NZ
369
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
370
	"                                            \n\t"
371
	"                                            \n\t"
372
	SGEMM_INPUT_GS_BETA_NZ
373
	"vfmadd213ps      %%ymm14, %%ymm3,  %%ymm0   \n\t"
374
	SGEMM_OUTPUT_GS_BETA_NZ
375
	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
376
	"                                            \n\t"
377
	"                                            \n\t"
378
	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 8*rs_c
379
	"                                            \n\t"
380
	"                                            \n\t"
381
	SGEMM_INPUT_GS_BETA_NZ
382
	"vfmadd213ps      %%ymm5,  %%ymm3,  %%ymm0   \n\t"
383
	SGEMM_OUTPUT_GS_BETA_NZ
384
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
385
	"                                            \n\t"
386
	"                                            \n\t"
387
	SGEMM_INPUT_GS_BETA_NZ
388
	"vfmadd213ps      %%ymm7,  %%ymm3,  %%ymm0   \n\t"
389
	SGEMM_OUTPUT_GS_BETA_NZ
390
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
391
	"                                            \n\t"
392
	"                                            \n\t"
393
	SGEMM_INPUT_GS_BETA_NZ
394
	"vfmadd213ps      %%ymm9,  %%ymm3,  %%ymm0   \n\t"
395
	SGEMM_OUTPUT_GS_BETA_NZ
396
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
397
	"                                            \n\t"
398
	"                                            \n\t"
399
	SGEMM_INPUT_GS_BETA_NZ
400
	"vfmadd213ps      %%ymm11, %%ymm3,  %%ymm0   \n\t"
401
	SGEMM_OUTPUT_GS_BETA_NZ
402
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
403
	"                                            \n\t"
404
	"                                            \n\t"
405
	SGEMM_INPUT_GS_BETA_NZ
406
	"vfmadd213ps      %%ymm13, %%ymm3,  %%ymm0   \n\t"
407
	SGEMM_OUTPUT_GS_BETA_NZ
408
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
409
	"                                            \n\t"
410
	"                                            \n\t"
411
	SGEMM_INPUT_GS_BETA_NZ
412
	"vfmadd213ps      %%ymm15, %%ymm3,  %%ymm0   \n\t"
413
	SGEMM_OUTPUT_GS_BETA_NZ
414
	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
415
	"                                            \n\t"
416
	"                                            \n\t"
417
	"                                            \n\t"
418
	"jmp    .SDONE                               \n\t" // jump to end.
419
	"                                            \n\t"
420
	"                                            \n\t"
421
	"                                            \n\t"
422
	".SCOLSTORED:                                \n\t"
423
	"                                            \n\t"
424
	"                                            \n\t"
425
	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm4   \n\t"
426
	"vmovups          %%ymm4,  (%%rcx)           \n\t"
427
	"addq      %%rdi, %%rcx                      \n\t"
428
	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm5   \n\t"
429
	"vmovups          %%ymm5,  (%%rdx)           \n\t"
430
	"addq      %%rdi, %%rdx                      \n\t"
431
	"                                            \n\t"
432
	"                                            \n\t"
433
	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm6   \n\t"
434
	"vmovups          %%ymm6,  (%%rcx)           \n\t"
435
	"addq      %%rdi, %%rcx                      \n\t"
436
	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm7   \n\t"
437
	"vmovups          %%ymm7,  (%%rdx)           \n\t"
438
	"addq      %%rdi, %%rdx                      \n\t"
439
	"                                            \n\t"
440
	"                                            \n\t"
441
	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm8   \n\t"
442
	"vmovups          %%ymm8,  (%%rcx)           \n\t"
443
	"addq      %%rdi, %%rcx                      \n\t"
444
	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm9   \n\t"
445
	"vmovups          %%ymm9,  (%%rdx)           \n\t"
446
	"addq      %%rdi, %%rdx                      \n\t"
447
	"                                            \n\t"
448
	"                                            \n\t"
449
	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm10  \n\t"
450
	"vmovups          %%ymm10, (%%rcx)           \n\t"
451
	"addq      %%rdi, %%rcx                      \n\t"
452
	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm11  \n\t"
453
	"vmovups          %%ymm11, (%%rdx)           \n\t"
454
	"addq      %%rdi, %%rdx                      \n\t"
455
	"                                            \n\t"
456
	"                                            \n\t"
457
	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm12  \n\t"
458
	"vmovups          %%ymm12, (%%rcx)           \n\t"
459
	"addq      %%rdi, %%rcx                      \n\t"
460
	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm13  \n\t"
461
	"vmovups          %%ymm13, (%%rdx)           \n\t"
462
	"addq      %%rdi, %%rdx                      \n\t"
463
	"                                            \n\t"
464
	"                                            \n\t"
465
	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm14  \n\t"
466
	"vmovups          %%ymm14, (%%rcx)           \n\t"
467
	//"addq      %%rdi, %%rcx                      \n\t"
468
	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm15  \n\t"
469
	"vmovups          %%ymm15, (%%rdx)           \n\t"
470
	//"addq      %%rdi, %%rdx                      \n\t"
471
	"                                            \n\t"
472
	"                                            \n\t"
473
	"                                            \n\t"
474
	"                                            \n\t"
475
	"jmp    .SDONE                               \n\t" // jump to end.
476
	"                                            \n\t"
477
	"                                            \n\t"
478
	"                                            \n\t"
479
	".SBETAZERO:                                 \n\t"
480
	"                                            \n\t"
481
	"cmpq       $4, %%rsi                        \n\t" // set ZF if (4*rs_c) == 4.
482
	"jz      .SCOLSTORBZ                         \n\t" // jump to column storage case
483
	"                                            \n\t"
484
	"                                            \n\t"
485
	"                                            \n\t"
486
	".SGENSTORBZ:                                \n\t"
487
	"                                            \n\t"
488
	"                                            \n\t"
489
	"vmovaps           %%ymm4,  %%ymm0           \n\t"
490
	SGEMM_OUTPUT_GS_BETA_NZ
491
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
492
	"                                            \n\t"
493
	"                                            \n\t"
494
	"vmovaps           %%ymm6,  %%ymm0           \n\t"
495
	SGEMM_OUTPUT_GS_BETA_NZ
496
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
497
	"                                            \n\t"
498
	"                                            \n\t"
499
	"vmovaps           %%ymm8,  %%ymm0           \n\t"
500
	SGEMM_OUTPUT_GS_BETA_NZ
501
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
502
	"                                            \n\t"
503
	"                                            \n\t"
504
	"vmovaps           %%ymm10, %%ymm0           \n\t"
505
	SGEMM_OUTPUT_GS_BETA_NZ
506
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
507
	"                                            \n\t"
508
	"                                            \n\t"
509
	"vmovaps           %%ymm12, %%ymm0           \n\t"
510
	SGEMM_OUTPUT_GS_BETA_NZ
511
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
512
	"                                            \n\t"
513
	"                                            \n\t"
514
	"vmovaps           %%ymm14, %%ymm0           \n\t"
515
	SGEMM_OUTPUT_GS_BETA_NZ
516
	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
517
	"                                            \n\t"
518
	"                                            \n\t"
519
	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 8*rs_c
520
	"                                            \n\t"
521
	"                                            \n\t"
522
	"vmovaps           %%ymm5,  %%ymm0           \n\t"
523
	SGEMM_OUTPUT_GS_BETA_NZ
524
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
525
	"                                            \n\t"
526
	"                                            \n\t"
527
	"vmovaps           %%ymm7,  %%ymm0           \n\t"
528
	SGEMM_OUTPUT_GS_BETA_NZ
529
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
530
	"                                            \n\t"
531
	"                                            \n\t"
532
	"vmovaps           %%ymm9,  %%ymm0           \n\t"
533
	SGEMM_OUTPUT_GS_BETA_NZ
534
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
535
	"                                            \n\t"
536
	"                                            \n\t"
537
	"vmovaps           %%ymm11, %%ymm0           \n\t"
538
	SGEMM_OUTPUT_GS_BETA_NZ
539
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
540
	"                                            \n\t"
541
	"                                            \n\t"
542
	"vmovaps           %%ymm13, %%ymm0           \n\t"
543
	SGEMM_OUTPUT_GS_BETA_NZ
544
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
545
	"                                            \n\t"
546
	"                                            \n\t"
547
	"vmovaps           %%ymm15, %%ymm0           \n\t"
548
	SGEMM_OUTPUT_GS_BETA_NZ
549
	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
550
	"                                            \n\t"
551
	"                                            \n\t"
552
	"                                            \n\t"
553
	"jmp    .SDONE                               \n\t" // jump to end.
554
	"                                            \n\t"
555
	"                                            \n\t"
556
	"                                            \n\t"
557
	".SCOLSTORBZ:                                \n\t"
558
	"                                            \n\t"
559
	"                                            \n\t"
560
	"vmovups          %%ymm4,  (%%rcx)           \n\t"
561
	"addq      %%rdi, %%rcx                      \n\t"
562
	"vmovups          %%ymm5,  (%%rdx)           \n\t"
563
	"addq      %%rdi, %%rdx                      \n\t"
564
	"                                            \n\t"
565
	"vmovups          %%ymm6,  (%%rcx)           \n\t"
566
	"addq      %%rdi, %%rcx                      \n\t"
567
	"vmovups          %%ymm7,  (%%rdx)           \n\t"
568
	"addq      %%rdi, %%rdx                      \n\t"
569
	"                                            \n\t"
570
	"                                            \n\t"
571
	"vmovups          %%ymm8,  (%%rcx)           \n\t"
572
	"addq      %%rdi, %%rcx                      \n\t"
573
	"vmovups          %%ymm9,  (%%rdx)           \n\t"
574
	"addq      %%rdi, %%rdx                      \n\t"
575
	"                                            \n\t"
576
	"                                            \n\t"
577
	"vmovups          %%ymm10, (%%rcx)           \n\t"
578
	"addq      %%rdi, %%rcx                      \n\t"
579
	"vmovups          %%ymm11, (%%rdx)           \n\t"
580
	"addq      %%rdi, %%rdx                      \n\t"
581
	"                                            \n\t"
582
	"                                            \n\t"
583
	"vmovups          %%ymm12, (%%rcx)           \n\t"
584
	"addq      %%rdi, %%rcx                      \n\t"
585
	"vmovups          %%ymm13, (%%rdx)           \n\t"
586
	"addq      %%rdi, %%rdx                      \n\t"
587
	"                                            \n\t"
588
	"                                            \n\t"
589
	"vmovups          %%ymm14, (%%rcx)           \n\t"
590
	//"addq      %%rdi, %%rcx                      \n\t"
591
	"vmovups          %%ymm15, (%%rdx)           \n\t"
592
	//"addq      %%rdi, %%rdx                      \n\t"
593
	"                                            \n\t"
594
	"                                            \n\t"
595
	"                                            \n\t"
596
	"                                            \n\t"
597
	"                                            \n\t"
598
	"                                            \n\t"
599
	"                                            \n\t"
600
	".SDONE:                                     \n\t"
601
    "                                            \n\t"
602
    "vzeroupper                                  \n\t"
603
	"                                            \n\t"
604
605
	: // output operands (none)
606
	: // input operands
607
	  "m" (k_iter), // 0
608
	  "m" (k_left), // 1
609
	  "m" (a),      // 2
610
	  "m" (b),      // 3
611
	  "m" (alpha),  // 4
612
	  "m" (beta),   // 5
613
	  "m" (c),      // 6
614
	  "m" (rs_c),   // 7
615
	  "m" (cs_c)/*,   // 8
616
	  "m" (b_next), // 9
617
	  "m" (a_next)*/  // 10
618
	: // register clobber list
619
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
620
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
621
	  "xmm0", "xmm1", "xmm2", "xmm3",
622
	  "xmm4", "xmm5", "xmm6", "xmm7",
623
	  "xmm8", "xmm9", "xmm10", "xmm11",
624
	  "xmm12", "xmm13", "xmm14", "xmm15",
625
	  "memory"
626
	);
627
}
628
629
// Asm fragment: gather four doubles from strided memory into ymm0.
//   element 0 <- (rcx)          element 1 <- (rcx + rsi)
//   element 2 <- (rcx + 2*rsi)  element 3 <- (rcx + r13)
// The first pair is assembled in xmm0 and the second pair in xmm1; vperm2f128
// with selector 0x20 then places the low lane of ymm1 into the high lane of
// ymm0. xmm1/ymm1 is overwritten as scratch.
// In bli_dgemm_asm_8x6 (the user of this macro in this file) rsi holds
// rs_c * 8 and r13 holds 3 * rsi when the macro is expanded.
// The trailing commented-out instructions (which reference r15 and r10) are
// not part of the expansion.
#define DGEMM_INPUT_GS_BETA_NZ \
630
	"vmovlpd    (%%rcx        ),  %%xmm0,  %%xmm0  \n\t" \
631
	"vmovhpd    (%%rcx,%%rsi,1),  %%xmm0,  %%xmm0  \n\t" \
632
	"vmovlpd    (%%rcx,%%rsi,2),  %%xmm1,  %%xmm1  \n\t" \
633
	"vmovhpd    (%%rcx,%%r13  ),  %%xmm1,  %%xmm1  \n\t" \
634
	"vperm2f128 $0x20,   %%ymm1,  %%ymm0,  %%ymm0  \n\t" /*\
635
	"vmovlps    (%%rcx,%%rsi,4),  %%xmm2,  %%xmm2  \n\t" \
636
	"vmovhps    (%%rcx,%%r15  ),  %%xmm2,  %%xmm2  \n\t" \
637
	"vmovlps    (%%rcx,%%r13,2),  %%xmm1,  %%xmm1  \n\t" \
638
	"vmovhps    (%%rcx,%%r10  ),  %%xmm1,  %%xmm1  \n\t" \
639
	"vperm2f128 $0x20,   %%ymm1,  %%ymm2,  %%ymm2  \n\t"*/
640
641
// Asm fragment: scatter the four doubles held in ymm0 to strided memory.
//   element 0 -> (rcx)          element 1 -> (rcx + rsi)
//   element 2 -> (rcx + 2*rsi)  element 3 -> (rcx + r13)
// The upper 128-bit lane of ymm0 is first extracted into xmm1, which is
// therefore overwritten as scratch.
// Despite the _BETA_NZ suffix, bli_dgemm_asm_8x6 also expands this macro on
// its beta == 0 general-stride path (.DGENSTORBZ).
// The trailing commented-out instructions (which reference r15 and r10) are
// not part of the expansion.
#define DGEMM_OUTPUT_GS_BETA_NZ \
642
	"vextractf128  $1, %%ymm0,  %%xmm1           \n\t" \
643
	"vmovlpd           %%xmm0,  (%%rcx        )  \n\t" \
644
	"vmovhpd           %%xmm0,  (%%rcx,%%rsi  )  \n\t" \
645
	"vmovlpd           %%xmm1,  (%%rcx,%%rsi,2)  \n\t" \
646
	"vmovhpd           %%xmm1,  (%%rcx,%%r13  )  \n\t" /*\
647
	"vextractf128  $1, %%ymm2,  %%xmm1           \n\t" \
648
	"vmovlpd           %%xmm2,  (%%rcx,%%rsi,4)  \n\t" \
649
	"vmovhpd           %%xmm2,  (%%rcx,%%r15  )  \n\t" \
650
	"vmovlpd           %%xmm1,  (%%rcx,%%r13,2)  \n\t" \
651
	"vmovhpd           %%xmm1,  (%%rcx,%%r10  )  \n\t"*/
652
653
// Double-precision GEMM micro-kernel written in AVX2/FMA inline assembly.
//
// Accumulates an 8x6 block of A*B in twelve ymm registers (ymm4..ymm15, four
// doubles each), scales it by alpha, and writes C := beta*C + alpha*A*B.
//
//   k      number of rank-1 updates to accumulate.
//   alpha  pointer to the scalar applied to the accumulated A*B product.
//   a      A operand; each k step reads 8 consecutive doubles. It is read with
//          vmovaps, which requires 32-byte-aligned addresses.
//   b      B operand; each k step reads 6 consecutive doubles (broadcast one
//          at a time).
//   beta   pointer to the scalar applied to C. If *beta compares equal to
//          zero, C is only written, never read.
//   c      output block; accessed with unaligned loads/stores.
//   rs_c   row stride of C in elements (scaled by 8 bytes inside the asm).
//   cs_c   column stride of C in elements (scaled by 8 bytes inside the asm).
//   aux    not referenced anywhere in this function body.
//
// rs_c == 1 selects the contiguous-column store paths; any other rs_c uses
// the element-wise gather/scatter macros DGEMM_{INPUT,OUTPUT}_GS_BETA_NZ.
//
// NOTE(review): all memory operands are fetched with movq, so dim_t and inc_t
// are presumably 64-bit types -- confirm against their declarations.
void bli_dgemm_asm_8x6
654
(
655
  dim_t               k,
656
  double*    restrict alpha,
657
  double*    restrict a,
658
  double*    restrict b,
659
  double*    restrict beta,
660
  double*    restrict c, inc_t rs_c, inc_t cs_c,
661
  //auxinfo_t* restrict data,
662
  //cntx_t*    restrict cntx
663
  aux_s<double, double, double, double> *aux
664
)
665
{
666
	//void*   a_next = bli_auxinfo_next_a( data );
667
	//void*   b_next = bli_auxinfo_next_b( data );
668
669
	// The k dimension is processed as k_iter passes of the 4x-unrolled main
	// loop followed by k_left single-step passes of the edge loop.
	dim_t   k_iter = k / 4;
670
	dim_t   k_left = k % 4;
671
672
	// Register roles inside the asm statement:
	//   rax  cursor into a (biased by +128 bytes so displacements are -128..96)
	//   rbx  cursor into b
	//   rcx  cursor into c
	//   rdx  second cursor into c (prefetch: c + 3*cs_c; store: c + 4*rs_c)
	//   rsi  loop counter, later rs_c * 8
	//   rdi  cs_c * 8
	//   r13  3 * cs_c * 8 during prefetch, later 3 * rs_c * 8
	//   ymm0/ymm1   current 8 elements of a (later: alpha / zero / scratch)
	//   ymm2/ymm3   broadcast elements of b (ymm3 later holds beta)
	//   ymm4..ymm15 accumulators; even registers hold rows 0-3, odd registers
	//               rows 4-7, one even/odd pair per column of the 8x6 block
	__asm__ volatile
673
	(
674
	"                                            \n\t"
675
	"vzeroall                                    \n\t" // zero all xmm/ymm registers.
676
	"                                            \n\t"
677
	"                                            \n\t"
678
	"movq                %2, %%rax               \n\t" // load address of a.
679
	"movq                %3, %%rbx               \n\t" // load address of b.
680
	//"movq                %9, %%r15               \n\t" // load address of b_next.
681
	"                                            \n\t"
682
	"addq           $32 * 4, %%rax               \n\t"
683
	"                                            \n\t" // initialize loop by pre-loading
684
	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
685
	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
686
	"                                            \n\t"
687
	"movq                %6, %%rcx               \n\t" // load address of c
688
	"movq                %8, %%rdi               \n\t" // load cs_c
689
	"leaq        (,%%rdi,8), %%rdi               \n\t" // cs_c *= sizeof(double)
690
	"                                            \n\t"
691
	"leaq   (%%rdi,%%rdi,2), %%r13               \n\t" // r13 = 3*cs_c;
692
	"leaq   (%%rcx,%%r13,1), %%rdx               \n\t" // rdx = c + 3*cs_c;
693
	"prefetcht0   7 * 8(%%rcx)                   \n\t" // prefetch c + 0*cs_c
694
	"prefetcht0   7 * 8(%%rcx,%%rdi)             \n\t" // prefetch c + 1*cs_c
695
	"prefetcht0   7 * 8(%%rcx,%%rdi,2)           \n\t" // prefetch c + 2*cs_c
696
	"prefetcht0   7 * 8(%%rdx)                   \n\t" // prefetch c + 3*cs_c
697
	"prefetcht0   7 * 8(%%rdx,%%rdi)             \n\t" // prefetch c + 4*cs_c
698
	"prefetcht0   7 * 8(%%rdx,%%rdi,2)           \n\t" // prefetch c + 5*cs_c
699
	"                                            \n\t"
700
	"                                            \n\t"
701
	"                                            \n\t"
702
	"                                            \n\t"
703
	"movq      %0, %%rsi                         \n\t" // i = k_iter;
704
	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
705
	"je     .DCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
706
	"                                            \n\t" // contains the k_left loop.
707
	"                                            \n\t"
708
	"                                            \n\t"
709
	// Each of the four unrolled iterations below performs one rank-1 update:
	// six broadcasts from b, twelve FMAs into ymm4..ymm15, then a reload of
	// ymm0/ymm1 with the next 8 elements of a.
	".DLOOPKITER:                                \n\t" // MAIN LOOP
710
	"                                            \n\t"
711
	"                                            \n\t"
712
	"                                            \n\t" // iteration 0
713
	"prefetcht0   64 * 8(%%rax)                  \n\t"
714
	"                                            \n\t"
715
	"vbroadcastsd       0 *  8(%%rbx), %%ymm2    \n\t"
716
	"vbroadcastsd       1 *  8(%%rbx), %%ymm3    \n\t"
717
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
718
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
719
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
720
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
721
	"                                            \n\t"
722
	"vbroadcastsd       2 *  8(%%rbx), %%ymm2    \n\t"
723
	"vbroadcastsd       3 *  8(%%rbx), %%ymm3    \n\t"
724
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
725
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
726
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
727
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
728
	"                                            \n\t"
729
	"vbroadcastsd       4 *  8(%%rbx), %%ymm2    \n\t"
730
	"vbroadcastsd       5 *  8(%%rbx), %%ymm3    \n\t"
731
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
732
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
733
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
734
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
735
	"                                            \n\t"
736
	"vmovaps           -2 * 32(%%rax), %%ymm0    \n\t"
737
	"vmovaps           -1 * 32(%%rax), %%ymm1    \n\t"
738
	"                                            \n\t"
739
	"                                            \n\t" // iteration 1
740
	"vbroadcastsd       6 *  8(%%rbx), %%ymm2    \n\t"
741
	"vbroadcastsd       7 *  8(%%rbx), %%ymm3    \n\t"
742
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
743
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
744
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
745
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
746
	"                                            \n\t"
747
	"vbroadcastsd       8 *  8(%%rbx), %%ymm2    \n\t"
748
	"vbroadcastsd       9 *  8(%%rbx), %%ymm3    \n\t"
749
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
750
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
751
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
752
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
753
	"                                            \n\t"
754
	"vbroadcastsd      10 *  8(%%rbx), %%ymm2    \n\t"
755
	"vbroadcastsd      11 *  8(%%rbx), %%ymm3    \n\t"
756
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
757
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
758
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
759
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
760
	"                                            \n\t"
761
	"vmovaps            0 * 32(%%rax), %%ymm0    \n\t"
762
	"vmovaps            1 * 32(%%rax), %%ymm1    \n\t"
763
	"                                            \n\t"
764
	"                                            \n\t" // iteration 2
765
	"prefetcht0   76 * 8(%%rax)                  \n\t"
766
	"                                            \n\t"
767
	"vbroadcastsd      12 *  8(%%rbx), %%ymm2    \n\t"
768
	"vbroadcastsd      13 *  8(%%rbx), %%ymm3    \n\t"
769
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
770
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
771
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
772
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
773
	"                                            \n\t"
774
	"vbroadcastsd      14 *  8(%%rbx), %%ymm2    \n\t"
775
	"vbroadcastsd      15 *  8(%%rbx), %%ymm3    \n\t"
776
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
777
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
778
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
779
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
780
	"                                            \n\t"
781
	"vbroadcastsd      16 *  8(%%rbx), %%ymm2    \n\t"
782
	"vbroadcastsd      17 *  8(%%rbx), %%ymm3    \n\t"
783
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
784
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
785
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
786
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
787
	"                                            \n\t"
788
	"vmovaps            2 * 32(%%rax), %%ymm0    \n\t"
789
	"vmovaps            3 * 32(%%rax), %%ymm1    \n\t"
790
	"                                            \n\t"
791
	"                                            \n\t" // iteration 3
792
	"vbroadcastsd      18 *  8(%%rbx), %%ymm2    \n\t"
793
	"vbroadcastsd      19 *  8(%%rbx), %%ymm3    \n\t"
794
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
795
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
796
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
797
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
798
	"                                            \n\t"
799
	"vbroadcastsd      20 *  8(%%rbx), %%ymm2    \n\t"
800
	"vbroadcastsd      21 *  8(%%rbx), %%ymm3    \n\t"
801
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
802
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
803
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
804
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
805
	"                                            \n\t"
806
	"vbroadcastsd      22 *  8(%%rbx), %%ymm2    \n\t"
807
	"vbroadcastsd      23 *  8(%%rbx), %%ymm3    \n\t"
808
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
809
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
810
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
811
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
812
	"                                            \n\t"
813
	"addq           $4 * 8 * 8, %%rax            \n\t" // a += 4*8 (unroll x mr)
814
	"addq           $4 * 6 * 8, %%rbx            \n\t" // b += 4*6 (unroll x nr)
815
	"                                            \n\t"
816
	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
817
	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
818
	"                                            \n\t"
819
	"                                            \n\t"
820
	"decq   %%rsi                                \n\t" // i -= 1;
821
	"jne    .DLOOPKITER                          \n\t" // iterate again if i != 0.
822
	"                                            \n\t"
823
	"                                            \n\t"
824
	"                                            \n\t"
825
	"                                            \n\t"
826
	"                                            \n\t"
827
	"                                            \n\t"
828
	".DCONSIDKLEFT:                              \n\t"
829
	"                                            \n\t"
830
	"movq      %1, %%rsi                         \n\t" // i = k_left;
831
	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
832
	"je     .DPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
833
	"                                            \n\t" // else, we prepare to enter k_left loop.
834
	"                                            \n\t"
835
	"                                            \n\t"
836
	".DLOOPKLEFT:                                \n\t" // EDGE LOOP
837
	"                                            \n\t"
838
	"prefetcht0   64 * 8(%%rax)                  \n\t"
839
	"                                            \n\t"
840
	"vbroadcastsd       0 *  8(%%rbx), %%ymm2    \n\t"
841
	"vbroadcastsd       1 *  8(%%rbx), %%ymm3    \n\t"
842
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
843
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
844
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
845
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
846
	"                                            \n\t"
847
	"vbroadcastsd       2 *  8(%%rbx), %%ymm2    \n\t"
848
	"vbroadcastsd       3 *  8(%%rbx), %%ymm3    \n\t"
849
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
850
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
851
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
852
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
853
	"                                            \n\t"
854
	"vbroadcastsd       4 *  8(%%rbx), %%ymm2    \n\t"
855
	"vbroadcastsd       5 *  8(%%rbx), %%ymm3    \n\t"
856
	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
857
	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
858
	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
859
	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
860
	"                                            \n\t"
861
	"addq           $1 * 8 * 8, %%rax            \n\t" // a += 1*8 (unroll x mr)
862
	"addq           $1 * 6 * 8, %%rbx            \n\t" // b += 1*6 (unroll x nr)
863
	"                                            \n\t"
864
	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
865
	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
866
	"                                            \n\t"
867
	"                                            \n\t"
868
	"decq   %%rsi                                \n\t" // i -= 1;
869
	"jne    .DLOOPKLEFT                          \n\t" // iterate again if i != 0.
870
	"                                            \n\t"
871
	"                                            \n\t"
872
	"                                            \n\t"
873
	".DPOSTACCUM:                                \n\t"
874
	"                                            \n\t"
875
	"                                            \n\t"
876
	"                                            \n\t"
877
	"                                            \n\t"
878
	"movq         %4, %%rax                      \n\t" // load address of alpha
879
	"movq         %5, %%rbx                      \n\t" // load address of beta
880
	"vbroadcastsd    (%%rax), %%ymm0             \n\t" // load alpha and duplicate
881
	"vbroadcastsd    (%%rbx), %%ymm3             \n\t" // load beta and duplicate
882
	"                                            \n\t"
883
	"vmulpd           %%ymm0,  %%ymm4,  %%ymm4   \n\t" // scale by alpha
884
	"vmulpd           %%ymm0,  %%ymm5,  %%ymm5   \n\t"
885
	"vmulpd           %%ymm0,  %%ymm6,  %%ymm6   \n\t"
886
	"vmulpd           %%ymm0,  %%ymm7,  %%ymm7   \n\t"
887
	"vmulpd           %%ymm0,  %%ymm8,  %%ymm8   \n\t"
888
	"vmulpd           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
889
	"vmulpd           %%ymm0,  %%ymm10, %%ymm10  \n\t"
890
	"vmulpd           %%ymm0,  %%ymm11, %%ymm11  \n\t"
891
	"vmulpd           %%ymm0,  %%ymm12, %%ymm12  \n\t"
892
	"vmulpd           %%ymm0,  %%ymm13, %%ymm13  \n\t"
893
	"vmulpd           %%ymm0,  %%ymm14, %%ymm14  \n\t"
894
	"vmulpd           %%ymm0,  %%ymm15, %%ymm15  \n\t"
895
	"                                            \n\t"
896
	"                                            \n\t"
897
	"                                            \n\t"
898
	"                                            \n\t"
899
	"                                            \n\t"
900
	"                                            \n\t"
901
	"movq                %7, %%rsi               \n\t" // load rs_c
902
	"leaq        (,%%rsi,8), %%rsi               \n\t" // rsi = rs_c * sizeof(double)
903
	"                                            \n\t"
904
	// rdx and r13 held cs_c-based values for the prefetches above; from here
	// on they are repurposed with rs_c-based values for the store phase.
	"leaq   (%%rcx,%%rsi,4), %%rdx               \n\t" // load address of c +  4*rs_c;
905
	"                                            \n\t"
906
	"leaq   (%%rsi,%%rsi,2), %%r13               \n\t" // r13 = 3*rs_c;
907
	//"leaq   (%%rsi,%%rsi,4), %%r15               \n\t" // r15 = 5*rs_c;
908
	//"leaq   (%%r13,%%rsi,4), %%r10               \n\t" // r10 = 7*rs_c;
909
	"                                            \n\t"
910
	"                                            \n\t"
911
	"                                            \n\t" // now avoid loading C if beta == 0
912
	"                                            \n\t"
913
	"vxorpd    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
914
	"vucomisd  %%xmm0,  %%xmm3                   \n\t" // set ZF if beta == 0.
915
	"je      .DBETAZERO                          \n\t" // if ZF = 1, jump to beta == 0 case
916
	"                                            \n\t"
917
	"                                            \n\t"
918
	"cmpq       $8, %%rsi                        \n\t" // set ZF if (8*rs_c) == 8.
919
	"jz      .DCOLSTORED                         \n\t" // jump to column storage case
920
	"                                            \n\t"
921
	"                                            \n\t"
922
	"                                            \n\t"
923
	// beta != 0, general row stride. For each column: gather four elements
	// of C into ymm0, compute ymm0 = beta*ymm0 + accumulator, scatter back.
	// Rows 0-3 (even accumulators) are done for all six columns first, then
	// rcx is reset to c + 4*rs_c and rows 4-7 (odd accumulators) follow.
	".DGENSTORED:                                \n\t"
924
	"                                            \n\t"
925
	"                                            \n\t"
926
	DGEMM_INPUT_GS_BETA_NZ
927
	"vfmadd213pd      %%ymm4,  %%ymm3,  %%ymm0   \n\t"
928
	DGEMM_OUTPUT_GS_BETA_NZ
929
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
930
	"                                            \n\t"
931
	"                                            \n\t"
932
	DGEMM_INPUT_GS_BETA_NZ
933
	"vfmadd213pd      %%ymm6,  %%ymm3,  %%ymm0   \n\t"
934
	DGEMM_OUTPUT_GS_BETA_NZ
935
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
936
	"                                            \n\t"
937
	"                                            \n\t"
938
	DGEMM_INPUT_GS_BETA_NZ
939
	"vfmadd213pd      %%ymm8,  %%ymm3,  %%ymm0   \n\t"
940
	DGEMM_OUTPUT_GS_BETA_NZ
941
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
942
	"                                            \n\t"
943
	"                                            \n\t"
944
	DGEMM_INPUT_GS_BETA_NZ
945
	"vfmadd213pd      %%ymm10, %%ymm3,  %%ymm0   \n\t"
946
	DGEMM_OUTPUT_GS_BETA_NZ
947
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
948
	"                                            \n\t"
949
	"                                            \n\t"
950
	DGEMM_INPUT_GS_BETA_NZ
951
	"vfmadd213pd      %%ymm12, %%ymm3,  %%ymm0   \n\t"
952
	DGEMM_OUTPUT_GS_BETA_NZ
953
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
954
	"                                            \n\t"
955
	"                                            \n\t"
956
	DGEMM_INPUT_GS_BETA_NZ
957
	"vfmadd213pd      %%ymm14, %%ymm3,  %%ymm0   \n\t"
958
	DGEMM_OUTPUT_GS_BETA_NZ
959
	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
960
	"                                            \n\t"
961
	"                                            \n\t"
962
	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 4*rs_c
963
	"                                            \n\t"
964
	"                                            \n\t"
965
	DGEMM_INPUT_GS_BETA_NZ
966
	"vfmadd213pd      %%ymm5,  %%ymm3,  %%ymm0   \n\t"
967
	DGEMM_OUTPUT_GS_BETA_NZ
968
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
969
	"                                            \n\t"
970
	"                                            \n\t"
971
	DGEMM_INPUT_GS_BETA_NZ
972
	"vfmadd213pd      %%ymm7,  %%ymm3,  %%ymm0   \n\t"
973
	DGEMM_OUTPUT_GS_BETA_NZ
974
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
975
	"                                            \n\t"
976
	"                                            \n\t"
977
	DGEMM_INPUT_GS_BETA_NZ
978
	"vfmadd213pd      %%ymm9,  %%ymm3,  %%ymm0   \n\t"
979
	DGEMM_OUTPUT_GS_BETA_NZ
980
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
981
	"                                            \n\t"
982
	"                                            \n\t"
983
	DGEMM_INPUT_GS_BETA_NZ
984
	"vfmadd213pd      %%ymm11, %%ymm3,  %%ymm0   \n\t"
985
	DGEMM_OUTPUT_GS_BETA_NZ
986
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
987
	"                                            \n\t"
988
	"                                            \n\t"
989
	DGEMM_INPUT_GS_BETA_NZ
990
	"vfmadd213pd      %%ymm13, %%ymm3,  %%ymm0   \n\t"
991
	DGEMM_OUTPUT_GS_BETA_NZ
992
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
993
	"                                            \n\t"
994
	"                                            \n\t"
995
	DGEMM_INPUT_GS_BETA_NZ
996
	"vfmadd213pd      %%ymm15, %%ymm3,  %%ymm0   \n\t"
997
	DGEMM_OUTPUT_GS_BETA_NZ
998
	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
999
	"                                            \n\t"
1000
	"                                            \n\t"
1001
	"                                            \n\t"
1002
	"jmp    .DDONE                               \n\t" // jump to end.
1003
	"                                            \n\t"
1004
	"                                            \n\t"
1005
	"                                            \n\t"
1006
	// beta != 0, rs_c == 1. Each accumulator is updated in place as
	// acc = beta * C + acc using a 32-byte memory operand, then stored.
	// rcx walks rows 0-3 and rdx (c + 4 elements) walks rows 4-7; both
	// advance by cs_c after every column.
	".DCOLSTORED:                                \n\t"
1007
	"                                            \n\t"
1008
	"                                            \n\t"
1009
	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm4    \n\t"
1010
	"vmovups          %%ymm4,  (%%rcx)           \n\t"
1011
	"addq      %%rdi, %%rcx                      \n\t"
1012
	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm5    \n\t"
1013
	"vmovups          %%ymm5,  (%%rdx)           \n\t"
1014
	"addq      %%rdi, %%rdx                      \n\t"
1015
	"                                            \n\t"
1016
	"                                            \n\t"
1017
	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm6    \n\t"
1018
	"vmovups          %%ymm6,  (%%rcx)           \n\t"
1019
	"addq      %%rdi, %%rcx                      \n\t"
1020
	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm7    \n\t"
1021
	"vmovups          %%ymm7,  (%%rdx)           \n\t"
1022
	"addq      %%rdi, %%rdx                      \n\t"
1023
	"                                            \n\t"
1024
	"                                            \n\t"
1025
	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm8    \n\t"
1026
	"vmovups          %%ymm8,  (%%rcx)           \n\t"
1027
	"addq      %%rdi, %%rcx                      \n\t"
1028
	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm9    \n\t"
1029
	"vmovups          %%ymm9,  (%%rdx)           \n\t"
1030
	"addq      %%rdi, %%rdx                      \n\t"
1031
	"                                            \n\t"
1032
	"                                            \n\t"
1033
	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm10   \n\t"
1034
	"vmovups          %%ymm10, (%%rcx)           \n\t"
1035
	"addq      %%rdi, %%rcx                      \n\t"
1036
	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm11   \n\t"
1037
	"vmovups          %%ymm11, (%%rdx)           \n\t"
1038
	"addq      %%rdi, %%rdx                      \n\t"
1039
	"                                            \n\t"
1040
	"                                            \n\t"
1041
	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm12   \n\t"
1042
	"vmovups          %%ymm12, (%%rcx)           \n\t"
1043
	"addq      %%rdi, %%rcx                      \n\t"
1044
	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm13   \n\t"
1045
	"vmovups          %%ymm13, (%%rdx)           \n\t"
1046
	"addq      %%rdi, %%rdx                      \n\t"
1047
	"                                            \n\t"
1048
	"                                            \n\t"
1049
	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm14   \n\t"
1050
	"vmovups          %%ymm14, (%%rcx)           \n\t"
1051
	//"addq      %%rdi, %%rcx                      \n\t"
1052
	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm15   \n\t"
1053
	"vmovups          %%ymm15, (%%rdx)           \n\t"
1054
	//"addq      %%rdi, %%rdx                      \n\t"
1055
	"                                            \n\t"
1056
	"                                            \n\t"
1057
	"                                            \n\t"
1058
	"jmp    .DDONE                               \n\t" // jump to end.
1059
	"                                            \n\t"
1060
	"                                            \n\t"
1061
	"                                            \n\t"
1062
	// beta == 0: C is overwritten with the alpha-scaled accumulators and is
	// never loaded.
	".DBETAZERO:                                 \n\t"
1063
	"                                            \n\t"
1064
	"cmpq       $8, %%rsi                        \n\t" // set ZF if (8*rs_c) == 8.
1065
	"jz      .DCOLSTORBZ                         \n\t" // jump to column storage case
1066
	"                                            \n\t"
1067
	"                                            \n\t"
1068
	"                                            \n\t"
1069
	// beta == 0, general row stride. Each accumulator is copied into ymm0
	// and scattered with DGEMM_OUTPUT_GS_BETA_NZ (the macro only stores, so
	// it is reused here despite its name). Same row/column order as
	// .DGENSTORED.
	".DGENSTORBZ:                                \n\t"
1070
	"                                            \n\t"
1071
	"                                            \n\t"
1072
	"vmovaps           %%ymm4,  %%ymm0           \n\t"
1073
	DGEMM_OUTPUT_GS_BETA_NZ
1074
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1075
	"                                            \n\t"
1076
	"                                            \n\t"
1077
	"vmovaps           %%ymm6,  %%ymm0           \n\t"
1078
	DGEMM_OUTPUT_GS_BETA_NZ
1079
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1080
	"                                            \n\t"
1081
	"                                            \n\t"
1082
	"vmovaps           %%ymm8,  %%ymm0           \n\t"
1083
	DGEMM_OUTPUT_GS_BETA_NZ
1084
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1085
	"                                            \n\t"
1086
	"                                            \n\t"
1087
	"vmovaps           %%ymm10, %%ymm0           \n\t"
1088
	DGEMM_OUTPUT_GS_BETA_NZ
1089
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1090
	"                                            \n\t"
1091
	"                                            \n\t"
1092
	"vmovaps           %%ymm12, %%ymm0           \n\t"
1093
	DGEMM_OUTPUT_GS_BETA_NZ
1094
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1095
	"                                            \n\t"
1096
	"                                            \n\t"
1097
	"vmovaps           %%ymm14, %%ymm0           \n\t"
1098
	DGEMM_OUTPUT_GS_BETA_NZ
1099
	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1100
	"                                            \n\t"
1101
	"                                            \n\t"
1102
	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 4*rs_c
1103
	"                                            \n\t"
1104
	"                                            \n\t"
1105
	"vmovaps           %%ymm5,  %%ymm0           \n\t"
1106
	DGEMM_OUTPUT_GS_BETA_NZ
1107
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1108
	"                                            \n\t"
1109
	"                                            \n\t"
1110
	"vmovaps           %%ymm7,  %%ymm0           \n\t"
1111
	DGEMM_OUTPUT_GS_BETA_NZ
1112
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1113
	"                                            \n\t"
1114
	"                                            \n\t"
1115
	"vmovaps           %%ymm9,  %%ymm0           \n\t"
1116
	DGEMM_OUTPUT_GS_BETA_NZ
1117
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1118
	"                                            \n\t"
1119
	"                                            \n\t"
1120
	"vmovaps           %%ymm11, %%ymm0           \n\t"
1121
	DGEMM_OUTPUT_GS_BETA_NZ
1122
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1123
	"                                            \n\t"
1124
	"                                            \n\t"
1125
	"vmovaps           %%ymm13, %%ymm0           \n\t"
1126
	DGEMM_OUTPUT_GS_BETA_NZ
1127
	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1128
	"                                            \n\t"
1129
	"                                            \n\t"
1130
	"vmovaps           %%ymm15, %%ymm0           \n\t"
1131
	DGEMM_OUTPUT_GS_BETA_NZ
1132
	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1133
	"                                            \n\t"
1134
	"                                            \n\t"
1135
	"                                            \n\t"
1136
	"jmp    .DDONE                               \n\t" // jump to end.
1137
	"                                            \n\t"
1138
	"                                            \n\t"
1139
	"                                            \n\t"
1140
	// beta == 0, rs_c == 1: plain unaligned 32-byte stores, rows 0-3 via
	// rcx and rows 4-7 via rdx, advancing both by cs_c per column.
	".DCOLSTORBZ:                                \n\t"
1141
	"                                            \n\t"
1142
	"                                            \n\t"
1143
	"vmovups          %%ymm4,  (%%rcx)           \n\t"
1144
	"addq      %%rdi, %%rcx                      \n\t"
1145
	"vmovups          %%ymm5,  (%%rdx)           \n\t"
1146
	"addq      %%rdi, %%rdx                      \n\t"
1147
	"                                            \n\t"
1148
	"vmovups          %%ymm6,  (%%rcx)           \n\t"
1149
	"addq      %%rdi, %%rcx                      \n\t"
1150
	"vmovups          %%ymm7,  (%%rdx)           \n\t"
1151
	"addq      %%rdi, %%rdx                      \n\t"
1152
	"                                            \n\t"
1153
	"                                            \n\t"
1154
	"vmovups          %%ymm8,  (%%rcx)           \n\t"
1155
	"addq      %%rdi, %%rcx                      \n\t"
1156
	"vmovups          %%ymm9,  (%%rdx)           \n\t"
1157
	"addq      %%rdi, %%rdx                      \n\t"
1158
	"                                            \n\t"
1159
	"                                            \n\t"
1160
	"vmovups          %%ymm10, (%%rcx)           \n\t"
1161
	"addq      %%rdi, %%rcx                      \n\t"
1162
	"vmovups          %%ymm11, (%%rdx)           \n\t"
1163
	"addq      %%rdi, %%rdx                      \n\t"
1164
	"                                            \n\t"
1165
	"                                            \n\t"
1166
	"vmovups          %%ymm12, (%%rcx)           \n\t"
1167
	"addq      %%rdi, %%rcx                      \n\t"
1168
	"vmovups          %%ymm13, (%%rdx)           \n\t"
1169
	"addq      %%rdi, %%rdx                      \n\t"
1170
	"                                            \n\t"
1171
	"                                            \n\t"
1172
	"vmovups          %%ymm14, (%%rcx)           \n\t"
1173
	//"addq      %%rdi, %%rcx                      \n\t"
1174
	"vmovups          %%ymm15, (%%rdx)           \n\t"
1175
	//"addq      %%rdi, %%rdx                      \n\t"
1176
	"                                            \n\t"
1177
	"                                            \n\t"
1178
	"                                            \n\t"
1179
	"                                            \n\t"
1180
	"                                            \n\t"
1181
	"                                            \n\t"
1182
	"                                            \n\t"
1183
	".DDONE:                                     \n\t"
1184
    "                                            \n\t"
1185
    "vzeroupper                                  \n\t"
1186
	"                                            \n\t"
1187
1188
	: // output operands (none)
1189
	: // input operands
1190
	  "m" (k_iter), // 0
1191
	  "m" (k_left), // 1
1192
	  "m" (a),      // 2
1193
	  "m" (b),      // 3
1194
	  "m" (alpha),  // 4
1195
	  "m" (beta),   // 5
1196
	  "m" (c),      // 6
1197
	  "m" (rs_c),   // 7
1198
	  "m" (cs_c)/*,   // 8
1199
	  "m" (b_next), // 9
1200
	  "m" (a_next)*/  // 10
1201
	: // register clobber list
1202
	  // Of r8..r15 only r13 is referenced by the active instructions above;
	  // the others appear solely in commented-out code or not at all and are
	  // listed conservatively.
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1203
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1204
	  "xmm0", "xmm1", "xmm2", "xmm3",
1205
	  "xmm4", "xmm5", "xmm6", "xmm7",
1206
	  "xmm8", "xmm9", "xmm10", "xmm11",
1207
	  "xmm12", "xmm13", "xmm14", "xmm15",
1208
	  "memory"
1209
	);
1210
}
1211
1212
1213
1214
1215
// General-stride load-and-scale step for single-precision complex C.
// Gathers four 8-byte elements from (rcx), (rcx+rsi), (rcx+2*rsi) and
// (rcx+r13) into ymm0 (low/high halves of xmm0 and xmm3, merged with
// vinsertf128), then forms, for each adjacent float pair (x, y):
//   even lane = x*ymm1 - y*ymm2,  odd lane = y*ymm1 + x*ymm2
// i.e. the complex product of the loaded element with beta.
// NOTE(review): r13 is presumably 3*rsi, as set up by the commented-out
// bli_cgemm_asm_8x3 kernel further down -- confirm against the caller.
// assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
1216
// outputs to ymm0; ymm3 is overwritten as scratch.
1217
#define CGEMM_INPUT_SCALE_GS_BETA_NZ \
1218
	"vmovlpd    (%%rcx        ),  %%xmm0,  %%xmm0  \n\t" \
1219
	"vmovhpd    (%%rcx,%%rsi,1),  %%xmm0,  %%xmm0  \n\t" \
1220
	"vmovlpd    (%%rcx,%%rsi,2),  %%xmm3,  %%xmm3  \n\t" \
1221
	"vmovhpd    (%%rcx,%%r13  ),  %%xmm3,  %%xmm3  \n\t" \
1222
	"vinsertf128     $1, %%xmm3,  %%ymm0,  %%ymm0  \n\t" \
1223
	"vpermilps    $0xb1, %%ymm0,  %%ymm3           \n\t" \
1224
	"vmulps              %%ymm1,  %%ymm0,  %%ymm0  \n\t" \
1225
	"vmulps              %%ymm2,  %%ymm3,  %%ymm3  \n\t" \
1226
	"vaddsubps           %%ymm3,  %%ymm0,  %%ymm0  \n\t"
1227
1228
// General-stride store step: scatters the four 8-byte elements held in ymm0
// to (rcx), (rcx+rsi), (rcx+2*rsi) and (rcx+r13), in that order.
// The upper 128 bits of ymm0 are first extracted into xmm3, so xmm3 is
// overwritten; ymm0 itself is left unchanged.
// assumes values to output are in ymm0
1229
#define CGEMM_OUTPUT_GS \
1230
	"vextractf128    $1, %%ymm0,  %%xmm3           \n\t" \
1231
	"vmovlpd             %%xmm0,  (%%rcx        )  \n\t" \
1232
	"vmovhpd             %%xmm0,  (%%rcx,%%rsi,1)  \n\t" \
1233
	"vmovlpd             %%xmm3,  (%%rcx,%%rsi,2)  \n\t" \
1234
	"vmovhpd             %%xmm3,  (%%rcx,%%r13  )  \n\t"
1235
1236
// Contiguous load-and-scale step: loads 32 bytes from (rcx) into ymm0 with an
// unaligned move, then forms, for each adjacent float pair (x, y):
//   even lane = x*ymm1 - y*ymm2,  odd lane = y*ymm1 + x*ymm2
// Result is left in ymm0; ymm3 is overwritten as scratch.
// NOTE(review): ymm1/ymm2 presumably hold broadcast beta.r/beta.i, as for the
// general-stride variant of this macro -- confirm against the caller.
#define CGEMM_INPUT_SCALE_CS_BETA_NZ \
1237
	"vmovups    (%%rcx),       %%ymm0            \n\t" \
1238
	"vpermilps $0xb1, %%ymm0,  %%ymm3            \n\t" \
1239
	"vmulps           %%ymm1,  %%ymm0,  %%ymm0   \n\t" \
1240
	"vmulps           %%ymm2,  %%ymm3,  %%ymm3   \n\t" \
1241
	"vaddsubps        %%ymm3,  %%ymm0,  %%ymm0   \n\t"
1242
1243
// Contiguous store step: writes the 32 bytes of ymm0 to (rcx) with an
// unaligned move.
// NOTE(review): the last line of this macro ends with a line-continuation
// backslash, so the next physical line is spliced into the macro body; that
// line must stay blank (or the trailing backslash should be dropped).
#define CGEMM_OUTPUT_CS \
1244
	"vmovups           %%ymm0,  (%%rcx)          \n\t" \
1245
1246
//void bli_cgemm_asm_8x3
1247
//     (
1248
//       dim_t               k,
1249
//       scomplex*  restrict alpha,
1250
//       scomplex*  restrict a,
1251
//       scomplex*  restrict b,
1252
//       scomplex*  restrict beta,
1253
//       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
1254
//       auxinfo_t* restrict data,
1255
//       cntx_t*    restrict cntx
1256
//     )
1257
//{
1258
//	//void*   a_next = bli_auxinfo_next_a( data );
1259
//	//void*   b_next = bli_auxinfo_next_b( data );
1260
//
1261
//	uint64_t   k_iter = k / 4;
1262
//	uint64_t   k_left = k % 4;
1263
//
1264
//	__asm__ volatile
1265
//	(
1266
//	"                                            \n\t"
1267
//	"vzeroall                                    \n\t" // zero all xmm/ymm registers.
1268
//	"                                            \n\t"
1269
//	"                                            \n\t"
1270
//	"movq                %2, %%rax               \n\t" // load address of a.
1271
//	"movq                %3, %%rbx               \n\t" // load address of b.
1272
//	//"movq                %9, %%r15               \n\t" // load address of b_next.
1273
//	"                                            \n\t"
1274
//	"addq           $32 * 4, %%rax               \n\t"
1275
//	"                                            \n\t" // initialize loop by pre-loading
1276
//	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
1277
//	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
1278
//	"                                            \n\t"
1279
//	"movq                %6, %%rcx               \n\t" // load address of c
1280
//	"movq                %8, %%rdi               \n\t" // load cs_c
1281
//	"leaq        (,%%rdi,8), %%rdi               \n\t" // cs_c *= sizeof(scomplex)
1282
//	"                                            \n\t"
1283
//	"leaq   (%%rcx,%%rdi,1), %%r11               \n\t" // r11 = c + 1*cs_c;
1284
//	"leaq   (%%rcx,%%rdi,2), %%r12               \n\t" // r12 = c + 2*cs_c;
1285
//	"                                            \n\t"
1286
//	"prefetcht0   7 * 8(%%rcx)                   \n\t" // prefetch c + 0*cs_c
1287
//	"prefetcht0   7 * 8(%%r11)                   \n\t" // prefetch c + 1*cs_c
1288
//	"prefetcht0   7 * 8(%%r12)                   \n\t" // prefetch c + 2*cs_c
1289
//	"                                            \n\t"
1290
//	"                                            \n\t"
1291
//	"                                            \n\t"
1292
//	"                                            \n\t"
1293
//	"movq      %0, %%rsi                         \n\t" // i = k_iter;
1294
//	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
1295
//	"je     .CCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
1296
//	"                                            \n\t" // contains the k_left loop.
1297
//	"                                            \n\t"
1298
//	"                                            \n\t"
1299
//	".CLOOPKITER:                                \n\t" // MAIN LOOP
1300
//	"                                            \n\t"
1301
//	"                                            \n\t"
1302
//	"                                            \n\t" // iteration 0
1303
//	"prefetcht0   32 * 8(%%rax)                  \n\t"
1304
//	"                                            \n\t"
1305
//	"vbroadcastss       0 *  4(%%rbx), %%ymm2    \n\t"
1306
//	"vbroadcastss       1 *  4(%%rbx), %%ymm3    \n\t"
1307
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
1308
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
1309
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
1310
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
1311
//	"                                            \n\t"
1312
//	"vbroadcastss       2 *  4(%%rbx), %%ymm2    \n\t"
1313
//	"vbroadcastss       3 *  4(%%rbx), %%ymm3    \n\t"
1314
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
1315
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
1316
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
1317
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
1318
//	"                                            \n\t"
1319
//	"vbroadcastss       4 *  4(%%rbx), %%ymm2    \n\t"
1320
//	"vbroadcastss       5 *  4(%%rbx), %%ymm3    \n\t"
1321
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
1322
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
1323
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
1324
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
1325
//	"                                            \n\t"
1326
//	"vmovaps           -2 * 32(%%rax), %%ymm0    \n\t"
1327
//	"vmovaps           -1 * 32(%%rax), %%ymm1    \n\t"
1328
//	"                                            \n\t"
1329
//	"                                            \n\t" // iteration 1
1330
//	"vbroadcastss       6 *  4(%%rbx), %%ymm2    \n\t"
1331
//	"vbroadcastss       7 *  4(%%rbx), %%ymm3    \n\t"
1332
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
1333
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
1334
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
1335
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
1336
//	"                                            \n\t"
1337
//	"vbroadcastss       8 *  4(%%rbx), %%ymm2    \n\t"
1338
//	"vbroadcastss       9 *  4(%%rbx), %%ymm3    \n\t"
1339
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
1340
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
1341
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
1342
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
1343
//	"                                            \n\t"
1344
//	"vbroadcastss      10 *  4(%%rbx), %%ymm2    \n\t"
1345
//	"vbroadcastss      11 *  4(%%rbx), %%ymm3    \n\t"
1346
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
1347
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
1348
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
1349
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
1350
//	"                                            \n\t"
1351
//	"vmovaps            0 * 32(%%rax), %%ymm0    \n\t"
1352
//	"vmovaps            1 * 32(%%rax), %%ymm1    \n\t"
1353
//	"                                            \n\t"
1354
//	"                                            \n\t" // iteration 2
1355
//	"prefetcht0   38 * 8(%%rax)                  \n\t"
1356
//	"                                            \n\t"
1357
//	"vbroadcastss      12 *  4(%%rbx), %%ymm2    \n\t"
1358
//	"vbroadcastss      13 *  4(%%rbx), %%ymm3    \n\t"
1359
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
1360
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
1361
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
1362
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
1363
//	"                                            \n\t"
1364
//	"vbroadcastss      14 *  4(%%rbx), %%ymm2    \n\t"
1365
//	"vbroadcastss      15 *  4(%%rbx), %%ymm3    \n\t"
1366
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
1367
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
1368
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
1369
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
1370
//	"                                            \n\t"
1371
//	"vbroadcastss      16 *  4(%%rbx), %%ymm2    \n\t"
1372
//	"vbroadcastss      17 *  4(%%rbx), %%ymm3    \n\t"
1373
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
1374
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
1375
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
1376
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
1377
//	"                                            \n\t"
1378
//	"vmovaps            2 * 32(%%rax), %%ymm0    \n\t"
1379
//	"vmovaps            3 * 32(%%rax), %%ymm1    \n\t"
1380
//	"                                            \n\t"
1381
//	"                                            \n\t" // iteration 3
1382
//	"vbroadcastss      18 *  4(%%rbx), %%ymm2    \n\t"
1383
//	"vbroadcastss      19 *  4(%%rbx), %%ymm3    \n\t"
1384
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
1385
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
1386
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
1387
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
1388
//	"                                            \n\t"
1389
//	"vbroadcastss      20 *  4(%%rbx), %%ymm2    \n\t"
1390
//	"vbroadcastss      21 *  4(%%rbx), %%ymm3    \n\t"
1391
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
1392
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
1393
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
1394
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
1395
//	"                                            \n\t"
1396
//	"vbroadcastss      22 *  4(%%rbx), %%ymm2    \n\t"
1397
//	"vbroadcastss      23 *  4(%%rbx), %%ymm3    \n\t"
1398
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
1399
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
1400
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
1401
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
1402
//	"                                            \n\t"
1403
//	"addq          $4 *  8 * 8, %%rax            \n\t" // a += 4*8  (unroll x mr)
1404
//	"addq          $4 *  3 * 8, %%rbx            \n\t" // b += 4*3  (unroll x nr)
1405
//	"                                            \n\t"
1406
//	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
1407
//	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
1408
//	"                                            \n\t"
1409
//	"                                            \n\t"
1410
//	"decq   %%rsi                                \n\t" // i -= 1;
1411
//	"jne    .CLOOPKITER                          \n\t" // iterate again if i != 0.
1412
//	"                                            \n\t"
1413
//	"                                            \n\t"
1414
//	"                                            \n\t"
1415
//	"                                            \n\t"
1416
//	"                                            \n\t"
1417
//	"                                            \n\t"
1418
//	".CCONSIDKLEFT:                              \n\t"
1419
//	"                                            \n\t"
1420
//	"movq      %1, %%rsi                         \n\t" // i = k_left;
1421
//	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
1422
//	"je     .CPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
1423
//	"                                            \n\t" // else, we prepare to enter k_left loop.
1424
//	"                                            \n\t"
1425
//	"                                            \n\t"
1426
//	".CLOOPKLEFT:                                \n\t" // EDGE LOOP
1427
//	"                                            \n\t"
1428
//	"prefetcht0   32 * 8(%%rax)                  \n\t"
1429
//	"                                            \n\t"
1430
//	"vbroadcastss       0 *  4(%%rbx), %%ymm2    \n\t"
1431
//	"vbroadcastss       1 *  4(%%rbx), %%ymm3    \n\t"
1432
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
1433
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
1434
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
1435
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
1436
//	"                                            \n\t"
1437
//	"vbroadcastss       2 *  4(%%rbx), %%ymm2    \n\t"
1438
//	"vbroadcastss       3 *  4(%%rbx), %%ymm3    \n\t"
1439
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
1440
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
1441
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
1442
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
1443
//	"                                            \n\t"
1444
//	"vbroadcastss       4 *  4(%%rbx), %%ymm2    \n\t"
1445
//	"vbroadcastss       5 *  4(%%rbx), %%ymm3    \n\t"
1446
//	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
1447
//	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
1448
//	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
1449
//	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
1450
//	"                                            \n\t"
1451
//	"addq          $1 *  8 * 8, %%rax            \n\t" // a += 1*8  (unroll x mr)
1452
//	"addq          $1 *  3 * 8, %%rbx            \n\t" // b += 1*3  (unroll x nr)
1453
//	"                                            \n\t"
1454
//	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
1455
//	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
1456
//	"                                            \n\t"
1457
//	"                                            \n\t"
1458
//	"decq   %%rsi                                \n\t" // i -= 1;
1459
//	"jne    .CLOOPKLEFT                          \n\t" // iterate again if i != 0.
1460
//	"                                            \n\t"
1461
//	"                                            \n\t"
1462
//	"                                            \n\t"
1463
//	".CPOSTACCUM:                                \n\t"
1464
//	"                                            \n\t"
1465
//	"                                            \n\t"
1466
//	"                                            \n\t" // permute even and odd elements
1467
//	"                                            \n\t" // of ymm6/7, ymm10/11, ymm14/15
1468
//	"vpermilps $0xb1, %%ymm6,  %%ymm6            \n\t"
1469
//	"vpermilps $0xb1, %%ymm7,  %%ymm7            \n\t"
1470
//	"vpermilps $0xb1, %%ymm10, %%ymm10           \n\t"
1471
//	"vpermilps $0xb1, %%ymm11, %%ymm11           \n\t"
1472
//	"vpermilps $0xb1, %%ymm14, %%ymm14           \n\t"
1473
//	"vpermilps $0xb1, %%ymm15, %%ymm15           \n\t"
1474
//	"                                            \n\t"
1475
//	"                                            \n\t"
1476
//	"                                            \n\t" // subtract/add even/odd elements
1477
//	"vaddsubps        %%ymm6,  %%ymm4,  %%ymm4   \n\t"
1478
//	"vaddsubps        %%ymm7,  %%ymm5,  %%ymm5   \n\t"
1479
//	"                                            \n\t"
1480
//	"vaddsubps        %%ymm10, %%ymm8,  %%ymm8   \n\t"
1481
//	"vaddsubps        %%ymm11, %%ymm9,  %%ymm9   \n\t"
1482
//	"                                            \n\t"
1483
//	"vaddsubps        %%ymm14, %%ymm12, %%ymm12  \n\t"
1484
//	"vaddsubps        %%ymm15, %%ymm13, %%ymm13  \n\t"
1485
//	"                                            \n\t"
1486
//	"                                            \n\t"
1487
//	"                                            \n\t"
1488
//	"                                            \n\t"
1489
//	"movq         %4, %%rax                      \n\t" // load address of alpha
1490
//	"vbroadcastss    (%%rax), %%ymm0             \n\t" // load alpha_r and duplicate
1491
//	"vbroadcastss   4(%%rax), %%ymm1             \n\t" // load alpha_i and duplicate
1492
//	"                                            \n\t"
1493
//	"                                            \n\t"
1494
//	"vpermilps $0xb1, %%ymm4,  %%ymm3            \n\t"
1495
//	"vmulps           %%ymm0,  %%ymm4,  %%ymm4   \n\t"
1496
//	"vmulps           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
1497
//	"vaddsubps        %%ymm3,  %%ymm4,  %%ymm4   \n\t"
1498
//	"                                            \n\t"
1499
//	"vpermilps $0xb1, %%ymm5,  %%ymm3            \n\t"
1500
//	"vmulps           %%ymm0,  %%ymm5,  %%ymm5   \n\t"
1501
//	"vmulps           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
1502
//	"vaddsubps        %%ymm3,  %%ymm5,  %%ymm5   \n\t"
1503
//	"                                            \n\t"
1504
//	"                                            \n\t"
1505
//	"vpermilps $0xb1, %%ymm8,  %%ymm3            \n\t"
1506
//	"vmulps           %%ymm0,  %%ymm8,  %%ymm8   \n\t"
1507
//	"vmulps           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
1508
//	"vaddsubps        %%ymm3,  %%ymm8,  %%ymm8   \n\t"
1509
//	"                                            \n\t"
1510
//	"vpermilps $0xb1, %%ymm9,  %%ymm3            \n\t"
1511
//	"vmulps           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
1512
//	"vmulps           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
1513
//	"vaddsubps        %%ymm3,  %%ymm9,  %%ymm9   \n\t"
1514
//	"                                            \n\t"
1515
//	"                                            \n\t"
1516
//	"vpermilps $0xb1, %%ymm12, %%ymm3            \n\t"
1517
//	"vmulps           %%ymm0,  %%ymm12, %%ymm12  \n\t"
1518
//	"vmulps           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
1519
//	"vaddsubps        %%ymm3,  %%ymm12, %%ymm12  \n\t"
1520
//	"                                            \n\t"
1521
//	"vpermilps $0xb1, %%ymm13, %%ymm3            \n\t"
1522
//	"vmulps           %%ymm0,  %%ymm13, %%ymm13  \n\t"
1523
//	"vmulps           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
1524
//	"vaddsubps        %%ymm3,  %%ymm13, %%ymm13  \n\t"
1525
//	"                                            \n\t"
1526
//	"                                            \n\t"
1527
//	"                                            \n\t"
1528
//	"                                            \n\t"
1529
//	"                                            \n\t"
1530
//	"movq         %5, %%rbx                      \n\t" // load address of beta
1531
//	"vbroadcastss    (%%rbx), %%ymm1             \n\t" // load beta_r and duplicate
1532
//	"vbroadcastss   4(%%rbx), %%ymm2             \n\t" // load beta_i and duplicate
1533
//	"                                            \n\t"
1534
//	"                                            \n\t"
1535
//	"                                            \n\t"
1536
//	"                                            \n\t"
1537
//	"movq                %7, %%rsi               \n\t" // load rs_c
1538
//	"leaq        (,%%rsi,8), %%rsi               \n\t" // rsi = rs_c * sizeof(scomplex)
1539
//	"leaq        (,%%rsi,4), %%rdx               \n\t" // rdx = 4*rs_c;
1540
//	"leaq   (%%rsi,%%rsi,2), %%r13               \n\t" // r13 = 3*rs_c;
1541
//	"                                            \n\t"
1542
//	"                                            \n\t"
1543
//	"                                            \n\t"
1544
//	"                                            \n\t" // now avoid loading C if beta == 0
1545
//	"vxorps    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
1546
//	"vucomiss  %%xmm0,  %%xmm1                   \n\t" // set ZF if beta_r == 0.
1547
//	"sete       %%r8b                            \n\t" // r8b = ( ZF == 1 ? 1 : 0 );
1548
//	"vucomiss  %%xmm0,  %%xmm2                   \n\t" // set ZF if beta_i == 0.
1549
//	"sete       %%r9b                            \n\t" // r9b = ( ZF == 1 ? 1 : 0 );
1550
//	"andb       %%r8b, %%r9b                     \n\t" // r9b &= r8b; ZF is cleared iff both flags are 1.
1551
//	"jne     .CBETAZERO                          \n\t" // if ZF = 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case
1552
//	"                                            \n\t"
1553
//	"                                            \n\t"
1554
//	"cmpq       $8, %%rsi                        \n\t" // set ZF if (8*rs_c) == 8.
1555
//	"jz      .CCOLSTORED                         \n\t" // jump to column storage case
1556
//	"                                            \n\t"
1557
//	"                                            \n\t"
1558
//	"                                            \n\t"
1559
//	".CGENSTORED:                                \n\t"
1560
//	"                                            \n\t"
1561
//	"                                            \n\t"
1562
//	CGEMM_INPUT_SCALE_GS_BETA_NZ
1563
//	"vaddps           %%ymm4,  %%ymm0,  %%ymm0   \n\t"
1564
//	CGEMM_OUTPUT_GS
1565
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1566
//	"                                            \n\t"
1567
//	"                                            \n\t"
1568
//	CGEMM_INPUT_SCALE_GS_BETA_NZ
1569
//	"vaddps           %%ymm5,  %%ymm0,  %%ymm0   \n\t"
1570
//	CGEMM_OUTPUT_GS
1571
//	"movq      %%r11, %%rcx                      \n\t" // rcx = c + 1*cs_c
1572
//	"                                            \n\t"
1573
//	"                                            \n\t"
1574
//	"                                            \n\t"
1575
//	CGEMM_INPUT_SCALE_GS_BETA_NZ
1576
//	"vaddps           %%ymm8,  %%ymm0,  %%ymm0   \n\t"
1577
//	CGEMM_OUTPUT_GS
1578
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1579
//	"                                            \n\t"
1580
//	"                                            \n\t"
1581
//	CGEMM_INPUT_SCALE_GS_BETA_NZ
1582
//	"vaddps           %%ymm9,  %%ymm0,  %%ymm0   \n\t"
1583
//	CGEMM_OUTPUT_GS
1584
//	"movq      %%r12, %%rcx                      \n\t" // rcx = c + 2*cs_c
1585
//	"                                            \n\t"
1586
//	"                                            \n\t"
1587
//	"                                            \n\t"
1588
//	CGEMM_INPUT_SCALE_GS_BETA_NZ
1589
//	"vaddps           %%ymm12, %%ymm0,  %%ymm0   \n\t"
1590
//	CGEMM_OUTPUT_GS
1591
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1592
//	"                                            \n\t"
1593
//	"                                            \n\t"
1594
//	CGEMM_INPUT_SCALE_GS_BETA_NZ
1595
//	"vaddps           %%ymm13, %%ymm0,  %%ymm0   \n\t"
1596
//	CGEMM_OUTPUT_GS
1597
//	"                                            \n\t"
1598
//	"                                            \n\t"
1599
//	"                                            \n\t"
1600
//	"jmp    .CDONE                               \n\t" // jump to end.
1601
//	"                                            \n\t"
1602
//	"                                            \n\t"
1603
//	"                                            \n\t"
1604
//	".CCOLSTORED:                                \n\t"
1605
//	"                                            \n\t"
1606
//	"                                            \n\t"
1607
//	CGEMM_INPUT_SCALE_CS_BETA_NZ
1608
//	"vaddps           %%ymm4,  %%ymm0,  %%ymm0   \n\t"
1609
//	CGEMM_OUTPUT_CS
1610
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1611
//	"                                            \n\t"
1612
//	"                                            \n\t"
1613
//	CGEMM_INPUT_SCALE_CS_BETA_NZ
1614
//	"vaddps           %%ymm5,  %%ymm0,  %%ymm0   \n\t"
1615
//	CGEMM_OUTPUT_CS
1616
//	"movq      %%r11, %%rcx                      \n\t" // rcx = c + 1*cs_c
1617
//	"                                            \n\t"
1618
//	"                                            \n\t"
1619
//	"                                            \n\t"
1620
//	CGEMM_INPUT_SCALE_CS_BETA_NZ
1621
//	"vaddps           %%ymm8,  %%ymm0,  %%ymm0   \n\t"
1622
//	CGEMM_OUTPUT_CS
1623
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1624
//	"                                            \n\t"
1625
//	"                                            \n\t"
1626
//	CGEMM_INPUT_SCALE_CS_BETA_NZ
1627
//	"vaddps           %%ymm9,  %%ymm0,  %%ymm0   \n\t"
1628
//	CGEMM_OUTPUT_CS
1629
//	"movq      %%r12, %%rcx                      \n\t" // rcx = c + 2*cs_c
1630
//	"                                            \n\t"
1631
//	"                                            \n\t"
1632
//	"                                            \n\t"
1633
//	CGEMM_INPUT_SCALE_CS_BETA_NZ
1634
//	"vaddps           %%ymm12, %%ymm0,  %%ymm0   \n\t"
1635
//	CGEMM_OUTPUT_CS
1636
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1637
//	"                                            \n\t"
1638
//	"                                            \n\t"
1639
//	CGEMM_INPUT_SCALE_CS_BETA_NZ
1640
//	"vaddps           %%ymm13, %%ymm0,  %%ymm0   \n\t"
1641
//	CGEMM_OUTPUT_CS
1642
//	"                                            \n\t"
1643
//	"                                            \n\t"
1644
//	"                                            \n\t"
1645
//	"jmp    .CDONE                               \n\t" // jump to end.
1646
//	"                                            \n\t"
1647
//	"                                            \n\t"
1648
//	"                                            \n\t"
1649
//	".CBETAZERO:                                 \n\t"
1650
//	"                                            \n\t"
1651
//	"cmpq       $8, %%rsi                        \n\t" // set ZF if (8*rs_c) == 8.
1652
//	"jz      .CCOLSTORBZ                         \n\t" // jump to column storage case
1653
//	"                                            \n\t"
1654
//	"                                            \n\t"
1655
//	"                                            \n\t"
1656
//	".CGENSTORBZ:                                \n\t"
1657
//	"                                            \n\t"
1658
//	"                                            \n\t"
1659
//	"vmovaps          %%ymm4,  %%ymm0            \n\t"
1660
//	CGEMM_OUTPUT_GS
1661
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1662
//	"                                            \n\t"
1663
//	"                                            \n\t"
1664
//	"vmovaps          %%ymm5,  %%ymm0            \n\t"
1665
//	CGEMM_OUTPUT_GS
1666
//	"movq      %%r11, %%rcx                      \n\t" // rcx = c + 1*cs_c
1667
//	"                                            \n\t"
1668
//	"                                            \n\t"
1669
//	"                                            \n\t"
1670
//	"vmovaps          %%ymm8,  %%ymm0            \n\t"
1671
//	CGEMM_OUTPUT_GS
1672
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1673
//	"                                            \n\t"
1674
//	"                                            \n\t"
1675
//	"vmovaps          %%ymm9,  %%ymm0            \n\t"
1676
//	CGEMM_OUTPUT_GS
1677
//	"movq      %%r12, %%rcx                      \n\t" // rcx = c + 2*cs_c
1678
//	"                                            \n\t"
1679
//	"                                            \n\t"
1680
//	"                                            \n\t"
1681
//	"vmovaps          %%ymm12, %%ymm0            \n\t"
1682
//	CGEMM_OUTPUT_GS
1683
//	"addq      %%rdx, %%rcx                      \n\t" // c += 4*rs_c;
1684
//	"                                            \n\t"
1685
//	"                                            \n\t"
1686
//	"vmovaps          %%ymm13, %%ymm0            \n\t"
1687
//	CGEMM_OUTPUT_GS
1688
//	"                                            \n\t"
1689
//	"                                            \n\t"
1690
//	"                                            \n\t"
1691
//	"jmp    .CDONE                               \n\t" // jump to end.
1692
//	"                                            \n\t"
1693
//	"                                            \n\t"
1694
//	"                                            \n\t"
1695
//	".CCOLSTORBZ:                                \n\t"
1696
//	"                                            \n\t"
1697
//	"                                            \n\t"
1698
//	"vmovups          %%ymm4,  (%%rcx)           \n\t"
1699
//	"vmovups          %%ymm5,  (%%rcx,%%rdx,1)   \n\t"
1700
//	"                                            \n\t"
1701
//	"vmovups          %%ymm8,  (%%r11)           \n\t"
1702
//	"vmovups          %%ymm9,  (%%r11,%%rdx,1)   \n\t"
1703
//	"                                            \n\t"
1704
//	"vmovups          %%ymm12, (%%r12)           \n\t"
1705
//	"vmovups          %%ymm13, (%%r12,%%rdx,1)   \n\t"
1706
//	"                                            \n\t"
1707
//	"                                            \n\t"
1708
//	"                                            \n\t"
1709
//	"                                            \n\t"
1710
//	"                                            \n\t"
1711
//	"                                            \n\t"
1712
//	".CDONE:                                     \n\t"
1713
//    "                                            \n\t"
1714
//    "vzeroupper                                  \n\t"
1715
//	"                                            \n\t"
1716
//
1717
//	: // output operands (none)
1718
//	: // input operands
1719
//	  "m" (k_iter), // 0
1720
//	  "m" (k_left), // 1
1721
//	  "m" (a),      // 2
1722
//	  "m" (b),      // 3
1723
//	  "m" (alpha),  // 4
1724
//	  "m" (beta),   // 5
1725
//	  "m" (c),      // 6
1726
//	  "m" (rs_c),   // 7
1727
//	  "m" (cs_c)/*,   // 8
1728
//	  "m" (b_next), // 9
1729
//	  "m" (a_next)*/  // 10
1730
//	: // register clobber list
1731
//	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1732
//	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1733
//	  "xmm0", "xmm1", "xmm2", "xmm3",
1734
//	  "xmm4", "xmm5", "xmm6", "xmm7",
1735
//	  "xmm8", "xmm9", "xmm10", "xmm11",
1736
//	  "xmm12", "xmm13", "xmm14", "xmm15",
1737
//	  "memory"
1738
//	);
1739
//}
1740
//
1741
//
1742
//
1743
//
1744
//// assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
1745
//// outputs to ymm0
1746
//#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \
1747
//	"vmovupd    (%%rcx),       %%xmm0            \n\t" \
1748
//	"vmovupd    (%%rcx,%%rsi), %%xmm3            \n\t" \
1749
//	"vinsertf128  $1, %%xmm3,  %%ymm0,  %%ymm0   \n\t" \
1750
//	"vpermilpd  $0x5, %%ymm0,  %%ymm3            \n\t" \
1751
//	"vmulpd           %%ymm1,  %%ymm0,  %%ymm0   \n\t" \
1752
//	"vmulpd           %%ymm2,  %%ymm3,  %%ymm3   \n\t" \
1753
//	"vaddsubpd        %%ymm3,  %%ymm0,  %%ymm0   \n\t"
1754
//
1755
//// assumes values to output are in ymm0
1756
//#define ZGEMM_OUTPUT_GS \
1757
//	"vextractf128  $1, %%ymm0,  %%xmm3           \n\t" \
1758
//	"vmovupd           %%xmm0,  (%%rcx)          \n\t" \
1759
//	"vmovupd           %%xmm3,  (%%rcx,%%rsi  )  \n\t" \
1760
//
1761
//#define ZGEMM_INPUT_SCALE_CS_BETA_NZ \
1762
//	"vmovups    (%%rcx),       %%ymm0            \n\t" \
1763
//	"vpermilpd  $0x5, %%ymm0,  %%ymm3            \n\t" \
1764
//	"vmulpd           %%ymm1,  %%ymm0,  %%ymm0   \n\t" \
1765
//	"vmulpd           %%ymm2,  %%ymm3,  %%ymm3   \n\t" \
1766
//	"vaddsubpd        %%ymm3,  %%ymm0,  %%ymm0   \n\t"
1767
//
1768
//#define ZGEMM_OUTPUT_CS \
1769
//	"vmovupd           %%ymm0,  (%%rcx)          \n\t" \
1770
//
1771
//void bli_zgemm_asm_4x3
1772
//     (
1773
//       dim_t               k,
1774
//       dcomplex*  restrict alpha,
1775
//       dcomplex*  restrict a,
1776
//       dcomplex*  restrict b,
1777
//       dcomplex*  restrict beta,
1778
//       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
1779
//       auxinfo_t* restrict data,
1780
//       cntx_t*    restrict cntx
1781
//     )
1782
//{
1783
//	//void*   a_next = bli_auxinfo_next_a( data );
1784
//	//void*   b_next = bli_auxinfo_next_b( data );
1785
//
1786
//    uint64_t   k_iter = k / 4;
1787
//    uint64_t   k_left = k % 4;
1788
//
1789
//	//uint64_t   alpha_is_unit = bli_zeq1( *alpha );
1790
//
1791
//
1792
//	__asm__ volatile
1793
//	(
1794
//	"                                            \n\t"
1795
//	"vzeroall                                    \n\t" // zero all xmm/ymm registers.
1796
//	"                                            \n\t"
1797
//	"                                            \n\t"
1798
//	"movq                %2, %%rax               \n\t" // load address of a.
1799
//	"movq                %3, %%rbx               \n\t" // load address of b.
1800
//	//"movq                %9, %%r15               \n\t" // load address of b_next.
1801
//	"                                            \n\t"
1802
//	"addq           $32 * 4, %%rax               \n\t"
1803
//	"                                            \n\t" // initialize loop by pre-loading
1804
//	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
1805
//	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
1806
//	"                                            \n\t"
1807
//	"movq                %6, %%rcx               \n\t" // load address of c
1808
//	"movq                %8, %%rdi               \n\t" // load cs_c
1809
//	"leaq        (,%%rdi,8), %%rdi               \n\t" // cs_c *= sizeof(dcomplex)
1810
//	"leaq        (,%%rdi,2), %%rdi               \n\t"
1811
//	"                                            \n\t"
1812
//	"leaq   (%%rcx,%%rdi,1), %%r11               \n\t" // r11 = c + 1*cs_c;
1813
//	"leaq   (%%rcx,%%rdi,2), %%r12               \n\t" // r12 = c + 2*cs_c;
1814
//	"                                            \n\t"
1815
//	"prefetcht0   7 * 8(%%rcx)                   \n\t" // prefetch c + 0*cs_c
1816
//	"prefetcht0   7 * 8(%%r11)                   \n\t" // prefetch c + 1*cs_c
1817
//	"prefetcht0   7 * 8(%%r12)                   \n\t" // prefetch c + 2*cs_c
1818
//	"                                            \n\t"
1819
//	"                                            \n\t"
1820
//	"                                            \n\t"
1821
//	"                                            \n\t"
1822
//	"movq      %0, %%rsi                         \n\t" // i = k_iter;
1823
//	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
1824
//	"je     .ZCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
1825
//	"                                            \n\t" // contains the k_left loop.
1826
//	"                                            \n\t"
1827
//	"                                            \n\t"
1828
//	".ZLOOPKITER:                                \n\t" // MAIN LOOP
1829
//	"                                            \n\t"
1830
//	"                                            \n\t"
1831
//	"                                            \n\t" // iteration 0
1832
//	"prefetcht0  32 * 16(%%rax)                  \n\t"
1833
//	"                                            \n\t"
1834
//	"vbroadcastsd       0 *  8(%%rbx), %%ymm2    \n\t"
1835
//	"vbroadcastsd       1 *  8(%%rbx), %%ymm3    \n\t"
1836
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
1837
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
1838
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
1839
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
1840
//	"                                            \n\t"
1841
//	"vbroadcastsd       2 *  8(%%rbx), %%ymm2    \n\t"
1842
//	"vbroadcastsd       3 *  8(%%rbx), %%ymm3    \n\t"
1843
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
1844
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
1845
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
1846
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
1847
//	"                                            \n\t"
1848
//	"vbroadcastsd       4 *  8(%%rbx), %%ymm2    \n\t"
1849
//	"vbroadcastsd       5 *  8(%%rbx), %%ymm3    \n\t"
1850
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
1851
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
1852
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
1853
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
1854
//	"                                            \n\t"
1855
//	"vmovaps           -2 * 32(%%rax), %%ymm0    \n\t"
1856
//	"vmovaps           -1 * 32(%%rax), %%ymm1    \n\t"
1857
//	"                                            \n\t"
1858
//	"                                            \n\t" // iteration 1
1859
//	"vbroadcastsd       6 *  8(%%rbx), %%ymm2    \n\t"
1860
//	"vbroadcastsd       7 *  8(%%rbx), %%ymm3    \n\t"
1861
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
1862
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
1863
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
1864
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
1865
//	"                                            \n\t"
1866
//	"vbroadcastsd       8 *  8(%%rbx), %%ymm2    \n\t"
1867
//	"vbroadcastsd       9 *  8(%%rbx), %%ymm3    \n\t"
1868
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
1869
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
1870
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
1871
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
1872
//	"                                            \n\t"
1873
//	"vbroadcastsd      10 *  8(%%rbx), %%ymm2    \n\t"
1874
//	"vbroadcastsd      11 *  8(%%rbx), %%ymm3    \n\t"
1875
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
1876
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
1877
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
1878
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
1879
//	"                                            \n\t"
1880
//	"vmovaps            0 * 32(%%rax), %%ymm0    \n\t"
1881
//	"vmovaps            1 * 32(%%rax), %%ymm1    \n\t"
1882
//	"                                            \n\t"
1883
//	"                                            \n\t" // iteration 2
1884
//	"prefetcht0  38 * 16(%%rax)                  \n\t"
1885
//	"                                            \n\t"
1886
//	"vbroadcastsd      12 *  8(%%rbx), %%ymm2    \n\t"
1887
//	"vbroadcastsd      13 *  8(%%rbx), %%ymm3    \n\t"
1888
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
1889
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
1890
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
1891
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
1892
//	"                                            \n\t"
1893
//	"vbroadcastsd      14 *  8(%%rbx), %%ymm2    \n\t"
1894
//	"vbroadcastsd      15 *  8(%%rbx), %%ymm3    \n\t"
1895
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
1896
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
1897
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
1898
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
1899
//	"                                            \n\t"
1900
//	"vbroadcastsd      16 *  8(%%rbx), %%ymm2    \n\t"
1901
//	"vbroadcastsd      17 *  8(%%rbx), %%ymm3    \n\t"
1902
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
1903
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
1904
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
1905
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
1906
//	"                                            \n\t"
1907
//	"vmovaps            2 * 32(%%rax), %%ymm0    \n\t"
1908
//	"vmovaps            3 * 32(%%rax), %%ymm1    \n\t"
1909
//	"                                            \n\t"
1910
//	"                                            \n\t" // iteration 3
1911
//	"vbroadcastsd      18 *  8(%%rbx), %%ymm2    \n\t"
1912
//	"vbroadcastsd      19 *  8(%%rbx), %%ymm3    \n\t"
1913
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
1914
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
1915
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
1916
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
1917
//	"                                            \n\t"
1918
//	"vbroadcastsd      20 *  8(%%rbx), %%ymm2    \n\t"
1919
//	"vbroadcastsd      21 *  8(%%rbx), %%ymm3    \n\t"
1920
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
1921
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
1922
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
1923
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
1924
//	"                                            \n\t"
1925
//	"vbroadcastsd      22 *  8(%%rbx), %%ymm2    \n\t"
1926
//	"vbroadcastsd      23 *  8(%%rbx), %%ymm3    \n\t"
1927
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
1928
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
1929
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
1930
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
1931
//	"                                            \n\t"
1932
//	"addq          $4 * 4 * 16, %%rax            \n\t" // a += 4*4 (unroll x mr)
1933
//	"addq          $4 * 3 * 16, %%rbx            \n\t" // b += 4*3 (unroll x nr)
1934
//	"                                            \n\t"
1935
//	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
1936
//	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
1937
//	"                                            \n\t"
1938
//	"                                            \n\t"
1939
//	"decq   %%rsi                                \n\t" // i -= 1;
1940
//	"jne    .ZLOOPKITER                          \n\t" // iterate again if i != 0.
1941
//	"                                            \n\t"
1942
//	"                                            \n\t"
1943
//	"                                            \n\t"
1944
//	"                                            \n\t"
1945
//	"                                            \n\t"
1946
//	"                                            \n\t"
1947
//	".ZCONSIDKLEFT:                              \n\t"
1948
//	"                                            \n\t"
1949
//	"movq      %1, %%rsi                         \n\t" // i = k_left;
1950
//	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
1951
//	"je     .ZPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
1952
//	"                                            \n\t" // else, we prepare to enter k_left loop.
1953
//	"                                            \n\t"
1954
//	"                                            \n\t"
1955
//	".ZLOOPKLEFT:                                \n\t" // EDGE LOOP
1956
//	"                                            \n\t"
1957
//	"prefetcht0  32 * 16(%%rax)                  \n\t"
1958
//	"                                            \n\t"
1959
//	"vbroadcastsd       0 *  8(%%rbx), %%ymm2    \n\t"
1960
//	"vbroadcastsd       1 *  8(%%rbx), %%ymm3    \n\t"
1961
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
1962
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
1963
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
1964
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
1965
//	"                                            \n\t"
1966
//	"vbroadcastsd       2 *  8(%%rbx), %%ymm2    \n\t"
1967
//	"vbroadcastsd       3 *  8(%%rbx), %%ymm3    \n\t"
1968
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
1969
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
1970
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
1971
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
1972
//	"                                            \n\t"
1973
//	"vbroadcastsd       4 *  8(%%rbx), %%ymm2    \n\t"
1974
//	"vbroadcastsd       5 *  8(%%rbx), %%ymm3    \n\t"
1975
//	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
1976
//	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
1977
//	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
1978
//	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
1979
//	"                                            \n\t"
1980
//	"addq          $1 * 4 * 16, %%rax            \n\t" // a += 1*4 (unroll x mr)
1981
//	"addq          $1 * 3 * 16, %%rbx            \n\t" // b += 1*3 (unroll x nr)
1982
//	"                                            \n\t"
1983
//	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
1984
//	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
1985
//	"                                            \n\t"
1986
//	"                                            \n\t"
1987
//	"decq   %%rsi                                \n\t" // i -= 1;
1988
//	"jne    .ZLOOPKLEFT                          \n\t" // iterate again if i != 0.
1989
//	"                                            \n\t"
1990
//	"                                            \n\t"
1991
//	"                                            \n\t"
1992
//	".ZPOSTACCUM:                                \n\t"
1993
//	"                                            \n\t"
1994
//	"                                            \n\t" // permute even and odd elements
1995
//	"                                            \n\t" // of ymm6/7, ymm10/11, ymm14/15
1996
//	"vpermilpd  $0x5, %%ymm6,  %%ymm6            \n\t"
1997
//	"vpermilpd  $0x5, %%ymm7,  %%ymm7            \n\t"
1998
//	"vpermilpd  $0x5, %%ymm10, %%ymm10           \n\t"
1999
//	"vpermilpd  $0x5, %%ymm11, %%ymm11           \n\t"
2000
//	"vpermilpd  $0x5, %%ymm14, %%ymm14           \n\t"
2001
//	"vpermilpd  $0x5, %%ymm15, %%ymm15           \n\t"
2002
//	"                                            \n\t"
2003
//	"                                            \n\t"
2004
//	"                                            \n\t" // subtract/add even/odd elements
2005
//	"vaddsubpd        %%ymm6,  %%ymm4,  %%ymm4   \n\t"
2006
//	"vaddsubpd        %%ymm7,  %%ymm5,  %%ymm5   \n\t"
2007
//	"                                            \n\t"
2008
//	"vaddsubpd        %%ymm10, %%ymm8,  %%ymm8   \n\t"
2009
//	"vaddsubpd        %%ymm11, %%ymm9,  %%ymm9   \n\t"
2010
//	"                                            \n\t"
2011
//	"vaddsubpd        %%ymm14, %%ymm12, %%ymm12  \n\t"
2012
//	"vaddsubpd        %%ymm15, %%ymm13, %%ymm13  \n\t"
2013
//	"                                            \n\t"
2014
//	"                                            \n\t"
2015
//	"                                            \n\t"
2016
//	"                                            \n\t"
2017
//	"movq         %4, %%rax                      \n\t" // load address of alpha
2018
//	"vbroadcastsd    (%%rax), %%ymm0             \n\t" // load alpha_r and duplicate
2019
//	"vbroadcastsd   8(%%rax), %%ymm1             \n\t" // load alpha_i and duplicate
2020
//	"                                            \n\t"
2021
//	"                                            \n\t"
2022
//	"vpermilpd  $0x5, %%ymm4,  %%ymm3            \n\t"
2023
//	"vmulpd           %%ymm0,  %%ymm4,  %%ymm4   \n\t"
2024
//	"vmulpd           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
2025
//	"vaddsubpd        %%ymm3,  %%ymm4,  %%ymm4   \n\t"
2026
//	"                                            \n\t"
2027
//	"vpermilpd  $0x5, %%ymm5,  %%ymm3            \n\t"
2028
//	"vmulpd           %%ymm0,  %%ymm5,  %%ymm5   \n\t"
2029
//	"vmulpd           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
2030
//	"vaddsubpd        %%ymm3,  %%ymm5,  %%ymm5   \n\t"
2031
//	"                                            \n\t"
2032
//	"                                            \n\t"
2033
//	"vpermilpd  $0x5, %%ymm8,  %%ymm3            \n\t"
2034
//	"vmulpd           %%ymm0,  %%ymm8,  %%ymm8   \n\t"
2035
//	"vmulpd           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
2036
//	"vaddsubpd        %%ymm3,  %%ymm8,  %%ymm8   \n\t"
2037
//	"                                            \n\t"
2038
//	"vpermilpd  $0x5, %%ymm9,  %%ymm3            \n\t"
2039
//	"vmulpd           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
2040
//	"vmulpd           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
2041
//	"vaddsubpd        %%ymm3,  %%ymm9,  %%ymm9   \n\t"
2042
//	"                                            \n\t"
2043
//	"                                            \n\t"
2044
//	"vpermilpd  $0x5, %%ymm12, %%ymm3            \n\t"
2045
//	"vmulpd           %%ymm0,  %%ymm12, %%ymm12  \n\t"
2046
//	"vmulpd           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
2047
//	"vaddsubpd        %%ymm3,  %%ymm12, %%ymm12  \n\t"
2048
//	"                                            \n\t"
2049
//	"vpermilpd  $0x5, %%ymm13, %%ymm3            \n\t"
2050
//	"vmulpd           %%ymm0,  %%ymm13, %%ymm13  \n\t"
2051
//	"vmulpd           %%ymm1,  %%ymm3,  %%ymm3   \n\t"
2052
//	"vaddsubpd        %%ymm3,  %%ymm13, %%ymm13  \n\t"
2053
//	"                                            \n\t"
2054
//	"                                            \n\t"
2055
//	"                                            \n\t"
2056
//	"                                            \n\t"
2057
//	"                                            \n\t"
2058
//	"movq         %5, %%rbx                      \n\t" // load address of beta
2059
//	"vbroadcastsd    (%%rbx), %%ymm1             \n\t" // load beta_r and duplicate
2060
//	"vbroadcastsd   8(%%rbx), %%ymm2             \n\t" // load beta_i and duplicate
2061
//	"                                            \n\t"
2062
//	"                                            \n\t"
2063
//	"                                            \n\t"
2064
//	"                                            \n\t"
2065
//	"movq                %7, %%rsi               \n\t" // load rs_c
2066
//	"leaq        (,%%rsi,8), %%rsi               \n\t" // rsi = rs_c * sizeof(dcomplex)
2067
//	"leaq        (,%%rsi,2), %%rsi               \n\t"
2068
//	"leaq        (,%%rsi,2), %%rdx               \n\t" // rdx = 2*rs_c;
2069
//	"                                            \n\t"
2070
//	"                                            \n\t"
2071
//	"                                            \n\t"
2072
//	"                                            \n\t" // now avoid loading C if beta == 0
2073
//	"vxorpd    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
2074
//	"vucomisd  %%xmm0,  %%xmm1                   \n\t" // set ZF if beta_r == 0.
2075
//	"sete       %%r8b                            \n\t" // r8b = ( ZF == 1 ? 1 : 0 );
2076
//	"vucomisd  %%xmm0,  %%xmm2                   \n\t" // set ZF if beta_i == 0.
2077
//	"sete       %%r9b                            \n\t" // r9b = ( ZF == 1 ? 1 : 0 );
2078
//	"andb       %%r8b, %%r9b                     \n\t" // clear ZF if r8b & r9b == 1.
2079
//	"jne     .ZBETAZERO                          \n\t" // if ZF = 0, jump to beta == 0 case
2080
//	"                                            \n\t"
2081
//	"                                            \n\t"
2082
//	"cmpq      $16, %%rsi                        \n\t" // set ZF if (16*rs_c) == 16.
2083
//	"jz      .ZCOLSTORED                         \n\t" // jump to column storage case
2084
//	"                                            \n\t"
2085
//	"                                            \n\t"
2086
//	"                                            \n\t"
2087
//	".ZGENSTORED:                                \n\t"
2088
//	"                                            \n\t"
2089
//	"                                            \n\t"
2090
//	ZGEMM_INPUT_SCALE_GS_BETA_NZ
2091
//	"vaddpd           %%ymm4,  %%ymm0,  %%ymm0   \n\t"
2092
//	ZGEMM_OUTPUT_GS
2093
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2094
//	"                                            \n\t"
2095
//	"                                            \n\t"
2096
//	ZGEMM_INPUT_SCALE_GS_BETA_NZ
2097
//	"vaddpd           %%ymm5,  %%ymm0,  %%ymm0   \n\t"
2098
//	ZGEMM_OUTPUT_GS
2099
//	"movq      %%r11, %%rcx                      \n\t" // rcx = c + 1*cs_c
2100
//	"                                            \n\t"
2101
//	"                                            \n\t"
2102
//	"                                            \n\t"
2103
//	ZGEMM_INPUT_SCALE_GS_BETA_NZ
2104
//	"vaddpd           %%ymm8,  %%ymm0,  %%ymm0   \n\t"
2105
//	ZGEMM_OUTPUT_GS
2106
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2107
//	"                                            \n\t"
2108
//	"                                            \n\t"
2109
//	ZGEMM_INPUT_SCALE_GS_BETA_NZ
2110
//	"vaddpd           %%ymm9,  %%ymm0,  %%ymm0   \n\t"
2111
//	ZGEMM_OUTPUT_GS
2112
//	"movq      %%r12, %%rcx                      \n\t" // rcx = c + 2*cs_c
2113
//	"                                            \n\t"
2114
//	"                                            \n\t"
2115
//	"                                            \n\t"
2116
//	ZGEMM_INPUT_SCALE_GS_BETA_NZ
2117
//	"vaddpd           %%ymm12, %%ymm0,  %%ymm0   \n\t"
2118
//	ZGEMM_OUTPUT_GS
2119
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2120
//	"                                            \n\t"
2121
//	"                                            \n\t"
2122
//	ZGEMM_INPUT_SCALE_GS_BETA_NZ
2123
//	"vaddpd           %%ymm13, %%ymm0,  %%ymm0   \n\t"
2124
//	ZGEMM_OUTPUT_GS
2125
//	"                                            \n\t"
2126
//	"                                            \n\t"
2127
//	"                                            \n\t"
2128
//	"jmp    .ZDONE                               \n\t" // jump to end.
2129
//	"                                            \n\t"
2130
//	"                                            \n\t"
2131
//	"                                            \n\t"
2132
//	".ZCOLSTORED:                                \n\t"
2133
//	"                                            \n\t"
2134
//	"                                            \n\t"
2135
//	ZGEMM_INPUT_SCALE_CS_BETA_NZ
2136
//	"vaddpd           %%ymm4,  %%ymm0,  %%ymm0   \n\t"
2137
//	ZGEMM_OUTPUT_CS
2138
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2139
//	"                                            \n\t"
2140
//	"                                            \n\t"
2141
//	ZGEMM_INPUT_SCALE_CS_BETA_NZ
2142
//	"vaddpd           %%ymm5,  %%ymm0,  %%ymm0   \n\t"
2143
//	ZGEMM_OUTPUT_CS
2144
//	"movq      %%r11, %%rcx                      \n\t" // rcx = c + 1*cs_c
2145
//	"                                            \n\t"
2146
//	"                                            \n\t"
2147
//	"                                            \n\t"
2148
//	ZGEMM_INPUT_SCALE_CS_BETA_NZ
2149
//	"vaddpd           %%ymm8,  %%ymm0,  %%ymm0   \n\t"
2150
//	ZGEMM_OUTPUT_CS
2151
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2152
//	"                                            \n\t"
2153
//	"                                            \n\t"
2154
//	ZGEMM_INPUT_SCALE_CS_BETA_NZ
2155
//	"vaddpd           %%ymm9,  %%ymm0,  %%ymm0   \n\t"
2156
//	ZGEMM_OUTPUT_CS
2157
//	"movq      %%r12, %%rcx                      \n\t" // rcx = c + 2*cs_c
2158
//	"                                            \n\t"
2159
//	"                                            \n\t"
2160
//	"                                            \n\t"
2161
//	ZGEMM_INPUT_SCALE_CS_BETA_NZ
2162
//	"vaddpd           %%ymm12, %%ymm0,  %%ymm0   \n\t"
2163
//	ZGEMM_OUTPUT_CS
2164
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2165
//	"                                            \n\t"
2166
//	"                                            \n\t"
2167
//	ZGEMM_INPUT_SCALE_CS_BETA_NZ
2168
//	"vaddpd           %%ymm13, %%ymm0,  %%ymm0   \n\t"
2169
//	ZGEMM_OUTPUT_CS
2170
//	"                                            \n\t"
2171
//	"                                            \n\t"
2172
//	"                                            \n\t"
2173
//	"jmp    .ZDONE                               \n\t" // jump to end.
2174
//	"                                            \n\t"
2175
//	"                                            \n\t"
2176
//	"                                            \n\t"
2177
//	".ZBETAZERO:                                 \n\t"
2178
//	"                                            \n\t"
2179
//	"cmpq      $16, %%rsi                        \n\t" // set ZF if (16*rs_c) == 16.
2180
//	"jz      .ZCOLSTORBZ                         \n\t" // jump to column storage case
2181
//	"                                            \n\t"
2182
//	"                                            \n\t"
2183
//	"                                            \n\t"
2184
//	".ZGENSTORBZ:                                \n\t"
2185
//	"                                            \n\t"
2186
//	"                                            \n\t"
2187
//	"vmovaps          %%ymm4,  %%ymm0            \n\t"
2188
//	ZGEMM_OUTPUT_GS
2189
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2190
//	"                                            \n\t"
2191
//	"                                            \n\t"
2192
//	"vmovaps          %%ymm5,  %%ymm0            \n\t"
2193
//	ZGEMM_OUTPUT_GS
2194
//	"movq      %%r11, %%rcx                      \n\t" // rcx = c + 1*cs_c
2195
//	"                                            \n\t"
2196
//	"                                            \n\t"
2197
//	"                                            \n\t"
2198
//	"vmovaps          %%ymm8,  %%ymm0            \n\t"
2199
//	ZGEMM_OUTPUT_GS
2200
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2201
//	"                                            \n\t"
2202
//	"                                            \n\t"
2203
//	"vmovaps          %%ymm9,  %%ymm0            \n\t"
2204
//	ZGEMM_OUTPUT_GS
2205
//	"movq      %%r12, %%rcx                      \n\t" // rcx = c + 2*cs_c
2206
//	"                                            \n\t"
2207
//	"                                            \n\t"
2208
//	"                                            \n\t"
2209
//	"vmovaps          %%ymm12, %%ymm0            \n\t"
2210
//	ZGEMM_OUTPUT_GS
2211
//	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
2212
//	"                                            \n\t"
2213
//	"                                            \n\t"
2214
//	"vmovaps          %%ymm13, %%ymm0            \n\t"
2215
//	ZGEMM_OUTPUT_GS
2216
//	"                                            \n\t"
2217
//	"                                            \n\t"
2218
//	"                                            \n\t"
2219
//	"jmp    .ZDONE                               \n\t" // jump to end.
2220
//	"                                            \n\t"
2221
//	"                                            \n\t"
2222
//	"                                            \n\t"
2223
//	".ZCOLSTORBZ:                                \n\t"
2224
//	"                                            \n\t"
2225
//	"                                            \n\t"
2226
//	"vmovups          %%ymm4,  (%%rcx)           \n\t"
2227
//	"vmovups          %%ymm5,  (%%rcx,%%rdx,1)   \n\t"
2228
//	"                                            \n\t"
2229
//	"vmovups          %%ymm8,  (%%r11)           \n\t"
2230
//	"vmovups          %%ymm9,  (%%r11,%%rdx,1)   \n\t"
2231
//	"                                            \n\t"
2232
//	"vmovups          %%ymm12, (%%r12)           \n\t"
2233
//	"vmovups          %%ymm13, (%%r12,%%rdx,1)   \n\t"
2234
//	"                                            \n\t"
2235
//	"                                            \n\t"
2236
//	"                                            \n\t"
2237
//	"                                            \n\t"
2238
//	"                                            \n\t"
2239
//	"                                            \n\t"
2240
//	".ZDONE:                                     \n\t"
2241
//    "                                            \n\t"
2242
//    "vzeroupper                                  \n\t"
2243
//	"                                            \n\t"
2244
//
2245
//	: // output operands (none)
2246
//	: // input operands
2247
//	  "m" (k_iter), // 0
2248
//	  "m" (k_left), // 1
2249
//	  "m" (a),      // 2
2250
//	  "m" (b),      // 3
2251
//	  "m" (alpha),  // 4
2252
//	  "m" (beta),   // 5
2253
//	  "m" (c),      // 6
2254
//	  "m" (rs_c),   // 7
2255
//	  "m" (cs_c)/*,   // 8
2256
//	  "m" (b_next), // 9
2257
//	  "m" (a_next)*/  // 10
2258
//	: // register clobber list
2259
//	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
2260
//	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
2261
//	  "xmm0", "xmm1", "xmm2", "xmm3",
2262
//	  "xmm4", "xmm5", "xmm6", "xmm7",
2263
//	  "xmm8", "xmm9", "xmm10", "xmm11",
2264
//	  "xmm12", "xmm13", "xmm14", "xmm15",
2265
//	  "memory"
2266
//	);
2267
//}
2268
//