HMLP: High-performance Machine Learning Primitives
rank_k_int_d8x4.hpp
/*
 * This file is modified and redistributed from
 *
 * BLIS
 * An object-based framework for developing high-performance BLAS-like
 * libraries.
 *
 * Copyright (C) 2014, The University of Texas at Austin
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the
 *   distribution.
 * - Neither the name of The University of Texas at Austin nor the names
 *   of its contributors may be used to endorse or promote products
 *   derived from this software without specific prior written
 *   permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * rank_k_int_d8x4.hpp
 *
 * Modifier:
 * Chenhan D. Yu - Department of Computer Science,
 * The University of Texas at Austin
 *
 *
 * Purpose:
 *
 *
 * Todo:
 *
 *
 * Modification:
 *
 * Chenhan
 * Feb 01, 2015: This file is extracted from bli_gemm_int_d8x4.c in
 * the Sandy-Bridge AVX micro-kernel directory of BLIS.
 * The double-precision rank-k update with a typical leading dimension
 * mc is kept in this file so that it can serve as a common segment in
 * most of the GSKS intrinsic kernels.
 *
 * */
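
/*
 * For reference: this fragment computes the 8x4 double-precision rank-k
 * update c += a * b, with a packed in column panels of 8 and b packed in
 * row panels of 4 ( see the pointer strides a += 8, b += 4 below ). A
 * minimal scalar sketch of the same computation, using a hypothetical
 * column-major 8x4 tile c that is not part of this kernel:
 *
 *   for ( p = 0; p < k; ++p )          // one rank-1 update per step
 *     for ( j = 0; j < 4; ++j )        // 4 columns of the micro-tile
 *       for ( i = 0; i < 8; ++i )      // 8 rows of the micro-tile
 *         c[ j * 8 + i ] += a[ p * 8 + i ] * b[ p * 4 + j ];
 *
 * The intrinsic code below is an unrolled, software-pipelined version of
 * this triple loop, keeping all 32 partial sums in eight ymm accumulators.
 */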


  int k_iter = k / 2;
  int k_left = k % 2;

  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( aux->b_next ) );


  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();
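
  // Note ( expository comment, not in the original kernel ): c03_r and
  // c47_r accumulate rows 0-3 and 4-7 of C against the r-th rotation of b.
  // Rather than broadcasting each element of b, the loops below rotate b
  // with in-lane shuffles and one cross-lane permute, so each accumulator
  // collects one "diagonal" of the 8x4 micro-tile; the blends and permutes
  // at the end of this file restore plain column order.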


  // Load a03
  a03.v = _mm256_load_pd( (double*)a );
  // Load a47
  a47.v = _mm256_load_pd( (double*)( a + 4 ) );
  // Load ( b0, b1, b2, b3 )
  b0.v = _mm256_load_pd( (double*)b );

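  // Main k-loop, unrolled by 2: the lowercase a03 / a47 / b0 and the
  // uppercase A03 / A47 / B0 registers ping-pong, so loads for the next
  // half-iteration are issued while the current one computes ( software
  // pipelining ).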
  for ( i = 0; i < k_iter; ++i ) {
    // Iteration #0
    __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"( a ) );

    // Preload A03
    A03.v = _mm256_load_pd( (double*)( a + 8 ) );

    c_tmp.v = _mm256_mul_pd( a03.v , b0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Preload A47
    A47.v = _mm256_load_pd( (double*)( a + 12 ) );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v , b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    // Preload B0
    B0.v = _mm256_load_pd( (double*)( b + 4 ) );

    c_tmp.v = _mm256_mul_pd( a03.v , b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v , b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );


    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"( a ) );

    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd( (double*)( a + 16 ) );

    c_tmp.v = _mm256_mul_pd( A03.v , B0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );

    // Shuffle B ( 1, 0, 3, 2 )
    b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v , B0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v , b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );

    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd( (double*)( a + 20 ) );

    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( A47.v , b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v , b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v , b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Load b0 ( next iteration )
    b0.v = _mm256_load_pd( (double*)( b + 8 ) );

    c_tmp.v = _mm256_mul_pd( A03.v , b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v , b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }
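
  // Cleanup loop: handle the remaining k % 2 rank-1 update with the same
  // rotation scheme, without unrolling or preloading.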
  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd( (double*)a );
    a47.v = _mm256_load_pd( (double*)( a + 4 ) );
    b0.v = _mm256_load_pd( (double*)b );

    c_tmp.v = _mm256_mul_pd( a03.v , b0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v , b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( a03.v , b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v , b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }


  // Prefetch aa and bb
  //__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) );
  //__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) );

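  // After the k-loop, lane l of c03_r holds C( l, j ) with a scrambled
  // column index j that depends on the rotation r ( expository comment,
  // not in the original kernel ). For the top 4x4 block:
  //
  //   c03_0 = ( C00, C11, C22, C33 )    c03_1 = ( C01, C10, C23, C32 )
  //   c03_2 = ( C03, C12, C21, C30 )    c03_3 = ( C02, C13, C20, C31 )
  //
  // The blends ( imm 0x6 ) swap the two middle lanes between rotation
  // pairs, and the 128-bit permutes ( imm 0x30 ) then exchange halves,
  // leaving c03_j ( and likewise c47_j ) holding column j in row order.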
  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );

  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );

  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );

  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );

  c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );

  c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );

  c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );

  c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );