HMLP: High-performance Machine Learning Primitives
exp_int_d8x4.hpp
1  // Inline vdExp()
2  const double log2e = 1.4426950408889634073599;
3  const double maxlog = 7.09782712893383996843e2; // log( 2**1024 )
4  const double minlog = -7.08396418532264106224e2; // log( 2**-1022 )
5  const double one = 1.0;
6  const double c1 = 6.93145751953125E-1;
7  const double c2 = 1.42860682030941723212E-6;
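
// A minimal scalar sketch of how the constants above are used for range
// reduction, exp( x ) = 2^k * exp( w ): maxlog / minlog clamp x so the result
// stays a finite double, log2e = 1 / ln2, and c1 + c2 is a two-word split of
// ln2 ( c1 holds the leading bits exactly, c2 the low-order remainder ).
// The helper name vdexp_reduce is illustrative only, not part of HMLP.
#if 0
static inline double vdexp_reduce( double x, int &k )
{
  if ( x > maxlog ) x = maxlog;          // clamp, as the vector code does below
  if ( x < minlog ) x = minlog;
  double a = x * log2e;                  // x / ln2
  if ( a < 0.0 ) a -= 1.0;               // bias so truncation acts as floor
  k  = (int)a;                           // k = floor( x / ln2 )
  x -= (double)k * c1;                   // subtract the high part of k * ln2
  x -= (double)k * c2;                   // subtract the low  part of k * ln2
  return x;                              // the reduced argument w in [ 0, ln2 ]
}
#endif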
8 
9  // Original Remez Order 11 coefficients
10  const double w11 = 3.5524625185478232665958141148891055719216674475023e-8;
11  const double w10 = 2.5535368519306500343384723775435166753084614063349e-7;
12  const double w9 = 2.77750562801295315877005242757916081614772210463065e-6;
13  const double w8 = 2.47868893393199945541176652007657202642495832996107e-5;
14  const double w7 = 1.98419213985637881240770890090795533564573406893163e-4;
15  const double w6 = 1.3888869684178659239014256260881685824525255547326e-3;
16  const double w5 = 8.3333337052009872221152811550156335074160546333973e-3;
17  const double w4 = 4.1666666621080810610346717440523105184720007971655e-2;
18  const double w3 = 0.166666666669960803484477734308515404418108830469798;
19  const double w2 = 0.499999999999877094481580370323249951329122224389189;
20  const double w1 = 1.0000000000000017952745258419615282194236357388884;
21  const double w0 = 0.99999999999999999566016490920259318691496540598896;
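
// A minimal scalar sketch of the polynomial these coefficients define; the
// vector code further below evaluates it in Horner form, keeping eight
// registers ( 32 lanes ) in flight to hide latency. The helper name remez11
// is illustrative only.
#if 0
static inline double remez11( double x )   // x is the reduced argument in [ 0, ln2 ]
{
  double a = w11;
  a = a * x + w10;  a = a * x + w9;  a = a * x + w8;
  a = a * x + w7;   a = a * x + w6;  a = a * x + w5;
  a = a * x + w4;   a = a * x + w3;  a = a * x + w2;
  a = a * x + w1;   a = a * x + w0;  // a ~= exp( x )
  return a;
}
#endif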
22 
23  // Remez Order 11 polynomial approximation ( alternative coefficients, commented out )
24  //const double w0 = 9.9999999999999999694541216787022234814339814028865e-1;
25  //const double w1 = 1.0000000000000013347525109964212249781265243645457;
26  //const double w2 = 4.9999999999990426011279542064313207349934058355357e-1;
27  //const double w3 = 1.6666666666933781279020916199156875162816850273886e-1;
28  //const double w4 = 4.1666666628388978913396218847247771982698350546174e-2;
29  //const double w5 = 8.3333336552944126722390410619859929515740995889372e-3;
30  //const double w6 = 1.3888871805082296012945081624687544823497126781709e-3;
31  //const double w7 = 1.9841863599469418342286677256362193951266072398489e-4;
32  //const double w8 = 2.4787899938611697691690479138150629377630767114546e-5;
33  //const double w9 = 2.7764095757136528235740765949934667970688427190168e-6;
34  //const double w10 = 2.5602485412126369546033948405199058329040797134573e-7;
35  //const double w11 = 3.5347283721656121939634391175390704621351283546671e-8;
36 
37 
38  v4df_t a03_0, a03_1, a03_2, a03_3;
39  v4df_t a47_0, a47_1, a47_2, a47_3;
40  v4df_t p03_0, p03_1, p03_2, p03_3;
41  v4df_t p47_0, p47_1, p47_2, p47_3;
42  v4df_t y, l2e, tmp, p;
43  v4li_t k03_0, k03_1, k03_2, k03_3;
44  v4li_t k47_0, k47_1, k47_2, k47_3;
45  v4li_t offset, mask0, mask1023;
46  v4li_t k1, k2, k3;
47  //__m128d p1, p2;
48 
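
// v4df_t and v4li_t are unions declared in avx_type.h that let the same
// register be viewed as packed data or as a scalar array. A sketch of the
// assumed layout, inferred from how the members are used in this file
// ( the authoritative definitions are in avx_type.h ):
#if 0
#include <immintrin.h>
typedef union { __m256d v; __m256i i; double d[ 4 ]; } v4df_t;   // 256-bit: 4 doubles
typedef union { __m128i v; int d[ 4 ]; } v4li_t;                  // 128-bit: 4 int32
#endif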
49  tmp.v = _mm256_broadcast_sd( &maxlog );
50  c03_0.v = _mm256_min_pd( tmp.v, c03_0.v );
51  c03_1.v = _mm256_min_pd( tmp.v, c03_1.v );
52  c03_2.v = _mm256_min_pd( tmp.v, c03_2.v );
53  c03_3.v = _mm256_min_pd( tmp.v, c03_3.v );
54  c47_0.v = _mm256_min_pd( tmp.v, c47_0.v );
55  c47_1.v = _mm256_min_pd( tmp.v, c47_1.v );
56  c47_2.v = _mm256_min_pd( tmp.v, c47_2.v );
57  c47_3.v = _mm256_min_pd( tmp.v, c47_3.v );
58 
59  tmp.v = _mm256_broadcast_sd( &minlog );
60  c03_0.v = _mm256_max_pd( tmp.v, c03_0.v );
61  c03_1.v = _mm256_max_pd( tmp.v, c03_1.v );
62  c03_2.v = _mm256_max_pd( tmp.v, c03_2.v );
63  c03_3.v = _mm256_max_pd( tmp.v, c03_3.v );
64  c47_0.v = _mm256_max_pd( tmp.v, c47_0.v );
65  c47_1.v = _mm256_max_pd( tmp.v, c47_1.v );
66  c47_2.v = _mm256_max_pd( tmp.v, c47_2.v );
67  c47_3.v = _mm256_max_pd( tmp.v, c47_3.v );
68 
69  // a = c * log2e = c / ln2
70  // c = a * ln2 = k * ln2 + w, ( w in [ 0, ln2 ] )
71  l2e.v = _mm256_broadcast_sd( &log2e );
72  a03_0.v = _mm256_mul_pd( l2e.v, c03_0.v );
73  a03_1.v = _mm256_mul_pd( l2e.v, c03_1.v );
74  a03_2.v = _mm256_mul_pd( l2e.v, c03_2.v );
75  a03_3.v = _mm256_mul_pd( l2e.v, c03_3.v );
76  a47_0.v = _mm256_mul_pd( l2e.v, c47_0.v );
77  a47_1.v = _mm256_mul_pd( l2e.v, c47_1.v );
78  a47_2.v = _mm256_mul_pd( l2e.v, c47_2.v );
79  a47_3.v = _mm256_mul_pd( l2e.v, c47_3.v );
80 
81  // Check if a < 0
82  tmp.v = _mm256_setzero_pd();
83  p03_0.v = _mm256_cmp_pd( a03_0.v, tmp.v, 1 );
84  p03_1.v = _mm256_cmp_pd( a03_1.v, tmp.v, 1 );
85  p03_2.v = _mm256_cmp_pd( a03_2.v, tmp.v, 1 );
86  p03_3.v = _mm256_cmp_pd( a03_3.v, tmp.v, 1 );
87  p47_0.v = _mm256_cmp_pd( a47_0.v, tmp.v, 1 );
88  p47_1.v = _mm256_cmp_pd( a47_1.v, tmp.v, 1 );
89  p47_2.v = _mm256_cmp_pd( a47_2.v, tmp.v, 1 );
90  p47_3.v = _mm256_cmp_pd( a47_3.v, tmp.v, 1 );
91  tmp.v = _mm256_broadcast_sd( &one );
92  p03_0.v = _mm256_and_pd( tmp.v, p03_0.v );
93  p03_1.v = _mm256_and_pd( tmp.v, p03_1.v );
94  p03_2.v = _mm256_and_pd( tmp.v, p03_2.v );
95  p03_3.v = _mm256_and_pd( tmp.v, p03_3.v );
96  p47_0.v = _mm256_and_pd( tmp.v, p47_0.v );
97  p47_1.v = _mm256_and_pd( tmp.v, p47_1.v );
98  p47_2.v = _mm256_and_pd( tmp.v, p47_2.v );
99  p47_3.v = _mm256_and_pd( tmp.v, p47_3.v );
100  // If a < 0, subtract 1 so that the truncation below rounds toward -inf: a - 1 = ( k - 1 ) + w / ln2
101  a03_0.v = _mm256_sub_pd( a03_0.v, p03_0.v );
102  a03_1.v = _mm256_sub_pd( a03_1.v, p03_1.v );
103  a03_2.v = _mm256_sub_pd( a03_2.v, p03_2.v );
104  a03_3.v = _mm256_sub_pd( a03_3.v, p03_3.v );
105  a47_0.v = _mm256_sub_pd( a47_0.v, p47_0.v );
106  a47_1.v = _mm256_sub_pd( a47_1.v, p47_1.v );
107  a47_2.v = _mm256_sub_pd( a47_2.v, p47_2.v );
108  a47_3.v = _mm256_sub_pd( a47_3.v, p47_3.v );
109 
110  // Compute floor( a ) by two conversions
111  // if a < 0, p = k - 1
112  // else,     p = k
113  k03_0.v = _mm256_cvttpd_epi32( a03_0.v );
114  k03_1.v = _mm256_cvttpd_epi32( a03_1.v );
115  k03_2.v = _mm256_cvttpd_epi32( a03_2.v );
116  k03_3.v = _mm256_cvttpd_epi32( a03_3.v );
117  k47_0.v = _mm256_cvttpd_epi32( a47_0.v );
118  k47_1.v = _mm256_cvttpd_epi32( a47_1.v );
119  k47_2.v = _mm256_cvttpd_epi32( a47_2.v );
120  k47_3.v = _mm256_cvttpd_epi32( a47_3.v );
121  p03_0.v = _mm256_cvtepi32_pd( k03_0.v );
122  p03_1.v = _mm256_cvtepi32_pd( k03_1.v );
123  p03_2.v = _mm256_cvtepi32_pd( k03_2.v );
124  p03_3.v = _mm256_cvtepi32_pd( k03_3.v );
125  p47_0.v = _mm256_cvtepi32_pd( k47_0.v );
126  p47_1.v = _mm256_cvtepi32_pd( k47_1.v );
127  p47_2.v = _mm256_cvtepi32_pd( k47_2.v );
128  p47_3.v = _mm256_cvtepi32_pd( k47_3.v );
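
// A scalar sketch of the floor() idiom used above: _mm256_cmp_pd with
// predicate 1 ( _CMP_LT_OS ) yields an all-ones mask where a < 0, AND-ing the
// mask with 1.0 turns it into 0.0 / 1.0, and subtracting that bias makes the
// truncating conversion round toward -inf. The helper name is illustrative.
#if 0
static inline double floor_via_trunc( double a )
{
  double bias = ( a < 0.0 ) ? 1.0 : 0.0;   // cmp + and with 1.0
  int    k    = (int)( a - bias );         // _mm256_cvttpd_epi32: truncate toward zero
  return (double)k;                        // _mm256_cvtepi32_pd; equals floor( a ), except
}                                          // at exact negative integers, where it is a - 1
#endif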
129 
130  // ---------------------
131  // x -= p * ln2
132  // ---------------------
133  // c1 + c2 = ln2 ( the subtraction below is split into two steps for accuracy )
134  // if a < 0, a = ( k - 1 ) * ln2
135  // else,     a = k * ln2
136  // if a < 0, x -= ( k - 1 ) * ln2
137  // else,     x -= k * ln2
138  //
139  tmp.v = _mm256_broadcast_sd( &c1 );
140  a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v );
141  a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v );
142  a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v );
143  a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v );
144  a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v );
145  a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v );
146  a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v );
147  a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v );
148  c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v );
149  c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v );
150  c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v );
151  c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v );
152  c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v );
153  c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v );
154  c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v );
155  c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v );
156  tmp.v = _mm256_broadcast_sd( &c2 );
157  a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v );
158  a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v );
159  a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v );
160  a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v );
161  a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v );
162  a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v );
163  a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v );
164  a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v );
165  c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v );
166  c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v );
167  c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v );
168  c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v );
169  c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v );
170  c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v );
171  c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v );
172  c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v );
173 
174  // Prefetch u
175  //__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) );
176 
177 
178 
179  // Compute e^x using polynomial approximation
180  // a = w10 + w11 * x
181  tmp.v = _mm256_broadcast_sd( &w11 );
182  //tmp.v = _mm256_broadcast_sd( &w9 );
183  a03_0.v = _mm256_mul_pd( c03_0.v, tmp.v );
184  a03_1.v = _mm256_mul_pd( c03_1.v, tmp.v );
185  a03_2.v = _mm256_mul_pd( c03_2.v, tmp.v );
186  a03_3.v = _mm256_mul_pd( c03_3.v, tmp.v );
187  a47_0.v = _mm256_mul_pd( c47_0.v, tmp.v );
188  a47_1.v = _mm256_mul_pd( c47_1.v, tmp.v );
189  a47_2.v = _mm256_mul_pd( c47_2.v, tmp.v );
190  a47_3.v = _mm256_mul_pd( c47_3.v, tmp.v );
191  tmp.v = _mm256_broadcast_sd( &w10 );
192  //tmp.v = _mm256_broadcast_sd( &w8 );
193  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
194  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
195  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
196  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
197  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
198  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
199  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
200  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
201 
202 
203  // a = w8 + ( w9 + ( w10 + w11 * x ) * x ) * x
204  tmp.v = _mm256_broadcast_sd( &w9 );
205  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
206  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
207  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
208  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
209  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
210  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
211  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
212  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
213  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
214  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
215  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
216  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
217  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
218  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
219  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
220  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
221  tmp.v = _mm256_broadcast_sd( &w8 );
222  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
223  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
224  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
225  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
226  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
227  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
228  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
229  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
230  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
231  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
232  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
233  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
234  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
235  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
236  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
237  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
238 
239 
240  tmp.v = _mm256_broadcast_sd( &w7 );
241  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
242  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
243  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
244  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
245  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
246  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
247  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
248  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
249  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
250  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
251  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
252  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
253  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
254  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
255  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
256  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
257  tmp.v = _mm256_broadcast_sd( &w6 );
258  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
259  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
260  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
261  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
262  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
263  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
264  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
265  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
266  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
267  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
268  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
269  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
270  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
271  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
272  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
273  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
274 
275 
276  tmp.v = _mm256_broadcast_sd( &w5 );
277  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
278  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
279  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
280  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
281  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
282  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
283  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
284  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
285  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
286  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
287  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
288  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
289  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
290  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
291  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
292  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
293  tmp.v = _mm256_broadcast_sd( &w4 );
294  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
295  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
296  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
297  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
298  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
299  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
300  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
301  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
302  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
303  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
304  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
305  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
306  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
307  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
308  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
309  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
310 
311 
312  // Prefetch w
313  //__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) );
314 
315  // Preload u03
316  //u03.v = _mm256_load_pd( (double*)u );
317 
318  tmp.v = _mm256_broadcast_sd( &w3 );
319  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
320  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
321  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
322  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
323  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
324  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
325  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
326  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
327  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
328  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
329  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
330  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
331  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
332  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
333  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
334  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
335  tmp.v = _mm256_broadcast_sd( &w2 );
336  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
337  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
338  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
339  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
340  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
341  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
342  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
343  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
344  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
345  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
346  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
347  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
348  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
349  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
350  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
351  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
352 
353 
354  tmp.v = _mm256_broadcast_sd( &w1 );
355  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
356  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
357  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
358  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
359  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
360  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
361  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
362  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
363  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
364  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
365  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
366  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
367  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
368  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
369  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
370  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
371  tmp.v = _mm256_broadcast_sd( &w0 );
372  a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
373  a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
374  a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
375  a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
376  a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
377  a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
378  a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
379  a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
380  a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
381  a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
382  a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
383  a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
384  a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
385  a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
386  a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
387  a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
388 
389 
390  // Preload u47
391  //u47.v = _mm256_load_pd( (double*)( u + 4 ) );
392 
393 
394  mask1023.v = _mm_setr_epi32( 1023, 1023, 1023, 1023 );
395  mask0.v = _mm_setr_epi32( 0, 0, 0, 0 );
396  offset.v = _mm_setr_epi32( 1023, 1023, 0, 0 );
397 
398 
399  //k1.v = _mm_set_epi32( 0, 0, k03_0.d[ 1 ], k03_0.d[ 0 ]);
400  //k2.v = _mm_set_epi32( 0, 0, k03_0.d[ 3 ], k03_0.d[ 2 ]);
401  //k1.v = _mm_add_epi32( k1.v, offset.v );
402  //k2.v = _mm_add_epi32( k2.v, offset.v );
403  k3.v = _mm_add_epi32( k03_0.v, mask1023.v );
404  //k1.v = _mm_slli_epi32( k1.v, 20 );
405  //k2.v = _mm_slli_epi32( k2.v, 20 );
406  k3.v = _mm_slli_epi32( k3.v, 20 );
407  //k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
408  //k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
409  k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
410  k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
411  //p1 = _mm_castsi128_pd( k1.v );
412  //p2 = _mm_castsi128_pd( k2.v );
413  //p03_0.v = _mm256_set_m128d( p2, p1 );
414  p03_0.i = _mm256_insertf128_si256( p03_0.i, k1.v, 0 );
415  p03_0.i = _mm256_insertf128_si256( p03_0.i, k2.v, 1 );
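
// A scalar sketch of the scaling step above: each 32-bit k becomes the double
// 2^k by writing the biased exponent ( k + 1023 ) into the IEEE-754 exponent
// field. The vector code shifts the 32-bit lane left by 20 and unpacks it
// with a zero low dword, which is equivalent to shifting the 64-bit word left
// by 52; the same pattern repeats below for the remaining seven registers.
// The helper name pow2_int is illustrative only.
#if 0
#include <cstring>
static inline double pow2_int( int k )     // valid for k in [ -1022, 1023 ]
{
  unsigned long long bits = (unsigned long long)( k + 1023 ) << 52;
  double y;
  std::memcpy( &y, &bits, sizeof( y ) );   // reinterpret the bit pattern as a double
  return y;                                // 2^k ( sign 0, mantissa 0 )
}
#endif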
416 
417 
418 
419  //k1.v = _mm_set_epi32( 0, 0, k03_1.d[ 1 ], k03_1.d[ 0 ]);
420  //k2.v = _mm_set_epi32( 0, 0, k03_1.d[ 3 ], k03_1.d[ 2 ]);
421  //k1.v = _mm_add_epi32( k1.v, offset.v );
422  //k2.v = _mm_add_epi32( k2.v, offset.v );
423  k3.v = _mm_add_epi32( k03_1.v, mask1023.v );
424  //k1.v = _mm_slli_epi32( k1.v, 20 );
425  //k2.v = _mm_slli_epi32( k2.v, 20 );
426  k3.v = _mm_slli_epi32( k3.v, 20 );
427  //k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
428  //k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
429  k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
430  k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
431  //p1 = _mm_castsi128_pd( k1.v );
432  //p2 = _mm_castsi128_pd( k2.v );
433  //p03_1.v = _mm256_set_m128d( p2, p1 );
434  p03_1.i = _mm256_insertf128_si256( p03_1.i, k1.v, 0 );
435  p03_1.i = _mm256_insertf128_si256( p03_1.i, k2.v, 1 );
436 
437 
438  //k1.v = _mm_set_epi32( 0, 0, k03_2.d[ 1 ], k03_2.d[ 0 ]);
439  //k2.v = _mm_set_epi32( 0, 0, k03_2.d[ 3 ], k03_2.d[ 2 ]);
440  //k1.v = _mm_add_epi32( k1.v, offset.v );
441  //k2.v = _mm_add_epi32( k2.v, offset.v );
442  k3.v = _mm_add_epi32( k03_2.v, mask1023.v );
443  //k1.v = _mm_slli_epi32( k1.v, 20 );
444  //k2.v = _mm_slli_epi32( k2.v, 20 );
445  k3.v = _mm_slli_epi32( k3.v, 20 );
446  //k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
447  //k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
448  k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
449  k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
450  //p1 = _mm_castsi128_pd( k1.v );
451  //p2 = _mm_castsi128_pd( k2.v );
452  //p03_2.v = _mm256_set_m128d( p2, p1 );
453  p03_2.i = _mm256_insertf128_si256( p03_2.i, k1.v, 0 );
454  p03_2.i = _mm256_insertf128_si256( p03_2.i, k2.v, 1 );
455 
456 
457  //k1.v = _mm_set_epi32( 0, 0, k03_3.d[ 1 ], k03_3.d[ 0 ]);
458  //k2.v = _mm_set_epi32( 0, 0, k03_3.d[ 3 ], k03_3.d[ 2 ]);
459  //k1.v = _mm_add_epi32( k1.v, offset.v );
460  //k2.v = _mm_add_epi32( k2.v, offset.v );
461  k3.v = _mm_add_epi32( k03_3.v, mask1023.v );
462  //k1.v = _mm_slli_epi32( k1.v, 20 );
463  //k2.v = _mm_slli_epi32( k2.v, 20 );
464  k3.v = _mm_slli_epi32( k3.v, 20 );
465  //k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
466  //k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
467  k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
468  k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
469  //p1 = _mm_castsi128_pd( k1.v );
470  //p2 = _mm_castsi128_pd( k2.v );
471  //p03_3.v = _mm256_set_m128d( p2, p1 );
472  p03_3.i = _mm256_insertf128_si256( p03_3.i, k1.v, 0 );
473  p03_3.i = _mm256_insertf128_si256( p03_3.i, k2.v, 1 );
474 
475 
476  //k1.v = _mm_set_epi32( 0, 0, k47_0.d[ 1 ], k47_0.d[ 0 ]);
477  //k2.v = _mm_set_epi32( 0, 0, k47_0.d[ 3 ], k47_0.d[ 2 ]);
478  //k1.v = _mm_add_epi32( k1.v, offset.v );
479  //k2.v = _mm_add_epi32( k2.v, offset.v );
480  k3.v = _mm_add_epi32( k47_0.v, mask1023.v );
481  //k1.v = _mm_slli_epi32( k1.v, 20 );
482  //k2.v = _mm_slli_epi32( k2.v, 20 );
483  k3.v = _mm_slli_epi32( k3.v, 20 );
484  //k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
485  //k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
486  k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
487  k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
488  //p1 = _mm_castsi128_pd( k1.v );
489  //p2 = _mm_castsi128_pd( k2.v );
490  //p47_0.v = _mm256_set_m128d( p2, p1 );
491  p47_0.i = _mm256_insertf128_si256( p47_0.i, k1.v, 0 );
492  p47_0.i = _mm256_insertf128_si256( p47_0.i, k2.v, 1 );
493 
494  //k1.v = _mm_set_epi32( 0, 0, k47_1.d[ 1 ], k47_1.d[ 0 ]);
495  //k2.v = _mm_set_epi32( 0, 0, k47_1.d[ 3 ], k47_1.d[ 2 ]);
496  //k1.v = _mm_add_epi32( k1.v, offset.v );
497  //k2.v = _mm_add_epi32( k2.v, offset.v );
498  k3.v = _mm_add_epi32( k47_1.v, mask1023.v );
499  //k1.v = _mm_slli_epi32( k1.v, 20 );
500  //k2.v = _mm_slli_epi32( k2.v, 20 );
501  k3.v = _mm_slli_epi32( k3.v, 20 );
502  //k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
503  //k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
504  k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
505  k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
506  //p1 = _mm_castsi128_pd( k1.v );
507  //p2 = _mm_castsi128_pd( k2.v );
508  //p47_1.v = _mm256_set_m128d( p2, p1 );
509  p47_1.i = _mm256_insertf128_si256( p47_1.i, k1.v, 0 );
510  p47_1.i = _mm256_insertf128_si256( p47_1.i, k2.v, 1 );
511 
512  //k1.v = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]);
513  //k2.v = _mm_set_epi32( 0, 0, k47_2.d[ 3 ], k47_2.d[ 2 ]);
514  //k1.v = _mm_add_epi32( k1.v, offset.v );
515  //k2.v = _mm_add_epi32( k2.v, offset.v );
516  k3.v = _mm_add_epi32( k47_2.v, mask1023.v );
517  //k1.v = _mm_slli_epi32( k1.v, 20 );
518  //k2.v = _mm_slli_epi32( k2.v, 20 );
519  k3.v = _mm_slli_epi32( k3.v, 20 );
520  //k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
521  //k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
522  k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
523  k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
524  //p1 = _mm_castsi128_pd( k1.v );
525  //p2 = _mm_castsi128_pd( k2.v );
526  //p47_2.v = _mm256_set_m128d( p2, p1 );
527  p47_2.i = _mm256_insertf128_si256( p47_2.i, k1.v, 0 );
528  p47_2.i = _mm256_insertf128_si256( p47_2.i, k2.v, 1 );
529 
530  //k1.v = _mm_set_epi32( 0, 0, k47_3.d[ 1 ], k47_3.d[ 0 ]);
531  //k2.v = _mm_set_epi32( 0, 0, k47_3.d[ 3 ], k47_3.d[ 2 ]);
532  //k1.v = _mm_add_epi32( k1.v, offset.v );
533  //k2.v = _mm_add_epi32( k2.v, offset.v );
534  k3.v = _mm_add_epi32( k47_3.v, mask1023.v );
535  //k1.v = _mm_slli_epi32( k1.v, 20 );
536  //k2.v = _mm_slli_epi32( k2.v, 20 );
537  k3.v = _mm_slli_epi32( k3.v, 20 );
538  //k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
539  //k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
540  k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
541  k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
542  //p1 = _mm_castsi128_pd( k1.v );
543  //p2 = _mm_castsi128_pd( k2.v );
544  //p47_3.v = _mm256_set_m128d( p2, p1 );
545  p47_3.i = _mm256_insertf128_si256( p47_3.i, k1.v, 0 );
546  p47_3.i = _mm256_insertf128_si256( p47_3.i, k2.v, 1 );
547 
548 
549  //u03.v = _mm256_load_pd( (double*)u );
550  //u47.v = _mm256_load_pd( (double*)( u + 4 ) );
551 
552 
553  c03_0.v = _mm256_mul_pd( a03_0.v, p03_0.v );
554  c03_1.v = _mm256_mul_pd( a03_1.v, p03_1.v );
555  c03_2.v = _mm256_mul_pd( a03_2.v, p03_2.v );
556  c03_3.v = _mm256_mul_pd( a03_3.v, p03_3.v );
557  c47_0.v = _mm256_mul_pd( a47_0.v, p47_0.v );
558  c47_1.v = _mm256_mul_pd( a47_1.v, p47_1.v );
559  c47_2.v = _mm256_mul_pd( a47_2.v, p47_2.v );
560  c47_3.v = _mm256_mul_pd( a47_3.v, p47_3.v );
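
// The eight multiplies above assemble the final result lane by lane:
// c = p( w ) * 2^k ~= exp( x ). Chaining the illustrative scalar helpers
// sketched earlier gives the same computation for a single value:
#if 0
static inline double vdexp_scalar( double x )
{
  int k;
  double w = vdexp_reduce( x, k );         // clamp + range reduction
  return remez11( w ) * pow2_int( k );     // polynomial * 2^k
}
#endif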